source/include/pthreadpool.h

0001 #ifndef PTHREADPOOL_H_
0002 #define PTHREADPOOL_H_
0003
0004 #include <stddef.h>
0005 #include <stdint.h>
0006
0007 typedef struct pthreadpool* pthreadpool_t; /* Handle to a thread pool: a pointer to the opaque struct pthreadpool. */
0008
0009 typedef void (*pthreadpool_task_1d_t)(void*, size_t); /* (context, i) */
0010 typedef void (*pthreadpool_task_1d_with_thread_t)(void*, size_t, size_t); /* (context, thread_index, i) */
0011 typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); /* (context, i, min(range - i, tile)) */
0012 typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); /* (context, i, j) */
0013 typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t, size_t); /* (context, thread_index, i, j) */
0014 typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); /* (context, i, j, min(range_j - j, tile_j)) */
0015 typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); /* (context, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)) */
0016 typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); /* (context, i, j, k) */
0017 typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); /* (context, i, j, k, min(range_k - k, tile_k)) */
0018 typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t, size_t, size_t, size_t); /* (context, thread_index, i, j, k, min(range_k - k, tile_k)) */
0019 typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t);
0020 typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t);
0021 typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t);
0022 typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
0023 typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t);
0024 typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
0025 typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
0026 typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
0027 typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
0028 typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
0029
0030 typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t); /* (context, uarch_index, i) */
0031 typedef void (*pthreadpool_task_2d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, min(range_j - j, tile_j)) */
0032 typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)) */
0033 typedef void (*pthreadpool_task_3d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, i, j, k, min(range_k - k, tile_k)) */
0034 typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
0035 typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t);
0036
0037 typedef void (*pthreadpool_task_2d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t); /* (context, uarch_index, thread_index, i, j, min(range_j - j, tile_j)) */
0038 typedef void (*pthreadpool_task_3d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
0039
0040
0041 /**
0042  * Disable support for denormalized numbers to the maximum extent possible for
0043  * the duration of the computation.
0044  *
0045  * Handling denormalized floating-point numbers is often implemented in
0046  * microcode, and incurs significant performance degradation. This hint
0047  * instructs the thread pool to disable support for denormalized numbers before
0048  * running the computation by manipulating architecture-specific control
0049  * registers, and restore the initial value of control registers after the
0050  * computation is complete. The thread pool temporarily disables denormalized
0051  * numbers on all threads involved in the computation (i.e. the caller thread,
0052  * and potentially worker threads).
0053  *
0054  * Disabling denormalized numbers may have a small negative effect on results'
0055  * accuracy. As various architectures differ in capabilities to control
0056  * processing of denormalized numbers, using this flag may also hurt results'
0057  * reproducibility across different instruction set architectures.
0058  */
0059 #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001
0060
0061 /**
0062  * Yield worker threads to the system scheduler after the operation is finished.
0063  *
0064  * Force workers to use kernel wait (instead of the default active spin-wait)
0065  * for new commands after this command is processed. This flag affects only the
0066  * immediate next operation on this thread pool. To make the thread pool always
0067  * use kernel wait, pass this flag to all parallelization functions.
0068  */
0069 #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002
0070
0071 #ifdef __cplusplus
0072 extern "C" {
0073 #endif
0074
0075 /**
0076  * Create a thread pool with the specified number of threads.
0077  *
0078  * @param  threads_count  the number of threads in the thread pool.
0079  *    A value of 0 has a special interpretation: it creates a thread pool with
0080  *    as many threads as there are logical processors in the system.
0081  *
0082  * @returns  A pointer to an opaque thread pool object if the call is
0083  *    successful, or a NULL pointer if the call failed.
0084  */
0085 pthreadpool_t pthreadpool_create(size_t threads_count);
0086
0087 /**
0088  * Query the number of threads in a thread pool.
0089  *
0090  * @param  threadpool  the thread pool to query (NULL handling unspecified here; TODO confirm).
0091  *
0092  * @returns  The number of threads in the thread pool.
0093  */
0094 size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);
0095
0096 /**
0097  * Process items on a 1D grid.
0098  *
0099  * The function implements a parallel version of the following snippet:
0100  *
0101  *   for (size_t i = 0; i < range; i++)
0102  *     function(context, i);
0103  *
0104  * When the function returns, all items have been processed and the thread pool
0105  * is ready for a new task.
0106  *
0107  * @note If multiple threads call this function with the same thread pool, the
0108  *    calls are serialized.
0109  *
0110  * @param threadpool  the thread pool to use for parallelization. If threadpool
0111  *    is NULL, all items are processed serially on the calling thread.
0112  * @param function    the function to call for each item.
0113  * @param context     the first argument passed to the specified function.
0114  * @param range       the number of items on the 1D grid to process. The
0115  *    specified function will be called once for each item.
0116  * @param flags       a bitwise combination of zero or more optional flags
0117  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0118  */
0119 void pthreadpool_parallelize_1d(
0120     pthreadpool_t threadpool,
0121     pthreadpool_task_1d_t function,
0122     void* context,
0123     size_t range,
0124     uint32_t flags);
0125
0126 /**
0127  * Process items on a 1D grid, passing the current thread id as thread_index.
0128  *
0129  * The function implements a parallel version of the following snippet:
0130  *
0131  *   for (size_t i = 0; i < range; i++)
0132  *     function(context, thread_index, i);
0133  *
0134  * When the function returns, all items have been processed and the thread pool
0135  * is ready for a new task.
0136  *
0137  * @note If multiple threads call this function with the same thread pool, the
0138  *    calls are serialized.
0139  *
0140  * @param threadpool  the thread pool to use for parallelization. If threadpool
0141  *    is NULL, all items are processed serially on the calling thread.
0142  * @param function    the function to call for each item.
0143  * @param context     the first argument passed to the specified function.
0144  * @param range       the number of items on the 1D grid to process. The
0145  *    specified function will be called once for each item.
0146  * @param flags       a bitwise combination of zero or more optional flags
0147  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0148  */
0149 void pthreadpool_parallelize_1d_with_thread(
0150     pthreadpool_t threadpool,
0151     pthreadpool_task_1d_with_thread_t function,
0152     void* context,
0153     size_t range,
0154     uint32_t flags);
0155
0156 /**
0157  * Process items on a 1D grid using a microarchitecture-aware task function.
0158  *
0159  * The function implements a parallel version of the following snippet:
0160  *
0161  *   uint32_t uarch_index = cpuinfo_initialize() ?
0162  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
0163  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
0164  *   for (size_t i = 0; i < range; i++)
0165  *     function(context, uarch_index, i);
0166  *
0167  * When the function returns, all items have been processed and the thread pool
0168  * is ready for a new task.
0169  *
0170  * @note If multiple threads call this function with the same thread pool, the
0171  *    calls are serialized.
0172  *
0173  * @param threadpool           the thread pool to use for parallelization. If
0174  *    threadpool is NULL, all items are processed serially on the calling
0175  *    thread.
0176  * @param function             the function to call for each item.
0177  * @param context              the first argument passed to the specified
0178  *    function.
0179  * @param default_uarch_index  the microarchitecture index to use when
0180  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
0181  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
0182  *    max_uarch_index value.
0183  * @param max_uarch_index      the maximum microarchitecture index expected by
0184  *    the specified function. If the index returned by
0185  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
0186  *    will be used instead. default_uarch_index can exceed max_uarch_index.
0187  * @param range                the number of items on the 1D grid to process.
0188  *    The specified function will be called once for each item.
0189  * @param flags                a bitwise combination of zero or more optional
0190  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or
0191  *    PTHREADPOOL_FLAG_YIELD_WORKERS).
0192  */
0193 void pthreadpool_parallelize_1d_with_uarch(
0194     pthreadpool_t threadpool,
0195     pthreadpool_task_1d_with_id_t function,
0196     void* context,
0197     uint32_t default_uarch_index,
0198     uint32_t max_uarch_index,
0199     size_t range,
0200     uint32_t flags);
0201
0202 /**
0203  * Process items on a 1D grid with the specified maximum tile size.
0204  *
0205  * The function implements a parallel version of the following snippet:
0206  *
0207  *   for (size_t i = 0; i < range; i += tile)
0208  *     function(context, i, min(range - i, tile));
0209  *
0210  * When the call returns, all items have been processed and the thread pool is
0211  * ready for a new task.
0212  *
0213  * @note If multiple threads call this function with the same thread pool,
0214  *    the calls are serialized.
0215  *
0216  * @param threadpool  the thread pool to use for parallelization. If threadpool
0217  *    is NULL, all items are processed serially on the calling thread.
0218  * @param function    the function to call for each tile.
0219  * @param context     the first argument passed to the specified function.
0220  * @param range       the number of items on the 1D grid to process.
0221  * @param tile        the maximum number of items on the 1D grid to process in
0222  *    one function call.
0223  * @param flags       a bitwise combination of zero or more optional flags
0224  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0225  */
0226 void pthreadpool_parallelize_1d_tile_1d(
0227     pthreadpool_t threadpool,
0228     pthreadpool_task_1d_tile_1d_t function,
0229     void* context,
0230     size_t range,
0231     size_t tile,
0232     uint32_t flags);
0233
0234 /**
0235  * Process items on a 2D grid.
0236  *
0237  * The function implements a parallel version of the following snippet:
0238  *
0239  *   for (size_t i = 0; i < range_i; i++)
0240  *     for (size_t j = 0; j < range_j; j++)
0241  *       function(context, i, j);
0242  *
0243  * When the function returns, all items have been processed and the thread pool
0244  * is ready for a new task.
0245  *
0246  * @note If multiple threads call this function with the same thread pool, the
0247  *    calls are serialized.
0248  *
0249  * @param threadpool  the thread pool to use for parallelization. If threadpool
0250  *    is NULL, all items are processed serially on the calling thread.
0251  * @param function    the function to call for each item.
0252  * @param context     the first argument passed to the specified function.
0253  * @param range_i     the number of items to process along the first dimension
0254  *    of the 2D grid.
0255  * @param range_j     the number of items to process along the second dimension
0256  *    of the 2D grid.
0257  * @param flags       a bitwise combination of zero or more optional flags
0258  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0259  */
0260 void pthreadpool_parallelize_2d(
0261     pthreadpool_t threadpool,
0262     pthreadpool_task_2d_t function,
0263     void* context,
0264     size_t range_i,
0265     size_t range_j,
0266     uint32_t flags);
0267
0268 /**
0269  * Process items on a 2D grid, passing the current thread id as thread_index.
0270  *
0271  * The function implements a parallel version of the following snippet:
0272  *
0273  *   for (size_t i = 0; i < range_i; i++)
0274  *     for (size_t j = 0; j < range_j; j++)
0275  *       function(context, thread_index, i, j);
0276  *
0277  * When the function returns, all items have been processed and the thread pool
0278  * is ready for a new task.
0279  *
0280  * @note If multiple threads call this function with the same thread pool, the
0281  *    calls are serialized.
0282  *
0283  * @param threadpool  the thread pool to use for parallelization. If threadpool
0284  *    is NULL, all items are processed serially on the calling thread.
0285  * @param function    the function to call for each item.
0286  * @param context     the first argument passed to the specified function.
0287  * @param range_i     the number of items to process along the first dimension
0288  *    of the 2D grid.
0289  * @param range_j     the number of items to process along the second dimension
0290  *    of the 2D grid.
0291  * @param flags       a bitwise combination of zero or more optional flags
0292  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0293  */
0294 void pthreadpool_parallelize_2d_with_thread(
0295     pthreadpool_t threadpool,
0296     pthreadpool_task_2d_with_thread_t function,
0297     void* context,
0298     size_t range_i,
0299     size_t range_j,
0300     uint32_t flags);
0301
0302 /**
0303  * Process items on a 2D grid with the specified maximum tile size along the
0304  * last grid dimension.
0305  *
0306  * The function implements a parallel version of the following snippet:
0307  *
0308  *   for (size_t i = 0; i < range_i; i++)
0309  *     for (size_t j = 0; j < range_j; j += tile_j)
0310  *       function(context, i, j, min(range_j - j, tile_j));
0311  *
0312  * When the function returns, all items have been processed and the thread pool
0313  * is ready for a new task.
0314  *
0315  * @note If multiple threads call this function with the same thread pool, the
0316  *    calls are serialized.
0317  *
0318  * @param threadpool  the thread pool to use for parallelization. If threadpool
0319  *    is NULL, all items are processed serially on the calling thread.
0320  * @param function    the function to call for each tile.
0321  * @param context     the first argument passed to the specified function.
0322  * @param range_i     the number of items to process along the first dimension
0323  *    of the 2D grid.
0324  * @param range_j     the number of items to process along the second dimension
0325  *    of the 2D grid.
0326  * @param tile_j      the maximum number of items along the second dimension of
0327  *    the 2D grid to process in one function call.
0328  * @param flags       a bitwise combination of zero or more optional flags
0329  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0330  */
0331 void pthreadpool_parallelize_2d_tile_1d(
0332     pthreadpool_t threadpool,
0333     pthreadpool_task_2d_tile_1d_t function,
0334     void* context,
0335     size_t range_i,
0336     size_t range_j,
0337     size_t tile_j,
0338     uint32_t flags);
0339
0340 /**
0341  * Process items on a 2D grid with the specified maximum tile size along the
0342  * last grid dimension using a microarchitecture-aware task function.
0343  *
0344  * The function implements a parallel version of the following snippet:
0345  *
0346  *   uint32_t uarch_index = cpuinfo_initialize() ?
0347  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
0348  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
0349  *   for (size_t i = 0; i < range_i; i++)
0350  *     for (size_t j = 0; j < range_j; j += tile_j)
0351  *       function(context, uarch_index, i, j, min(range_j - j, tile_j));
0352  *
0353  * When the function returns, all items have been processed and the thread pool
0354  * is ready for a new task.
0355  *
0356  * @note If multiple threads call this function with the same thread pool, the
0357  *    calls are serialized.
0358  *
0359  * @param threadpool  the thread pool to use for parallelization. If threadpool
0360  *    is NULL, all items are processed serially on the calling thread.
0361  * @param function    the function to call for each tile.
0362  * @param context     the first argument passed to the specified function.
0363  * @param default_uarch_index  the microarchitecture index to use when
0364  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
0365  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
0366  *    max_uarch_index value.
0367  * @param max_uarch_index      the maximum microarchitecture index expected by
0368  *    the specified function. If the index returned by
0369  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
0370  *    will be used instead. default_uarch_index can exceed max_uarch_index.
0371  * @param range_i     the number of items to process along the first dimension
0372  *    of the 2D grid.
0373  * @param range_j     the number of items to process along the second dimension
0374  *    of the 2D grid.
0375  * @param tile_j      the maximum number of items along the second dimension of
0376  *    the 2D grid to process in one function call.
0377  * @param flags       a bitwise combination of zero or more optional flags
0378  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0379  */
0380 void pthreadpool_parallelize_2d_tile_1d_with_uarch(
0381     pthreadpool_t threadpool,
0382     pthreadpool_task_2d_tile_1d_with_id_t function,
0383     void* context,
0384     uint32_t default_uarch_index,
0385     uint32_t max_uarch_index,
0386     size_t range_i,
0387     size_t range_j,
0388     size_t tile_j,
0389     uint32_t flags);
0390
0391 /**
0392  * Process items on a 2D grid with the specified maximum tile size along the
0393  * last grid dimension using a microarchitecture-aware task function and passing
0394  * along the current thread id (thread_index in the snippet below).
0395  *
0396  * The function implements a parallel version of the following snippet:
0397  *
0398  *   uint32_t uarch_index = cpuinfo_initialize() ?
0399  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
0400  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
0401  *   for (size_t i = 0; i < range_i; i++)
0402  *     for (size_t j = 0; j < range_j; j += tile_j)
0403  *       function(context, uarch_index, thread_index, i, j, min(range_j - j, tile_j));
0404  *
0405  * When the function returns, all items have been processed and the thread pool
0406  * is ready for a new task.
0407  *
0408  * @note If multiple threads call this function with the same thread pool, the
0409  *    calls are serialized.
0410  *
0411  * @param threadpool  the thread pool to use for parallelization. If threadpool
0412  *    is NULL, all items are processed serially on the calling thread.
0413  * @param function    the function to call for each tile.
0414  * @param context     the first argument passed to the specified function.
0415  * @param default_uarch_index  the microarchitecture index to use when
0416  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
0417  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
0418  *    max_uarch_index value.
0419  * @param max_uarch_index      the maximum microarchitecture index expected by
0420  *    the specified function. If the index returned by
0421  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
0422  *    will be used instead. default_uarch_index can exceed max_uarch_index.
0423  * @param range_i     the number of items to process along the first dimension
0424  *    of the 2D grid.
0425  * @param range_j     the number of items to process along the second dimension
0426  *    of the 2D grid.
0427  * @param tile_j      the maximum number of items along the second dimension of
0428  *    the 2D grid to process in one function call.
0429  * @param flags       a bitwise combination of zero or more optional flags
0430  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0431  */
0432 void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread(
0433     pthreadpool_t threadpool,
0434     pthreadpool_task_2d_tile_1d_with_id_with_thread_t function,
0435     void* context,
0436     uint32_t default_uarch_index,
0437     uint32_t max_uarch_index,
0438     size_t range_i,
0439     size_t range_j,
0440     size_t tile_j,
0441     uint32_t flags);
0442
0443 /**
0444  * Process items on a 2D grid with the specified maximum tile size along each
0445  * grid dimension.
0446  *
0447  * The function implements a parallel version of the following snippet:
0448  *
0449  *   for (size_t i = 0; i < range_i; i += tile_i)
0450  *     for (size_t j = 0; j < range_j; j += tile_j)
0451  *       function(context, i, j,
0452  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
0453  *
0454  * When the function returns, all items have been processed and the thread pool
0455  * is ready for a new task.
0456  *
0457  * @note If multiple threads call this function with the same thread pool, the
0458  *    calls are serialized.
0459  *
0460  * @param threadpool  the thread pool to use for parallelization. If threadpool
0461  *    is NULL, all items are processed serially on the calling thread.
0462  * @param function    the function to call for each tile.
0463  * @param context     the first argument passed to the specified function.
0464  * @param range_i     the number of items to process along the first dimension
0465  *    of the 2D grid.
0466  * @param range_j     the number of items to process along the second dimension
0467  *    of the 2D grid.
0468  * @param tile_i      the maximum number of items along the first dimension of
0469  *    the 2D grid to process in one function call.
0470  * @param tile_j      the maximum number of items along the second dimension of
0471  *    the 2D grid to process in one function call.
0472  * @param flags       a bitwise combination of zero or more optional flags
0473  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0474  */
0475 void pthreadpool_parallelize_2d_tile_2d(
0476     pthreadpool_t threadpool,
0477     pthreadpool_task_2d_tile_2d_t function,
0478     void* context,
0479     size_t range_i,
0480     size_t range_j,
0481     size_t tile_i,
0482     size_t tile_j,
0483     uint32_t flags);
0484
0485 /**
0486  * Process items on a 2D grid with the specified maximum tile size along each
0487  * grid dimension using a microarchitecture-aware task function.
0488  *
0489  * The function implements a parallel version of the following snippet:
0490  *
0491  *   uint32_t uarch_index = cpuinfo_initialize() ?
0492  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
0493  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
0494  *   for (size_t i = 0; i < range_i; i += tile_i)
0495  *     for (size_t j = 0; j < range_j; j += tile_j)
0496  *       function(context, uarch_index, i, j,
0497  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
0498  *
0499  * When the function returns, all items have been processed and the thread pool
0500  * is ready for a new task.
0501  *
0502  * @note If multiple threads call this function with the same thread pool, the
0503  *    calls are serialized.
0504  *
0505  * @param threadpool           the thread pool to use for parallelization. If
0506  *    threadpool is NULL, all items are processed serially on the calling
0507  *    thread.
0508  * @param function             the function to call for each tile.
0509  * @param context              the first argument passed to the specified
0510  *    function.
0511  * @param default_uarch_index  the microarchitecture index to use when
0512  *                             pthreadpool is configured without cpuinfo,
0513  *                             cpuinfo initialization failed, or index returned
0514  *                             by cpuinfo_get_current_uarch_index() exceeds
0515  *                             the max_uarch_index value.
0516  * @param max_uarch_index      the maximum microarchitecture index expected
0517  *                             by the specified function. If the index returned
0518  *                             by cpuinfo_get_current_uarch_index() exceeds this
0519  *                             value, default_uarch_index will be used instead.
0520  *                             default_uarch_index can exceed max_uarch_index.
0521  * @param range_i              the number of items to process along the first
0522  *    dimension of the 2D grid.
0523  * @param range_j              the number of items to process along the second
0524  *    dimension of the 2D grid.
0525  * @param tile_i               the maximum number of items along the first
0526  *    dimension of the 2D grid to process in one function call.
0527  * @param tile_j               the maximum number of items along the second
0528  *    dimension of the 2D grid to process in one function call.
0529  * @param flags                a bitwise combination of zero or more optional
0530  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or
0531  *    PTHREADPOOL_FLAG_YIELD_WORKERS).
0532  */
0533 void pthreadpool_parallelize_2d_tile_2d_with_uarch(
0534     pthreadpool_t threadpool,
0535     pthreadpool_task_2d_tile_2d_with_id_t function,
0536     void* context,
0537     uint32_t default_uarch_index,
0538     uint32_t max_uarch_index,
0539     size_t range_i,
0540     size_t range_j,
0541     size_t tile_i,
0542     size_t tile_j,
0543     uint32_t flags);
0544
0545 /**
0546  * Process items on a 3D grid.
0547  *
0548  * The function implements a parallel version of the following snippet:
0549  *
0550  *   for (size_t i = 0; i < range_i; i++)
0551  *     for (size_t j = 0; j < range_j; j++)
0552  *       for (size_t k = 0; k < range_k; k++)
0553  *         function(context, i, j, k);
0554  *
0555  * When the function returns, all items have been processed and the thread pool
0556  * is ready for a new task.
0557  *
0558  * @note If multiple threads call this function with the same thread pool, the
0559  *    calls are serialized.
0560  *
0561  * @param threadpool  the thread pool to use for parallelization. If threadpool
0562  *    is NULL, all items are processed serially on the calling thread.
0563  * @param function    the function to call for each item.
0564  * @param context     the first argument passed to the specified function.
0565  * @param range_i     the number of items to process along the first dimension
0566  *    of the 3D grid.
0567  * @param range_j     the number of items to process along the second dimension
0568  *    of the 3D grid.
0569  * @param range_k     the number of items to process along the third dimension
0570  *    of the 3D grid.
0571  * @param flags       a bitwise combination of zero or more optional flags
0572  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0573  */
0574 void pthreadpool_parallelize_3d(
0575     pthreadpool_t threadpool,
0576     pthreadpool_task_3d_t function,
0577     void* context,
0578     size_t range_i,
0579     size_t range_j,
0580     size_t range_k,
0581     uint32_t flags);
0582
0583 /**
0584  * Process items on a 3D grid with the specified maximum tile size along the
0585  * last grid dimension.
0586  *
0587  * The function implements a parallel version of the following snippet:
0588  *
0589  *   for (size_t i = 0; i < range_i; i++)
0590  *     for (size_t j = 0; j < range_j; j++)
0591  *       for (size_t k = 0; k < range_k; k += tile_k)
0592  *         function(context, i, j, k, min(range_k - k, tile_k));
0593  *
0594  * When the function returns, all items have been processed and the thread pool
0595  * is ready for a new task.
0596  *
0597  * @note If multiple threads call this function with the same thread pool, the
0598  *    calls are serialized.
0599  *
0600  * @param threadpool  the thread pool to use for parallelization. If threadpool
0601  *    is NULL, all items are processed serially on the calling thread.
0602  * @param function    the function to call for each tile.
0603  * @param context     the first argument passed to the specified function.
0604  * @param range_i     the number of items to process along the first dimension
0605  *    of the 3D grid.
0606  * @param range_j     the number of items to process along the second dimension
0607  *    of the 3D grid.
0608  * @param range_k     the number of items to process along the third dimension
0609  *    of the 3D grid.
0610  * @param tile_k      the maximum number of items along the third dimension of
0611  *    the 3D grid to process in one function call.
0612  * @param flags       a bitwise combination of zero or more optional flags
0613  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0614  */
0615 void pthreadpool_parallelize_3d_tile_1d(
0616     pthreadpool_t threadpool,
0617     pthreadpool_task_3d_tile_1d_t function,
0618     void* context,
0619     size_t range_i,
0620     size_t range_j,
0621     size_t range_k,
0622     size_t tile_k,
0623     uint32_t flags);
0624
0625 /**
0626  * Process items on a 3D grid with the specified maximum tile size along the
0627  * last grid dimension, passing the current thread id as thread_index.
0628  *
0629  * The function implements a parallel version of the following snippet:
0630  *
0631  *   for (size_t i = 0; i < range_i; i++)
0632  *     for (size_t j = 0; j < range_j; j++)
0633  *       for (size_t k = 0; k < range_k; k += tile_k)
0634  *         function(context, thread_index, i, j, k, min(range_k - k, tile_k));
0635  *
0636  * When the function returns, all items have been processed and the thread pool
0637  * is ready for a new task.
0638  *
0639  * @note If multiple threads call this function with the same thread pool, the
0640  *    calls are serialized.
0641  *
0642  * @param threadpool  the thread pool to use for parallelization. If threadpool
0643  *    is NULL, all items are processed serially on the calling thread.
0644  * @param function    the function to call for each tile.
0645  * @param context     the first argument passed to the specified function.
0646  * @param range_i     the number of items to process along the first dimension
0647  *    of the 3D grid.
0648  * @param range_j     the number of items to process along the second dimension
0649  *    of the 3D grid.
0650  * @param range_k     the number of items to process along the third dimension
0651  *    of the 3D grid.
0652  * @param tile_k      the maximum number of items along the third dimension of
0653  *    the 3D grid to process in one function call.
0654  * @param flags       a bitwise combination of zero or more optional flags
0655  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS and/or PTHREADPOOL_FLAG_YIELD_WORKERS).
0656  */
0657 void pthreadpool_parallelize_3d_tile_1d_with_thread(
0658   pthreadpool_t threadpool,
0659   pthreadpool_task_3d_tile_1d_with_thread_t function,
0660   void* context,
0661   size_t range_i,
0662   size_t range_j,
0663   size_t range_k,
0664   size_t tile_k,
0665   uint32_t flags);
0666
0667 /**
0668  * Process items on a 3D grid with the specified maximum tile size along the
0669  * last grid dimension using a microarchitecture-aware task function.
0670  *
0671  * The function implements a parallel version of the following snippet:
0672  *
0673  *   uint32_t uarch_index = cpuinfo_initialize() ?
0674  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
0675  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
0676  *   for (size_t i = 0; i < range_i; i++)
0677  *     for (size_t j = 0; j < range_j; j++)
0678  *       for (size_t k = 0; k < range_k; k += tile_k)
0679  *         function(context, uarch_index, i, j, k, min(range_k - k, tile_k));
0680  *
0681  * When the function returns, all items have been processed and the thread pool
0682  * is ready for a new task.
0683  *
0684  * @note If multiple threads call this function with the same thread pool, the
0685  *    calls are serialized.
0686  *
0687  * @param threadpool           the thread pool to use for parallelisation. If
0688  *    threadpool is NULL, all items are processed serially on the calling
0689  *    thread.
0690  * @param function             the function to call for each tile.
0691  * @param context              the first argument passed to the specified
0692  *    function.
0693  * @param default_uarch_index  the microarchitecture index to use when
0694  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
0695  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
0696  *    max_uarch_index value.
0697  * @param max_uarch_index      the maximum microarchitecture index expected by
0698  *    the specified function. If the index returned by
0699  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
0700  *    will be used instead. default_uarch_index can exceed max_uarch_index.
0701  * @param range_i              the number of items to process along the first
0702  *    dimension of the 3D grid.
0703  * @param range_j              the number of items to process along the second
0704  *    dimension of the 3D grid.
0705  * @param range_k              the number of items to process along the third
0706  *    dimension of the 3D grid.
0707  * @param tile_k               the maximum number of items along the third
0708  *    dimension of the 3D grid to process in one function call.
0709  * @param flags                a bitwise combination of zero or more optional
0710  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
0711  *    PTHREADPOOL_FLAG_YIELD_WORKERS).
0712  */
0713 void pthreadpool_parallelize_3d_tile_1d_with_uarch(
0714     pthreadpool_t threadpool,
0715     pthreadpool_task_3d_tile_1d_with_id_t function,
0716     void* context,
0717     uint32_t default_uarch_index,
0718     uint32_t max_uarch_index,
0719     size_t range_i,
0720     size_t range_j,
0721     size_t range_k,
0722     size_t tile_k,
0723     uint32_t flags);
0724
0725 /**
0726  * Process items on a 3D grid with the specified maximum tile size along the
0727  * last grid dimension using a microarchitecture-aware task function, passing
0728  * the current thread id as thread_index.
0729  *
0730  * The function implements a parallel version of the following snippet:
0731  *
0732  *   uint32_t uarch_index = cpuinfo_initialize() ?
0733  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
0734  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
0735  *   for (size_t i = 0; i < range_i; i++)
0736  *     for (size_t j = 0; j < range_j; j++)
0737  *       for (size_t k = 0; k < range_k; k += tile_k)
0738  *         function(context, uarch_index, thread_index, i, j, k, min(range_k - k, tile_k));
0739  *
0740  * When the function returns, all items have been processed and the thread pool
0741  * is ready for a new task.
0742  *
0743  * @note If multiple threads call this function with the same thread pool, the
0744  *    calls are serialized.
0745  *
0746  * @param threadpool           the thread pool to use for parallelisation. If
0747  *    threadpool is NULL, all items are processed serially on the calling
0748  *    thread.
0749  * @param function             the function to call for each tile.
0750  * @param context              the first argument passed to the specified
0751  *    function.
0752  * @param default_uarch_index  the microarchitecture index to use when
0753  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
0754  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
0755  *    max_uarch_index value.
0756  * @param max_uarch_index      the maximum microarchitecture index expected by
0757  *    the specified function. If the index returned by
0758  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
0759  *    will be used instead. default_uarch_index can exceed max_uarch_index.
0760  * @param range_i              the number of items to process along the first
0761  *    dimension of the 3D grid.
0762  * @param range_j              the number of items to process along the second
0763  *    dimension of the 3D grid.
0764  * @param range_k              the number of items to process along the third
0765  *    dimension of the 3D grid.
0766  * @param tile_k               the maximum number of items along the third
0767  *    dimension of the 3D grid to process in one function call.
0768  * @param flags                a bitwise combination of zero or more optional
0769  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
0770  *    PTHREADPOOL_FLAG_YIELD_WORKERS).
0771  */
0772 void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread(
0773     pthreadpool_t threadpool,
0774     pthreadpool_task_3d_tile_1d_with_id_with_thread_t function,
0775     void* context,
0776     uint32_t default_uarch_index,
0777     uint32_t max_uarch_index,
0778     size_t range_i,
0779     size_t range_j,
0780     size_t range_k,
0781     size_t tile_k,
0782     uint32_t flags);
0783
0784 /**
0785  * Process items on a 3D grid with the specified maximum tile size along the
0786  * last two grid dimensions.
0787  *
0788  * The function implements a parallel version of the following snippet:
0789  *
0790  *   for (size_t i = 0; i < range_i; i++)
0791  *     for (size_t j = 0; j < range_j; j += tile_j)
0792  *       for (size_t k = 0; k < range_k; k += tile_k)
0793  *         function(context, i, j, k,
0794  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
0795  *
0796  * When the function returns, all items have been processed and the thread pool
0797  * is ready for a new task.
0798  *
0799  * @note If multiple threads call this function with the same thread pool, the
0800  *    calls are serialized.
0801  *
0802  * @param threadpool  the thread pool to use for parallelisation. If threadpool
0803  *    is NULL, all items are processed serially on the calling thread.
0804  * @param function    the function to call for each tile.
0805  * @param context     the first argument passed to the specified function.
0806  * @param range_i     the number of items to process along the first dimension
0807  *    of the 3D grid.
0808  * @param range_j     the number of items to process along the second dimension
0809  *    of the 3D grid.
0810  * @param range_k     the number of items to process along the third dimension
0811  *    of the 3D grid.
0812  * @param tile_j      the maximum number of items along the second dimension of
0813  *    the 3D grid to process in one function call.
0814  * @param tile_k      the maximum number of items along the third dimension of
0815  *    the 3D grid to process in one function call.
0816  * @param flags       a bitwise combination of zero or more optional flags
0817  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
0818  */
0819 void pthreadpool_parallelize_3d_tile_2d(
0820     pthreadpool_t threadpool,
0821     pthreadpool_task_3d_tile_2d_t function,
0822     void* context,
0823     size_t range_i,
0824     size_t range_j,
0825     size_t range_k,
0826     size_t tile_j,
0827     size_t tile_k,
0828     uint32_t flags);
0829
0830 /**
0831  * Process items on a 3D grid with the specified maximum tile size along the
0832  * last two grid dimensions using a microarchitecture-aware task function.
0833  *
0834  * The function implements a parallel version of the following snippet:
0835  *
0836  *   uint32_t uarch_index = cpuinfo_initialize() ?
0837  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
0838  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
0839  *   for (size_t i = 0; i < range_i; i++)
0840  *     for (size_t j = 0; j < range_j; j += tile_j)
0841  *       for (size_t k = 0; k < range_k; k += tile_k)
0842  *         function(context, uarch_index, i, j, k,
0843  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
0844  *
0845  * When the function returns, all items have been processed and the thread pool
0846  * is ready for a new task.
0847  *
0848  * @note If multiple threads call this function with the same thread pool, the
0849  *    calls are serialized.
0850  *
0851  * @param threadpool           the thread pool to use for parallelisation. If
0852  *    threadpool is NULL, all items are processed serially on the calling
0853  *    thread.
0854  * @param function             the function to call for each tile.
0855  * @param context              the first argument passed to the specified
0856  *    function.
0857  * @param default_uarch_index  the microarchitecture index to use when
0858  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
0859  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
0860  *    max_uarch_index value.
0861  * @param max_uarch_index      the maximum microarchitecture index expected by
0862  *    the specified function. If the index returned by
0863  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
0864  *    will be used instead. default_uarch_index can exceed max_uarch_index.
0865  * @param range_i              the number of items to process along the first
0866  *    dimension of the 3D grid.
0867  * @param range_j              the number of items to process along the second
0868  *    dimension of the 3D grid.
0869  * @param range_k              the number of items to process along the third
0870  *    dimension of the 3D grid.
0871  * @param tile_j               the maximum number of items along the second
0872  *    dimension of the 3D grid to process in one function call.
0873  * @param tile_k               the maximum number of items along the third
0874  *    dimension of the 3D grid to process in one function call.
0875  * @param flags                a bitwise combination of zero or more optional
0876  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
0877  *    PTHREADPOOL_FLAG_YIELD_WORKERS).
0878  */
0879 void pthreadpool_parallelize_3d_tile_2d_with_uarch(
0880     pthreadpool_t threadpool,
0881     pthreadpool_task_3d_tile_2d_with_id_t function,
0882     void* context,
0883     uint32_t default_uarch_index,
0884     uint32_t max_uarch_index,
0885     size_t range_i,
0886     size_t range_j,
0887     size_t range_k,
0888     size_t tile_j,
0889     size_t tile_k,
0890     uint32_t flags);
0891
0892 /**
0893  * Process items on a 4D grid.
0894  *
0895  * The function implements a parallel version of the following snippet:
0896  *
0897  *   for (size_t i = 0; i < range_i; i++)
0898  *     for (size_t j = 0; j < range_j; j++)
0899  *       for (size_t k = 0; k < range_k; k++)
0900  *         for (size_t l = 0; l < range_l; l++)
0901  *           function(context, i, j, k, l);
0902  *
0903  * When the function returns, all items have been processed and the thread pool
0904  * is ready for a new task.
0905  *
0906  * @note If multiple threads call this function with the same thread pool, the
0907  *    calls are serialized.
0908  *
0909  * @param threadpool  the thread pool to use for parallelisation. If threadpool
0910  *    is NULL, all items are processed serially on the calling thread.
0911  * @param function    the function to call for each item.
0912  * @param context     the first argument passed to the specified function.
0913  * @param range_i     the number of items to process along the first dimension
0914  *    of the 4D grid.
0915  * @param range_j     the number of items to process along the second dimension
0916  *    of the 4D grid.
0917  * @param range_k     the number of items to process along the third dimension
0918  *    of the 4D grid.
0919  * @param range_l     the number of items to process along the fourth dimension
0920  *    of the 4D grid.
0921  * @param flags       a bitwise combination of zero or more optional flags
0922  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
0923  */
0924 void pthreadpool_parallelize_4d(
0925     pthreadpool_t threadpool,
0926     pthreadpool_task_4d_t function,
0927     void* context,
0928     size_t range_i,
0929     size_t range_j,
0930     size_t range_k,
0931     size_t range_l,
0932     uint32_t flags);
0933
0934 /**
0935  * Process items on a 4D grid with the specified maximum tile size along the
0936  * last grid dimension.
0937  *
0938  * The function implements a parallel version of the following snippet:
0939  *
0940  *   for (size_t i = 0; i < range_i; i++)
0941  *     for (size_t j = 0; j < range_j; j++)
0942  *       for (size_t k = 0; k < range_k; k++)
0943  *         for (size_t l = 0; l < range_l; l += tile_l)
0944  *           function(context, i, j, k, l, min(range_l - l, tile_l));
0945  *
0946  * When the function returns, all items have been processed and the thread pool
0947  * is ready for a new task.
0948  *
0949  * @note If multiple threads call this function with the same thread pool, the
0950  *    calls are serialized.
0951  *
0952  * @param threadpool  the thread pool to use for parallelisation. If threadpool
0953  *    is NULL, all items are processed serially on the calling thread.
0954  * @param function    the function to call for each tile.
0955  * @param context     the first argument passed to the specified function.
0956  * @param range_i     the number of items to process along the first dimension
0957  *    of the 4D grid.
0958  * @param range_j     the number of items to process along the second dimension
0959  *    of the 4D grid.
0960  * @param range_k     the number of items to process along the third dimension
0961  *    of the 4D grid.
0962  * @param range_l     the number of items to process along the fourth dimension
0963  *    of the 4D grid.
0964  * @param tile_l      the maximum number of items along the fourth dimension of
0965  *    the 4D grid to process in one function call.
0966  * @param flags       a bitwise combination of zero or more optional flags
0967  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
0968  */
0969 void pthreadpool_parallelize_4d_tile_1d(
0970     pthreadpool_t threadpool,
0971     pthreadpool_task_4d_tile_1d_t function,
0972     void* context,
0973     size_t range_i,
0974     size_t range_j,
0975     size_t range_k,
0976     size_t range_l,
0977     size_t tile_l,
0978     uint32_t flags);
0979
0980 /**
0981  * Process items on a 4D grid with the specified maximum tile size along the
0982  * last two grid dimensions.
0983  *
0984  * The function implements a parallel version of the following snippet:
0985  *
0986  *   for (size_t i = 0; i < range_i; i++)
0987  *     for (size_t j = 0; j < range_j; j++)
0988  *       for (size_t k = 0; k < range_k; k += tile_k)
0989  *         for (size_t l = 0; l < range_l; l += tile_l)
0990  *           function(context, i, j, k, l,
0991  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
0992  *
0993  * When the function returns, all items have been processed and the thread pool
0994  * is ready for a new task.
0995  *
0996  * @note If multiple threads call this function with the same thread pool, the
0997  *    calls are serialized.
0998  *
0999  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1000  *    is NULL, all items are processed serially on the calling thread.
1001  * @param function    the function to call for each tile.
1002  * @param context     the first argument passed to the specified function.
1003  * @param range_i     the number of items to process along the first dimension
1004  *    of the 4D grid.
1005  * @param range_j     the number of items to process along the second dimension
1006  *    of the 4D grid.
1007  * @param range_k     the number of items to process along the third dimension
1008  *    of the 4D grid.
1009  * @param range_l     the number of items to process along the fourth dimension
1010  *    of the 4D grid.
1011  * @param tile_k      the maximum number of items along the third dimension of
1012  *    the 4D grid to process in one function call.
1013  * @param tile_l      the maximum number of items along the fourth dimension of
1014  *    the 4D grid to process in one function call.
1015  * @param flags       a bitwise combination of zero or more optional flags
1016  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
1017  */
1018 void pthreadpool_parallelize_4d_tile_2d(
1019     pthreadpool_t threadpool,
1020     pthreadpool_task_4d_tile_2d_t function,
1021     void* context,
1022     size_t range_i,
1023     size_t range_j,
1024     size_t range_k,
1025     size_t range_l,
1026     size_t tile_k,
1027     size_t tile_l,
1028     uint32_t flags);
1029
1030 /**
1031  * Process items on a 4D grid with the specified maximum tile size along the
1032  * last two grid dimensions using a microarchitecture-aware task function.
1033  *
1034  * The function implements a parallel version of the following snippet:
1035  *
1036  *   uint32_t uarch_index = cpuinfo_initialize() ?
1037  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
1038  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
1039  *   for (size_t i = 0; i < range_i; i++)
1040  *     for (size_t j = 0; j < range_j; j++)
1041  *       for (size_t k = 0; k < range_k; k += tile_k)
1042  *         for (size_t l = 0; l < range_l; l += tile_l)
1043  *           function(context, uarch_index, i, j, k, l,
1044  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
1045  *
1046  * When the function returns, all items have been processed and the thread pool
1047  * is ready for a new task.
1048  *
1049  * @note If multiple threads call this function with the same thread pool, the
1050  *    calls are serialized.
1051  *
1052  * @param threadpool           the thread pool to use for parallelisation. If
1053  *    threadpool is NULL, all items are processed serially on the calling
1054  *    thread.
1055  * @param function             the function to call for each tile.
1056  * @param context              the first argument passed to the specified
1057  *    function.
1058  * @param default_uarch_index  the microarchitecture index to use when
1059  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
1060  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
1061  *    max_uarch_index value.
1062  * @param max_uarch_index      the maximum microarchitecture index expected by
1063  *    the specified function. If the index returned by
1064  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
1065  *    will be used instead. default_uarch_index can exceed max_uarch_index.
1066  * @param range_i              the number of items to process along the first
1067  *    dimension of the 4D grid.
1068  * @param range_j              the number of items to process along the second
1069  *    dimension of the 4D grid.
1070  * @param range_k              the number of items to process along the third
1071  *    dimension of the 4D grid.
1072  * @param range_l              the number of items to process along the fourth
1073  *    dimension of the 4D grid.
1074  * @param tile_k               the maximum number of items along the third
1075  *    dimension of the 4D grid to process in one function call.
1076  * @param tile_l               the maximum number of items along the fourth
1077  *    dimension of the 4D grid to process in one function call.
1078  * @param flags                a bitwise combination of zero or more optional
1079  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
1080  *    PTHREADPOOL_FLAG_YIELD_WORKERS).
1081  */
1082 void pthreadpool_parallelize_4d_tile_2d_with_uarch(
1083     pthreadpool_t threadpool,
1084     pthreadpool_task_4d_tile_2d_with_id_t function,
1085     void* context,
1086     uint32_t default_uarch_index,
1087     uint32_t max_uarch_index,
1088     size_t range_i,
1089     size_t range_j,
1090     size_t range_k,
1091     size_t range_l,
1092     size_t tile_k,
1093     size_t tile_l,
1094     uint32_t flags);
1095
1096 /**
1097  * Process items on a 5D grid.
1098  *
1099  * The function implements a parallel version of the following snippet:
1100  *
1101  *   for (size_t i = 0; i < range_i; i++)
1102  *     for (size_t j = 0; j < range_j; j++)
1103  *       for (size_t k = 0; k < range_k; k++)
1104  *         for (size_t l = 0; l < range_l; l++)
1105  *           for (size_t m = 0; m < range_m; m++)
1106  *             function(context, i, j, k, l, m);
1107  *
1108  * When the function returns, all items have been processed and the thread pool
1109  * is ready for a new task.
1110  *
1111  * @note If multiple threads call this function with the same thread pool, the
1112  *    calls are serialized.
1113  *
1114  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1115  *    is NULL, all items are processed serially on the calling thread.
1116  * @param function    the function to call for each item.
1117  * @param context     the first argument passed to the specified function.
1118  * @param range_i     the number of items to process along the first dimension
1119  *    of the 5D grid.
1120  * @param range_j     the number of items to process along the second dimension
1121  *    of the 5D grid.
1122  * @param range_k     the number of items to process along the third dimension
1123  *    of the 5D grid.
1124  * @param range_l     the number of items to process along the fourth dimension
1125  *    of the 5D grid.
1126  * @param range_m     the number of items to process along the fifth dimension
1127  *    of the 5D grid.
1128  * @param flags       a bitwise combination of zero or more optional flags
1129  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
1130  */
1131 void pthreadpool_parallelize_5d(
1132     pthreadpool_t threadpool,
1133     pthreadpool_task_5d_t function,
1134     void* context,
1135     size_t range_i,
1136     size_t range_j,
1137     size_t range_k,
1138     size_t range_l,
1139     size_t range_m,
1140     uint32_t flags);
1141
1142 /**
1143  * Process items on a 5D grid with the specified maximum tile size along the
1144  * last grid dimension.
1145  *
1146  * The function implements a parallel version of the following snippet:
1147  *
1148  *   for (size_t i = 0; i < range_i; i++)
1149  *     for (size_t j = 0; j < range_j; j++)
1150  *       for (size_t k = 0; k < range_k; k++)
1151  *         for (size_t l = 0; l < range_l; l++)
1152  *           for (size_t m = 0; m < range_m; m += tile_m)
1153  *             function(context, i, j, k, l, m, min(range_m - m, tile_m));
1154  *
1155  * When the function returns, all items have been processed and the thread pool
1156  * is ready for a new task.
1157  *
1158  * @note If multiple threads call this function with the same thread pool, the
1159  *    calls are serialized.
1160  *
1161  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1162  *    is NULL, all items are processed serially on the calling thread.
1163  * @param function    the function to call for each tile.
1164  * @param context     the first argument passed to the specified function.
1165  * @param range_i     the number of items to process along the first dimension
1166  *    of the 5D grid.
1167  * @param range_j     the number of items to process along the second dimension
1168  *    of the 5D grid.
1169  * @param range_k     the number of items to process along the third dimension
1170  *    of the 5D grid.
1171  * @param range_l     the number of items to process along the fourth dimension
1172  *    of the 5D grid.
1173  * @param range_m     the number of items to process along the fifth dimension
1174  *    of the 5D grid.
1175  * @param tile_m      the maximum number of items along the fifth dimension of
1176  *    the 5D grid to process in one function call.
1177  * @param flags       a bitwise combination of zero or more optional flags
1178  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
1179  */
1180 void pthreadpool_parallelize_5d_tile_1d(
1181     pthreadpool_t threadpool,
1182     pthreadpool_task_5d_tile_1d_t function,
1183     void* context,
1184     size_t range_i,
1185     size_t range_j,
1186     size_t range_k,
1187     size_t range_l,
1188     size_t range_m,
1189     size_t tile_m,
1190     uint32_t flags);
1191
1192 /**
1193  * Process items on a 5D grid with the specified maximum tile size along the
1194  * last two grid dimensions.
1195  *
1196  * The function implements a parallel version of the following snippet:
1197  *
1198  *   for (size_t i = 0; i < range_i; i++)
1199  *     for (size_t j = 0; j < range_j; j++)
1200  *       for (size_t k = 0; k < range_k; k++)
1201  *         for (size_t l = 0; l < range_l; l += tile_l)
1202  *           for (size_t m = 0; m < range_m; m += tile_m)
1203  *             function(context, i, j, k, l, m,
1204  *               min(range_l - l, tile_l), min(range_m - m, tile_m));
1205  *
1206  * When the function returns, all items have been processed and the thread pool
1207  * is ready for a new task.
1208  *
1209  * @note If multiple threads call this function with the same thread pool, the
1210  *    calls are serialized.
1211  *
1212  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1213  *    is NULL, all items are processed serially on the calling thread.
1214  * @param function    the function to call for each tile.
1215  * @param context     the first argument passed to the specified function.
1216  * @param range_i     the number of items to process along the first dimension
1217  *    of the 5D grid.
1218  * @param range_j     the number of items to process along the second dimension
1219  *    of the 5D grid.
1220  * @param range_k     the number of items to process along the third dimension
1221  *    of the 5D grid.
1222  * @param range_l     the number of items to process along the fourth dimension
1223  *    of the 5D grid.
1224  * @param range_m     the number of items to process along the fifth dimension
1225  *    of the 5D grid.
1226  * @param tile_l      the maximum number of items along the fourth dimension of
1227  *    the 5D grid to process in one function call.
1228  * @param tile_m      the maximum number of items along the fifth dimension of
1229  *    the 5D grid to process in one function call.
1230  * @param flags       a bitwise combination of zero or more optional flags
1231  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
1232  */
1233 void pthreadpool_parallelize_5d_tile_2d(
1234     pthreadpool_t threadpool,
1235     pthreadpool_task_5d_tile_2d_t function,
1236     void* context,
1237     size_t range_i,
1238     size_t range_j,
1239     size_t range_k,
1240     size_t range_l,
1241     size_t range_m,
1242     size_t tile_l,
1243     size_t tile_m,
1244     uint32_t flags);
1245
1246 /**
1247  * Process items on a 6D grid.
1248  *
1249  * The function implements a parallel version of the following snippet:
1250  *
1251  *   for (size_t i = 0; i < range_i; i++)
1252  *     for (size_t j = 0; j < range_j; j++)
1253  *       for (size_t k = 0; k < range_k; k++)
1254  *         for (size_t l = 0; l < range_l; l++)
1255  *           for (size_t m = 0; m < range_m; m++)
1256  *             for (size_t n = 0; n < range_n; n++)
1257  *               function(context, i, j, k, l, m, n);
1258  *
1259  * When the function returns, all items have been processed and the thread pool
1260  * is ready for a new task.
1261  *
1262  * @note If multiple threads call this function with the same thread pool, the
1263  *    calls are serialized.
1264  *
1265  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1266  *    is NULL, all items are processed serially on the calling thread.
1267  * @param function    the function to call for each item.
1268  * @param context     the first argument passed to the specified function.
1269  * @param range_i     the number of items to process along the first dimension
1270  *    of the 6D grid.
1271  * @param range_j     the number of items to process along the second dimension
1272  *    of the 6D grid.
1273  * @param range_k     the number of items to process along the third dimension
1274  *    of the 6D grid.
1275  * @param range_l     the number of items to process along the fourth dimension
1276  *    of the 6D grid.
1277  * @param range_m     the number of items to process along the fifth dimension
1278  *    of the 6D grid.
1279  * @param range_n     the number of items to process along the sixth dimension
1280  *    of the 6D grid.
1283  * @param flags       a bitwise combination of zero or more optional flags
1284  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
1285  */
1286 void pthreadpool_parallelize_6d(
1287     pthreadpool_t threadpool,
1288     pthreadpool_task_6d_t function,
1289     void* context,
1290     size_t range_i,
1291     size_t range_j,
1292     size_t range_k,
1293     size_t range_l,
1294     size_t range_m,
1295     size_t range_n,
1296     uint32_t flags);
1297
1298 /**
1299  * Process items on a 6D grid with the specified maximum tile size along the
1300  * last grid dimension.
1301  *
1302  * The function implements a parallel version of the following snippet:
1303  *
1304  *   for (size_t i = 0; i < range_i; i++)
1305  *     for (size_t j = 0; j < range_j; j++)
1306  *       for (size_t k = 0; k < range_k; k++)
1307  *         for (size_t l = 0; l < range_l; l++)
1308  *           for (size_t m = 0; m < range_m; m++)
1309  *             for (size_t n = 0; n < range_n; n += tile_n)
1310  *               function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
1311  *
1312  * When the function returns, all items have been processed and the thread pool
1313  * is ready for a new task.
1314  *
1315  * @note If multiple threads call this function with the same thread pool, the
1316  *    calls are serialized.
1317  *
1318  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1319  *    is NULL, all items are processed serially on the calling thread.
1320  * @param function    the function to call for each tile.
1321  * @param context     the first argument passed to the specified function.
1322  * @param range_i     the number of items to process along the first dimension
1323  *    of the 6D grid.
1324  * @param range_j     the number of items to process along the second dimension
1325  *    of the 6D grid.
1326  * @param range_k     the number of items to process along the third dimension
1327  *    of the 6D grid.
1328  * @param range_l     the number of items to process along the fourth dimension
1329  *    of the 6D grid.
1330  * @param range_m     the number of items to process along the fifth dimension
1331  *    of the 6D grid.
1332  * @param range_n     the number of items to process along the sixth dimension
1333  *    of the 6D grid.
1334  * @param tile_n      the maximum number of items along the sixth dimension of
1335  *    the 6D grid to process in one function call.
1336  * @param flags       a bitwise combination of zero or more optional flags
1337  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS).
1338  */
1339 void pthreadpool_parallelize_6d_tile_1d(
1340     pthreadpool_t threadpool,
1341     pthreadpool_task_6d_tile_1d_t function,
1342     void* context,
1343     size_t range_i,
1344     size_t range_j,
1345     size_t range_k,
1346     size_t range_l,
1347     size_t range_m,
1348     size_t range_n,
1349     size_t tile_n,
1350     uint32_t flags);
1351
1352 /**
1353  * Process items on a 6D grid with the specified maximum tile size along the
1354  * last two grid dimensions.
1355  *
1356  * The function implements a parallel version of the following snippet:
1357  *
1358  *   for (size_t i = 0; i < range_i; i++)
1359  *     for (size_t j = 0; j < range_j; j++)
1360  *       for (size_t k = 0; k < range_k; k++)
1361  *         for (size_t l = 0; l < range_l; l++)
1362  *           for (size_t m = 0; m < range_m; m += tile_m)
1363  *             for (size_t n = 0; n < range_n; n += tile_n)
1364  *               function(context, i, j, k, l, m, n,
1365  *                 min(range_m - m, tile_m), min(range_n - n, tile_n));
1366  *
1367  * When the function returns, all items have been processed and the thread pool
1368  * is ready for a new task.
1369  *
1370  * @note If multiple threads call this function with the same thread pool, the
1371  *    calls are serialized.
1372  *
1373  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1374  *    is NULL, all items are processed serially on the calling thread.
1375  * @param function    the function to call for each tile.
1376  * @param context     the first argument passed to the specified function.
1377  * @param range_i     the number of items to process along the first dimension
1378  *    of the 6D grid.
1379  * @param range_j     the number of items to process along the second dimension
1380  *    of the 6D grid.
1381  * @param range_k     the number of items to process along the third dimension
1382  *    of the 6D grid.
1383  * @param range_l     the number of items to process along the fourth dimension
1384  *    of the 6D grid.
1385  * @param range_m     the number of items to process along the fifth dimension
1386  *    of the 6D grid.
1387  * @param range_n     the number of items to process along the sixth dimension
1388  *    of the 6D grid.
1389  * @param tile_m      the maximum number of items along the fifth dimension of
1390  *    the 6D grid to process in one function call.
1391  * @param tile_n      the maximum number of items along the sixth dimension of
1392  *    the 6D grid to process in one function call.
1393  * @param flags       a bitwise combination of zero or more optional flags
1394  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1395  */
1396 void pthreadpool_parallelize_6d_tile_2d(
1397     pthreadpool_t threadpool,
1398     pthreadpool_task_6d_tile_2d_t function,
1399     void* context,
1400     size_t range_i,
1401     size_t range_j,
1402     size_t range_k,
1403     size_t range_l,
1404     size_t range_m,
1405     size_t range_n,
1406     size_t tile_m,
1407     size_t tile_n,
1408     uint32_t flags);
1409
1410 /**
1411  * Terminates threads in the thread pool and releases associated resources.
1412  *
1413  * @warning  Accessing the thread pool after a call to this function constitutes
1414  *    undefined behaviour and may cause data corruption.
1415  *
1416  * @param[in,out]  threadpool  The thread pool to destroy.
1417  */
1418 void pthreadpool_destroy(pthreadpool_t threadpool);
1419
1420 #ifndef PTHREADPOOL_NO_DEPRECATED_API
1421
/*
 * Legacy API for compatibility with pre-existing users (e.g. NNPACK).
 *
 * PTHREADPOOL_DEPRECATED is placed after the parameter list of each legacy
 * declaration. It expands to the deprecation attribute when the compiler
 * defines __GNUC__ and to nothing otherwise.
 */
#ifdef __GNUC__
    #define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
#else
    #define PTHREADPOOL_DEPRECATED
#endif
1428
/*
 * Callback types accepted by the legacy pthreadpool_compute_* entry points.
 * Each takes an opaque void* followed by a fixed number of size_t arguments.
 * NOTE(review): the void* is presumably the `argument` given to the matching
 * pthreadpool_compute_* call - confirm against the implementation.
 */

/* void* plus 1 size_t argument. */
typedef void (*pthreadpool_function_1d_t)(void*, size_t);

/* void* plus 2 size_t arguments. */
typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);

/* void* plus 2 size_t arguments. */
typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);

/* void* plus 4 size_t arguments. */
typedef void (*pthreadpool_function_2d_tiled_t)(
    void*, size_t, size_t, size_t, size_t);

/* void* plus 6 size_t arguments. */
typedef void (*pthreadpool_function_3d_tiled_t)(
    void*, size_t, size_t, size_t, size_t, size_t, size_t);

/* void* plus 8 size_t arguments. */
typedef void (*pthreadpool_function_4d_tiled_t)(
    void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
1435
1436 void pthreadpool_compute_1d(
1437     pthreadpool_t threadpool,
1438     pthreadpool_function_1d_t function,
1439     void* argument,
1440     size_t range) PTHREADPOOL_DEPRECATED;
1441
1442 void pthreadpool_compute_1d_tiled(
1443     pthreadpool_t threadpool,
1444     pthreadpool_function_1d_tiled_t function,
1445     void* argument,
1446     size_t range,
1447     size_t tile) PTHREADPOOL_DEPRECATED;
1448
1449 void pthreadpool_compute_2d(
1450     pthreadpool_t threadpool,
1451     pthreadpool_function_2d_t function,
1452     void* argument,
1453     size_t range_i,
1454     size_t range_j) PTHREADPOOL_DEPRECATED;
1455
1456 void pthreadpool_compute_2d_tiled(
1457     pthreadpool_t threadpool,
1458     pthreadpool_function_2d_tiled_t function,
1459     void* argument,
1460     size_t range_i,
1461     size_t range_j,
1462     size_t tile_i,
1463     size_t tile_j) PTHREADPOOL_DEPRECATED;
1464
1465 void pthreadpool_compute_3d_tiled(
1466     pthreadpool_t threadpool,
1467     pthreadpool_function_3d_tiled_t function,
1468     void* argument,
1469     size_t range_i,
1470     size_t range_j,
1471     size_t range_k,
1472     size_t tile_i,
1473     size_t tile_j,
1474     size_t tile_k) PTHREADPOOL_DEPRECATED;
1475
1476 void pthreadpool_compute_4d_tiled(
1477     pthreadpool_t threadpool,
1478     pthreadpool_function_4d_tiled_t function,
1479     void* argument,
1480     size_t range_i,
1481     size_t range_j,
1482     size_t range_k,
1483     size_t range_l,
1484     size_t tile_i,
1485     size_t tile_j,
1486     size_t tile_k,
1487     size_t tile_l) PTHREADPOOL_DEPRECATED;
1488
1489 #endif /* PTHREADPOOL_NO_DEPRECATED_API */
1490
1491 #ifdef __cplusplus
1492 } /* extern "C" */
1493 #endif
1494
1495 #ifdef __cplusplus
1496
namespace libpthreadpool {
namespace detail {
namespace {

/*
 * Trampolines with C-compatible signatures for invoking a C++ functor.
 *
 * Each one receives the functor's address as an opaque void*, casts it back
 * to const T* and calls it with the remaining size_t arguments, unchanged and
 * in the same order.
 */

/* Item callback for a 1D grid. */
template<class T>
void call_wrapper_1d(void* functor, size_t i) {
    const T& target = *static_cast<const T*>(functor);
    target(i);
}

/* Tile callback for a 1D grid. */
template<class T>
void call_wrapper_1d_tile_1d(void* functor, size_t start_i, size_t tile_i) {
    const T& target = *static_cast<const T*>(functor);
    target(start_i, tile_i);
}

/* Item callback for a 2D grid. */
template<class T>
void call_wrapper_2d(void* functor, size_t i, size_t j) {
    const T& target = *static_cast<const T*>(functor);
    target(i, j);
}

/* 2D grid, tiled along the last dimension. */
template<class T>
void call_wrapper_2d_tile_1d(
    void* functor,
    size_t i, size_t start_j,
    size_t tile_j)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, start_j, tile_j);
}

/* 2D grid, tiled along both dimensions. */
template<class T>
void call_wrapper_2d_tile_2d(
    void* functor,
    size_t start_i, size_t start_j,
    size_t tile_i, size_t tile_j)
{
    const T& target = *static_cast<const T*>(functor);
    target(start_i, start_j, tile_i, tile_j);
}

/* Item callback for a 3D grid. */
template<class T>
void call_wrapper_3d(void* functor, size_t i, size_t j, size_t k) {
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k);
}

/* 3D grid, tiled along the last dimension. */
template<class T>
void call_wrapper_3d_tile_1d(
    void* functor,
    size_t i, size_t j, size_t start_k,
    size_t tile_k)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, j, start_k, tile_k);
}

/* 3D grid, tiled along the last two dimensions. */
template<class T>
void call_wrapper_3d_tile_2d(
    void* functor,
    size_t i, size_t start_j, size_t start_k,
    size_t tile_j, size_t tile_k)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, start_j, start_k, tile_j, tile_k);
}

/* Item callback for a 4D grid. */
template<class T>
void call_wrapper_4d(void* functor, size_t i, size_t j, size_t k, size_t l) {
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k, l);
}

/* 4D grid, tiled along the last dimension. */
template<class T>
void call_wrapper_4d_tile_1d(
    void* functor,
    size_t i, size_t j, size_t k, size_t start_l,
    size_t tile_l)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k, start_l, tile_l);
}

/* 4D grid, tiled along the last two dimensions. */
template<class T>
void call_wrapper_4d_tile_2d(
    void* functor,
    size_t i, size_t j, size_t start_k, size_t start_l,
    size_t tile_k, size_t tile_l)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, j, start_k, start_l, tile_k, tile_l);
}

/* Item callback for a 5D grid. */
template<class T>
void call_wrapper_5d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m) {
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k, l, m);
}

/* 5D grid, tiled along the last dimension. */
template<class T>
void call_wrapper_5d_tile_1d(
    void* functor,
    size_t i, size_t j, size_t k, size_t l, size_t start_m,
    size_t tile_m)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k, l, start_m, tile_m);
}

/* 5D grid, tiled along the last two dimensions. */
template<class T>
void call_wrapper_5d_tile_2d(
    void* functor,
    size_t i, size_t j, size_t k, size_t start_l, size_t start_m,
    size_t tile_l, size_t tile_m)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k, start_l, start_m, tile_l, tile_m);
}

/* Item callback for a 6D grid. */
template<class T>
void call_wrapper_6d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k, l, m, n);
}

/* 6D grid, tiled along the last dimension. */
template<class T>
void call_wrapper_6d_tile_1d(
    void* functor,
    size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n,
    size_t tile_n)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k, l, m, start_n, tile_n);
}

/* 6D grid, tiled along the last two dimensions. */
template<class T>
void call_wrapper_6d_tile_2d(
    void* functor,
    size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n,
    size_t tile_m, size_t tile_n)
{
    const T& target = *static_cast<const T*>(functor);
    target(i, j, k, l, start_m, start_n, tile_m, tile_n);
}

}  /* namespace */
}  /* namespace detail */
}  /* namespace libpthreadpool */
1618
1619 /**
1620  * Process items on a 1D grid.
1621  *
1622  * The function implements a parallel version of the following snippet:
1623  *
1624  *   for (size_t i = 0; i < range; i++)
1625  *     functor(i);
1626  *
1627  * When the function returns, all items have been processed and the thread pool
1628  * is ready for a new task.
1629  *
1630  * @note If multiple threads call this function with the same thread pool, the
1631  *    calls are serialized.
1632  *
1633  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1634  *    is NULL, all items are processed serially on the calling thread.
1635  * @param functor     the functor to call for each item.
1636  * @param range       the number of items on the 1D grid to process. The
1637  *    specified functor will be called once for each item.
1638  * @param flags       a bitwise combination of zero or more optional flags
1639  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1640  */
1641 template<class T>
1642 inline void pthreadpool_parallelize_1d(
1643     pthreadpool_t threadpool,
1644     const T& functor,
1645     size_t range,
1646     uint32_t flags = 0)
1647 {
1648     pthreadpool_parallelize_1d(
1649         threadpool,
1650         &libpthreadpool::detail::call_wrapper_1d<const T>,
1651         const_cast<void*>(static_cast<const void*>(&functor)),
1652         range,
1653         flags);
1654 }
1655
1656 /**
1657  * Process items on a 1D grid with specified maximum tile size.
1658  *
1659  * The function implements a parallel version of the following snippet:
1660  *
1661  *   for (size_t i = 0; i < range; i += tile)
1662  *     functor(i, min(range - i, tile));
1663  *
1664  * When the call returns, all items have been processed and the thread pool is
1665  * ready for a new task.
1666  *
1667  * @note If multiple threads call this function with the same thread pool,
1668  *    the calls are serialized.
1669  *
1670  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1671  *    is NULL, all items are processed serially on the calling thread.
1672  * @param functor     the functor to call for each tile.
1673  * @param range       the number of items on the 1D grid to process.
1674  * @param tile        the maximum number of items on the 1D grid to process in
1675  *    one functor call.
1676  * @param flags       a bitwise combination of zero or more optional flags
1677  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1678  */
1679 template<class T>
1680 inline void pthreadpool_parallelize_1d_tile_1d(
1681     pthreadpool_t threadpool,
1682     const T& functor,
1683     size_t range,
1684     size_t tile,
1685     uint32_t flags = 0)
1686 {
1687     pthreadpool_parallelize_1d_tile_1d(
1688         threadpool,
1689         &libpthreadpool::detail::call_wrapper_1d_tile_1d<const T>,
1690         const_cast<void*>(static_cast<const void*>(&functor)),
1691         range,
1692         tile,
1693         flags);
1694 }
1695
1696 /**
1697  * Process items on a 2D grid.
1698  *
1699  * The function implements a parallel version of the following snippet:
1700  *
1701  *   for (size_t i = 0; i < range_i; i++)
1702  *     for (size_t j = 0; j < range_j; j++)
1703  *       functor(i, j);
1704  *
1705  * When the function returns, all items have been processed and the thread pool
1706  * is ready for a new task.
1707  *
1708  * @note If multiple threads call this function with the same thread pool, the
1709  *    calls are serialized.
1710  *
1711  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1712  *    is NULL, all items are processed serially on the calling thread.
1713  * @param functor     the functor to call for each item.
1714  * @param range_i     the number of items to process along the first dimension
1715  *    of the 2D grid.
1716  * @param range_j     the number of items to process along the second dimension
1717  *    of the 2D grid.
1718  * @param flags       a bitwise combination of zero or more optional flags
1719  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1720  */
1721 template<class T>
1722 inline void pthreadpool_parallelize_2d(
1723     pthreadpool_t threadpool,
1724     const T& functor,
1725     size_t range_i,
1726     size_t range_j,
1727     uint32_t flags = 0)
1728 {
1729     pthreadpool_parallelize_2d(
1730         threadpool,
1731         &libpthreadpool::detail::call_wrapper_2d<const T>,
1732         const_cast<void*>(static_cast<const void*>(&functor)),
1733         range_i,
1734         range_j,
1735         flags);
1736 }
1737
1738 /**
1739  * Process items on a 2D grid with the specified maximum tile size along the
1740  * last grid dimension.
1741  *
1742  * The function implements a parallel version of the following snippet:
1743  *
1744  *   for (size_t i = 0; i < range_i; i++)
1745  *     for (size_t j = 0; j < range_j; j += tile_j)
1746  *       functor(i, j, min(range_j - j, tile_j));
1747  *
1748  * When the function returns, all items have been processed and the thread pool
1749  * is ready for a new task.
1750  *
1751  * @note If multiple threads call this function with the same thread pool, the
1752  *    calls are serialized.
1753  *
1754  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1755  *    is NULL, all items are processed serially on the calling thread.
1756  * @param functor     the functor to call for each tile.
1757  * @param range_i     the number of items to process along the first dimension
1758  *    of the 2D grid.
1759  * @param range_j     the number of items to process along the second dimension
1760  *    of the 2D grid.
1761  * @param tile_j      the maximum number of items along the second dimension of
1762  *    the 2D grid to process in one functor call.
1763  * @param flags       a bitwise combination of zero or more optional flags
1764  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1765  */
1766 template<class T>
1767 inline void pthreadpool_parallelize_2d_tile_1d(
1768     pthreadpool_t threadpool,
1769     const T& functor,
1770     size_t range_i,
1771     size_t range_j,
1772     size_t tile_j,
1773     uint32_t flags = 0)
1774 {
1775     pthreadpool_parallelize_2d_tile_1d(
1776         threadpool,
1777         &libpthreadpool::detail::call_wrapper_2d_tile_1d<const T>,
1778         const_cast<void*>(static_cast<const void*>(&functor)),
1779         range_i,
1780         range_j,
1781         tile_j,
1782         flags);
1783 }
1784
1785 /**
1786  * Process items on a 2D grid with the specified maximum tile size along each
1787  * grid dimension.
1788  *
1789  * The function implements a parallel version of the following snippet:
1790  *
1791  *   for (size_t i = 0; i < range_i; i += tile_i)
1792  *     for (size_t j = 0; j < range_j; j += tile_j)
1793  *       functor(i, j,
1794  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
1795  *
1796  * When the function returns, all items have been processed and the thread pool
1797  * is ready for a new task.
1798  *
1799  * @note If multiple threads call this function with the same thread pool, the
1800  *    calls are serialized.
1801  *
1802  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1803  *    is NULL, all items are processed serially on the calling thread.
1804  * @param functor     the functor to call for each tile.
1805  * @param range_i     the number of items to process along the first dimension
1806  *    of the 2D grid.
1807  * @param range_j     the number of items to process along the second dimension
1808  *    of the 2D grid.
1809  * @param tile_j      the maximum number of items along the first dimension of
1810  *    the 2D grid to process in one functor call.
1811  * @param tile_j      the maximum number of items along the second dimension of
1812  *    the 2D grid to process in one functor call.
1813  * @param flags       a bitwise combination of zero or more optional flags
1814  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1815  */
1816 template<class T>
1817 inline void pthreadpool_parallelize_2d_tile_2d(
1818     pthreadpool_t threadpool,
1819     const T& functor,
1820     size_t range_i,
1821     size_t range_j,
1822     size_t tile_i,
1823     size_t tile_j,
1824     uint32_t flags = 0)
1825 {
1826     pthreadpool_parallelize_2d_tile_2d(
1827         threadpool,
1828         &libpthreadpool::detail::call_wrapper_2d_tile_2d<const T>,
1829         const_cast<void*>(static_cast<const void*>(&functor)),
1830         range_i,
1831         range_j,
1832         tile_i,
1833         tile_j,
1834         flags);
1835 }
1836
1837 /**
1838  * Process items on a 3D grid.
1839  *
1840  * The function implements a parallel version of the following snippet:
1841  *
1842  *   for (size_t i = 0; i < range_i; i++)
1843  *     for (size_t j = 0; j < range_j; j++)
1844  *       for (size_t k = 0; k < range_k; k++)
1845  *         functor(i, j, k);
1846  *
1847  * When the function returns, all items have been processed and the thread pool
1848  * is ready for a new task.
1849  *
1850  * @note If multiple threads call this function with the same thread pool, the
1851  *    calls are serialized.
1852  *
1853  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1854  *    is NULL, all items are processed serially on the calling thread.
1855  * @param functor     the functor to call for each tile.
1856  * @param range_i     the number of items to process along the first dimension
1857  *    of the 3D grid.
1858  * @param range_j     the number of items to process along the second dimension
1859  *    of the 3D grid.
1860  * @param range_k     the number of items to process along the third dimension
1861  *    of the 3D grid.
1862  * @param flags       a bitwise combination of zero or more optional flags
1863  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1864  */
1865 template<class T>
1866 inline void pthreadpool_parallelize_3d(
1867     pthreadpool_t threadpool,
1868     const T& functor,
1869     size_t range_i,
1870     size_t range_j,
1871     size_t range_k,
1872     uint32_t flags = 0)
1873 {
1874     pthreadpool_parallelize_3d(
1875         threadpool,
1876         &libpthreadpool::detail::call_wrapper_3d<const T>,
1877         const_cast<void*>(static_cast<const void*>(&functor)),
1878         range_i,
1879         range_j,
1880         range_k,
1881         flags);
1882 }
1883
1884 /**
1885  * Process items on a 3D grid with the specified maximum tile size along the
1886  * last grid dimension.
1887  *
1888  * The function implements a parallel version of the following snippet:
1889  *
1890  *   for (size_t i = 0; i < range_i; i++)
1891  *     for (size_t j = 0; j < range_j; j++)
1892  *       for (size_t k = 0; k < range_k; k += tile_k)
1893  *         functor(i, j, k, min(range_k - k, tile_k));
1894  *
1895  * When the function returns, all items have been processed and the thread pool
1896  * is ready for a new task.
1897  *
1898  * @note If multiple threads call this function with the same thread pool, the
1899  *    calls are serialized.
1900  *
1901  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1902  *    is NULL, all items are processed serially on the calling thread.
1903  * @param functor     the functor to call for each tile.
1904  * @param range_i     the number of items to process along the first dimension
1905  *    of the 3D grid.
1906  * @param range_j     the number of items to process along the second dimension
1907  *    of the 3D grid.
1908  * @param range_k     the number of items to process along the third dimension
1909  *    of the 3D grid.
1910  * @param tile_k      the maximum number of items along the third dimension of
1911  *    the 3D grid to process in one functor call.
1912  * @param flags       a bitwise combination of zero or more optional flags
1913  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1914  */
1915 template<class T>
1916 inline void pthreadpool_parallelize_3d_tile_1d(
1917     pthreadpool_t threadpool,
1918     const T& functor,
1919     size_t range_i,
1920     size_t range_j,
1921     size_t range_k,
1922     size_t tile_k,
1923     uint32_t flags = 0)
1924 {
1925     pthreadpool_parallelize_3d_tile_1d(
1926         threadpool,
1927         &libpthreadpool::detail::call_wrapper_3d_tile_1d<const T>,
1928         const_cast<void*>(static_cast<const void*>(&functor)),
1929         range_i,
1930         range_j,
1931         range_k,
1932         tile_k,
1933         flags);
1934 }
1935
1936 /**
1937  * Process items on a 3D grid with the specified maximum tile size along the
1938  * last two grid dimensions.
1939  *
1940  * The function implements a parallel version of the following snippet:
1941  *
1942  *   for (size_t i = 0; i < range_i; i++)
1943  *     for (size_t j = 0; j < range_j; j += tile_j)
1944  *       for (size_t k = 0; k < range_k; k += tile_k)
1945  *         functor(i, j, k,
1946  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
1947  *
1948  * When the function returns, all items have been processed and the thread pool
1949  * is ready for a new task.
1950  *
1951  * @note If multiple threads call this function with the same thread pool, the
1952  *    calls are serialized.
1953  *
1954  * @param threadpool  the thread pool to use for parallelisation. If threadpool
1955  *    is NULL, all items are processed serially on the calling thread.
1956  * @param functor     the functor to call for each tile.
1957  * @param range_i     the number of items to process along the first dimension
1958  *    of the 3D grid.
1959  * @param range_j     the number of items to process along the second dimension
1960  *    of the 3D grid.
1961  * @param range_k     the number of items to process along the third dimension
1962  *    of the 3D grid.
1963  * @param tile_j      the maximum number of items along the second dimension of
1964  *    the 3D grid to process in one functor call.
1965  * @param tile_k      the maximum number of items along the third dimension of
1966  *    the 3D grid to process in one functor call.
1967  * @param flags       a bitwise combination of zero or more optional flags
1968  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
1969  */
1970 template<class T>
1971 inline void pthreadpool_parallelize_3d_tile_2d(
1972     pthreadpool_t threadpool,
1973     const T& functor,
1974     size_t range_i,
1975     size_t range_j,
1976     size_t range_k,
1977     size_t tile_j,
1978     size_t tile_k,
1979     uint32_t flags = 0)
1980 {
1981     pthreadpool_parallelize_3d_tile_2d(
1982         threadpool,
1983         &libpthreadpool::detail::call_wrapper_3d_tile_2d<const T>,
1984         const_cast<void*>(static_cast<const void*>(&functor)),
1985         range_i,
1986         range_j,
1987         range_k,
1988         tile_j,
1989         tile_k,
1990         flags);
1991 }
1992
1993 /**
1994  * Process items on a 4D grid.
1995  *
1996  * The function implements a parallel version of the following snippet:
1997  *
1998  *   for (size_t i = 0; i < range_i; i++)
1999  *     for (size_t j = 0; j < range_j; j++)
2000  *       for (size_t k = 0; k < range_k; k++)
2001  *         for (size_t l = 0; l < range_l; l++)
2002  *           functor(i, j, k, l);
2003  *
2004  * When the function returns, all items have been processed and the thread pool
2005  * is ready for a new task.
2006  *
2007  * @note If multiple threads call this function with the same thread pool, the
2008  *    calls are serialized.
2009  *
2010  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2011  *    is NULL, all items are processed serially on the calling thread.
2012  * @param functor     the functor to call for each tile.
2013  * @param range_i     the number of items to process along the first dimension
2014  *    of the 4D grid.
2015  * @param range_j     the number of items to process along the second dimension
2016  *    of the 4D grid.
2017  * @param range_k     the number of items to process along the third dimension
2018  *    of the 4D grid.
2019  * @param range_l     the number of items to process along the fourth dimension
2020  *    of the 4D grid.
2021  * @param flags       a bitwise combination of zero or more optional flags
2022  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2023  */
2024 template<class T>
2025 inline void pthreadpool_parallelize_4d(
2026     pthreadpool_t threadpool,
2027     const T& functor,
2028     size_t range_i,
2029     size_t range_j,
2030     size_t range_k,
2031     size_t range_l,
2032     uint32_t flags = 0)
2033 {
2034     pthreadpool_parallelize_4d(
2035         threadpool,
2036         &libpthreadpool::detail::call_wrapper_4d<const T>,
2037         const_cast<void*>(static_cast<const void*>(&functor)),
2038         range_i,
2039         range_j,
2040         range_k,
2041         range_l,
2042         flags);
2043 }
2044
2045 /**
2046  * Process items on a 4D grid with the specified maximum tile size along the
2047  * last grid dimension.
2048  *
2049  * The function implements a parallel version of the following snippet:
2050  *
2051  *   for (size_t i = 0; i < range_i; i++)
2052  *     for (size_t j = 0; j < range_j; j++)
2053  *       for (size_t k = 0; k < range_k; k++)
2054  *         for (size_t l = 0; l < range_l; l += tile_l)
2055  *           functor(i, j, k, l, min(range_l - l, tile_l));
2056  *
2057  * When the function returns, all items have been processed and the thread pool
2058  * is ready for a new task.
2059  *
2060  * @note If multiple threads call this function with the same thread pool, the
2061  *    calls are serialized.
2062  *
2063  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2064  *    is NULL, all items are processed serially on the calling thread.
2065  * @param functor     the functor to call for each tile.
2066  * @param range_i     the number of items to process along the first dimension
2067  *    of the 4D grid.
2068  * @param range_j     the number of items to process along the second dimension
2069  *    of the 4D grid.
2070  * @param range_k     the number of items to process along the third dimension
2071  *    of the 4D grid.
2072  * @param range_l     the number of items to process along the fourth dimension
2073  *    of the 4D grid.
2074  * @param tile_l      the maximum number of items along the fourth dimension of
2075  *    the 4D grid to process in one functor call.
2076  * @param flags       a bitwise combination of zero or more optional flags
2077  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2078  */
2079 template<class T>
2080 inline void pthreadpool_parallelize_4d_tile_1d(
2081     pthreadpool_t threadpool,
2082     const T& functor,
2083     size_t range_i,
2084     size_t range_j,
2085     size_t range_k,
2086     size_t range_l,
2087     size_t tile_l,
2088     uint32_t flags = 0)
2089 {
2090     pthreadpool_parallelize_4d_tile_1d(
2091         threadpool,
2092         &libpthreadpool::detail::call_wrapper_4d_tile_1d<const T>,
2093         const_cast<void*>(static_cast<const void*>(&functor)),
2094         range_i,
2095         range_j,
2096         range_k,
2097         range_l,
2098         tile_l,
2099         flags);
2100 }
2101
2102 /**
2103  * Process items on a 4D grid with the specified maximum tile size along the
2104  * last two grid dimensions.
2105  *
2106  * The function implements a parallel version of the following snippet:
2107  *
2108  *   for (size_t i = 0; i < range_i; i++)
2109  *     for (size_t j = 0; j < range_j; j++)
2110  *       for (size_t k = 0; k < range_k; k += tile_k)
2111  *         for (size_t l = 0; l < range_l; l += tile_l)
2112  *           functor(i, j, k, l,
2113  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
2114  *
2115  * When the function returns, all items have been processed and the thread pool
2116  * is ready for a new task.
2117  *
2118  * @note If multiple threads call this function with the same thread pool, the
2119  *    calls are serialized.
2120  *
2121  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2122  *    is NULL, all items are processed serially on the calling thread.
2123  * @param functor     the functor to call for each tile.
2124  * @param range_i     the number of items to process along the first dimension
2125  *    of the 4D grid.
2126  * @param range_j     the number of items to process along the second dimension
2127  *    of the 4D grid.
2128  * @param range_k     the number of items to process along the third dimension
2129  *    of the 4D grid.
2130  * @param range_l     the number of items to process along the fourth dimension
2131  *    of the 4D grid.
2132  * @param tile_k      the maximum number of items along the third dimension of
2133  *    the 4D grid to process in one functor call.
2134  * @param tile_l      the maximum number of items along the fourth dimension of
2135  *    the 4D grid to process in one functor call.
2136  * @param flags       a bitwise combination of zero or more optional flags
2137  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2138  */
2139 template<class T>
2140 inline void pthreadpool_parallelize_4d_tile_2d(
2141     pthreadpool_t threadpool,
2142     const T& functor,
2143     size_t range_i,
2144     size_t range_j,
2145     size_t range_k,
2146     size_t range_l,
2147     size_t tile_k,
2148     size_t tile_l,
2149     uint32_t flags = 0)
2150 {
2151     pthreadpool_parallelize_4d_tile_2d(
2152         threadpool,
2153         &libpthreadpool::detail::call_wrapper_4d_tile_2d<const T>,
2154         const_cast<void*>(static_cast<const void*>(&functor)),
2155         range_i,
2156         range_j,
2157         range_k,
2158         range_l,
2159         tile_k,
2160         tile_l,
2161         flags);
2162 }
2163
2164 /**
2165  * Process items on a 5D grid.
2166  *
2167  * The function implements a parallel version of the following snippet:
2168  *
2169  *   for (size_t i = 0; i < range_i; i++)
2170  *     for (size_t j = 0; j < range_j; j++)
2171  *       for (size_t k = 0; k < range_k; k++)
2172  *         for (size_t l = 0; l < range_l; l++)
2173  *           for (size_t m = 0; m < range_m; m++)
2174  *             functor(i, j, k, l, m);
2175  *
2176  * When the function returns, all items have been processed and the thread pool
2177  * is ready for a new task.
2178  *
2179  * @note If multiple threads call this function with the same thread pool, the
2180  *    calls are serialized.
2181  *
2182  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2183  *    is NULL, all items are processed serially on the calling thread.
 * @param functor     the functor to call for each item.
2185  * @param range_i     the number of items to process along the first dimension
2186  *    of the 5D grid.
2187  * @param range_j     the number of items to process along the second dimension
2188  *    of the 5D grid.
2189  * @param range_k     the number of items to process along the third dimension
2190  *    of the 5D grid.
2191  * @param range_l     the number of items to process along the fourth dimension
2192  *    of the 5D grid.
2193  * @param range_m     the number of items to process along the fifth dimension
2194  *    of the 5D grid.
2195  * @param flags       a bitwise combination of zero or more optional flags
2196  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2197  */
2198 template<class T>
2199 inline void pthreadpool_parallelize_5d(
2200     pthreadpool_t threadpool,
2201     const T& functor,
2202     size_t range_i,
2203     size_t range_j,
2204     size_t range_k,
2205     size_t range_l,
2206     size_t range_m,
2207     uint32_t flags = 0)
2208 {
2209     pthreadpool_parallelize_5d(
2210         threadpool,
2211         &libpthreadpool::detail::call_wrapper_5d<const T>,
2212         const_cast<void*>(static_cast<const void*>(&functor)),
2213         range_i,
2214         range_j,
2215         range_k,
2216         range_l,
2217         range_m,
2218         flags);
2219 }
2220
2221 /**
2222  * Process items on a 5D grid with the specified maximum tile size along the
2223  * last grid dimension.
2224  *
2225  * The function implements a parallel version of the following snippet:
2226  *
2227  *   for (size_t i = 0; i < range_i; i++)
2228  *     for (size_t j = 0; j < range_j; j++)
2229  *       for (size_t k = 0; k < range_k; k++)
2230  *         for (size_t l = 0; l < range_l; l++)
2231  *           for (size_t m = 0; m < range_m; m += tile_m)
2232  *             functor(i, j, k, l, m, min(range_m - m, tile_m));
2233  *
2234  * When the function returns, all items have been processed and the thread pool
2235  * is ready for a new task.
2236  *
2237  * @note If multiple threads call this function with the same thread pool, the
2238  *    calls are serialized.
2239  *
2240  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2241  *    is NULL, all items are processed serially on the calling thread.
2242  * @param functor     the functor to call for each tile.
2243  * @param range_i     the number of items to process along the first dimension
2244  *    of the 5D grid.
2245  * @param range_j     the number of items to process along the second dimension
2246  *    of the 5D grid.
2247  * @param range_k     the number of items to process along the third dimension
2248  *    of the 5D grid.
2249  * @param range_l     the number of items to process along the fourth dimension
2250  *    of the 5D grid.
2251  * @param range_m     the number of items to process along the fifth dimension
2252  *    of the 5D grid.
2253  * @param tile_m      the maximum number of items along the fifth dimension of
2254  *    the 5D grid to process in one functor call.
2255  * @param flags       a bitwise combination of zero or more optional flags
2256  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2257  */
2258 template<class T>
2259 inline void pthreadpool_parallelize_5d_tile_1d(
2260     pthreadpool_t threadpool,
2261     const T& functor,
2262     size_t range_i,
2263     size_t range_j,
2264     size_t range_k,
2265     size_t range_l,
2266     size_t range_m,
2267     size_t tile_m,
2268     uint32_t flags = 0)
2269 {
2270     pthreadpool_parallelize_5d_tile_1d(
2271         threadpool,
2272         &libpthreadpool::detail::call_wrapper_5d_tile_1d<const T>,
2273         const_cast<void*>(static_cast<const void*>(&functor)),
2274         range_i,
2275         range_j,
2276         range_k,
2277         range_l,
2278         range_m,
2279         tile_m,
2280         flags);
2281 }
2282
2283 /**
2284  * Process items on a 5D grid with the specified maximum tile size along the
2285  * last two grid dimensions.
2286  *
2287  * The function implements a parallel version of the following snippet:
2288  *
2289  *   for (size_t i = 0; i < range_i; i++)
2290  *     for (size_t j = 0; j < range_j; j++)
2291  *       for (size_t k = 0; k < range_k; k++)
2292  *         for (size_t l = 0; l < range_l; l += tile_l)
2293  *           for (size_t m = 0; m < range_m; m += tile_m)
2294  *             functor(i, j, k, l, m,
2295  *               min(range_l - l, tile_l), min(range_m - m, tile_m));
2296  *
2297  * When the function returns, all items have been processed and the thread pool
2298  * is ready for a new task.
2299  *
2300  * @note If multiple threads call this function with the same thread pool, the
2301  *    calls are serialized.
2302  *
2303  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2304  *    is NULL, all items are processed serially on the calling thread.
2305  * @param functor     the functor to call for each tile.
2306  * @param range_i     the number of items to process along the first dimension
2307  *    of the 5D grid.
2308  * @param range_j     the number of items to process along the second dimension
2309  *    of the 5D grid.
2310  * @param range_k     the number of items to process along the third dimension
2311  *    of the 5D grid.
2312  * @param range_l     the number of items to process along the fourth dimension
2313  *    of the 5D grid.
2314  * @param range_m     the number of items to process along the fifth dimension
2315  *    of the 5D grid.
2316  * @param tile_l      the maximum number of items along the fourth dimension of
2317  *    the 5D grid to process in one functor call.
2318  * @param tile_m      the maximum number of items along the fifth dimension of
2319  *    the 5D grid to process in one functor call.
2320  * @param flags       a bitwise combination of zero or more optional flags
2321  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2322  */
2323 template<class T>
2324 inline void pthreadpool_parallelize_5d_tile_2d(
2325     pthreadpool_t threadpool,
2326     const T& functor,
2327     size_t range_i,
2328     size_t range_j,
2329     size_t range_k,
2330     size_t range_l,
2331     size_t range_m,
2332     size_t tile_l,
2333     size_t tile_m,
2334     uint32_t flags = 0)
2335 {
2336     pthreadpool_parallelize_5d_tile_2d(
2337         threadpool,
2338         &libpthreadpool::detail::call_wrapper_5d_tile_2d<const T>,
2339         const_cast<void*>(static_cast<const void*>(&functor)),
2340         range_i,
2341         range_j,
2342         range_k,
2343         range_l,
2344         range_m,
2345         tile_l,
2346         tile_m,
2347         flags);
2348 }
2349
2350 /**
2351  * Process items on a 6D grid.
2352  *
2353  * The function implements a parallel version of the following snippet:
2354  *
2355  *   for (size_t i = 0; i < range_i; i++)
2356  *     for (size_t j = 0; j < range_j; j++)
2357  *       for (size_t k = 0; k < range_k; k++)
2358  *         for (size_t l = 0; l < range_l; l++)
2359  *           for (size_t m = 0; m < range_m; m++)
2360  *             for (size_t n = 0; n < range_n; n++)
2361  *               functor(i, j, k, l, m, n);
2362  *
2363  * When the function returns, all items have been processed and the thread pool
2364  * is ready for a new task.
2365  *
2366  * @note If multiple threads call this function with the same thread pool, the
2367  *    calls are serialized.
2368  *
2369  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2370  *    is NULL, all items are processed serially on the calling thread.
 * @param functor     the functor to call for each item.
2372  * @param range_i     the number of items to process along the first dimension
2373  *    of the 6D grid.
2374  * @param range_j     the number of items to process along the second dimension
2375  *    of the 6D grid.
2376  * @param range_k     the number of items to process along the third dimension
2377  *    of the 6D grid.
2378  * @param range_l     the number of items to process along the fourth dimension
2379  *    of the 6D grid.
2380  * @param range_m     the number of items to process along the fifth dimension
2381  *    of the 6D grid.
 * @param range_n     the number of items to process along the sixth dimension
 *    of the 6D grid.
2386  * @param flags       a bitwise combination of zero or more optional flags
2387  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2388  */
2389 template<class T>
2390 inline void pthreadpool_parallelize_6d(
2391     pthreadpool_t threadpool,
2392     const T& functor,
2393     size_t range_i,
2394     size_t range_j,
2395     size_t range_k,
2396     size_t range_l,
2397     size_t range_m,
2398     size_t range_n,
2399     uint32_t flags = 0)
2400 {
2401     pthreadpool_parallelize_6d(
2402         threadpool,
2403         &libpthreadpool::detail::call_wrapper_6d<const T>,
2404         const_cast<void*>(static_cast<const void*>(&functor)),
2405         range_i,
2406         range_j,
2407         range_k,
2408         range_l,
2409         range_m,
2410         range_n,
2411         flags);
2412 }
2413
2414 /**
2415  * Process items on a 6D grid with the specified maximum tile size along the
2416  * last grid dimension.
2417  *
2418  * The function implements a parallel version of the following snippet:
2419  *
2420  *   for (size_t i = 0; i < range_i; i++)
2421  *     for (size_t j = 0; j < range_j; j++)
2422  *       for (size_t k = 0; k < range_k; k++)
2423  *         for (size_t l = 0; l < range_l; l++)
2424  *           for (size_t m = 0; m < range_m; m++)
2425  *             for (size_t n = 0; n < range_n; n += tile_n)
2426  *               functor(i, j, k, l, m, n, min(range_n - n, tile_n));
2427  *
2428  * When the function returns, all items have been processed and the thread pool
2429  * is ready for a new task.
2430  *
2431  * @note If multiple threads call this function with the same thread pool, the
2432  *    calls are serialized.
2433  *
2434  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2435  *    is NULL, all items are processed serially on the calling thread.
2436  * @param functor     the functor to call for each tile.
2437  * @param range_i     the number of items to process along the first dimension
2438  *    of the 6D grid.
2439  * @param range_j     the number of items to process along the second dimension
2440  *    of the 6D grid.
2441  * @param range_k     the number of items to process along the third dimension
2442  *    of the 6D grid.
2443  * @param range_l     the number of items to process along the fourth dimension
2444  *    of the 6D grid.
2445  * @param range_m     the number of items to process along the fifth dimension
2446  *    of the 6D grid.
2447  * @param range_n     the number of items to process along the sixth dimension
2448  *    of the 6D grid.
2449  * @param tile_n      the maximum number of items along the sixth dimension of
2450  *    the 6D grid to process in one functor call.
2451  * @param flags       a bitwise combination of zero or more optional flags
2452  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2453  */
2454 template<class T>
2455 inline void pthreadpool_parallelize_6d_tile_1d(
2456     pthreadpool_t threadpool,
2457     const T& functor,
2458     size_t range_i,
2459     size_t range_j,
2460     size_t range_k,
2461     size_t range_l,
2462     size_t range_m,
2463     size_t range_n,
2464     size_t tile_n,
2465     uint32_t flags = 0)
2466 {
2467     pthreadpool_parallelize_6d_tile_1d(
2468         threadpool,
2469         &libpthreadpool::detail::call_wrapper_6d_tile_1d<const T>,
2470         const_cast<void*>(static_cast<const void*>(&functor)),
2471         range_i,
2472         range_j,
2473         range_k,
2474         range_l,
2475         range_m,
2476         range_n,
2477         tile_n,
2478         flags);
2479 }
2480
2481 /**
2482  * Process items on a 6D grid with the specified maximum tile size along the
2483  * last two grid dimensions.
2484  *
2485  * The function implements a parallel version of the following snippet:
2486  *
2487  *   for (size_t i = 0; i < range_i; i++)
2488  *     for (size_t j = 0; j < range_j; j++)
2489  *       for (size_t k = 0; k < range_k; k++)
2490  *         for (size_t l = 0; l < range_l; l++)
2491  *           for (size_t m = 0; m < range_m; m += tile_m)
2492  *             for (size_t n = 0; n < range_n; n += tile_n)
2493  *               functor(i, j, k, l, m, n,
2494  *                 min(range_m - m, tile_m), min(range_n - n, tile_n));
2495  *
2496  * When the function returns, all items have been processed and the thread pool
2497  * is ready for a new task.
2498  *
2499  * @note If multiple threads call this function with the same thread pool, the
2500  *    calls are serialized.
2501  *
2502  * @param threadpool  the thread pool to use for parallelisation. If threadpool
2503  *    is NULL, all items are processed serially on the calling thread.
2504  * @param functor     the functor to call for each tile.
2505  * @param range_i     the number of items to process along the first dimension
2506  *    of the 6D grid.
2507  * @param range_j     the number of items to process along the second dimension
2508  *    of the 6D grid.
2509  * @param range_k     the number of items to process along the third dimension
2510  *    of the 6D grid.
2511  * @param range_l     the number of items to process along the fourth dimension
2512  *    of the 6D grid.
2513  * @param range_m     the number of items to process along the fifth dimension
2514  *    of the 6D grid.
2515  * @param range_n     the number of items to process along the sixth dimension
2516  *    of the 6D grid.
2517  * @param tile_m      the maximum number of items along the fifth dimension of
2518  *    the 6D grid to process in one functor call.
2519  * @param tile_n      the maximum number of items along the sixth dimension of
2520  *    the 6D grid to process in one functor call.
2521  * @param flags       a bitwise combination of zero or more optional flags
2522  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
2523  */
2524 template<class T>
2525 inline void pthreadpool_parallelize_6d_tile_2d(
2526     pthreadpool_t threadpool,
2527     const T& functor,
2528     size_t range_i,
2529     size_t range_j,
2530     size_t range_k,
2531     size_t range_l,
2532     size_t range_m,
2533     size_t range_n,
2534     size_t tile_m,
2535     size_t tile_n,
2536     uint32_t flags = 0)
2537 {
2538     pthreadpool_parallelize_6d_tile_2d(
2539         threadpool,
2540         &libpthreadpool::detail::call_wrapper_6d_tile_2d<const T>,
2541         const_cast<void*>(static_cast<const void*>(&functor)),
2542         range_i,
2543         range_j,
2544         range_k,
2545         range_l,
2546         range_m,
2547         range_n,
2548         tile_m,
2549         tile_n,
2550         flags);
2551 }
2552
2553 #endif  /* __cplusplus */
2554
2555 #endif /* PTHREADPOOL_H_ */