|
||||
File indexing completed on 2025-01-17 09:55:58
0001 #ifndef PTHREADPOOL_H_ 0002 #define PTHREADPOOL_H_ 0003 0004 #include <stddef.h> 0005 #include <stdint.h> 0006 0007 typedef struct pthreadpool* pthreadpool_t; 0008 0009 typedef void (*pthreadpool_task_1d_t)(void*, size_t); 0010 typedef void (*pthreadpool_task_1d_with_thread_t)(void*, size_t, size_t); 0011 typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); 0012 typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); 0013 typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t, size_t); 0014 typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); 0015 typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); 0016 typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); 0017 typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); 0018 typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t, size_t, size_t, size_t); 0019 typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t); 0020 typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t); 0021 typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t); 0022 typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); 0023 typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t); 0024 typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); 0025 typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); 0026 typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); 0027 typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); 0028 typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, 
size_t, size_t, size_t); 0029 0030 typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t); 0031 typedef void (*pthreadpool_task_2d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t); 0032 typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); 0033 typedef void (*pthreadpool_task_3d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); 0034 typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); 0035 typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t); 0036 0037 typedef void (*pthreadpool_task_2d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t); 0038 typedef void (*pthreadpool_task_3d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); 0039 0040 0041 /** 0042 * Disable support for denormalized numbers to the maximum extent possible for 0043 * the duration of the computation. 0044 * 0045 * Handling denormalized floating-point numbers is often implemented in 0046 * microcode, and incurs significant performance degradation. This hint 0047 * instructs the thread pool to disable support for denormalized numbers before 0048 * running the computation by manipulating architecture-specific control 0049 * registers, and restore the initial value of control registers after the 0050 * computation is complete. The thread pool temporarily disables denormalized 0051 * numbers on all threads involved in the computation (i.e. the caller threads, 0052 * and potentially worker threads). 0053 * 0054 * Disabling denormalized numbers may have a small negative effect on results' 0055 * accuracy. As various architectures differ in capabilities to control 0056 * processing of denormalized numbers, using this flag may also hurt results' 0057 * reproducibility across different instruction set architectures. 
0058 */ 0059 #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001 0060 0061 /** 0062 * Yield worker threads to the system scheduler after the operation is finished. 0063 * 0064 * Force workers to use kernel wait (instead of active spin-wait by default) for 0065 * new commands after this command is processed. This flag affects only the 0066 * immediate next operation on this thread pool. To make the thread pool always 0067 * use kernel wait, pass this flag to all parallelization functions. 0068 */ 0069 #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002 0070 0071 #ifdef __cplusplus 0072 extern "C" { 0073 #endif 0074 0075 /** 0076 * Create a thread pool with the specified number of threads. 0077 * 0078 * @param threads_count the number of threads in the thread pool. 0079 * A value of 0 has special interpretation: it creates a thread pool with as 0080 * many threads as there are logical processors in the system. 0081 * 0082 * @returns A pointer to an opaque thread pool object if the call is 0083 * successful, or NULL pointer if the call failed. 0084 */ 0085 pthreadpool_t pthreadpool_create(size_t threads_count); 0086 0087 /** 0088 * Query the number of threads in a thread pool. 0089 * 0090 * @param threadpool the thread pool to query. 0091 * 0092 * @returns The number of threads in the thread pool. 0093 */ 0094 size_t pthreadpool_get_threads_count(pthreadpool_t threadpool); 0095 0096 /** 0097 * Process items on a 1D grid. 0098 * 0099 * The function implements a parallel version of the following snippet: 0100 * 0101 * for (size_t i = 0; i < range; i++) 0102 * function(context, i); 0103 * 0104 * When the function returns, all items have been processed and the thread pool 0105 * is ready for a new task. 0106 * 0107 * @note If multiple threads call this function with the same thread pool, the 0108 * calls are serialized. 0109 * 0110 * @param threadpool the thread pool to use for parallelisation. 
If threadpool 0111 * is NULL, all items are processed serially on the calling thread. 0112 * @param function the function to call for each item. 0113 * @param context the first argument passed to the specified function. 0114 * @param range the number of items on the 1D grid to process. The 0115 * specified function will be called once for each item. 0116 * @param flags a bitwise combination of zero or more optional flags 0117 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0118 */ 0119 void pthreadpool_parallelize_1d( 0120 pthreadpool_t threadpool, 0121 pthreadpool_task_1d_t function, 0122 void* context, 0123 size_t range, 0124 uint32_t flags); 0125 0126 /** 0127 * Process items on a 1D grid passing along the current thread id. 0128 * 0129 * The function implements a parallel version of the following snippet: 0130 * 0131 * for (size_t i = 0; i < range; i++) 0132 * function(context, thread_index, i); 0133 * 0134 * When the function returns, all items have been processed and the thread pool 0135 * is ready for a new task. 0136 * 0137 * @note If multiple threads call this function with the same thread pool, the 0138 * calls are serialized. 0139 * 0140 * @param threadpool the thread pool to use for parallelisation. If threadpool 0141 * is NULL, all items are processed serially on the calling thread. 0142 * @param function the function to call for each item. 0143 * @param context the first argument passed to the specified function. 0144 * @param range the number of items on the 1D grid to process. The 0145 * specified function will be called once for each item. 
0146 * @param flags a bitwise combination of zero or more optional flags 0147 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0148 */ 0149 void pthreadpool_parallelize_1d_with_thread( 0150 pthreadpool_t threadpool, 0151 pthreadpool_task_1d_with_thread_t function, 0152 void* context, 0153 size_t range, 0154 uint32_t flags); 0155 0156 /** 0157 * Process items on a 1D grid using a microarchitecture-aware task function. 0158 * 0159 * The function implements a parallel version of the following snippet: 0160 * 0161 * uint32_t uarch_index = cpuinfo_initialize() ? 0162 * cpuinfo_get_current_uarch_index() : default_uarch_index; 0163 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 0164 * for (size_t i = 0; i < range; i++) 0165 * function(context, uarch_index, i); 0166 * 0167 * When the function returns, all items have been processed and the thread pool 0168 * is ready for a new task. 0169 * 0170 * @note If multiple threads call this function with the same thread pool, the 0171 * calls are serialized. 0172 * 0173 * @param threadpool the thread pool to use for parallelisation. If 0174 * threadpool is NULL, all items are processed serially on the calling 0175 * thread. 0176 * @param function the function to call for each item. 0177 * @param context the first argument passed to the specified 0178 * function. 0179 * @param default_uarch_index the microarchitecture index to use when 0180 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 0181 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 0182 * max_uarch_index value. 0183 * @param max_uarch_index the maximum microarchitecture index expected by 0184 * the specified function. If the index returned by 0185 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 0186 * will be used instead. default_uarch_index can exceed max_uarch_index. 0187 * @param range the number of items on the 1D grid to process. 
0188 * The specified function will be called once for each item. 0189 * @param flags a bitwise combination of zero or more optional 0190 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 0191 * PTHREADPOOL_FLAG_YIELD_WORKERS) 0192 */ 0193 void pthreadpool_parallelize_1d_with_uarch( 0194 pthreadpool_t threadpool, 0195 pthreadpool_task_1d_with_id_t function, 0196 void* context, 0197 uint32_t default_uarch_index, 0198 uint32_t max_uarch_index, 0199 size_t range, 0200 uint32_t flags); 0201 0202 /** 0203 * Process items on a 1D grid with specified maximum tile size. 0204 * 0205 * The function implements a parallel version of the following snippet: 0206 * 0207 * for (size_t i = 0; i < range; i += tile) 0208 * function(context, i, min(range - i, tile)); 0209 * 0210 * When the call returns, all items have been processed and the thread pool is 0211 * ready for a new task. 0212 * 0213 * @note If multiple threads call this function with the same thread pool, 0214 * the calls are serialized. 0215 * 0216 * @param threadpool the thread pool to use for parallelisation. If threadpool 0217 * is NULL, all items are processed serially on the calling thread. 0218 * @param function the function to call for each tile. 0219 * @param context the first argument passed to the specified function. 0220 * @param range the number of items on the 1D grid to process. 0221 * @param tile the maximum number of items on the 1D grid to process in 0222 * one function call. 0223 * @param flags a bitwise combination of zero or more optional flags 0224 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0225 */ 0226 void pthreadpool_parallelize_1d_tile_1d( 0227 pthreadpool_t threadpool, 0228 pthreadpool_task_1d_tile_1d_t function, 0229 void* context, 0230 size_t range, 0231 size_t tile, 0232 uint32_t flags); 0233 0234 /** 0235 * Process items on a 2D grid. 
0236 * 0237 * The function implements a parallel version of the following snippet: 0238 * 0239 * for (size_t i = 0; i < range_i; i++) 0240 * for (size_t j = 0; j < range_j; j++) 0241 * function(context, i, j); 0242 * 0243 * When the function returns, all items have been processed and the thread pool 0244 * is ready for a new task. 0245 * 0246 * @note If multiple threads call this function with the same thread pool, the 0247 * calls are serialized. 0248 * 0249 * @param threadpool the thread pool to use for parallelisation. If threadpool 0250 * is NULL, all items are processed serially on the calling thread. 0251 * @param function the function to call for each item. 0252 * @param context the first argument passed to the specified function. 0253 * @param range_i the number of items to process along the first dimension 0254 * of the 2D grid. 0255 * @param range_j the number of items to process along the second dimension 0256 * of the 2D grid. 0257 * @param flags a bitwise combination of zero or more optional flags 0258 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0259 */ 0260 void pthreadpool_parallelize_2d( 0261 pthreadpool_t threadpool, 0262 pthreadpool_task_2d_t function, 0263 void* context, 0264 size_t range_i, 0265 size_t range_j, 0266 uint32_t flags); 0267 0268 /** 0269 * Process items on a 2D grid passing along the current thread id. 0270 * 0271 * The function implements a parallel version of the following snippet: 0272 * 0273 * for (size_t i = 0; i < range_i; i++) 0274 * for (size_t j = 0; j < range_j; j++) 0275 * function(context, thread_index, i, j); 0276 * 0277 * When the function returns, all items have been processed and the thread pool 0278 * is ready for a new task. 0279 * 0280 * @note If multiple threads call this function with the same thread pool, the 0281 * calls are serialized. 0282 * 0283 * @param threadpool the thread pool to use for parallelisation. 
If threadpool 0284 * is NULL, all items are processed serially on the calling thread. 0285 * @param function the function to call for each item. 0286 * @param context the first argument passed to the specified function. 0287 * @param range_i the number of items to process along the first dimension 0288 * of the 2D grid. 0289 * @param range_j the number of items to process along the second dimension 0290 * of the 2D grid. 0291 * @param flags a bitwise combination of zero or more optional flags 0292 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0293 */ 0294 void pthreadpool_parallelize_2d_with_thread( 0295 pthreadpool_t threadpool, 0296 pthreadpool_task_2d_with_thread_t function, 0297 void* context, 0298 size_t range_i, 0299 size_t range_j, 0300 uint32_t flags); 0301 0302 /** 0303 * Process items on a 2D grid with the specified maximum tile size along the 0304 * last grid dimension. 0305 * 0306 * The function implements a parallel version of the following snippet: 0307 * 0308 * for (size_t i = 0; i < range_i; i++) 0309 * for (size_t j = 0; j < range_j; j += tile_j) 0310 * function(context, i, j, min(range_j - j, tile_j)); 0311 * 0312 * When the function returns, all items have been processed and the thread pool 0313 * is ready for a new task. 0314 * 0315 * @note If multiple threads call this function with the same thread pool, the 0316 * calls are serialized. 0317 * 0318 * @param threadpool the thread pool to use for parallelisation. If threadpool 0319 * is NULL, all items are processed serially on the calling thread. 0320 * @param function the function to call for each tile. 0321 * @param context the first argument passed to the specified function. 0322 * @param range_i the number of items to process along the first dimension 0323 * of the 2D grid. 0324 * @param range_j the number of items to process along the second dimension 0325 * of the 2D grid. 
0326 * @param tile_j the maximum number of items along the second dimension of 0327 * the 2D grid to process in one function call. 0328 * @param flags a bitwise combination of zero or more optional flags 0329 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0330 */ 0331 void pthreadpool_parallelize_2d_tile_1d( 0332 pthreadpool_t threadpool, 0333 pthreadpool_task_2d_tile_1d_t function, 0334 void* context, 0335 size_t range_i, 0336 size_t range_j, 0337 size_t tile_j, 0338 uint32_t flags); 0339 0340 /** 0341 * Process items on a 2D grid with the specified maximum tile size along the 0342 * last grid dimension using a microarchitecture-aware task function. 0343 * 0344 * The function implements a parallel version of the following snippet: 0345 * 0346 * uint32_t uarch_index = cpuinfo_initialize() ? 0347 * cpuinfo_get_current_uarch_index() : default_uarch_index; 0348 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 0349 * for (size_t i = 0; i < range_i; i++) 0350 * for (size_t j = 0; j < range_j; j += tile_j) 0351 * function(context, uarch_index, i, j, min(range_j - j, tile_j)); 0352 * 0353 * When the function returns, all items have been processed and the thread pool 0354 * is ready for a new task. 0355 * 0356 * @note If multiple threads call this function with the same thread pool, the 0357 * calls are serialized. 0358 * 0359 * @param threadpool the thread pool to use for parallelisation. If threadpool 0360 * is NULL, all items are processed serially on the calling thread. 0361 * @param function the function to call for each tile. 0362 * @param context the first argument passed to the specified function. 0363 * @param default_uarch_index the microarchitecture index to use when 0364 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 0365 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 0366 * max_uarch_index value. 
0367 * @param max_uarch_index the maximum microarchitecture index expected by 0368 * the specified function. If the index returned by 0369 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 0370 * will be used instead. default_uarch_index can exceed max_uarch_index. 0371 * @param range_i the number of items to process along the first dimension 0372 * of the 2D grid. 0373 * @param range_j the number of items to process along the second dimension 0374 * of the 2D grid. 0375 * @param tile_j the maximum number of items along the second dimension of 0376 * the 2D grid to process in one function call. 0377 * @param flags a bitwise combination of zero or more optional flags 0378 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0379 */ 0380 void pthreadpool_parallelize_2d_tile_1d_with_uarch( 0381 pthreadpool_t threadpool, 0382 pthreadpool_task_2d_tile_1d_with_id_t function, 0383 void* context, 0384 uint32_t default_uarch_index, 0385 uint32_t max_uarch_index, 0386 size_t range_i, 0387 size_t range_j, 0388 size_t tile_j, 0389 uint32_t flags); 0390 0391 /** 0392 * Process items on a 2D grid with the specified maximum tile size along the 0393 * last grid dimension using a microarchitecture-aware task function and passing 0394 * along the current thread id. 0395 * 0396 * The function implements a parallel version of the following snippet: 0397 * 0398 * uint32_t uarch_index = cpuinfo_initialize() ? 0399 * cpuinfo_get_current_uarch_index() : default_uarch_index; 0400 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 0401 * for (size_t i = 0; i < range_i; i++) 0402 * for (size_t j = 0; j < range_j; j += tile_j) 0403 * function(context, uarch_index, thread_index, i, j, min(range_j - j, tile_j)); 0404 * 0405 * When the function returns, all items have been processed and the thread pool 0406 * is ready for a new task. 
0407 * 0408 * @note If multiple threads call this function with the same thread pool, the 0409 * calls are serialized. 0410 * 0411 * @param threadpool the thread pool to use for parallelisation. If threadpool 0412 * is NULL, all items are processed serially on the calling thread. 0413 * @param function the function to call for each tile. 0414 * @param context the first argument passed to the specified function. 0415 * @param default_uarch_index the microarchitecture index to use when 0416 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 0417 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 0418 * max_uarch_index value. 0419 * @param max_uarch_index the maximum microarchitecture index expected by 0420 * the specified function. If the index returned by 0421 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 0422 * will be used instead. default_uarch_index can exceed max_uarch_index. 0423 * @param range_i the number of items to process along the first dimension 0424 * of the 2D grid. 0425 * @param range_j the number of items to process along the second dimension 0426 * of the 2D grid. 0427 * @param tile_j the maximum number of items along the second dimension of 0428 * the 2D grid to process in one function call. 0429 * @param flags a bitwise combination of zero or more optional flags 0430 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0431 */ 0432 void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( 0433 pthreadpool_t threadpool, 0434 pthreadpool_task_2d_tile_1d_with_id_with_thread_t function, 0435 void* context, 0436 uint32_t default_uarch_index, 0437 uint32_t max_uarch_index, 0438 size_t range_i, 0439 size_t range_j, 0440 size_t tile_j, 0441 uint32_t flags); 0442 0443 /** 0444 * Process items on a 2D grid with the specified maximum tile size along each 0445 * grid dimension. 
0446 * 0447 * The function implements a parallel version of the following snippet: 0448 * 0449 * for (size_t i = 0; i < range_i; i += tile_i) 0450 * for (size_t j = 0; j < range_j; j += tile_j) 0451 * function(context, i, j, 0452 * min(range_i - i, tile_i), min(range_j - j, tile_j)); 0453 * 0454 * When the function returns, all items have been processed and the thread pool 0455 * is ready for a new task. 0456 * 0457 * @note If multiple threads call this function with the same thread pool, the 0458 * calls are serialized. 0459 * 0460 * @param threadpool the thread pool to use for parallelisation. If threadpool 0461 * is NULL, all items are processed serially on the calling thread. 0462 * @param function the function to call for each tile. 0463 * @param context the first argument passed to the specified function. 0464 * @param range_i the number of items to process along the first dimension 0465 * of the 2D grid. 0466 * @param range_j the number of items to process along the second dimension 0467 * of the 2D grid. 0468 * @param tile_i the maximum number of items along the first dimension of 0469 * the 2D grid to process in one function call. 0470 * @param tile_j the maximum number of items along the second dimension of 0471 * the 2D grid to process in one function call. 0472 * @param flags a bitwise combination of zero or more optional flags 0473 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0474 */ 0475 void pthreadpool_parallelize_2d_tile_2d( 0476 pthreadpool_t threadpool, 0477 pthreadpool_task_2d_tile_2d_t function, 0478 void* context, 0479 size_t range_i, 0480 size_t range_j, 0481 size_t tile_i, 0482 size_t tile_j, 0483 uint32_t flags); 0484 0485 /** 0486 * Process items on a 2D grid with the specified maximum tile size along each 0487 * grid dimension using a microarchitecture-aware task function. 
0488 * 0489 * The function implements a parallel version of the following snippet: 0490 * 0491 * uint32_t uarch_index = cpuinfo_initialize() ? 0492 * cpuinfo_get_current_uarch_index() : default_uarch_index; 0493 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 0494 * for (size_t i = 0; i < range_i; i += tile_i) 0495 * for (size_t j = 0; j < range_j; j += tile_j) 0496 * function(context, uarch_index, i, j, 0497 * min(range_i - i, tile_i), min(range_j - j, tile_j)); 0498 * 0499 * When the function returns, all items have been processed and the thread pool 0500 * is ready for a new task. 0501 * 0502 * @note If multiple threads call this function with the same thread pool, the 0503 * calls are serialized. 0504 * 0505 * @param threadpool the thread pool to use for parallelisation. If 0506 * threadpool is NULL, all items are processed serially on the calling 0507 * thread. 0508 * @param function the function to call for each tile. 0509 * @param context the first argument passed to the specified 0510 * function. 0511 * @param default_uarch_index the microarchitecture index to use when 0512 * pthreadpool is configured without cpuinfo, 0513 * cpuinfo initialization failed, or index returned 0514 * by cpuinfo_get_current_uarch_index() exceeds 0515 * the max_uarch_index value. 0516 * @param max_uarch_index the maximum microarchitecture index expected 0517 * by the specified function. If the index returned 0518 * by cpuinfo_get_current_uarch_index() exceeds this 0519 * value, default_uarch_index will be used instead. 0520 * default_uarch_index can exceed max_uarch_index. 0521 * @param range_i the number of items to process along the first 0522 * dimension of the 2D grid. 0523 * @param range_j the number of items to process along the second 0524 * dimension of the 2D grid. 0525 * @param tile_i the maximum number of items along the first 0526 * dimension of the 2D grid to process in one function call. 
0527 * @param tile_j the maximum number of items along the second 0528 * dimension of the 2D grid to process in one function call. 0529 * @param flags a bitwise combination of zero or more optional 0530 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 0531 * PTHREADPOOL_FLAG_YIELD_WORKERS) 0532 */ 0533 void pthreadpool_parallelize_2d_tile_2d_with_uarch( 0534 pthreadpool_t threadpool, 0535 pthreadpool_task_2d_tile_2d_with_id_t function, 0536 void* context, 0537 uint32_t default_uarch_index, 0538 uint32_t max_uarch_index, 0539 size_t range_i, 0540 size_t range_j, 0541 size_t tile_i, 0542 size_t tile_j, 0543 uint32_t flags); 0544 0545 /** 0546 * Process items on a 3D grid. 0547 * 0548 * The function implements a parallel version of the following snippet: 0549 * 0550 * for (size_t i = 0; i < range_i; i++) 0551 * for (size_t j = 0; j < range_j; j++) 0552 * for (size_t k = 0; k < range_k; k++) 0553 * function(context, i, j, k); 0554 * 0555 * When the function returns, all items have been processed and the thread pool 0556 * is ready for a new task. 0557 * 0558 * @note If multiple threads call this function with the same thread pool, the 0559 * calls are serialized. 0560 * 0561 * @param threadpool the thread pool to use for parallelisation. If threadpool 0562 * is NULL, all items are processed serially on the calling thread. 0563 * @param function the function to call for each item. 0564 * @param context the first argument passed to the specified function. 0565 * @param range_i the number of items to process along the first dimension 0566 * of the 3D grid. 0567 * @param range_j the number of items to process along the second dimension 0568 * of the 3D grid. 0569 * @param range_k the number of items to process along the third dimension 0570 * of the 3D grid. 
0571 * @param flags a bitwise combination of zero or more optional flags 0572 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0573 */ 0574 void pthreadpool_parallelize_3d( 0575 pthreadpool_t threadpool, 0576 pthreadpool_task_3d_t function, 0577 void* context, 0578 size_t range_i, 0579 size_t range_j, 0580 size_t range_k, 0581 uint32_t flags); 0582 0583 /** 0584 * Process items on a 3D grid with the specified maximum tile size along the 0585 * last grid dimension. 0586 * 0587 * The function implements a parallel version of the following snippet: 0588 * 0589 * for (size_t i = 0; i < range_i; i++) 0590 * for (size_t j = 0; j < range_j; j++) 0591 * for (size_t k = 0; k < range_k; k += tile_k) 0592 * function(context, i, j, k, min(range_k - k, tile_k)); 0593 * 0594 * When the function returns, all items have been processed and the thread pool 0595 * is ready for a new task. 0596 * 0597 * @note If multiple threads call this function with the same thread pool, the 0598 * calls are serialized. 0599 * 0600 * @param threadpool the thread pool to use for parallelisation. If threadpool 0601 * is NULL, all items are processed serially on the calling thread. 0602 * @param function the function to call for each tile. 0603 * @param context the first argument passed to the specified function. 0604 * @param range_i the number of items to process along the first dimension 0605 * of the 3D grid. 0606 * @param range_j the number of items to process along the second dimension 0607 * of the 3D grid. 0608 * @param range_k the number of items to process along the third dimension 0609 * of the 3D grid. 0610 * @param tile_k the maximum number of items along the third dimension of 0611 * the 3D grid to process in one function call. 
0612 * @param flags a bitwise combination of zero or more optional flags 0613 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0614 */ 0615 void pthreadpool_parallelize_3d_tile_1d( 0616 pthreadpool_t threadpool, 0617 pthreadpool_task_3d_tile_1d_t function, 0618 void* context, 0619 size_t range_i, 0620 size_t range_j, 0621 size_t range_k, 0622 size_t tile_k, 0623 uint32_t flags); 0624 0625 /** 0626 * Process items on a 3D grid with the specified maximum tile size along the 0627 * last grid dimension and passing along the current thread id. 0628 * 0629 * The function implements a parallel version of the following snippet: 0630 * 0631 * for (size_t i = 0; i < range_i; i++) 0632 * for (size_t j = 0; j < range_j; j++) 0633 * for (size_t k = 0; k < range_k; k += tile_k) 0634 * function(context, thread_index, i, j, k, min(range_k - k, tile_k)); 0635 * 0636 * When the function returns, all items have been processed and the thread pool 0637 * is ready for a new task. 0638 * 0639 * @note If multiple threads call this function with the same thread pool, the 0640 * calls are serialized. 0641 * 0642 * @param threadpool the thread pool to use for parallelisation. If threadpool 0643 * is NULL, all items are processed serially on the calling thread. 0644 * @param function the function to call for each tile. 0645 * @param context the first argument passed to the specified function. 0646 * @param range_i the number of items to process along the first dimension 0647 * of the 3D grid. 0648 * @param range_j the number of items to process along the second dimension 0649 * of the 3D grid. 0650 * @param range_k the number of items to process along the third dimension 0651 * of the 3D grid. 0652 * @param tile_k the maximum number of items along the third dimension of 0653 * the 3D grid to process in one function call. 
0654 * @param flags a bitwise combination of zero or more optional flags 0655 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0656 */ 0657 void pthreadpool_parallelize_3d_tile_1d_with_thread( 0658 pthreadpool_t threadpool, 0659 pthreadpool_task_3d_tile_1d_with_thread_t function, 0660 void* context, 0661 size_t range_i, 0662 size_t range_j, 0663 size_t range_k, 0664 size_t tile_k, 0665 uint32_t flags); 0666 0667 /** 0668 * Process items on a 3D grid with the specified maximum tile size along the 0669 * last grid dimension using a microarchitecture-aware task function. 0670 * 0671 * The function implements a parallel version of the following snippet: 0672 * 0673 * uint32_t uarch_index = cpuinfo_initialize() ? 0674 * cpuinfo_get_current_uarch_index() : default_uarch_index; 0675 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 0676 * for (size_t i = 0; i < range_i; i++) 0677 * for (size_t j = 0; j < range_j; j++) 0678 * for (size_t k = 0; k < range_k; k += tile_k) 0679 * function(context, uarch_index, i, j, k, min(range_k - k, tile_k)); 0680 * 0681 * When the function returns, all items have been processed and the thread pool 0682 * is ready for a new task. 0683 * 0684 * @note If multiple threads call this function with the same thread pool, the 0685 * calls are serialized. 0686 * 0687 * @param threadpool the thread pool to use for parallelisation. If 0688 * threadpool is NULL, all items are processed serially on the calling 0689 * thread. 0690 * @param function the function to call for each tile. 0691 * @param context the first argument passed to the specified 0692 * function. 0693 * @param default_uarch_index the microarchitecture index to use when 0694 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 0695 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 0696 * max_uarch_index value. 
0697 * @param max_uarch_index the maximum microarchitecture index expected by 0698 * the specified function. If the index returned by 0699 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 0700 * will be used instead. default_uarch_index can exceed max_uarch_index. 0701 * @param range_i the number of items to process along the first 0702 * dimension of the 3D grid. 0703 * @param range_j the number of items to process along the second 0704 * dimension of the 3D grid. 0705 * @param range_k the number of items to process along the third 0706 * dimension of the 3D grid. 0707 * @param tile_k the maximum number of items along the third 0708 * dimension of the 3D grid to process in one function call. 0709 * @param flags a bitwise combination of zero or more optional 0710 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 0711 * PTHREADPOOL_FLAG_YIELD_WORKERS) 0712 */ 0713 void pthreadpool_parallelize_3d_tile_1d_with_uarch( 0714 pthreadpool_t threadpool, 0715 pthreadpool_task_3d_tile_1d_with_id_t function, 0716 void* context, 0717 uint32_t default_uarch_index, 0718 uint32_t max_uarch_index, 0719 size_t range_i, 0720 size_t range_j, 0721 size_t range_k, 0722 size_t tile_k, 0723 uint32_t flags); 0724 0725 /** 0726 * Process items on a 3D grid with the specified maximum tile size along the 0727 * last grid dimension using a microarchitecture-aware task function and passing 0728 * along the current thread id. 0729 * 0730 * The function implements a parallel version of the following snippet: 0731 * 0732 * uint32_t uarch_index = cpuinfo_initialize() ? 
0733 * cpuinfo_get_current_uarch_index() : default_uarch_index; 0734 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 0735 * for (size_t i = 0; i < range_i; i++) 0736 * for (size_t j = 0; j < range_j; j++) 0737 * for (size_t k = 0; k < range_k; k += tile_k) 0738 * function(context, uarch_index, thread_index, i, j, k, min(range_k - k, tile_k)); 0739 * 0740 * When the function returns, all items have been processed and the thread pool 0741 * is ready for a new task. 0742 * 0743 * @note If multiple threads call this function with the same thread pool, the 0744 * calls are serialized. 0745 * 0746 * @param threadpool the thread pool to use for parallelisation. If 0747 * threadpool is NULL, all items are processed serially on the calling 0748 * thread. 0749 * @param function the function to call for each tile. 0750 * @param context the first argument passed to the specified 0751 * function. 0752 * @param default_uarch_index the microarchitecture index to use when 0753 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 0754 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 0755 * max_uarch_index value. 0756 * @param max_uarch_index the maximum microarchitecture index expected by 0757 * the specified function. If the index returned by 0758 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 0759 * will be used instead. default_uarch_index can exceed max_uarch_index. 0760 * @param range_i the number of items to process along the first 0761 * dimension of the 3D grid. 0762 * @param range_j the number of items to process along the second 0763 * dimension of the 3D grid. 0764 * @param range_k the number of items to process along the third 0765 * dimension of the 3D grid. 0766 * @param tile_k the maximum number of items along the third 0767 * dimension of the 3D grid to process in one function call. 
0768 * @param flags a bitwise combination of zero or more optional 0769 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 0770 * PTHREADPOOL_FLAG_YIELD_WORKERS) 0771 */ 0772 void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( 0773 pthreadpool_t threadpool, 0774 pthreadpool_task_3d_tile_1d_with_id_with_thread_t function, 0775 void* context, 0776 uint32_t default_uarch_index, 0777 uint32_t max_uarch_index, 0778 size_t range_i, 0779 size_t range_j, 0780 size_t range_k, 0781 size_t tile_k, 0782 uint32_t flags); 0783 0784 /** 0785 * Process items on a 3D grid with the specified maximum tile size along the 0786 * last two grid dimensions. 0787 * 0788 * The function implements a parallel version of the following snippet: 0789 * 0790 * for (size_t i = 0; i < range_i; i++) 0791 * for (size_t j = 0; j < range_j; j += tile_j) 0792 * for (size_t k = 0; k < range_k; k += tile_k) 0793 * function(context, i, j, k, 0794 * min(range_j - j, tile_j), min(range_k - k, tile_k)); 0795 * 0796 * When the function returns, all items have been processed and the thread pool 0797 * is ready for a new task. 0798 * 0799 * @note If multiple threads call this function with the same thread pool, the 0800 * calls are serialized. 0801 * 0802 * @param threadpool the thread pool to use for parallelisation. If threadpool 0803 * is NULL, all items are processed serially on the calling thread. 0804 * @param function the function to call for each tile. 0805 * @param context the first argument passed to the specified function. 0806 * @param range_i the number of items to process along the first dimension 0807 * of the 3D grid. 0808 * @param range_j the number of items to process along the second dimension 0809 * of the 3D grid. 0810 * @param range_k the number of items to process along the third dimension 0811 * of the 3D grid. 0812 * @param tile_j the maximum number of items along the second dimension of 0813 * the 3D grid to process in one function call. 
0814 * @param tile_k the maximum number of items along the third dimension of 0815 * the 3D grid to process in one function call. 0816 * @param flags a bitwise combination of zero or more optional flags 0817 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0818 */ 0819 void pthreadpool_parallelize_3d_tile_2d( 0820 pthreadpool_t threadpool, 0821 pthreadpool_task_3d_tile_2d_t function, 0822 void* context, 0823 size_t range_i, 0824 size_t range_j, 0825 size_t range_k, 0826 size_t tile_j, 0827 size_t tile_k, 0828 uint32_t flags); 0829 0830 /** 0831 * Process items on a 3D grid with the specified maximum tile size along the 0832 * last two grid dimensions using a microarchitecture-aware task function. 0833 * 0834 * The function implements a parallel version of the following snippet: 0835 * 0836 * uint32_t uarch_index = cpuinfo_initialize() ? 0837 * cpuinfo_get_current_uarch_index() : default_uarch_index; 0838 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 0839 * for (size_t i = 0; i < range_i; i++) 0840 * for (size_t j = 0; j < range_j; j += tile_j) 0841 * for (size_t k = 0; k < range_k; k += tile_k) 0842 * function(context, uarch_index, i, j, k, 0843 * min(range_j - j, tile_j), min(range_k - k, tile_k)); 0844 * 0845 * When the function returns, all items have been processed and the thread pool 0846 * is ready for a new task. 0847 * 0848 * @note If multiple threads call this function with the same thread pool, the 0849 * calls are serialized. 0850 * 0851 * @param threadpool the thread pool to use for parallelisation. If 0852 * threadpool is NULL, all items are processed serially on the calling 0853 * thread. 0854 * @param function the function to call for each tile. 0855 * @param context the first argument passed to the specified 0856 * function. 
0857 * @param default_uarch_index the microarchitecture index to use when 0858 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 0859 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 0860 * max_uarch_index value. 0861 * @param max_uarch_index the maximum microarchitecture index expected by 0862 * the specified function. If the index returned by 0863 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 0864 * will be used instead. default_uarch_index can exceed max_uarch_index. 0865 * @param range_i the number of items to process along the first 0866 * dimension of the 3D grid. 0867 * @param range_j the number of items to process along the second 0868 * dimension of the 3D grid. 0869 * @param range_k the number of items to process along the third 0870 * dimension of the 3D grid. 0871 * @param tile_j the maximum number of items along the second 0872 * dimension of the 3D grid to process in one function call. 0873 * @param tile_k the maximum number of items along the third 0874 * dimension of the 3D grid to process in one function call. 0875 * @param flags a bitwise combination of zero or more optional 0876 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 0877 * PTHREADPOOL_FLAG_YIELD_WORKERS) 0878 */ 0879 void pthreadpool_parallelize_3d_tile_2d_with_uarch( 0880 pthreadpool_t threadpool, 0881 pthreadpool_task_3d_tile_2d_with_id_t function, 0882 void* context, 0883 uint32_t default_uarch_index, 0884 uint32_t max_uarch_index, 0885 size_t range_i, 0886 size_t range_j, 0887 size_t range_k, 0888 size_t tile_j, 0889 size_t tile_k, 0890 uint32_t flags); 0891 0892 /** 0893 * Process items on a 4D grid. 
0894 * 0895 * The function implements a parallel version of the following snippet: 0896 * 0897 * for (size_t i = 0; i < range_i; i++) 0898 * for (size_t j = 0; j < range_j; j++) 0899 * for (size_t k = 0; k < range_k; k++) 0900 * for (size_t l = 0; l < range_l; l++) 0901 * function(context, i, j, k, l); 0902 * 0903 * When the function returns, all items have been processed and the thread pool 0904 * is ready for a new task. 0905 * 0906 * @note If multiple threads call this function with the same thread pool, the 0907 * calls are serialized. 0908 * 0909 * @param threadpool the thread pool to use for parallelisation. If threadpool 0910 * is NULL, all items are processed serially on the calling thread. 0911 * @param function the function to call for each item. 0912 * @param context the first argument passed to the specified function. 0913 * @param range_i the number of items to process along the first dimension 0914 * of the 4D grid. 0915 * @param range_j the number of items to process along the second dimension 0916 * of the 4D grid. 0917 * @param range_k the number of items to process along the third dimension 0918 * of the 4D grid. 0919 * @param range_l the number of items to process along the fourth dimension 0920 * of the 4D grid. 0921 * @param flags a bitwise combination of zero or more optional flags 0922 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0923 */ 0924 void pthreadpool_parallelize_4d( 0925 pthreadpool_t threadpool, 0926 pthreadpool_task_4d_t function, 0927 void* context, 0928 size_t range_i, 0929 size_t range_j, 0930 size_t range_k, 0931 size_t range_l, 0932 uint32_t flags); 0933 0934 /** 0935 * Process items on a 4D grid with the specified maximum tile size along the 0936 * last grid dimension. 
0937 * 0938 * The function implements a parallel version of the following snippet: 0939 * 0940 * for (size_t i = 0; i < range_i; i++) 0941 * for (size_t j = 0; j < range_j; j++) 0942 * for (size_t k = 0; k < range_k; k++) 0943 * for (size_t l = 0; l < range_l; l += tile_l) 0944 * function(context, i, j, k, l, min(range_l - l, tile_l)); 0945 * 0946 * When the function returns, all items have been processed and the thread pool 0947 * is ready for a new task. 0948 * 0949 * @note If multiple threads call this function with the same thread pool, the 0950 * calls are serialized. 0951 * 0952 * @param threadpool the thread pool to use for parallelisation. If threadpool 0953 * is NULL, all items are processed serially on the calling thread. 0954 * @param function the function to call for each tile. 0955 * @param context the first argument passed to the specified function. 0956 * @param range_i the number of items to process along the first dimension 0957 * of the 4D grid. 0958 * @param range_j the number of items to process along the second dimension 0959 * of the 4D grid. 0960 * @param range_k the number of items to process along the third dimension 0961 * of the 4D grid. 0962 * @param range_l the number of items to process along the fourth dimension 0963 * of the 4D grid. 0964 * @param tile_l the maximum number of items along the fourth dimension of 0965 * the 4D grid to process in one function call. 0966 * @param flags a bitwise combination of zero or more optional flags 0967 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 0968 */ 0969 void pthreadpool_parallelize_4d_tile_1d( 0970 pthreadpool_t threadpool, 0971 pthreadpool_task_4d_tile_1d_t function, 0972 void* context, 0973 size_t range_i, 0974 size_t range_j, 0975 size_t range_k, 0976 size_t range_l, 0977 size_t tile_l, 0978 uint32_t flags); 0979 0980 /** 0981 * Process items on a 4D grid with the specified maximum tile size along the 0982 * last two grid dimensions. 
0983 * 0984 * The function implements a parallel version of the following snippet: 0985 * 0986 * for (size_t i = 0; i < range_i; i++) 0987 * for (size_t j = 0; j < range_j; j++) 0988 * for (size_t k = 0; k < range_k; k += tile_k) 0989 * for (size_t l = 0; l < range_l; l += tile_l) 0990 * function(context, i, j, k, l, 0991 * min(range_k - k, tile_k), min(range_l - l, tile_l)); 0992 * 0993 * When the function returns, all items have been processed and the thread pool 0994 * is ready for a new task. 0995 * 0996 * @note If multiple threads call this function with the same thread pool, the 0997 * calls are serialized. 0998 * 0999 * @param threadpool the thread pool to use for parallelisation. If threadpool 1000 * is NULL, all items are processed serially on the calling thread. 1001 * @param function the function to call for each tile. 1002 * @param context the first argument passed to the specified function. 1003 * @param range_i the number of items to process along the first dimension 1004 * of the 4D grid. 1005 * @param range_j the number of items to process along the second dimension 1006 * of the 4D grid. 1007 * @param range_k the number of items to process along the third dimension 1008 * of the 4D grid. 1009 * @param range_l the number of items to process along the fourth dimension 1010 * of the 4D grid. 1011 * @param tile_k the maximum number of items along the third dimension of 1012 * the 4D grid to process in one function call. 1013 * @param tile_l the maximum number of items along the fourth dimension of 1014 * the 4D grid to process in one function call. 
1015 * @param flags a bitwise combination of zero or more optional flags 1016 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1017 */ 1018 void pthreadpool_parallelize_4d_tile_2d( 1019 pthreadpool_t threadpool, 1020 pthreadpool_task_4d_tile_2d_t function, 1021 void* context, 1022 size_t range_i, 1023 size_t range_j, 1024 size_t range_k, 1025 size_t range_l, 1026 size_t tile_k, 1027 size_t tile_l, 1028 uint32_t flags); 1029 1030 /** 1031 * Process items on a 4D grid with the specified maximum tile size along the 1032 * last two grid dimensions using a microarchitecture-aware task function. 1033 * 1034 * The function implements a parallel version of the following snippet: 1035 * 1036 * uint32_t uarch_index = cpuinfo_initialize() ? 1037 * cpuinfo_get_current_uarch_index() : default_uarch_index; 1038 * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; 1039 * for (size_t i = 0; i < range_i; i++) 1040 * for (size_t j = 0; j < range_j; j++) 1041 * for (size_t k = 0; k < range_k; k += tile_k) 1042 * for (size_t l = 0; l < range_l; l += tile_l) 1043 * function(context, uarch_index, i, j, k, l, 1044 * min(range_k - k, tile_k), min(range_l - l, tile_l)); 1045 * 1046 * When the function returns, all items have been processed and the thread pool 1047 * is ready for a new task. 1048 * 1049 * @note If multiple threads call this function with the same thread pool, the 1050 * calls are serialized. 1051 * 1052 * @param threadpool the thread pool to use for parallelisation. If 1053 * threadpool is NULL, all items are processed serially on the calling 1054 * thread. 1055 * @param function the function to call for each tile. 1056 * @param context the first argument passed to the specified 1057 * function. 
1058 * @param default_uarch_index the microarchitecture index to use when 1059 * pthreadpool is configured without cpuinfo, cpuinfo initialization failed, 1060 * or index returned by cpuinfo_get_current_uarch_index() exceeds the 1061 * max_uarch_index value. 1062 * @param max_uarch_index the maximum microarchitecture index expected by 1063 * the specified function. If the index returned by 1064 * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index 1065 * will be used instead. default_uarch_index can exceed max_uarch_index. 1066 * @param range_i the number of items to process along the first 1067 * dimension of the 4D grid. 1068 * @param range_j the number of items to process along the second 1069 * dimension of the 4D grid. 1070 * @param range_k the number of items to process along the third 1071 * dimension of the 4D grid. 1072 * @param range_l the number of items to process along the fourth 1073 * dimension of the 4D grid. 1074 * @param tile_k the maximum number of items along the third 1075 * dimension of the 4D grid to process in one function call. 1076 * @param tile_l the maximum number of items along the fourth 1077 * dimension of the 4D grid to process in one function call. 1078 * @param flags a bitwise combination of zero or more optional 1079 * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or 1080 * PTHREADPOOL_FLAG_YIELD_WORKERS) 1081 */ 1082 void pthreadpool_parallelize_4d_tile_2d_with_uarch( 1083 pthreadpool_t threadpool, 1084 pthreadpool_task_4d_tile_2d_with_id_t function, 1085 void* context, 1086 uint32_t default_uarch_index, 1087 uint32_t max_uarch_index, 1088 size_t range_i, 1089 size_t range_j, 1090 size_t range_k, 1091 size_t range_l, 1092 size_t tile_k, 1093 size_t tile_l, 1094 uint32_t flags); 1095 1096 /** 1097 * Process items on a 5D grid. 
1098 * 1099 * The function implements a parallel version of the following snippet: 1100 * 1101 * for (size_t i = 0; i < range_i; i++) 1102 * for (size_t j = 0; j < range_j; j++) 1103 * for (size_t k = 0; k < range_k; k++) 1104 * for (size_t l = 0; l < range_l; l++) 1105 * for (size_t m = 0; m < range_m; m++) 1106 * function(context, i, j, k, l, m); 1107 * 1108 * When the function returns, all items have been processed and the thread pool 1109 * is ready for a new task. 1110 * 1111 * @note If multiple threads call this function with the same thread pool, the 1112 * calls are serialized. 1113 * 1114 * @param threadpool the thread pool to use for parallelisation. If threadpool 1115 * is NULL, all items are processed serially on the calling thread. 1116 * @param function the function to call for each item. 1117 * @param context the first argument passed to the specified function. 1118 * @param range_i the number of items to process along the first dimension 1119 * of the 5D grid. 1120 * @param range_j the number of items to process along the second dimension 1121 * of the 5D grid. 1122 * @param range_k the number of items to process along the third dimension 1123 * of the 5D grid. 1124 * @param range_l the number of items to process along the fourth dimension 1125 * of the 5D grid. 1126 * @param range_m the number of items to process along the fifth dimension 1127 * of the 5D grid. 1128 * @param flags a bitwise combination of zero or more optional flags 1129 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1130 */ 1131 void pthreadpool_parallelize_5d( 1132 pthreadpool_t threadpool, 1133 pthreadpool_task_5d_t function, 1134 void* context, 1135 size_t range_i, 1136 size_t range_j, 1137 size_t range_k, 1138 size_t range_l, 1139 size_t range_m, 1140 uint32_t flags); 1141 1142 /** 1143 * Process items on a 5D grid with the specified maximum tile size along the 1144 * last grid dimension. 
1145 * 1146 * The function implements a parallel version of the following snippet: 1147 * 1148 * for (size_t i = 0; i < range_i; i++) 1149 * for (size_t j = 0; j < range_j; j++) 1150 * for (size_t k = 0; k < range_k; k++) 1151 * for (size_t l = 0; l < range_l; l++) 1152 * for (size_t m = 0; m < range_m; m += tile_m) 1153 * function(context, i, j, k, l, m, min(range_m - m, tile_m)); 1154 * 1155 * When the function returns, all items have been processed and the thread pool 1156 * is ready for a new task. 1157 * 1158 * @note If multiple threads call this function with the same thread pool, the 1159 * calls are serialized. 1160 * 1161 * @param threadpool the thread pool to use for parallelisation. If threadpool 1162 * is NULL, all items are processed serially on the calling thread. 1163 * @param function the function to call for each tile. 1164 * @param context the first argument passed to the specified function. 1165 * @param range_i the number of items to process along the first dimension 1166 * of the 5D grid. 1167 * @param range_j the number of items to process along the second dimension 1168 * of the 5D grid. 1169 * @param range_k the number of items to process along the third dimension 1170 * of the 5D grid. 1171 * @param range_l the number of items to process along the fourth dimension 1172 * of the 5D grid. 1173 * @param range_m the number of items to process along the fifth dimension 1174 * of the 5D grid. 1175 * @param tile_m the maximum number of items along the fifth dimension of 1176 * the 5D grid to process in one function call. 
1177 * @param flags a bitwise combination of zero or more optional flags 1178 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1179 */ 1180 void pthreadpool_parallelize_5d_tile_1d( 1181 pthreadpool_t threadpool, 1182 pthreadpool_task_5d_tile_1d_t function, 1183 void* context, 1184 size_t range_i, 1185 size_t range_j, 1186 size_t range_k, 1187 size_t range_l, 1188 size_t range_m, 1189 size_t tile_m, 1190 uint32_t flags); 1191 1192 /** 1193 * Process items on a 5D grid with the specified maximum tile size along the 1194 * last two grid dimensions. 1195 * 1196 * The function implements a parallel version of the following snippet: 1197 * 1198 * for (size_t i = 0; i < range_i; i++) 1199 * for (size_t j = 0; j < range_j; j++) 1200 * for (size_t k = 0; k < range_k; k++) 1201 * for (size_t l = 0; l < range_l; l += tile_l) 1202 * for (size_t m = 0; m < range_m; m += tile_m) 1203 * function(context, i, j, k, l, m, 1204 * min(range_l - l, tile_l), min(range_m - m, tile_m)); 1205 * 1206 * When the function returns, all items have been processed and the thread pool 1207 * is ready for a new task. 1208 * 1209 * @note If multiple threads call this function with the same thread pool, the 1210 * calls are serialized. 1211 * 1212 * @param threadpool the thread pool to use for parallelisation. If threadpool 1213 * is NULL, all items are processed serially on the calling thread. 1214 * @param function the function to call for each tile. 1215 * @param context the first argument passed to the specified function. 1216 * @param range_i the number of items to process along the first dimension 1217 * of the 5D grid. 1218 * @param range_j the number of items to process along the second dimension 1219 * of the 5D grid. 1220 * @param range_k the number of items to process along the third dimension 1221 * of the 5D grid. 1222 * @param range_l the number of items to process along the fourth dimension 1223 * of the 5D grid. 
1224 * @param range_m the number of items to process along the fifth dimension 1225 * of the 5D grid. 1226 * @param tile_l the maximum number of items along the fourth dimension of 1227 * the 5D grid to process in one function call. 1228 * @param tile_m the maximum number of items along the fifth dimension of 1229 * the 5D grid to process in one function call. 1230 * @param flags a bitwise combination of zero or more optional flags 1231 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1232 */ 1233 void pthreadpool_parallelize_5d_tile_2d( 1234 pthreadpool_t threadpool, 1235 pthreadpool_task_5d_tile_2d_t function, 1236 void* context, 1237 size_t range_i, 1238 size_t range_j, 1239 size_t range_k, 1240 size_t range_l, 1241 size_t range_m, 1242 size_t tile_l, 1243 size_t tile_m, 1244 uint32_t flags); 1245 1246 /** 1247 * Process items on a 6D grid. 1248 * 1249 * The function implements a parallel version of the following snippet: 1250 * 1251 * for (size_t i = 0; i < range_i; i++) 1252 * for (size_t j = 0; j < range_j; j++) 1253 * for (size_t k = 0; k < range_k; k++) 1254 * for (size_t l = 0; l < range_l; l++) 1255 * for (size_t m = 0; m < range_m; m++) 1256 * for (size_t n = 0; n < range_n; n++) 1257 * function(context, i, j, k, l, m, n); 1258 * 1259 * When the function returns, all items have been processed and the thread pool 1260 * is ready for a new task. 1261 * 1262 * @note If multiple threads call this function with the same thread pool, the 1263 * calls are serialized. 1264 * 1265 * @param threadpool the thread pool to use for parallelisation. If threadpool 1266 * is NULL, all items are processed serially on the calling thread. 1267 * @param function the function to call for each item. 1268 * @param context the first argument passed to the specified function. 1269 * @param range_i the number of items to process along the first dimension 1270 * of the 6D grid. 
1271 * @param range_j the number of items to process along the second dimension 1272 * of the 6D grid. 1273 * @param range_k the number of items to process along the third dimension 1274 * of the 6D grid. 1275 * @param range_l the number of items to process along the fourth dimension 1276 * of the 6D grid. 1277 * @param range_m the number of items to process along the fifth dimension 1278 * of the 6D grid. 1279 * @param range_n the number of items to process along the sixth dimension 1280 * of the 6D grid. 1283 * @param flags a bitwise combination of zero or more optional flags 1284 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1285 */ 1286 void pthreadpool_parallelize_6d( 1287 pthreadpool_t threadpool, 1288 pthreadpool_task_6d_t function, 1289 void* context, 1290 size_t range_i, 1291 size_t range_j, 1292 size_t range_k, 1293 size_t range_l, 1294 size_t range_m, 1295 size_t range_n, 1296 uint32_t flags); 1297 1298 /** 1299 * Process items on a 6D grid with the specified maximum tile size along the 1300 * last grid dimension. 1301 * 1302 * The function implements a parallel version of the following snippet: 1303 * 1304 * for (size_t i = 0; i < range_i; i++) 1305 * for (size_t j = 0; j < range_j; j++) 1306 * for (size_t k = 0; k < range_k; k++) 1307 * for (size_t l = 0; l < range_l; l++) 1308 * for (size_t m = 0; m < range_m; m++) 1309 * for (size_t n = 0; n < range_n; n += tile_n) 1310 * function(context, i, j, k, l, m, n, min(range_n - n, tile_n)); 1311 * 1312 * When the function returns, all items have been processed and the thread pool 1313 * is ready for a new task. 1314 * 1315 * @note If multiple threads call this function with the same thread pool, the 1316 * calls are serialized. 1317 * 1318 * @param threadpool the thread pool to use for parallelisation. 
If threadpool 1319 * is NULL, all items are processed serially on the calling thread. 1320 * @param function the function to call for each tile. 1321 * @param context the first argument passed to the specified function. 1322 * @param range_i the number of items to process along the first dimension 1323 * of the 6D grid. 1324 * @param range_j the number of items to process along the second dimension 1325 * of the 6D grid. 1326 * @param range_k the number of items to process along the third dimension 1327 * of the 6D grid. 1328 * @param range_l the number of items to process along the fourth dimension 1329 * of the 6D grid. 1330 * @param range_m the number of items to process along the fifth dimension 1331 * of the 6D grid. 1332 * @param range_n the number of items to process along the sixth dimension 1333 * of the 6D grid. 1334 * @param tile_n the maximum number of items along the sixth dimension of 1335 * the 6D grid to process in one function call. 1336 * @param flags a bitwise combination of zero or more optional flags 1337 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1338 */ 1339 void pthreadpool_parallelize_6d_tile_1d( 1340 pthreadpool_t threadpool, 1341 pthreadpool_task_6d_tile_1d_t function, 1342 void* context, 1343 size_t range_i, 1344 size_t range_j, 1345 size_t range_k, 1346 size_t range_l, 1347 size_t range_m, 1348 size_t range_n, 1349 size_t tile_n, 1350 uint32_t flags); 1351 1352 /** 1353 * Process items on a 6D grid with the specified maximum tile size along the 1354 * last two grid dimensions. 
1355 * 1356 * The function implements a parallel version of the following snippet: 1357 * 1358 * for (size_t i = 0; i < range_i; i++) 1359 * for (size_t j = 0; j < range_j; j++) 1360 * for (size_t k = 0; k < range_k; k++) 1361 * for (size_t l = 0; l < range_l; l++) 1362 * for (size_t m = 0; m < range_m; m += tile_m) 1363 * for (size_t n = 0; n < range_n; n += tile_n) 1364 * function(context, i, j, k, l, m, n, 1365 * min(range_m - m, tile_m), min(range_n - n, tile_n)); 1366 * 1367 * When the function returns, all items have been processed and the thread pool 1368 * is ready for a new task. 1369 * 1370 * @note If multiple threads call this function with the same thread pool, the 1371 * calls are serialized. 1372 * 1373 * @param threadpool the thread pool to use for parallelisation. If threadpool 1374 * is NULL, all items are processed serially on the calling thread. 1375 * @param function the function to call for each tile. 1376 * @param context the first argument passed to the specified function. 1377 * @param range_i the number of items to process along the first dimension 1378 * of the 6D grid. 1379 * @param range_j the number of items to process along the second dimension 1380 * of the 6D grid. 1381 * @param range_k the number of items to process along the third dimension 1382 * of the 6D grid. 1383 * @param range_l the number of items to process along the fourth dimension 1384 * of the 6D grid. 1385 * @param range_m the number of items to process along the fifth dimension 1386 * of the 6D grid. 1387 * @param range_n the number of items to process along the sixth dimension 1388 * of the 6D grid. 1389 * @param tile_m the maximum number of items along the fifth dimension of 1390 * the 6D grid to process in one function call. 1391 * @param tile_n the maximum number of items along the sixth dimension of 1392 * the 6D grid to process in one function call. 
1393 * @param flags a bitwise combination of zero or more optional flags 1394 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1395 */ 1396 void pthreadpool_parallelize_6d_tile_2d( 1397 pthreadpool_t threadpool, 1398 pthreadpool_task_6d_tile_2d_t function, 1399 void* context, 1400 size_t range_i, 1401 size_t range_j, 1402 size_t range_k, 1403 size_t range_l, 1404 size_t range_m, 1405 size_t range_n, 1406 size_t tile_m, 1407 size_t tile_n, 1408 uint32_t flags); 1409 1410 /** 1411 * Terminates threads in the thread pool and releases associated resources. 1412 * 1413 * @warning Accessing the thread pool after a call to this function constitutes 1414 * undefined behaviour and may cause data corruption. 1415 * 1416 * @param[in,out] threadpool The thread pool to destroy. 1417 */ 1418 void pthreadpool_destroy(pthreadpool_t threadpool); 1419 1420 #ifndef PTHREADPOOL_NO_DEPRECATED_API 1421 1422 /* Legacy API for compatibility with pre-existing users (e.g. NNPACK) */ 1423 #if defined(__GNUC__) 1424 #define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__)) 1425 #else 1426 #define PTHREADPOOL_DEPRECATED 1427 #endif 1428 1429 typedef void (*pthreadpool_function_1d_t)(void*, size_t); 1430 typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t); 1431 typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t); 1432 typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t); 1433 typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); 1434 typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); 1435 1436 void pthreadpool_compute_1d( 1437 pthreadpool_t threadpool, 1438 pthreadpool_function_1d_t function, 1439 void* argument, 1440 size_t range) PTHREADPOOL_DEPRECATED; 1441 1442 void pthreadpool_compute_1d_tiled( 1443 pthreadpool_t threadpool, 1444 pthreadpool_function_1d_tiled_t function, 1445 
void* argument, 1446 size_t range, 1447 size_t tile) PTHREADPOOL_DEPRECATED; 1448 1449 void pthreadpool_compute_2d( 1450 pthreadpool_t threadpool, 1451 pthreadpool_function_2d_t function, 1452 void* argument, 1453 size_t range_i, 1454 size_t range_j) PTHREADPOOL_DEPRECATED; 1455 1456 void pthreadpool_compute_2d_tiled( 1457 pthreadpool_t threadpool, 1458 pthreadpool_function_2d_tiled_t function, 1459 void* argument, 1460 size_t range_i, 1461 size_t range_j, 1462 size_t tile_i, 1463 size_t tile_j) PTHREADPOOL_DEPRECATED; 1464 1465 void pthreadpool_compute_3d_tiled( 1466 pthreadpool_t threadpool, 1467 pthreadpool_function_3d_tiled_t function, 1468 void* argument, 1469 size_t range_i, 1470 size_t range_j, 1471 size_t range_k, 1472 size_t tile_i, 1473 size_t tile_j, 1474 size_t tile_k) PTHREADPOOL_DEPRECATED; 1475 1476 void pthreadpool_compute_4d_tiled( 1477 pthreadpool_t threadpool, 1478 pthreadpool_function_4d_tiled_t function, 1479 void* argument, 1480 size_t range_i, 1481 size_t range_j, 1482 size_t range_k, 1483 size_t range_l, 1484 size_t tile_i, 1485 size_t tile_j, 1486 size_t tile_k, 1487 size_t tile_l) PTHREADPOOL_DEPRECATED; 1488 1489 #endif /* PTHREADPOOL_NO_DEPRECATED_API */ 1490 1491 #ifdef __cplusplus 1492 } /* extern "C" */ 1493 #endif 1494 1495 #ifdef __cplusplus 1496 1497 namespace libpthreadpool { 1498 namespace detail { 1499 namespace { 1500 1501 template<class T> 1502 void call_wrapper_1d(void* arg, size_t i) { 1503 (*static_cast<const T*>(arg))(i); 1504 } 1505 1506 template<class T> 1507 void call_wrapper_1d_tile_1d(void* arg, size_t range_i, size_t tile_i) { 1508 (*static_cast<const T*>(arg))(range_i, tile_i); 1509 } 1510 1511 template<class T> 1512 void call_wrapper_2d(void* functor, size_t i, size_t j) { 1513 (*static_cast<const T*>(functor))(i, j); 1514 } 1515 1516 template<class T> 1517 void call_wrapper_2d_tile_1d(void* functor, 1518 size_t i, size_t range_j, size_t tile_j) 1519 { 1520 (*static_cast<const T*>(functor))(i, range_j, tile_j); 
1521 } 1522 1523 template<class T> 1524 void call_wrapper_2d_tile_2d(void* functor, 1525 size_t range_i, size_t range_j, 1526 size_t tile_i, size_t tile_j) 1527 { 1528 (*static_cast<const T*>(functor))(range_i, range_j, tile_i, tile_j); 1529 } 1530 1531 template<class T> 1532 void call_wrapper_3d(void* functor, size_t i, size_t j, size_t k) { 1533 (*static_cast<const T*>(functor))(i, j, k); 1534 } 1535 1536 template<class T> 1537 void call_wrapper_3d_tile_1d(void* functor, 1538 size_t i, size_t j, size_t range_k, 1539 size_t tile_k) 1540 { 1541 (*static_cast<const T*>(functor))(i, j, range_k, tile_k); 1542 } 1543 1544 template<class T> 1545 void call_wrapper_3d_tile_2d(void* functor, 1546 size_t i, size_t range_j, size_t range_k, 1547 size_t tile_j, size_t tile_k) 1548 { 1549 (*static_cast<const T*>(functor))(i, range_j, range_k, tile_j, tile_k); 1550 } 1551 1552 template<class T> 1553 void call_wrapper_4d(void* functor, size_t i, size_t j, size_t k, size_t l) { 1554 (*static_cast<const T*>(functor))(i, j, k, l); 1555 } 1556 1557 template<class T> 1558 void call_wrapper_4d_tile_1d(void* functor, 1559 size_t i, size_t j, size_t k, size_t range_l, 1560 size_t tile_l) 1561 { 1562 (*static_cast<const T*>(functor))(i, j, k, range_l, tile_l); 1563 } 1564 1565 template<class T> 1566 void call_wrapper_4d_tile_2d(void* functor, 1567 size_t i, size_t j, size_t range_k, size_t range_l, 1568 size_t tile_k, size_t tile_l) 1569 { 1570 (*static_cast<const T*>(functor))(i, j, range_k, range_l, tile_k, tile_l); 1571 } 1572 1573 template<class T> 1574 void call_wrapper_5d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m) { 1575 (*static_cast<const T*>(functor))(i, j, k, l, m); 1576 } 1577 1578 template<class T> 1579 void call_wrapper_5d_tile_1d(void* functor, 1580 size_t i, size_t j, size_t k, size_t l, size_t range_m, 1581 size_t tile_m) 1582 { 1583 (*static_cast<const T*>(functor))(i, j, k, l, range_m, tile_m); 1584 } 1585 1586 template<class T> 1587 void 
call_wrapper_5d_tile_2d(void* functor, 1588 size_t i, size_t j, size_t k, size_t range_l, size_t range_m, 1589 size_t tile_l, size_t tile_m) 1590 { 1591 (*static_cast<const T*>(functor))(i, j, k, range_l, range_m, tile_l, tile_m); 1592 } 1593 1594 template<class T> 1595 void call_wrapper_6d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { 1596 (*static_cast<const T*>(functor))(i, j, k, l, m, n); 1597 } 1598 1599 template<class T> 1600 void call_wrapper_6d_tile_1d(void* functor, 1601 size_t i, size_t j, size_t k, size_t l, size_t m, size_t range_n, 1602 size_t tile_n) 1603 { 1604 (*static_cast<const T*>(functor))(i, j, k, l, m, range_n, tile_n); 1605 } 1606 1607 template<class T> 1608 void call_wrapper_6d_tile_2d(void* functor, 1609 size_t i, size_t j, size_t k, size_t l, size_t range_m, size_t range_n, 1610 size_t tile_m, size_t tile_n) 1611 { 1612 (*static_cast<const T*>(functor))(i, j, k, l, range_m, range_n, tile_m, tile_n); 1613 } 1614 1615 } /* namespace */ 1616 } /* namespace detail */ 1617 } /* namespace libpthreadpool */ 1618 1619 /** 1620 * Process items on a 1D grid. 1621 * 1622 * The function implements a parallel version of the following snippet: 1623 * 1624 * for (size_t i = 0; i < range; i++) 1625 * functor(i); 1626 * 1627 * When the function returns, all items have been processed and the thread pool 1628 * is ready for a new task. 1629 * 1630 * @note If multiple threads call this function with the same thread pool, the 1631 * calls are serialized. 1632 * 1633 * @param threadpool the thread pool to use for parallelisation. If threadpool 1634 * is NULL, all items are processed serially on the calling thread. 1635 * @param functor the functor to call for each item. 1636 * @param range the number of items on the 1D grid to process. The 1637 * specified functor will be called once for each item. 
1638 * @param flags a bitwise combination of zero or more optional flags 1639 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1640 */ 1641 template<class T> 1642 inline void pthreadpool_parallelize_1d( 1643 pthreadpool_t threadpool, 1644 const T& functor, 1645 size_t range, 1646 uint32_t flags = 0) 1647 { 1648 pthreadpool_parallelize_1d( 1649 threadpool, 1650 &libpthreadpool::detail::call_wrapper_1d<const T>, 1651 const_cast<void*>(static_cast<const void*>(&functor)), 1652 range, 1653 flags); 1654 } 1655 1656 /** 1657 * Process items on a 1D grid with specified maximum tile size. 1658 * 1659 * The function implements a parallel version of the following snippet: 1660 * 1661 * for (size_t i = 0; i < range; i += tile) 1662 * functor(i, min(range - i, tile)); 1663 * 1664 * When the call returns, all items have been processed and the thread pool is 1665 * ready for a new task. 1666 * 1667 * @note If multiple threads call this function with the same thread pool, 1668 * the calls are serialized. 1669 * 1670 * @param threadpool the thread pool to use for parallelisation. If threadpool 1671 * is NULL, all items are processed serially on the calling thread. 1672 * @param functor the functor to call for each tile. 1673 * @param range the number of items on the 1D grid to process. 1674 * @param tile the maximum number of items on the 1D grid to process in 1675 * one functor call. 
1676 * @param flags a bitwise combination of zero or more optional flags 1677 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1678 */ 1679 template<class T> 1680 inline void pthreadpool_parallelize_1d_tile_1d( 1681 pthreadpool_t threadpool, 1682 const T& functor, 1683 size_t range, 1684 size_t tile, 1685 uint32_t flags = 0) 1686 { 1687 pthreadpool_parallelize_1d_tile_1d( 1688 threadpool, 1689 &libpthreadpool::detail::call_wrapper_1d_tile_1d<const T>, 1690 const_cast<void*>(static_cast<const void*>(&functor)), 1691 range, 1692 tile, 1693 flags); 1694 } 1695 1696 /** 1697 * Process items on a 2D grid. 1698 * 1699 * The function implements a parallel version of the following snippet: 1700 * 1701 * for (size_t i = 0; i < range_i; i++) 1702 * for (size_t j = 0; j < range_j; j++) 1703 * functor(i, j); 1704 * 1705 * When the function returns, all items have been processed and the thread pool 1706 * is ready for a new task. 1707 * 1708 * @note If multiple threads call this function with the same thread pool, the 1709 * calls are serialized. 1710 * 1711 * @param threadpool the thread pool to use for parallelisation. If threadpool 1712 * is NULL, all items are processed serially on the calling thread. 1713 * @param functor the functor to call for each item. 1714 * @param range_i the number of items to process along the first dimension 1715 * of the 2D grid. 1716 * @param range_j the number of items to process along the second dimension 1717 * of the 2D grid. 
1718 * @param flags a bitwise combination of zero or more optional flags 1719 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1720 */ 1721 template<class T> 1722 inline void pthreadpool_parallelize_2d( 1723 pthreadpool_t threadpool, 1724 const T& functor, 1725 size_t range_i, 1726 size_t range_j, 1727 uint32_t flags = 0) 1728 { 1729 pthreadpool_parallelize_2d( 1730 threadpool, 1731 &libpthreadpool::detail::call_wrapper_2d<const T>, 1732 const_cast<void*>(static_cast<const void*>(&functor)), 1733 range_i, 1734 range_j, 1735 flags); 1736 } 1737 1738 /** 1739 * Process items on a 2D grid with the specified maximum tile size along the 1740 * last grid dimension. 1741 * 1742 * The function implements a parallel version of the following snippet: 1743 * 1744 * for (size_t i = 0; i < range_i; i++) 1745 * for (size_t j = 0; j < range_j; j += tile_j) 1746 * functor(i, j, min(range_j - j, tile_j)); 1747 * 1748 * When the function returns, all items have been processed and the thread pool 1749 * is ready for a new task. 1750 * 1751 * @note If multiple threads call this function with the same thread pool, the 1752 * calls are serialized. 1753 * 1754 * @param threadpool the thread pool to use for parallelisation. If threadpool 1755 * is NULL, all items are processed serially on the calling thread. 1756 * @param functor the functor to call for each tile. 1757 * @param range_i the number of items to process along the first dimension 1758 * of the 2D grid. 1759 * @param range_j the number of items to process along the second dimension 1760 * of the 2D grid. 1761 * @param tile_j the maximum number of items along the second dimension of 1762 * the 2D grid to process in one functor call. 
1763 * @param flags a bitwise combination of zero or more optional flags 1764 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1765 */ 1766 template<class T> 1767 inline void pthreadpool_parallelize_2d_tile_1d( 1768 pthreadpool_t threadpool, 1769 const T& functor, 1770 size_t range_i, 1771 size_t range_j, 1772 size_t tile_j, 1773 uint32_t flags = 0) 1774 { 1775 pthreadpool_parallelize_2d_tile_1d( 1776 threadpool, 1777 &libpthreadpool::detail::call_wrapper_2d_tile_1d<const T>, 1778 const_cast<void*>(static_cast<const void*>(&functor)), 1779 range_i, 1780 range_j, 1781 tile_j, 1782 flags); 1783 } 1784 1785 /** 1786 * Process items on a 2D grid with the specified maximum tile size along each 1787 * grid dimension. 1788 * 1789 * The function implements a parallel version of the following snippet: 1790 * 1791 * for (size_t i = 0; i < range_i; i += tile_i) 1792 * for (size_t j = 0; j < range_j; j += tile_j) 1793 * functor(i, j, 1794 * min(range_i - i, tile_i), min(range_j - j, tile_j)); 1795 * 1796 * When the function returns, all items have been processed and the thread pool 1797 * is ready for a new task. 1798 * 1799 * @note If multiple threads call this function with the same thread pool, the 1800 * calls are serialized. 1801 * 1802 * @param threadpool the thread pool to use for parallelisation. If threadpool 1803 * is NULL, all items are processed serially on the calling thread. 1804 * @param functor the functor to call for each tile. 1805 * @param range_i the number of items to process along the first dimension 1806 * of the 2D grid. 1807 * @param range_j the number of items to process along the second dimension 1808 * of the 2D grid. 1809 * @param tile_j the maximum number of items along the first dimension of 1810 * the 2D grid to process in one functor call. 1811 * @param tile_j the maximum number of items along the second dimension of 1812 * the 2D grid to process in one functor call. 
1813 * @param flags a bitwise combination of zero or more optional flags 1814 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1815 */ 1816 template<class T> 1817 inline void pthreadpool_parallelize_2d_tile_2d( 1818 pthreadpool_t threadpool, 1819 const T& functor, 1820 size_t range_i, 1821 size_t range_j, 1822 size_t tile_i, 1823 size_t tile_j, 1824 uint32_t flags = 0) 1825 { 1826 pthreadpool_parallelize_2d_tile_2d( 1827 threadpool, 1828 &libpthreadpool::detail::call_wrapper_2d_tile_2d<const T>, 1829 const_cast<void*>(static_cast<const void*>(&functor)), 1830 range_i, 1831 range_j, 1832 tile_i, 1833 tile_j, 1834 flags); 1835 } 1836 1837 /** 1838 * Process items on a 3D grid. 1839 * 1840 * The function implements a parallel version of the following snippet: 1841 * 1842 * for (size_t i = 0; i < range_i; i++) 1843 * for (size_t j = 0; j < range_j; j++) 1844 * for (size_t k = 0; k < range_k; k++) 1845 * functor(i, j, k); 1846 * 1847 * When the function returns, all items have been processed and the thread pool 1848 * is ready for a new task. 1849 * 1850 * @note If multiple threads call this function with the same thread pool, the 1851 * calls are serialized. 1852 * 1853 * @param threadpool the thread pool to use for parallelisation. If threadpool 1854 * is NULL, all items are processed serially on the calling thread. 1855 * @param functor the functor to call for each tile. 1856 * @param range_i the number of items to process along the first dimension 1857 * of the 3D grid. 1858 * @param range_j the number of items to process along the second dimension 1859 * of the 3D grid. 1860 * @param range_k the number of items to process along the third dimension 1861 * of the 3D grid. 
1862 * @param flags a bitwise combination of zero or more optional flags 1863 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1864 */ 1865 template<class T> 1866 inline void pthreadpool_parallelize_3d( 1867 pthreadpool_t threadpool, 1868 const T& functor, 1869 size_t range_i, 1870 size_t range_j, 1871 size_t range_k, 1872 uint32_t flags = 0) 1873 { 1874 pthreadpool_parallelize_3d( 1875 threadpool, 1876 &libpthreadpool::detail::call_wrapper_3d<const T>, 1877 const_cast<void*>(static_cast<const void*>(&functor)), 1878 range_i, 1879 range_j, 1880 range_k, 1881 flags); 1882 } 1883 1884 /** 1885 * Process items on a 3D grid with the specified maximum tile size along the 1886 * last grid dimension. 1887 * 1888 * The function implements a parallel version of the following snippet: 1889 * 1890 * for (size_t i = 0; i < range_i; i++) 1891 * for (size_t j = 0; j < range_j; j++) 1892 * for (size_t k = 0; k < range_k; k += tile_k) 1893 * functor(i, j, k, min(range_k - k, tile_k)); 1894 * 1895 * When the function returns, all items have been processed and the thread pool 1896 * is ready for a new task. 1897 * 1898 * @note If multiple threads call this function with the same thread pool, the 1899 * calls are serialized. 1900 * 1901 * @param threadpool the thread pool to use for parallelisation. If threadpool 1902 * is NULL, all items are processed serially on the calling thread. 1903 * @param functor the functor to call for each tile. 1904 * @param range_i the number of items to process along the first dimension 1905 * of the 3D grid. 1906 * @param range_j the number of items to process along the second dimension 1907 * of the 3D grid. 1908 * @param range_k the number of items to process along the third dimension 1909 * of the 3D grid. 1910 * @param tile_k the maximum number of items along the third dimension of 1911 * the 3D grid to process in one functor call. 
1912 * @param flags a bitwise combination of zero or more optional flags 1913 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1914 */ 1915 template<class T> 1916 inline void pthreadpool_parallelize_3d_tile_1d( 1917 pthreadpool_t threadpool, 1918 const T& functor, 1919 size_t range_i, 1920 size_t range_j, 1921 size_t range_k, 1922 size_t tile_k, 1923 uint32_t flags = 0) 1924 { 1925 pthreadpool_parallelize_3d_tile_1d( 1926 threadpool, 1927 &libpthreadpool::detail::call_wrapper_3d_tile_1d<const T>, 1928 const_cast<void*>(static_cast<const void*>(&functor)), 1929 range_i, 1930 range_j, 1931 range_k, 1932 tile_k, 1933 flags); 1934 } 1935 1936 /** 1937 * Process items on a 3D grid with the specified maximum tile size along the 1938 * last two grid dimensions. 1939 * 1940 * The function implements a parallel version of the following snippet: 1941 * 1942 * for (size_t i = 0; i < range_i; i++) 1943 * for (size_t j = 0; j < range_j; j += tile_j) 1944 * for (size_t k = 0; k < range_k; k += tile_k) 1945 * functor(i, j, k, 1946 * min(range_j - j, tile_j), min(range_k - k, tile_k)); 1947 * 1948 * When the function returns, all items have been processed and the thread pool 1949 * is ready for a new task. 1950 * 1951 * @note If multiple threads call this function with the same thread pool, the 1952 * calls are serialized. 1953 * 1954 * @param threadpool the thread pool to use for parallelisation. If threadpool 1955 * is NULL, all items are processed serially on the calling thread. 1956 * @param functor the functor to call for each tile. 1957 * @param range_i the number of items to process along the first dimension 1958 * of the 3D grid. 1959 * @param range_j the number of items to process along the second dimension 1960 * of the 3D grid. 1961 * @param range_k the number of items to process along the third dimension 1962 * of the 3D grid. 
1963 * @param tile_j the maximum number of items along the second dimension of 1964 * the 3D grid to process in one functor call. 1965 * @param tile_k the maximum number of items along the third dimension of 1966 * the 3D grid to process in one functor call. 1967 * @param flags a bitwise combination of zero or more optional flags 1968 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 1969 */ 1970 template<class T> 1971 inline void pthreadpool_parallelize_3d_tile_2d( 1972 pthreadpool_t threadpool, 1973 const T& functor, 1974 size_t range_i, 1975 size_t range_j, 1976 size_t range_k, 1977 size_t tile_j, 1978 size_t tile_k, 1979 uint32_t flags = 0) 1980 { 1981 pthreadpool_parallelize_3d_tile_2d( 1982 threadpool, 1983 &libpthreadpool::detail::call_wrapper_3d_tile_2d<const T>, 1984 const_cast<void*>(static_cast<const void*>(&functor)), 1985 range_i, 1986 range_j, 1987 range_k, 1988 tile_j, 1989 tile_k, 1990 flags); 1991 } 1992 1993 /** 1994 * Process items on a 4D grid. 1995 * 1996 * The function implements a parallel version of the following snippet: 1997 * 1998 * for (size_t i = 0; i < range_i; i++) 1999 * for (size_t j = 0; j < range_j; j++) 2000 * for (size_t k = 0; k < range_k; k++) 2001 * for (size_t l = 0; l < range_l; l++) 2002 * functor(i, j, k, l); 2003 * 2004 * When the function returns, all items have been processed and the thread pool 2005 * is ready for a new task. 2006 * 2007 * @note If multiple threads call this function with the same thread pool, the 2008 * calls are serialized. 2009 * 2010 * @param threadpool the thread pool to use for parallelisation. If threadpool 2011 * is NULL, all items are processed serially on the calling thread. 2012 * @param functor the functor to call for each tile. 2013 * @param range_i the number of items to process along the first dimension 2014 * of the 4D grid. 2015 * @param range_j the number of items to process along the second dimension 2016 * of the 4D grid. 
2017 * @param range_k the number of items to process along the third dimension 2018 * of the 4D grid. 2019 * @param range_l the number of items to process along the fourth dimension 2020 * of the 4D grid. 2021 * @param flags a bitwise combination of zero or more optional flags 2022 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2023 */ 2024 template<class T> 2025 inline void pthreadpool_parallelize_4d( 2026 pthreadpool_t threadpool, 2027 const T& functor, 2028 size_t range_i, 2029 size_t range_j, 2030 size_t range_k, 2031 size_t range_l, 2032 uint32_t flags = 0) 2033 { 2034 pthreadpool_parallelize_4d( 2035 threadpool, 2036 &libpthreadpool::detail::call_wrapper_4d<const T>, 2037 const_cast<void*>(static_cast<const void*>(&functor)), 2038 range_i, 2039 range_j, 2040 range_k, 2041 range_l, 2042 flags); 2043 } 2044 2045 /** 2046 * Process items on a 4D grid with the specified maximum tile size along the 2047 * last grid dimension. 2048 * 2049 * The function implements a parallel version of the following snippet: 2050 * 2051 * for (size_t i = 0; i < range_i; i++) 2052 * for (size_t j = 0; j < range_j; j++) 2053 * for (size_t k = 0; k < range_k; k++) 2054 * for (size_t l = 0; l < range_l; l += tile_l) 2055 * functor(i, j, k, l, min(range_l - l, tile_l)); 2056 * 2057 * When the function returns, all items have been processed and the thread pool 2058 * is ready for a new task. 2059 * 2060 * @note If multiple threads call this function with the same thread pool, the 2061 * calls are serialized. 2062 * 2063 * @param threadpool the thread pool to use for parallelisation. If threadpool 2064 * is NULL, all items are processed serially on the calling thread. 2065 * @param functor the functor to call for each tile. 2066 * @param range_i the number of items to process along the first dimension 2067 * of the 4D grid. 2068 * @param range_j the number of items to process along the second dimension 2069 * of the 4D grid. 
2070 * @param range_k the number of items to process along the third dimension 2071 * of the 4D grid. 2072 * @param range_l the number of items to process along the fourth dimension 2073 * of the 4D grid. 2074 * @param tile_l the maximum number of items along the fourth dimension of 2075 * the 4D grid to process in one functor call. 2076 * @param flags a bitwise combination of zero or more optional flags 2077 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2078 */ 2079 template<class T> 2080 inline void pthreadpool_parallelize_4d_tile_1d( 2081 pthreadpool_t threadpool, 2082 const T& functor, 2083 size_t range_i, 2084 size_t range_j, 2085 size_t range_k, 2086 size_t range_l, 2087 size_t tile_l, 2088 uint32_t flags = 0) 2089 { 2090 pthreadpool_parallelize_4d_tile_1d( 2091 threadpool, 2092 &libpthreadpool::detail::call_wrapper_4d_tile_1d<const T>, 2093 const_cast<void*>(static_cast<const void*>(&functor)), 2094 range_i, 2095 range_j, 2096 range_k, 2097 range_l, 2098 tile_l, 2099 flags); 2100 } 2101 2102 /** 2103 * Process items on a 4D grid with the specified maximum tile size along the 2104 * last two grid dimensions. 2105 * 2106 * The function implements a parallel version of the following snippet: 2107 * 2108 * for (size_t i = 0; i < range_i; i++) 2109 * for (size_t j = 0; j < range_j; j++) 2110 * for (size_t k = 0; k < range_k; k += tile_k) 2111 * for (size_t l = 0; l < range_l; l += tile_l) 2112 * functor(i, j, k, l, 2113 * min(range_k - k, tile_k), min(range_l - l, tile_l)); 2114 * 2115 * When the function returns, all items have been processed and the thread pool 2116 * is ready for a new task. 2117 * 2118 * @note If multiple threads call this function with the same thread pool, the 2119 * calls are serialized. 2120 * 2121 * @param threadpool the thread pool to use for parallelisation. If threadpool 2122 * is NULL, all items are processed serially on the calling thread. 2123 * @param functor the functor to call for each tile. 
2124 * @param range_i the number of items to process along the first dimension 2125 * of the 4D grid. 2126 * @param range_j the number of items to process along the second dimension 2127 * of the 4D grid. 2128 * @param range_k the number of items to process along the third dimension 2129 * of the 4D grid. 2130 * @param range_l the number of items to process along the fourth dimension 2131 * of the 4D grid. 2132 * @param tile_k the maximum number of items along the third dimension of 2133 * the 4D grid to process in one functor call. 2134 * @param tile_l the maximum number of items along the fourth dimension of 2135 * the 4D grid to process in one functor call. 2136 * @param flags a bitwise combination of zero or more optional flags 2137 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2138 */ 2139 template<class T> 2140 inline void pthreadpool_parallelize_4d_tile_2d( 2141 pthreadpool_t threadpool, 2142 const T& functor, 2143 size_t range_i, 2144 size_t range_j, 2145 size_t range_k, 2146 size_t range_l, 2147 size_t tile_k, 2148 size_t tile_l, 2149 uint32_t flags = 0) 2150 { 2151 pthreadpool_parallelize_4d_tile_2d( 2152 threadpool, 2153 &libpthreadpool::detail::call_wrapper_4d_tile_2d<const T>, 2154 const_cast<void*>(static_cast<const void*>(&functor)), 2155 range_i, 2156 range_j, 2157 range_k, 2158 range_l, 2159 tile_k, 2160 tile_l, 2161 flags); 2162 } 2163 2164 /** 2165 * Process items on a 5D grid. 2166 * 2167 * The function implements a parallel version of the following snippet: 2168 * 2169 * for (size_t i = 0; i < range_i; i++) 2170 * for (size_t j = 0; j < range_j; j++) 2171 * for (size_t k = 0; k < range_k; k++) 2172 * for (size_t l = 0; l < range_l; l++) 2173 * for (size_t m = 0; m < range_m; m++) 2174 * functor(i, j, k, l, m); 2175 * 2176 * When the function returns, all items have been processed and the thread pool 2177 * is ready for a new task. 
2178 * 2179 * @note If multiple threads call this function with the same thread pool, the 2180 * calls are serialized. 2181 * 2182 * @param threadpool the thread pool to use for parallelisation. If threadpool 2183 * is NULL, all items are processed serially on the calling thread. 2184 * @param functor the functor to call for each tile. 2185 * @param range_i the number of items to process along the first dimension 2186 * of the 5D grid. 2187 * @param range_j the number of items to process along the second dimension 2188 * of the 5D grid. 2189 * @param range_k the number of items to process along the third dimension 2190 * of the 5D grid. 2191 * @param range_l the number of items to process along the fourth dimension 2192 * of the 5D grid. 2193 * @param range_m the number of items to process along the fifth dimension 2194 * of the 5D grid. 2195 * @param flags a bitwise combination of zero or more optional flags 2196 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2197 */ 2198 template<class T> 2199 inline void pthreadpool_parallelize_5d( 2200 pthreadpool_t threadpool, 2201 const T& functor, 2202 size_t range_i, 2203 size_t range_j, 2204 size_t range_k, 2205 size_t range_l, 2206 size_t range_m, 2207 uint32_t flags = 0) 2208 { 2209 pthreadpool_parallelize_5d( 2210 threadpool, 2211 &libpthreadpool::detail::call_wrapper_5d<const T>, 2212 const_cast<void*>(static_cast<const void*>(&functor)), 2213 range_i, 2214 range_j, 2215 range_k, 2216 range_l, 2217 range_m, 2218 flags); 2219 } 2220 2221 /** 2222 * Process items on a 5D grid with the specified maximum tile size along the 2223 * last grid dimension. 
2224 * 2225 * The function implements a parallel version of the following snippet: 2226 * 2227 * for (size_t i = 0; i < range_i; i++) 2228 * for (size_t j = 0; j < range_j; j++) 2229 * for (size_t k = 0; k < range_k; k++) 2230 * for (size_t l = 0; l < range_l; l++) 2231 * for (size_t m = 0; m < range_m; m += tile_m) 2232 * functor(i, j, k, l, m, min(range_m - m, tile_m)); 2233 * 2234 * When the function returns, all items have been processed and the thread pool 2235 * is ready for a new task. 2236 * 2237 * @note If multiple threads call this function with the same thread pool, the 2238 * calls are serialized. 2239 * 2240 * @param threadpool the thread pool to use for parallelisation. If threadpool 2241 * is NULL, all items are processed serially on the calling thread. 2242 * @param functor the functor to call for each tile. 2243 * @param range_i the number of items to process along the first dimension 2244 * of the 5D grid. 2245 * @param range_j the number of items to process along the second dimension 2246 * of the 5D grid. 2247 * @param range_k the number of items to process along the third dimension 2248 * of the 5D grid. 2249 * @param range_l the number of items to process along the fourth dimension 2250 * of the 5D grid. 2251 * @param range_m the number of items to process along the fifth dimension 2252 * of the 5D grid. 2253 * @param tile_m the maximum number of items along the fifth dimension of 2254 * the 5D grid to process in one functor call. 
2255 * @param flags a bitwise combination of zero or more optional flags 2256 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2257 */ 2258 template<class T> 2259 inline void pthreadpool_parallelize_5d_tile_1d( 2260 pthreadpool_t threadpool, 2261 const T& functor, 2262 size_t range_i, 2263 size_t range_j, 2264 size_t range_k, 2265 size_t range_l, 2266 size_t range_m, 2267 size_t tile_m, 2268 uint32_t flags = 0) 2269 { 2270 pthreadpool_parallelize_5d_tile_1d( 2271 threadpool, 2272 &libpthreadpool::detail::call_wrapper_5d_tile_1d<const T>, 2273 const_cast<void*>(static_cast<const void*>(&functor)), 2274 range_i, 2275 range_j, 2276 range_k, 2277 range_l, 2278 range_m, 2279 tile_m, 2280 flags); 2281 } 2282 2283 /** 2284 * Process items on a 5D grid with the specified maximum tile size along the 2285 * last two grid dimensions. 2286 * 2287 * The function implements a parallel version of the following snippet: 2288 * 2289 * for (size_t i = 0; i < range_i; i++) 2290 * for (size_t j = 0; j < range_j; j++) 2291 * for (size_t k = 0; k < range_k; k++) 2292 * for (size_t l = 0; l < range_l; l += tile_l) 2293 * for (size_t m = 0; m < range_m; m += tile_m) 2294 * functor(i, j, k, l, m, 2295 * min(range_l - l, tile_l), min(range_m - m, tile_m)); 2296 * 2297 * When the function returns, all items have been processed and the thread pool 2298 * is ready for a new task. 2299 * 2300 * @note If multiple threads call this function with the same thread pool, the 2301 * calls are serialized. 2302 * 2303 * @param threadpool the thread pool to use for parallelisation. If threadpool 2304 * is NULL, all items are processed serially on the calling thread. 2305 * @param functor the functor to call for each tile. 2306 * @param range_i the number of items to process along the first dimension 2307 * of the 5D grid. 2308 * @param range_j the number of items to process along the second dimension 2309 * of the 5D grid. 
2310 * @param range_k the number of items to process along the third dimension 2311 * of the 5D grid. 2312 * @param range_l the number of items to process along the fourth dimension 2313 * of the 5D grid. 2314 * @param range_m the number of items to process along the fifth dimension 2315 * of the 5D grid. 2316 * @param tile_l the maximum number of items along the fourth dimension of 2317 * the 5D grid to process in one functor call. 2318 * @param tile_m the maximum number of items along the fifth dimension of 2319 * the 5D grid to process in one functor call. 2320 * @param flags a bitwise combination of zero or more optional flags 2321 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2322 */ 2323 template<class T> 2324 inline void pthreadpool_parallelize_5d_tile_2d( 2325 pthreadpool_t threadpool, 2326 const T& functor, 2327 size_t range_i, 2328 size_t range_j, 2329 size_t range_k, 2330 size_t range_l, 2331 size_t range_m, 2332 size_t tile_l, 2333 size_t tile_m, 2334 uint32_t flags = 0) 2335 { 2336 pthreadpool_parallelize_5d_tile_2d( 2337 threadpool, 2338 &libpthreadpool::detail::call_wrapper_5d_tile_2d<const T>, 2339 const_cast<void*>(static_cast<const void*>(&functor)), 2340 range_i, 2341 range_j, 2342 range_k, 2343 range_l, 2344 range_m, 2345 tile_l, 2346 tile_m, 2347 flags); 2348 } 2349 2350 /** 2351 * Process items on a 6D grid. 2352 * 2353 * The function implements a parallel version of the following snippet: 2354 * 2355 * for (size_t i = 0; i < range_i; i++) 2356 * for (size_t j = 0; j < range_j; j++) 2357 * for (size_t k = 0; k < range_k; k++) 2358 * for (size_t l = 0; l < range_l; l++) 2359 * for (size_t m = 0; m < range_m; m++) 2360 * for (size_t n = 0; n < range_n; n++) 2361 * functor(i, j, k, l, m, n); 2362 * 2363 * When the function returns, all items have been processed and the thread pool 2364 * is ready for a new task. 
2365 * 2366 * @note If multiple threads call this function with the same thread pool, the 2367 * calls are serialized. 2368 * 2369 * @param threadpool the thread pool to use for parallelisation. If threadpool 2370 * is NULL, all items are processed serially on the calling thread. 2371 * @param functor the functor to call for each tile. 2372 * @param range_i the number of items to process along the first dimension 2373 * of the 6D grid. 2374 * @param range_j the number of items to process along the second dimension 2375 * of the 6D grid. 2376 * @param range_k the number of items to process along the third dimension 2377 * of the 6D grid. 2378 * @param range_l the number of items to process along the fourth dimension 2379 * of the 6D grid. 2380 * @param range_m the number of items to process along the fifth dimension 2381 * of the 6D grid. 2382 * @param range_n the number of items to process along the sixth dimension 2383 * of the 6D grid. 2384 * @param tile_n the maximum number of items along the sixth dimension of 2385 * the 6D grid to process in one functor call. 2386 * @param flags a bitwise combination of zero or more optional flags 2387 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2388 */ 2389 template<class T> 2390 inline void pthreadpool_parallelize_6d( 2391 pthreadpool_t threadpool, 2392 const T& functor, 2393 size_t range_i, 2394 size_t range_j, 2395 size_t range_k, 2396 size_t range_l, 2397 size_t range_m, 2398 size_t range_n, 2399 uint32_t flags = 0) 2400 { 2401 pthreadpool_parallelize_6d( 2402 threadpool, 2403 &libpthreadpool::detail::call_wrapper_6d<const T>, 2404 const_cast<void*>(static_cast<const void*>(&functor)), 2405 range_i, 2406 range_j, 2407 range_k, 2408 range_l, 2409 range_m, 2410 range_n, 2411 flags); 2412 } 2413 2414 /** 2415 * Process items on a 6D grid with the specified maximum tile size along the 2416 * last grid dimension. 
2417 * 2418 * The function implements a parallel version of the following snippet: 2419 * 2420 * for (size_t i = 0; i < range_i; i++) 2421 * for (size_t j = 0; j < range_j; j++) 2422 * for (size_t k = 0; k < range_k; k++) 2423 * for (size_t l = 0; l < range_l; l++) 2424 * for (size_t m = 0; m < range_m; m++) 2425 * for (size_t n = 0; n < range_n; n += tile_n) 2426 * functor(i, j, k, l, m, n, min(range_n - n, tile_n)); 2427 * 2428 * When the function returns, all items have been processed and the thread pool 2429 * is ready for a new task. 2430 * 2431 * @note If multiple threads call this function with the same thread pool, the 2432 * calls are serialized. 2433 * 2434 * @param threadpool the thread pool to use for parallelisation. If threadpool 2435 * is NULL, all items are processed serially on the calling thread. 2436 * @param functor the functor to call for each tile. 2437 * @param range_i the number of items to process along the first dimension 2438 * of the 6D grid. 2439 * @param range_j the number of items to process along the second dimension 2440 * of the 6D grid. 2441 * @param range_k the number of items to process along the third dimension 2442 * of the 6D grid. 2443 * @param range_l the number of items to process along the fourth dimension 2444 * of the 6D grid. 2445 * @param range_m the number of items to process along the fifth dimension 2446 * of the 6D grid. 2447 * @param range_n the number of items to process along the sixth dimension 2448 * of the 6D grid. 2449 * @param tile_n the maximum number of items along the sixth dimension of 2450 * the 6D grid to process in one functor call. 
2451 * @param flags a bitwise combination of zero or more optional flags 2452 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2453 */ 2454 template<class T> 2455 inline void pthreadpool_parallelize_6d_tile_1d( 2456 pthreadpool_t threadpool, 2457 const T& functor, 2458 size_t range_i, 2459 size_t range_j, 2460 size_t range_k, 2461 size_t range_l, 2462 size_t range_m, 2463 size_t range_n, 2464 size_t tile_n, 2465 uint32_t flags = 0) 2466 { 2467 pthreadpool_parallelize_6d_tile_1d( 2468 threadpool, 2469 &libpthreadpool::detail::call_wrapper_6d_tile_1d<const T>, 2470 const_cast<void*>(static_cast<const void*>(&functor)), 2471 range_i, 2472 range_j, 2473 range_k, 2474 range_l, 2475 range_m, 2476 range_n, 2477 tile_n, 2478 flags); 2479 } 2480 2481 /** 2482 * Process items on a 6D grid with the specified maximum tile size along the 2483 * last two grid dimensions. 2484 * 2485 * The function implements a parallel version of the following snippet: 2486 * 2487 * for (size_t i = 0; i < range_i; i++) 2488 * for (size_t j = 0; j < range_j; j++) 2489 * for (size_t k = 0; k < range_k; k++) 2490 * for (size_t l = 0; l < range_l; l++) 2491 * for (size_t m = 0; m < range_m; m += tile_m) 2492 * for (size_t n = 0; n < range_n; n += tile_n) 2493 * functor(i, j, k, l, m, n, 2494 * min(range_m - m, tile_m), min(range_n - n, tile_n)); 2495 * 2496 * When the function returns, all items have been processed and the thread pool 2497 * is ready for a new task. 2498 * 2499 * @note If multiple threads call this function with the same thread pool, the 2500 * calls are serialized. 2501 * 2502 * @param threadpool the thread pool to use for parallelisation. If threadpool 2503 * is NULL, all items are processed serially on the calling thread. 2504 * @param functor the functor to call for each tile. 2505 * @param range_i the number of items to process along the first dimension 2506 * of the 6D grid. 
2507 * @param range_j the number of items to process along the second dimension 2508 * of the 6D grid. 2509 * @param range_k the number of items to process along the third dimension 2510 * of the 6D grid. 2511 * @param range_l the number of items to process along the fourth dimension 2512 * of the 6D grid. 2513 * @param range_m the number of items to process along the fifth dimension 2514 * of the 6D grid. 2515 * @param range_n the number of items to process along the sixth dimension 2516 * of the 6D grid. 2517 * @param tile_m the maximum number of items along the fifth dimension of 2518 * the 6D grid to process in one functor call. 2519 * @param tile_n the maximum number of items along the sixth dimension of 2520 * the 6D grid to process in one functor call. 2521 * @param flags a bitwise combination of zero or more optional flags 2522 * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) 2523 */ 2524 template<class T> 2525 inline void pthreadpool_parallelize_6d_tile_2d( 2526 pthreadpool_t threadpool, 2527 const T& functor, 2528 size_t range_i, 2529 size_t range_j, 2530 size_t range_k, 2531 size_t range_l, 2532 size_t range_m, 2533 size_t range_n, 2534 size_t tile_m, 2535 size_t tile_n, 2536 uint32_t flags = 0) 2537 { 2538 pthreadpool_parallelize_6d_tile_2d( 2539 threadpool, 2540 &libpthreadpool::detail::call_wrapper_6d_tile_2d<const T>, 2541 const_cast<void*>(static_cast<const void*>(&functor)), 2542 range_i, 2543 range_j, 2544 range_k, 2545 range_l, 2546 range_m, 2547 range_n, 2548 tile_m, 2549 tile_n, 2550 flags); 2551 } 2552 2553 #endif /* __cplusplus */ 2554 2555 #endif /* PTHREADPOOL_H_ */
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |