diff options
author | Andrey Fedorov <andrey.fedorov@intel.com> | 2021-02-18 12:59:32 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-18 12:59:32 +0300 |
commit | 899a5b64c98ec3401f14b70d3fb301f794469fb6 (patch) | |
tree | a06a90d5205b857d19eb48ac58db8ec3de7acdc8 | |
parent | CMake: add openmp-simd compiler option (#116) (diff) | |
download | llvm-project-899a5b64c98ec3401f14b70d3fb301f794469fb6.tar.gz llvm-project-899a5b64c98ec3401f14b70d3fb301f794469fb6.tar.bz2 llvm-project-899a5b64c98ec3401f14b70d3fb301f794469fb6.zip |
Fix for reduction stage of scan pattern (#125)
* fixed debug issue
* some cosmetic changes
* Update comment
Co-authored-by: Dmitriy Sobolev <Dmitriy.Sobolev@intel.com>
* remove extra braces
Co-authored-by: Dmitriy Sobolev <Dmitriy.Sobolev@intel.com>
-rw-r--r-- | include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h | 30 |
1 files changed, 18 insertions, 12 deletions
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 241dec088393..1e905359338e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -484,21 +484,21 @@ struct __scan _InitType __init = __scan_no_init<typename _InitType::__value_type>{}) const { using _Tp = typename _InitType::__value_type; - auto __group_id = __item.get_group(0); - auto __global_id = __item.get_global_id(0); - auto __local_id = __item.get_local_id(0); - auto __use_init = __scan_init_processing<_Tp>{}; + ::std::size_t __group_id = __item.get_group(0); + ::std::size_t __global_id = __item.get_global_id(0); + ::std::size_t __local_id = __item.get_local_id(0); + __scan_init_processing<_Tp> __use_init{}; - auto __shift = 0; + ::std::size_t __shift = 0; __internal::__invoke_if_not(_Inclusive{}, [&]() { __shift = 1; if (__global_id == 0) __use_init(__init, __out_acc[__global_id]); }); - auto __adjusted_global_id = __local_id + __size_per_wg * __group_id; + ::std::size_t __adjusted_global_id = __local_id + __size_per_wg * __group_id; auto __adder = __local_acc[0]; - for (auto __iter = 0; __iter < __iters_per_wg; ++__iter, __adjusted_global_id += __wgroup_size) + for (_ItersPerWG __iter = 0; __iter < __iters_per_wg; ++__iter, __adjusted_global_id += __wgroup_size) { if (__adjusted_global_id < __n) { @@ -513,14 +513,19 @@ struct __scan __use_init(__init, __local_acc[__global_id], __bin_op); // 1. reduce - auto __k = 1; + ::std::size_t __k = 1; + // TODO: use adjacent work items for better SIMD utilization + // Consider the example with the mask of work items performing reduction: + // iter now proposed + // 1: 01010101 11110000 + // 2: 00010001 11000000 + // 3: 00000001 10000000 do { __item.barrier(sycl::access::fence_space::local_space); - if (__local_id % (2 * __k) == 0 && __local_id + __k < __wgroup_size && __adjusted_global_id + __k < __n) + if (__adjusted_global_id < __n && __local_id % (2 * __k) == 2 * __k - 1) { - __local_acc[__local_id + 2 * __k - 1] = - __bin_op(__local_acc[__local_id + __k - 1], __local_acc[__local_id + 2 * __k - 1]); + __local_acc[__local_id] = __bin_op(__local_acc[__local_id - __k], __local_acc[__local_id]); } __k *= 2; } while (__k < __wgroup_size); @@ -531,7 +536,8 @@ struct __scan __k = 2; do { - auto __shifted_local_id = __local_id - __local_id % __k - 1; + // use signed type to avoid overflowing + ::std::int32_t __shifted_local_id = __local_id - __local_id % __k - 1; if (__shifted_local_id >= 0 && __adjusted_global_id < __n && __local_id % (2 * __k) >= __k && __local_id % (2 * __k) < 2 * __k - 1) { |