aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrey Fedorov <andrey.fedorov@intel.com>2021-02-18 12:59:32 +0300
committerGitHub <noreply@github.com>2021-02-18 12:59:32 +0300
commit899a5b64c98ec3401f14b70d3fb301f794469fb6 (patch)
treea06a90d5205b857d19eb48ac58db8ec3de7acdc8
parentCMake: add openmp-simd compiler option (#116) (diff)
downloadllvm-project-899a5b64c98ec3401f14b70d3fb301f794469fb6.tar.gz
llvm-project-899a5b64c98ec3401f14b70d3fb301f794469fb6.tar.bz2
llvm-project-899a5b64c98ec3401f14b70d3fb301f794469fb6.zip
Fix for reduction stage of scan pattern (#125)
* fixed debug issue * some cosmetic changes * Update comment Co-authored-by: Dmitriy Sobolev <Dmitriy.Sobolev@intel.com> * remove extra braces Co-authored-by: Dmitriy Sobolev <Dmitriy.Sobolev@intel.com>
-rw-r--r--include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h30
1 files changed, 18 insertions, 12 deletions
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
index 241dec088393..1e905359338e 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -484,21 +484,21 @@ struct __scan
_InitType __init = __scan_no_init<typename _InitType::__value_type>{}) const
{
using _Tp = typename _InitType::__value_type;
- auto __group_id = __item.get_group(0);
- auto __global_id = __item.get_global_id(0);
- auto __local_id = __item.get_local_id(0);
- auto __use_init = __scan_init_processing<_Tp>{};
+ ::std::size_t __group_id = __item.get_group(0);
+ ::std::size_t __global_id = __item.get_global_id(0);
+ ::std::size_t __local_id = __item.get_local_id(0);
+ __scan_init_processing<_Tp> __use_init{};
- auto __shift = 0;
+ ::std::size_t __shift = 0;
__internal::__invoke_if_not(_Inclusive{}, [&]() {
__shift = 1;
if (__global_id == 0)
__use_init(__init, __out_acc[__global_id]);
});
- auto __adjusted_global_id = __local_id + __size_per_wg * __group_id;
+ ::std::size_t __adjusted_global_id = __local_id + __size_per_wg * __group_id;
auto __adder = __local_acc[0];
- for (auto __iter = 0; __iter < __iters_per_wg; ++__iter, __adjusted_global_id += __wgroup_size)
+ for (_ItersPerWG __iter = 0; __iter < __iters_per_wg; ++__iter, __adjusted_global_id += __wgroup_size)
{
if (__adjusted_global_id < __n)
{
@@ -513,14 +513,19 @@ struct __scan
__use_init(__init, __local_acc[__global_id], __bin_op);
// 1. reduce
- auto __k = 1;
+ ::std::size_t __k = 1;
+ // TODO: use adjacent work items for better SIMD utilization
+ // Consider the example with the mask of work items performing reduction:
+ // iter now proposed
+ // 1: 01010101 11110000
+ // 2: 00010001 11000000
+ // 3: 00000001 10000000
do
{
__item.barrier(sycl::access::fence_space::local_space);
- if (__local_id % (2 * __k) == 0 && __local_id + __k < __wgroup_size && __adjusted_global_id + __k < __n)
+ if (__adjusted_global_id < __n && __local_id % (2 * __k) == 2 * __k - 1)
{
- __local_acc[__local_id + 2 * __k - 1] =
- __bin_op(__local_acc[__local_id + __k - 1], __local_acc[__local_id + 2 * __k - 1]);
+ __local_acc[__local_id] = __bin_op(__local_acc[__local_id - __k], __local_acc[__local_id]);
}
__k *= 2;
} while (__k < __wgroup_size);
@@ -531,7 +536,8 @@ struct __scan
__k = 2;
do
{
- auto __shifted_local_id = __local_id - __local_id % __k - 1;
+ // use signed type to avoid overflowing
+ ::std::int32_t __shifted_local_id = __local_id - __local_id % __k - 1;
if (__shifted_local_id >= 0 && __adjusted_global_id < __n && __local_id % (2 * __k) >= __k &&
__local_id % (2 * __k) < 2 * __k - 1)
{