@@ -92,6 +92,24 @@ namespace hpx::execution::experimental::detail {
9292 (n + static_cast <std::size_t >(num_threads) - 1 ) / num_threads);
9393 }
9494
/// Pad a chunk size up to the next multiple of 16, as long as the chunk is
/// non-empty and strictly smaller than the overall iteration count.
///
/// \param chunk  proposed number of elements per chunk
/// \param size   total number of elements to be distributed
///
/// \returns \a chunk unchanged if it is 0 or not smaller than \a size;
///          otherwise \a chunk rounded up to a multiple of 16, capped at
///          \a size (so the result may not be a multiple of 16 when the
///          cap applies).
HPX_CXX_CORE_EXPORT constexpr std::uint32_t align_chunk_for_vectorization(
    std::uint32_t chunk, std::uint32_t const size) noexcept
{
    // Nothing to adjust for an empty chunk or for a chunk that already
    // covers the whole range.
    bool const adjustable = chunk != 0 && chunk < size;
    if (!adjustable)
    {
        return chunk;
    }

    // Do the arithmetic in 64 bits so that rounding up a value close to the
    // 32-bit maximum cannot wrap around.
    constexpr std::uint64_t multiple = 16;
    std::uint64_t const remainder = chunk % multiple;
    std::uint64_t const padded =
        remainder == 0 ? chunk : chunk + (multiple - remainder);

    // Never report a chunk larger than the total amount of work.
    std::uint64_t const limit = size;
    return static_cast<std::uint32_t>(padded < limit ? padded : limit);
}
112+
95113 // For bulk_unchunked: f(index, ...)
96114 HPX_CXX_CORE_EXPORT template <std::size_t ... Is, typename F, typename T,
97115 typename Ts>
@@ -183,9 +201,8 @@ namespace hpx::execution::experimental::detail {
183201
184202 auto const i_begin =
185203 static_cast <std::size_t >(index) * op_state->chunk_size ;
186- auto const i_end =
187- (std::min) (i_begin + op_state->chunk_size ,
188- static_cast <std::size_t >(op_state->size ));
204+ auto const i_end = (std::min) (i_begin + op_state->chunk_size ,
205+ static_cast <std::size_t >(op_state->size ));
189206
190207 if constexpr (OperationState::is_chunked)
191208 {
@@ -195,14 +212,14 @@ namespace hpx::execution::experimental::detail {
195212 }
196213 else
197214 {
198- // bulk_unchunked: f(index, values...) for each element
199- // In unchunked case, chunk_size is 1
200- // so each chunk will only have one element.
201- // The regular bulk invocation will go through the is_chunked case.
215+ // bulk_unchunked: one element call f(shape_index, values...) per i.
202216 auto it = std::ranges::next (
203217 hpx::util::begin (op_state->shape ), i_begin);
204- bulk_scheduler_invoke_helper (
205- index_pack_type{}, op_state->f , *it, ts);
218+ for (auto i = i_begin; i < i_end; ++i, ++it)
219+ {
220+ bulk_scheduler_invoke_helper (
221+ index_pack_type{}, op_state->f , *it, ts);
222+ }
206223 }
207224 }
208225
@@ -319,7 +336,8 @@ namespace hpx::execution::experimental::detail {
319336 // Otherwise, it will call set_value on the connected receiver.
320337 void finish () const
321338 {
322- if (--(op_state->tasks_remaining .data_ ) == 0 )
339+ if (op_state->tasks_remaining .data_ .fetch_sub (
340+ 1 , std::memory_order_acq_rel) == 1 )
323341 {
324342 if (op_state->bad_alloc_thrown .load (std::memory_order_relaxed))
325343 {
@@ -557,8 +575,16 @@ namespace hpx::execution::experimental::detail {
557575 }
558576 else
559577 {
560- chunk_size = 1 ;
561- num_chunks = size;
578+ chunk_size = get_bulk_scheduler_chunk_size (
579+ op_state->num_worker_threads , size);
580+ num_chunks = (size + chunk_size - 1 ) / chunk_size;
581+ }
582+
583+ if constexpr (OperationState::is_unsequenced &&
584+ OperationState::is_parallel)
585+ {
586+ chunk_size = align_chunk_for_vectorization (chunk_size, size);
587+ num_chunks = (size + chunk_size - 1 ) / chunk_size;
562588 }
563589
564590 // launch only as many tasks as we have chunks
@@ -723,6 +749,16 @@ namespace hpx::execution::experimental::detail {
723749#endif
724750 };
725751
#if !defined(HPX_HAVE_STDEXEC)
    // Forward declaration of thread_pool_bulk_sender carrying the default
    // template arguments (IsChunked = false, IsParallel = true,
    // IsUnsequenced = false).
    //
    // With stdexec, thread_pool_scheduler.hpp forward declares this template
    // with default arguments; without it, declare it here so that the class
    // definition below does not have to repeat the default template
    // arguments.
    //
    // NOTE(review): the declaration is marked HPX_CXX_CORE_EXPORT to match
    // the class definition below. Presumably the macro expands to `export`
    // in C++ module builds, where a redeclaration may only be exported if
    // the entity's first declaration was exported as well - confirm against
    // the macro's definition.
    HPX_CXX_CORE_EXPORT template <typename Policy, typename Sender,
        typename Shape, typename F, bool IsChunked = false,
        bool IsParallel = true, bool IsUnsequenced = false>
    class thread_pool_bulk_sender;
#endif
761+
726762 // This sender represents bulk work that will be performed using the
727763 // thread_pool_scheduler.
728764 //
@@ -740,8 +776,8 @@ namespace hpx::execution::experimental::detail {
740776 // threads.
741777 //
742778 HPX_CXX_CORE_EXPORT template <typename Policy, typename Sender,
743- typename Shape, typename F, bool IsChunked = false ,
744- bool IsParallel = true >
779+ typename Shape, typename F, bool IsChunked, bool IsParallel ,
780+ bool IsUnsequenced >
745781 class thread_pool_bulk_sender
746782 {
747783 private:
@@ -885,6 +921,7 @@ namespace hpx::execution::experimental::detail {
885921 {
886922 static constexpr bool is_chunked = IsChunked;
887923 static constexpr bool is_parallel = IsParallel;
924+ static constexpr bool is_unsequenced = IsUnsequenced;
888925
889926 using operation_state_type =
890927 hpx::execution::experimental::connect_result_t <Sender,
@@ -899,9 +936,11 @@ namespace hpx::execution::experimental::detail {
899936 bool reverse_placement = false ;
900937 bool allow_stealing = false ;
901938 hpx::threads::mask_type pu_mask;
939+
902940 std::vector<hpx::util::cache_aligned_data<
903941 hpx::concurrency::detail::non_contiguous_index_queue<>>>
904942 queues;
943+
905944 HPX_NO_UNIQUE_ADDRESS std::decay_t <Shape> shape;
906945 HPX_NO_UNIQUE_ADDRESS std::decay_t <F> f;
907946 HPX_NO_UNIQUE_ADDRESS std::decay_t <Receiver> receiver;
0 commit comments