Skip to content

Commit 83b4eb9

Browse files
committed
parallel scheduler uses cached mask
1 parent 55b419e commit 83b4eb9

5 files changed

Lines changed: 268 additions & 27 deletions

File tree

libs/core/executors/include/hpx/executors/parallel_scheduler.hpp

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,13 @@ namespace hpx::execution::experimental {
5959
// Get the parallel_scheduler from the child sender's
6060
// completion scheduler (completes_on pattern)
6161
auto par_sched = [&]() {
62-
if constexpr (hpx::is_invocable_v<
63-
hpx::execution::experimental::
64-
get_completion_scheduler_t<hpx::
65-
execution::experimental::
66-
set_value_t>,
67-
decltype(hpx::execution::experimental::
68-
get_env(child))>)
62+
if constexpr (
63+
hpx::is_invocable_v<
64+
hpx::execution::experimental::
65+
get_completion_scheduler_t<
66+
hpx::execution::experimental::set_value_t>,
67+
decltype(hpx::execution::experimental::get_env(
68+
child))>)
6969
{
7070
return hpx::execution::experimental::
7171
get_completion_scheduler<
@@ -93,15 +93,18 @@ namespace hpx::execution::experimental {
9393
constexpr bool is_parallel =
9494
!is_sequenced_policy_v<std::decay_t<decltype(pol.__get())>>;
9595

96+
constexpr bool is_unsequenced = is_unsequenced_bulk_policy_v<
97+
std::decay_t<decltype(pol.__get())>>;
98+
9699
// Pass the pre-cached PU mask so thread_pool_bulk_sender
97100
// skips its own full_mask() computation on every invocation.
98101
hpx::threads::mask_type pu_mask = par_sched.get_pu_mask();
99102
return hpx::execution::experimental::detail::
100103
thread_pool_bulk_sender<hpx::launch,
101104
std::decay_t<decltype(child)>,
102105
std::decay_t<decltype(iota_shape)>,
103-
std::decay_t<decltype(f)>, is_chunked, is_parallel>(
104-
HPX_MOVE(underlying),
106+
std::decay_t<decltype(f)>, is_chunked, is_parallel,
107+
is_unsequenced>(HPX_MOVE(underlying),
105108
HPX_FORWARD(decltype(child), child),
106109
HPX_MOVE(iota_shape), HPX_FORWARD(decltype(f), f),
107110
HPX_MOVE(pu_mask));

libs/core/executors/include/hpx/executors/scheduler_executor.hpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
#if defined(HPX_HAVE_STDEXEC)
2222
#include <hpx/executors/detail/index_queue_spawning.hpp>
23+
#include <hpx/executors/parallel_scheduler.hpp>
2324
#endif
2425

2526
#include <cstddef>
@@ -47,10 +48,48 @@ namespace hpx::execution::experimental {
4748
{
4849
};
4950

51+
// parallel_scheduler wraps thread_pool_policy_scheduler; use the same
52+
// index_queue fast path with thread_pool_params<parallel_scheduler>
53+
// so pu_mask() can return the cached mask from get_pu_mask().
54+
template <>
55+
struct has_thread_pool_backend<parallel_scheduler> : std::true_type
56+
{
57+
};
58+
5059
// Helper to extract thread pool parameters from a scheduler
5160
template <typename Scheduler>
5261
struct thread_pool_params; // primary: not defined
5362

63+
template <>
64+
struct thread_pool_params<parallel_scheduler>
65+
{
66+
static auto* pool(parallel_scheduler const& sched)
67+
{
68+
return sched.get_underlying_scheduler().get_thread_pool();
69+
}
70+
static std::size_t first_core(parallel_scheduler const& sched)
71+
{
72+
return hpx::execution::experimental::get_first_core(
73+
sched.get_underlying_scheduler());
74+
}
75+
static std::size_t num_cores(parallel_scheduler const& sched)
76+
{
77+
return hpx::execution::experimental::processing_units_count(
78+
hpx::execution::experimental::null_parameters,
79+
sched.get_underlying_scheduler(),
80+
hpx::chrono::null_duration, 0);
81+
}
82+
static auto const& policy(parallel_scheduler const& sched)
83+
{
84+
return sched.get_underlying_scheduler().policy();
85+
}
86+
static hpx::threads::mask_type pu_mask(
87+
parallel_scheduler const& sched)
88+
{
89+
return sched.get_pu_mask();
90+
}
91+
};
92+
5493
template <typename Policy>
5594
struct thread_pool_params<thread_pool_policy_scheduler<Policy>>
5695
{

libs/core/executors/include/hpx/executors/thread_pool_scheduler.hpp

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
// Forward declaration
3030
namespace hpx::execution::experimental::detail {
3131
template <typename Policy, typename Sender, typename Shape, typename F,
32-
bool IsChunked, bool IsParallel>
32+
bool IsChunked = false, bool IsParallel = true,
33+
bool IsUnsequenced = false>
3334
class thread_pool_bulk_sender;
3435
}
3536
#endif
@@ -85,6 +86,19 @@ namespace hpx::execution::experimental {
8586
inline constexpr bool is_sequenced_policy_v<stdexec::unsequenced_policy> =
8687
true;
8788

89+
// True for unseq and par_unseq
90+
template <typename Policy>
91+
inline constexpr bool is_unsequenced_bulk_policy_v = false;
92+
93+
template <>
94+
inline constexpr bool
95+
is_unsequenced_bulk_policy_v<stdexec::unsequenced_policy> = true;
96+
97+
template <>
98+
inline constexpr bool
99+
is_unsequenced_bulk_policy_v<stdexec::parallel_unsequenced_policy> =
100+
true;
101+
88102
// Domain customization for stdexec bulk operations
89103
// Only the env-based transform_sender is provided. The early (no-env)
90104
// transform falls through to default_domain, and the late transform
@@ -129,12 +143,23 @@ namespace hpx::execution::experimental {
129143
constexpr bool is_parallel =
130144
!is_sequenced_policy_v<std::decay_t<decltype(pol.__get())>>;
131145

146+
constexpr bool is_unsequenced = is_unsequenced_bulk_policy_v<
147+
std::decay_t<decltype(pol.__get())>>;
148+
149+
// Pre-compute the PU mask once and pass it to the 5-arg
150+
// constructor to avoid the expensive full_mask() call (O(N^2))
151+
// that the 4-arg constructor would trigger on every bulk
152+
// operation.
153+
auto pu_mask =
154+
hpx::execution::experimental::get_processing_units_mask(sched);
155+
132156
return hpx::execution::experimental::detail::
133157
thread_pool_bulk_sender<Policy, std::decay_t<decltype(child)>,
134158
std::decay_t<decltype(iota_shape)>,
135-
std::decay_t<decltype(f)>, is_chunked, is_parallel>{
136-
HPX_MOVE(sched), HPX_FORWARD(decltype(child), child),
137-
HPX_MOVE(iota_shape), HPX_FORWARD(decltype(f), f)};
159+
std::decay_t<decltype(f)>, is_chunked, is_parallel,
160+
is_unsequenced>{HPX_MOVE(sched),
161+
HPX_FORWARD(decltype(child), child), HPX_MOVE(iota_shape),
162+
HPX_FORWARD(decltype(f), f), HPX_MOVE(pu_mask)};
138163
}
139164
};
140165

libs/core/executors/include/hpx/executors/thread_pool_scheduler_bulk.hpp

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,24 @@ namespace hpx::execution::experimental::detail {
9292
(n + static_cast<std::size_t>(num_threads) - 1) / num_threads);
9393
}
9494

95+
/// Round a chunk up to a multiple of 16 when it is
96+
/// smaller than size; the rounded value is capped at size
97+
HPX_CXX_CORE_EXPORT constexpr std::uint32_t align_chunk_for_vectorization(
98+
std::uint32_t chunk, std::uint32_t const size) noexcept
99+
{
100+
constexpr std::uint32_t g = 16;
101+
if (chunk == 0 || chunk >= size)
102+
return chunk;
103+
std::uint64_t c = chunk;
104+
if (c % g != 0)
105+
{
106+
c = ((c + g - 1) / g) * g;
107+
}
108+
if (c > size)
109+
c = size;
110+
return static_cast<std::uint32_t>(c);
111+
}
112+
95113
// For bulk_unchunked: f(index, ...)
96114
HPX_CXX_CORE_EXPORT template <std::size_t... Is, typename F, typename T,
97115
typename Ts>
@@ -183,9 +201,8 @@ namespace hpx::execution::experimental::detail {
183201

184202
auto const i_begin =
185203
static_cast<std::size_t>(index) * op_state->chunk_size;
186-
auto const i_end =
187-
(std::min) (i_begin + op_state->chunk_size,
188-
static_cast<std::size_t>(op_state->size));
204+
auto const i_end = (std::min) (i_begin + op_state->chunk_size,
205+
static_cast<std::size_t>(op_state->size));
189206

190207
if constexpr (OperationState::is_chunked)
191208
{
@@ -195,14 +212,14 @@ namespace hpx::execution::experimental::detail {
195212
}
196213
else
197214
{
198-
// bulk_unchunked: f(index, values...) for each element
199-
// In unchunked case, chunk_size is 1
200-
// so each chunk will only have one element.
201-
// The regular bulk invocation will go through the is_chunked case.
215+
// bulk_unchunked: one element call f(shape_index, values...) per i.
202216
auto it = std::ranges::next(
203217
hpx::util::begin(op_state->shape), i_begin);
204-
bulk_scheduler_invoke_helper(
205-
index_pack_type{}, op_state->f, *it, ts);
218+
for (auto i = i_begin; i < i_end; ++i, ++it)
219+
{
220+
bulk_scheduler_invoke_helper(
221+
index_pack_type{}, op_state->f, *it, ts);
222+
}
206223
}
207224
}
208225

@@ -319,7 +336,8 @@ namespace hpx::execution::experimental::detail {
319336
// Otherwise, it will call set_value on the connected receiver.
320337
void finish() const
321338
{
322-
if (--(op_state->tasks_remaining.data_) == 0)
339+
if (op_state->tasks_remaining.data_.fetch_sub(
340+
1, std::memory_order_acq_rel) == 1)
323341
{
324342
if (op_state->bad_alloc_thrown.load(std::memory_order_relaxed))
325343
{
@@ -557,8 +575,16 @@ namespace hpx::execution::experimental::detail {
557575
}
558576
else
559577
{
560-
chunk_size = 1;
561-
num_chunks = size;
578+
chunk_size = get_bulk_scheduler_chunk_size(
579+
op_state->num_worker_threads, size);
580+
num_chunks = (size + chunk_size - 1) / chunk_size;
581+
}
582+
583+
if constexpr (OperationState::is_unsequenced &&
584+
OperationState::is_parallel)
585+
{
586+
chunk_size = align_chunk_for_vectorization(chunk_size, size);
587+
num_chunks = (size + chunk_size - 1) / chunk_size;
562588
}
563589

564590
// launch only as many tasks as we have chunks
@@ -723,6 +749,16 @@ namespace hpx::execution::experimental::detail {
723749
#endif
724750
};
725751

752+
#if !defined(HPX_HAVE_STDEXEC)
753+
// With stdexec, thread_pool_scheduler.hpp forward declares this template
754+
// with default arguments; without it, declare here so the definition below
755+
// does not repeat default template arguments.
756+
template <typename Policy, typename Sender, typename Shape, typename F,
757+
bool IsChunked = false, bool IsParallel = true,
758+
bool IsUnsequenced = false>
759+
class thread_pool_bulk_sender;
760+
#endif
761+
726762
// This sender represents bulk work that will be performed using the
727763
// thread_pool_scheduler.
728764
//
@@ -740,8 +776,8 @@ namespace hpx::execution::experimental::detail {
740776
// threads.
741777
//
742778
HPX_CXX_CORE_EXPORT template <typename Policy, typename Sender,
743-
typename Shape, typename F, bool IsChunked = false,
744-
bool IsParallel = true>
779+
typename Shape, typename F, bool IsChunked, bool IsParallel,
780+
bool IsUnsequenced>
745781
class thread_pool_bulk_sender
746782
{
747783
private:
@@ -885,6 +921,7 @@ namespace hpx::execution::experimental::detail {
885921
{
886922
static constexpr bool is_chunked = IsChunked;
887923
static constexpr bool is_parallel = IsParallel;
924+
static constexpr bool is_unsequenced = IsUnsequenced;
888925

889926
using operation_state_type =
890927
hpx::execution::experimental::connect_result_t<Sender,
@@ -899,9 +936,11 @@ namespace hpx::execution::experimental::detail {
899936
bool reverse_placement = false;
900937
bool allow_stealing = false;
901938
hpx::threads::mask_type pu_mask;
939+
902940
std::vector<hpx::util::cache_aligned_data<
903941
hpx::concurrency::detail::non_contiguous_index_queue<>>>
904942
queues;
943+
905944
HPX_NO_UNIQUE_ADDRESS std::decay_t<Shape> shape;
906945
HPX_NO_UNIQUE_ADDRESS std::decay_t<F> f;
907946
HPX_NO_UNIQUE_ADDRESS std::decay_t<Receiver> receiver;

0 commit comments

Comments
 (0)