
Commit 4053a56

Every thread stores its own scheduler-mode now
Signed-off-by: Hartmut Kaiser <hartmut.kaiser@gmail.com>
1 parent: a29b1c4 · commit: 4053a56

32 files changed: 589 additions & 275 deletions
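The common thread across the diffs below: hpx::threads::add_scheduler_mode and hpx::threads::remove_scheduler_mode now accept a PU mask, so a mode bit can be flipped for a subset of worker threads instead of for the scheduler as a whole. A minimal sketch of the new call shape, with the signatures inferred from the call sites in this commit (not checked against the headers):

    // Inferred from the call sites in this commit: apply/clear a scheduler
    // mode only on the worker threads selected by the given PU mask.
    auto const full_pu_mask =
        hpx::resource::get_partitioner().get_used_pus_mask();

    hpx::threads::add_scheduler_mode(
        hpx::threads::policies::scheduler_mode::fast_idle_mode, full_pu_mask);
    // ... some work ...
    hpx::threads::remove_scheduler_mode(
        hpx::threads::policies::scheduler_mode::fast_idle_mode, full_pu_mask);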


libs/core/affinity/include/hpx/affinity/affinity_data.hpp

Lines changed: 2 additions & 2 deletions
@@ -58,8 +58,8 @@ namespace hpx::threads::policies::detail {
         mask_type get_pu_mask(
             threads::topology const& topo, std::size_t global_thread_num) const;

-        mask_type get_used_pus_mask(
-            threads::topology const& topo, std::size_t pu_num) const;
+        mask_type get_used_pus_mask(threads::topology const& topo,
+            std::size_t pu_num = static_cast<std::size_t>(-1)) const;
         std::size_t get_thread_occupancy(
             threads::topology const& topo, std::size_t pu_num) const;
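The defaulted pu_num acts as an "all PUs" sentinel: calling get_used_pus_mask without a PU number now yields the mask covering every used processing unit. A sketch of the two call forms (the caller is hypothetical; the declarations are those from the header above):

    // Hypothetical caller illustrating the sentinel convention: leaving
    // pu_num at its default of std::size_t(-1) requests the union mask over
    // all used PUs instead of the mask for one specific PU.
    void example(hpx::threads::policies::detail::affinity_data const& data,
        hpx::threads::topology const& topo)
    {
        auto const one_pu = data.get_used_pus_mask(topo, 0);    // PU 0 only
        auto const all_pus = data.get_used_pus_mask(topo);      // all used PUs
    }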

libs/core/affinity/src/affinity_data.cpp

Lines changed: 4 additions & 5 deletions
@@ -234,27 +234,26 @@ namespace hpx::threads::policies::detail {
         threads::resize(ret, overall_threads);

         // --hpx:bind=none disables all affinity
-        if (threads::test(no_affinity_, pu_num))
+        if (static_cast<std::size_t>(-1) != pu_num &&
+            threads::test(no_affinity_, pu_num))
         {
             threads::set(ret, pu_num);
             return ret;
         }

-        // clang-format off
         for (std::size_t thread_num = 0; thread_num != num_threads_;
             ++thread_num)
-        // clang-format on
         {
             auto const thread_mask = get_pu_mask(topo, thread_num);
             for (std::size_t i = 0; i != overall_threads; ++i)
             {
-                if (threads::test(thread_mask, i))
+                if (threads::test(no_affinity_, i) ||
+                    threads::test(thread_mask, i))
                 {
                     threads::set(ret, i);
                 }
             }
         }
-
         return ret;
     }
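In effect, when no specific PU is requested the function now returns the union of every worker thread's PU mask plus any PUs exempted via --hpx:bind=none. A self-contained sketch of that union over plain std::bitset (standing in for HPX's mask_type):

    #include <bitset>
    #include <vector>

    // Standalone illustration of the union computed above: the result covers
    // every PU bound to some worker thread plus all PUs on which affinity
    // was disabled (--hpx:bind=none).
    std::bitset<64> used_pus_mask(
        std::vector<std::bitset<64>> const& thread_masks,
        std::bitset<64> const& no_affinity)
    {
        std::bitset<64> ret = no_affinity;
        for (auto const& mask : thread_masks)
            ret |= mask;
        return ret;
    }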

libs/core/algorithms/tests/performance/benchmark_merge.cpp

Lines changed: 26 additions & 5 deletions
@@ -62,12 +62,19 @@ double run_merge_benchmark_hpx(int const test_count, ExPolicy policy,
     // warmup
     hpx::merge(policy, first1, last1, first2, last2, dest);

+#if HPX_HAVE_ITTNOTIFY != 0 && !defined(HPX_HAVE_APEX)
+    auto local_policy = hpx::execution::experimental::with_annotation(
+        policy, "run_merge_benchmark_hpx (child)");
+#else
+    auto local_policy = policy;
+#endif
+
     // actual measurement
     std::uint64_t time = hpx::chrono::high_resolution_clock::now();

     for (int i = 0; i < test_count; ++i)
     {
-        hpx::merge(policy, first1, last1, first2, last2, dest);
+        hpx::merge(local_policy, first1, last1, first2, last2, dest);
     }

     time = hpx::chrono::high_resolution_clock::now() - time;

@@ -138,19 +145,33 @@ struct enable_fast_idle_mode
     template <typename Executor>
     friend void tag_override_invoke(
         hpx::execution::experimental::mark_begin_execution_t,
-        enable_fast_idle_mode, Executor&&)
+        enable_fast_idle_mode, Executor&& exec)
     {
+        auto const pu_mask =
+            hpx::execution::experimental::get_processing_units_mask(exec);
+        auto const full_pu_mask =
+            hpx::resource::get_partitioner().get_used_pus_mask();
+
+        // Enable fast-idle mode only for PU's that are not used by this
+        // algorithm invocation.
         hpx::threads::add_scheduler_mode(
-            hpx::threads::policies::scheduler_mode::fast_idle_mode);
+            hpx::threads::policies::scheduler_mode::fast_idle_mode,
+            full_pu_mask & ~pu_mask);
     }

     template <typename Executor>
     friend void tag_override_invoke(
         hpx::execution::experimental::mark_end_execution_t,
-        enable_fast_idle_mode, Executor&&)
+        enable_fast_idle_mode, Executor&& exec)
     {
+        auto const pu_mask =
+            hpx::execution::experimental::get_processing_units_mask(exec);
+        auto const full_pu_mask =
+            hpx::resource::get_partitioner().get_used_pus_mask();
+
         hpx::threads::remove_scheduler_mode(
-            hpx::threads::policies::scheduler_mode::fast_idle_mode);
+            hpx::threads::policies::scheduler_mode::fast_idle_mode,
+            full_pu_mask & ~pu_mask);
     }
 };
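Both hooks compute full_pu_mask & ~pu_mask, that is, the used PUs this invocation does not occupy: only those workers are switched into fast-idle mode, so idle threads back off quickly while the busy ones keep polling. Since the add/remove calls must stay paired, one might wrap them in a RAII guard; a hypothetical sketch (not part of the commit) using the same APIs as the diff:

    // Hypothetical RAII guard: enables fast-idle mode on the given PUs on
    // construction and reliably removes it again on scope exit, even if an
    // exception is thrown in between.
    struct scoped_fast_idle_mode
    {
        explicit scoped_fast_idle_mode(hpx::threads::mask_type mask)
          : mask_(mask)
        {
            hpx::threads::add_scheduler_mode(
                hpx::threads::policies::scheduler_mode::fast_idle_mode, mask_);
        }

        ~scoped_fast_idle_mode()
        {
            hpx::threads::remove_scheduler_mode(
                hpx::threads::policies::scheduler_mode::fast_idle_mode, mask_);
        }

        hpx::threads::mask_type mask_;
    };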

libs/core/algorithms/tests/performance/benchmark_merge_sweep.cpp

Lines changed: 18 additions & 4 deletions
@@ -169,19 +169,33 @@ struct enable_fast_idle_mode
     template <typename Executor>
     friend void tag_override_invoke(
         hpx::execution::experimental::mark_begin_execution_t,
-        enable_fast_idle_mode, Executor&&)
+        enable_fast_idle_mode, Executor&& exec)
     {
+        auto const pu_mask =
+            hpx::execution::experimental::get_processing_units_mask(exec);
+        auto const full_pu_mask =
+            hpx::resource::get_partitioner().get_used_pus_mask();
+
+        // Enable fast-idle mode only for PU's that are not used by this
+        // algorithm invocation.
         hpx::threads::add_scheduler_mode(
-            hpx::threads::policies::scheduler_mode::fast_idle_mode);
+            hpx::threads::policies::scheduler_mode::fast_idle_mode,
+            full_pu_mask & ~pu_mask);
     }

     template <typename Executor>
     friend void tag_override_invoke(
         hpx::execution::experimental::mark_end_execution_t,
-        enable_fast_idle_mode, Executor&&)
+        enable_fast_idle_mode, Executor&& exec)
     {
+        auto const pu_mask =
+            hpx::execution::experimental::get_processing_units_mask(exec);
+        auto const full_pu_mask =
+            hpx::resource::get_partitioner().get_used_pus_mask();
+
         hpx::threads::remove_scheduler_mode(
-            hpx::threads::policies::scheduler_mode::fast_idle_mode);
+            hpx::threads::policies::scheduler_mode::fast_idle_mode,
+            full_pu_mask & ~pu_mask);
     }
 };

libs/core/compute_local/include/hpx/compute_local/host/block_executor.hpp

Lines changed: 38 additions & 18 deletions
@@ -224,34 +224,54 @@ namespace hpx::compute::host {
             hpx::parallel::execution::detail::bulk_execute_result_t<F,
                 Shape, Ts...>;

-            std::vector<result_type> results;
             std::size_t cnt = util::size(shape);
             std::size_t const num_executors = executors_.size();

-            results.reserve(cnt);
-
             try
             {
-                auto begin = util::begin(shape);
-                for (std::size_t i = 0; i != num_executors; ++i)
+                if constexpr (!std::is_void_v<result_type>)
                 {
-                    std::size_t part_begin_offset = (i * cnt) / num_executors;
-                    std::size_t part_end_offset =
-                        ((i + 1) * cnt) / num_executors;
-                    auto part_begin = begin;
-                    auto part_end = begin;
-                    std::advance(part_begin, part_begin_offset);
-                    std::advance(part_end, part_end_offset);
-                    auto part_results =
+                    std::vector<result_type> results;
+                    results.reserve(cnt);
+
+                    auto begin = util::begin(shape);
+                    for (std::size_t i = 0; i != num_executors; ++i)
+                    {
+                        std::size_t part_begin_offset =
+                            (i * cnt) / num_executors;
+                        std::size_t part_end_offset =
+                            ((i + 1) * cnt) / num_executors;
+                        auto part_begin = std::next(begin, part_begin_offset);
+                        auto part_end = std::next(begin, part_end_offset);
+                        auto part_results =
+                            hpx::parallel::execution::bulk_sync_execute(
+                                executors_[i], HPX_FORWARD(F, f),
+                                util::iterator_range(part_begin, part_end),
+                                HPX_FORWARD(Ts, ts)...);
+                        results.emplace(results.end(),
+                            std::make_move_iterator(part_results.begin()),
+                            std::make_move_iterator(part_results.end()));
+                    }
+
+                    return results;
+                }
+                else
+                {
+                    auto begin = util::begin(shape);
+                    for (std::size_t i = 0; i != num_executors; ++i)
+                    {
+                        std::size_t part_begin_offset =
+                            (i * cnt) / num_executors;
+                        std::size_t part_end_offset =
+                            ((i + 1) * cnt) / num_executors;
+                        auto part_begin = std::next(begin, part_begin_offset);
+                        auto part_end = std::next(begin, part_end_offset);
                         hpx::parallel::execution::bulk_sync_execute(
                             executors_[i], HPX_FORWARD(F, f),
-                            util::iterator_range(begin, part_end),
+                            util::iterator_range(part_begin, part_end),
                             HPX_FORWARD(Ts, ts)...);
-                    results.emplace(results.end(),
-                        std::make_move_iterator(part_results.begin()),
-                        std::make_move_iterator(part_results.end()));
+                    }
                 }
-                return results;
             }
             catch (std::bad_alloc const&)
             {
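The rewrite splits on whether result_type is void: the old code always built a std::vector<result_type>, which cannot be instantiated for void-returning callables, and it also passed util::iterator_range(begin, part_end) to every partition, so each chunk appears to have restarted at the first element; the new code addresses both. A minimal, self-contained sketch of the if constexpr dispatch on a possibly-void result:

    #include <type_traits>
    #include <vector>

    // Minimal sketch of the dispatch above: gather per-chunk results only
    // when the callable returns a value; for void there is nothing to store.
    template <typename F>
    auto bulk_invoke(F f, int chunks)
    {
        using result_type = decltype(f(0));
        if constexpr (!std::is_void_v<result_type>)
        {
            std::vector<result_type> results;
            results.reserve(chunks);
            for (int i = 0; i != chunks; ++i)
                results.push_back(f(i));
            return results;    // one entry per chunk
        }
        else
        {
            for (int i = 0; i != chunks; ++i)
                f(i);    // run for side effects only
        }
    }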

libs/core/compute_local/tests/unit/numa_allocator.cpp

Lines changed: 4 additions & 1 deletion
@@ -226,8 +226,11 @@ void init_resource_partitioner_handler(
     numa_scheduler::init_parameter_type scheduler_init(
         init.num_threads_, {1, 1, 64}, init.affinity_data_,
         thread_queue_init, "shared-priority-scheduler");
+
+    auto const full_mask =
+        hpx::resource::get_partitioner().get_pool_pus_mask(init.name_);
     std::unique_ptr<numa_scheduler> scheduler(
-        new numa_scheduler(scheduler_init));
+        new numa_scheduler(scheduler_init, full_mask));

     scheduler_mode mode =
         scheduler_mode(scheduler_mode::do_background_work |
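Note the contrast with the benchmarks above: here the scheduler is seeded with the PU mask of its own pool (get_pool_pus_mask(init.name_)), not with the runtime-wide used-PU mask, so its per-thread mode state covers exactly the PUs it owns. A short sketch of the two queries as used in this commit ("default" is a hypothetical pool name):

    auto& rp = hpx::resource::get_partitioner();

    // PUs used by the runtime as a whole (what the benchmarks above query).
    auto const used_mask = rp.get_used_pus_mask();

    // PUs owned by one specific thread pool; this is what the scheduler is
    // constructed with ("default" is a hypothetical pool name).
    auto const pool_mask = rp.get_pool_pus_mask("default");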
