
Commit 4053a56

Every thread stores its own scheduler-mode now
Signed-off-by: Hartmut Kaiser <hartmut.kaiser@gmail.com>
1 parent: a29b1c4 · commit: 4053a56

32 files changed: 589 additions & 275 deletions
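The common thread across the diffs below: hpx::threads::add_scheduler_mode and hpx::threads::remove_scheduler_mode now accept a PU mask, so a mode bit can be flipped for a subset of worker threads instead of for the scheduler as a whole. A minimal sketch of the new call shape, with the signatures inferred from the call sites in this commit (not checked against the headers):

    // Inferred from the call sites in this commit: apply/clear a scheduler
    // mode only on the worker threads selected by the given PU mask.
    auto const full_pu_mask =
        hpx::resource::get_partitioner().get_used_pus_mask();

    hpx::threads::add_scheduler_mode(
        hpx::threads::policies::scheduler_mode::fast_idle_mode, full_pu_mask);
    // ... some work ...
    hpx::threads::remove_scheduler_mode(
        hpx::threads::policies::scheduler_mode::fast_idle_mode, full_pu_mask);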


libs/core/affinity/include/hpx/affinity/affinity_data.hpp

Lines changed: 2 additions & 2 deletions
@@ -58,8 +58,8 @@ namespace hpx::threads::policies::detail {
         mask_type get_pu_mask(
             threads::topology const& topo, std::size_t global_thread_num) const;

-        mask_type get_used_pus_mask(
-            threads::topology const& topo, std::size_t pu_num) const;
+        mask_type get_used_pus_mask(threads::topology const& topo,
+            std::size_t pu_num = static_cast<std::size_t>(-1)) const;
         std::size_t get_thread_occupancy(
             threads::topology const& topo, std::size_t pu_num) const;
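The defaulted pu_num acts as an "all PUs" sentinel: calling get_used_pus_mask without a PU number now yields the mask covering every used processing unit. A sketch of the two call forms (the caller is hypothetical; the declarations are those from the header above):

    // Hypothetical caller illustrating the sentinel convention: leaving
    // pu_num at its default of std::size_t(-1) requests the union mask over
    // all used PUs instead of the mask for one specific PU.
    void example(hpx::threads::policies::detail::affinity_data const& data,
        hpx::threads::topology const& topo)
    {
        auto const one_pu = data.get_used_pus_mask(topo, 0);    // PU 0 only
        auto const all_pus = data.get_used_pus_mask(topo);      // all used PUs
    }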

libs/core/affinity/src/affinity_data.cpp

Lines changed: 4 additions & 5 deletions
@@ -234,27 +234,26 @@ namespace hpx::threads::policies::detail {
         threads::resize(ret, overall_threads);

         // --hpx:bind=none disables all affinity
-        if (threads::test(no_affinity_, pu_num))
+        if (static_cast<std::size_t>(-1) != pu_num &&
+            threads::test(no_affinity_, pu_num))
         {
             threads::set(ret, pu_num);
             return ret;
         }

-        // clang-format off
         for (std::size_t thread_num = 0; thread_num != num_threads_;
             ++thread_num)
-        // clang-format on
         {
             auto const thread_mask = get_pu_mask(topo, thread_num);
             for (std::size_t i = 0; i != overall_threads; ++i)
             {
-                if (threads::test(thread_mask, i))
+                if (threads::test(no_affinity_, i) ||
+                    threads::test(thread_mask, i))
                 {
                     threads::set(ret, i);
                 }
             }
         }
-
         return ret;
     }
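In effect, when no specific PU is requested the function now returns the union of every worker thread's PU mask plus any PUs exempted via --hpx:bind=none. A self-contained sketch of that union over plain std::bitset (standing in for HPX's mask_type):

    #include <bitset>
    #include <vector>

    // Standalone illustration of the union computed above: the result covers
    // every PU bound to some worker thread plus all PUs on which affinity
    // was disabled (--hpx:bind=none).
    std::bitset<64> used_pus_mask(
        std::vector<std::bitset<64>> const& thread_masks,
        std::bitset<64> const& no_affinity)
    {
        std::bitset<64> ret = no_affinity;
        for (auto const& mask : thread_masks)
            ret |= mask;
        return ret;
    }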

libs/core/algorithms/tests/performance/benchmark_merge.cpp

Lines changed: 26 additions & 5 deletions
@@ -62,12 +62,19 @@ double run_merge_benchmark_hpx(int const test_count, ExPolicy policy,
     // warmup
     hpx::merge(policy, first1, last1, first2, last2, dest);

+#if HPX_HAVE_ITTNOTIFY != 0 && !defined(HPX_HAVE_APEX)
+    auto local_policy = hpx::execution::experimental::with_annotation(
+        policy, "run_merge_benchmark_hpx (child)");
+#else
+    auto local_policy = policy;
+#endif
+
     // actual measurement
     std::uint64_t time = hpx::chrono::high_resolution_clock::now();

     for (int i = 0; i < test_count; ++i)
     {
-        hpx::merge(policy, first1, last1, first2, last2, dest);
+        hpx::merge(local_policy, first1, last1, first2, last2, dest);
     }

     time = hpx::chrono::high_resolution_clock::now() - time;

@@ -138,19 +145,33 @@ struct enable_fast_idle_mode
     template <typename Executor>
     friend void tag_override_invoke(
         hpx::execution::experimental::mark_begin_execution_t,
-        enable_fast_idle_mode, Executor&&)
+        enable_fast_idle_mode, Executor&& exec)
     {
+        auto const pu_mask =
+            hpx::execution::experimental::get_processing_units_mask(exec);
+        auto const full_pu_mask =
+            hpx::resource::get_partitioner().get_used_pus_mask();
+
+        // Enable fast-idle mode only for PU's that are not used by this
+        // algorithm invocation.
         hpx::threads::add_scheduler_mode(
-            hpx::threads::policies::scheduler_mode::fast_idle_mode);
+            hpx::threads::policies::scheduler_mode::fast_idle_mode,
+            full_pu_mask & ~pu_mask);
     }

     template <typename Executor>
     friend void tag_override_invoke(
         hpx::execution::experimental::mark_end_execution_t,
-        enable_fast_idle_mode, Executor&&)
+        enable_fast_idle_mode, Executor&& exec)
     {
+        auto const pu_mask =
+            hpx::execution::experimental::get_processing_units_mask(exec);
+        auto const full_pu_mask =
+            hpx::resource::get_partitioner().get_used_pus_mask();
+
         hpx::threads::remove_scheduler_mode(
-            hpx::threads::policies::scheduler_mode::fast_idle_mode);
+            hpx::threads::policies::scheduler_mode::fast_idle_mode,
+            full_pu_mask & ~pu_mask);
     }
 };
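Both hooks compute full_pu_mask & ~pu_mask, that is, the used PUs this invocation does not occupy: only those workers are switched into fast-idle mode, so idle threads back off quickly while the busy ones keep polling. Since the add/remove calls must stay paired, one might wrap them in a RAII guard; a hypothetical sketch (not part of the commit) using the same APIs as the diff:

    // Hypothetical RAII guard: enables fast-idle mode on the given PUs on
    // construction and reliably removes it again on scope exit, even if an
    // exception is thrown in between.
    struct scoped_fast_idle_mode
    {
        explicit scoped_fast_idle_mode(hpx::threads::mask_type mask)
          : mask_(mask)
        {
            hpx::threads::add_scheduler_mode(
                hpx::threads::policies::scheduler_mode::fast_idle_mode, mask_);
        }

        ~scoped_fast_idle_mode()
        {
            hpx::threads::remove_scheduler_mode(
                hpx::threads::policies::scheduler_mode::fast_idle_mode, mask_);
        }

        hpx::threads::mask_type mask_;
    };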

libs/core/algorithms/tests/performance/benchmark_merge_sweep.cpp

Lines changed: 18 additions & 4 deletions
@@ -169,19 +169,33 @@ struct enable_fast_idle_mode
     template <typename Executor>
     friend void tag_override_invoke(
         hpx::execution::experimental::mark_begin_execution_t,
-        enable_fast_idle_mode, Executor&&)
+        enable_fast_idle_mode, Executor&& exec)
     {
+        auto const pu_mask =
+            hpx::execution::experimental::get_processing_units_mask(exec);
+        auto const full_pu_mask =
+            hpx::resource::get_partitioner().get_used_pus_mask();
+
+        // Enable fast-idle mode only for PU's that are not used by this
+        // algorithm invocation.
         hpx::threads::add_scheduler_mode(
-            hpx::threads::policies::scheduler_mode::fast_idle_mode);
+            hpx::threads::policies::scheduler_mode::fast_idle_mode,
+            full_pu_mask & ~pu_mask);
     }

     template <typename Executor>
     friend void tag_override_invoke(
         hpx::execution::experimental::mark_end_execution_t,
-        enable_fast_idle_mode, Executor&&)
+        enable_fast_idle_mode, Executor&& exec)
     {
+        auto const pu_mask =
+            hpx::execution::experimental::get_processing_units_mask(exec);
+        auto const full_pu_mask =
+            hpx::resource::get_partitioner().get_used_pus_mask();
+
         hpx::threads::remove_scheduler_mode(
-            hpx::threads::policies::scheduler_mode::fast_idle_mode);
+            hpx::threads::policies::scheduler_mode::fast_idle_mode,
+            full_pu_mask & ~pu_mask);
     }
 };

libs/core/compute_local/include/hpx/compute_local/host/block_executor.hpp

Lines changed: 38 additions & 18 deletions
@@ -224,34 +224,54 @@ namespace hpx::compute::host {
             hpx::parallel::execution::detail::bulk_execute_result_t<F,
                 Shape, Ts...>;

-            std::vector<result_type> results;
             std::size_t cnt = util::size(shape);
             std::size_t const num_executors = executors_.size();

-            results.reserve(cnt);
-
             try
             {
-                auto begin = util::begin(shape);
-                for (std::size_t i = 0; i != num_executors; ++i)
+                if constexpr (!std::is_void_v<result_type>)
                 {
-                    std::size_t part_begin_offset = (i * cnt) / num_executors;
-                    std::size_t part_end_offset =
-                        ((i + 1) * cnt) / num_executors;
-                    auto part_begin = begin;
-                    auto part_end = begin;
-                    std::advance(part_begin, part_begin_offset);
-                    std::advance(part_end, part_end_offset);
-                    auto part_results =
+                    std::vector<result_type> results;
+                    results.reserve(cnt);
+
+                    auto begin = util::begin(shape);
+                    for (std::size_t i = 0; i != num_executors; ++i)
+                    {
+                        std::size_t part_begin_offset =
+                            (i * cnt) / num_executors;
+                        std::size_t part_end_offset =
+                            ((i + 1) * cnt) / num_executors;
+                        auto part_begin = std::next(begin, part_begin_offset);
+                        auto part_end = std::next(begin, part_end_offset);
+                        auto part_results =
+                            hpx::parallel::execution::bulk_sync_execute(
+                                executors_[i], HPX_FORWARD(F, f),
+                                util::iterator_range(part_begin, part_end),
+                                HPX_FORWARD(Ts, ts)...);
+                        results.emplace(results.end(),
+                            std::make_move_iterator(part_results.begin()),
+                            std::make_move_iterator(part_results.end()));
+                    }
+
+                    return results;
+                }
+                else
+                {
+                    auto begin = util::begin(shape);
+                    for (std::size_t i = 0; i != num_executors; ++i)
+                    {
+                        std::size_t part_begin_offset =
+                            (i * cnt) / num_executors;
+                        std::size_t part_end_offset =
+                            ((i + 1) * cnt) / num_executors;
+                        auto part_begin = std::next(begin, part_begin_offset);
+                        auto part_end = std::next(begin, part_end_offset);
                         hpx::parallel::execution::bulk_sync_execute(
                             executors_[i], HPX_FORWARD(F, f),
-                            util::iterator_range(begin, part_end),
+                            util::iterator_range(part_begin, part_end),
                             HPX_FORWARD(Ts, ts)...);
-                    results.emplace(results.end(),
-                        std::make_move_iterator(part_results.begin()),
-                        std::make_move_iterator(part_results.end()));
+                    }
                 }
-                return results;
             }
             catch (std::bad_alloc const&)
             {
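The rewrite splits on whether result_type is void: the old code always built a std::vector<result_type>, which cannot be instantiated for void-returning callables, and it also passed util::iterator_range(begin, part_end) to every partition, so each chunk appears to have restarted at the first element; the new code addresses both. A minimal, self-contained sketch of the if constexpr dispatch on a possibly-void result:

    #include <type_traits>
    #include <vector>

    // Minimal sketch of the dispatch above: gather per-chunk results only
    // when the callable returns a value; for void there is nothing to store.
    template <typename F>
    auto bulk_invoke(F f, int chunks)
    {
        using result_type = decltype(f(0));
        if constexpr (!std::is_void_v<result_type>)
        {
            std::vector<result_type> results;
            results.reserve(chunks);
            for (int i = 0; i != chunks; ++i)
                results.push_back(f(i));
            return results;    // one entry per chunk
        }
        else
        {
            for (int i = 0; i != chunks; ++i)
                f(i);    // run for side effects only
        }
    }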

libs/core/compute_local/tests/unit/numa_allocator.cpp

Lines changed: 4 additions & 1 deletion
@@ -226,8 +226,11 @@ void init_resource_partitioner_handler(
     numa_scheduler::init_parameter_type scheduler_init(
         init.num_threads_, {1, 1, 64}, init.affinity_data_,
         thread_queue_init, "shared-priority-scheduler");
+
+    auto const full_mask =
+        hpx::resource::get_partitioner().get_pool_pus_mask(init.name_);
     std::unique_ptr<numa_scheduler> scheduler(
-        new numa_scheduler(scheduler_init));
+        new numa_scheduler(scheduler_init, full_mask));

     scheduler_mode mode =
         scheduler_mode(scheduler_mode::do_background_work |
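Note the contrast with the benchmarks above: here the scheduler is seeded with the PU mask of its own pool (get_pool_pus_mask(init.name_)), not with the runtime-wide used-PU mask, so its per-thread mode state covers exactly the PUs it owns. A short sketch of the two queries as used in this commit ("default" is a hypothetical pool name):

    auto& rp = hpx::resource::get_partitioner();

    // PUs used by the runtime as a whole (what the benchmarks above query).
    auto const used_mask = rp.get_used_pus_mask();

    // PUs owned by one specific thread pool; this is what the scheduler is
    // constructed with ("default" is a hypothetical pool name).
    auto const pool_mask = rp.get_pool_pus_mask("default");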
