Skip to content

Commit 3795ba8

Browse files
committed
Applying minor optimizations and fixes in various places
Signed-off-by: Hartmut Kaiser <hartmut.kaiser@gmail.com>
1 parent feed1d5 commit 3795ba8

10 files changed

Lines changed: 151 additions & 56 deletions

File tree

docs/sphinx/manual/executors.rst

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -418,19 +418,6 @@ Full example code
418418
// The annotating executor exposes the same executor categories as its
419419
// underlying (wrapped) executor.
420420

421-
template <typename BaseExecutor>
422-
struct is_never_blocking_one_way_executor<
423-
simple_annotating_executor<BaseExecutor>>
424-
: is_never_blocking_one_way_executor<BaseExecutor>
425-
{
426-
};
427-
428-
template <typename BaseExecutor>
429-
struct is_one_way_executor<simple_annotating_executor<BaseExecutor>>
430-
: is_one_way_executor<BaseExecutor>
431-
{
432-
};
433-
434421
template <typename BaseExecutor>
435422
struct is_two_way_executor<simple_annotating_executor<BaseExecutor>>
436423
: is_two_way_executor<BaseExecutor>

libs/core/algorithms/include/hpx/parallel/util/foreach_partitioner.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,9 @@ namespace hpx::parallel::util::detail {
170170
try
171171
{
172172
if constexpr (std::is_void_v<decltype(foreach_partition<Result>(
173-
policy, first, count, f1, reshape))>)
173+
HPX_FORWARD(ExPolicy_, policy), first, count,
174+
HPX_FORWARD(F1, f1),
175+
HPX_FORWARD(ReShape, reshape)))>)
174176
{
175177
detail::foreach_partition<Result>(
176178
HPX_FORWARD(ExPolicy_, policy), first, count,

libs/core/algorithms/tests/performance/benchmark_merge.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ struct random_to_item_t
212212
}
213213
};
214214

215+
using data_type = int;
216+
215217
///////////////////////////////////////////////////////////////////////////////
216218
template <typename IteratorTag, typename Allocator>
217219
void run_benchmark(std::size_t vector_size1, std::size_t vector_size2,
@@ -231,7 +233,7 @@ void run_benchmark(std::size_t vector_size1, std::size_t vector_size2,
231233
hpx::generate(par, std::begin(uniform_distribution),
232234
std::end(uniform_distribution), [&] { return dist(re); });
233235

234-
using test_container = test_container<IteratorTag, int, Allocator>;
236+
using test_container = test_container<IteratorTag, data_type, Allocator>;
235237
using container = typename test_container::type;
236238
using T = typename container::value_type;
237239

@@ -416,7 +418,8 @@ int hpx_main(hpx::program_options::variables_map& vm)
416418
{
417419
auto policy = hpx::execution::par;
418420
using allocator_type =
419-
hpx::compute::host::detail::policy_allocator<int, decltype(policy)>;
421+
hpx::compute::host::detail::policy_allocator<data_type,
422+
decltype(policy)>;
420423
allocator_type alloc(policy);
421424

422425
run_benchmark(vector_size1, vector_size2, test_count,

libs/core/execution/include/hpx/execution/detail/post_policy_dispatch.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ namespace hpx::detail {
4040
{
4141
hint.runs_as_child_mode(
4242
hpx::threads::thread_execution_hint::none);
43-
policy.set_hint(hint);
4443
}
4544

4645
threads::thread_init_data data(

libs/core/execution_base/include/hpx/execution_base/execution.hpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ namespace hpx::parallel::execution {
103103
/// \returns f(ts...)'s result
104104
///
105105
/// \note It will call tag_invoke(sync_execute_t, exec, f, ts...) if it
106-
/// exists. For two-way executors it will invoke asynch_execute_t
106+
/// exists. For two-way executors it will invoke async_execute_t
107107
/// and wait for the task's completion before returning.
108108
///
109109
HPX_CXX_EXPORT inline constexpr struct sync_execute_t final
@@ -148,7 +148,7 @@ namespace hpx::parallel::execution {
148148
///
149149
/// \note This is valid for one-way executors (calls
150150
/// make_ready_future(exec.sync_execute(f, ts...)) if it exists)
151-
/// and for two way executors (calls exec.async_execute(f, ts...)
151+
/// and for two-way executors (calls exec.async_execute(f, ts...)
152152
/// if it exists).
153153
///
154154
/// \returns f(ts...)'s result through a future
@@ -188,7 +188,7 @@ namespace hpx::parallel::execution {
188188
///
189189
/// \returns f(ts...)'s result through a future
190190
///
191-
/// \note This is valid for two way executors (calls
191+
/// \note This is valid for two-way executors (calls
192192
/// exec.then_execute(f, predecessor, ts...) if it exists) and
193193
/// for one-way executors (calls predecessor.then(bind(f, ts...))).
194194
///
@@ -227,10 +227,10 @@ namespace hpx::parallel::execution {
227227
/// given executor.
228228
/// \param ts [in] Additional arguments to use to invoke \a f.
229229
///
230-
/// \note This is valid for two way executors (calls
230+
/// \note This is valid for two-way executors (calls
231231
/// exec.post(f, ts...), if available, otherwise
232232
/// it calls exec.async_execute(f, ts...) while discarding the
233-
/// returned future), and for non-blocking two way executors
233+
/// returned future), and for non-blocking two-way executors
234234
/// (calls exec.post(f, ts...) if it exists).
235235
///
236236
HPX_CXX_EXPORT inline constexpr struct post_t final
@@ -283,7 +283,7 @@ namespace hpx::parallel::execution {
283283
/// \param ts [in] Additional arguments to use to invoke \a f.
284284
///
285285
/// \returns The return type of \a executor_type::bulk_sync_execute
286-
/// if defined by \a executor_type. Otherwise a vector holding
286+
/// if defined by \a executor_type. Otherwise, a vector holding
287287
/// the returned values of each invocation of \a f except when
288288
/// \a f returns void, in which case void is returned.
289289
///
@@ -352,7 +352,7 @@ namespace hpx::parallel::execution {
352352
/// \param ts [in] Additional arguments to use to invoke \a f.
353353
///
354354
/// \returns The return type of \a executor_type::bulk_async_execute if
355-
/// defined by \a executor_type. Otherwise a vector
355+
/// defined by \a executor_type. Otherwise, a vector
356356
/// of futures holding the returned values of each invocation
357357
/// of \a f.
358358
///
@@ -423,7 +423,7 @@ namespace hpx::parallel::execution {
423423
/// \param ts [in] Additional arguments to use to invoke \a f.
424424
///
425425
/// \returns The return type of \a executor_type::bulk_then_execute
426-
/// if defined by \a executor_type. Otherwise a vector holding
426+
/// if defined by \a executor_type. Otherwise, a vector holding
427427
/// the returned values of each invocation of \a f.
428428
///
429429
/// \note This calls exec.bulk_then_execute(f, shape, pred, ts...) if it
@@ -489,7 +489,7 @@ namespace hpx::parallel::execution {
489489
/// given executor.
490490
///
491491
/// \returns The return type of \a executor_type::async_invoke if defined by
492-
/// \a executor_type. Otherwise a future<void>
492+
/// \a executor_type. Otherwise, a future<void>
493493
/// representing finishing the execution of all functions \a fs.
494494
///
495495
/// \note This calls exec.async_invoke(fs...) if it exists; otherwise it

libs/core/execution_base/src/this_thread.cpp

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <utility>
2626

2727
#if defined(HPX_WINDOWS)
28+
#include <ctime>
2829
#include <windows.h>
2930
#else
3031
#ifndef _AIX
@@ -40,12 +41,82 @@ namespace hpx::execution_base {
4041

4142
namespace {
4243

44+
#if defined(HPX_WINDOWS)
45+
// Number of performance counter increments per nanosecond, or zero if
46+
// it could not be determined.
47+
struct ticks_per_nanosecond
48+
{
49+
ticks_per_nanosecond()
50+
{
51+
LARGE_INTEGER freq;
52+
if (QueryPerformanceFrequency(&freq))
53+
ticks = static_cast<double>(freq.QuadPart) / 1e9;
54+
}
55+
56+
double ticks = 0.0;
57+
};
58+
ticks_per_nanosecond ticks;
59+
60+
// our own (crude) implementation of nanosleep
61+
int win_nanosleep(std::timespec const& delay)
62+
{
63+
if (delay.tv_nsec < 0 || delay.tv_nsec >= 999999999)
64+
{
65+
return -1;
66+
}
67+
68+
// for small delays we busy-wait
69+
if (delay.tv_sec == 0 && ticks.ticks != 0.0)
70+
{
71+
// compensate for fluctuations introduced by Sleep()
72+
auto const sleep_for = delay.tv_nsec / 1000000 - 10;
73+
74+
// overall number of ticks to delay
75+
auto const ticks_to_delay = static_cast<long>(
76+
static_cast<double>(delay.tv_nsec) * ticks.ticks);
77+
78+
LARGE_INTEGER counter;
79+
if (QueryPerformanceCounter(&counter))
80+
{
81+
// wait until the performance counter has reached this value
82+
auto const wait_until = counter.QuadPart + ticks_to_delay;
83+
84+
// use Sleep() if appropriate
85+
if (sleep_for > 0)
86+
{
87+
Sleep(sleep_for);
88+
}
89+
90+
// simply busy-wait for the remaining amount of time, don't
91+
// wait if delay is zero
92+
while (counter.QuadPart < wait_until &&
93+
QueryPerformanceCounter(&counter))
94+
{
95+
}
96+
return 0;
97+
}
98+
}
99+
100+
// Fallback and longer delays
101+
Sleep(static_cast<long>(delay.tv_sec) * 1000 +
102+
delay.tv_nsec / 1000000);
103+
104+
return 0;
105+
}
106+
107+
constexpr std::timespec wait_0ns = {.tv_sec = 0, .tv_nsec = 0};
108+
constexpr std::timespec wait_1000ns = {.tv_sec = 0, .tv_nsec = 1000};
109+
#endif
110+
111+
///////////////////////////////////////////////////////////////////////
43112
struct default_context final : execution_base::context_base
44113
{
45-
resource_base const& resource() const noexcept override
114+
[[nodiscard]] resource_base const& resource()
115+
const noexcept override
46116
{
47117
return resource_;
48118
}
119+
49120
resource_base resource_;
50121
};
51122

@@ -96,7 +167,7 @@ namespace hpx::execution_base {
96167
void default_agent::yield(char const* /* desc */)
97168
{
98169
#if defined(HPX_WINDOWS)
99-
Sleep(0);
170+
win_nanosleep(wait_0ns);
100171
#else
101172
sched_yield();
102173
#endif
@@ -116,7 +187,7 @@ namespace hpx::execution_base {
116187
else if (k < 32 || k & 1) //-V112
117188
{
118189
#if defined(HPX_WINDOWS)
119-
Sleep(0);
190+
win_nanosleep(wait_0ns);
120191
return true;
121192
#else
122193
sched_yield();
@@ -126,7 +197,7 @@ namespace hpx::execution_base {
126197
else
127198
{
128199
#if defined(HPX_WINDOWS)
129-
Sleep(1);
200+
win_nanosleep(wait_1000ns);
130201
return true;
131202
#else
132203
// g++ -Wextra warns on {} or {0}

libs/core/executors/include/hpx/executors/parallel_executor.hpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ namespace hpx::parallel::execution::detail {
6565
struct then_bulk_function_result;
6666
} // namespace hpx::parallel::execution::detail
6767

68+
#if !defined(HPX_HAVE_MORE_THAN_64_THREADS) || \
69+
(defined(HPX_HAVE_MAX_CPU_COUNT) && HPX_HAVE_MAX_CPU_COUNT <= 64)
70+
#define HPX_MASK_TYPE_IS_CONSTEXPR_CONSTRUCTIBLE
71+
#endif
72+
6873
namespace hpx::execution {
6974

7075
///////////////////////////////////////////////////////////////////////////
@@ -223,6 +228,11 @@ namespace hpx::execution {
223228
{
224229
auto exec_with_num_cores = exec;
225230
exec_with_num_cores.num_cores_ = num_cores;
231+
232+
#if defined(HPX_MASK_TYPE_IS_CONSTEXPR_CONSTRUCTIBLE)
233+
// force recomputing cached pu mask
234+
exec_with_num_cores.mask_ = hpx::threads::mask_type();
235+
#endif
226236
return exec_with_num_cores;
227237
}
228238

@@ -594,6 +604,12 @@ namespace hpx::execution {
594604

595605
hpx::threads::mask_type pu_mask() const noexcept
596606
{
607+
#if defined(HPX_MASK_TYPE_IS_CONSTEXPR_CONSTRUCTIBLE)
608+
if (hpx::threads::any(mask_))
609+
{
610+
return mask_;
611+
}
612+
#endif
597613
auto const num_threads = get_num_cores();
598614
auto const* pool =
599615
pool_ ? pool_ : threads::detail::get_self_or_default_pool();
@@ -621,6 +637,10 @@ namespace hpx::execution {
621637
}
622638
}
623639
}
640+
641+
#if defined(HPX_MASK_TYPE_IS_CONSTEXPR_CONSTRUCTIBLE)
642+
mask_ = mask;
643+
#endif
624644
return mask;
625645
}
626646

@@ -644,6 +664,9 @@ namespace hpx::execution {
644664
std::size_t hierarchical_threshold_ = hierarchical_threshold_default_;
645665
std::size_t first_core_ = 0;
646666
std::size_t num_cores_ = 0;
667+
#if defined(HPX_MASK_TYPE_IS_CONSTEXPR_CONSTRUCTIBLE)
668+
mutable hpx::threads::mask_type mask_ = hpx::threads::mask_type();
669+
#endif
647670
#if defined(HPX_HAVE_THREAD_DESCRIPTION)
648671
char const* annotation_ = nullptr;
649672
#endif
@@ -685,6 +708,8 @@ namespace hpx::execution {
685708
parallel_policy_executor<hpx::launch>;
686709
} // namespace hpx::execution
687710

711+
#undef HPX_MASK_TYPE_IS_CONSTEXPR_CONSTRUCTIBLE
712+
688713
namespace hpx::execution::experimental {
689714

690715
/// \cond NOINTERNAL

libs/core/topology/include/hpx/topology/topology.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -320,8 +320,8 @@ namespace hpx::threads {
320320
mask_type init_thread_affinity_mask(
321321
std::size_t num_core, std::size_t num_pu) const;
322322

323-
hwloc_bitmap_t mask_to_bitmap(
324-
mask_cref_type mask, hwloc_obj_type_t htype) const;
323+
hwloc_bitmap_t mask_to_bitmap(mask_cref_type mask,
324+
hwloc_obj_type_t htype, unsigned* count = nullptr) const;
325325
mask_type bitmap_to_mask(
326326
hwloc_bitmap_t bitmap, hwloc_obj_type_t htype) const;
327327

0 commit comments

Comments
 (0)