From 4efd71a93fb88e6967f1a38802c2109d967926ec Mon Sep 17 00:00:00 2001 From: Tristan Rice Date: Wed, 18 Mar 2026 17:21:44 -0700 Subject: [PATCH] Fix IBVERBS CI failures by probing QP creation before constructing Device On CI runners without real RDMA hardware, rdma-core software providers let ibv_open_device/ibv_alloc_pd/ibv_create_comp_channel succeed but ibv_create_qp fails with EINVAL. Creating a gloo Device starts a background thread; after fork() in TransportMultiProcTest the thread handle is invalid, causing SIGSEGV (exit 139) in Device::~Device. Fix: probe ibverbs capability using raw APIs (through ibv_create_qp) in the test's createDevice() before constructing a gloo Device. If QP creation fails, mark IBVERBS as unavailable and return nullptr. Also moves GTEST_SKIP() out of worker threads to avoid concurrent calls racing on GTest internals (exit 134), adds a SIGSEGV backtrace handler for test debugging, and builds with RelWithDebInfo. --- .github/workflows/build-linux.yml | 2 +- gloo/test/base_test.cc | 75 ++++++++++++++++++++++++++++++- gloo/test/base_test.h | 32 +++++++++++-- gloo/test/main.cc | 15 +++++++ 4 files changed, 119 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index 7d2c7fedc..6921bea40 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -75,7 +75,7 @@ jobs: sudo apt-get install -y gcc g++ mkdir -p build cd build - cmake ../ -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_TEST=ON ${{matrix.cmake_args}} -DOPENSSL_ROOT_DIR=/opt/openssl/ + cmake ../ -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo ${{matrix.cmake_args}} -DOPENSSL_ROOT_DIR=/opt/openssl/ make - name: Test run: | diff --git a/gloo/test/base_test.cc b/gloo/test/base_test.cc index ccccfa3bb..aa1ffb184 100644 --- a/gloo/test/base_test.cc +++ b/gloo/test/base_test.cc @@ -9,6 +9,10 @@ #include "gloo/test/base_test.h" #include "gloo/test/openssl_utils.h" +#if 
GLOO_HAVE_TRANSPORT_IBVERBS +#include <infiniband/verbs.h> +#endif + namespace gloo { namespace test { @@ -49,6 +53,72 @@ const std::vector<Transport> kTransportsForFunctionAlgorithms = { #endif }; +#if GLOO_HAVE_TRANSPORT_IBVERBS +static bool probeIbverbs() { + int numDevices = 0; + struct ibv_device** deviceList = ibv_get_device_list(&numDevices); + if (!deviceList || numDevices == 0) { + if (deviceList) + ibv_free_device_list(deviceList); + return false; + } + struct ibv_context* ctx = ibv_open_device(deviceList[0]); + ibv_free_device_list(deviceList); + if (!ctx) { + return false; + } + struct ibv_pd* pd = ibv_alloc_pd(ctx); + if (!pd) { + ibv_close_device(ctx); + return false; + } + struct ibv_comp_channel* channel = ibv_create_comp_channel(ctx); + if (!channel) { + ibv_dealloc_pd(pd); + ibv_close_device(ctx); + return false; + } + struct ibv_cq* cq = ibv_create_cq(ctx, 64, nullptr, channel, 0); + if (!cq) { + ibv_destroy_comp_channel(channel); + ibv_dealloc_pd(pd); + ibv_close_device(ctx); + return false; + } + struct ibv_qp_init_attr qpAttr{}; + qpAttr.send_cq = cq; + qpAttr.recv_cq = cq; + qpAttr.cap.max_send_wr = 16; + qpAttr.cap.max_recv_wr = 16; + qpAttr.cap.max_send_sge = 1; + qpAttr.cap.max_recv_sge = 1; + qpAttr.qp_type = IBV_QPT_RC; + struct ibv_qp* qp = ibv_create_qp(pd, &qpAttr); + if (!qp) { + ibv_destroy_cq(cq); + ibv_destroy_comp_channel(channel); + ibv_dealloc_pd(pd); + ibv_close_device(ctx); + return false; + } + ibv_destroy_qp(qp); + ibv_destroy_cq(cq); + ibv_destroy_comp_channel(channel); + ibv_dealloc_pd(pd); + ibv_close_device(ctx); + return true; +} +#endif + +bool ibverbsAvailable() { +#if GLOO_HAVE_TRANSPORT_IBVERBS + static bool available = probeIbverbs(); + return available; +#else + return false; +#endif +} + std::shared_ptr<::gloo::transport::Device> createDevice(Transport transport) { #if GLOO_HAVE_TRANSPORT_TCP if (transport == Transport::TCP) { @@ -76,11 +146,14 @@ std::shared_ptr<::gloo::transport::Device> createDevice(Transport transport) { #endif #if 
GLOO_HAVE_TRANSPORT_IBVERBS if (transport == Transport::IBVERBS) { + if (!ibverbsAvailable()) { + return nullptr; + } gloo::transport::ibverbs::attr attr; attr.port = 1; try { return ::gloo::transport::ibverbs::CreateDevice(attr); - } catch (const InvalidOperationException& e) { + } catch (const std::exception& e) { GLOO_INFO("IBVERBS not available: ", e.what()); } } diff --git a/gloo/test/base_test.h b/gloo/test/base_test.h index 307b4a8a6..27a00ba74 100644 --- a/gloo/test/base_test.h +++ b/gloo/test/base_test.h @@ -10,6 +10,7 @@ #include +#include <atomic> #include #include #include @@ -75,6 +76,14 @@ extern const std::vector<Transport> kTransportsForClassAlgorithms; extern const std::vector<Transport> kTransportsForFunctionAlgorithms; extern const std::vector<Transport> kTransportsForRDMA; +// Returns true if ibverbs is available with functional RDMA hardware. +// Probes once using raw ibverbs APIs (through ibv_create_qp) and caches +// the result. On CI runners without real RDMA hardware, rdma-core software +// providers let device open / PD alloc / CQ creation succeed but QP creation +// fails. Creating a gloo Device (which starts a background thread) on such +// systems causes segfaults after fork() in TransportMultiProcTest. +bool ibverbsAvailable(); + std::shared_ptr<::gloo::transport::Device> createDevice(Transport transport); class BaseTest : public ::testing::Test { @@ -115,18 +124,31 @@ class BaseTest : public ::testing::Test { Barrier barrier(size); auto store = std::make_shared<::gloo::rendezvous::HashStore>(); + // Track whether workers found the transport unavailable so we can + // call GTEST_SKIP() from the main thread after joining. + // GTEST_SKIP() is not safe to call from worker threads — concurrent + // calls race on GTest internals and can cause "terminate called + // recursively" (SIGABRT / exit code 134). 
+ std::atomic<bool> transportUnavailable{false}; + spawnThreads(size, [&](int rank) { auto context = std::make_shared<::gloo::rendezvous::Context>(rank, size, base); - // Create device per thread to avoid collisions then they are using the + // Create device per thread to avoid collisions when they are using the // socket address. auto device = device_creator(transport); if (!device) { - GTEST_SKIP() << "Skipping test: transport not available"; + transportUnavailable.store(true); + return; + } + + try { + context->connectFullMesh(store, device); + } catch (const std::exception&) { + transportUnavailable.store(true); return; } - context->connectFullMesh(store, device); try { fn(context); @@ -150,6 +172,10 @@ class BaseTest : public ::testing::Test { context->closeConnections(); } }); + + if (transportUnavailable.load()) { + GTEST_SKIP() << "Skipping test: transport not available"; + } } void spawn( diff --git a/gloo/test/main.cc b/gloo/test/main.cc index cbc662a90..a6225c8ad 100644 --- a/gloo/test/main.cc +++ b/gloo/test/main.cc @@ -10,10 +10,25 @@ // One-time init to use EPIPE errors instead of SIGPIPE #ifndef _WIN32 +#include +#include +#include +#include + namespace { + +static void segfault_handler(int sig) { + void* array[30]; + int size = backtrace(array, 30); + fprintf(stderr, "[DIAG] Signal %d caught, backtrace:\n", sig); + backtrace_symbols_fd(array, size, STDERR_FILENO); + _exit(128 + sig); +} + struct Initializer { Initializer() { signal(SIGPIPE, SIG_IGN); + signal(SIGSEGV, segfault_handler); } }; Initializer initializer;