From 758e03c816e966754523afd780741540182c5160 Mon Sep 17 00:00:00 2001 From: Erwan Viollet Date: Mon, 11 May 2026 15:14:35 +0200 Subject: [PATCH] proposal(profiling-ffi): catch_unwind via global panic callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exploration PR — one of three sibling proposals for adding catch_unwind at the profiling FFI boundary. Routes caught panics through a globally registered C callback; the returned ProfileStatus carries a static sentinel so the return path performs no allocation on the panic side. - New panic_handler module: ddog_prof_set_panic_handler + fire_panic_handler. - New wrap_with_profile_status! macro: catch_unwind around the body, fires the registered handler (if any) on caught payloads, returns ProfileStatus::from(c"libdatadog panicked"). - Nested catch_unwind inside fire_panic_handler so OOM during message formatting cannot itself unwind. - One example migration: ddog_prof_ProfilesDictionary_insert_function. See docs/proposals/profiling-ffi-catch-unwind-callback.md. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../profiling-ffi-catch-unwind-callback.md | 92 +++++++++++++++++++ libdd-profiling-ffi/src/lib.rs | 2 + libdd-profiling-ffi/src/panic_handler.rs | 81 ++++++++++++++++ libdd-profiling-ffi/src/profiles/mod.rs | 24 ++++- .../src/profiles/profiles_dictionary.rs | 12 ++- 5 files changed, 206 insertions(+), 5 deletions(-) create mode 100644 docs/proposals/profiling-ffi-catch-unwind-callback.md create mode 100644 libdd-profiling-ffi/src/panic_handler.rs diff --git a/docs/proposals/profiling-ffi-catch-unwind-callback.md b/docs/proposals/profiling-ffi-catch-unwind-callback.md new file mode 100644 index 0000000000..9542c6c446 --- /dev/null +++ b/docs/proposals/profiling-ffi-catch-unwind-callback.md @@ -0,0 +1,92 @@ +# Proposal: catch_unwind at the profiling FFI boundary — callback variant + +Status: draft, exploration. + +## Problem + +Same as the bit-flag proposal: panics that unwind through the profiling FFI +boundary are undefined behavior in C callers, and there is no panic guard on +the new `ProfileStatus`-returning hot-path APIs. + +## Proposal + +Catch the panic inside the FFI body and route the panic message through a +**globally registered callback**. The returned `ProfileStatus` carries a static +sentinel (`c"libdatadog panicked"`), so the panic path performs no allocation +on the return-value side; the only allocation is whatever the wrapper hands to +the callback. + +```c +typedef void (*ddog_prof_PanicHandler)( + const char *function_name, + const char *message, + void *userdata); + +void ddog_prof_set_panic_handler(ddog_prof_PanicHandler cb, void *userdata); +``` + +Internally, every `ProfileStatus`-returning FFI function wraps its body in +`wrap_with_profile_status!`, which: + +1. Runs the body inside `catch_unwind(AssertUnwindSafe(...))`. +2. On `Ok(Ok(()))` → `ProfileStatus::OK`. +3. On `Ok(Err(e))` → existing `ProfileStatus::from(e)` path. +4. On `Err(payload)` → fire the registered callback (if any), then return + `ProfileStatus::from(c"libdatadog panicked")`. + +## Caller pattern (C) + +```c +static void on_panic(const char *fn, const char *msg, void *ud) { + fprintf(stderr, "libdatadog panic in %s: %s\n", fn, msg); + flag_profile_dead((profile_t *)ud); +} + +ddog_prof_set_panic_handler(on_panic, my_profile); + +// normal calls — error path is one branch, panic info routes via callback. +ddog_prof_ProfileStatus s = ddog_prof_ProfilesDictionary_insert_function(&id, dict, &fn); +if (s.err) { /* one branch covers panic-as-error too */ } +``` + +## Pros + +- **Single observability point.** Useful for metrics: emit a + `profiling.ffi.panic` counter directly in the callback, regardless of which + FFI function panicked. +- **Hot-path callers don't have to learn a new bit.** They handle `err != null` + the same way they always have; panic vs. recoverable error is a separate + concern handled out-of-band. +- **Allocation-free return path.** The returned status uses a static `CStr` — + even when the panic was caused by OOM, the return value itself does not + allocate. (The callback path may allocate to format the message.) +- Native fit for tracer-side panic reporting: profilers can forward libdatadog + panics into the tracer's existing crash/error pipeline. + +## Cons + +- **Global mutable state.** Registration must be thread-safe (atomic pointers). + Init ordering matters: panics that fire before `set_panic_handler` is called + are silently swallowed. +- **Fork semantics need thought.** A handler registered in the parent is still + registered in the child; userdata may not be valid post-fork. +- **Reentrancy hazard.** The callback must NOT call back into any libdatadog + FFI — that risks reentry on a half-mutated handle. The contract has to be + documented and ideally enforced (e.g., a thread-local "in panic handler" + guard that turns reentry into a no-op). +- **Locality is lost.** The callback knows the function name but not which + `Profile` / `Dictionary` handle was being operated on. Callers that need + per-handle recovery must encode that themselves via `userdata`. + +## Example migration + +This PR migrates exactly one function — +`ddog_prof_ProfilesDictionary_insert_function` — to show the shape. If +accepted, every `ProfileStatus`-returning FFI function would follow. + +## Sibling proposals + +- `r1viollet/profiling-ffi-panic-bit` — bit-flag on `ProfileStatus`, no global + state. +- `r1viollet/profiling-ffi-panic-bit-and-callback` — both, with the callback as + an optional observability overlay on top of the bit. diff --git a/libdd-profiling-ffi/src/lib.rs b/libdd-profiling-ffi/src/lib.rs index aac70dab2c..1dfb590f58 100644 --- a/libdd-profiling-ffi/src/lib.rs +++ b/libdd-profiling-ffi/src/lib.rs @@ -9,12 +9,14 @@ mod arc_handle; mod exporter; +mod panic_handler; mod profile_error; mod profile_status; mod profiles; mod string_storage; pub use arc_handle::*; +pub use panic_handler::*; pub use profile_error::*; pub use profile_status::*; diff --git a/libdd-profiling-ffi/src/panic_handler.rs b/libdd-profiling-ffi/src/panic_handler.rs new file mode 100644 index 0000000000..cdc1cd5da9 --- /dev/null +++ b/libdd-profiling-ffi/src/panic_handler.rs @@ -0,0 +1,81 @@ +// Copyright 2026-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +//! Global panic handler for the profiling FFI. +//! +//! Callers register a single C callback that fires when `catch_unwind` in +//! `wrap_with_profile_status!` catches a panic. The handler receives the FFI +//! function name and a formatted panic message; the returned `ProfileStatus` +//! uses a static sentinel so the return path does not allocate. + +use std::ffi::{c_char, c_void, CString}; +use std::sync::atomic::{AtomicPtr, Ordering}; + +/// C-callable panic handler. Fired once per caught panic in any +/// `ProfileStatus`-returning FFI function. +/// +/// # Reentrancy +/// +/// The handler MUST NOT call back into any libdatadog profiling FFI function. +/// A panic that occurred mid-mutation may have left handles in a half-mutated +/// state; reentry can compound the corruption. +pub type PanicHandler = extern "C" fn( + function_name: *const c_char, + message: *const c_char, + userdata: *mut c_void, +); + +static HANDLER: AtomicPtr<()> = AtomicPtr::new(std::ptr::null_mut()); +static USERDATA: AtomicPtr = AtomicPtr::new(std::ptr::null_mut()); + +/// Registers (or clears) the global panic handler. Pass `None` to unregister. +/// +/// Thread-safe. The most recent registration wins; concurrent calls are +/// serialized by the atomic stores but can racy-overwrite each other. +/// +/// # Safety +/// +/// `userdata` must remain valid for as long as the handler may fire, which is +/// "until a subsequent call to this function replaces or clears the handler". +#[no_mangle] +pub unsafe extern "C" fn ddog_prof_set_panic_handler( + handler: Option, + userdata: *mut c_void, +) { + let ptr = handler.map_or(std::ptr::null_mut(), |h| h as *mut ()); + HANDLER.store(ptr, Ordering::Release); + USERDATA.store(userdata, Ordering::Release); +} + +/// Fires the registered handler, if any. Best-effort: any failure to format +/// the message is silently dropped, since this is itself the panic path. +pub(crate) fn fire_panic_handler( + function_name: &str, + payload: &(dyn std::any::Any + Send), +) { + let handler_ptr = HANDLER.load(Ordering::Acquire); + if handler_ptr.is_null() { + return; + } + // SAFETY: `HANDLER` only holds values written via `ddog_prof_set_panic_handler`, + // which stores a `PanicHandler` fn pointer cast to `*mut ()`. + let cb: PanicHandler = unsafe { std::mem::transmute(handler_ptr) }; + + // Nested catch_unwind so a panic while formatting (e.g. OOM in `format!`) + // does not unwind through the FFI boundary. + let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let msg: String = if let Some(s) = payload.downcast_ref::() { + s.clone() + } else if let Some(s) = payload.downcast_ref::<&'static str>() { + (*s).to_string() + } else { + "".to_string() + }; + + let Ok(fn_c) = CString::new(function_name) else { return }; + let Ok(msg_c) = CString::new(msg) else { return }; + + let ud = USERDATA.load(Ordering::Acquire); + cb(fn_c.as_ptr(), msg_c.as_ptr(), ud); + })); +} diff --git a/libdd-profiling-ffi/src/profiles/mod.rs b/libdd-profiling-ffi/src/profiles/mod.rs index 077500d41a..a1306bbedc 100644 --- a/libdd-profiling-ffi/src/profiles/mod.rs +++ b/libdd-profiling-ffi/src/profiles/mod.rs @@ -24,4 +24,26 @@ macro_rules! ensure_non_null_insert { }; } -pub(crate) use {ensure_non_null_insert, ensure_non_null_out_parameter}; +/// Wraps the body of a `ProfileStatus`-returning FFI function in +/// `catch_unwind`. The body must evaluate to a `Result<(), E>` where +/// `ProfileStatus: From`. On caught panic the panic payload is routed +/// through the globally registered handler (see `ddog_prof_set_panic_handler`) +/// and the returned status carries a static `c"libdatadog panicked"` sentinel. +/// +/// The enclosing function must carry `#[function_name::named]` so the panic +/// callback receives the function name. +#[macro_export] +macro_rules! wrap_with_profile_status { + ($body:block) => {{ + use std::panic::{catch_unwind, AssertUnwindSafe}; + match catch_unwind(AssertUnwindSafe(|| $body)) { + Ok(result) => $crate::ProfileStatus::from(result), + Err(payload) => { + $crate::panic_handler::fire_panic_handler(function_name!(), &*payload); + $crate::ProfileStatus::from(c"libdatadog panicked") + } + } + }}; +} + +pub(crate) use {ensure_non_null_insert, ensure_non_null_out_parameter, wrap_with_profile_status}; diff --git a/libdd-profiling-ffi/src/profiles/profiles_dictionary.rs b/libdd-profiling-ffi/src/profiles/profiles_dictionary.rs index e4ddf38601..1ea5f3aa80 100644 --- a/libdd-profiling-ffi/src/profiles/profiles_dictionary.rs +++ b/libdd-profiling-ffi/src/profiles/profiles_dictionary.rs @@ -4,8 +4,11 @@ use crate::arc_handle::ArcHandle; use crate::profile_status::ProfileStatus; use crate::profiles::utf8::{self, Utf8Option}; -use crate::profiles::{ensure_non_null_insert, ensure_non_null_out_parameter}; +use crate::profiles::{ + ensure_non_null_insert, ensure_non_null_out_parameter, wrap_with_profile_status, +}; use crate::ProfileError; +use function_name::named; use libdd_common_ffi::slice::{CharSlice, Slice}; use libdd_common_ffi::MutSlice; use libdd_profiling::profiles::collections::StringRef; @@ -113,6 +116,7 @@ pub unsafe extern "C" fn ddog_prof_ProfilesDictionary_try_clone( /// - `dict` must refer to a live dictionary. /// - `function` must be non-null and point to a valid `Function` for the duration of the call. #[no_mangle] +#[named] pub unsafe extern "C" fn ddog_prof_ProfilesDictionary_insert_function( function_id: *mut FunctionId2, dict: Option<&ProfilesDictionary>, @@ -120,13 +124,13 @@ pub unsafe extern "C" fn ddog_prof_ProfilesDictionary_insert_function( ) -> ProfileStatus { ensure_non_null_out_parameter!(function_id); ensure_non_null_insert!(function); - ProfileStatus::from(|| -> Result<(), ProfileError> { + wrap_with_profile_status!({ let dict = dict.ok_or(NULL_PROFILES_DICTIONARY)?; let f2: Function2 = unsafe { *function }; let id = dict.try_insert_function2(f2)?; unsafe { function_id.write(id) }; - Ok(()) - }()) + Ok::<(), ProfileError>(()) + }) } /// Inserts a `Mapping` into the dictionary and returns its id.