From e8c4ddef42f75002f285a56e385056f06e531d62 Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Tue, 26 May 2026 15:05:34 +0200 Subject: [PATCH 1/9] add logs --- utils/build/docker/dotnet/poc.Dockerfile | 8 ++++ utils/build/docker/dotnet/uds.Dockerfile | 8 ++++ .../weblog/Controllers/RaspController.cs | 38 +++++++++++++++++-- utils/build/docker/dotnet/weblog/Program.cs | 34 +++++++++++++++++ utils/build/docker/dotnet/weblog/Startup.cs | 21 ++++++++++ 5 files changed, 105 insertions(+), 4 deletions(-) diff --git a/utils/build/docker/dotnet/poc.Dockerfile b/utils/build/docker/dotnet/poc.Dockerfile index 803354f7187..6ca4e0ee9c9 100644 --- a/utils/build/docker/dotnet/poc.Dockerfile +++ b/utils/build/docker/dotnet/poc.Dockerfile @@ -38,6 +38,14 @@ ENV DD_TRACE_HEADER_TAGS='user-agent:http.request.headers.user-agent' ENV DD_DATA_STREAMS_ENABLED=true ENV DD_INTERNAL_TELEMETRY_V2_ENABLED=true +# [DIAG do-not-merge] enable verbose tracer logs to diagnose Test_SqlServiceNameSource stall +ENV DD_TRACE_DEBUG=true +ENV DD_INTERNAL_LOG_LEVEL=debug +# [DIAG do-not-merge] enable Kestrel + ASP.NET pipeline debug logs to see connection accept events +ENV Logging__LogLevel__Default=Debug +ENV Logging__LogLevel__Microsoft.AspNetCore.Server.Kestrel=Debug +ENV Logging__LogLevel__Microsoft.AspNetCore.Hosting=Debug + # .NET runtime config ENV ASPNETCORE_hostBuilder__reloadConfigOnChange=false # - Enable dump on crash diff --git a/utils/build/docker/dotnet/uds.Dockerfile b/utils/build/docker/dotnet/uds.Dockerfile index b0aaad82efe..e78baafb1aa 100644 --- a/utils/build/docker/dotnet/uds.Dockerfile +++ b/utils/build/docker/dotnet/uds.Dockerfile @@ -32,6 +32,14 @@ ENV DD_TRACE_HEADER_TAGS='user-agent:http.request.headers.user-agent' ENV DD_DATA_STREAMS_ENABLED=true ENV DD_INTERNAL_TELEMETRY_V2_ENABLED=true +# [DIAG do-not-merge] enable verbose tracer logs to diagnose Test_SqlServiceNameSource stall +ENV DD_TRACE_DEBUG=true +ENV DD_INTERNAL_LOG_LEVEL=debug +# [DIAG do-not-merge] enable Kestrel + ASP.NET pipeline debug logs to see connection accept events +ENV Logging__LogLevel__Default=Debug +ENV Logging__LogLevel__Microsoft.AspNetCore.Server.Kestrel=Debug +ENV Logging__LogLevel__Microsoft.AspNetCore.Hosting=Debug + # .NET runtime config ENV ASPNETCORE_hostBuilder__reloadConfigOnChange=false # - Enable dump on crash diff --git a/utils/build/docker/dotnet/weblog/Controllers/RaspController.cs b/utils/build/docker/dotnet/weblog/Controllers/RaspController.cs index d2b4cf82bb4..63a7619507e 100644 --- a/utils/build/docker/dotnet/weblog/Controllers/RaspController.cs +++ b/utils/build/docker/dotnet/weblog/Controllers/RaspController.cs @@ -219,13 +219,26 @@ public IActionResult SsrfPostJson([FromBody] Model data) [HttpGet("sqli")] public IActionResult SqliGet(string user_id) { - if (!string.IsNullOrEmpty(user_id)) + // [DIAG do-not-merge] timing in SqliGet to localize the 5s stall on Test_SqlServiceNameSource. + var t0 = System.DateTime.UtcNow; + System.Console.WriteLine($"[DIAG-CTRL ] {t0:HH:mm:ss.fffffff} SqliGet ENTER user_id={user_id}"); + try { - return Content(SqlQuery(user_id)); + if (!string.IsNullOrEmpty(user_id)) + { + var sw = System.Diagnostics.Stopwatch.StartNew(); + var result = SqlQuery(user_id); + System.Console.WriteLine($"[DIAG-CTRL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqliGet SqlQuery done in {sw.Elapsed.TotalMilliseconds:F2}ms"); + return Content(result); + } + else + { + return BadRequest("No params provided"); + } } - else + finally { - return BadRequest("No params provided"); + System.Console.WriteLine($"[DIAG-CTRL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqliGet EXIT elapsed_ms={(System.DateTime.UtcNow - t0).TotalMilliseconds:F2}"); } } @@ -281,19 +294,36 @@ public IActionResult SqliPostJson([FromBody] Model data) private string SqlQuery(string user) { + // [DIAG do-not-merge] sub-step timing inside the SQLite path. + var swTotal = System.Diagnostics.Stopwatch.StartNew(); var sb = new System.Text.StringBuilder(); sb.AppendLine("Insecure SQL command executed:"); + + var sw = System.Diagnostics.Stopwatch.StartNew(); using var conn = Sql.GetSqliteConnection(); + var tCreate = sw.Elapsed.TotalMilliseconds; + + sw.Restart(); conn.Open(); + var tOpen = sw.Elapsed.TotalMilliseconds; + + sw.Restart(); using var cmd = conn.CreateCommand(); cmd.CommandText = "SELECT * FROM users WHERE id='" + user + "'"; + var tCmd = sw.Elapsed.TotalMilliseconds; + + sw.Restart(); using var reader = cmd.ExecuteReader(); + var tExec = sw.Elapsed.TotalMilliseconds; + sw.Restart(); while (reader.Read()) { sb.AppendLine($"{reader["user"]}, {reader["pwd"]}"); } + var tRead = sw.Elapsed.TotalMilliseconds; + System.Console.WriteLine($"[DIAG-SQL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqlQuery user='{user}' total={swTotal.Elapsed.TotalMilliseconds:F2}ms create={tCreate:F2} open={tOpen:F2} cmd={tCmd:F2} exec={tExec:F2} read={tRead:F2}"); return sb.ToString(); } } diff --git a/utils/build/docker/dotnet/weblog/Program.cs b/utils/build/docker/dotnet/weblog/Program.cs index d6b1cba06d0..fe432381e95 100644 --- a/utils/build/docker/dotnet/weblog/Program.cs +++ b/utils/build/docker/dotnet/weblog/Program.cs @@ -1,4 +1,6 @@ using System; +using System.Threading; +using System.Threading.Tasks; using Microsoft.AspNetCore.Hosting; using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Http; @@ -11,6 +13,38 @@ public class Program { public static void Main(string[] args) { + // [DIAG do-not-merge] periodic ThreadPool / GC stats logger for Test_SqlServiceNameSource stall diagnosis. + // 250ms cadence gives ~20 datapoints across a 5s window. Look for: pending shooting up, avail dropping + // to 0, gen2 GC tick during the gap, or a long quiet period in the log itself (== process frozen). + _ = Task.Run(async () => + { + long lastGc0 = 0, lastGc1 = 0, lastGc2 = 0; + while (true) + { + try + { + ThreadPool.GetMinThreads(out var minW, out var minIo); + ThreadPool.GetMaxThreads(out var maxW, out var maxIo); + ThreadPool.GetAvailableThreads(out var availW, out var availIo); + var gc0 = GC.CollectionCount(0); + var gc1 = GC.CollectionCount(1); + var gc2 = GC.CollectionCount(2); + var heapMB = GC.GetTotalMemory(false) / (1024 * 1024); + Console.WriteLine($"[DIAG-POOL ] {DateTime.UtcNow:HH:mm:ss.fffffff} tpThreads={ThreadPool.ThreadCount} pending={ThreadPool.PendingWorkItemCount} completed={ThreadPool.CompletedWorkItemCount} min=({minW}/{minIo}) avail=({availW}/{availIo}) max=({maxW}/{maxIo}) gc=({gc0},{gc1},{gc2}) heapMB={heapMB}"); + if (gc0 != lastGc0 || gc1 != lastGc1 || gc2 != lastGc2) + { + Console.WriteLine($"[DIAG-GC ] {DateTime.UtcNow:HH:mm:ss.fffffff} gen0+={gc0 - lastGc0} gen1+={gc1 - lastGc1} gen2+={gc2 - lastGc2}"); + lastGc0 = gc0; lastGc1 = gc1; lastGc2 = gc2; + } + } + catch (Exception ex) + { + Console.WriteLine($"[DIAG-POOL ] ERR {ex.Message}"); + } + await Task.Delay(250).ConfigureAwait(false); + } + }); + // Enable Datadog log injection only if CONFIG_CHAINING_TEST is set to "true" if (Environment.GetEnvironmentVariable("CONFIG_CHAINING_TEST") == "true") { diff --git a/utils/build/docker/dotnet/weblog/Startup.cs b/utils/build/docker/dotnet/weblog/Startup.cs index 3fef703b1ad..8ff9e49539f 100644 --- a/utils/build/docker/dotnet/weblog/Startup.cs +++ b/utils/build/docker/dotnet/weblog/Startup.cs @@ -16,6 +16,8 @@ public class Startup public void ConfigureServices(IServiceCollection services) { services.AddSerilog((services, lc) => lc + // [DIAG do-not-merge] surface Kestrel + ASP.NET Hosting Debug events for stall diagnosis + .MinimumLevel.Debug() .Enrich.FromLogContext() .WriteTo.Console(new CompactJsonFormatter())); @@ -50,6 +52,25 @@ public void Configure(IApplicationBuilder app, IWebHostEnvironment env) Sql.Setup(); + // [DIAG do-not-merge] front-of-pipeline timing to localize the 5s stall on Test_SqlServiceNameSource. + // Compare timestamps with: Kestrel "Request starting", DIAG-CTRL ENTER/EXIT, and DIAG-POOL. + app.Use(async (ctx, next) => + { + var t0 = System.DateTime.UtcNow; + var cid = ctx.Connection.Id; + var ua = ctx.Request.Headers.UserAgent.ToString(); + System.Console.WriteLine($"[DIAG-MW-IN ] {t0:HH:mm:ss.fffffff} cid={cid} {ctx.Request.Method} {ctx.Request.Path}{ctx.Request.QueryString} ua={ua}"); + try + { + await next(); + } + finally + { + var t1 = System.DateTime.UtcNow; + System.Console.WriteLine($"[DIAG-MW-OUT] {t1:HH:mm:ss.fffffff} cid={cid} status={ctx.Response.StatusCode} elapsed_ms={(t1 - t0).TotalMilliseconds:F1}"); + } + }); + app.UseSession(); app.UseRouting(); app.UseAuthorization(); From 4ffe2942ab7bfa65877ee217783ecdbfdfd4fc2e Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Tue, 26 May 2026 15:27:29 +0200 Subject: [PATCH 2/9] Update dotnet.yml --- manifests/dotnet.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/manifests/dotnet.yml b/manifests/dotnet.yml index 3e109f4b37f..eb2c5c510b6 100644 --- a/manifests/dotnet.yml +++ b/manifests/dotnet.yml @@ -598,11 +598,11 @@ manifest: weblog: [uds, poc] - declaration: missing_feature (Not yet implemented) excluded_weblog: [uds, poc] - tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay: - - weblog_declaration: - "*": v2.50.0 - - declaration: irrelevant (DEBUG-4582) - component_version: '<=3.28.0' + # [DIAG do-not-merge] Test_Debugger_Exception_Replay disabled on this branch — the diagnostic + # middleware added in Startup.cs shifts the ASP.NET middleware chain frames captured by + # Exception Replay snapshots, causing snapshot comparisons to diverge. Restore original + # declaration before merging. + tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay: irrelevant (DIAG do-not-merge) tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay::test_exception_replay_async: flaky (DEBUG-3281) tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay::test_exception_replay_inner: - declaration: bug (DEBUG-2799) From c4320cf00f945fefad26a97770a1c7093126ec16 Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Tue, 26 May 2026 15:49:39 +0200 Subject: [PATCH 3/9] Update dotnet.yml --- manifests/dotnet.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/dotnet.yml b/manifests/dotnet.yml index eb2c5c510b6..60e4cdc28a6 100644 --- a/manifests/dotnet.yml +++ b/manifests/dotnet.yml @@ -763,7 +763,7 @@ manifest: tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Distributed_Context: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Error: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_v2: missing_feature - tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoint is not implemented on weblog) + tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoints is not implemented on weblog) tests/integrations/test_otel_drop_in.py::Test_Otel_Drop_In: missing_feature tests/integrations/test_service_overrides.py::Test_SqlServiceNameSource: v3.40.0 tests/integrations/test_sql.py::Test_Sql: missing_feature (Endpoint is not implemented on weblog) From 8d2102b1b2c7a26720e4a22e5af538792c9bcdda Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Tue, 26 May 2026 16:24:37 +0200 Subject: [PATCH 4/9] Update dotnet.yml --- manifests/dotnet.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/dotnet.yml b/manifests/dotnet.yml index 60e4cdc28a6..eb2c5c510b6 100644 --- a/manifests/dotnet.yml +++ b/manifests/dotnet.yml @@ -763,7 +763,7 @@ manifest: tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Distributed_Context: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Error: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_v2: missing_feature - tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoints is not implemented on weblog) + tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoint is not implemented on weblog) tests/integrations/test_otel_drop_in.py::Test_Otel_Drop_In: missing_feature tests/integrations/test_service_overrides.py::Test_SqlServiceNameSource: v3.40.0 tests/integrations/test_sql.py::Test_Sql: missing_feature (Endpoint is not implemented on weblog) From 5878022a068c730bc789e52eb4c8875187521342 Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Tue, 26 May 2026 16:58:55 +0200 Subject: [PATCH 5/9] Update dotnet.yml --- manifests/dotnet.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/dotnet.yml b/manifests/dotnet.yml index eb2c5c510b6..60e4cdc28a6 100644 --- a/manifests/dotnet.yml +++ b/manifests/dotnet.yml @@ -763,7 +763,7 @@ manifest: tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Distributed_Context: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Error: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_v2: missing_feature - tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoint is not implemented on weblog) + tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoints is not implemented on weblog) tests/integrations/test_otel_drop_in.py::Test_Otel_Drop_In: missing_feature tests/integrations/test_service_overrides.py::Test_SqlServiceNameSource: v3.40.0 tests/integrations/test_sql.py::Test_Sql: missing_feature (Endpoint is not implemented on weblog) From bf705df4980e09ad33a4fab32d669814e54ddafc Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Tue, 26 May 2026 17:20:14 +0200 Subject: [PATCH 6/9] Update dotnet.yml --- manifests/dotnet.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/dotnet.yml b/manifests/dotnet.yml index 60e4cdc28a6..b2ec1606be5 100644 --- a/manifests/dotnet.yml +++ b/manifests/dotnet.yml @@ -763,7 +763,7 @@ manifest: tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Distributed_Context: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Error: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_v2: missing_feature - tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoints is not implemented on weblog) + tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpointsi is not implemented on weblog) tests/integrations/test_otel_drop_in.py::Test_Otel_Drop_In: missing_feature tests/integrations/test_service_overrides.py::Test_SqlServiceNameSource: v3.40.0 tests/integrations/test_sql.py::Test_Sql: missing_feature (Endpoint is not implemented on weblog) From a61eda1fa489efa845783f4544e7912d90f9813e Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Tue, 26 May 2026 17:39:37 +0200 Subject: [PATCH 7/9] Force error --- .github/workflows/run-end-to-end.yml | 25 +++++++++++++++- tests/integrations/test_service_overrides.py | 30 +++++++++++++++++++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-end-to-end.yml b/.github/workflows/run-end-to-end.yml index 928551f0f2f..79c2e5b0da5 100644 --- a/.github/workflows/run-end-to-end.yml +++ b/.github/workflows/run-end-to-end.yml @@ -222,7 +222,30 @@ jobs: run: ./run.sh TRACE_PROPAGATION_STYLE_W3C - name: Run INTEGRATIONS scenario if: steps.build.outcome == 'success' && !cancelled() && contains(inputs.scenarios, '"INTEGRATIONS"') - run: ./run.sh INTEGRATIONS + run: | + # [DIAG do-not-merge] For .NET, loop the INTEGRATIONS scenario 10x to amplify the + # Test_SqlServiceNameSource stall reproduction rate. Each iteration's logs are + # renamed to logs_integrations_iter_N so they all end up in the artifact tarball + # (which globs logs*/). The job fails if any iteration fails so CI surfaces it. + if [ "${{ inputs.library }}" = "dotnet" ]; then + iter_fail=0 + for i in $(seq 1 10); do + echo "===== [DIAG] INTEGRATIONS iteration $i / 10 =====" + rm -rf logs_integrations + ./run.sh INTEGRATIONS || iter_fail=$((iter_fail + 1)) + if [ -d logs_integrations ]; then + mv logs_integrations "logs_integrations_iter_$(printf %02d $i)" + fi + done + echo "===== [DIAG] INTEGRATIONS summary: $iter_fail failed iterations / 10 =====" + # Mirror the last iteration's logs into logs_integrations so downstream steps that + # expect that path keep working. + last_dir=$(ls -d logs_integrations_iter_* 2>/dev/null | tail -1) + if [ -n "$last_dir" ]; then cp -r "$last_dir" logs_integrations; fi + if [ "$iter_fail" -gt 0 ]; then exit 1; fi + else + ./run.sh INTEGRATIONS + fi - name: Run INTEGRATIONS_AWS scenario if: steps.build.outcome == 'success' && !cancelled() && contains(inputs.scenarios, '"INTEGRATIONS_AWS"') run: ./run.sh INTEGRATIONS_AWS diff --git a/tests/integrations/test_service_overrides.py b/tests/integrations/test_service_overrides.py index 55c04b576aa..c813444e00c 100644 --- a/tests/integrations/test_service_overrides.py +++ b/tests/integrations/test_service_overrides.py @@ -2,6 +2,8 @@ # This product includes software developed at Datadog (https://www.datadoghq.com/). # Copyright 2021 Datadog, Inc. +import time + from utils import context, weblog, interfaces, scenarios, features, irrelevant @@ -11,10 +13,36 @@ class Test_SqlServiceNameSource: """Verify that _dd.svc_src is set on SQL spans when the integration overrides the service name""" + # [DIAG do-not-merge] number of attempts to amplify the ~2% stall rate. + # With 100 attempts the chance of seeing at least one stall per CI run is ~87%. + _DIAG_ATTEMPTS = 100 + def setup_sql_srv_src(self): - self.r = weblog.get("/rasp/sqli?user_id=1") + # [DIAG do-not-merge] loop the request to amplify the stall reproduction rate. + # Each attempt uses timeout=30 (vs the usual 5) so we can see whether the request + # eventually completes after the stall, and what the true elapsed time was. + self.responses = [] + for i in range(self._DIAG_ATTEMPTS): + t0 = time.monotonic() + r = weblog.get("/rasp/sqli?user_id=1", timeout=30) + elapsed = time.monotonic() - t0 + self.responses.append((i, r, elapsed)) + # Keep the original-shape attribute so the legacy assertion below still has something + # to look at (we point it at the last response). + self.r = self.responses[-1][1] def test_sql_srv_src(self): + # [DIAG do-not-merge] surface every attempt that stalled or failed. + slow = [(i, r, e) for (i, r, e) in self.responses if r.status_code != 200 or e > 1.0] + print(f"[DIAG] /rasp/sqli stall summary: {len(slow)}/{len(self.responses)} slow-or-failed") + for i, r, e in slow: + rid = r.get_rid() if r.status_code is not None else "" + print(f"[DIAG] iter={i} status={r.status_code} elapsed={e:.3f}s rid={rid}") + # Surface a stall as a test failure so CI flags the run and we collect artifacts. + assert not slow, f"{len(slow)} stalls detected across {len(self.responses)} attempts" + + # Original assertions retained against the last response so we still validate the feature + # whenever no stall occurs. assert self.r.status_code == 200 srv_src_found = False From 762e238ce937d42736d3f8cf948c22e8f6e5e89e Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Tue, 26 May 2026 18:04:29 +0200 Subject: [PATCH 8/9] Update test_service_overrides.py --- tests/integrations/test_service_overrides.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integrations/test_service_overrides.py b/tests/integrations/test_service_overrides.py index c813444e00c..2cca4933713 100644 --- a/tests/integrations/test_service_overrides.py +++ b/tests/integrations/test_service_overrides.py @@ -4,7 +4,7 @@ import time -from utils import context, weblog, interfaces, scenarios, features, irrelevant +from utils import context, weblog, interfaces, scenarios, features, irrelevant, logger @features.service_override_source @@ -34,10 +34,10 @@ def setup_sql_srv_src(self): def test_sql_srv_src(self): # [DIAG do-not-merge] surface every attempt that stalled or failed. slow = [(i, r, e) for (i, r, e) in self.responses if r.status_code != 200 or e > 1.0] - print(f"[DIAG] /rasp/sqli stall summary: {len(slow)}/{len(self.responses)} slow-or-failed") + logger.warning(f"[DIAG] /rasp/sqli stall summary: {len(slow)}/{len(self.responses)} slow-or-failed") for i, r, e in slow: rid = r.get_rid() if r.status_code is not None else "" - print(f"[DIAG] iter={i} status={r.status_code} elapsed={e:.3f}s rid={rid}") + logger.warning(f"[DIAG] iter={i} status={r.status_code} elapsed={e:.3f}s rid={rid}") # Surface a stall as a test failure so CI flags the run and we collect artifacts. assert not slow, f"{len(slow)} stalls detected across {len(self.responses)} attempts" From 225c9c2aa50b75c7e6db1aa8b2a16be1b3a5f170 Mon Sep 17 00:00:00 2001 From: NachoEchevarria Date: Wed, 27 May 2026 11:16:31 +0200 Subject: [PATCH 9/9] update --- .github/workflows/run-end-to-end.yml | 16 +++++++++------- tests/integrations/test_service_overrides.py | 9 ++++++--- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run-end-to-end.yml b/.github/workflows/run-end-to-end.yml index 79c2e5b0da5..1e181f2004d 100644 --- a/.github/workflows/run-end-to-end.yml +++ b/.github/workflows/run-end-to-end.yml @@ -223,21 +223,23 @@ jobs: - name: Run INTEGRATIONS scenario if: steps.build.outcome == 'success' && !cancelled() && contains(inputs.scenarios, '"INTEGRATIONS"') run: | - # [DIAG do-not-merge] For .NET, loop the INTEGRATIONS scenario 10x to amplify the - # Test_SqlServiceNameSource stall reproduction rate. Each iteration's logs are - # renamed to logs_integrations_iter_N so they all end up in the artifact tarball - # (which globs logs*/). The job fails if any iteration fails so CI surfaces it. + # [DIAG do-not-merge] For .NET, loop the INTEGRATIONS scenario 30x to amplify the + # Test_SqlServiceNameSource stall reproduction rate. Each iteration is a fresh scenario + # run (new containers, new tracer process), which is what the bug appears to need — + # looping inside a single scenario doesn't repro past the first call. Each iteration's + # logs are renamed to logs_integrations_iter_NN so they all end up in the artifact + # tarball (which globs logs*/). The job fails if any iteration fails so CI surfaces it. if [ "${{ inputs.library }}" = "dotnet" ]; then iter_fail=0 - for i in $(seq 1 10); do - echo "===== [DIAG] INTEGRATIONS iteration $i / 10 =====" + for i in $(seq 1 30); do + echo "===== [DIAG] INTEGRATIONS iteration $i / 30 =====" rm -rf logs_integrations ./run.sh INTEGRATIONS || iter_fail=$((iter_fail + 1)) if [ -d logs_integrations ]; then mv logs_integrations "logs_integrations_iter_$(printf %02d $i)" fi done - echo "===== [DIAG] INTEGRATIONS summary: $iter_fail failed iterations / 10 =====" + echo "===== [DIAG] INTEGRATIONS summary: $iter_fail failed iterations / 30 =====" # Mirror the last iteration's logs into logs_integrations so downstream steps that # expect that path keep working. last_dir=$(ls -d logs_integrations_iter_* 2>/dev/null | tail -1) diff --git a/tests/integrations/test_service_overrides.py b/tests/integrations/test_service_overrides.py index 2cca4933713..c09064ce081 100644 --- a/tests/integrations/test_service_overrides.py +++ b/tests/integrations/test_service_overrides.py @@ -13,9 +13,12 @@ class Test_SqlServiceNameSource: """Verify that _dd.svc_src is set on SQL spans when the integration overrides the service name""" - # [DIAG do-not-merge] number of attempts to amplify the ~2% stall rate. - # With 100 attempts the chance of seeing at least one stall per CI run is ~87%. - _DIAG_ATTEMPTS = 100 + # [DIAG do-not-merge] number of attempts per scenario run. Kept small (3) because the bug + # appears to be position-sensitive — looping inside one scenario doesn't reproduce it past + # the first call. The repeated calls instead let us compare timing of "second call" vs + # "third call" in case the bug is specifically the first repeat. + # The real amplification happens via the scenario-loop in run-end-to-end.yml. + _DIAG_ATTEMPTS = 3 def setup_sql_srv_src(self): # [DIAG do-not-merge] loop the request to amplify the stall reproduction rate.