diff --git a/.github/workflows/run-end-to-end.yml b/.github/workflows/run-end-to-end.yml index 928551f0f2f..1e181f2004d 100644 --- a/.github/workflows/run-end-to-end.yml +++ b/.github/workflows/run-end-to-end.yml @@ -222,7 +222,32 @@ jobs: run: ./run.sh TRACE_PROPAGATION_STYLE_W3C - name: Run INTEGRATIONS scenario if: steps.build.outcome == 'success' && !cancelled() && contains(inputs.scenarios, '"INTEGRATIONS"') - run: ./run.sh INTEGRATIONS + run: | + # [DIAG do-not-merge] For .NET, loop the INTEGRATIONS scenario 30x to amplify the + # Test_SqlServiceNameSource stall reproduction rate. Each iteration is a fresh scenario + # run (new containers, new tracer process), which is what the bug appears to need — + # looping inside a single scenario doesn't repro past the first call. Each iteration's + # logs are renamed to logs_integrations_iter_NN so they all end up in the artifact + # tarball (which globs logs*/). The job fails if any iteration fails so CI surfaces it. + if [ "${{ inputs.library }}" = "dotnet" ]; then + iter_fail=0 + for i in $(seq 1 30); do + echo "===== [DIAG] INTEGRATIONS iteration $i / 30 =====" + rm -rf logs_integrations + ./run.sh INTEGRATIONS || iter_fail=$((iter_fail + 1)) + if [ -d logs_integrations ]; then + mv logs_integrations "logs_integrations_iter_$(printf %02d $i)" + fi + done + echo "===== [DIAG] INTEGRATIONS summary: $iter_fail failed iterations / 30 =====" + # Mirror the last iteration's logs into logs_integrations so downstream steps that + # expect that path keep working. + last_dir=$(ls -d logs_integrations_iter_* 2>/dev/null | tail -1) + if [ -n "$last_dir" ]; then cp -r "$last_dir" logs_integrations; fi + if [ "$iter_fail" -gt 0 ]; then exit 1; fi + else + ./run.sh INTEGRATIONS + fi - name: Run INTEGRATIONS_AWS scenario if: steps.build.outcome == 'success' && !cancelled() && contains(inputs.scenarios, '"INTEGRATIONS_AWS"') run: ./run.sh INTEGRATIONS_AWS diff --git a/manifests/dotnet.yml b/manifests/dotnet.yml index 3e109f4b37f..b2ec1606be5 100644 --- a/manifests/dotnet.yml +++ b/manifests/dotnet.yml @@ -598,11 +598,11 @@ manifest: weblog: [uds, poc] - declaration: missing_feature (Not yet implemented) excluded_weblog: [uds, poc] - tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay: - - weblog_declaration: - "*": v2.50.0 - - declaration: irrelevant (DEBUG-4582) - component_version: '<=3.28.0' + # [DIAG do-not-merge] Test_Debugger_Exception_Replay disabled on this branch — the diagnostic + # middleware added in Startup.cs shifts the ASP.NET middleware chain frames captured by + # Exception Replay snapshots, causing snapshot comparisons to diverge. Restore original + # declaration before merging. + tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay: irrelevant (DIAG do-not-merge) tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay::test_exception_replay_async: flaky (DEBUG-3281) tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay::test_exception_replay_inner: - declaration: bug (DEBUG-2799) @@ -763,7 +763,7 @@ manifest: tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Distributed_Context: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Error: v3.15.0 tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_v2: missing_feature - tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoint is not implemented on weblog) + tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpointsi is not implemented on weblog) tests/integrations/test_otel_drop_in.py::Test_Otel_Drop_In: missing_feature tests/integrations/test_service_overrides.py::Test_SqlServiceNameSource: v3.40.0 tests/integrations/test_sql.py::Test_Sql: missing_feature (Endpoint is not implemented on weblog) diff --git a/tests/integrations/test_service_overrides.py b/tests/integrations/test_service_overrides.py index 55c04b576aa..c09064ce081 100644 --- a/tests/integrations/test_service_overrides.py +++ b/tests/integrations/test_service_overrides.py @@ -2,7 +2,9 @@ # This product includes software developed at Datadog (https://www.datadoghq.com/). # Copyright 2021 Datadog, Inc. -from utils import context, weblog, interfaces, scenarios, features, irrelevant +import time + +from utils import context, weblog, interfaces, scenarios, features, irrelevant, logger @features.service_override_source @@ -11,10 +13,39 @@ class Test_SqlServiceNameSource: """Verify that _dd.svc_src is set on SQL spans when the integration overrides the service name""" + # [DIAG do-not-merge] number of attempts per scenario run. Kept small (3) because the bug + # appears to be position-sensitive — looping inside one scenario doesn't reproduce it past + # the first call. The repeated calls instead let us compare timing of "second call" vs + # "third call" in case the bug is specifically the first repeat. + # The real amplification happens via the scenario-loop in run-end-to-end.yml. + _DIAG_ATTEMPTS = 3 + def setup_sql_srv_src(self): - self.r = weblog.get("/rasp/sqli?user_id=1") + # [DIAG do-not-merge] loop the request to amplify the stall reproduction rate. + # Each attempt uses timeout=30 (vs the usual 5) so we can see whether the request + # eventually completes after the stall, and what the true elapsed time was. + self.responses = [] + for i in range(self._DIAG_ATTEMPTS): + t0 = time.monotonic() + r = weblog.get("/rasp/sqli?user_id=1", timeout=30) + elapsed = time.monotonic() - t0 + self.responses.append((i, r, elapsed)) + # Keep the original-shape attribute so the legacy assertion below still has something + # to look at (we point it at the last response). + self.r = self.responses[-1][1] def test_sql_srv_src(self): + # [DIAG do-not-merge] surface every attempt that stalled or failed. + slow = [(i, r, e) for (i, r, e) in self.responses if r.status_code != 200 or e > 1.0] + logger.warning(f"[DIAG] /rasp/sqli stall summary: {len(slow)}/{len(self.responses)} slow-or-failed") + for i, r, e in slow: + rid = r.get_rid() if r.status_code is not None else "" + logger.warning(f"[DIAG] iter={i} status={r.status_code} elapsed={e:.3f}s rid={rid}") + # Surface a stall as a test failure so CI flags the run and we collect artifacts. + assert not slow, f"{len(slow)} stalls detected across {len(self.responses)} attempts" + + # Original assertions retained against the last response so we still validate the feature + # whenever no stall occurs. assert self.r.status_code == 200 srv_src_found = False diff --git a/utils/build/docker/dotnet/poc.Dockerfile b/utils/build/docker/dotnet/poc.Dockerfile index 803354f7187..6ca4e0ee9c9 100644 --- a/utils/build/docker/dotnet/poc.Dockerfile +++ b/utils/build/docker/dotnet/poc.Dockerfile @@ -38,6 +38,14 @@ ENV DD_TRACE_HEADER_TAGS='user-agent:http.request.headers.user-agent' ENV DD_DATA_STREAMS_ENABLED=true ENV DD_INTERNAL_TELEMETRY_V2_ENABLED=true +# [DIAG do-not-merge] enable verbose tracer logs to diagnose Test_SqlServiceNameSource stall +ENV DD_TRACE_DEBUG=true +ENV DD_INTERNAL_LOG_LEVEL=debug +# [DIAG do-not-merge] enable Kestrel + ASP.NET pipeline debug logs to see connection accept events +ENV Logging__LogLevel__Default=Debug +ENV Logging__LogLevel__Microsoft.AspNetCore.Server.Kestrel=Debug +ENV Logging__LogLevel__Microsoft.AspNetCore.Hosting=Debug + # .NET runtime config ENV ASPNETCORE_hostBuilder__reloadConfigOnChange=false # - Enable dump on crash diff --git a/utils/build/docker/dotnet/uds.Dockerfile b/utils/build/docker/dotnet/uds.Dockerfile index b0aaad82efe..e78baafb1aa 100644 --- a/utils/build/docker/dotnet/uds.Dockerfile +++ b/utils/build/docker/dotnet/uds.Dockerfile @@ -32,6 +32,14 @@ ENV DD_TRACE_HEADER_TAGS='user-agent:http.request.headers.user-agent' ENV DD_DATA_STREAMS_ENABLED=true ENV DD_INTERNAL_TELEMETRY_V2_ENABLED=true +# [DIAG do-not-merge] enable verbose tracer logs to diagnose Test_SqlServiceNameSource stall +ENV DD_TRACE_DEBUG=true +ENV DD_INTERNAL_LOG_LEVEL=debug +# [DIAG do-not-merge] enable Kestrel + ASP.NET pipeline debug logs to see connection accept events +ENV Logging__LogLevel__Default=Debug +ENV Logging__LogLevel__Microsoft.AspNetCore.Server.Kestrel=Debug +ENV Logging__LogLevel__Microsoft.AspNetCore.Hosting=Debug + # .NET runtime config ENV ASPNETCORE_hostBuilder__reloadConfigOnChange=false # - Enable dump on crash diff --git a/utils/build/docker/dotnet/weblog/Controllers/RaspController.cs b/utils/build/docker/dotnet/weblog/Controllers/RaspController.cs index d2b4cf82bb4..63a7619507e 100644 --- a/utils/build/docker/dotnet/weblog/Controllers/RaspController.cs +++ b/utils/build/docker/dotnet/weblog/Controllers/RaspController.cs @@ -219,13 +219,26 @@ public IActionResult SsrfPostJson([FromBody] Model data) [HttpGet("sqli")] public IActionResult SqliGet(string user_id) { - if (!string.IsNullOrEmpty(user_id)) + // [DIAG do-not-merge] timing in SqliGet to localize the 5s stall on Test_SqlServiceNameSource. + var t0 = System.DateTime.UtcNow; + System.Console.WriteLine($"[DIAG-CTRL ] {t0:HH:mm:ss.fffffff} SqliGet ENTER user_id={user_id}"); + try { - return Content(SqlQuery(user_id)); + if (!string.IsNullOrEmpty(user_id)) + { + var sw = System.Diagnostics.Stopwatch.StartNew(); + var result = SqlQuery(user_id); + System.Console.WriteLine($"[DIAG-CTRL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqliGet SqlQuery done in {sw.Elapsed.TotalMilliseconds:F2}ms"); + return Content(result); + } + else + { + return BadRequest("No params provided"); + } } - else + finally { - return BadRequest("No params provided"); + System.Console.WriteLine($"[DIAG-CTRL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqliGet EXIT elapsed_ms={(System.DateTime.UtcNow - t0).TotalMilliseconds:F2}"); } } @@ -281,19 +294,36 @@ public IActionResult SqliPostJson([FromBody] Model data) private string SqlQuery(string user) { + // [DIAG do-not-merge] sub-step timing inside the SQLite path. + var swTotal = System.Diagnostics.Stopwatch.StartNew(); var sb = new System.Text.StringBuilder(); sb.AppendLine("Insecure SQL command executed:"); + + var sw = System.Diagnostics.Stopwatch.StartNew(); using var conn = Sql.GetSqliteConnection(); + var tCreate = sw.Elapsed.TotalMilliseconds; + + sw.Restart(); conn.Open(); + var tOpen = sw.Elapsed.TotalMilliseconds; + + sw.Restart(); using var cmd = conn.CreateCommand(); cmd.CommandText = "SELECT * FROM users WHERE id='" + user + "'"; + var tCmd = sw.Elapsed.TotalMilliseconds; + + sw.Restart(); using var reader = cmd.ExecuteReader(); + var tExec = sw.Elapsed.TotalMilliseconds; + sw.Restart(); while (reader.Read()) { sb.AppendLine($"{reader["user"]}, {reader["pwd"]}"); } + var tRead = sw.Elapsed.TotalMilliseconds; + System.Console.WriteLine($"[DIAG-SQL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqlQuery user='{user}' total={swTotal.Elapsed.TotalMilliseconds:F2}ms create={tCreate:F2} open={tOpen:F2} cmd={tCmd:F2} exec={tExec:F2} read={tRead:F2}"); return sb.ToString(); } } diff --git a/utils/build/docker/dotnet/weblog/Program.cs b/utils/build/docker/dotnet/weblog/Program.cs index d6b1cba06d0..fe432381e95 100644 --- a/utils/build/docker/dotnet/weblog/Program.cs +++ b/utils/build/docker/dotnet/weblog/Program.cs @@ -1,4 +1,6 @@ using System; +using System.Threading; +using System.Threading.Tasks; using Microsoft.AspNetCore.Hosting; using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Http; @@ -11,6 +13,38 @@ public class Program { public static void Main(string[] args) { + // [DIAG do-not-merge] periodic ThreadPool / GC stats logger for Test_SqlServiceNameSource stall diagnosis. + // 250ms cadence gives ~20 datapoints across a 5s window. Look for: pending shooting up, avail dropping + // to 0, gen2 GC tick during the gap, or a long quiet period in the log itself (== process frozen). + _ = Task.Run(async () => + { + long lastGc0 = 0, lastGc1 = 0, lastGc2 = 0; + while (true) + { + try + { + ThreadPool.GetMinThreads(out var minW, out var minIo); + ThreadPool.GetMaxThreads(out var maxW, out var maxIo); + ThreadPool.GetAvailableThreads(out var availW, out var availIo); + var gc0 = GC.CollectionCount(0); + var gc1 = GC.CollectionCount(1); + var gc2 = GC.CollectionCount(2); + var heapMB = GC.GetTotalMemory(false) / (1024 * 1024); + Console.WriteLine($"[DIAG-POOL ] {DateTime.UtcNow:HH:mm:ss.fffffff} tpThreads={ThreadPool.ThreadCount} pending={ThreadPool.PendingWorkItemCount} completed={ThreadPool.CompletedWorkItemCount} min=({minW}/{minIo}) avail=({availW}/{availIo}) max=({maxW}/{maxIo}) gc=({gc0},{gc1},{gc2}) heapMB={heapMB}"); + if (gc0 != lastGc0 || gc1 != lastGc1 || gc2 != lastGc2) + { + Console.WriteLine($"[DIAG-GC ] {DateTime.UtcNow:HH:mm:ss.fffffff} gen0+={gc0 - lastGc0} gen1+={gc1 - lastGc1} gen2+={gc2 - lastGc2}"); + lastGc0 = gc0; lastGc1 = gc1; lastGc2 = gc2; + } + } + catch (Exception ex) + { + Console.WriteLine($"[DIAG-POOL ] ERR {ex.Message}"); + } + await Task.Delay(250).ConfigureAwait(false); + } + }); + // Enable Datadog log injection only if CONFIG_CHAINING_TEST is set to "true" if (Environment.GetEnvironmentVariable("CONFIG_CHAINING_TEST") == "true") { diff --git a/utils/build/docker/dotnet/weblog/Startup.cs b/utils/build/docker/dotnet/weblog/Startup.cs index 3fef703b1ad..8ff9e49539f 100644 --- a/utils/build/docker/dotnet/weblog/Startup.cs +++ b/utils/build/docker/dotnet/weblog/Startup.cs @@ -16,6 +16,8 @@ public class Startup public void ConfigureServices(IServiceCollection services) { services.AddSerilog((services, lc) => lc + // [DIAG do-not-merge] surface Kestrel + ASP.NET Hosting Debug events for stall diagnosis + .MinimumLevel.Debug() .Enrich.FromLogContext() .WriteTo.Console(new CompactJsonFormatter())); @@ -50,6 +52,25 @@ public void Configure(IApplicationBuilder app, IWebHostEnvironment env) Sql.Setup(); + // [DIAG do-not-merge] front-of-pipeline timing to localize the 5s stall on Test_SqlServiceNameSource. + // Compare timestamps with: Kestrel "Request starting", DIAG-CTRL ENTER/EXIT, and DIAG-POOL. + app.Use(async (ctx, next) => + { + var t0 = System.DateTime.UtcNow; + var cid = ctx.Connection.Id; + var ua = ctx.Request.Headers.UserAgent.ToString(); + System.Console.WriteLine($"[DIAG-MW-IN ] {t0:HH:mm:ss.fffffff} cid={cid} {ctx.Request.Method} {ctx.Request.Path}{ctx.Request.QueryString} ua={ua}"); + try + { + await next(); + } + finally + { + var t1 = System.DateTime.UtcNow; + System.Console.WriteLine($"[DIAG-MW-OUT] {t1:HH:mm:ss.fffffff} cid={cid} status={ctx.Response.StatusCode} elapsed_ms={(t1 - t0).TotalMilliseconds:F1}"); + } + }); + app.UseSession(); app.UseRouting(); app.UseAuthorization();