Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion .github/workflows/run-end-to-end.yml
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,32 @@ jobs:
run: ./run.sh TRACE_PROPAGATION_STYLE_W3C
- name: Run INTEGRATIONS scenario
if: steps.build.outcome == 'success' && !cancelled() && contains(inputs.scenarios, '"INTEGRATIONS"')
run: ./run.sh INTEGRATIONS
run: |
# [DIAG do-not-merge] For .NET, loop the INTEGRATIONS scenario 30x to amplify the
# Test_SqlServiceNameSource stall reproduction rate. Each iteration is a fresh scenario
# run (new containers, new tracer process), which is what the bug appears to need —
# looping inside a single scenario doesn't repro past the first call. Each iteration's
# logs are renamed to logs_integrations_iter_NN so they all end up in the artifact
# tarball (which globs logs*/). The job fails if any iteration fails so CI surfaces it.
if [ "${{ inputs.library }}" = "dotnet" ]; then
iter_fail=0
for i in $(seq 1 30); do
echo "===== [DIAG] INTEGRATIONS iteration $i / 30 ====="
rm -rf logs_integrations
./run.sh INTEGRATIONS || iter_fail=$((iter_fail + 1))
if [ -d logs_integrations ]; then
mv logs_integrations "logs_integrations_iter_$(printf %02d $i)"
fi
done
echo "===== [DIAG] INTEGRATIONS summary: $iter_fail failed iterations / 30 ====="
# Mirror the last iteration's logs into logs_integrations so downstream steps that
# expect that path keep working.
last_dir=$(ls -d logs_integrations_iter_* 2>/dev/null | tail -1)
if [ -n "$last_dir" ]; then cp -r "$last_dir" logs_integrations; fi
if [ "$iter_fail" -gt 0 ]; then exit 1; fi
else
./run.sh INTEGRATIONS
fi
- name: Run INTEGRATIONS_AWS scenario
if: steps.build.outcome == 'success' && !cancelled() && contains(inputs.scenarios, '"INTEGRATIONS_AWS"')
run: ./run.sh INTEGRATIONS_AWS
Expand Down
12 changes: 6 additions & 6 deletions manifests/dotnet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -598,11 +598,11 @@ manifest:
weblog: [uds, poc]
- declaration: missing_feature (Not yet implemented)
excluded_weblog: [uds, poc]
tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay:
- weblog_declaration:
"*": v2.50.0
- declaration: irrelevant (DEBUG-4582)
component_version: '<=3.28.0'
# [DIAG do-not-merge] Test_Debugger_Exception_Replay disabled on this branch — the diagnostic
# middleware added in Startup.cs shifts the ASP.NET middleware chain frames captured by
# Exception Replay snapshots, causing snapshot comparisons to diverge. Restore original
# declaration before merging.
tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay: irrelevant (DIAG do-not-merge)
tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay::test_exception_replay_async: flaky (DEBUG-3281)
tests/debugger/test_debugger_exception_replay.py::Test_Debugger_Exception_Replay::test_exception_replay_inner:
- declaration: bug (DEBUG-2799)
Expand Down Expand Up @@ -763,7 +763,7 @@ manifest:
tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Distributed_Context: v3.15.0
tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_With_Error: v3.15.0
tests/integrations/test_inferred_proxy.py::Test_AWS_API_Gateway_Inferred_Span_Creation_v2: missing_feature
tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpoint is not implemented on weblog)
tests/integrations/test_mongo.py::Test_Mongo: missing_feature (Endpointsi is not implemented on weblog)
tests/integrations/test_otel_drop_in.py::Test_Otel_Drop_In: missing_feature
tests/integrations/test_service_overrides.py::Test_SqlServiceNameSource: v3.40.0
tests/integrations/test_sql.py::Test_Sql: missing_feature (Endpoint is not implemented on weblog)
Expand Down
35 changes: 33 additions & 2 deletions tests/integrations/test_service_overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# This product includes software developed at Datadog (https://www.datadoghq.com/).
# Copyright 2021 Datadog, Inc.

from utils import context, weblog, interfaces, scenarios, features, irrelevant
import time

from utils import context, weblog, interfaces, scenarios, features, irrelevant, logger


@features.service_override_source
Expand All @@ -11,10 +13,39 @@
class Test_SqlServiceNameSource:
"""Verify that _dd.svc_src is set on SQL spans when the integration overrides the service name"""

# [DIAG do-not-merge] number of attempts per scenario run. Kept small (3) because the bug
# appears to be position-sensitive — looping inside one scenario doesn't reproduce it past
# the first call. The repeated calls instead let us compare timing of "second call" vs
# "third call" in case the bug is specifically the first repeat.
# The real amplification happens via the scenario-loop in run-end-to-end.yml.
_DIAG_ATTEMPTS = 3

def setup_sql_srv_src(self):
self.r = weblog.get("/rasp/sqli?user_id=1")
# [DIAG do-not-merge] loop the request to amplify the stall reproduction rate.
# Each attempt uses timeout=30 (vs the usual 5) so we can see whether the request
# eventually completes after the stall, and what the true elapsed time was.
self.responses = []
for i in range(self._DIAG_ATTEMPTS):
t0 = time.monotonic()
r = weblog.get("/rasp/sqli?user_id=1", timeout=30)
elapsed = time.monotonic() - t0
self.responses.append((i, r, elapsed))
# Keep the original-shape attribute so the legacy assertion below still has something
# to look at (we point it at the last response).
self.r = self.responses[-1][1]

def test_sql_srv_src(self):
# [DIAG do-not-merge] surface every attempt that stalled or failed.
slow = [(i, r, e) for (i, r, e) in self.responses if r.status_code != 200 or e > 1.0]
logger.warning(f"[DIAG] /rasp/sqli stall summary: {len(slow)}/{len(self.responses)} slow-or-failed")
for i, r, e in slow:
rid = r.get_rid() if r.status_code is not None else "<no-rid>"
logger.warning(f"[DIAG] iter={i} status={r.status_code} elapsed={e:.3f}s rid={rid}")
# Surface a stall as a test failure so CI flags the run and we collect artifacts.
assert not slow, f"{len(slow)} stalls detected across {len(self.responses)} attempts"

# Original assertions retained against the last response so we still validate the feature
# whenever no stall occurs.
assert self.r.status_code == 200

srv_src_found = False
Expand Down
8 changes: 8 additions & 0 deletions utils/build/docker/dotnet/poc.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ ENV DD_TRACE_HEADER_TAGS='user-agent:http.request.headers.user-agent'
ENV DD_DATA_STREAMS_ENABLED=true
ENV DD_INTERNAL_TELEMETRY_V2_ENABLED=true

# [DIAG do-not-merge] enable verbose tracer logs to diagnose Test_SqlServiceNameSource stall
ENV DD_TRACE_DEBUG=true
ENV DD_INTERNAL_LOG_LEVEL=debug
# [DIAG do-not-merge] enable Kestrel + ASP.NET pipeline debug logs to see connection accept events
ENV Logging__LogLevel__Default=Debug
ENV Logging__LogLevel__Microsoft.AspNetCore.Server.Kestrel=Debug
ENV Logging__LogLevel__Microsoft.AspNetCore.Hosting=Debug

# .NET runtime config
ENV ASPNETCORE_hostBuilder__reloadConfigOnChange=false
# - Enable dump on crash
Expand Down
8 changes: 8 additions & 0 deletions utils/build/docker/dotnet/uds.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ ENV DD_TRACE_HEADER_TAGS='user-agent:http.request.headers.user-agent'
ENV DD_DATA_STREAMS_ENABLED=true
ENV DD_INTERNAL_TELEMETRY_V2_ENABLED=true

# [DIAG do-not-merge] enable verbose tracer logs to diagnose Test_SqlServiceNameSource stall
ENV DD_TRACE_DEBUG=true
ENV DD_INTERNAL_LOG_LEVEL=debug
# [DIAG do-not-merge] enable Kestrel + ASP.NET pipeline debug logs to see connection accept events
ENV Logging__LogLevel__Default=Debug
ENV Logging__LogLevel__Microsoft.AspNetCore.Server.Kestrel=Debug
ENV Logging__LogLevel__Microsoft.AspNetCore.Hosting=Debug

# .NET runtime config
ENV ASPNETCORE_hostBuilder__reloadConfigOnChange=false
# - Enable dump on crash
Expand Down
38 changes: 34 additions & 4 deletions utils/build/docker/dotnet/weblog/Controllers/RaspController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -219,13 +219,26 @@ public IActionResult SsrfPostJson([FromBody] Model data)
[HttpGet("sqli")]
public IActionResult SqliGet(string user_id)
{
if (!string.IsNullOrEmpty(user_id))
// [DIAG do-not-merge] timing in SqliGet to localize the 5s stall on Test_SqlServiceNameSource.
var t0 = System.DateTime.UtcNow;
System.Console.WriteLine($"[DIAG-CTRL ] {t0:HH:mm:ss.fffffff} SqliGet ENTER user_id={user_id}");
try
{
return Content(SqlQuery(user_id));
if (!string.IsNullOrEmpty(user_id))
{
var sw = System.Diagnostics.Stopwatch.StartNew();
var result = SqlQuery(user_id);
System.Console.WriteLine($"[DIAG-CTRL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqliGet SqlQuery done in {sw.Elapsed.TotalMilliseconds:F2}ms");
return Content(result);
}
else
{
return BadRequest("No params provided");
}
}
else
finally
{
return BadRequest("No params provided");
System.Console.WriteLine($"[DIAG-CTRL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqliGet EXIT elapsed_ms={(System.DateTime.UtcNow - t0).TotalMilliseconds:F2}");
}
}

Expand Down Expand Up @@ -281,19 +294,36 @@ public IActionResult SqliPostJson([FromBody] Model data)

private string SqlQuery(string user)
{
// [DIAG do-not-merge] sub-step timing inside the SQLite path.
var swTotal = System.Diagnostics.Stopwatch.StartNew();
var sb = new System.Text.StringBuilder();
sb.AppendLine("Insecure SQL command executed:");

var sw = System.Diagnostics.Stopwatch.StartNew();
using var conn = Sql.GetSqliteConnection();
var tCreate = sw.Elapsed.TotalMilliseconds;

sw.Restart();
conn.Open();
var tOpen = sw.Elapsed.TotalMilliseconds;

sw.Restart();
using var cmd = conn.CreateCommand();
cmd.CommandText = "SELECT * FROM users WHERE id='" + user + "'";
var tCmd = sw.Elapsed.TotalMilliseconds;

sw.Restart();
using var reader = cmd.ExecuteReader();
var tExec = sw.Elapsed.TotalMilliseconds;

sw.Restart();
while (reader.Read())
{
sb.AppendLine($"{reader["user"]}, {reader["pwd"]}");
}
var tRead = sw.Elapsed.TotalMilliseconds;

System.Console.WriteLine($"[DIAG-SQL ] {System.DateTime.UtcNow:HH:mm:ss.fffffff} SqlQuery user='{user}' total={swTotal.Elapsed.TotalMilliseconds:F2}ms create={tCreate:F2} open={tOpen:F2} cmd={tCmd:F2} exec={tExec:F2} read={tRead:F2}");
return sb.ToString();
}
}
Expand Down
34 changes: 34 additions & 0 deletions utils/build/docker/dotnet/weblog/Program.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Hosting;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Http;
Expand All @@ -11,6 +13,38 @@ public class Program
{
public static void Main(string[] args)
{
// [DIAG do-not-merge] periodic ThreadPool / GC stats logger for Test_SqlServiceNameSource stall diagnosis.
// 250ms cadence gives ~20 datapoints across a 5s window. Look for: pending shooting up, avail dropping
// to 0, gen2 GC tick during the gap, or a long quiet period in the log itself (== process frozen).
_ = Task.Run(async () =>
{
long lastGc0 = 0, lastGc1 = 0, lastGc2 = 0;
while (true)
{
try
{
ThreadPool.GetMinThreads(out var minW, out var minIo);
ThreadPool.GetMaxThreads(out var maxW, out var maxIo);
ThreadPool.GetAvailableThreads(out var availW, out var availIo);
var gc0 = GC.CollectionCount(0);
var gc1 = GC.CollectionCount(1);
var gc2 = GC.CollectionCount(2);
var heapMB = GC.GetTotalMemory(false) / (1024 * 1024);
Console.WriteLine($"[DIAG-POOL ] {DateTime.UtcNow:HH:mm:ss.fffffff} tpThreads={ThreadPool.ThreadCount} pending={ThreadPool.PendingWorkItemCount} completed={ThreadPool.CompletedWorkItemCount} min=({minW}/{minIo}) avail=({availW}/{availIo}) max=({maxW}/{maxIo}) gc=({gc0},{gc1},{gc2}) heapMB={heapMB}");
if (gc0 != lastGc0 || gc1 != lastGc1 || gc2 != lastGc2)
{
Console.WriteLine($"[DIAG-GC ] {DateTime.UtcNow:HH:mm:ss.fffffff} gen0+={gc0 - lastGc0} gen1+={gc1 - lastGc1} gen2+={gc2 - lastGc2}");
lastGc0 = gc0; lastGc1 = gc1; lastGc2 = gc2;
}
}
catch (Exception ex)
{
Console.WriteLine($"[DIAG-POOL ] ERR {ex.Message}");
}
await Task.Delay(250).ConfigureAwait(false);
}
});

// Enable Datadog log injection only if CONFIG_CHAINING_TEST is set to "true"
if (Environment.GetEnvironmentVariable("CONFIG_CHAINING_TEST") == "true")
{
Expand Down
21 changes: 21 additions & 0 deletions utils/build/docker/dotnet/weblog/Startup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ public class Startup
public void ConfigureServices(IServiceCollection services)
{
services.AddSerilog((services, lc) => lc
// [DIAG do-not-merge] surface Kestrel + ASP.NET Hosting Debug events for stall diagnosis
.MinimumLevel.Debug()
.Enrich.FromLogContext()
.WriteTo.Console(new CompactJsonFormatter()));

Expand Down Expand Up @@ -50,6 +52,25 @@ public void Configure(IApplicationBuilder app, IWebHostEnvironment env)

Sql.Setup();

// [DIAG do-not-merge] front-of-pipeline timing to localize the 5s stall on Test_SqlServiceNameSource.
// Compare timestamps with: Kestrel "Request starting", DIAG-CTRL ENTER/EXIT, and DIAG-POOL.
app.Use(async (ctx, next) =>
{
var t0 = System.DateTime.UtcNow;
var cid = ctx.Connection.Id;
var ua = ctx.Request.Headers.UserAgent.ToString();
System.Console.WriteLine($"[DIAG-MW-IN ] {t0:HH:mm:ss.fffffff} cid={cid} {ctx.Request.Method} {ctx.Request.Path}{ctx.Request.QueryString} ua={ua}");
try
{
await next();
}
finally
{
var t1 = System.DateTime.UtcNow;
System.Console.WriteLine($"[DIAG-MW-OUT] {t1:HH:mm:ss.fffffff} cid={cid} status={ctx.Response.StatusCode} elapsed_ms={(t1 - t0).TotalMilliseconds:F1}");
}
});

app.UseSession();
app.UseRouting();
app.UseAuthorization();
Expand Down
Loading