From d64199dd68acc4edb32a9d7185b00d68a87a8a31 Mon Sep 17 00:00:00 2001 From: r2k1 Date: Wed, 3 Jun 2026 12:05:53 +1200 Subject: [PATCH 1/2] fix(e2e): bound per-exec attempt to 15s in wireserver validator The strict validator from #8580 cut the overall poll budget to 30s but left the SPDY exec call inheriting the poll's inner ctx. A single hung exec consumed the entire budget, with zero retries. Build 166502267 hit exactly that: the kube-apiserver exec subresource hung 30s (kubelet log shows no exec entry for the debug pod during the window), the cluster control plane was under load (7 pods on the same build took >60s to become ready), and the validator failed with 'context deadline exceeded'. Fix: - restore 1m overall poll budget - wrap each exec call in a 15s per-attempt context so a stuck SPDY setup gets cancelled and retried instead of starving the budget - bound curl total runtime with --max-time 8 (--connect-timeout only covers TCP connect) - distinguish per-attempt deadlines from other exec errors in logs Strict semantics from #8580 preserved: unexpected curl exits still fail loudly; only transport/setup errors retry. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/validation.go | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/e2e/validation.go b/e2e/validation.go index 7d95efe4bcd..8e994469a5e 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -3,6 +3,7 @@ package e2e import ( "context" "encoding/json" + "errors" "fmt" "strings" "time" @@ -300,11 +301,11 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) { checks := []wireServerCheck{ { - cmd: "curl http://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4", + cmd: "curl http://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4 --max-time 8", desc: "wireserver port 80 goalstate", }, { - cmd: "curl http://168.63.129.16:32526/vmSettings --connect-timeout 4", + cmd: "curl http://168.63.129.16:32526/vmSettings --connect-timeout 4 --max-time 8", desc: "wireserver port 32526 vmSettings", }, } @@ -313,10 +314,19 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) { for _, check := range checks { var execResult *podExecResult - pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { - r, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd) + // Per-attempt cap (15s) prevents a single SPDY/exec hang from consuming the entire + // poll budget. Derived from the poll's inner ctx so it honors both the per-attempt + // cap and the overall poll deadline, whichever fires first. 
+ pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) { + attemptCtx, cancel := context.WithTimeout(ctx, 15*time.Second) + defer cancel() + r, execErr := execOnUnprivilegedPod(attemptCtx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd) if execErr != nil { - s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr) + if errors.Is(execErr, context.DeadlineExceeded) { + s.T.Logf("wireserver check %q: exec attempt timed out after 15s (retrying): %v", check.desc, execErr) + } else { + s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr) + } return false, nil } execResult = r From 65e84615924017c8c65810008cd361a567804614 Mon Sep 17 00:00:00 2001 From: Artur Khantimirov Date: Wed, 3 Jun 2026 12:29:02 +1200 Subject: [PATCH 2/2] fix(e2e): add HTTP/2 keep-alive ping + dial timeout to kube client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two defenses against silent connection wedges that have caused flaky e2e failures (most recently the wireserver validator hang in build 166502267 where a kube SPDY exec sat for 30s until the caller's overall poll deadline fired): * net.Dialer{Timeout: 10s, KeepAlive: 30s} on the REST config so TCP-layer hangs (LB/NAT drops, half-open connections) surface as dial or connection errors instead of indefinite blocking. * HTTP/2 ReadIdleTimeout=30s + PingTimeout=15s so apiserver connections that go silent (control-plane stress, network blip) are actively probed and torn down, returning a connection error that the existing retry layer can act on. These don't cover the active SPDY exec stream itself (SPDY hijacks the connection after the HTTP upgrade), so they complement — not replace — the per-attempt context timeout introduced for the wireserver validator. They do cover every regular CoreV1 / AppsV1 call plus the initial /exec POST. 
Promotes golang.org/x/net from indirect to direct in e2e/go.mod. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/go.mod | 2 +- e2e/kube.go | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/e2e/go.mod b/e2e/go.mod index 10ccc9ce9bf..9d6e46795a9 100644 --- a/e2e/go.mod +++ b/e2e/go.mod @@ -91,7 +91,7 @@ require ( github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/net v0.55.0 // indirect + golang.org/x/net v0.55.0 golang.org/x/oauth2 v0.30.0 // indirect golang.org/x/sys v0.45.0 // indirect golang.org/x/term v0.43.0 // indirect diff --git a/e2e/kube.go b/e2e/kube.go index 9650ab343bc..1f5ea29d5c2 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -5,6 +5,8 @@ import ( "encoding/base64" "encoding/json" "fmt" + "net" + "net/http" "strings" "testing" "time" @@ -13,6 +15,7 @@ import ( "github.com/Azure/agentbaker/e2e/toolkit" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8" + "golang.org/x/net/http2" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" @@ -58,6 +61,25 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana config.QPS = 200 config.Burst = 400 + // Defense-in-depth against silent connection wedges (apiserver SPDY proxy + // hangs, NAT/LB idle timeouts) which manifest as kube exec calls that hang + // indefinitely. Bound the TCP dial and enable HTTP/2 keep-alive pings so + // the transport itself surfaces a dead peer as a connection error, + // triggering retries instead of consuming the caller's timeout budget. 
+ config.Dial = (&net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext + config.WrapTransport = func(rt http.RoundTripper) http.RoundTripper { + if t, ok := rt.(*http.Transport); ok { + if h2, err := http2.ConfigureTransports(t); err == nil { + h2.ReadIdleTimeout = 30 * time.Second + h2.PingTimeout = 15 * time.Second + } + } + return rt + } + dynamic, err := client.New(config, client.Options{}) if err != nil { return nil, fmt.Errorf("create dynamic Kubeclient: %w", err)