diff --git a/e2e/go.mod b/e2e/go.mod index 10ccc9ce9bf..9d6e46795a9 100644 --- a/e2e/go.mod +++ b/e2e/go.mod @@ -91,7 +91,7 @@ require ( github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/net v0.55.0 // indirect + golang.org/x/net v0.55.0 golang.org/x/oauth2 v0.30.0 // indirect golang.org/x/sys v0.45.0 // indirect golang.org/x/term v0.43.0 // indirect diff --git a/e2e/kube.go b/e2e/kube.go index 9650ab343bc..1f5ea29d5c2 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -5,6 +5,8 @@ import ( "encoding/base64" "encoding/json" "fmt" + "net" + "net/http" "strings" "testing" "time" @@ -13,6 +15,7 @@ import ( "github.com/Azure/agentbaker/e2e/toolkit" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8" + "golang.org/x/net/http2" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" @@ -58,6 +61,25 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana config.QPS = 200 config.Burst = 400 + // Defense-in-depth against silent connection wedges (apiserver SPDY proxy + // hangs, NAT/LB idle timeouts) which manifest as kube exec calls that hang + // indefinitely. Bound the TCP dial and enable HTTP/2 keep-alive pings so + // the transport itself surfaces a dead peer as a connection error, + // triggering retries instead of consuming the caller's timeout budget. + config.Dial = (&net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext + config.WrapTransport = func(rt http.RoundTripper) http.RoundTripper { + if t, ok := rt.(*http.Transport); ok { + if h2, err := http2.ConfigureTransports(t); err == nil { + h2.ReadIdleTimeout = 30 * time.Second + h2.PingTimeout = 15 * time.Second + } + } + return rt + } + dynamic, err := client.New(config, client.Options{}) if err != nil { return nil, fmt.Errorf("create dynamic Kubeclient: %w", err) diff --git a/e2e/validation.go b/e2e/validation.go index 7d95efe4bcd..8e994469a5e 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -3,6 +3,7 @@ package e2e import ( "context" "encoding/json" + "errors" "fmt" "strings" "time" @@ -300,11 +301,11 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) { checks := []wireServerCheck{ { - cmd: "curl http://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4", + cmd: "curl http://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4 --max-time 8", desc: "wireserver port 80 goalstate", }, { - cmd: "curl http://168.63.129.16:32526/vmSettings --connect-timeout 4", + cmd: "curl http://168.63.129.16:32526/vmSettings --connect-timeout 4 --max-time 8", desc: "wireserver port 32526 vmSettings", }, } @@ -313,10 +314,19 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) { for _, check := range checks { var execResult *podExecResult - pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { - r, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd) + // Per-attempt cap (15s) prevents a single SPDY/exec hang from consuming the entire + // poll budget. Derived from the poll's inner ctx so it honors both the per-attempt + // cap and the overall poll deadline, whichever fires first. + pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) { + attemptCtx, cancel := context.WithTimeout(ctx, 15*time.Second) + defer cancel() + r, execErr := execOnUnprivilegedPod(attemptCtx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd) if execErr != nil { - s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr) + if errors.Is(execErr, context.DeadlineExceeded) { + s.T.Logf("wireserver check %q: exec attempt timed out after 15s (retrying): %v", check.desc, execErr) + } else { + s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr) + } return false, nil } execResult = r