Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion e2e/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ require (
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/x448/float16 v0.8.4 // indirect
golang.org/x/net v0.55.0 // indirect
golang.org/x/net v0.55.0
golang.org/x/oauth2 v0.30.0 // indirect
golang.org/x/sys v0.45.0 // indirect
golang.org/x/term v0.43.0 // indirect
Expand Down
22 changes: 22 additions & 0 deletions e2e/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"encoding/base64"
"encoding/json"
"fmt"
"net"
"net/http"
"strings"
"testing"
"time"
Expand All @@ -13,6 +15,7 @@ import (
"github.com/Azure/agentbaker/e2e/toolkit"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
"golang.org/x/net/http2"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -58,6 +61,25 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana
config.QPS = 200
config.Burst = 400

// Defense-in-depth against silent connection wedges (apiserver SPDY proxy
// hangs, NAT/LB idle timeouts), which manifest as kube exec calls that hang
// indefinitely. Bound the TCP dial and enable HTTP/2 keep-alive pings so
// the transport itself surfaces a dead peer as a connection error,
// triggering retries instead of consuming the caller's timeout budget.
// The HTTP/2 tuning is best-effort: if the round tripper is not an
// *http.Transport, or HTTP/2 cannot be configured on it, it is returned
// unchanged.
config.Dial = (&net.Dialer{
Timeout: 10 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext
config.WrapTransport = func(rt http.RoundTripper) http.RoundTripper {
if t, ok := rt.(*http.Transport); ok {
if h2, err := http2.ConfigureTransports(t); err == nil {
h2.ReadIdleTimeout = 30 * time.Second
h2.PingTimeout = 15 * time.Second
}
}
return rt
}

dynamic, err := client.New(config, client.Options{})
if err != nil {
return nil, fmt.Errorf("create dynamic Kubeclient: %w", err)
Expand Down
20 changes: 15 additions & 5 deletions e2e/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package e2e
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
Expand Down Expand Up @@ -300,11 +301,11 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) {

checks := []wireServerCheck{
{
cmd: "curl http://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4",
cmd: "curl http://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4 --max-time 8",
desc: "wireserver port 80 goalstate",
},
{
cmd: "curl http://168.63.129.16:32526/vmSettings --connect-timeout 4",
cmd: "curl http://168.63.129.16:32526/vmSettings --connect-timeout 4 --max-time 8",
desc: "wireserver port 32526 vmSettings",
},
}
Expand All @@ -313,10 +314,19 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) {

for _, check := range checks {
var execResult *podExecResult
pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
r, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
// Per-attempt cap (15s) prevents a single SPDY/exec hang from consuming the entire
// poll budget. attemptCtx is derived from the poll's inner ctx, so it honors both the
// per-attempt cap and the overall poll deadline, whichever fires first.
pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) {
attemptCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
defer cancel()
r, execErr := execOnUnprivilegedPod(attemptCtx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
if execErr != nil {
s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr)
if errors.Is(execErr, context.DeadlineExceeded) {
s.T.Logf("wireserver check %q: exec attempt timed out after 15s (retrying): %v", check.desc, execErr)
Comment on lines +322 to +326
} else {
s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr)
}
return false, nil
}
execResult = r
Expand Down
Loading