-
Notifications
You must be signed in to change notification settings - Fork 213
fix: Adds retry on GCP "No Capacity" error during endpoint creation for privatelinkendpoint #4221
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -149,8 +149,20 @@ func resourceCreate(ctx context.Context, d *schema.ResourceData, meta any) diag. | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| privateEndpoint, _, err := connV2.PrivateEndpointServicesApi.CreatePrivateEndpointService(ctx, projectID, request).Execute() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| const maxRetries = 5 | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| const retrySleep = 10 * time.Second | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| var privateEndpoint *admin.EndpointService | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| for attempt := range maxRetries { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| var err error | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| privateEndpoint, _, err = connV2.PrivateEndpointServicesApi.CreatePrivateEndpointService(ctx, projectID, request).Execute() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if err == nil { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| break | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| if admin.IsErrorCode(err, "ATLAS_GENERAL_ERROR") && strings.Contains(err.Error(), "No Capacity") && attempt < maxRetries-1 { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| log.Printf("[DEBUG] Attempt %d/%d: GCP private endpoint creation returned 'No Capacity', retrying in %s...", attempt+1, maxRetries, retrySleep) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| time.Sleep(retrySleep) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| time.Sleep(retrySleep) | |
| select { | |
| case <-ctx.Done(): | |
| return diag.FromErr(ctx.Err()) | |
| case <-time.After(retrySleep): | |
| } |
Copilot
AI
Feb 24, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This custom retry implementation is inconsistent with the established codebase pattern. The repository extensively uses retry.RetryContext from the Terraform SDK's helper package for retry logic (see internal/service/globalclusterconfig/resource_global_cluster_config.go:129, internal/service/streamconnection/state_transition.go:27, internal/service/team/resource_team.go:177, among others).
Using retry.RetryContext provides several advantages: it respects context cancellation, handles timeouts properly, integrates with Terraform's logging, and provides exponential backoff. The current implementation with time.Sleep doesn't check for context cancellation and uses a fixed delay, which could cause the operation to exceed configured timeouts.
| const maxRetries = 5 | |
| const retrySleep = 10 * time.Second | |
| var privateEndpoint *admin.EndpointService | |
| for attempt := range maxRetries { | |
| var err error | |
| privateEndpoint, _, err = connV2.PrivateEndpointServicesApi.CreatePrivateEndpointService(ctx, projectID, request).Execute() | |
| if err == nil { | |
| break | |
| } | |
| if admin.IsErrorCode(err, "ATLAS_GENERAL_ERROR") && strings.Contains(err.Error(), "No Capacity") && attempt < maxRetries-1 { | |
| log.Printf("[DEBUG] Attempt %d/%d: GCP private endpoint creation returned 'No Capacity', retrying in %s...", attempt+1, maxRetries, retrySleep) | |
| time.Sleep(retrySleep) | |
| continue | |
| } | |
| return diag.FromErr(fmt.Errorf(errorPrivateLinkEndpointsCreate, err)) | |
| var privateEndpoint *admin.EndpointService | |
| attempts := 0 | |
| retryErr := retry.RetryContext(ctx, d.Timeout(schema.TimeoutCreate), func() *retry.RetryError { | |
| attempts++ | |
| pe, _, err := connV2.PrivateEndpointServicesApi.CreatePrivateEndpointService(ctx, projectID, request).Execute() | |
| if err != nil { | |
| if admin.IsErrorCode(err, "ATLAS_GENERAL_ERROR") && strings.Contains(err.Error(), "No Capacity") { | |
| log.Printf("[DEBUG] Attempt %d: GCP private endpoint creation returned 'No Capacity', retrying...", attempts) | |
| return retry.RetryableError(err) | |
| } | |
| return retry.NonRetryableError(err) | |
| } | |
| privateEndpoint = pe | |
| return nil | |
| }) | |
| if retryErr != nil { | |
| return diag.FromErr(fmt.Errorf(errorPrivateLinkEndpointsCreate, retryErr)) |
Copilot
AI
Feb 24, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new retry logic for handling "No Capacity" errors lacks test coverage. The existing tests in resource_test.go don't cover this retry scenario. Given that this file has comprehensive test coverage for other scenarios (basicAWS, basicAzure, basicGCP, deleteOnCreateTimeout, etc.), the new retry behavior should also have test coverage to ensure it works correctly and doesn't introduce regressions.
Copilot
AI
Feb 24, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's a potential nil pointer dereference concern worth checking here: privateEndpoint is declared before the loop and then dereferenced at line 169 via privateEndpoint.GetId(), so if the loop could ever finish without either breaking on success or returning an error, that call would panic.
Tracing the loop as written: the condition attempt < maxRetries-1 prevents retrying on the last attempt. When attempt equals 4 (the last iteration) and the call again returns a "No Capacity" error, the condition on line 161 is false, so the code skips the retry logic (lines 162-164) and falls through to the error return on line 166. The loop therefore cannot currently complete with privateEndpoint still nil, but that guarantee depends on the retry condition and the unconditional return staying exactly in sync. Tracking the last error outside the loop and adding an explicit nil check afterwards, as suggested below, makes the invariant explicit and guards against a future refactor breaking it.
| for attempt := range maxRetries { | |
| var err error | |
| privateEndpoint, _, err = connV2.PrivateEndpointServicesApi.CreatePrivateEndpointService(ctx, projectID, request).Execute() | |
| if err == nil { | |
| break | |
| } | |
| if admin.IsErrorCode(err, "ATLAS_GENERAL_ERROR") && strings.Contains(err.Error(), "No Capacity") && attempt < maxRetries-1 { | |
| log.Printf("[DEBUG] Attempt %d/%d: GCP private endpoint creation returned 'No Capacity', retrying in %s...", attempt+1, maxRetries, retrySleep) | |
| time.Sleep(retrySleep) | |
| continue | |
| } | |
| return diag.FromErr(fmt.Errorf(errorPrivateLinkEndpointsCreate, err)) | |
| } | |
| var lastErr error | |
| for attempt := range maxRetries { | |
| privateEndpoint, _, lastErr = connV2.PrivateEndpointServicesApi.CreatePrivateEndpointService(ctx, projectID, request).Execute() | |
| if lastErr == nil { | |
| break | |
| } | |
| if admin.IsErrorCode(lastErr, "ATLAS_GENERAL_ERROR") && strings.Contains(lastErr.Error(), "No Capacity") && attempt < maxRetries-1 { | |
| log.Printf("[DEBUG] Attempt %d/%d: GCP private endpoint creation returned 'No Capacity', retrying in %s...", attempt+1, maxRetries, retrySleep) | |
| time.Sleep(retrySleep) | |
| continue | |
| } | |
| return diag.FromErr(fmt.Errorf(errorPrivateLinkEndpointsCreate, lastErr)) | |
| } | |
| if privateEndpoint == nil { | |
| return diag.FromErr(fmt.Errorf(errorPrivateLinkEndpointsCreate, lastErr)) | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The retry configuration (maxRetries = 5 and retrySleep = 10 seconds) is declared as function-local constants inside resourceCreate instead of following the pattern of defining constants at the package level. The existing code defines timeout constants at the package level (see lines 24-30 where delayAndMinTimeout is defined). Consider defining these retry parameters as package-level constants with descriptive names for better maintainability and consistency.