Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/steps/bundle_source.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func (s *bundleSourceStep) run(ctx context.Context) error {
// Bundle images are not multi-arch by design. Here we build it without creating a manifest-listed image.
// Note that we are not configuring a node selector here, so the build will be scheduled on any available
// node no matter the architecture.
return handleBuild(ctx, s.client, s.podClient, *build, func() bool { return false })
return handleBuild(ctx, s.client, s.podClient, *build)
}

func replaceCommand(pullSpec, with string) string {
Expand Down
2 changes: 1 addition & 1 deletion pkg/steps/project_image.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (s *projectDirectoryImageBuildStep) run(ctx context.Context) error {

// Bundle images are non multi-arch by design. No manifest list is needed. Here we spawn a single build.
if s.config.IsBundleImage() {
return handleBuild(ctx, s.client, s.podClient, *build, func() bool { return false })
return handleBuild(ctx, s.client, s.podClient, *build)
}

return handleBuilds(ctx, s.client, s.podClient, *build, s.metricsAgent, newImageBuildOptions(s.architectures.UnsortedList()))
Expand Down
42 changes: 5 additions & 37 deletions pkg/steps/source.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,31 +467,13 @@ func isBuildPhaseTerminated(phase buildapi.BuildPhase) bool {
}

type ImageBuildOptions struct {
Architectures []string
NeedsMultiArchWorkaround func() bool
Architectures []string
}

func newImageBuildOptions(archs []string) ImageBuildOptions {
return ImageBuildOptions{Architectures: archs}
}

// multiArchRepos are repos that need the sleep workaround for multi-arch builds.
var multiArchRepos = sets.New[string](
"openshift/ci-tools",
"openshift/kueue-operator",
"openshift/loki",
"openshift/multiarch-tuning-operator",
"openshift/oadp-must-gather",
"openshift/oadp-operator",
"openshift/openshift-velero-plugin",
"openshift/velero",
"openshift/velero-plugin-for-aws",
"openshift/velero-plugin-for-gcp",
"openshift/velero-plugin-for-legacy-aws",
"openshift/velero-plugin-for-microsoft-azure",
"openshift-eng/baremetal-qe-infra",
)

func handleBuilds(ctx context.Context, buildClient BuildClient, podClient kubernetes.PodClient, build buildapi.Build, metricsAgent *metrics.MetricsAgent, opts ...ImageBuildOptions) error {
var wg sync.WaitGroup

Expand All @@ -500,17 +482,6 @@ func handleBuilds(ctx context.Context, buildClient BuildClient, podClient kubern
o = opts[0]
}

// Create closure that checks if this build is for a multi-arch repo
needsMultiArchWorkaround := o.NeedsMultiArchWorkaround
if needsMultiArchWorkaround == nil {
needsMultiArchWorkaround = func() bool {
if org, repo := build.Labels[LabelMetadataOrg], build.Labels[LabelMetadataRepo]; org != "" && repo != "" {
return multiArchRepos.Has(fmt.Sprintf("%s/%s", org, repo))
}
return false
}
}

builds := constructMultiArchBuilds(build, o.Architectures)
errChan := make(chan error, len(builds))

Expand All @@ -519,7 +490,7 @@ func handleBuilds(ctx context.Context, buildClient BuildClient, podClient kubern
go func(b buildapi.Build) {
defer wg.Done()
metricsAgent.AddNodeWorkload(ctx, b.Namespace, fmt.Sprintf("%s-build", b.Name), b.Name, podClient)
if err := handleBuild(ctx, buildClient, podClient, b, needsMultiArchWorkaround); err != nil {
if err := handleBuild(ctx, buildClient, podClient, b); err != nil {
errChan <- fmt.Errorf("error occurred handling build %s: %w", b.Name, err)
}
metricsAgent.RemoveNodeWorkload(b.Name)
Expand Down Expand Up @@ -577,17 +548,14 @@ func constructMultiArchBuilds(build buildapi.Build, stepArchitectures []string)
return ret
}

func handleBuild(ctx context.Context, client BuildClient, podClient kubernetes.PodClient, build buildapi.Build, needsMultiArchWorkaround func() bool) error {
func handleBuild(ctx context.Context, client BuildClient, podClient kubernetes.PodClient, build buildapi.Build) error {
const attempts = 5
ns, name := build.Namespace, build.Name
var errs []error
if err := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) {
var attempt buildapi.Build
// TODO: This is a workaround to avoid race condition with multiple ci-operator instances building different architectures
// See https://issues.redhat.com/browse/OCPBUGS-65845
if needsMultiArchWorkaround != nil && needsMultiArchWorkaround() {
time.Sleep(5 * time.Minute)
}
// builds are using older src image, adding wait to avoid race condition
//time.Sleep(1 * time.Minute)
build.DeepCopyInto(&attempt)
Comment on lines +551 to 559
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Commented-out race condition fix is a concern if merged.

Lines 557-558 contain dead code with a comment acknowledging a race condition ("builds are using older src image"). The sleep that previously mitigated this is now commented out.

Given the PR title "DO NOT MERGE" and /hold label, this appears intentional for investigation. However, if this were to be merged:

  • The race condition described in the comment would remain unaddressed
  • The commented-out code should be removed entirely or replaced with a proper fix

Consider removing the dead code and comment, or implementing a proper solution before any future merge.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/steps/source.go` around lines 551 - 559, In handleBuild, remove the
commented-out sleep and its explanatory comment (the dead code around the
"builds are using older src image" sleep) or replace it with a deterministic
wait: implement a proper retry/wait that checks the build/pod image readiness
before proceeding (e.g., use wait.PollImmediate or extend the existing
ExponentialBackoff to poll the relevant image/version field on the attempt/build
or associated pod via podClient until it reflects the expected src image),
ensuring you update the logic around build.DeepCopyInto(&attempt) accordingly.

if err := client.Create(ctx, &attempt); err == nil {
logrus.Infof("Created build %q", name)
Expand Down