diff --git a/config/302-pac-configmap.yaml b/config/302-pac-configmap.yaml index c035bf2179..e9cbf63aa2 100644 --- a/config/302-pac-configmap.yaml +++ b/config/302-pac-configmap.yaml @@ -173,6 +173,13 @@ data: # Default: true skip-push-event-for-pr-commits: "true" + # Selects the concurrency queue backend used by the watcher. + # "memory" keeps the existing in-process queue state. + # "lease" uses Kubernetes Leases plus PipelineRun claims to recover more safely + # from watcher restarts and cluster/API timing issues. + # Restart the watcher after changing this setting. + concurrency-backend: "memory" + # Configure a custom console here, the driver support custom parameters from # Repo CR along a few other template variable, see documentation for more # details diff --git a/docs/content/docs/advanced/concurrency.md b/docs/content/docs/advanced/concurrency.md index 6bd97f824b..cd41292618 100644 --- a/docs/content/docs/advanced/concurrency.md +++ b/docs/content/docs/advanced/concurrency.md @@ -4,36 +4,133 @@ weight: 2 --- This page illustrates how Pipelines-as-Code manages concurrent PipelineRun execution. When you set a concurrency limit on a Repository CR, Pipelines-as-Code queues incoming PipelineRuns and starts them only when capacity allows. +The watcher supports two queue backends controlled by the global `concurrency-backend` setting in the `pipelines-as-code` ConfigMap: + +- `memory` keeps queue state in the watcher process. This is the historical behavior and remains the default. +- `lease` stores queue coordination in Kubernetes using `Lease` objects and short-lived PipelineRun claims. This mode is more resilient when the watcher restarts or the cluster is slow to reconcile updates. 
+ +{{< tech_preview "Lease-backed concurrency backend" >}} + ## Flow diagram ```mermaid -graph TD - A1[Controller] --> B1(Validate & Process Event) - B1 --> C1{Is concurrency defined?} - C1 -->|Not Defined| D1[Create PipelineRun with state='started'] - C1 -->|Defined| E1[Create PipelineRun with pending status and state='queued'] - - Z[Pipelines-as-Code] - - A[Watcher] --> B(PipelineRun Reconciler) - B --> C{Check state} - C --> |completed| F(Return, nothing to do!) - C --> |queued| D(Create Queue for Repository) - C --> |started| E{Is PipelineRun Done?} - D --> O(Add PipelineRun in the queue) - O --> P{If PipelineRuns running < concurrency_limit} - P --> |Yes| Q(Start the top most PipelineRun in the Queue) - Q --> P - P --> |No| R[Return and wait for your turn] - E --> |Yes| G(Report Status to provider) - E --> |No| H(Requeue Request) - H --> B - G --> I(Update status in Repository) - I --> J(Update state to 'completed') - J --> K{Check if concurrency was defined?} - K --> |Yes| L(Remove PipelineRun from Queue) - L --> M(Start the next PipelineRun from Queue) - M --> N[Done!] 
- K --> |No| N +flowchart TD + A[Webhook event] --> B[Controller resolves Repository CR] + B --> C{concurrency_limit set?} + C -->|No| D[Create PipelineRun with state=started] + C -->|Yes| E[Create PipelineRun with state=queued and spec.status=pending] + + D --> F[Watcher reconciles started PipelineRun] + E --> G[Watcher reconciles queued PipelineRun] + + G --> H{Queue backend} + H -->|memory| I[Use in-process semaphore] + H -->|lease| J[Acquire per-repository Lease and inspect live PipelineRuns] + + I --> K{Capacity available?} + J --> K + K -->|No| L[Keep PipelineRun queued] + K -->|Yes| M[Claim candidate and patch state=started] + + M --> F + F --> N{PipelineRun done?} + N -->|No| F + N -->|Yes| O[Report final status] + O --> P[Release slot and try next queued run] + P --> G +``` + +## Backend selection + +To enable the Kubernetes-backed queue coordination, set the global config to: +```yaml +data: + concurrency-backend: "lease" ``` + +Restart the watcher after changing `concurrency-backend`; the backend is selected at startup. + +When `lease` mode is enabled, Pipelines-as-Code still uses the existing `queued`, `started`, and `completed` PipelineRun states. The difference is that promotion of the next queued PipelineRun is serialized with a per-repository `Lease`, which reduces queue drift during cluster/API instability. + +## How lease promotion works + +When the watcher reconciles a queued PipelineRun under the `lease` backend, it follows this sequence: + +1. Acquire the per-repository Kubernetes Lease (retry up to 20 times with 100 ms delay). +2. List live PipelineRuns for that repository. +3. Separate them into running, claimed, and claimable queued runs. +4. Compute available capacity: `concurrency_limit - running - claimed`. +5. Patch one or more queued runs with short-lived claim annotations (`queue-claimed-by`, `queue-claimed-at`). +6. Release the repository Lease. +7. Re-fetch the claimed run and patch it to `started`. 
+ +If promotion fails at step 7, the watcher records the failure on the PipelineRun, clears the claim, and another reconcile retries later. + +Claims expire after **30 seconds**. If a watcher crashes or stalls before completing promotion, another instance can pick up the run once the claim expires. + +## Recovery loop + +When the `lease` backend is active, the watcher starts a background recovery loop that runs every **31 seconds** (claim TTL + 1 s buffer). It looks for repositories where: + +- there is no started PipelineRun +- there is no queued PipelineRun with an active (unexpired) claim +- there is still at least one recoverable queued PipelineRun + +A queued PipelineRun is recoverable when it has `state=queued`, `spec.status=Pending`, is not done or cancelled, and has a valid `execution-order` annotation. + +When a candidate is found, the recovery loop clears stale debug annotations and re-enqueues the oldest recoverable run so normal promotion logic runs again. + +## Debugging the Lease Backend + +When `concurrency-backend: "lease"` is enabled, queued `PipelineRun`s expose queue debugging state directly in annotations: + +- `pipelinesascode.tekton.dev/queue-decision` +- `pipelinesascode.tekton.dev/queue-debug-summary` +- `pipelinesascode.tekton.dev/queue-claimed-by` +- `pipelinesascode.tekton.dev/queue-claimed-at` +- `pipelinesascode.tekton.dev/queue-promotion-retries` +- `pipelinesascode.tekton.dev/queue-promotion-last-error` + +This makes it possible to diagnose most queue issues with `kubectl` before looking at watcher logs. 
+
+### Useful commands
+
+```bash
+kubectl get pipelinerun <pipelinerun-name> -n <namespace> -o jsonpath='{.metadata.annotations.pipelinesascode\.tekton\.dev/queue-decision}{"\n"}'
+kubectl get pipelinerun <pipelinerun-name> -n <namespace> -o jsonpath='{.metadata.annotations.pipelinesascode\.tekton\.dev/queue-debug-summary}{"\n"}'
+kubectl describe pipelinerun <pipelinerun-name> -n <namespace>
+kubectl get events -n <namespace> --field-selector involvedObject.kind=Repository
+```
+
+### Queue decisions
+
+- `waiting_for_slot`: the run is queued and waiting for repository capacity.
+- `claim_active`: another watcher already holds an active short-lived claim on this run.
+- `claimed_for_promotion`: this run has been claimed and is being promoted to `started`.
+- `promotion_failed`: the watcher failed while promoting the run to `started`.
+- `recovery_requeued`: the lease recovery loop noticed this run and enqueued it again.
+- `missing_execution_order`: the run is queued but its execution order annotation does not include itself.
+- `not_recoverable`: the run is still `queued` but is no longer eligible for lease recovery.
+
+### Events
+
+The watcher also emits repository-scoped Kubernetes events for the most important transitions:
+
+- `QueueClaimedForPromotion`
+- `QueuePromotionFailed`
+- `QueueRecoveryRequeued`
+- `QueueLeaseAcquireTimeout`
+
+### Troubleshooting
+
+| Symptom | Queue decision | Likely cause | Action |
+| --- | --- | --- | --- |
+| Run stuck queued, nothing running | `waiting_for_slot` | Completed run was not cleaned up or finalizer is stuck | Check if a `started` PipelineRun still exists for the repo. If it is done but state was not updated, delete it or patch its state to `completed`. |
+| Run stuck queued, another run is running | `waiting_for_slot` | Normal — the run is waiting for the active run to finish. | No action needed unless the running PipelineRun is itself stuck. |
+| Run keeps cycling between queued and claimed | `claim_active` | Two watcher replicas are contending for the same run. | Wait for the claim to expire (30 s).
If it persists, check watcher logs for lease acquisition errors. | +| Run shows promotion failures | `promotion_failed` | The watcher failed to patch the run to `started` (API error, webhook, or admission rejection). | Check `queue-promotion-last-error` and `queue-promotion-retries` annotations. Resolve the underlying API or admission error. | +| Run was recovered but is stuck again | `recovery_requeued` | The recovery loop re-enqueued the run but promotion failed again on the next attempt. | Check for repeated `QueuePromotionFailed` events on the repository. The underlying issue (permissions, resource quota, webhook) must be fixed. | +| Run is queued but marked not recoverable | `not_recoverable` | The run was cancelled, completed, or lost its `execution-order` annotation. | Inspect the PipelineRun — if it should still run, re-apply the `execution-order` annotation manually. | + +If the queue decision and events do not explain the behavior, switch the watcher to debug logging and grep for the repository key and PipelineRun key. The lease backend logs include lease acquisition attempts, active claim evaluation, queue-state snapshots, and recovery loop selections. diff --git a/docs/content/docs/api/configmap.md b/docs/content/docs/api/configmap.md index 607f71eba4..24d9b28e42 100644 --- a/docs/content/docs/api/configmap.md +++ b/docs/content/docs/api/configmap.md @@ -345,6 +345,20 @@ skip-push-event-for-pr-commits: "true" {{< /param >}} +{{< param name="concurrency-backend" type="string" default="memory" id="param-concurrency-backend" >}} +Selects the queue coordination backend used by the watcher. Supported values: + +- `memory`: in-process queue tracking. This is the default and matches the historical behavior. +- `lease`: Kubernetes-backed coordination using `Lease` objects and short-lived PipelineRun claims for improved recovery during watcher restarts or API instability. This backend is Technology Preview. + +Restart the watcher after changing this setting. 
+
+```yaml
+concurrency-backend: "memory"
+```
+
+{{< /param >}}
+
 ## Complete Example
 
 ```yaml
@@ -381,6 +395,7 @@ data:
   remember-ok-to-test: "true"
   require-ok-to-test-sha: "false"
   skip-push-event-for-pr-commits: "true"
+  concurrency-backend: "memory"
 ```
 
 ## Updating configuration
diff --git a/docs/content/docs/guides/repository-crd/concurrency.md b/docs/content/docs/guides/repository-crd/concurrency.md
index e827dd3e7c..bb7f1b28ee 100644
--- a/docs/content/docs/guides/repository-crd/concurrency.md
+++ b/docs/content/docs/guides/repository-crd/concurrency.md
@@ -3,26 +3,113 @@ title: Concurrency
 weight: 2
 ---
 
-This page explains how to limit the number of concurrent PipelineRuns for a Repository CR and how to integrate with Kueue for Kubernetes-native job queueing. Use concurrency limits when you need to control resource consumption or prevent PipelineRuns from overwhelming your cluster.
+Use `spec.concurrency_limit` on a Repository CR to cap how many `PipelineRun`s may run at once for that repository.
+This is useful when you need to control cluster usage, preserve ordering for related runs, or avoid a burst of webhook events starting too many `PipelineRun`s at once.
 
-Set the `concurrency_limit` field to define the maximum number of PipelineRuns running at any time for a Repository CR. This prevents resource exhaustion and ensures predictable scheduling when multiple events arrive in rapid succession.
+## Repository setting
+
+Set the `concurrency_limit` field on the Repository CR:
 
 ```yaml
 spec:
   concurrency_limit: <number>
 ```
 
-When multiple PipelineRuns match the event, Pipelines-as-Code starts them in alphabetical order by PipelineRun name.
+When a webhook event produces multiple `PipelineRun`s for the same repository: + +- the controller creates them with an `execution-order` annotation +- runs that cannot start immediately are created as `state=queued` with Tekton `spec.status=pending` +- the watcher promotes queued runs to `state=started` only when repository capacity is available + +If `concurrency_limit: 1`, only one run for that repository is active at a time and the rest stay queued until the watcher promotes them. + +## End-to-end flow + +1. The controller decides whether the repository is concurrency-limited. +2. If there is no limit, it creates `PipelineRun`s directly in `started`. +3. If there is a limit, it creates `PipelineRun`s in `queued` and records `execution-order`. +4. The watcher reconciles every `PipelineRun` that has a Pipelines-as-Code state annotation. +5. For queued runs, the watcher asks the selected queue backend whether a slot is available. +6. If a run is selected, the watcher patches it to `started`. +7. When a started run finishes, the watcher reports status and asks the backend for the next queued candidate. 
+ +## Queue flow diagram + +```mermaid +flowchart TD + A[Webhook event] --> B[Controller resolves Repository CR] + B --> C{concurrency_limit set?} + C -->|No| D[Create PipelineRun with state=started] + C -->|Yes| E[Create PipelineRun with state=queued and spec.status=pending] + + D --> F[Watcher reconciles started PipelineRun] + E --> G[Watcher reconciles queued PipelineRun] + + G --> H{Queue backend} + H -->|memory| I[Use in-process semaphore] + H -->|lease| J[Acquire per-repository Lease and inspect live PipelineRuns] + + I --> K{Capacity available?} + J --> K + K -->|No| L[Keep PipelineRun queued] + K -->|Yes| M[Claim candidate and patch state=started] + + M --> F + F --> N{PipelineRun done?} + N -->|No| F + N -->|Yes| O[Report final status] + O --> P[Release slot and try next queued run] + P --> G +``` + +## Backend behavior + +The watcher supports two queue backends controlled by the global `concurrency-backend` setting in the `pipelines-as-code` ConfigMap. + +### `memory` backend + +This is the default. -Example: +- Each repository gets an in-memory semaphore in the watcher process. +- The watcher keeps separate running and pending queues. +- Startup rebuilds queue state from existing `started` and `queued` `PipelineRun`s. +- Coordination is local to that watcher process. + +This backend is simple and fast, but it depends on watcher-local state remaining in sync with the cluster view. + +### `lease` backend + +{{< tech_preview "Lease-backed concurrency backend" >}} + +- Each repository uses a Kubernetes `Lease` as a short critical section. +- The watcher recomputes queue state from live `PipelineRun`s rather than trusting only process memory. +- A queued run is considered temporarily reserved when it carries short-lived claim annotations (`queue-claimed-by` and `queue-claimed-at`). If the watcher crashes or stalls, another instance can recover after the claim expires. 
+- The watcher sorts candidates using the recorded `execution-order`, then falls back to creation time. +- A background recovery loop re-enqueues the oldest recoverable queued run when a repository has no active started run and no active claim. + +This backend is designed for environments where the watcher may restart, the API server is slow, or promotion to `started` can fail transiently. + +For debugging annotations, queue decisions, events, and the full promotion flow see [Advanced Concurrency]({{< relref "/docs/advanced/concurrency" >}}). + +## Choosing the backend + +Select the global backend in the Pipelines-as-Code ConfigMap: + +```yaml +data: + concurrency-backend: "memory" +``` + +or: + +```yaml +data: + concurrency-backend: "lease" +``` -If you have three PipelineRuns in your `.tekton/` directory and you create a pull -request with a `concurrency_limit` of 1 in the repository configuration, -Pipelines-as-Code executes all PipelineRuns in alphabetical order, one after the -other. At any given time, only one PipelineRun is in the running state, -while the rest are queued. +Changing this setting requires restarting the watcher so it can recreate the queue manager with the new backend. -For additional concurrency strategies and global configuration options, see [Advanced Concurrency]({{< relref "/docs/advanced/concurrency" >}}). +For the global `concurrency-backend` setting itself, see [ConfigMap Reference]({{< relref "/docs/api/configmap" >}}). 
## Kueue - Kubernetes-native Job Queueing diff --git a/hack/gh-workflow-ci.sh b/hack/gh-workflow-ci.sh index abbfc862df..7403deb25e 100755 --- a/hack/gh-workflow-ci.sh +++ b/hack/gh-workflow-ci.sh @@ -200,6 +200,13 @@ get_tests() { run_e2e_tests() { set +x + + # Enable lease-based concurrency backend for all E2E providers + kubectl -n pipelines-as-code patch configmap pipelines-as-code --type merge \ + -p '{"data":{"concurrency-backend":"lease"}}' + kubectl -n pipelines-as-code rollout restart deployment/pipelines-as-code-watcher + kubectl -n pipelines-as-code rollout status deployment/pipelines-as-code-watcher --timeout=120s + target="${TEST_PROVIDER}" export PAC_E2E_KEEP_NS=true diff --git a/pkg/apis/pipelinesascode/keys/keys.go b/pkg/apis/pipelinesascode/keys/keys.go index ef25cce505..21755a811a 100644 --- a/pkg/apis/pipelinesascode/keys/keys.go +++ b/pkg/apis/pipelinesascode/keys/keys.go @@ -61,6 +61,13 @@ const ( LogURL = pipelinesascode.GroupName + "/log-url" ExecutionOrder = pipelinesascode.GroupName + "/execution-order" SCMReportingPLRStarted = pipelinesascode.GroupName + "/scm-reporting-plr-started" + QueueClaimedBy = pipelinesascode.GroupName + "/queue-claimed-by" + QueueClaimedAt = pipelinesascode.GroupName + "/queue-claimed-at" + QueueDecision = pipelinesascode.GroupName + "/queue-decision" + QueueDebugSummary = pipelinesascode.GroupName + "/queue-debug-summary" + QueuePromotionRetries = pipelinesascode.GroupName + "/queue-promotion-retries" + QueuePromotionBlocked = pipelinesascode.GroupName + "/queue-promotion-blocked" + QueuePromotionLastErr = pipelinesascode.GroupName + "/queue-promotion-last-error" SecretCreated = pipelinesascode.GroupName + "/secret-created" CloneURL = pipelinesascode.GroupName + "/clone-url" // PublicGithubAPIURL default is "https://api.github.com" but it can be overridden by X-GitHub-Enterprise-Host header. 
diff --git a/pkg/params/config_sync.go b/pkg/params/config_sync.go index 7689edb3f0..be36e8aeff 100644 --- a/pkg/params/config_sync.go +++ b/pkg/params/config_sync.go @@ -13,6 +13,10 @@ import ( "knative.dev/pkg/system" ) +var terminateProcessForConfigChange = func() { + _ = syscall.Kill(os.Getpid(), syscall.SIGTERM) +} + func StartConfigSync(ctx context.Context, run *Run) { // init pac config _ = run.UpdatePacConfig(ctx) @@ -28,7 +32,19 @@ func StartConfigSync(ctx context.Context, run *Run) { // nothing to do }, UpdateFunc: func(_, _ any) { - _ = run.UpdatePacConfig(ctx) + oldBackend, newBackend, changed, err := updatePacConfigAndDetectBackendChange(ctx, run) + if err != nil { + return + } + if changed { + if run.Clients.Log != nil { + run.Clients.Log.Infof( + "concurrency-backend changed from %q to %q; restarting process so the queue backend is recreated", + oldBackend, newBackend, + ) + } + terminateProcessForConfigChange() + } }, DeleteFunc: func(_ any) { // nothing to do @@ -46,3 +62,13 @@ func StartConfigSync(ctx context.Context, run *Run) { signal.Notify(sig, syscall.SIGINT, syscall.SIGTERM) <-sig } + +func updatePacConfigAndDetectBackendChange(ctx context.Context, run *Run) (string, string, bool, error) { + oldBackend := run.Info.GetPacOpts().ConcurrencyBackend + if err := run.UpdatePacConfig(ctx); err != nil { + return oldBackend, oldBackend, false, err + } + + newBackend := run.Info.GetPacOpts().ConcurrencyBackend + return oldBackend, newBackend, oldBackend != "" && newBackend != "" && oldBackend != newBackend, nil +} diff --git a/pkg/params/config_sync_test.go b/pkg/params/config_sync_test.go new file mode 100644 index 0000000000..8cf6afc307 --- /dev/null +++ b/pkg/params/config_sync_test.go @@ -0,0 +1,123 @@ +package params + +import ( + "testing" + + "github.com/openshift-pipelines/pipelines-as-code/pkg/consoleui" + "github.com/openshift-pipelines/pipelines-as-code/pkg/params/clients" + "github.com/openshift-pipelines/pipelines-as-code/pkg/params/info" 
+ "github.com/openshift-pipelines/pipelines-as-code/pkg/params/settings" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubefake "k8s.io/client-go/kubernetes/fake" + rtesting "knative.dev/pkg/reconciler/testing" + + "go.uber.org/zap" + "gotest.tools/v3/assert" +) + +func TestUpdatePacConfigAndDetectBackendChange(t *testing.T) { + tests := []struct { + name string + initialBackend string + configMapName string + configData map[string]string + wantOld string + wantNew string + wantChanged bool + wantErr string + }{ + { + name: "detects backend change", + initialBackend: settings.ConcurrencyBackendMemory, + configMapName: "pac-config", + configData: map[string]string{ + "concurrency-backend": settings.ConcurrencyBackendLease, + "tekton-dashboard-url": "https://dashboard.example.test", + }, + wantOld: settings.ConcurrencyBackendMemory, + wantNew: settings.ConcurrencyBackendLease, + wantChanged: true, + }, + { + name: "ignores unchanged backend", + initialBackend: settings.ConcurrencyBackendLease, + configMapName: "pac-config", + configData: map[string]string{ + "concurrency-backend": settings.ConcurrencyBackendLease, + "tekton-dashboard-url": "https://dashboard.example.test", + }, + wantOld: settings.ConcurrencyBackendLease, + wantNew: settings.ConcurrencyBackendLease, + wantChanged: false, + }, + { + name: "does not report change when previous backend is empty", + initialBackend: "", + configMapName: "pac-config", + configData: map[string]string{ + "concurrency-backend": settings.ConcurrencyBackendLease, + "tekton-dashboard-url": "https://dashboard.example.test", + }, + wantOld: "", + wantNew: settings.ConcurrencyBackendLease, + wantChanged: false, + }, + { + name: "returns update error when configmap is missing", + initialBackend: settings.ConcurrencyBackendLease, + configMapName: "missing-config", + configData: map[string]string{ + "concurrency-backend": settings.ConcurrencyBackendLease, + }, + wantOld: 
settings.ConcurrencyBackendLease, + wantNew: settings.ConcurrencyBackendLease, + wantErr: "configmaps \"missing-config\" not found", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + ctx = info.StoreNS(ctx, "pac") + + run := &Run{ + Clients: clients.Clients{ + Kube: kubefake.NewSimpleClientset(&corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pac-config", + Namespace: "pac", + }, + Data: tt.configData, + }), + Log: zap.NewNop().Sugar(), + }, + Info: info.Info{ + Pac: &info.PacOpts{ + Settings: settings.Settings{ + ConcurrencyBackend: tt.initialBackend, + }, + }, + Controller: &info.ControllerInfo{ + Configmap: tt.configMapName, + }, + }, + } + run.Clients.InitClients() + run.Clients.SetConsoleUI(consoleui.FallBackConsole{}) + + oldBackend, newBackend, changed, err := updatePacConfigAndDetectBackendChange(ctx, run) + if tt.wantErr != "" { + assert.ErrorContains(t, err, tt.wantErr) + assert.Equal(t, oldBackend, tt.wantOld) + assert.Equal(t, newBackend, tt.wantNew) + assert.Equal(t, changed, false) + return + } + assert.NilError(t, err) + assert.Equal(t, oldBackend, tt.wantOld) + assert.Equal(t, newBackend, tt.wantNew) + assert.Equal(t, changed, tt.wantChanged) + }) + } +} diff --git a/pkg/params/settings/config.go b/pkg/params/settings/config.go index c1a38e2856..e557a6ebff 100644 --- a/pkg/params/settings/config.go +++ b/pkg/params/settings/config.go @@ -15,6 +15,8 @@ import ( const ( PACApplicationNameDefaultValue = "Pipelines as Code CI" + ConcurrencyBackendMemory = "memory" + ConcurrencyBackendLease = "lease" HubURLKey = "hub-url" HubCatalogNameKey = "hub-catalog-name" @@ -80,8 +82,9 @@ type Settings struct { CustomConsolePRTaskLog string `json:"custom-console-url-pr-tasklog"` CustomConsoleNamespaceURL string `json:"custom-console-url-namespace"` - RememberOKToTest bool `json:"remember-ok-to-test"` - RequireOkToTestSHA bool `json:"require-ok-to-test-sha"` + RememberOKToTest bool 
`json:"remember-ok-to-test"` + RequireOkToTestSHA bool `json:"require-ok-to-test-sha"` + ConcurrencyBackend string `default:"memory" json:"concurrency-backend"` } func (s *Settings) DeepCopy(out *Settings) { @@ -110,6 +113,7 @@ func DefaultValidators() map[string]func(string) error { "CustomConsoleURL": isValidURL, "CustomConsolePRTaskLog": startWithHTTPorHTTPS, "CustomConsolePRDetail": startWithHTTPorHTTPS, + "ConcurrencyBackend": isValidConcurrencyBackend, } } @@ -159,3 +163,12 @@ func startWithHTTPorHTTPS(url string) error { } return nil } + +func isValidConcurrencyBackend(backend string) error { + switch backend { + case "", ConcurrencyBackendMemory, ConcurrencyBackendLease: + return nil + default: + return fmt.Errorf("invalid concurrency backend %q, must be one of %q or %q", backend, ConcurrencyBackendMemory, ConcurrencyBackendLease) + } +} diff --git a/pkg/params/settings/config_test.go b/pkg/params/settings/config_test.go index 7ccdbfedc0..be158cd507 100644 --- a/pkg/params/settings/config_test.go +++ b/pkg/params/settings/config_test.go @@ -49,6 +49,7 @@ func TestSyncConfig(t *testing.T) { CustomConsolePRTaskLog: "", CustomConsoleNamespaceURL: "", RememberOKToTest: false, + ConcurrencyBackend: ConcurrencyBackendMemory, }, }, { @@ -79,6 +80,7 @@ func TestSyncConfig(t *testing.T) { "remember-ok-to-test": "false", "skip-push-event-for-pr-commits": "true", "require-ok-to-test-sha": "true", + "concurrency-backend": "lease", }, expectedStruct: Settings{ ApplicationName: "pac-pac", @@ -110,6 +112,7 @@ func TestSyncConfig(t *testing.T) { CustomConsoleNamespaceURL: "https://custom-console-namespace", RememberOKToTest: false, RequireOkToTestSHA: true, + ConcurrencyBackend: ConcurrencyBackendLease, }, }, { @@ -147,6 +150,13 @@ func TestSyncConfig(t *testing.T) { }, expectedError: "custom validation failed for field CustomConsolePRTaskLog: invalid value, must start with http:// or https://", }, + { + name: "invalid concurrency backend", + configMap: map[string]string{ 
+ "concurrency-backend": "sqlite", + }, + expectedError: `custom validation failed for field ConcurrencyBackend: invalid concurrency backend "sqlite", must be one of "memory" or "lease"`, + }, } for _, tc := range testCases { diff --git a/pkg/queue/common.go b/pkg/queue/common.go index 4a891cad7b..a18f92e6c9 100644 --- a/pkg/queue/common.go +++ b/pkg/queue/common.go @@ -1,7 +1,13 @@ package queue import ( + "slices" + "strings" "time" + + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" + "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" + tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" ) type Semaphore interface { @@ -18,3 +24,59 @@ type Semaphore interface { getCurrentRunning() []string getCurrentPending() []string } + +func IsRecoverableQueuedPipelineRun(pr *tektonv1.PipelineRun) bool { + if pr == nil { + return false + } + if pr.GetAnnotations()[keys.State] != kubeinteraction.StateQueued { + return false + } + if pr.Spec.Status != tektonv1.PipelineRunSpecStatusPending { + return false + } + if pr.IsDone() || pr.IsCancelled() { + return false + } + _, ok := executionOrderIndex(pr) + return ok +} + +func HasActiveLeaseQueueClaim(pr *tektonv1.PipelineRun, now time.Time, ttl time.Duration) bool { + if pr == nil { + return false + } + + claimedBy, claimAge := LeaseQueueClaimInfo(pr, now) + return claimedBy != "" && claimAge != unknownDuration() && claimAge <= ttl +} + +func ExecutionOrderList(pr *tektonv1.PipelineRun) []string { + order := pr.GetAnnotations()[keys.ExecutionOrder] + if order == "" { + return nil + } + return strings.Split(order, ",") +} + +func ExecutionOrderIndex(pr *tektonv1.PipelineRun) (int, bool) { + order := ExecutionOrderList(pr) + if len(order) == 0 { + return 0, false + } + + key := PrKey(pr) + index := slices.Index(order, key) + if index < 0 { + return 0, false + } + return index, true +} + +func executionOrderList(pr *tektonv1.PipelineRun) []string { + return 
ExecutionOrderList(pr) +} + +func executionOrderIndex(pr *tektonv1.PipelineRun) (int, bool) { + return ExecutionOrderIndex(pr) +} diff --git a/pkg/queue/common_test.go b/pkg/queue/common_test.go new file mode 100644 index 0000000000..cee3ab419f --- /dev/null +++ b/pkg/queue/common_test.go @@ -0,0 +1,150 @@ +package queue + +import ( + "testing" + "time" + + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" + "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" + tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" + "gotest.tools/v3/assert" + duckv1 "knative.dev/pkg/apis/duck/v1" +) + +func TestIsRecoverableQueuedPipelineRun(t *testing.T) { + tests := []struct { + name string + pr *tektonv1.PipelineRun + want bool + }{ + { + name: "queued pending with execution order", + pr: newTestPR("queued", time.Unix(1_700_001_000, 0), nil, map[string]string{ + keys.State: kubeinteraction.StateQueued, + keys.ExecutionOrder: "test-ns/queued", + }, tektonv1.PipelineRunSpec{ + Status: tektonv1.PipelineRunSpecStatusPending, + }), + want: true, + }, + { + name: "nil pipelineRun", + pr: nil, + want: false, + }, + { + name: "non queued state", + pr: newTestPR("started", time.Unix(1_700_001_000, 0), nil, map[string]string{ + keys.State: kubeinteraction.StateStarted, + keys.ExecutionOrder: "test-ns/started", + }, tektonv1.PipelineRunSpec{ + Status: tektonv1.PipelineRunSpecStatusPending, + }), + want: false, + }, + { + name: "missing pending status", + pr: newTestPR("running", time.Unix(1_700_001_000, 0), nil, map[string]string{ + keys.State: kubeinteraction.StateQueued, + keys.ExecutionOrder: "test-ns/running", + }, tektonv1.PipelineRunSpec{}), + want: false, + }, + { + name: "done pipelineRun", + pr: &tektonv1.PipelineRun{ + ObjectMeta: newTestPR("done", time.Unix(1_700_001_000, 0), nil, map[string]string{ + keys.State: kubeinteraction.StateQueued, + keys.ExecutionOrder: "test-ns/done", + }, tektonv1.PipelineRunSpec{ + Status: 
tektonv1.PipelineRunSpecStatusPending, + }).ObjectMeta, + Spec: tektonv1.PipelineRunSpec{ + Status: tektonv1.PipelineRunSpecStatusPending, + }, + Status: tektonv1.PipelineRunStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{{ + Type: "Succeeded", + Status: "True", + }}, + }, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, IsRecoverableQueuedPipelineRun(tt.pr), tt.want) + }) + } +} + +func TestHasActiveLeaseQueueClaim(t *testing.T) { + now := time.Unix(1_700_001_500, 0) + tests := []struct { + name string + pr *tektonv1.PipelineRun + want bool + }{ + { + name: "active claim within ttl", + pr: newTestPR("active", now, nil, map[string]string{ + keys.QueueClaimedBy: "watcher-a", + keys.QueueClaimedAt: now.Add(-time.Minute).Format(time.RFC3339Nano), + }, tektonv1.PipelineRunSpec{}), + want: true, + }, + { + name: "expired claim", + pr: newTestPR("expired", now, nil, map[string]string{ + keys.QueueClaimedBy: "watcher-a", + keys.QueueClaimedAt: now.Add(-10 * time.Minute).Format(time.RFC3339Nano), + }, tektonv1.PipelineRunSpec{}), + want: false, + }, + { + name: "malformed timestamp", + pr: newTestPR("malformed", now, nil, map[string]string{ + keys.QueueClaimedBy: "watcher-a", + keys.QueueClaimedAt: "not-a-time", + }, tektonv1.PipelineRunSpec{}), + want: false, + }, + { + name: "missing claimant", + pr: newTestPR("missing", now, nil, map[string]string{ + keys.QueueClaimedAt: now.Add(-time.Minute).Format(time.RFC3339Nano), + }, tektonv1.PipelineRunSpec{}), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, HasActiveLeaseQueueClaim(tt.pr, now, 5*time.Minute), tt.want) + }) + } +} + +func TestExecutionOrderHelpers(t *testing.T) { + pr := newTestPR("second", time.Unix(1_700_001_800, 0), nil, map[string]string{ + keys.ExecutionOrder: "test-ns/first,test-ns/second", + }, tektonv1.PipelineRunSpec{}) + + assert.DeepEqual(t, 
ExecutionOrderList(pr), []string{"test-ns/first", "test-ns/second"}) + + index, ok := ExecutionOrderIndex(pr) + assert.Assert(t, ok) + assert.Equal(t, index, 1) + + assert.DeepEqual(t, ExecutionOrderList(newTestPR("empty", time.Unix(1_700_001_800, 0), nil, map[string]string{}, tektonv1.PipelineRunSpec{})), []string(nil)) + + index, ok = ExecutionOrderIndex(newTestPR("missing", time.Unix(1_700_001_800, 0), nil, map[string]string{ + keys.ExecutionOrder: "test-ns/first", + }, tektonv1.PipelineRunSpec{})) + assert.Equal(t, index, 0) + assert.Assert(t, !ok) +} diff --git a/pkg/queue/debug_info.go b/pkg/queue/debug_info.go new file mode 100644 index 0000000000..4c5eb3e4ae --- /dev/null +++ b/pkg/queue/debug_info.go @@ -0,0 +1,229 @@ +package queue + +import ( + "context" + "fmt" + "strconv" + "time" + + "github.com/openshift-pipelines/pipelines-as-code/pkg/action" + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" + "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" + "github.com/openshift-pipelines/pipelines-as-code/pkg/params/settings" + tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" + tektonclient "github.com/tektoncd/pipeline/pkg/client/clientset/versioned" + "go.uber.org/zap" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + QueueDecisionWaitingForSlot = "waiting_for_slot" + QueueDecisionClaimActive = "claim_active" + QueueDecisionClaimedForPromote = "claimed_for_promotion" + QueueDecisionPromotionFailed = "promotion_failed" + QueueDecisionRecoveryRequeued = "recovery_requeued" + QueueDecisionMissingOrder = "missing_execution_order" + QueueDecisionNotRecoverable = "not_recoverable" +) + +const unknownQueueDebugValue = -1 + +type DebugSnapshot struct { + Backend string + RepoKey string + Position int + Running int + Claimed int + Queued int + Limit int + ClaimedBy string + ClaimAge time.Duration + LastDecision string +} + +func (d 
DebugSnapshot) Summary() string { + backend := d.Backend + if backend == "" { + backend = settings.ConcurrencyBackendLease + } + + return fmt.Sprintf( + "backend=%s repo=%s position=%s running=%s claimed=%s queued=%s limit=%s claimedBy=%s claimAge=%s lastDecision=%s", + backend, + formatQueueDebugString(d.RepoKey), + formatQueueDebugInt(d.Position), + formatQueueDebugInt(d.Running), + formatQueueDebugInt(d.Claimed), + formatQueueDebugInt(d.Queued), + formatQueueDebugInt(d.Limit), + formatQueueDebugString(d.ClaimedBy), + formatQueueDebugDuration(d.ClaimAge), + formatQueueDebugString(d.LastDecision), + ) +} + +func SyncQueueDebugAnnotations( + ctx context.Context, + logger *zap.SugaredLogger, + tekton tektonclient.Interface, + pr *tektonv1.PipelineRun, + snapshot DebugSnapshot, +) error { + if tekton == nil || pr == nil { + return nil + } + + latest, err := tekton.TektonV1().PipelineRuns(pr.GetNamespace()).Get(ctx, pr.GetName(), metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return err + } + + if !IsQueueOnlyAnnotationRelevant(latest) { + if logger != nil { + logger.Debugf( + "skipping queue debug annotation update for pipelinerun %s because latest state=%s spec.status=%s done=%t cancelled=%t", + PrKey(latest), latest.GetAnnotations()[keys.State], latest.Spec.Status, latest.IsDone(), latest.IsCancelled(), + ) + } + if hasQueueDebugAnnotations(latest) { + return ClearQueueDebugAnnotations(ctx, logger, tekton, latest) + } + return nil + } + + summary := snapshot.Summary() + currentAnnotations := latest.GetAnnotations() + if currentAnnotations[keys.QueueDecision] == snapshot.LastDecision && + currentAnnotations[keys.QueueDebugSummary] == summary { + return nil + } + + if logger != nil { + logger.Debugf( + "updating queue debug annotations for pipelinerun %s: decision=%s summary=%q", + PrKey(pr), snapshot.LastDecision, summary, + ) + } + + _, err = action.PatchPipelineRun(ctx, logger, "queue debug", tekton, latest, 
map[string]any{ + "metadata": map[string]any{ + "annotations": map[string]any{ + keys.QueueDecision: snapshot.LastDecision, + keys.QueueDebugSummary: summary, + }, + }, + }) + return err +} + +func ClearQueueDebugAnnotations( + ctx context.Context, + logger *zap.SugaredLogger, + tekton tektonclient.Interface, + pr *tektonv1.PipelineRun, +) error { + if tekton == nil || pr == nil { + return nil + } + + if !hasQueueDebugAnnotations(pr) { + return nil + } + + if logger != nil { + logger.Debugf("clearing queue debug annotations for pipelinerun %s", PrKey(pr)) + } + + _, err := action.PatchPipelineRun(ctx, logger, "queue debug cleanup", tekton, pr, map[string]any{ + "metadata": map[string]any{ + "annotations": map[string]any{ + keys.QueueDecision: nil, + keys.QueueDebugSummary: nil, + }, + }, + }) + return err +} + +func LeaseQueueCleanupAnnotations() map[string]any { + return map[string]any{ + keys.QueueClaimedBy: nil, + keys.QueueClaimedAt: nil, + keys.QueueDecision: nil, + keys.QueueDebugSummary: nil, + keys.QueuePromotionRetries: nil, + keys.QueuePromotionBlocked: nil, + keys.QueuePromotionLastErr: nil, + } +} + +func hasQueueDebugAnnotations(pr *tektonv1.PipelineRun) bool { + if pr == nil { + return false + } + annotations := pr.GetAnnotations() + return annotations[keys.QueueDecision] != "" || annotations[keys.QueueDebugSummary] != "" +} + +func IsQueueOnlyAnnotationRelevant(pr *tektonv1.PipelineRun) bool { + if pr == nil { + return false + } + if pr.GetAnnotations()[keys.State] != kubeinteraction.StateQueued { + return false + } + return pr.Spec.Status == tektonv1.PipelineRunSpecStatusPending && !pr.IsDone() && !pr.IsCancelled() +} + +func LeaseQueueClaimInfo(pr *tektonv1.PipelineRun, now time.Time) (string, time.Duration) { + if pr == nil { + return "", unknownDuration() + } + + annotations := pr.GetAnnotations() + claimedBy := annotations[keys.QueueClaimedBy] + claimedAt := annotations[keys.QueueClaimedAt] + if claimedBy == "" || claimedAt == "" { + return 
claimedBy, unknownDuration() + } + + claimedTime, err := time.Parse(time.RFC3339Nano, claimedAt) + if err != nil { + return claimedBy, unknownDuration() + } + + age := now.Sub(claimedTime) + if age < 0 { + age = 0 + } + return claimedBy, age +} + +func formatQueueDebugInt(v int) string { + if v == unknownQueueDebugValue { + return "n/a" + } + return strconv.Itoa(v) +} + +func formatQueueDebugString(v string) string { + if v == "" { + return "n/a" + } + return v +} + +func formatQueueDebugDuration(v time.Duration) string { + if v == unknownDuration() { + return "n/a" + } + return v.Truncate(time.Second).String() +} + +func unknownDuration() time.Duration { + return -1 +} diff --git a/pkg/queue/debug_info_test.go b/pkg/queue/debug_info_test.go new file mode 100644 index 0000000000..238e73795b --- /dev/null +++ b/pkg/queue/debug_info_test.go @@ -0,0 +1,215 @@ +package queue + +import ( + "strings" + "testing" + "time" + + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/v1alpha1" + "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" + testclient "github.com/openshift-pipelines/pipelines-as-code/pkg/test/clients" + tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" + "go.uber.org/zap" + zapobserver "go.uber.org/zap/zaptest/observer" + "gotest.tools/v3/assert" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + duckv1 "knative.dev/pkg/apis/duck/v1" + rtesting "knative.dev/pkg/reconciler/testing" +) + +func TestDebugSnapshotSummary(t *testing.T) { + summary := DebugSnapshot{ + Backend: "lease", + RepoKey: "ns/repo", + Position: 2, + Running: 1, + Claimed: 1, + Queued: 3, + Limit: 2, + ClaimedBy: "watcher-a", + ClaimAge: 5 * time.Second, + LastDecision: QueueDecisionClaimActive, + }.Summary() + + for _, fragment := range []string{ + "backend=lease", + "repo=ns/repo", + "position=2", + "running=1", + "claimed=1", + "queued=3", + "limit=2", 
+ "claimedBy=watcher-a", + "claimAge=5s", + "lastDecision=claim_active", + } { + assert.Assert(t, strings.Contains(summary, fragment), "missing fragment %q in %q", fragment, summary) + } +} + +func TestSyncQueueDebugAnnotationsSkipsUnchangedPatch(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + repo := &v1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{ + Name: "repo", + Namespace: "test-ns", + }, + } + pipelineRun := newLeaseQueuedPRWithOrder("first", time.Unix(1_700_002_000, 0), repo, "test-ns/first", map[string]string{ + keys.QueueDecision: QueueDecisionWaitingForSlot, + keys.QueueDebugSummary: DebugSnapshot{RepoKey: "test-ns/repo", LastDecision: QueueDecisionWaitingForSlot}.Summary(), + }) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{pipelineRun}, + }) + + err := SyncQueueDebugAnnotations(ctx, logger, stdata.Pipeline, pipelineRun, DebugSnapshot{ + RepoKey: "test-ns/repo", + LastDecision: QueueDecisionWaitingForSlot, + }) + assert.NilError(t, err) + + patchCount := 0 + for _, action := range stdata.Pipeline.Actions() { + if action.GetVerb() == "patch" && action.GetResource().Resource == "pipelineruns" { + patchCount++ + } + } + assert.Equal(t, patchCount, 0) +} + +func TestSyncQueueDebugAnnotationsSkipsLiveNonQueuedPipelineRun(t *testing.T) { + tests := []struct { + name string + livePR *tektonv1.PipelineRun + }{ + { + name: "latest started", + livePR: &tektonv1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "first", + Namespace: "test-ns", + Annotations: map[string]string{ + keys.State: kubeinteraction.StateStarted, + }, + }, + Spec: tektonv1.PipelineRunSpec{}, + }, + }, + { + name: "latest completed", + livePR: &tektonv1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "first", + Namespace: "test-ns", + Annotations: map[string]string{ + keys.State: kubeinteraction.StateCompleted, + }, + }, + 
Spec: tektonv1.PipelineRunSpec{}, + Status: tektonv1.PipelineRunStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{{ + Type: "Succeeded", + Status: "True", + }}, + }, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + repo := &v1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{ + Name: "repo", + Namespace: "test-ns", + }, + } + staleQueued := newLeaseQueuedPRWithOrder("first", time.Unix(1_700_002_100, 0), repo, "test-ns/first", nil) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{tt.livePR}, + }) + + err := SyncQueueDebugAnnotations(ctx, logger, stdata.Pipeline, staleQueued, DebugSnapshot{ + RepoKey: "test-ns/repo", + LastDecision: QueueDecisionClaimActive, + }) + assert.NilError(t, err) + + updated, err := stdata.Pipeline.TektonV1().PipelineRuns("test-ns").Get(ctx, "first", metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, updated.GetAnnotations()[keys.QueueDecision], "") + assert.Equal(t, updated.GetAnnotations()[keys.QueueDebugSummary], "") + + patchCount := 0 + for _, action := range stdata.Pipeline.Actions() { + if action.GetVerb() == "patch" && action.GetResource().Resource == "pipelineruns" { + patchCount++ + } + } + assert.Equal(t, patchCount, 0) + }) + } +} + +func TestSyncQueueDebugAnnotationsClearsStaleAnnotationsOnAdvancedPipelineRun(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + repo := &v1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{ + Name: "repo", + Namespace: "test-ns", + }, + } + staleQueued := newLeaseQueuedPRWithOrder("first", time.Unix(1_700_002_200, 0), repo, "test-ns/first", nil) + liveCompleted := &tektonv1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "first", + Namespace: 
"test-ns", + Annotations: map[string]string{ + keys.State: kubeinteraction.StateCompleted, + keys.QueueDecision: QueueDecisionClaimActive, + keys.QueueDebugSummary: "backend=lease repo=test-ns/repo", + }, + }, + Status: tektonv1.PipelineRunStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{{ + Type: "Succeeded", + Status: "True", + }}, + }, + }, + } + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{liveCompleted}, + }) + + err := SyncQueueDebugAnnotations(ctx, logger, stdata.Pipeline, staleQueued, DebugSnapshot{ + RepoKey: "test-ns/repo", + LastDecision: QueueDecisionWaitingForSlot, + }) + assert.NilError(t, err) + + updated, err := stdata.Pipeline.TektonV1().PipelineRuns("test-ns").Get(ctx, "first", metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, updated.GetAnnotations()[keys.QueueDecision], "") + assert.Equal(t, updated.GetAnnotations()[keys.QueueDebugSummary], "") + + patchCount := 0 + for _, action := range stdata.Pipeline.Actions() { + if action.GetVerb() == "patch" && action.GetResource().Resource == "pipelineruns" { + patchCount++ + } + } + assert.Equal(t, patchCount, 1) +} diff --git a/pkg/queue/lease_manager.go b/pkg/queue/lease_manager.go new file mode 100644 index 0000000000..3433134620 --- /dev/null +++ b/pkg/queue/lease_manager.go @@ -0,0 +1,766 @@ +package queue + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "os" + "sort" + "strings" + "time" + + "github.com/openshift-pipelines/pipelines-as-code/pkg/action" + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/v1alpha1" + "github.com/openshift-pipelines/pipelines-as-code/pkg/formatting" + "github.com/openshift-pipelines/pipelines-as-code/pkg/generated/clientset/versioned" + "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" + 
"github.com/openshift-pipelines/pipelines-as-code/pkg/params/settings" + tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" + tektonclient "github.com/tektoncd/pipeline/pkg/client/clientset/versioned" + "go.uber.org/zap" + coordinationv1 "k8s.io/api/coordination/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/retry" +) + +const ( + defaultLeaseClaimTTL = 30 * time.Second + defaultLeaseDuration = int32(30) + defaultLeaseAcquireRetries = 20 + defaultLeaseRetryDelay = 100 * time.Millisecond +) + +type LeaseManager struct { + logger *zap.SugaredLogger + kube kubernetes.Interface + tekton tektonclient.Interface + leaseNamespace string + identity string + claimTTL time.Duration + leaseDuration int32 + now func() time.Time +} + +var _ ManagerInterface = (*LeaseManager)(nil) + +func NewManagerForBackend( + logger *zap.SugaredLogger, + kube kubernetes.Interface, + tekton tektonclient.Interface, + leaseNamespace, backend string, +) ManagerInterface { + if backend == settings.ConcurrencyBackendLease { + logger.Debugf("initializing lease-backed concurrency manager in namespace %s", leaseNamespace) + return NewLeaseManager(logger, kube, tekton, leaseNamespace) + } + logger.Debugf("initializing in-memory concurrency manager for backend %q", backend) + return NewManager(logger) +} + +func NewLeaseManager( + logger *zap.SugaredLogger, + kube kubernetes.Interface, + tekton tektonclient.Interface, + leaseNamespace string, +) *LeaseManager { + hostname, err := os.Hostname() + if err != nil || hostname == "" { + hostname = "pac-watcher" + } + + return &LeaseManager{ + logger: logger, + kube: kube, + tekton: tekton, + leaseNamespace: leaseNamespace, + identity: fmt.Sprintf("%s-%d", hostname, time.Now().UnixNano()), + claimTTL: defaultLeaseClaimTTL, + leaseDuration: defaultLeaseDuration, + now: time.Now, + } +} + +func (m *LeaseManager) InitQueues(context.Context, 
tektonclient.Interface, versioned.Interface) error { + return nil +} + +func (m *LeaseManager) RecoveryInterval() time.Duration { + return m.claimTTL +} + +func DefaultLeaseClaimTTL() time.Duration { + return defaultLeaseClaimTTL +} + +func (m *LeaseManager) RemoveRepository(repo *v1alpha1.Repository) { + releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + m.logger.Debugf("deleting concurrency lease for repository %s", RepoKey(repo)) + if err := m.kube.CoordinationV1().Leases(m.leaseNamespace).Delete(releaseCtx, repoLeaseName(RepoKey(repo)), metav1.DeleteOptions{}); err != nil && + !apierrors.IsNotFound(err) { + m.logger.Warnf("failed to delete queue lease for repository %s: %v", RepoKey(repo), err) + } +} + +func (m *LeaseManager) QueuedPipelineRuns(repo *v1alpha1.Repository) []string { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + state, err := m.getRepoQueueState(ctx, repo, nil, "") + if err != nil { + m.logger.Warnf("failed to compute queued pipelineruns for repository %s: %v", RepoKey(repo), err) + return []string{} + } + + keys := make([]string, 0, len(state.claimed)+len(state.queued)) + for i := range state.claimed { + keys = append(keys, PrKey(&state.claimed[i])) + } + for i := range state.queued { + keys = append(keys, PrKey(&state.queued[i])) + } + return keys +} + +func (m *LeaseManager) RunningPipelineRuns(repo *v1alpha1.Repository) []string { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + state, err := m.getRepoQueueState(ctx, repo, nil, "") + if err != nil { + m.logger.Warnf("failed to compute running pipelineruns for repository %s: %v", RepoKey(repo), err) + return []string{} + } + + keys := make([]string, 0, len(state.running)) + for i := range state.running { + keys = append(keys, PrKey(&state.running[i])) + } + return keys +} + +func (m *LeaseManager) AddListToRunningQueue(ctx context.Context, repo 
*v1alpha1.Repository, orderedList []string) ([]string, error) { + if repo.Spec.ConcurrencyLimit == nil || *repo.Spec.ConcurrencyLimit == 0 { + m.logger.Debugf("skipping lease queue claim for repository %s because concurrency limit is disabled", RepoKey(repo)) + return []string{}, nil + } + m.logger.Debugf( + "attempting to claim queued pipelineruns for repository %s with concurrency limit %d and preferred order %v", + RepoKey(repo), *repo.Spec.ConcurrencyLimit, orderedList, + ) + return m.claimNextQueued(ctx, repo, orderedList, "") +} + +func (m *LeaseManager) AddToPendingQueue(*v1alpha1.Repository, []string) error { + return nil +} + +func (m *LeaseManager) RemoveFromQueue(ctx context.Context, repo *v1alpha1.Repository, prKey string) bool { + m.logger.Debugf("clearing queue claim for pipelinerun %s in repository %s", prKey, RepoKey(repo)) + if err := m.clearClaim(ctx, repo, prKey); err != nil { + m.logger.Warnf("failed to clear queue claim for %s: %v", prKey, err) + return false + } + return true +} + +func (m *LeaseManager) RemoveAndTakeItemFromQueue(ctx context.Context, repo *v1alpha1.Repository, run *tektonv1.PipelineRun) string { + if repo.Spec.ConcurrencyLimit == nil || *repo.Spec.ConcurrencyLimit == 0 { + m.logger.Debugf("not attempting queue handoff for repository %s because concurrency limit is disabled", RepoKey(repo)) + return "" + } + + orderedList := executionOrderList(run) + m.logger.Debugf( + "removing pipelinerun %s from repository %s and attempting to claim next queued item from order %v", + PrKey(run), RepoKey(repo), orderedList, + ) + claimed, err := m.claimNextQueued(ctx, repo, orderedList, PrKey(run)) + if err != nil { + m.logger.Warnf("failed to claim next queued pipelinerun for repository %s: %v", RepoKey(repo), err) + return "" + } + if len(claimed) == 0 { + m.logger.Debugf("no queued pipelinerun available to claim for repository %s after removing %s", RepoKey(repo), PrKey(run)) + return "" + } + m.logger.Debugf("claimed next queued 
pipelinerun %s for repository %s", claimed[0], RepoKey(repo)) + return claimed[0] +} + +type repoQueueState struct { + running []tektonv1.PipelineRun + claimed []tektonv1.PipelineRun + queued []tektonv1.PipelineRun +} + +type pipelineOrderMeta struct { + groupTime time.Time + position int +} + +func (m *LeaseManager) claimNextQueued(ctx context.Context, repo *v1alpha1.Repository, preferredOrder []string, excludeKey string) ([]string, error) { + claimed := []string{} + var debugState *repoQueueState + var debugNewlyClaimed map[string]struct{} + + err := m.withRepoLease(ctx, repo, func(lockCtx context.Context) error { + state, err := m.getRepoQueueState(lockCtx, repo, preferredOrder, excludeKey) + if err != nil { + return err + } + + occupied := len(state.running) + len(state.claimed) + available := *repo.Spec.ConcurrencyLimit - occupied + m.logger.Debugf( + "lease queue state for repository %s: running=%d claimed=%d queued=%d occupied=%d available=%d exclude=%q preferred=%v", + RepoKey(repo), len(state.running), len(state.claimed), len(state.queued), occupied, available, excludeKey, preferredOrder, + ) + remainingQueued := make([]tektonv1.PipelineRun, 0, len(state.queued)) + newlyClaimed := map[string]struct{}{} + for _, pr := range state.queued { + if available <= 0 { + remainingQueued = append(remainingQueued, pr) + continue + } + + m.logger.Debugf("attempting to claim queued pipelinerun %s for repository %s", PrKey(&pr), RepoKey(repo)) + ok, err := m.claimPipelineRun(lockCtx, &pr) + if err != nil { + return err + } + if ok { + prKey := PrKey(&pr) + annotations := pr.GetAnnotations() + if annotations == nil { + annotations = map[string]string{} + } + annotations[keys.QueueClaimedBy] = m.identity + annotations[keys.QueueClaimedAt] = m.now().UTC().Format(time.RFC3339Nano) + pr.SetAnnotations(annotations) + claimed = append(claimed, prKey) + newlyClaimed[prKey] = struct{}{} + state.claimed = append(state.claimed, pr) + available-- + m.logger.Debugf( + "claimed queued 
pipelinerun %s for repository %s; remaining available slots %d", + prKey, RepoKey(repo), available, + ) + continue + } + + remainingQueued = append(remainingQueued, pr) + m.logger.Debugf("pipelinerun %s could not be claimed for repository %s", PrKey(&pr), RepoKey(repo)) + } + state.queued = remainingQueued + debugState = cloneRepoQueueState(state) + debugNewlyClaimed = cloneStringSet(newlyClaimed) + + if available <= 0 && len(claimed) == 0 { + m.logger.Debugf("repository %s has no available concurrency slots", RepoKey(repo)) + } + return nil + }) + + if err == nil { + if debugState != nil { + if err := m.syncQueueDebugState(ctx, repo, debugState, debugNewlyClaimed); err != nil { + m.logger.Warnf("failed to sync queue debug state for repository %s: %v", RepoKey(repo), err) + } + } + m.logger.Debugf("finished lease queue claim for repository %s; claimed=%v", RepoKey(repo), claimed) + } + return claimed, err +} + +func (m *LeaseManager) getRepoQueueState(ctx context.Context, repo *v1alpha1.Repository, preferredOrder []string, excludeKey string) (*repoQueueState, error) { + prs, err := m.tekton.TektonV1().PipelineRuns(repo.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: fmt.Sprintf("%s=%s", keys.Repository, formatting.CleanValueKubernetes(repo.Name)), + }) + if err != nil { + return nil, err + } + + state := &repoQueueState{} + now := m.now() + groupMinTime := map[string]time.Time{} + preferredIndex := make(map[string]int, len(preferredOrder)) + for i, key := range preferredOrder { + preferredIndex[key] = i + } + + for i := range prs.Items { + pr := prs.Items[i] + if pr.GetAnnotations()[keys.Repository] != repo.Name { + continue + } + order := pr.GetAnnotations()[keys.ExecutionOrder] + if order == "" { + continue + } + if existing, ok := groupMinTime[order]; !ok || pr.CreationTimestamp.Time.Before(existing) { + groupMinTime[order] = pr.CreationTimestamp.Time + } + } + + orderMeta := map[string]pipelineOrderMeta{} + + for i := range prs.Items { + pr := 
prs.Items[i] + if pr.GetAnnotations()[keys.Repository] != repo.Name { + continue + } + if excludeKey != "" && PrKey(&pr) == excludeKey { + continue + } + + switch pr.GetAnnotations()[keys.State] { + case kubeinteraction.StateStarted: + if !pr.IsDone() && !pr.IsCancelled() { + state.running = append(state.running, pr) + } + case kubeinteraction.StateQueued: + if !IsRecoverableQueuedPipelineRun(&pr) { + decision := QueueDecisionNotRecoverable + if _, ok := executionOrderIndex(&pr); !ok { + decision = QueueDecisionMissingOrder + } + if err := SyncQueueDebugAnnotations(ctx, m.logger, m.tekton, &pr, DebugSnapshot{ + Backend: settings.ConcurrencyBackendLease, + RepoKey: RepoKey(repo), + Position: unknownQueueDebugValue, + Running: len(state.running), + Claimed: len(state.claimed), + Queued: len(state.queued), + Limit: repositoryConcurrencyLimit(repo), + LastDecision: decision, + }); err != nil { + m.logger.Warnf("failed to record queue debug state for pipelinerun %s: %v", PrKey(&pr), err) + } + m.logger.Debugf( + "skipping queued pipelinerun %s for repository %s because it is not recoverable", + PrKey(&pr), RepoKey(repo), + ) + continue + } + position, ok := executionOrderIndex(&pr) + if !ok { + m.logger.Warnf("ignoring queued pipelinerun %s because execution-order does not include itself", PrKey(&pr)) + continue + } + orderMeta[PrKey(&pr)] = pipelineOrderMeta{ + groupTime: groupMinTime[pr.GetAnnotations()[keys.ExecutionOrder]], + position: position, + } + if m.hasActiveClaim(&pr, now) { + m.logger.Debugf("queued pipelinerun %s for repository %s already has an active claim", PrKey(&pr), RepoKey(repo)) + state.claimed = append(state.claimed, pr) + } else { + m.logger.Debugf("queued pipelinerun %s for repository %s is available for claiming", PrKey(&pr), RepoKey(repo)) + state.queued = append(state.queued, pr) + } + } + } + + sort.Slice(state.running, func(i, j int) bool { + return comparePipelineRuns(&state.running[i], &state.running[j]) + }) + sort.Slice(state.claimed, 
func(i, j int) bool { + return compareQueueCandidates(&state.claimed[i], &state.claimed[j], preferredIndex, orderMeta) + }) + sort.Slice(state.queued, func(i, j int) bool { + return compareQueueCandidates(&state.queued[i], &state.queued[j], preferredIndex, orderMeta) + }) + + m.logger.Debugf( + "computed lease queue state for repository %s with running=%v claimed=%v queued=%v", + RepoKey(repo), pipelineRunKeys(state.running), pipelineRunKeys(state.claimed), pipelineRunKeys(state.queued), + ) + return state, nil +} + +func compareQueueCandidates( + left, right *tektonv1.PipelineRun, + preferredIndex map[string]int, + orderMeta map[string]pipelineOrderMeta, +) bool { + leftKey := PrKey(left) + rightKey := PrKey(right) + + leftPreferred, leftOK := preferredIndex[leftKey] + rightPreferred, rightOK := preferredIndex[rightKey] + switch { + case leftOK && rightOK && leftPreferred != rightPreferred: + return leftPreferred < rightPreferred + case leftOK != rightOK: + return leftOK + } + + leftMeta, leftMetaOK := orderMeta[leftKey] + rightMeta, rightMetaOK := orderMeta[rightKey] + switch { + case leftMetaOK && rightMetaOK: + if !leftMeta.groupTime.Equal(rightMeta.groupTime) { + return leftMeta.groupTime.Before(rightMeta.groupTime) + } + if leftMeta.position != rightMeta.position { + return leftMeta.position < rightMeta.position + } + case leftMetaOK != rightMetaOK: + return leftMetaOK + } + + return comparePipelineRuns(left, right) +} + +func comparePipelineRuns(left, right *tektonv1.PipelineRun) bool { + if left.CreationTimestamp.Equal(&right.CreationTimestamp) { + return left.GetName() < right.GetName() + } + return left.CreationTimestamp.Before(&right.CreationTimestamp) +} + +func (m *LeaseManager) hasActiveClaim(pr *tektonv1.PipelineRun, now time.Time) bool { + active := HasActiveLeaseQueueClaim(pr, now, m.claimTTL) + claimedBy, age := LeaseQueueClaimInfo(pr, now) + m.logger.Debugf( + "evaluated queue claim for pipelinerun %s: claimedBy=%s age=%s ttl=%s active=%t", + 
PrKey(pr), claimedBy, age, m.claimTTL, active, + ) + return active +} + +func (m *LeaseManager) claimPipelineRun(ctx context.Context, pr *tektonv1.PipelineRun) (bool, error) { + claimedAt := m.now().UTC().Format(time.RFC3339Nano) + m.logger.Debugf("patching pipelinerun %s with queue claim owned by %s at %s", PrKey(pr), m.identity, claimedAt) + mergePatch := map[string]any{ + "metadata": map[string]any{ + "annotations": map[string]any{ + keys.QueueClaimedBy: m.identity, + keys.QueueClaimedAt: claimedAt, + }, + }, + } + + if _, err := action.PatchPipelineRun(ctx, m.logger, "queue claim", m.tekton, pr, mergePatch); err != nil { + if apierrors.IsNotFound(err) { + m.logger.Debugf("pipelinerun %s disappeared before queue claim could be recorded", PrKey(pr)) + return false, nil + } + return false, err + } + m.logger.Debugf("successfully recorded queue claim for pipelinerun %s", PrKey(pr)) + return true, nil +} + +func (m *LeaseManager) clearClaim(ctx context.Context, repo *v1alpha1.Repository, prKey string) error { + nameParts := strings.Split(prKey, "/") + if len(nameParts) != 2 { + return fmt.Errorf("invalid pipelinerun key %q", prKey) + } + + pr, err := m.tekton.TektonV1().PipelineRuns(nameParts[0]).Get(ctx, nameParts[1], metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + m.logger.Debugf("pipelinerun %s was already deleted before queue claim cleanup", prKey) + return nil + } + return err + } + + if pr.GetAnnotations()[keys.Repository] != repo.Name { + m.logger.Debugf( + "skipping queue claim cleanup for pipelinerun %s because it belongs to repository %s instead of %s", + prKey, pr.GetAnnotations()[keys.Repository], repo.Name, + ) + return nil + } + + m.logger.Debugf("removing queue claim annotations from pipelinerun %s", prKey) + mergePatch := map[string]any{ + "metadata": map[string]any{ + "annotations": map[string]any{ + keys.QueueClaimedBy: nil, + keys.QueueClaimedAt: nil, + }, + }, + } + + _, err = action.PatchPipelineRun(ctx, m.logger, 
"queue claim cleanup", m.tekton, pr, mergePatch) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + m.logger.Debugf("queue claim cleanup completed for pipelinerun %s", prKey) + return nil +} + +func (m *LeaseManager) withRepoLease(ctx context.Context, repo *v1alpha1.Repository, fn func(context.Context) error) error { + leaseName := repoLeaseName(RepoKey(repo)) + + for attempt := 0; attempt < defaultLeaseAcquireRetries; attempt++ { + m.logger.Debugf( + "attempting to acquire concurrency lease %s for repository %s (attempt %d/%d)", + leaseName, RepoKey(repo), attempt+1, defaultLeaseAcquireRetries, + ) + acquired, err := m.tryAcquireLease(ctx, leaseName) + if err != nil { + return err + } + if acquired { + m.logger.Debugf("acquired concurrency lease %s for repository %s", leaseName, RepoKey(repo)) + defer m.releaseLease(leaseName) + return fn(ctx) + } + m.logger.Debugf("concurrency lease %s for repository %s is currently held by another watcher", leaseName, RepoKey(repo)) + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(defaultLeaseRetryDelay): + } + } + + return fmt.Errorf("timed out acquiring concurrency lease %s", leaseName) +} + +func (m *LeaseManager) tryAcquireLease(ctx context.Context, leaseName string) (bool, error) { + leases := m.kube.CoordinationV1().Leases(m.leaseNamespace) + now := metav1.MicroTime{Time: m.now()} + + lease, err := leases.Get(ctx, leaseName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + m.logger.Debugf("creating new concurrency lease %s for identity %s", leaseName, m.identity) + _, err = leases.Create(ctx, &coordinationv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Name: leaseName, + Namespace: m.leaseNamespace, + }, + Spec: coordinationv1.LeaseSpec{ + HolderIdentity: &m.identity, + LeaseDurationSeconds: &m.leaseDuration, + AcquireTime: &now, + RenewTime: &now, + }, + }, metav1.CreateOptions{}) + if err == nil { + m.logger.Debugf("created and acquired concurrency lease %s", leaseName) + 
return true, nil + } + if apierrors.IsAlreadyExists(err) { + m.logger.Debugf("concurrency lease %s was created by another watcher before acquisition completed", leaseName) + return false, nil + } + return false, err + } + if err != nil { + return false, err + } + + if !m.canTakeLease(lease, now.Time) { + holder := "" + if lease.Spec.HolderIdentity != nil { + holder = *lease.Spec.HolderIdentity + } + m.logger.Debugf("cannot acquire concurrency lease %s because it is still held by %s", leaseName, holder) + return false, nil + } + + updated := lease.DeepCopy() + updated.Spec.HolderIdentity = &m.identity + updated.Spec.LeaseDurationSeconds = &m.leaseDuration + updated.Spec.RenewTime = &now + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != m.identity { + updated.Spec.AcquireTime = &now + } + + if _, err := leases.Update(ctx, updated, metav1.UpdateOptions{}); err != nil { + if apierrors.IsConflict(err) { + m.logger.Debugf("conflict while updating concurrency lease %s; another watcher won the race", leaseName) + return false, nil + } + return false, err + } + + m.logger.Debugf("updated concurrency lease %s to holder %s", leaseName, m.identity) + return true, nil +} + +func (m *LeaseManager) canTakeLease(lease *coordinationv1.Lease, now time.Time) bool { + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity == "" { + return true + } + if *lease.Spec.HolderIdentity == m.identity { + return true + } + return isLeaseExpired(lease, now) +} + +func isLeaseExpired(lease *coordinationv1.Lease, now time.Time) bool { + if lease.Spec.LeaseDurationSeconds == nil { + return true + } + + base := time.Time{} + if lease.Spec.RenewTime != nil { + base = lease.Spec.RenewTime.Time + } else if lease.Spec.AcquireTime != nil { + base = lease.Spec.AcquireTime.Time + } + if base.IsZero() { + return true + } + + return base.Add(time.Duration(*lease.Spec.LeaseDurationSeconds) * time.Second).Before(now) +} + +func (m *LeaseManager) releaseLease(leaseName string) { 
+ releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + leases := m.kube.CoordinationV1().Leases(m.leaseNamespace) + now := metav1.MicroTime{Time: m.now()} + + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + lease, err := leases.Get(releaseCtx, leaseName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity == "" { + return nil + } + if *lease.Spec.HolderIdentity != m.identity { + m.logger.Debugf("skipping release for concurrency lease %s held by %s", leaseName, *lease.Spec.HolderIdentity) + return nil + } + + updated := lease.DeepCopy() + updated.Spec.HolderIdentity = nil + updated.Spec.RenewTime = &now + + _, err = leases.Update(releaseCtx, updated, metav1.UpdateOptions{}) + return err + }) + if err != nil { + m.logger.Warnf("failed to release concurrency lease %s: %v", leaseName, err) + } +} + +func repoLeaseName(repoKey string) string { + sum := sha256.Sum256([]byte(repoKey)) + return fmt.Sprintf("pac-concurrency-%s", hex.EncodeToString(sum[:8])) +} + +func pipelineRunKeys(prs []tektonv1.PipelineRun) []string { + keys := make([]string, 0, len(prs)) + for i := range prs { + keys = append(keys, PrKey(&prs[i])) + } + return keys +} + +func cloneRepoQueueState(state *repoQueueState) *repoQueueState { + if state == nil { + return nil + } + + cloned := &repoQueueState{} + if len(state.running) > 0 { + cloned.running = append([]tektonv1.PipelineRun(nil), state.running...) + } + if len(state.claimed) > 0 { + cloned.claimed = append([]tektonv1.PipelineRun(nil), state.claimed...) + } + if len(state.queued) > 0 { + cloned.queued = append([]tektonv1.PipelineRun(nil), state.queued...) 
+ } + return cloned +} + +func cloneStringSet(values map[string]struct{}) map[string]struct{} { + if len(values) == 0 { + return nil + } + + cloned := make(map[string]struct{}, len(values)) + for key := range values { + cloned[key] = struct{}{} + } + return cloned +} + +func (m *LeaseManager) syncQueueDebugState( + ctx context.Context, + repo *v1alpha1.Repository, + state *repoQueueState, + newlyClaimed map[string]struct{}, +) error { + queueOrder := make([]tektonv1.PipelineRun, 0, len(state.claimed)+len(state.queued)) + queueOrder = append(queueOrder, state.claimed...) + queueOrder = append(queueOrder, state.queued...) + + positions := make(map[string]int, len(queueOrder)) + for i := range queueOrder { + positions[PrKey(&queueOrder[i])] = i + 1 + } + + limit := repositoryConcurrencyLimit(repo) + for i := range state.claimed { + pr := &state.claimed[i] + decision := QueueDecisionClaimActive + if _, ok := newlyClaimed[PrKey(pr)]; ok { + decision = QueueDecisionClaimedForPromote + } + claimedBy, claimAge := LeaseQueueClaimInfo(pr, m.now()) + if err := SyncQueueDebugAnnotations(ctx, m.logger, m.tekton, pr, DebugSnapshot{ + Backend: settings.ConcurrencyBackendLease, + RepoKey: RepoKey(repo), + Position: positions[PrKey(pr)], + Running: len(state.running), + Claimed: len(state.claimed), + Queued: len(state.queued), + Limit: limit, + ClaimedBy: claimedBy, + ClaimAge: claimAge, + LastDecision: decision, + }); err != nil { + return err + } + } + + for i := range state.queued { + pr := &state.queued[i] + claimedBy, claimAge := LeaseQueueClaimInfo(pr, m.now()) + if err := SyncQueueDebugAnnotations(ctx, m.logger, m.tekton, pr, DebugSnapshot{ + Backend: settings.ConcurrencyBackendLease, + RepoKey: RepoKey(repo), + Position: positions[PrKey(pr)], + Running: len(state.running), + Claimed: len(state.claimed), + Queued: len(state.queued), + Limit: limit, + ClaimedBy: claimedBy, + ClaimAge: claimAge, + LastDecision: QueueDecisionWaitingForSlot, + }); err != nil { + return err + } + 
} + + return nil +} + +func repositoryConcurrencyLimit(repo *v1alpha1.Repository) int { + if repo == nil || repo.Spec.ConcurrencyLimit == nil { + return unknownQueueDebugValue + } + return *repo.Spec.ConcurrencyLimit +} diff --git a/pkg/queue/lease_manager_test.go b/pkg/queue/lease_manager_test.go new file mode 100644 index 0000000000..0a211247f9 --- /dev/null +++ b/pkg/queue/lease_manager_test.go @@ -0,0 +1,904 @@ +package queue + +import ( + "context" + "testing" + "time" + + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" + "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/v1alpha1" + "github.com/openshift-pipelines/pipelines-as-code/pkg/formatting" + "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" + testclient "github.com/openshift-pipelines/pipelines-as-code/pkg/test/clients" + tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" + "go.uber.org/zap" + zapobserver "go.uber.org/zap/zaptest/observer" + "gotest.tools/v3/assert" + coordinationv1 "k8s.io/api/coordination/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ktesting "k8s.io/client-go/testing" + rtesting "knative.dev/pkg/reconciler/testing" +) + +func TestLeaseManagerClaimsOnlyAvailableCapacity(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_000, 0) + repo := newTestRepo(1) + + first := newLeaseTestPR("first", now, repo, nil) + second := newLeaseTestPR("second", now.Add(time.Second), repo, nil) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{first, second}, + }) + + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + claimed, err := manager.AddListToRunningQueue(ctx, repo, nil) + assert.NilError(t, err) + assert.DeepEqual(t, claimed, []string{PrKey(first)}) + + 
claimed, err = manager.AddListToRunningQueue(ctx, repo, nil) + assert.NilError(t, err) + assert.Equal(t, len(claimed), 0) + + firstUpdated, err := stdata.Pipeline.TektonV1().PipelineRuns(first.Namespace).Get(ctx, first.Name, metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, firstUpdated.GetAnnotations()[keys.QueueDecision], QueueDecisionClaimActive) + assert.Assert(t, firstUpdated.GetAnnotations()[keys.QueueDebugSummary] != "") + + secondUpdated, err := stdata.Pipeline.TektonV1().PipelineRuns(second.Namespace).Get(ctx, second.Name, metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, secondUpdated.GetAnnotations()[keys.QueueDecision], QueueDecisionWaitingForSlot) + assert.Assert(t, secondUpdated.GetAnnotations()[keys.QueueDebugSummary] != "") +} + +func TestLeaseManagerReclaimsStaleClaims(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_100, 0) + repo := newTestRepo(1) + + staleClaim := map[string]string{ + keys.QueueClaimedBy: "old-watcher", + keys.QueueClaimedAt: now.Add(-2 * defaultLeaseClaimTTL).Format(time.RFC3339Nano), + } + first := newLeaseTestPR("first", now, repo, staleClaim) + second := newLeaseTestPR("second", now.Add(time.Second), repo, nil) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{first, second}, + }) + + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + claimed, err := manager.AddListToRunningQueue(ctx, repo, nil) + assert.NilError(t, err) + assert.DeepEqual(t, claimed, []string{PrKey(first)}) +} + +func TestLeaseManagerPromotesNextAfterCompletion(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_200, 0) + repo := newTestRepo(1) + + running := 
newLeaseStartedPR("first", now, repo) + queued := newLeaseTestPR("second", now.Add(time.Second), repo, nil) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{running, queued}, + }) + + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + next := manager.RemoveAndTakeItemFromQueue(ctx, repo, running) + assert.Equal(t, next, PrKey(queued)) + + updated, err := stdata.Pipeline.TektonV1().PipelineRuns(queued.Namespace).Get(ctx, queued.Name, metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, updated.GetAnnotations()[keys.QueueClaimedBy], manager.identity) +} + +func TestRepoLeaseNameIsDeterministic(t *testing.T) { + assert.Equal(t, repoLeaseName("ns/repo"), repoLeaseName("ns/repo")) + assert.Assert(t, repoLeaseName("ns/repo") != repoLeaseName("other/repo")) +} + +func TestLeaseManagerPrefersExecutionOrderWithinCurrentGroup(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_250, 0) + repo := newTestRepo(1) + + first := newLeaseQueuedPRWithOrder("bbb", now.Add(2*time.Second), repo, "test-ns/aaa,test-ns/bbb", nil) + second := newLeaseQueuedPRWithOrder("aaa", now.Add(3*time.Second), repo, "test-ns/aaa,test-ns/bbb", nil) + other := newLeaseQueuedPRWithOrder("ccc", now, repo, "test-ns/ccc", nil) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{first, second, other}, + }) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + claimed, err := manager.AddListToRunningQueue(ctx, repo, []string{"test-ns/aaa", "test-ns/bbb"}) + assert.NilError(t, err) + assert.DeepEqual(t, claimed, []string{"test-ns/aaa"}) +} + +func TestLeaseManagerIgnoresMalformedExecutionOrder(t *testing.T) { + ctx, _ := 
rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_275, 0) + repo := newTestRepo(1) + + malformed := newLeaseQueuedPRWithOrder("aaa", now, repo, "test-ns/zzz", nil) + valid := newLeaseQueuedPRWithOrder("bbb", now.Add(time.Second), repo, "test-ns/bbb", nil) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{malformed, valid}, + }) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + claimed, err := manager.AddListToRunningQueue(ctx, repo, []string{"test-ns/aaa", "test-ns/bbb"}) + assert.NilError(t, err) + assert.DeepEqual(t, claimed, []string{"test-ns/bbb"}) + + updatedMalformed, err := stdata.Pipeline.TektonV1().PipelineRuns(malformed.Namespace).Get(ctx, malformed.Name, metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, updatedMalformed.GetAnnotations()[keys.QueueDecision], QueueDecisionMissingOrder) +} + +func TestLeaseManagerStillConsidersPreviouslyBlockedQueuedRuns(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_290, 0) + repo := newTestRepo(1) + + blocked := newLeaseQueuedPRWithOrder("aaa", now, repo, "test-ns/aaa", map[string]string{ + keys.QueuePromotionBlocked: "true", + }) + valid := newLeaseQueuedPRWithOrder("bbb", now.Add(time.Second), repo, "test-ns/bbb", nil) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{blocked, valid}, + }) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + claimed, err := manager.AddListToRunningQueue(ctx, repo, []string{"test-ns/aaa", "test-ns/bbb"}) + assert.NilError(t, err) + assert.DeepEqual(t, claimed, []string{"test-ns/aaa"}) +} + +func 
TestLeaseManagerFiltersBySanitizedRepositoryLabel(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_295, 0) + repo := &v1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{ + Name: "org/repo", + Namespace: "test-ns", + }, + Spec: v1alpha1.RepositorySpec{ + ConcurrencyLimit: intPtr(1), + }, + } + repositoryLabel := formatting.CleanValueKubernetes(repo.Name) + + matching := newTestPR("matching", now, map[string]string{ + keys.Repository: repositoryLabel, + keys.State: kubeinteraction.StateQueued, + }, map[string]string{ + keys.Repository: repo.Name, + keys.State: kubeinteraction.StateQueued, + keys.ExecutionOrder: "test-ns/matching", + }, tektonv1.PipelineRunSpec{ + Status: tektonv1.PipelineRunSpecStatusPending, + }) + collision := newTestPR("collision", now.Add(-time.Second), map[string]string{ + keys.Repository: repositoryLabel, + keys.State: kubeinteraction.StateStarted, + }, map[string]string{ + keys.Repository: "org:repo", + keys.State: kubeinteraction.StateStarted, + }, tektonv1.PipelineRunSpec{}) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{matching, collision}, + }) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + state, err := manager.getRepoQueueState(ctx, repo, nil, "") + assert.NilError(t, err) + assert.Equal(t, len(state.running), 0) + assert.Equal(t, len(state.queued), 1) + assert.Equal(t, PrKey(&state.queued[0]), PrKey(matching)) + + listSelector := "" + for _, action := range stdata.Pipeline.Actions() { + if action.GetVerb() != "list" || action.GetResource().Resource != "pipelineruns" { + continue + } + listAction, ok := action.(ktesting.ListAction) + assert.Assert(t, ok) + listSelector = listAction.GetListRestrictions().Labels.String() + } + + assert.Equal(t, listSelector, 
keys.Repository+"="+repositoryLabel) +} + +func TestLeaseManagerReleaseKeepsLeaseAndClearsHolder(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_300, 0) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + leaseName := repoLeaseName("test-ns/test") + acquired, err := manager.tryAcquireLease(ctx, leaseName) + assert.NilError(t, err) + assert.Assert(t, acquired) + + manager.releaseLease(leaseName) + + lease, err := stdata.Kube.CoordinationV1().Leases("pac").Get(ctx, leaseName, metav1.GetOptions{}) + assert.NilError(t, err) + assert.Assert(t, lease.Spec.HolderIdentity == nil) + assert.Assert(t, lease.Spec.RenewTime != nil) + assert.Equal(t, lease.Spec.RenewTime.Time, now) +} + +func TestLeaseManagerReusesLeaseAcrossAcquireCycles(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_400, 0) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + leaseName := repoLeaseName("test-ns/test") + acquired, err := manager.tryAcquireLease(ctx, leaseName) + assert.NilError(t, err) + assert.Assert(t, acquired) + manager.releaseLease(leaseName) + + acquired, err = manager.tryAcquireLease(ctx, leaseName) + assert.NilError(t, err) + assert.Assert(t, acquired) + manager.releaseLease(leaseName) + + var createCount, deleteCount int + for _, action := range stdata.Kube.Actions() { + if action.GetResource().Resource != "leases" { + continue + } + if action.GetVerb() == "create" { + createCount++ + } + if action.GetVerb() == "delete" { + deleteCount++ + } + } + + 
assert.Equal(t, createCount, 1) + assert.Equal(t, deleteCount, 0) +} + +func TestLeaseManagerReleaseIsNoOpForForeignHolder(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_500, 0) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + leaseName := repoLeaseName("test-ns/test") + foreignHolder := "other-manager" + _, err := stdata.Kube.CoordinationV1().Leases("pac").Create(ctx, &coordinationv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Name: leaseName, + Namespace: "pac", + }, + Spec: coordinationv1.LeaseSpec{ + HolderIdentity: &foreignHolder, + RenewTime: &metav1.MicroTime{Time: now.Add(-time.Second)}, + }, + }, metav1.CreateOptions{}) + assert.NilError(t, err) + stdata.Kube.ClearActions() + + manager.releaseLease(leaseName) + + lease, err := stdata.Kube.CoordinationV1().Leases("pac").Get(ctx, leaseName, metav1.GetOptions{}) + assert.NilError(t, err) + assert.Assert(t, lease.Spec.HolderIdentity != nil) + assert.Equal(t, *lease.Spec.HolderIdentity, foreignHolder) + + for _, action := range stdata.Kube.Actions() { + if action.GetResource().Resource == "leases" { + assert.Assert(t, action.GetVerb() != "update") + } + } +} + +func TestLeaseManagerContentionAfterInPlaceRelease(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_600, 0) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + first := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + second := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + first.now = func() time.Time { return now } + second.now = func() time.Time { return now } + + leaseName := repoLeaseName("test-ns/test") + acquired, 
err := first.tryAcquireLease(ctx, leaseName) + assert.NilError(t, err) + assert.Assert(t, acquired) + + acquired, err = second.tryAcquireLease(ctx, leaseName) + assert.NilError(t, err) + assert.Assert(t, !acquired) + + first.releaseLease(leaseName) + + acquired, err = second.tryAcquireLease(ctx, leaseName) + assert.NilError(t, err) + assert.Assert(t, acquired) +} + +func TestLeaseManagerWithRepoLeaseCreatesOnlyOneLease(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_700, 0) + repo := newTestRepo(1) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + err := manager.withRepoLease(context.Background(), repo, func(context.Context) error { return nil }) + assert.NilError(t, err) + err = manager.withRepoLease(context.Background(), repo, func(context.Context) error { return nil }) + assert.NilError(t, err) + + var createCount, deleteCount, updateCount int + for _, action := range stdata.Kube.Actions() { + if action.GetResource().Resource != "leases" { + continue + } + switch action.GetVerb() { + case "create": + createCount++ + case "delete": + deleteCount++ + case "update": + updateCount++ + } + } + + assert.Equal(t, createCount, 1) + assert.Equal(t, deleteCount, 0) + assert.Assert(t, updateCount >= 3) +} + +func TestLeaseManagerReleaseHandlesMissingLease(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + + manager.releaseLease(repoLeaseName("test-ns/missing")) + + for _, action := range stdata.Kube.Actions() { + if action.GetResource().Resource == "leases" { + assert.Assert(t, 
action.GetVerb() != "update") + } + } +} + +func newLeaseTestPR(name string, ts time.Time, repo *v1alpha1.Repository, annotations map[string]string) *tektonv1.PipelineRun { + return newLeaseQueuedPRWithOrder(name, ts, repo, "test-ns/"+name, annotations) +} + +func newLeaseQueuedPRWithOrder(name string, ts time.Time, repo *v1alpha1.Repository, order string, annotations map[string]string) *tektonv1.PipelineRun { + mergedAnnotations := map[string]string{ + keys.Repository: repo.Name, + keys.State: kubeinteraction.StateQueued, + } + if order != "" { + mergedAnnotations[keys.ExecutionOrder] = order + } + for key, value := range annotations { + mergedAnnotations[key] = value + } + + return newTestPR(name, ts, map[string]string{ + keys.Repository: formatting.CleanValueKubernetes(repo.Name), + keys.State: kubeinteraction.StateQueued, + }, mergedAnnotations, tektonv1.PipelineRunSpec{ + Status: tektonv1.PipelineRunSpecStatusPending, + }) +} + +func newLeaseStartedPR(name string, ts time.Time, repo *v1alpha1.Repository) *tektonv1.PipelineRun { + return newTestPR(name, ts, map[string]string{ + keys.Repository: formatting.CleanValueKubernetes(repo.Name), + keys.State: kubeinteraction.StateStarted, + }, map[string]string{ + keys.Repository: repo.Name, + keys.State: kubeinteraction.StateStarted, + }, tektonv1.PipelineRunSpec{}) +} + +func TestIsLeaseExpired(t *testing.T) { + now := time.Unix(1_700_000_000, 0) + duration := int32(30) + + tests := []struct { + name string + lease *coordinationv1.Lease + want bool + }{ + { + name: "nil duration seconds", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{ + LeaseDurationSeconds: nil, + }, + }, + want: true, + }, + { + name: "renew time set and not expired", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{ + LeaseDurationSeconds: &duration, + RenewTime: &metav1.MicroTime{Time: now.Add(-10 * time.Second)}, + }, + }, + want: false, + }, + { + name: "renew time set and expired", + lease: 
&coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{ + LeaseDurationSeconds: &duration, + RenewTime: &metav1.MicroTime{Time: now.Add(-60 * time.Second)}, + }, + }, + want: true, + }, + { + name: "acquire time only not expired", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{ + LeaseDurationSeconds: &duration, + AcquireTime: &metav1.MicroTime{Time: now.Add(-10 * time.Second)}, + }, + }, + want: false, + }, + { + name: "acquire time only expired", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{ + LeaseDurationSeconds: &duration, + AcquireTime: &metav1.MicroTime{Time: now.Add(-60 * time.Second)}, + }, + }, + want: true, + }, + { + name: "both times nil", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{ + LeaseDurationSeconds: &duration, + }, + }, + want: true, + }, + { + name: "exact expiry boundary is not expired", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{ + LeaseDurationSeconds: &duration, + RenewTime: &metav1.MicroTime{Time: now.Add(-30 * time.Second)}, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, isLeaseExpired(tt.lease, now), tt.want) + }) + } +} + +func TestCanTakeLease(t *testing.T) { + now := time.Unix(1_700_000_000, 0) + duration := int32(30) + myIdentity := "my-watcher" + otherIdentity := "other-watcher" + empty := "" + + tests := []struct { + name string + lease *coordinationv1.Lease + want bool + }{ + { + name: "nil holder", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{HolderIdentity: nil}, + }, + want: true, + }, + { + name: "empty holder", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{HolderIdentity: &empty}, + }, + want: true, + }, + { + name: "same identity", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{HolderIdentity: &myIdentity}, + }, + want: true, + }, + { + name: "different identity expired", + lease: &coordinationv1.Lease{ + Spec: 
coordinationv1.LeaseSpec{ + HolderIdentity: &otherIdentity, + LeaseDurationSeconds: &duration, + RenewTime: &metav1.MicroTime{Time: now.Add(-60 * time.Second)}, + }, + }, + want: true, + }, + { + name: "different identity not expired", + lease: &coordinationv1.Lease{ + Spec: coordinationv1.LeaseSpec{ + HolderIdentity: &otherIdentity, + LeaseDurationSeconds: &duration, + RenewTime: &metav1.MicroTime{Time: now.Add(-10 * time.Second)}, + }, + }, + want: false, + }, + } + + manager := &LeaseManager{identity: myIdentity} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, manager.canTakeLease(tt.lease, now), tt.want) + }) + } +} + +func TestClearClaim(t *testing.T) { + tests := []struct { + name string + prKey string + seedPRs []*tektonv1.PipelineRun + wantErr string + wantPatched bool + }{ + { + name: "invalid key format", + prKey: "no-slash", + wantErr: "invalid pipelinerun key", + }, + { + name: "pipeline run not found", + prKey: "test-ns/missing", + }, + { + name: "repository mismatch", + prKey: "test-ns/wrong-repo", + seedPRs: []*tektonv1.PipelineRun{ + newTestPR("wrong-repo", time.Unix(1_700_000_000, 0), nil, map[string]string{ + keys.Repository: "other-repo", + }, tektonv1.PipelineRunSpec{}), + }, + }, + { + name: "successful cleanup", + prKey: "test-ns/claimed", + seedPRs: []*tektonv1.PipelineRun{ + newTestPR("claimed", time.Unix(1_700_000_000, 0), nil, map[string]string{ + keys.Repository: "test", + keys.QueueClaimedBy: "watcher-1", + keys.QueueClaimedAt: "2025-01-01T00:00:00Z", + }, tektonv1.PipelineRunSpec{}), + }, + wantPatched: true, + }, + } + + repo := newTestRepo(1) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: tt.seedPRs, + }) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, 
"pac") + + err := manager.clearClaim(ctx, repo, tt.prKey) + if tt.wantErr != "" { + assert.ErrorContains(t, err, tt.wantErr) + return + } + assert.NilError(t, err) + + if tt.wantPatched { + patched := false + for _, action := range stdata.Pipeline.Actions() { + if action.GetVerb() == "patch" { + patched = true + } + } + assert.Assert(t, patched) + } + }) + } +} + +func TestRemoveFromQueueLease(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + repo := newTestRepo(1) + + pr := newTestPR("claimed", time.Unix(1_700_000_000, 0), nil, map[string]string{ + keys.Repository: repo.Name, + keys.QueueClaimedBy: "watcher-1", + keys.QueueClaimedAt: "2025-01-01T00:00:00Z", + }, tektonv1.PipelineRunSpec{}) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{pr}, + }) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + + assert.Assert(t, manager.RemoveFromQueue(ctx, repo, PrKey(pr))) + assert.Assert(t, !manager.RemoveFromQueue(ctx, repo, "bad-key")) +} + +func TestRemoveRepository(t *testing.T) { + tests := []struct { + name string + seedLease bool + }{ + { + name: "deletes existing lease", + seedLease: true, + }, + { + name: "no op when lease missing", + seedLease: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + repo := newTestRepo(1) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + + if tt.seedLease { + leaseName := repoLeaseName(RepoKey(repo)) + _, err := stdata.Kube.CoordinationV1().Leases("pac").Create(ctx, &coordinationv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Name: leaseName, + Namespace: "pac", + }, + Spec: coordinationv1.LeaseSpec{ + 
HolderIdentity: &manager.identity, + }, + }, metav1.CreateOptions{}) + assert.NilError(t, err) + } + + manager.RemoveRepository(repo) + + if tt.seedLease { + deleted := false + for _, action := range stdata.Kube.Actions() { + if action.GetResource().Resource == "leases" && action.GetVerb() == "delete" { + deleted = true + } + } + assert.Assert(t, deleted) + } + }) + } +} + +func TestQueuedAndRunningPipelineRunsLease(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_000, 0) + repo := newTestRepo(1) + + running := newLeaseStartedPR("running", now, repo) + queued := newLeaseTestPR("queued", now.Add(time.Second), repo, nil) + claimed := newLeaseTestPR("claimed", now.Add(2*time.Second), repo, map[string]string{ + keys.QueueClaimedBy: "watcher-1", + keys.QueueClaimedAt: now.Format(time.RFC3339Nano), + }) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{running, queued, claimed}, + }) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + runningKeys := manager.RunningPipelineRuns(repo) + assert.DeepEqual(t, runningKeys, []string{PrKey(running)}) + + queuedKeys := manager.QueuedPipelineRuns(repo) + assert.Assert(t, len(queuedKeys) >= 1) +} + +func TestCloneRepoQueueStateAndCloneStringSet(t *testing.T) { + t.Run("nil state returns nil", func(t *testing.T) { + assert.Assert(t, cloneRepoQueueState(nil) == nil) + }) + + t.Run("deep copies populated state", func(t *testing.T) { + original := &repoQueueState{ + running: []tektonv1.PipelineRun{{ObjectMeta: metav1.ObjectMeta{Name: "r1"}}}, + claimed: []tektonv1.PipelineRun{{ObjectMeta: metav1.ObjectMeta{Name: "c1"}}}, + queued: []tektonv1.PipelineRun{{ObjectMeta: metav1.ObjectMeta{Name: "q1"}}}, + } + cloned := cloneRepoQueueState(original) + assert.Equal(t, len(cloned.running), 1) + 
assert.Equal(t, len(cloned.claimed), 1) + assert.Equal(t, len(cloned.queued), 1) + + original.running = append(original.running, tektonv1.PipelineRun{ObjectMeta: metav1.ObjectMeta{Name: "r2"}}) + assert.Equal(t, len(cloned.running), 1) + }) + + t.Run("empty set returns nil", func(t *testing.T) { + assert.Assert(t, cloneStringSet(nil) == nil) + assert.Assert(t, cloneStringSet(map[string]struct{}{}) == nil) + }) + + t.Run("deep copies populated set", func(t *testing.T) { + original := map[string]struct{}{"a": {}, "b": {}} + cloned := cloneStringSet(original) + assert.Equal(t, len(cloned), 2) + + original["c"] = struct{}{} + assert.Equal(t, len(cloned), 2) + }) +} + +func TestRepositoryConcurrencyLimit(t *testing.T) { + tests := []struct { + name string + repo *v1alpha1.Repository + want int + }{ + { + name: "nil repo", + repo: nil, + want: -1, + }, + { + name: "nil concurrency limit", + repo: &v1alpha1.Repository{}, + want: -1, + }, + { + name: "valid limit", + repo: newTestRepo(5), + want: 5, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, repositoryConcurrencyLimit(tt.repo), tt.want) + }) + } +} + +func TestAddListAndRemoveNilConcurrencyLimit(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + + nilLimitRepo := &v1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "test-ns"}, + } + zeroLimitRepo := &v1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "test-ns"}, + Spec: v1alpha1.RepositorySpec{ConcurrencyLimit: intPtr(0)}, + } + + claimed, err := manager.AddListToRunningQueue(ctx, nilLimitRepo, nil) + assert.NilError(t, err) + assert.Equal(t, len(claimed), 0) + + claimed, err = manager.AddListToRunningQueue(ctx, zeroLimitRepo, nil) + 
assert.NilError(t, err) + assert.Equal(t, len(claimed), 0) + + pr := &tektonv1.PipelineRun{ObjectMeta: metav1.ObjectMeta{Name: "pr", Namespace: "test-ns"}} + assert.Equal(t, manager.RemoveAndTakeItemFromQueue(ctx, nilLimitRepo, pr), "") + assert.Equal(t, manager.RemoveAndTakeItemFromQueue(ctx, zeroLimitRepo, pr), "") +} + +func TestWithRepoLeaseContextCancellation(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + now := time.Unix(1_700_000_000, 0) + repo := newTestRepo(1) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + manager := NewLeaseManager(logger, stdata.Kube, stdata.Pipeline, "pac") + manager.now = func() time.Time { return now } + + foreignHolder := "other-watcher" + leaseName := repoLeaseName(RepoKey(repo)) + duration := int32(600) + _, err := stdata.Kube.CoordinationV1().Leases("pac").Create(ctx, &coordinationv1.Lease{ + ObjectMeta: metav1.ObjectMeta{Name: leaseName, Namespace: "pac"}, + Spec: coordinationv1.LeaseSpec{ + HolderIdentity: &foreignHolder, + LeaseDurationSeconds: &duration, + RenewTime: &metav1.MicroTime{Time: now}, + }, + }, metav1.CreateOptions{}) + assert.NilError(t, err) + + cancelCtx, cancel := context.WithCancel(ctx) + cancel() + + err = manager.withRepoLease(cancelCtx, repo, func(context.Context) error { + t.Fatal("callback should not be invoked") + return nil + }) + assert.ErrorContains(t, err, "context canceled") //nolint:misspell +} diff --git a/pkg/queue/queue_manager.go b/pkg/queue/queue_manager.go index cd1aa62374..0b8024ecb2 100644 --- a/pkg/queue/queue_manager.go +++ b/pkg/queue/queue_manager.go @@ -37,6 +37,10 @@ func NewManager(logger *zap.SugaredLogger) *Manager { } } +func (*Manager) RecoveryInterval() time.Duration { + return 0 +} + // getSemaphore returns existing semaphore created for repository or create // a new one with limit provided in repository // Semaphore: nothing but a waiting and a running queue 
for a repository @@ -76,7 +80,7 @@ func (qm *Manager) checkAndUpdateSemaphoreSize(repo *v1alpha1.Repository, semaph // and if it is at the top and ready to run which means currently running pipelineRun < limit // then move it to running queue // This adds the pipelineRuns in the same order as in the list. -func (qm *Manager) AddListToRunningQueue(repo *v1alpha1.Repository, list []string) ([]string, error) { +func (qm *Manager) AddListToRunningQueue(_ context.Context, repo *v1alpha1.Repository, list []string) ([]string, error) { qm.lock.Lock() defer qm.lock.Unlock() @@ -127,10 +131,11 @@ func (qm *Manager) AddToPendingQueue(repo *v1alpha1.Repository, list []string) e return nil } -func (qm *Manager) RemoveFromQueue(repoKey, prKey string) bool { +func (qm *Manager) RemoveFromQueue(_ context.Context, repo *v1alpha1.Repository, prKey string) bool { qm.lock.Lock() defer qm.lock.Unlock() + repoKey := RepoKey(repo) sema, found := qm.queueMap[repoKey] if !found { return false @@ -142,10 +147,10 @@ func (qm *Manager) RemoveFromQueue(repoKey, prKey string) bool { return true } -func (qm *Manager) RemoveAndTakeItemFromQueue(repo *v1alpha1.Repository, run *tektonv1.PipelineRun) string { +func (qm *Manager) RemoveAndTakeItemFromQueue(ctx context.Context, repo *v1alpha1.Repository, run *tektonv1.PipelineRun) string { repoKey := RepoKey(repo) prKey := PrKey(run) - if !qm.RemoveFromQueue(repoKey, prKey) { + if !qm.RemoveFromQueue(ctx, repo, prKey) { return "" } sema, found := qm.queueMap[repoKey] @@ -225,7 +230,7 @@ func (qm *Manager) InitQueues(ctx context.Context, tekton versioned2.Interface, } orderedList := FilterPipelineRunByState(ctx, tekton, strings.Split(order, ","), "", kubeinteraction.StateStarted) - _, err = qm.AddListToRunningQueue(&repo, orderedList) + _, err = qm.AddListToRunningQueue(ctx, &repo, orderedList) if err != nil { qm.logger.Error("failed to init queue for repo: ", repo.GetName()) } diff --git a/pkg/queue/queue_manager_interface.go 
b/pkg/queue/queue_manager_interface.go index 60a40befec..04fa95c001 100644 --- a/pkg/queue/queue_manager_interface.go +++ b/pkg/queue/queue_manager_interface.go @@ -3,6 +3,7 @@ package queue import ( "context" "fmt" + "time" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/v1alpha1" "github.com/openshift-pipelines/pipelines-as-code/pkg/generated/clientset/versioned" @@ -12,13 +13,14 @@ import ( type ManagerInterface interface { InitQueues(ctx context.Context, tekton tektonVersionedClient.Interface, pac versioned.Interface) error + RecoveryInterval() time.Duration RemoveRepository(repo *v1alpha1.Repository) QueuedPipelineRuns(repo *v1alpha1.Repository) []string RunningPipelineRuns(repo *v1alpha1.Repository) []string - AddListToRunningQueue(repo *v1alpha1.Repository, list []string) ([]string, error) + AddListToRunningQueue(ctx context.Context, repo *v1alpha1.Repository, list []string) ([]string, error) AddToPendingQueue(repo *v1alpha1.Repository, list []string) error - RemoveFromQueue(repoKey, prKey string) bool - RemoveAndTakeItemFromQueue(repo *v1alpha1.Repository, run *tektonv1.PipelineRun) string + RemoveFromQueue(ctx context.Context, repo *v1alpha1.Repository, prKey string) bool + RemoveAndTakeItemFromQueue(ctx context.Context, repo *v1alpha1.Repository, run *tektonv1.PipelineRun) string } func RepoKey(repo *v1alpha1.Repository) string { diff --git a/pkg/queue/queue_manager_test.go b/pkg/queue/queue_manager_test.go index 0bfa76c38f..2392b5f485 100644 --- a/pkg/queue/queue_manager_test.go +++ b/pkg/queue/queue_manager_test.go @@ -1,6 +1,7 @@ package queue import ( + "context" "fmt" "testing" "time" @@ -40,11 +41,22 @@ func TestSomeoneElseSetPendingWithNoConcurrencyLimit(t *testing.T) { Reason: v1beta1.PipelineRunReasonPending.String(), }, } - started, err := qm.AddListToRunningQueue(repo, []string{PrKey(pr)}) + started, err := qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(pr)}) assert.NilError(t, err) 
assert.Equal(t, len(started), 1) } +func TestManagerRecoveryInterval(t *testing.T) { + observer, _ := zapobserver.New(zap.InfoLevel) + logger := zap.New(observer).Sugar() + + memoryManager := NewManager(logger) + assert.Equal(t, memoryManager.RecoveryInterval(), time.Duration(0)) + + leaseManager := NewLeaseManager(logger, nil, nil, "pac") + assert.Equal(t, leaseManager.RecoveryInterval(), defaultLeaseClaimTTL) +} + func TestAddToPendingQueueDirectly(t *testing.T) { observer, _ := zapobserver.New(zap.InfoLevel) logger := zap.New(observer).Sugar() @@ -82,19 +94,19 @@ func TestNewManagerForList(t *testing.T) { prFirst := newTestPR("first", time.Now(), nil, nil, tektonv1.PipelineRunSpec{}) // added to queue, as there is only one should start - started, err := qm.AddListToRunningQueue(repo, []string{PrKey(prFirst)}) + started, err := qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(prFirst)}) assert.NilError(t, err) assert.Equal(t, len(started), 1) // removing the running from queue - assert.Equal(t, qm.RemoveAndTakeItemFromQueue(repo, prFirst), "") + assert.Equal(t, qm.RemoveAndTakeItemFromQueue(context.Background(), repo, prFirst), "") // adding another 2 pipelineRun, limit is 1 so this will be added to pending queue and // then one will be started prSecond := newTestPR("second", time.Now().Add(1*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) prThird := newTestPR("third", time.Now().Add(7*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) - started, err = qm.AddListToRunningQueue(repo, []string{PrKey(prSecond), PrKey(prThird)}) + started, err = qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(prSecond), PrKey(prThird)}) assert.NilError(t, err) assert.Equal(t, len(started), 1) // as per the list, 2nd must be started @@ -104,12 +116,12 @@ func TestNewManagerForList(t *testing.T) { prFourth := newTestPR("fourth", time.Now().Add(5*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) prFifth := newTestPR("fifth", 
time.Now().Add(4*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) - started, err = qm.AddListToRunningQueue(repo, []string{PrKey(prFourth), PrKey(prFifth)}) + started, err = qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(prFourth), PrKey(prFifth)}) assert.NilError(t, err) assert.Equal(t, len(started), 0) // removing 2nd from queue, which means it should start 3rd - assert.Equal(t, qm.RemoveAndTakeItemFromQueue(repo, prSecond), PrKey(prThird)) + assert.Equal(t, qm.RemoveAndTakeItemFromQueue(context.Background(), repo, prSecond), PrKey(prThird)) // changing the concurrency limit to 2 repo.Spec.ConcurrencyLimit = intPtr(2) @@ -118,7 +130,7 @@ func TestNewManagerForList(t *testing.T) { prSeventh := newTestPR("seventh", time.Now().Add(5*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) prEight := newTestPR("eight", time.Now().Add(4*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) - started, err = qm.AddListToRunningQueue(repo, []string{PrKey(prSixth), PrKey(prSeventh), PrKey(prEight)}) + started, err = qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(prSixth), PrKey(prSeventh), PrKey(prEight)}) assert.NilError(t, err) // third is running, but limit is changed now, so one more should be moved to running assert.Equal(t, len(started), 1) @@ -139,18 +151,18 @@ func TestNewManagerReListing(t *testing.T) { prThird := newTestPR("third", time.Now().Add(7*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) // added to queue, as there is only one should start - started, err := qm.AddListToRunningQueue(repo, []string{PrKey(prFirst), PrKey(prSecond), PrKey(prThird)}) + started, err := qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(prFirst), PrKey(prSecond), PrKey(prThird)}) assert.NilError(t, err) assert.Equal(t, len(started), 2) // if first is running and other pipelineRuns are reconciling // then adding again shouldn't have any effect - started, err = qm.AddListToRunningQueue(repo, []string{PrKey(prFirst), 
PrKey(prSecond), PrKey(prThird)}) + started, err = qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(prFirst), PrKey(prSecond), PrKey(prThird)}) assert.NilError(t, err) assert.Equal(t, len(started), 0) // again - started, err = qm.AddListToRunningQueue(repo, []string{PrKey(prFirst), PrKey(prSecond), PrKey(prThird)}) + started, err = qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(prFirst), PrKey(prSecond), PrKey(prThird)}) assert.NilError(t, err) assert.Equal(t, len(started), 0) @@ -164,7 +176,7 @@ func TestNewManagerReListing(t *testing.T) { prFifth := newTestPR("fifth", time.Now().Add(1*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) prSixths := newTestPR("sixth", time.Now().Add(7*time.Second), nil, nil, tektonv1.PipelineRunSpec{}) - started, err = qm.AddListToRunningQueue(repo, []string{PrKey(prFourth), PrKey(prFifth), PrKey(prSixths)}) + started, err = qm.AddListToRunningQueue(context.Background(), repo, []string{PrKey(prFourth), PrKey(prFifth), PrKey(prSixths)}) assert.NilError(t, err) assert.Equal(t, len(started), 0) @@ -250,7 +262,7 @@ func TestQueueManagerInitQueues(t *testing.T) { // now if first is completed and removed from running queue // then second must start as per execution order - qm.RemoveAndTakeItemFromQueue(repo, firstPR) + qm.RemoveAndTakeItemFromQueue(context.Background(), repo, firstPR) assert.Equal(t, sema.getCurrentRunning()[0], PrKey(secondPR)) assert.Equal(t, sema.getCurrentPending()[0], PrKey(thirdPR)) diff --git a/pkg/reconciler/controller.go b/pkg/reconciler/controller.go index b00881a321..5826de00b4 100644 --- a/pkg/reconciler/controller.go +++ b/pkg/reconciler/controller.go @@ -3,19 +3,28 @@ package reconciler import ( "context" "path" + "sort" + "time" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" + pacv1alpha1 
"github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/v1alpha1" "github.com/openshift-pipelines/pipelines-as-code/pkg/events" "github.com/openshift-pipelines/pipelines-as-code/pkg/generated/injection/informers/pipelinesascode/v1alpha1/repository" "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" "github.com/openshift-pipelines/pipelines-as-code/pkg/params" "github.com/openshift-pipelines/pipelines-as-code/pkg/params/info" + "github.com/openshift-pipelines/pipelines-as-code/pkg/params/settings" prmetrics "github.com/openshift-pipelines/pipelines-as-code/pkg/pipelinerunmetrics" queuepkg "github.com/openshift-pipelines/pipelines-as-code/pkg/queue" tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" + tektonclient "github.com/tektoncd/pipeline/pkg/client/clientset/versioned" tektonPipelineRunInformerv1 "github.com/tektoncd/pipeline/pkg/client/injection/informers/pipeline/v1/pipelinerun" tektonPipelineRunReconcilerv1 "github.com/tektoncd/pipeline/pkg/client/injection/reconciler/pipeline/v1/pipelinerun" + tektonv1lister "github.com/tektoncd/pipeline/pkg/client/listers/pipeline/v1" + "go.uber.org/zap" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "knative.dev/pkg/configmap" "knative.dev/pkg/controller" @@ -24,6 +33,8 @@ import ( "knative.dev/pkg/system" ) +const leaseQueueRecoveryBuffer = time.Second + func NewController() func(context.Context, configmap.Watcher) *controller.Impl { return func(ctx context.Context, _ configmap.Watcher) *controller.Impl { ctx = info.StoreNS(ctx, system.Namespace()) @@ -40,21 +51,24 @@ func NewController() func(context.Context, configmap.Watcher) *controller.Impl { log.Fatal("failed to init kinit client : ", err) } - // Start pac config syncer - go params.StartConfigSync(ctx, run) + if err := run.UpdatePacConfig(ctx); err != nil { + log.Fatal("failed to load pac config: ", err) + } pipelineRunInformer := 
tektonPipelineRunInformerv1.Get(ctx) metrics, err := prmetrics.NewRecorder() if err != nil { log.Fatalf("Failed to create pipeline as code metrics recorder %v", err) } + pacInfo := run.Info.GetPacOpts() + log.Infof("using concurrency backend %q; changing this setting requires restarting the watcher", pacInfo.ConcurrencyBackend) r := &Reconciler{ run: run, kinteract: kinteract, pipelineRunLister: pipelineRunInformer.Lister(), repoLister: repository.Get(ctx).Lister(), - qm: queuepkg.NewManager(run.Clients.Log), + qm: queuepkg.NewManagerForBackend(run.Clients.Log, run.Clients.Kube, run.Clients.Tekton, system.Namespace(), pacInfo.ConcurrencyBackend), metrics: metrics, eventEmitter: events.NewEventEmitter(run.Clients.Kube, run.Clients.Log), } @@ -68,10 +82,228 @@ func NewController() func(context.Context, configmap.Watcher) *controller.Impl { logging.FromContext(ctx).Panicf("Couldn't register PipelineRun informer event handler: %w", err) } + if recoveryInterval := r.qm.RecoveryInterval(); recoveryInterval > 0 { + startLeaseQueueRecoveryLoop(ctx, log, impl, r.pipelineRunLister, run.Clients.Tekton, r.eventEmitter, recoveryInterval+leaseQueueRecoveryBuffer) + } + + // Start pac config syncer after the initial settings have been loaded. 
+ go params.StartConfigSync(ctx, run) + return impl } } +func startLeaseQueueRecoveryLoop( + ctx context.Context, + logger *zap.SugaredLogger, + impl *controller.Impl, + lister tektonv1lister.PipelineRunLister, + tekton tektonclient.Interface, + eventEmitter *events.EventEmitter, + interval time.Duration, +) { + if interval <= 0 { + return + } + + logger.Infof("starting lease queue recovery loop with interval %s", interval) + runLeaseQueueRecovery(ctx, logger, impl, lister, tekton, eventEmitter) + + go func() { + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + runLeaseQueueRecovery(ctx, logger, impl, lister, tekton, eventEmitter) + } + } + }() +} + +func runLeaseQueueRecovery( + ctx context.Context, + logger *zap.SugaredLogger, + impl *controller.Impl, + lister tektonv1lister.PipelineRunLister, + tekton tektonclient.Interface, + eventEmitter *events.EventEmitter, +) { + recoveryCandidates, err := leaseQueueRecoveryCandidates(lister) + if err != nil { + logger.Warnf("failed to list queued PipelineRuns for lease recovery: %v", err) + return + } + recoveryKeys := selectLeaseQueueRecoveryKeys(recoveryCandidates) + logger.Debugf("lease queue recovery selected %d repository candidate(s): %v", len(recoveryKeys), recoveryKeys) + + for _, pipelineRun := range recoveryCandidates { + latest, err := tekton.TektonV1().PipelineRuns(pipelineRun.GetNamespace()).Get(ctx, pipelineRun.GetName(), metav1.GetOptions{}) + if err != nil { + logger.Warnf("failed to re-fetch queued pipelinerun %s/%s for lease recovery: %v", pipelineRun.GetNamespace(), pipelineRun.GetName(), err) + continue + } + if !queuepkg.IsRecoverableQueuedPipelineRun(latest) { + logger.Debugf( + "skipping stale lease recovery candidate %s/%s because latest state=%s spec.status=%s done=%t cancelled=%t", + latest.GetNamespace(), latest.GetName(), latest.GetAnnotations()[keys.State], latest.Spec.Status, latest.IsDone(), latest.IsCancelled(), 
+ ) + if err := queuepkg.ClearQueueDebugAnnotations(ctx, logger, tekton, latest); err != nil { + logger.Warnf("failed to clear stale queue debug state for %s/%s during lease recovery: %v", latest.GetNamespace(), latest.GetName(), err) + } + continue + } + + if err := queuepkg.SyncQueueDebugAnnotations(ctx, logger, tekton, latest, queuepkg.DebugSnapshot{ + Backend: settings.ConcurrencyBackendLease, + RepoKey: types.NamespacedName{Namespace: latest.GetNamespace(), Name: latest.GetAnnotations()[keys.Repository]}.String(), + Position: 1, + Running: -1, + Claimed: -1, + Queued: -1, + Limit: -1, + LastDecision: queuepkg.QueueDecisionRecoveryRequeued, + }); err != nil { + logger.Warnf("failed to record queue recovery debug state for %s/%s: %v", latest.GetNamespace(), latest.GetName(), err) + } + + repoName := latest.GetAnnotations()[keys.Repository] + if eventEmitter != nil && repoName != "" { + eventEmitter.EmitMessage(&pacv1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{ + Name: repoName, + Namespace: latest.GetNamespace(), + }, + }, zap.InfoLevel, "QueueRecoveryRequeued", + "recovery loop re-enqueued queued PipelineRun "+latest.GetNamespace()+"/"+latest.GetName()) + } + + key := types.NamespacedName{Namespace: latest.GetNamespace(), Name: latest.GetName()} + logger.Debugf("enqueuing queued pipelinerun %s for lease recovery", key.String()) + impl.EnqueueKey(key) + } +} + +func leaseQueueRecoveryCandidates(lister tektonv1lister.PipelineRunLister) ([]*tektonv1.PipelineRun, error) { + pipelineRuns, err := lister.List(labels.Everything()) + if err != nil { + return nil, err + } + return selectLeaseQueueRecoveryCandidates(pipelineRuns), nil +} + +func selectLeaseQueueRecoveryCandidates(pipelineRuns []*tektonv1.PipelineRun) []*tektonv1.PipelineRun { + return selectLeaseQueueRecoveryCandidatesAt(pipelineRuns, time.Now(), queuepkg.DefaultLeaseClaimTTL()) +} + +func selectLeaseQueueRecoveryCandidatesAt( + pipelineRuns []*tektonv1.PipelineRun, + now time.Time, + claimTTL 
time.Duration, +) []*tektonv1.PipelineRun { + candidatesByRepo := map[string]*tektonv1.PipelineRun{} + healthyRepos := map[string]struct{}{} + + for _, pipelineRun := range pipelineRuns { + repoKey, ok := leaseQueueRecoveryRepoKey(pipelineRun) + if !ok { + continue + } + + if hasHealthyLeaseQueueProgress(pipelineRun, now, claimTTL) { + healthyRepos[repoKey] = struct{}{} + } + if !isEligibleLeaseQueueRecoveryCandidate(pipelineRun) { + continue + } + if existing, ok := candidatesByRepo[repoKey]; !ok || shouldPreferLeaseQueueRecoveryCandidate(pipelineRun, existing) { + candidatesByRepo[repoKey] = pipelineRun + } + } + + selected := make([]*tektonv1.PipelineRun, 0, len(candidatesByRepo)) + for repoKey, pipelineRun := range candidatesByRepo { + if _, ok := healthyRepos[repoKey]; ok { + continue + } + selected = append(selected, pipelineRun) + } + + sort.Slice(selected, func(i, j int) bool { + return shouldPreferLeaseQueueRecoveryCandidate(selected[i], selected[j]) + }) + + return selected +} + +func selectLeaseQueueRecoveryKeys(pipelineRuns []*tektonv1.PipelineRun) []types.NamespacedName { + selected := selectLeaseQueueRecoveryCandidates(pipelineRuns) + + recoveryKeys := make([]types.NamespacedName, 0, len(selected)) + for _, pipelineRun := range selected { + recoveryKeys = append(recoveryKeys, types.NamespacedName{ + Namespace: pipelineRun.GetNamespace(), + Name: pipelineRun.GetName(), + }) + } + return recoveryKeys +} + +func isEligibleLeaseQueueRecoveryCandidate(pipelineRun *tektonv1.PipelineRun) bool { + if pipelineRun == nil { + return false + } + if pipelineRun.GetAnnotations()[keys.Repository] == "" { + return false + } + return queuepkg.IsRecoverableQueuedPipelineRun(pipelineRun) +} + +func leaseQueueRecoveryRepoKey(pipelineRun *tektonv1.PipelineRun) (string, bool) { + if pipelineRun == nil { + return "", false + } + + repoName := pipelineRun.GetAnnotations()[keys.Repository] + if repoName == "" { + return "", false + } + + return types.NamespacedName{ + 
Namespace: pipelineRun.GetNamespace(), + Name: repoName, + }.String(), true +} + +func hasHealthyLeaseQueueProgress( + pipelineRun *tektonv1.PipelineRun, + now time.Time, + claimTTL time.Duration, +) bool { + if pipelineRun == nil { + return false + } + + switch pipelineRun.GetAnnotations()[keys.State] { + case kubeinteraction.StateStarted: + return !pipelineRun.IsDone() && !pipelineRun.IsCancelled() + case kubeinteraction.StateQueued: + return queuepkg.HasActiveLeaseQueueClaim(pipelineRun, now, claimTTL) + default: + return false + } +} + +func shouldPreferLeaseQueueRecoveryCandidate(left, right *tektonv1.PipelineRun) bool { + if left.CreationTimestamp.Equal(&right.CreationTimestamp) { + return left.GetName() < right.GetName() + } + return left.CreationTimestamp.Before(&right.CreationTimestamp) +} + // enqueue only the pipelineruns which are in `started` state // pipelinerun will have a label `pipelinesascode.tekton.dev/state` to describe the state. func checkStateAndEnqueue(impl *controller.Impl) func(obj any) { diff --git a/pkg/reconciler/controller_test.go b/pkg/reconciler/controller_test.go index a667baf701..0e698cd079 100644 --- a/pkg/reconciler/controller_test.go +++ b/pkg/reconciler/controller_test.go @@ -2,18 +2,30 @@ package reconciler import ( "context" + "maps" "path" "testing" + "time" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" + "github.com/openshift-pipelines/pipelines-as-code/pkg/events" + "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" + queuepkg "github.com/openshift-pipelines/pipelines-as-code/pkg/queue" + testclient "github.com/openshift-pipelines/pipelines-as-code/pkg/test/clients" tektontest "github.com/openshift-pipelines/pipelines-as-code/pkg/test/tekton" pipelinev1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" + tektonv1lister "github.com/tektoncd/pipeline/pkg/client/listers/pipeline/v1" 
"go.uber.org/zap" zapobserver "go.uber.org/zap/zaptest/observer" "gotest.tools/v3/assert" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/cache" + duckv1 "knative.dev/pkg/apis/duck/v1" "knative.dev/pkg/controller" + rtesting "knative.dev/pkg/reconciler/testing" ) type fakeReconciler struct{} @@ -81,3 +93,544 @@ func TestCtrlOpts(t *testing.T) { // Assert that the promote filter function returns true. assert.Assert(t, promote) } + +func TestSelectLeaseQueueRecoveryKeys(t *testing.T) { + now := time.Unix(1_700_001_000, 0) + + tests := []struct { + name string + pipelineRuns []*pipelinev1.PipelineRun + want []string + }{ + { + name: "selects one oldest queued pending run per repository", + pipelineRuns: []*pipelinev1.PipelineRun{ + newLeaseRecoveryPR("later", "test-ns", "repo-a", now.Add(2*time.Second), map[string]string{}, false), + newLeaseRecoveryPR("earlier", "test-ns", "repo-a", now.Add(time.Second), map[string]string{}, false), + newLeaseRecoveryPR("missing-order", "test-ns", "repo-b", now, map[string]string{ + keys.ExecutionOrder: "", + }, false), + newLeaseRecoveryPR("malformed-order", "test-ns", "repo-c", now.Add(250*time.Millisecond), map[string]string{ + keys.ExecutionOrder: "test-ns/some-other-pr", + }, false), + newLeaseRecoveryPR("valid", "test-ns", "repo-b", now.Add(3*time.Second), map[string]string{}, false), + newLeaseRecoveryPR("valid-after-malformed", "test-ns", "repo-c", now.Add(4*time.Second), map[string]string{}, false), + newLeaseRecoveryPR("other-namespace", "other-ns", "repo-a", now.Add(4*time.Second), map[string]string{}, false), + newLeaseRecoveryPR("waiting-behind-started", "test-ns", "repo-e", now.Add(4*time.Second), map[string]string{}, false), + newLeaseRecoveryStartedPR("running", "repo-e", now.Add(5*time.Second)), + newLeaseRecoveryPR("actively-claimed", "test-ns", "repo-f", now.Add(6*time.Second), map[string]string{ + keys.QueueClaimedBy: 
"watcher-1", + keys.QueueClaimedAt: now.Format(time.RFC3339Nano), + }, false), + newLeaseRecoveryPR("missing-repo", "test-ns", "", now.Add(5*time.Second), map[string]string{}, false), + newLeaseRecoveryStartedPR("started", "repo-d", now.Add(6*time.Second)), + newLeaseRecoveryPR("done", "test-ns", "repo-d", now.Add(7*time.Second), map[string]string{}, true), + }, + want: []string{ + "test-ns/earlier", + "test-ns/valid", + "other-ns/other-namespace", + "test-ns/valid-after-malformed", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + recoveryKeys := selectLeaseQueueRecoveryKeysAt(tt.pipelineRuns, now, queuepkg.DefaultLeaseClaimTTL()) + got := make([]string, 0, len(recoveryKeys)) + for _, key := range recoveryKeys { + got = append(got, key.String()) + } + assert.DeepEqual(t, got, tt.want) + }) + } +} + +func TestRunLeaseQueueRecovery(t *testing.T) { + observer, catcher := zapobserver.New(zap.DebugLevel) + logger := zap.New(observer).Sugar() + ctx, _ := rtesting.SetupFakeContext(t) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{}) + wh := &fakeReconciler{} + impl := controller.NewContext(context.TODO(), wh, controller.ControllerOptions{ + WorkQueueName: "LeaseRecovery", + Logger: logger.Named("LeaseRecovery"), + }) + + now := time.Unix(1_700_001_100, 0) + indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) + for _, pipelineRun := range []*pipelinev1.PipelineRun{ + newLeaseRecoveryPR("first", "test-ns", "repo-a", now, map[string]string{}, false), + newLeaseRecoveryPR("second", "test-ns", "repo-a", now.Add(time.Second), map[string]string{}, false), + newLeaseRecoveryPR("third", "test-ns", "repo-b", now.Add(2*time.Second), map[string]string{}, false), + newLeaseRecoveryStartedPR("started", "repo-c", now.Add(3*time.Second)), + } { + assert.NilError(t, indexer.Add(pipelineRun)) + _, err := stdata.Pipeline.TektonV1().PipelineRuns(pipelineRun.Namespace).Create(ctx, pipelineRun, metav1.CreateOptions{}) + 
assert.NilError(t, err) + } + + emitter := events.NewEventEmitter(stdata.Kube, logger) + runLeaseQueueRecovery(ctx, logger, impl, tektonv1lister.NewPipelineRunLister(indexer), stdata.Pipeline, emitter) + + assert.Equal(t, catcher.FilterMessageSnippet("Adding to queue test-ns/first").Len(), 1) + assert.Equal(t, catcher.FilterMessageSnippet("Adding to queue test-ns/third").Len(), 1) + assert.Equal(t, catcher.FilterMessageSnippet("Adding to queue").Len(), 2) + + first, err := stdata.Pipeline.TektonV1().PipelineRuns("test-ns").Get(ctx, "first", metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, first.GetAnnotations()[keys.QueueDecision], "recovery_requeued") + + events, err := stdata.Kube.CoreV1().Events("test-ns").List(ctx, metav1.ListOptions{}) + assert.NilError(t, err) + assert.Assert(t, len(events.Items) >= 1) + recoveryEvents := 0 + for _, event := range events.Items { + if event.Reason == "QueueRecoveryRequeued" { + recoveryEvents++ + assert.Equal(t, event.Type, corev1.EventTypeNormal) + } + } + assert.Assert(t, recoveryEvents >= 1) +} + +func TestRunLeaseQueueRecoverySkipsStaleAdvancedLivePipelineRun(t *testing.T) { + tests := []struct { + name string + livePR *pipelinev1.PipelineRun + }{ + { + name: "latest started", + livePR: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "first", + Namespace: "test-ns", + Annotations: map[string]string{ + keys.Repository: "repo-a", + keys.State: kubeinteraction.StateStarted, + }, + }, + Spec: pipelinev1.PipelineRunSpec{ + Status: pipelinev1.PipelineRunSpecStatusPending, + }, + }, + }, + { + name: "latest completed", + livePR: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "first", + Namespace: "test-ns", + Annotations: map[string]string{ + keys.Repository: "repo-a", + keys.State: kubeinteraction.StateCompleted, + }, + }, + Status: pipelinev1.PipelineRunStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{{ + Type: "Succeeded", + Status: "True", + }}, + }, + }, + 
}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + observer, catcher := zapobserver.New(zap.DebugLevel) + logger := zap.New(observer).Sugar() + ctx, _ := rtesting.SetupFakeContext(t) + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*pipelinev1.PipelineRun{tt.livePR}, + }) + wh := &fakeReconciler{} + impl := controller.NewContext(context.TODO(), wh, controller.ControllerOptions{ + WorkQueueName: "LeaseRecovery", + Logger: logger.Named("LeaseRecovery"), + }) + + indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) + staleQueued := newLeaseRecoveryPR("first", "test-ns", "repo-a", time.Unix(1_700_001_200, 0), map[string]string{}, false) + assert.NilError(t, indexer.Add(staleQueued)) + + emitter := events.NewEventEmitter(stdata.Kube, logger) + runLeaseQueueRecovery(ctx, logger, impl, tektonv1lister.NewPipelineRunLister(indexer), stdata.Pipeline, emitter) + + assert.Equal(t, catcher.FilterMessageSnippet("Adding to queue test-ns/first").Len(), 0) + assert.Assert(t, catcher.FilterMessageSnippet("skipping stale lease recovery candidate test-ns/first").Len() >= 1) + + updated, err := stdata.Pipeline.TektonV1().PipelineRuns("test-ns").Get(ctx, "first", metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, updated.GetAnnotations()[keys.QueueDecision], "") + + events, err := stdata.Kube.CoreV1().Events("test-ns").List(ctx, metav1.ListOptions{}) + assert.NilError(t, err) + recoveryEvents := 0 + for _, event := range events.Items { + if event.Reason == "QueueRecoveryRequeued" { + recoveryEvents++ + } + } + assert.Equal(t, recoveryEvents, 0) + }) + } +} + +func TestRunLeaseQueueRecoverySkipsHealthyQueuedRuns(t *testing.T) { + observer, catcher := zapobserver.New(zap.DebugLevel) + logger := zap.New(observer).Sugar() + ctx, _ := rtesting.SetupFakeContext(t) + now := time.Unix(1_700_001_300, 0) + claimedAt := time.Now().UTC().Format(time.RFC3339Nano) + + started := 
newLeaseRecoveryStartedPR("running", "repo-a", now) + waiting := newLeaseRecoveryPR("waiting", "test-ns", "repo-a", now.Add(time.Second), map[string]string{}, false) + claimed := newLeaseRecoveryPR("claimed", "test-ns", "repo-b", now.Add(2*time.Second), map[string]string{ + keys.QueueClaimedBy: "watcher-1", + keys.QueueClaimedAt: claimedAt, + }, false) + + stdata, _ := testclient.SeedTestData(t, ctx, testclient.Data{ + PipelineRuns: []*pipelinev1.PipelineRun{started, waiting, claimed}, + }) + wh := &fakeReconciler{} + impl := controller.NewContext(context.TODO(), wh, controller.ControllerOptions{ + WorkQueueName: "LeaseRecovery", + Logger: logger.Named("LeaseRecovery"), + }) + + indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) + for _, pipelineRun := range []*pipelinev1.PipelineRun{started, waiting, claimed} { + assert.NilError(t, indexer.Add(pipelineRun)) + } + + emitter := events.NewEventEmitter(stdata.Kube, logger) + runLeaseQueueRecovery(ctx, logger, impl, tektonv1lister.NewPipelineRunLister(indexer), stdata.Pipeline, emitter) + + assert.Equal(t, catcher.FilterMessageSnippet("Adding to queue").Len(), 0) + + for _, name := range []string{"waiting", "claimed"} { + updated, err := stdata.Pipeline.TektonV1().PipelineRuns("test-ns").Get(ctx, name, metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, updated.GetAnnotations()[keys.QueueDecision], "") + } + + events, err := stdata.Kube.CoreV1().Events("test-ns").List(ctx, metav1.ListOptions{}) + assert.NilError(t, err) + recoveryEvents := 0 + for _, event := range events.Items { + if event.Reason == "QueueRecoveryRequeued" { + recoveryEvents++ + } + } + assert.Equal(t, recoveryEvents, 0) +} + +func selectLeaseQueueRecoveryKeysAt( + pipelineRuns []*pipelinev1.PipelineRun, + now time.Time, + claimTTL time.Duration, +) []types.NamespacedName { + selected := selectLeaseQueueRecoveryCandidatesAt(pipelineRuns, now, claimTTL) + + recoveryKeys := make([]types.NamespacedName, 0, 
len(selected)) + for _, pipelineRun := range selected { + recoveryKeys = append(recoveryKeys, types.NamespacedName{ + Namespace: pipelineRun.GetNamespace(), + Name: pipelineRun.GetName(), + }) + } + return recoveryKeys +} + +func newLeaseRecoveryPR( + name, namespace, repo string, + createdAt time.Time, + extraAnnotations map[string]string, + done bool, +) *pipelinev1.PipelineRun { + annotations := map[string]string{ + keys.State: kubeinteraction.StateQueued, + keys.ExecutionOrder: namespace + "/" + name, + } + if repo != "" { + annotations[keys.Repository] = repo + } + maps.Copy(annotations, extraAnnotations) + + pipelineRun := &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + CreationTimestamp: metav1.Time{Time: createdAt}, + Annotations: annotations, + }, + Spec: pipelinev1.PipelineRunSpec{ + Status: pipelinev1.PipelineRunSpecStatusPending, + }, + } + if done { + pipelineRun.Status.Conditions = duckv1.Conditions{{ + Type: "Succeeded", + Status: "True", + }} + } + return pipelineRun +} + +func newLeaseRecoveryStartedPR(name, repo string, createdAt time.Time) *pipelinev1.PipelineRun { + return &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "test-ns", + CreationTimestamp: metav1.Time{Time: createdAt}, + Annotations: map[string]string{ + keys.Repository: repo, + keys.State: kubeinteraction.StateStarted, + }, + }, + Spec: pipelinev1.PipelineRunSpec{ + Status: pipelinev1.PipelineRunSpecStatusPending, + }, + } +} + +func TestHasHealthyLeaseQueueProgress(t *testing.T) { + now := time.Unix(1_700_001_000, 0) + claimTTL := 5 * time.Minute + + tests := []struct { + name string + pr *pipelinev1.PipelineRun + want bool + }{ + { + name: "nil pipeline run", + pr: nil, + want: false, + }, + { + name: "started not done not cancelled", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "running", + Namespace: "test-ns", + Annotations: map[string]string{keys.State: 
kubeinteraction.StateStarted}, + }, + }, + want: true, + }, + { + name: "started and done", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "done", + Namespace: "test-ns", + Annotations: map[string]string{keys.State: kubeinteraction.StateStarted}, + }, + Status: pipelinev1.PipelineRunStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{{Type: "Succeeded", Status: "True"}}, + }, + }, + }, + want: false, + }, + { + name: "started and cancelled", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cancelled", + Namespace: "test-ns", + Annotations: map[string]string{keys.State: kubeinteraction.StateStarted}, + }, + Spec: pipelinev1.PipelineRunSpec{ + Status: pipelinev1.PipelineRunSpecStatusCancelled, + }, + }, + want: false, + }, + { + name: "queued with active claim", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "claimed", + Namespace: "test-ns", + Annotations: map[string]string{ + keys.State: kubeinteraction.StateQueued, + keys.QueueClaimedBy: "watcher-1", + keys.QueueClaimedAt: now.Add(-time.Minute).Format(time.RFC3339Nano), + }, + }, + }, + want: true, + }, + { + name: "queued with expired claim", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "expired", + Namespace: "test-ns", + Annotations: map[string]string{ + keys.State: kubeinteraction.StateQueued, + keys.QueueClaimedBy: "watcher-1", + keys.QueueClaimedAt: now.Add(-10 * time.Minute).Format(time.RFC3339Nano), + }, + }, + }, + want: false, + }, + { + name: "queued with no claim", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "unclaimed", + Namespace: "test-ns", + Annotations: map[string]string{keys.State: kubeinteraction.StateQueued}, + }, + }, + want: false, + }, + { + name: "unknown state", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "completed", + Namespace: "test-ns", + Annotations: map[string]string{keys.State: kubeinteraction.StateCompleted}, + 
}, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, hasHealthyLeaseQueueProgress(tt.pr, now, claimTTL), tt.want) + }) + } +} + +func TestIsEligibleLeaseQueueRecoveryCandidate(t *testing.T) { + tests := []struct { + name string + pr *pipelinev1.PipelineRun + want bool + }{ + { + name: "nil pipeline run", + pr: nil, + want: false, + }, + { + name: "empty repository annotation", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "no-repo", + Namespace: "test-ns", + Annotations: map[string]string{keys.State: kubeinteraction.StateQueued}, + }, + }, + want: false, + }, + { + name: "eligible queued pending run", + pr: newLeaseRecoveryPR("eligible", "test-ns", "repo-a", time.Unix(1_700_001_000, 0), map[string]string{}, false), + want: true, + }, + { + name: "started state is not eligible", + pr: newLeaseRecoveryStartedPR("started", "repo-a", time.Unix(1_700_001_000, 0)), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, isEligibleLeaseQueueRecoveryCandidate(tt.pr), tt.want) + }) + } +} + +func TestLeaseQueueRecoveryRepoKey(t *testing.T) { + tests := []struct { + name string + pr *pipelinev1.PipelineRun + wantKey string + wantOK bool + }{ + { + name: "nil pipeline run", + pr: nil, + wantOK: false, + }, + { + name: "empty repository annotation", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "no-repo", + Namespace: "test-ns", + Annotations: map[string]string{}, + }, + }, + wantOK: false, + }, + { + name: "valid key extraction", + pr: &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-pr", + Namespace: "test-ns", + Annotations: map[string]string{keys.Repository: "my-repo"}, + }, + }, + wantKey: "test-ns/my-repo", + wantOK: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + key, ok := leaseQueueRecoveryRepoKey(tt.pr) + assert.Equal(t, ok, tt.wantOK) + 
if tt.wantOK { + assert.Equal(t, key, tt.wantKey) + } + }) + } +} + +func TestStartLeaseQueueRecoveryLoopEarlyReturn(t *testing.T) { + observer, catcher := zapobserver.New(zap.DebugLevel) + logger := zap.New(observer).Sugar() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + startLeaseQueueRecoveryLoop(ctx, logger, nil, nil, nil, nil, 0) + startLeaseQueueRecoveryLoop(ctx, logger, nil, nil, nil, nil, -1*time.Second) + + assert.Equal(t, catcher.FilterMessageSnippet("starting lease queue recovery loop").Len(), 0) +} + +func TestCtrlOptsPromoteFilterMissingState(t *testing.T) { + observer, _ := zapobserver.New(zap.DebugLevel) + logger := zap.New(observer).Sugar() + wh := &fakeReconciler{} + impl := controller.NewContext(context.TODO(), wh, controller.ControllerOptions{ + WorkQueueName: "Test", + Logger: logger.Named("Test"), + }) + opts := ctrlOpts()(impl) + + pr := &pipelinev1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "no-state", + Namespace: "test-ns", + }, + } + assert.Assert(t, !opts.PromoteFilterFunc(pr)) +} diff --git a/pkg/reconciler/finalizer.go b/pkg/reconciler/finalizer.go index aa019de07f..bb988e02ec 100644 --- a/pkg/reconciler/finalizer.go +++ b/pkg/reconciler/finalizer.go @@ -54,7 +54,7 @@ func (r *Reconciler) FinalizeKind(ctx context.Context, pr *tektonv1.PipelineRun) repo.Spec.Merge(r.globalRepo.Spec) } logger = logger.With("namespace", repo.Namespace) - next := r.qm.RemoveAndTakeItemFromQueue(repo, pr) + next := r.qm.RemoveAndTakeItemFromQueue(ctx, repo, pr) if next != "" { key := strings.Split(next, "/") pr, err := r.run.Clients.Tekton.TektonV1().PipelineRuns(key[0]).Get(ctx, key[1], metav1.GetOptions{}) diff --git a/pkg/reconciler/finalizer_test.go b/pkg/reconciler/finalizer_test.go index 18551061b5..975a3e8298 100644 --- a/pkg/reconciler/finalizer_test.go +++ b/pkg/reconciler/finalizer_test.go @@ -166,7 +166,7 @@ func TestReconcilerFinalizeKind(t *testing.T) { if len(tt.addToQueue) != 0 { for _, pr := range 
tt.addToQueue { - _, err := r.qm.AddListToRunningQueue(finalizeTestRepo, []string{pr.GetNamespace() + "/" + pr.GetName()}) + _, err := r.qm.AddListToRunningQueue(ctx, finalizeTestRepo, []string{pr.GetNamespace() + "/" + pr.GetName()}) assert.NilError(t, err) } } diff --git a/pkg/reconciler/queue_pipelineruns.go b/pkg/reconciler/queue_pipelineruns.go index a8ca68d09a..9044eeee29 100644 --- a/pkg/reconciler/queue_pipelineruns.go +++ b/pkg/reconciler/queue_pipelineruns.go @@ -3,11 +3,15 @@ package reconciler import ( "context" "fmt" + "strconv" "strings" + "time" + "github.com/openshift-pipelines/pipelines-as-code/pkg/action" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" pacAPIv1alpha1 "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/v1alpha1" "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" + "github.com/openshift-pipelines/pipelines-as-code/pkg/params/settings" queuepkg "github.com/openshift-pipelines/pipelines-as-code/pkg/queue" tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" "go.uber.org/zap" @@ -55,7 +59,8 @@ func (r *Reconciler) queuePipelineRun(ctx context.Context, logger *zap.SugaredLo // if concurrency was set and later removed or changed to zero // then remove pipelineRun from Queue and update pending state to running if repo.Spec.ConcurrencyLimit != nil && *repo.Spec.ConcurrencyLimit == 0 { - _ = r.qm.RemoveAndTakeItemFromQueue(repo, pr) + logger.Debugf("concurrency disabled for repository %s; promoting queued pipelinerun %s immediately", repo.GetName(), pr.GetName()) + _ = r.qm.RemoveAndTakeItemFromQueue(ctx, repo, pr) if err := r.updatePipelineRunToInProgress(ctx, logger, repo, pr); err != nil { return fmt.Errorf("failed to update PipelineRun to in_progress: %w", err) } @@ -67,30 +72,65 @@ func (r *Reconciler) queuePipelineRun(ctx context.Context, logger *zap.SugaredLo maxIterations := 5 orderedList := queuepkg.FilterPipelineRunByState(ctx, 
r.run.Clients.Tekton, strings.Split(order, ","), tektonv1.PipelineRunSpecStatusPending, kubeinteraction.StateQueued) + logger.Debugf( + "processing queued pipelinerun %s/%s for repository %s with concurrency limit %v and ordered candidates %v", + pr.Namespace, pr.Name, repo.GetName(), repo.Spec.ConcurrencyLimit, orderedList, + ) for { - acquired, err := r.qm.AddListToRunningQueue(repo, orderedList) + logger.Debugf("attempting queue acquisition loop %d for repository %s and pipelinerun %s/%s", itered+1, repo.GetName(), pr.Namespace, pr.Name) + acquired, err := r.qm.AddListToRunningQueue(ctx, repo, orderedList) if err != nil { + if r.eventEmitter != nil && strings.Contains(err.Error(), "timed out acquiring concurrency lease") { + r.eventEmitter.EmitMessage(repo, zap.WarnLevel, "QueueLeaseAcquireTimeout", + "timed out acquiring the lease-backed concurrency lock for repository "+queuepkg.RepoKey(repo)) + } return fmt.Errorf("failed to add to queue: %s: %w", pr.GetName(), err) } + logger.Debugf("queue acquisition for repository %s returned candidates %v", repo.GetName(), acquired) if len(acquired) == 0 { logger.Infof("no new PipelineRun acquired for repo %s", repo.GetName()) break } for _, prKeys := range acquired { + logger.Debugf("attempting to promote queued pipelinerun %s for repository %s", prKeys, repo.GetName()) + if r.eventEmitter != nil { + r.eventEmitter.EmitMessage(repo, zap.InfoLevel, "QueueClaimedForPromotion", + "claimed queued PipelineRun "+prKeys+" for promotion in repository "+queuepkg.RepoKey(repo)) + } nsName := strings.Split(prKeys, "/") - repoKey := queuepkg.RepoKey(repo) pr, err = r.run.Clients.Tekton.TektonV1().PipelineRuns(nsName[0]).Get(ctx, nsName[1], metav1.GetOptions{}) if err != nil { logger.Info("failed to get pr with namespace and name: ", nsName[0], nsName[1]) - _ = r.qm.RemoveFromQueue(repoKey, prKeys) + logger.Debugf("clearing queue claim for missing pipelinerun %s after failed fetch", prKeys) + _ = r.qm.RemoveFromQueue(ctx, repo, 
prKeys) } else { if err := r.updatePipelineRunToInProgress(ctx, logger, repo, pr); err != nil { + started, startedErr := r.pipelineRunReachedStartedState(ctx, pr) + if startedErr != nil { + logger.Warnf("failed to verify whether pipelineRun %s reached started state: %v", pr.GetName(), startedErr) + } + if started { + logger.Warnf("pipelineRun %s already reached started state despite promotion error: %v", pr.GetName(), err) + processed = true + continue + } logger.Errorf("failed to update pipelineRun to in_progress: %w", err) - _ = r.qm.RemoveFromQueue(repoKey, prKeys) - } else { - processed = true + logger.Debugf("recording queue promotion failure for pipelinerun %s after promotion error", queuepkg.PrKey(pr)) + retryErr := r.recordQueuePromotionFailure(ctx, logger, repo, pr, err) + if retryErr != nil { + return fmt.Errorf("failed to record queue promotion failure for %s after promotion error: %w", pr.GetName(), retryErr) + } + if r.eventEmitter != nil { + r.eventEmitter.EmitMessage(repo, zap.WarnLevel, "QueuePromotionFailed", + "failed to promote queued PipelineRun "+queuepkg.PrKey(pr)+": "+err.Error()) + } + logger.Debugf("removing queue claim for pipelinerun %s after promotion failure", prKeys) + _ = r.qm.RemoveFromQueue(ctx, repo, prKeys) + return fmt.Errorf("failed to update pipelineRun to in_progress: %w", err) } + logger.Debugf("successfully promoted queued pipelinerun %s for repository %s", queuepkg.PrKey(pr), repo.GetName()) + processed = true } } if processed { @@ -103,3 +143,96 @@ func (r *Reconciler) queuePipelineRun(ctx context.Context, logger *zap.SugaredLo } return nil } + +func (r *Reconciler) pipelineRunReachedStartedState(ctx context.Context, pr *tektonv1.PipelineRun) (bool, error) { + latest, err := r.run.Clients.Tekton.TektonV1().PipelineRuns(pr.Namespace).Get(ctx, pr.Name, metav1.GetOptions{}) + if err != nil { + return false, err + } + started := latest.GetAnnotations()[keys.State] == kubeinteraction.StateStarted + 
r.run.Clients.Log.Debugf("checked latest state for pipelinerun %s/%s while handling queue promotion: state=%s started=%t", + pr.Namespace, pr.Name, latest.GetAnnotations()[keys.State], started) + return started, nil +} + +func (r *Reconciler) recordQueuePromotionFailure( + ctx context.Context, + logger *zap.SugaredLogger, + repo *pacAPIv1alpha1.Repository, + pr *tektonv1.PipelineRun, + cause error, +) error { + retries := 0 + if current := pr.GetAnnotations()[keys.QueuePromotionRetries]; current != "" { + parsed, err := strconv.Atoi(current) + if err == nil { + retries = parsed + } + } + retries++ + logger.Debugf( + "recording queue promotion failure for pipelinerun %s/%s with retry=%d cause=%v", + pr.Namespace, pr.Name, retries, cause, + ) + + annotations := map[string]any{ + keys.QueuePromotionRetries: strconv.Itoa(retries), + keys.QueuePromotionLastErr: cause.Error(), + } + snapshot := r.queueDebugSnapshotForPipelineRun(repo, pr, queuepkg.QueueDecisionPromotionFailed) + annotations[keys.QueueDecision] = snapshot.LastDecision + annotations[keys.QueueDebugSummary] = snapshot.Summary() + + _, err := action.PatchPipelineRun(ctx, logger, "queue promotion failure", r.run.Clients.Tekton, pr, map[string]any{ + "metadata": map[string]any{ + "annotations": annotations, + }, + }) + if err != nil { + return err + } + logger.Debugf("recorded queue promotion failure annotations for pipelinerun %s/%s", pr.Namespace, pr.Name) + return nil +} + +func (r *Reconciler) queueDebugSnapshotForPipelineRun( + repo *pacAPIv1alpha1.Repository, + pr *tektonv1.PipelineRun, + decision string, +) queuepkg.DebugSnapshot { + position := -1 + if index, ok := queuepkg.ExecutionOrderIndex(pr); ok { + position = index + 1 + } + + limit := -1 + if repo != nil && repo.Spec.ConcurrencyLimit != nil { + limit = *repo.Spec.ConcurrencyLimit + } + + claimedBy, claimAge := queuepkg.LeaseQueueClaimInfo(pr, time.Now()) + repoKey := "" + if repo != nil { + repoKey = queuepkg.RepoKey(repo) + } + backend := 
settings.ConcurrencyBackendMemory + if r != nil && r.run != nil && r.run.Info.Pac != nil { + backend = r.run.Info.GetPacOpts().ConcurrencyBackend + if backend == "" { + backend = settings.ConcurrencyBackendMemory + } + } + + return queuepkg.DebugSnapshot{ + Backend: backend, + RepoKey: repoKey, + Position: position, + Running: -1, + Claimed: -1, + Queued: -1, + Limit: limit, + ClaimedBy: claimedBy, + ClaimAge: claimAge, + LastDecision: decision, + } +} diff --git a/pkg/reconciler/queue_pipelineruns_test.go b/pkg/reconciler/queue_pipelineruns_test.go index 945c9d1f6a..5764559d26 100644 --- a/pkg/reconciler/queue_pipelineruns_test.go +++ b/pkg/reconciler/queue_pipelineruns_test.go @@ -1,14 +1,18 @@ package reconciler import ( + "errors" "fmt" + "strings" "testing" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" pacv1alpha1 "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/v1alpha1" + "github.com/openshift-pipelines/pipelines-as-code/pkg/kubeinteraction" "github.com/openshift-pipelines/pipelines-as-code/pkg/params" "github.com/openshift-pipelines/pipelines-as-code/pkg/params/clients" "github.com/openshift-pipelines/pipelines-as-code/pkg/params/info" + queuepkg "github.com/openshift-pipelines/pipelines-as-code/pkg/queue" testclient "github.com/openshift-pipelines/pipelines-as-code/pkg/test/clients" testconcurrency "github.com/openshift-pipelines/pipelines-as-code/pkg/test/concurrency" tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" @@ -16,6 +20,8 @@ import ( zapobserver "go.uber.org/zap/zaptest/observer" "gotest.tools/v3/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + k8stesting "k8s.io/client-go/testing" rtesting "knative.dev/pkg/reconciler/testing" ) @@ -208,3 +214,152 @@ func TestQueuePipelineRun(t *testing.T) { }) } } + +func TestRecordQueuePromotionFailure(t *testing.T) { + observer, _ := zapobserver.New(zap.InfoLevel) + fakelogger := 
zap.New(observer).Sugar() + + tests := []struct { + name string + annotations map[string]string + wantRetries string + }{ + { + name: "first failure records retry metadata", + annotations: map[string]string{}, + wantRetries: "1", + }, + { + name: "later failures keep incrementing retries without blocking promotion", + annotations: map[string]string{ + keys.QueuePromotionRetries: "4", + }, + wantRetries: "5", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, _ := rtesting.SetupFakeContext(t) + repo := &pacv1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-repo", + Namespace: "test", + }, + Spec: pacv1alpha1.RepositorySpec{ + ConcurrencyLimit: func() *int { limit := 2; return &limit }(), + }, + } + pipelineRun := &tektonv1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test", + Name: "test-pr", + Annotations: tt.annotations, + }, + } + testData := testclient.Data{ + PipelineRuns: []*tektonv1.PipelineRun{pipelineRun}, + } + stdata, _ := testclient.SeedTestData(t, ctx, testData) + + r := &Reconciler{ + run: ¶ms.Run{ + Clients: clients.Clients{ + Tekton: stdata.Pipeline, + }, + }, + } + + err := r.recordQueuePromotionFailure(ctx, fakelogger, repo, pipelineRun, errors.New("cannot patch")) + assert.NilError(t, err) + + updatedPR, err := stdata.Pipeline.TektonV1().PipelineRuns("test").Get(ctx, "test-pr", metav1.GetOptions{}) + assert.NilError(t, err) + assert.Equal(t, updatedPR.GetAnnotations()[keys.QueuePromotionRetries], tt.wantRetries) + assert.Equal(t, updatedPR.GetAnnotations()[keys.QueuePromotionLastErr], "cannot patch") + assert.Equal(t, updatedPR.GetAnnotations()[keys.QueueDecision], queuepkg.QueueDecisionPromotionFailed) + assert.Assert(t, strings.Contains(updatedPR.GetAnnotations()[keys.QueueDebugSummary], "lastDecision="+queuepkg.QueueDecisionPromotionFailed)) + _, exists := updatedPR.GetAnnotations()[keys.QueuePromotionBlocked] + assert.Assert(t, !exists, "QueuePromotionBlocked should not be set 
when promotion fails") + }) + } +} + +func TestQueuePipelineRunStopsAfterSinglePromotionFailure(t *testing.T) { + observer, _ := zapobserver.New(zap.InfoLevel) + fakelogger := zap.New(observer).Sugar() + ctx, _ := rtesting.SetupFakeContext(t) + + pipelineRun := &tektonv1.PipelineRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "queued", + Namespace: "test", + Annotations: map[string]string{ + keys.ExecutionOrder: "test/queued", + keys.Repository: "test", + keys.State: kubeinteraction.StateQueued, + }, + }, + Spec: tektonv1.PipelineRunSpec{ + Status: tektonv1.PipelineRunSpecStatusPending, + }, + } + testRepo := &pacv1alpha1.Repository{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "test", + }, + Spec: pacv1alpha1.RepositorySpec{ + URL: randomURL, + }, + } + + testData := testclient.Data{ + Repositories: []*pacv1alpha1.Repository{testRepo}, + PipelineRuns: []*tektonv1.PipelineRun{pipelineRun}, + } + stdata, informers := testclient.SeedTestData(t, ctx, testData) + patchCalls := 0 + stdata.Pipeline.PrependReactor("patch", "pipelineruns", func(_ k8stesting.Action) (bool, runtime.Object, error) { + patchCalls++ + if patchCalls == 1 { + return true, nil, errors.New("boom") + } + return false, nil, nil + }) + + r := &Reconciler{ + qm: testconcurrency.TestQMI{ + RunningQueue: []string{"test/queued"}, + }, + repoLister: informers.Repository.Lister(), + run: ¶ms.Run{ + Info: info.Info{ + Kube: &info.KubeOpts{ + Namespace: "global", + }, + Controller: &info.ControllerInfo{}, + }, + Clients: clients.Clients{ + PipelineAsCode: stdata.PipelineAsCode, + Tekton: stdata.Pipeline, + Kube: stdata.Kube, + Log: fakelogger, + }, + }, + } + + err := r.queuePipelineRun(ctx, fakelogger, pipelineRun) + assert.ErrorContains(t, err, "failed to update pipelineRun to in_progress") + + updatedPR, getErr := stdata.Pipeline.TektonV1().PipelineRuns("test").Get(ctx, "queued", metav1.GetOptions{}) + assert.NilError(t, getErr) + assert.Equal(t, 
updatedPR.GetAnnotations()[keys.QueuePromotionRetries], "1") + assert.Assert(t, updatedPR.GetAnnotations()[keys.QueuePromotionLastErr] != "") + assert.Equal(t, updatedPR.GetAnnotations()[keys.QueueDecision], queuepkg.QueueDecisionPromotionFailed) + assert.Assert(t, strings.Contains(updatedPR.GetAnnotations()[keys.QueueDebugSummary], "lastDecision="+queuepkg.QueueDecisionPromotionFailed)) + assert.Assert(t, strings.Contains(updatedPR.GetAnnotations()[keys.QueuePromotionLastErr], "cannot update state")) + assert.Assert(t, strings.Contains(updatedPR.GetAnnotations()[keys.QueuePromotionLastErr], "boom")) + _, exists := updatedPR.GetAnnotations()[keys.QueuePromotionBlocked] + assert.Assert(t, !exists, "QueuePromotionBlocked should not be set when queuePipelineRun returns after a failed promotion") +} diff --git a/pkg/reconciler/reconciler.go b/pkg/reconciler/reconciler.go index a0528a3f35..7392b936b0 100644 --- a/pkg/reconciler/reconciler.go +++ b/pkg/reconciler/reconciler.go @@ -340,10 +340,16 @@ func (r *Reconciler) reportFinalStatus(ctx context.Context, logger *zap.SugaredL // remove pipelineRun from Queue and start the next one for { - next := r.qm.RemoveAndTakeItemFromQueue(repo, pr) + next := r.qm.RemoveAndTakeItemFromQueue(ctx, repo, pr) if next == "" { + logger.Debugf("no next queued pipelinerun available for repository %s after completing %s/%s", repo.GetName(), pr.Namespace, pr.Name) break } + logger.Debugf("attempting to promote next queued pipelinerun %s for repository %s after completing %s/%s", next, repo.GetName(), pr.Namespace, pr.Name) + if r.eventEmitter != nil { + r.eventEmitter.EmitMessage(repo, zap.InfoLevel, "QueueClaimedForPromotion", + "claimed queued PipelineRun "+next+" for promotion in repository "+queuepkg.RepoKey(repo)) + } key := strings.Split(next, "/") pr, err := r.run.Clients.Tekton.TektonV1().PipelineRuns(key[0]).Get(ctx, key[1], metav1.GetOptions{}) if err != nil { @@ -353,9 +359,15 @@ func (r *Reconciler) reportFinalStatus(ctx 
context.Context, logger *zap.SugaredL if err := r.updatePipelineRunToInProgress(ctx, logger, repo, pr); err != nil { logger.Errorf("failed to update status: %w", err) - _ = r.qm.RemoveFromQueue(queuepkg.RepoKey(repo), queuepkg.PrKey(pr)) + logger.Debugf("clearing queue claim for pipelinerun %s after failed promotion from finalizer path", queuepkg.PrKey(pr)) + _ = r.qm.RemoveFromQueue(ctx, repo, queuepkg.PrKey(pr)) + if r.eventEmitter != nil { + r.eventEmitter.EmitMessage(repo, zap.WarnLevel, "QueuePromotionFailed", + "failed to promote queued PipelineRun "+queuepkg.PrKey(pr)+": "+err.Error()) + } continue } + logger.Debugf("successfully promoted queued pipelinerun %s after completion of %s/%s", queuepkg.PrKey(pr), key[0], key[1]) break } @@ -458,9 +470,15 @@ func (r *Reconciler) initGitProviderClient(ctx context.Context, logger *zap.Suga func (r *Reconciler) updatePipelineRunState(ctx context.Context, logger *zap.SugaredLogger, pr *tektonv1.PipelineRun, state string) (*tektonv1.PipelineRun, error) { currentState := pr.GetAnnotations()[keys.State] logger.Infof("updating pipelineRun %v/%v state from %s to %s", pr.GetNamespace(), pr.GetName(), currentState, state) - annotations := map[string]string{ + annotations := map[string]any{ keys.State: state, } + if state != kubeinteraction.StateQueued { + for key, value := range queuepkg.LeaseQueueCleanupAnnotations() { + annotations[key] = value + } + logger.Debugf("clearing queue-only annotations for pipelinerun %s/%s while transitioning to %s", pr.GetNamespace(), pr.GetName(), state) + } if state == kubeinteraction.StateStarted { annotations[keys.SCMReportingPLRStarted] = "true" } diff --git a/pkg/reconciler/reconciler_test.go b/pkg/reconciler/reconciler_test.go index 00da0ced48..4fc55170d4 100644 --- a/pkg/reconciler/reconciler_test.go +++ b/pkg/reconciler/reconciler_test.go @@ -259,7 +259,14 @@ func TestUpdatePipelineRunState(t *testing.T) { Namespace: "test", Name: "test", Annotations: map[string]string{ - keys.State: 
kubeinteraction.StateQueued, + keys.State: kubeinteraction.StateQueued, + keys.QueueClaimedBy: "watcher-a", + keys.QueueClaimedAt: "2026-04-15T10:00:00Z", + keys.QueueDecision: "claim_active", + keys.QueueDebugSummary: "backend=lease", + keys.QueuePromotionRetries: "3", + keys.QueuePromotionBlocked: "true", + keys.QueuePromotionLastErr: "boom", }, }, Spec: tektonv1.PipelineRunSpec{ @@ -276,7 +283,14 @@ func TestUpdatePipelineRunState(t *testing.T) { Namespace: "test", Name: "test", Annotations: map[string]string{ - keys.State: kubeinteraction.StateStarted, + keys.State: kubeinteraction.StateStarted, + keys.QueueClaimedBy: "watcher-a", + keys.QueueClaimedAt: "2026-04-15T10:00:00Z", + keys.QueueDecision: "claim_active", + keys.QueueDebugSummary: "backend=lease", + keys.QueuePromotionRetries: "3", + keys.QueuePromotionBlocked: "true", + keys.QueuePromotionLastErr: "boom", }, }, Spec: tektonv1.PipelineRunSpec{}, @@ -315,6 +329,18 @@ func TestUpdatePipelineRunState(t *testing.T) { _, exists := updatedPR.GetAnnotations()[keys.SCMReportingPLRStarted] assert.Assert(t, !exists, "SCMReportingPLRStarted annotation should not exist for non-started states") } + for _, key := range []string{ + keys.QueueClaimedBy, + keys.QueueClaimedAt, + keys.QueueDecision, + keys.QueueDebugSummary, + keys.QueuePromotionRetries, + keys.QueuePromotionBlocked, + keys.QueuePromotionLastErr, + } { + _, exists := updatedPR.GetAnnotations()[key] + assert.Assert(t, !exists, "%s annotation should be removed when state is %s", key, tt.state) + } }) } } diff --git a/pkg/test/concurrency/concurrency.go b/pkg/test/concurrency/concurrency.go index 0b3a4cf09c..e4edca5b16 100644 --- a/pkg/test/concurrency/concurrency.go +++ b/pkg/test/concurrency/concurrency.go @@ -2,6 +2,7 @@ package concurrency import ( "context" + "time" pacv1alpha1 "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/v1alpha1" pacVersionedClient 
"github.com/openshift-pipelines/pipelines-as-code/pkg/generated/clientset/versioned" @@ -15,8 +16,11 @@ type TestQMI struct { } func (TestQMI) InitQueues(_ context.Context, _ tektonVersionedClient.Interface, _ pacVersionedClient.Interface) error { - // TODO implement me - panic("implement me") + return nil +} + +func (TestQMI) RecoveryInterval() time.Duration { + return 0 } func (TestQMI) RemoveRepository(_ *pacv1alpha1.Repository) { @@ -26,25 +30,22 @@ func (t TestQMI) QueuedPipelineRuns(_ *pacv1alpha1.Repository) []string { return t.QueuedPrs } -func (TestQMI) RunningPipelineRuns(_ *pacv1alpha1.Repository) []string { - // TODO implement me - panic("implement me") +func (t TestQMI) RunningPipelineRuns(_ *pacv1alpha1.Repository) []string { + return t.RunningQueue } -func (t TestQMI) AddListToRunningQueue(_ *pacv1alpha1.Repository, _ []string) ([]string, error) { +func (t TestQMI) AddListToRunningQueue(_ context.Context, _ *pacv1alpha1.Repository, _ []string) ([]string, error) { return t.RunningQueue, nil } func (TestQMI) AddToPendingQueue(_ *pacv1alpha1.Repository, _ []string) error { - // TODO implement me - panic("implement me") + return nil } -func (t TestQMI) RemoveFromQueue(_, _ string) bool { +func (t TestQMI) RemoveFromQueue(_ context.Context, _ *pacv1alpha1.Repository, _ string) bool { return false } -func (TestQMI) RemoveAndTakeItemFromQueue(_ *pacv1alpha1.Repository, _ *tektonv1.PipelineRun) string { - // TODO implement me - panic("implement me") +func (TestQMI) RemoveAndTakeItemFromQueue(_ context.Context, _ *pacv1alpha1.Repository, _ *tektonv1.PipelineRun) string { + return "" } diff --git a/pkg/test/concurrency/concurrency_test.go b/pkg/test/concurrency/concurrency_test.go new file mode 100644 index 0000000000..edc9897c6e --- /dev/null +++ b/pkg/test/concurrency/concurrency_test.go @@ -0,0 +1,27 @@ +package concurrency + +import ( + "context" + "testing" + + "gotest.tools/v3/assert" +) + +func TestTestQMIHelpers(t *testing.T) { + qm := TestQMI{ + 
QueuedPrs: []string{"test/queued"}, + RunningQueue: []string{"test/running"}, + } + + assert.NilError(t, qm.InitQueues(context.Background(), nil, nil)) + assert.DeepEqual(t, qm.QueuedPipelineRuns(nil), []string{"test/queued"}) + assert.DeepEqual(t, qm.RunningPipelineRuns(nil), []string{"test/running"}) + + running, err := qm.AddListToRunningQueue(context.Background(), nil, nil) + assert.NilError(t, err) + assert.DeepEqual(t, running, []string{"test/running"}) + + assert.NilError(t, qm.AddToPendingQueue(nil, []string{"test/pending"})) + assert.Equal(t, qm.RemoveFromQueue(context.Background(), nil, "test/running"), false) + assert.Equal(t, qm.RemoveAndTakeItemFromQueue(context.Background(), nil, nil), "") +} diff --git a/test/pkg/gitea/test.go b/test/pkg/gitea/test.go index 2fdaddc13a..be08b00169 100644 --- a/test/pkg/gitea/test.go +++ b/test/pkg/gitea/test.go @@ -78,24 +78,30 @@ func PostCommentOnPullRequest(t *testing.T, topt *TestOpts, body string) { func checkEvents(t *testing.T, events *corev1.EventList, topts *TestOpts) { t.Helper() - newEvents := make([]corev1.Event, 0) - // filter out events that are not related to the test like checking for cancelled pipelineruns - for i := len(events.Items) - 1; i >= 0; i-- { + unexpected := unexpectedEvents(events) + for i := range events.Items { topts.ParamsRun.Clients.Log.Infof("Reason is %s", events.Items[i].Reason) - if events.Items[i].Reason == "CancelInProgress" { - continue - } - newEvents = append(newEvents, events.Items[i]) } - if len(newEvents) > 0 { - topts.ParamsRun.Clients.Log.Infof("0 events expected in case of failure but got %d", len(newEvents)) - for _, em := range newEvents { + if len(unexpected) > 0 { + topts.ParamsRun.Clients.Log.Infof("0 warning events expected in case of failure but got %d", len(unexpected)) + for _, em := range unexpected { topts.ParamsRun.Clients.Log.Infof("Event: Reason: %s Type: %s ReportingInstance: %s Message: %s", em.Reason, em.Type, em.ReportingInstance, em.Message) } 
t.FailNow() } } +func unexpectedEvents(events *corev1.EventList) []corev1.Event { + unexpected := make([]corev1.Event, 0) + for i := len(events.Items) - 1; i >= 0; i-- { + if events.Items[i].Type == corev1.EventTypeNormal { + continue + } + unexpected = append(unexpected, events.Items[i]) + } + return unexpected +} + func AddLabelToIssue(t *testing.T, topt *TestOpts, label string) { var targetID int64 allLabels, _, err := topt.GiteaCNX.Client().ListRepoLabels(topt.Opts.Organization, topt.Opts.Repo, forgejo.ListLabelsOptions{})