diff --git a/Makefile b/Makefile index ae5918b..254a4c3 100755 --- a/Makefile +++ b/Makefile @@ -249,8 +249,7 @@ test-helm: ## Test Helm charts (lint, template, validate) --set image.repository=openshift-hyperfleet/hyperfleet-sentinel \ --set image.tag=latest \ --set config.resourceType=nodepools \ - --set config.pollInterval=10s \ - --set config.maxAgeReady=1h > /dev/null + --set config.pollInterval=10s > /dev/null @echo "Custom resource selector template OK" @echo "" @echo "All Helm chart tests passed!" diff --git a/README.md b/README.md index 9923c4c..5027d78 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # hyperfleet-sentinel -HyperFleet Sentinel Service - Kubernetes service that polls HyperFleet API, makes orchestration decisions, and publishes events. Features configurable max age intervals, horizontal sharding via SentinelConfig CRD, and broker abstraction (GCP Pub/Sub, RabbitMQ, Stub). Centralized reconciliation logic. +HyperFleet Sentinel Service - Kubernetes service that polls HyperFleet API, makes orchestration decisions, and publishes events. Features configurable CEL-based decision logic, horizontal sharding via SentinelConfig CRD, and broker abstraction (GCP Pub/Sub, RabbitMQ, Stub). Centralized reconciliation logic. 
## Development Setup @@ -125,8 +125,7 @@ Create a configuration file based on the examples in the `configs/` directory: | Field | Type | Default | Description | |-------|------|---------|-------------| | `poll_interval` | duration | `5s` | How often to poll the API for resource updates | -| `max_age_not_ready` | duration | `10s` | Max age interval for resources not ready | -| `max_age_ready` | duration | `30m` | Max age interval for ready resources | +| `message_decision` | object | See below | CEL-based decision logic (params + result) | | `hyperfleet_api.timeout` | duration | `5s` | Request timeout for API calls | | `resource_selector` | array | `[]` | Label selectors for filtering resources (enables sharding) | | `message_data` | map | `{}` | Template fields for CloudEvents data payload | @@ -231,8 +230,17 @@ This uses all defaults. Broker configuration is managed via `broker.yaml` or env ```yaml resource_type: clusters poll_interval: 5s -max_age_not_ready: 10s -max_age_ready: 30m + +message_decision: + params: + ref_time: 'condition("Ready").last_updated_time' + is_ready: 'condition("Ready").status == "True"' + has_ref_time: 'ref_time != ""' + is_new_resource: '!is_ready && resource.generation == 1' + generation_mismatch: 'resource.generation > condition("Ready").observed_generation' + ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")' + not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")' + result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced" resource_selector: - label: shard diff --git a/charts/README.md b/charts/README.md index af794aa..f5aacf6 100644 --- a/charts/README.md +++ b/charts/README.md @@ -80,8 +80,7 @@ The following table lists the configurable parameters of the Sentinel chart and |-----------|-------------|---------| | `config.resourceType` | Resource type to watch | `clusters` | | `config.pollInterval` | Polling 
interval | `5s` | -| `config.maxAgeNotReady` | Max age for not ready resources | `10s` | -| `config.maxAgeReady` | Max age for ready resources | `30m` | +| `config.messageDecision` | CEL-based decision logic (params + result) | See values.yaml | | `config.resourceSelector` | Resource selector for sharding | See values.yaml | | `config.hyperfleetApi.baseUrl` | HyperFleet API base URL | `http://hyperfleet-api:8000` | | `config.hyperfleetApi.timeout` | API timeout | `5s` | diff --git a/charts/templates/configmap.yaml b/charts/templates/configmap.yaml index 5efa0a3..3c3b356 100644 --- a/charts/templates/configmap.yaml +++ b/charts/templates/configmap.yaml @@ -9,8 +9,6 @@ data: # Sentinel configuration resource_type: {{ .Values.config.resourceType }} poll_interval: {{ .Values.config.pollInterval }} - max_age_not_ready: {{ .Values.config.maxAgeNotReady }} - max_age_ready: {{ .Values.config.maxAgeReady }} {{- if .Values.config.resourceSelector }} # Resource selector for horizontal sharding @@ -26,6 +24,18 @@ data: endpoint: {{ .Values.config.hyperfleetApi.baseUrl }} timeout: {{ .Values.config.hyperfleetApi.timeout }} + {{- if .Values.config.messageDecision }} + # Configurable CEL-based decision logic + message_decision: + {{- if .Values.config.messageDecision.params }} + params: + {{- range $name, $expr := .Values.config.messageDecision.params }} + {{ $name }}: {{ $expr | quote }} + {{- end }} + {{- end }} + result: {{ .Values.config.messageDecision.result | quote }} + {{- end }} + {{- if .Values.config.messageData }} # CloudEvents data payload configuration message_data: diff --git a/charts/values.yaml b/charts/values.yaml index 495a6d0..de639a9 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -84,11 +84,19 @@ config: # How often to poll the API for resource updates pollInterval: 5s - # Max age interval for resources not ready - maxAgeNotReady: 10s - - # Max age interval for ready resources - maxAgeReady: 30m + # Configurable CEL-based decision logic + # 
Params are named CEL expressions evaluated in dependency order. + # Result is a CEL boolean expression that determines whether to publish an event. + messageDecision: + params: + ref_time: 'condition("Ready").last_updated_time' + is_ready: 'condition("Ready").status == "True"' + has_ref_time: 'ref_time != ""' + is_new_resource: '!is_ready && resource.generation == 1' + generation_mismatch: 'resource.generation > condition("Ready").observed_generation' + ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")' + not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")' + result: 'is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced' # Resource selector for horizontal sharding # Deploy multiple Sentinel instances with different shard values diff --git a/cmd/sentinel/main.go b/cmd/sentinel/main.go index 3728f94..44055d7 100755 --- a/cmd/sentinel/main.go +++ b/cmd/sentinel/main.go @@ -217,7 +217,11 @@ func runServe( } log.Info(ctx, "Initialized HyperFleet client") - decisionEngine := engine.NewDecisionEngine(cfg.MaxAgeNotReady, cfg.MaxAgeReady) + decisionEngine, err := engine.NewDecisionEngine(cfg.MessageDecision) + if err != nil { + log.Errorf(ctx, "Failed to create decision engine: %v", err) + return fmt.Errorf("failed to create decision engine: %w", err) + } // Initialize broker metrics recorder // Broker metrics (messages_published_total, errors_total, etc.) are registered diff --git a/configs/dev-example.yaml b/configs/dev-example.yaml index cdccca0..40ac0cd 100644 --- a/configs/dev-example.yaml +++ b/configs/dev-example.yaml @@ -22,9 +22,18 @@ resource_type: clusters # Faster polling for development - see changes quickly. poll_interval: 2s -# Shorter max age intervals for development. -max_age_not_ready: 5s -max_age_ready: 2m +# Decision logic for publishing events (CEL expressions). +# Shorter debounce intervals for development. 
+message_decision: + params: + ref_time: 'condition("Ready").last_updated_time' + is_ready: 'condition("Ready").status == "True"' + has_ref_time: 'ref_time != ""' + is_new_resource: '!is_ready && resource.generation == 1' + generation_mismatch: 'resource.generation > condition("Ready").observed_generation' + ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("2m")' + not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("5s")' + result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced" # Messaging system type for OpenTelemetry tracing attributes. # Should match the actual message broker being used for accurate observability. @@ -51,11 +60,11 @@ hyperfleet_api: # # Available CEL variables: # resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...) -# reason — the decision reason string (e.g. "max_age_exceeded") +# reason — the decision reason string (e.g. "message decision matched") # # Examples: # field: "resource.id" — field access -# field: 'resource.status.ready ? "Ready" : "NotReady"' — conditional +# field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"' — conditional # field: '"constant"' — CEL string literal # field: "reason" — decision reason # diff --git a/configs/gcp-pubsub-example.yaml b/configs/gcp-pubsub-example.yaml index 6f957a3..0a81d71 100644 --- a/configs/gcp-pubsub-example.yaml +++ b/configs/gcp-pubsub-example.yaml @@ -18,13 +18,18 @@ resource_type: clusters # Accepts Go duration format: ns, us/µs, ms, s, m, h. poll_interval: 5s -# Max age interval for resources that are not ready. -# Resources in transitional states are re-checked more frequently. -max_age_not_ready: 10s - -# Max age interval for resources that are ready and stable. -# Stable resources are checked less frequently to reduce API load. -max_age_ready: 30m +# Decision logic for publishing events (CEL expressions). 
+# Uses condition-based checks with configurable thresholds. +message_decision: + params: + ref_time: 'condition("Ready").last_updated_time' + is_ready: 'condition("Ready").status == "True"' + has_ref_time: 'ref_time != ""' + is_new_resource: '!is_ready && resource.generation == 1' + generation_mismatch: 'resource.generation > condition("Ready").observed_generation' + ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")' + not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")' + result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced" # Messaging system type for OpenTelemetry tracing attributes. # Should match the actual message broker being used for accurate observability. @@ -57,11 +62,11 @@ hyperfleet_api: # # Available CEL variables: # resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...) -# reason — the decision reason string (e.g. "max_age_exceeded") +# reason — the decision reason string (e.g. "message decision matched") # # Examples: # field: "resource.id" — field access -# field: 'resource.status.ready ? "Ready" : "NotReady"' — conditional +# field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"' — conditional # field: '"constant"' — CEL string literal # field: "reason" — decision reason # diff --git a/configs/rabbitmq-example.yaml b/configs/rabbitmq-example.yaml index ff50c3a..e085d5a 100644 --- a/configs/rabbitmq-example.yaml +++ b/configs/rabbitmq-example.yaml @@ -18,13 +18,18 @@ resource_type: clusters # Accepts Go duration format: ns, us/µs, ms, s, m, h. poll_interval: 5s -# Max age interval for resources that are not ready. -# Resources in transitional states are re-checked more frequently. -max_age_not_ready: 10s - -# Max age interval for resources that are ready and stable. -# Stable resources are checked less frequently to reduce API load. 
-max_age_ready: 30m +# Decision logic for publishing events (CEL expressions). +# Uses condition-based checks with configurable thresholds. +message_decision: + params: + ref_time: 'condition("Ready").last_updated_time' + is_ready: 'condition("Ready").status == "True"' + has_ref_time: 'ref_time != ""' + is_new_resource: '!is_ready && resource.generation == 1' + generation_mismatch: 'resource.generation > condition("Ready").observed_generation' + ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")' + not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")' + result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced" # Messaging system type for OpenTelemetry tracing attributes. # Should match the actual message broker being used for accurate observability. @@ -57,11 +62,11 @@ hyperfleet_api: # # Available CEL variables: # resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...) -# reason — the decision reason string (e.g. "max_age_exceeded") +# reason — the decision reason string (e.g. "message decision matched") # # Examples: # field: "resource.id" — field access -# field: 'resource.status.ready ? "Ready" : "NotReady"' — conditional +# field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"' — conditional # field: '"constant"' — CEL string literal # field: "reason" — decision reason # diff --git a/docs/alerts.md b/docs/alerts.md index 12ae298..3621a11 100644 --- a/docs/alerts.md +++ b/docs/alerts.md @@ -156,11 +156,11 @@ labels: component: sentinel annotations: summary: "High resource skip ratio in Sentinel" - description: "{{ $value | humanizePercentage }} of resources are being skipped. This may indicate max_age configuration issues." + description: "{{ $value | humanizePercentage }} of resources are being skipped. This may indicate message_decision configuration issues." 
``` -**Impact**: May indicate max age intervals too long or adapter status update issues. +**Impact**: May indicate decision thresholds too restrictive or adapter status update issues. -**Response**: Review max age configuration and adapter health. +**Response**: Review message_decision configuration and adapter health. --- diff --git a/docs/metrics.md b/docs/metrics.md index 12bbadf..f6475cc 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -60,7 +60,7 @@ sum by (resource_type) (hyperfleet_sentinel_pending_resources) **Labels:** - `resource_type`: Type of resource - `resource_selector`: Label selector -- `reason`: Reason for publishing the event (e.g., `max_age_exceeded`, `generation_mismatch`) +- `reason`: Reason for publishing the event (e.g., `message decision matched`) **Use Cases:** - Monitor event publishing rate @@ -87,7 +87,7 @@ sum by (reason) (rate(hyperfleet_sentinel_events_published_total[5m])) **Labels:** - `resource_type`: Type of resource - `resource_selector`: Label selector -- `reason`: Reason for skipping (e.g., `within_max_age`, `generation_match`) +- `reason`: Reason for skipping (e.g., `message decision result is false`) **Use Cases:** - Monitor decision engine effectiveness diff --git a/docs/multi-instance-deployment.md b/docs/multi-instance-deployment.md index 3cf4147..9f19160 100644 --- a/docs/multi-instance-deployment.md +++ b/docs/multi-instance-deployment.md @@ -206,7 +206,7 @@ Memory: 64Mi + (5 × 32Mi) + 16Mi = 240Mi │ resource_selector: │ │ resource_selector: │ │ resource_selector: │ │ - label: region │ │ - label: region │ │ - label: region │ │ value: us-east │ │ value: us-west │ │ value: eu-west │ -│ max_age_ready=30m │ │ max_age_ready=1h │ │ max_age_ready=45m │ +│ ready_stale=30m │ │ ready_stale=1h │ │ ready_stale=45m │ └──────────┬──────────┘ └──────────┬──────────┘ └──────────┬──────────┘ │ │ │ │ ▼ │ diff --git a/docs/running-sentinel.md b/docs/running-sentinel.md index e4ba30b..5e346f7 100644 --- a/docs/running-sentinel.md +++ 
b/docs/running-sentinel.md @@ -533,8 +533,6 @@ gcloud projects remove-iam-policy-binding ${GCP_PROJECT} \ # sentinel-config.yaml resource_type: clusters poll_interval: 5s -max_age_not_ready: 10s -max_age_ready: 30m # Watch all clusters (no filtering) resource_selector: [] @@ -556,8 +554,6 @@ message_data: # sentinel-dev-config.yaml resource_type: clusters poll_interval: 10s # Slower polling for dev -max_age_not_ready: 30s # Longer intervals for dev -max_age_ready: 2h resource_selector: - label: environment diff --git a/docs/sentinel-operator-guide.md b/docs/sentinel-operator-guide.md index 817b16e..d85ea2d 100644 --- a/docs/sentinel-operator-guide.md +++ b/docs/sentinel-operator-guide.md @@ -142,45 +142,47 @@ Time-based reconciliation ensures **eventual consistency** by publishing events **How It Works:** -Sentinel uses two configurable max age intervals based on the resource's status (`Ready` condition): +Sentinel uses a configurable CEL-based decision engine (`message_decision`) that evaluates named parameters and a boolean result expression: -| Resource State | Default Interval | Rationale | -|----------------|------------------|----------------------------------------------------------------------------------------| -| **Not Ready** (`Ready` condition status = False) | `10s` | Faster reconciliation for transitional states requires more frequent checks to complete quickly | -| **Ready** (`Ready` condition status = True) | `30m` | Lower frequency for drift detection on stable resources to avoid excessive load | +| Default Check | CEL Expression | Rationale | +|---------------|---------------|-----------| +| **New resource** | `!is_ready && resource.generation == 1` | Always publish for never-processed resources | +| **Generation mismatch** | `resource.generation > condition("Ready").observed_generation` | Immediate reconciliation when spec changes | +| **Ready and stale** | `is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")` | Lower 
frequency drift detection on stable resources | +| **Not ready and debounced** | `!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")` | Faster reconciliation for transitional states | **Decision Logic:** -At this point, we know `status.last_updated` is not zero, so we can use it as the reference timestamp: +The `condition("Ready")` CEL function accesses the resource's status conditions. Named params are evaluated in dependency order (topological sort): -1. Determine max age interval based on ready status: - - If resource is ready (`Ready` condition status == True) → use `max_age_ready` (default: 30m) - - If resource is not ready (`Ready` condition status == False) → use `max_age_not_ready` (default: 10s) +1. Extract reference time: `ref_time = condition("Ready").last_updated_time` +2. Determine readiness: `is_ready = condition("Ready").status == "True"` +3. Guard against missing conditions: `has_ref_time = ref_time != ""` +4. Evaluate checks: `is_new_resource`, `generation_mismatch`, `ready_and_stale`, `not_ready_and_debounced` +5. Result: `is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced` -2. Calculate next event time: - - `next_event = last_updated + max_age_interval` - -3. 
Compare with current time: - - If `now >= next_event` → **Publish event** (reason: "max age exceeded") - - Otherwise → **Skip** (reason: "max age not exceeded") +If the result is `true` → **Publish event** (reason: "message decision matched") +Otherwise → **Skip** (reason: "message decision result is false") #### 2.1.4 Complete Decision Flow -The three reconciliation checks work together in priority order to determine when to publish events: +The four reconciliation checks work together in priority order to determine when to publish events: ```mermaid graph TD - START[Poll Resource] --> CHECK1{last_updated == zero?} - CHECK1 -->|Yes| PUB1[Publish: never processed] - CHECK1 -->|No| CHECK2{Generation > ObservedGeneration?} - CHECK2 -->|Yes| PUB2[Publish: generation changed] - CHECK2 -->|No| CHECK3{Resource Ready?} - CHECK3 -->|Yes| AGE1[Max Age = 30m] - CHECK3 -->|No| AGE2[Max Age = 10s] - AGE1 --> CHECK4{now >= last_updated + max_age?} - AGE2 --> CHECK4 - CHECK4 -->|Yes| PUB3[Publish: max age exceeded] - CHECK4 -->|No| SKIP[Skip: within max age] + START[Poll Resource] --> CHECK1{is_new_resource?
gen==1 && !ready} + CHECK1 -->|Yes| PUB1[Publish: new resource] + CHECK1 -->|No| CHECK1B{generation_mismatch?
gen > observed_gen} + CHECK1B -->|Yes| PUB1B[Publish: spec changed] + CHECK1B -->|No| CHECK2{has_ref_time?} + CHECK2 -->|No| SKIP[Skip: no condition data] + CHECK2 -->|Yes| CHECK3{is_ready?} + CHECK3 -->|Yes| CHECK4{"now - ref_time > 30m?"} + CHECK3 -->|No| CHECK5{"now - ref_time > 10s?"} + CHECK4 -->|Yes| PUB2[Publish: ready and stale] + CHECK4 -->|No| SKIP2[Skip: within threshold] + CHECK5 -->|Yes| PUB3[Publish: not ready, debounced] + CHECK5 -->|No| SKIP2 ``` **Key Takeaways:** @@ -189,6 +191,8 @@ graph TD - **State changes override max age** - Spec changes don't wait for intervals - **Max age is the fallback** - Ensures eventual consistency when nothing else triggers +> **Scalability note:** The CEL decision engine evaluates all resources matching the label selector in-memory each poll cycle. At large scale (thousands of resources), use `resource_selector` to shard resources across multiple Sentinel instances. A future `server_filters` config field is planned to allow server-side pre-filtering before CEL evaluation. + ### 2.2 Resource Filtering Resource filtering enables **horizontal scaling** by allowing operators to distribute resources across multiple Sentinel instances using label-based selectors. 
@@ -284,10 +288,8 @@ Sentinel uses YAML-based configuration with environment variable overrides for s # Required: Resource type to watch resource_type: clusters -# Optional: Polling and age intervals +# Optional: Polling interval poll_interval: 5s -max_age_not_ready: 10s -max_age_ready: 30m # Optional: Resource filtering resource_selector: @@ -329,8 +331,7 @@ These fields have sensible defaults and can be omitted: | Field | Type | Default | Description | |-------|------|---------|-------------| | `poll_interval` | duration | `5s` | How often to poll the API for resource updates | -| `max_age_not_ready` | duration | `10s` | Max age interval for not-ready resources | -| `max_age_ready` | duration | `30m` | Max age interval for ready resources | +| `message_decision` | object | See defaults | CEL-based decision logic (params + result expression) | | `hyperfleet_api.timeout` | duration | `5s` | Request timeout for API calls | | `resource_selector` | array | `[]` | Label selectors for filtering (empty = all resources) | | `message_data` | map | `{}` | CEL expressions for CloudEvents payload | @@ -385,18 +386,14 @@ message_data: # Nested field access cluster_id: "resource.owner_references.id" - # Conditionals (ternary operator) - ready_status: 'resource.status.ready ? "Ready" : "NotReady"' + # Conditionals (ternary operator) — use condition() function to access status conditions + ready_status: 'condition("Ready").status == "True" ? "Ready" : "NotReady"' # String literals (must use quotes inside CEL expression) source: '"hyperfleet-sentinel"' # Numeric/boolean literals generation: "resource.generation" - is_ready: "resource.status.ready" - - # Complex conditionals with CEL filter - ready_condition: 'resource.status.conditions.filter(c, c.type=="Ready")[0].value == "True" ? "Ready" : "NotReady"' # Nested objects owner_references: "resource.owner_references" @@ -572,10 +569,9 @@ Follow this checklist to ensure successful Sentinel deployment and operation. 
**Configure Reconciliation Parameters** -- [ ] Review and adjust polling intervals: +- [ ] Review and adjust polling and decision parameters: - `poll_interval` - How often to poll the HyperFleet API (default: `5s`) - - `max_age_not_ready` - Reconciliation interval for not-ready resources (default: `10s`) - - `max_age_ready` - Reconciliation interval for ready resources (default: `30m`) + - `message_decision` - CEL-based decision logic with configurable thresholds - Reference: [Optional Fields](#optional-fields) **Design CloudEvents Payload** @@ -692,7 +688,7 @@ For detailed deployment guidance, see [docs/running-sentinel.md](running-sentine | Symptom | Likely Cause | Solution | |------------------------------------------------------------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **Events not published, resources not found** | Resource selector mismatch | Verify `resource_selector` matches resource labels. Empty selector watches ALL resources. Check logs: `kubectl logs -n hyperfleet-system -l app.kubernetes.io/name=sentinel` | -| **Events not published, resources found but skipped** | Max age not exceeded | Normal behavior. Events publish when `generation > observed_generation` OR max age interval elapsed (`max_age_ready`: 30m, `max_age_not_ready`: 10s). | +| **Events not published, resources found but skipped** | Decision result is false | Normal behavior. Events publish when the CEL decision expression evaluates to true (new resource, ready and stale >30m, or not ready and debounced >10s by default). 
| | **API connection errors, DNS lookup fails** | Wrong service name or namespace | Verify endpoint format: `http://..svc.cluster.local:8000`. Check API is running: `kubectl get pods -n hyperfleet-system -l app=hyperfleet-api` | | **API returns 401 Unauthorized** | Missing authentication | Add auth headers to `hyperfleet_api` config if API requires authentication. | | **API returns 404 Not Found** | Wrong API version in path | Verify endpoint uses correct API version: `/api/v1/clusters` or `/api/hyperfleet/v1/clusters` | diff --git a/internal/client/client.go b/internal/client/client.go index 95c8d97..80a868a 100644 --- a/internal/client/client.go +++ b/internal/client/client.go @@ -87,13 +87,10 @@ type Resource struct { Generation int32 `json:"generation"` } -// ResourceStatus represents the status of a resource +// ResourceStatus represents the status of a resource. +// All status data is accessed through Conditions only. type ResourceStatus struct { - LastTransitionTime time.Time `json:"lastTransitionTime"` // Updates only when ready status changes - LastUpdated time.Time `json:"lastUpdated"` // Updates every time an adapter checks the resource - Conditions []Condition `json:"conditions,omitempty"` - ObservedGeneration int32 `json:"observedGeneration"` // The generation last processed by the adapter - Ready bool `json:"ready"` // True if Ready condition status is "True" + Conditions []Condition `json:"conditions,omitempty"` } // Condition represents a status condition @@ -341,7 +338,7 @@ func (c *HyperFleetClient) fetchClusters(ctx context.Context, searchParam string resource.Labels = *item.Labels } - // Convert conditions from OpenAPI model and extract Ready condition for ready status + // Convert conditions from OpenAPI model if len(item.Status.Conditions) > 0 { resource.Status.Conditions = make([]Condition, 0, len(item.Status.Conditions)) for _, cond := range item.Status.Conditions { @@ -359,14 +356,6 @@ func (c *HyperFleetClient) fetchClusters(ctx 
context.Context, searchParam string condition.Message = *cond.Message } resource.Status.Conditions = append(resource.Status.Conditions, condition) - - // Extract ready status and timing from the Ready condition - if cond.Type == "Ready" { - resource.Status.Ready = cond.Status == "True" - resource.Status.LastTransitionTime = cond.LastTransitionTime - resource.Status.LastUpdated = cond.LastUpdatedTime - resource.Status.ObservedGeneration = cond.ObservedGeneration - } } } @@ -468,7 +457,7 @@ func (c *HyperFleetClient) fetchNodePools(ctx context.Context, searchParam strin resource.OwnerReferences = ref } - // Convert conditions from OpenAPI model and extract Ready condition for ready status + // Convert conditions from OpenAPI model if len(item.Status.Conditions) > 0 { resource.Status.Conditions = make([]Condition, 0, len(item.Status.Conditions)) for _, cond := range item.Status.Conditions { @@ -486,14 +475,6 @@ func (c *HyperFleetClient) fetchNodePools(ctx context.Context, searchParam strin condition.Message = *cond.Message } resource.Status.Conditions = append(resource.Status.Conditions, condition) - - // Extract ready status and timing from the Ready condition - if cond.Type == "Ready" { - resource.Status.Ready = cond.Status == "True" - resource.Status.LastTransitionTime = cond.LastTransitionTime - resource.Status.LastUpdated = cond.LastUpdatedTime - resource.Status.ObservedGeneration = cond.ObservedGeneration - } } } diff --git a/internal/client/client_test.go b/internal/client/client_test.go index 32ed30c..c747532 100644 --- a/internal/client/client_test.go +++ b/internal/client/client_test.go @@ -106,8 +106,11 @@ func TestFetchResources_Success(t *testing.T) { if resources[0].Generation != 5 { t.Errorf("Expected generation 5, got %d", resources[0].Generation) } - if !resources[0].Status.Ready { - t.Errorf("Expected Ready=true, got %t", resources[0].Status.Ready) + if len(resources[0].Status.Conditions) == 0 { + t.Fatal("Expected at least one condition") + } + if 
resources[0].Status.Conditions[0].Status != "True" { + t.Errorf("Expected Ready condition status 'True', got %s", resources[0].Status.Conditions[0].Status) } } @@ -430,8 +433,11 @@ func TestFetchResources_NilStatus(t *testing.T) { t.Errorf("Expected cluster-2 (valid status), got %s", resources[0].ID) } - if !resources[0].Status.Ready { - t.Errorf("Expected Ready=true, got %t", resources[0].Status.Ready) + if len(resources[0].Status.Conditions) == 0 { + t.Fatal("Expected at least one condition") + } + if resources[0].Status.Conditions[0].Status != "True" { + t.Errorf("Expected Ready condition status 'True', got %s", resources[0].Status.Conditions[0].Status) } } @@ -618,8 +624,11 @@ func TestFetchResources_NodePools(t *testing.T) { if resources[0].Generation != 3 { t.Errorf("Expected generation 3 for nodepool, got %d", resources[0].Generation) } - if resources[0].Status.ObservedGeneration != 3 { - t.Errorf("Expected observed generation 3, got %d", resources[0].Status.ObservedGeneration) + if len(resources[0].Status.Conditions) == 0 { + t.Fatal("Expected at least one condition for nodepool") + } + if resources[0].Status.Conditions[0].ObservedGeneration != 3 { + t.Errorf("Expected observed generation 3, got %d", resources[0].Status.Conditions[0].ObservedGeneration) } if resources[0].OwnerReferences == nil { t.Fatal("Expected OwnerReferences to be set, got nil") diff --git a/internal/config/config.go b/internal/config/config.go index d2390f0..616bfb5 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -24,17 +24,24 @@ type LabelSelector struct { // LabelSelectorList is a list of label selectors type LabelSelectorList []LabelSelector +// MessageDecisionConfig represents configurable CEL-based decision logic. +// Params are named CEL expressions evaluated in dependency order. +// Result is a CEL expression that evaluates to a boolean. 
+type MessageDecisionConfig struct { + Params map[string]string `mapstructure:"params"` + Result string `mapstructure:"result"` +} + // SentinelConfig represents the Sentinel configuration type SentinelConfig struct { HyperFleetAPI *HyperFleetAPIConfig `mapstructure:"hyperfleet_api"` MessageData map[string]interface{} `mapstructure:"message_data"` + MessageDecision *MessageDecisionConfig `mapstructure:"message_decision"` ResourceType string `mapstructure:"resource_type"` Topic string `mapstructure:"topic"` MessagingSystem string `mapstructure:"messaging_system"` ResourceSelector LabelSelectorList `mapstructure:"resource_selector"` PollInterval time.Duration `mapstructure:"poll_interval"` - MaxAgeNotReady time.Duration `mapstructure:"max_age_not_ready"` - MaxAgeReady time.Duration `mapstructure:"max_age_ready"` } // HyperFleetAPIConfig defines the HyperFleet API client configuration @@ -58,13 +65,28 @@ func (ls LabelSelectorList) ToMap() map[string]string { return result } +// DefaultMessageDecision returns the default message_decision configuration +// that replicates the previous hardcoded max_age_not_ready/max_age_ready behavior. 
+func DefaultMessageDecision() *MessageDecisionConfig { + return &MessageDecisionConfig{ + Params: map[string]string{ + "ref_time": `condition("Ready").last_updated_time`, + "is_ready": `condition("Ready").status == "True"`, + "has_ref_time": `ref_time != ""`, + "is_new_resource": `!is_ready && resource.generation == 1`, + "generation_mismatch": `resource.generation > condition("Ready").observed_generation`, + "ready_and_stale": `is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")`, + "not_ready_and_debounced": `!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")`, + }, + Result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced", + } +} + // NewSentinelConfig creates a new configuration with defaults func NewSentinelConfig() *SentinelConfig { return &SentinelConfig{ // ResourceType is required and must be set in config file PollInterval: 5 * time.Second, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, ResourceSelector: []LabelSelector{}, // Empty means watch all resources HyperFleetAPI: &HyperFleetAPIConfig{ // Endpoint is required and must be set in config file @@ -108,6 +130,11 @@ func LoadConfig(configFile string) (*SentinelConfig, error) { } } + // Apply default message_decision if not configured + if cfg.MessageDecision == nil { + cfg.MessageDecision = DefaultMessageDecision() + } + // Override topic from environment variable if explicitly provided // Environment variable takes precedence over config file (including empty value to clear) if topic, ok := os.LookupEnv("BROKER_TOPIC"); ok { @@ -151,12 +178,12 @@ func (c *SentinelConfig) Validate() error { return fmt.Errorf("poll_interval must be positive") } - if c.MaxAgeNotReady <= 0 { - return fmt.Errorf("max_age_not_ready must be positive") + if c.MessageDecision == nil { + return fmt.Errorf("message_decision is required") } - if c.MaxAgeReady <= 0 { - return fmt.Errorf("max_age_ready must be positive") + if err 
:= c.MessageDecision.Validate(); err != nil {
+		return fmt.Errorf("message_decision: %w", err)
 	}
 
 	if c.MessageData == nil {
@@ -170,6 +197,115 @@ func (c *SentinelConfig) Validate() error {
 	return nil
 }
 
+// Validate validates the message decision configuration: the result
+// expression must be present, every param expression must be non-empty,
+// and the param dependency graph must be acyclic.
+func (md *MessageDecisionConfig) Validate() error {
+	if md.Result == "" {
+		return fmt.Errorf("result expression is required")
+	}
+
+	for name, expr := range md.Params {
+		if expr == "" {
+			return fmt.Errorf("param %q has empty expression", name)
+		}
+	}
+
+	// Check for circular dependencies
+	if _, err := md.TopologicalSort(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// TopologicalSort resolves param evaluation order based on inter-param dependencies.
+// Returns an ordered list of param names or an error if circular dependencies exist.
+func (md *MessageDecisionConfig) TopologicalSort() ([]string, error) {
+	// Build dependency graph: deps[A] = [B, C] means A's expression
+	// references B and C, so both must be evaluated before A.
+	deps := make(map[string][]string, len(md.Params))
+	for name := range md.Params {
+		deps[name] = nil
+	}
+
+	for name, expr := range md.Params {
+		for otherName := range md.Params {
+			if otherName != name && containsIdentifier(expr, otherName) {
+				deps[name] = append(deps[name], otherName)
+			}
+		}
+	}
+
+	// Kahn's algorithm: a param's in-degree is the number of dependencies
+	// it has, so params with no dependencies are evaluated first.
+	inDegree := make(map[string]int, len(md.Params))
+	for name := range md.Params {
+		inDegree[name] = len(deps[name])
+	}
+
+	var queue []string
+	for name, degree := range inDegree {
+		if degree == 0 {
+			queue = append(queue, name)
+		}
+	}
+
+	var sorted []string
+	for len(queue) > 0 {
+		node := queue[0]
+		queue = queue[1:]
+		sorted = append(sorted, node)
+
+		// For each param that depends on this node, decrease its in-degree
+		for other, dependencies := range deps {
+			for _, dep := range dependencies {
+				if dep == node {
+					inDegree[other]--
+					if inDegree[other] == 0 {
+						queue = append(queue, other)
+					}
+				}
+			}
+		}
+	}
+
+	// Any param not emitted is part of (or downstream of) a cycle.
+	if len(sorted) != len(md.Params) {
+		return nil, fmt.Errorf("circular dependency detected in params")
+	}
+
+	return sorted, nil
+}
+
+// containsIdentifier checks if an expression contains a reference to a param name.
+// Uses word boundary detection to avoid false positives (e.g., "is_ready" matching "is_ready_2").
+func containsIdentifier(expr, identifier string) bool { + idx := 0 + for { + pos := strings.Index(expr[idx:], identifier) + if pos == -1 { + return false + } + absPos := idx + pos + endPos := absPos + len(identifier) + + // Check word boundaries + validStart := absPos == 0 || !isIdentChar(expr[absPos-1]) + validEnd := endPos >= len(expr) || !isIdentChar(expr[endPos]) + + if validStart && validEnd { + return true + } + idx = absPos + 1 + } +} + +// isIdentChar returns true if the byte is a valid identifier character (letter, digit, underscore) +func isIdentChar(b byte) bool { + return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9') || b == '_' +} + // validateMessageDataLeaves recursively checks that every leaf value in a // message_data map is a non-empty string (CEL expression). nil values and // empty strings are rejected early so that the error is reported at config diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 9f165ac..c39a564 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -25,6 +25,16 @@ func createTempConfigFile(t *testing.T, content string) string { return configPath } +// newTestMessageDecision returns a valid message decision config for testing +func newTestMessageDecision() *MessageDecisionConfig { + return &MessageDecisionConfig{ + Params: map[string]string{ + "is_ready": `condition("Ready").status == "True"`, + }, + Result: "!is_ready", + } +} + // ============================================================================ // Loading & Parsing Tests // ============================================================================ @@ -46,12 +56,6 @@ func TestLoadConfig_ValidComplete(t *testing.T) { if cfg.PollInterval != 5*time.Second { t.Errorf("Expected poll_interval 5s, got %v", cfg.PollInterval) } - if cfg.MaxAgeNotReady != 10*time.Second { - t.Errorf("Expected max_age_not_ready 10s, got %v", cfg.MaxAgeNotReady) - } - if cfg.MaxAgeReady != 
30*time.Minute { - t.Errorf("Expected max_age_ready 30m, got %v", cfg.MaxAgeReady) - } // Verify resource selector if len(cfg.ResourceSelector) != 2 { @@ -66,6 +70,17 @@ func TestLoadConfig_ValidComplete(t *testing.T) { t.Errorf("Expected timeout 5s, got %v", cfg.HyperFleetAPI.Timeout) } + // Verify message_decision + if cfg.MessageDecision == nil { + t.Fatal("Expected message_decision to be set") + } + if cfg.MessageDecision.Result == "" { + t.Error("Expected message_decision.result to be set") + } + if len(cfg.MessageDecision.Params) != 7 { + t.Errorf("Expected 7 message_decision params, got %d", len(cfg.MessageDecision.Params)) + } + // Verify message data if len(cfg.MessageData) != 3 { t.Errorf("Expected 3 message_data fields, got %d", len(cfg.MessageData)) @@ -91,11 +106,16 @@ func TestLoadConfig_Minimal(t *testing.T) { if cfg.PollInterval != 5*time.Second { t.Errorf("Expected default poll_interval 5s, got %v", cfg.PollInterval) } - if cfg.MaxAgeNotReady != 10*time.Second { - t.Errorf("Expected default max_age_not_ready 10s, got %v", cfg.MaxAgeNotReady) + + // Verify default message_decision was applied + if cfg.MessageDecision == nil { + t.Fatal("Expected default message_decision to be applied") + } + if cfg.MessageDecision.Result == "" { + t.Error("Expected default message_decision.result to be set") } - if cfg.MaxAgeReady != 30*time.Minute { - t.Errorf("Expected default max_age_ready 30m, got %v", cfg.MaxAgeReady) + if len(cfg.MessageDecision.Params) != 7 { + t.Errorf("Expected 7 default message_decision params, got %d", len(cfg.MessageDecision.Params)) } } @@ -144,14 +164,8 @@ func TestNewSentinelConfig_Defaults(t *testing.T) { if cfg.PollInterval != 5*time.Second { t.Errorf("Expected default poll_interval 5s, got %v", cfg.PollInterval) } - if cfg.MaxAgeNotReady != 10*time.Second { - t.Errorf("Expected default max_age_not_ready 10s, got %v", cfg.MaxAgeNotReady) - } - if cfg.MaxAgeReady != 30*time.Minute { - t.Errorf("Expected default max_age_ready 30m, got 
%v", cfg.MaxAgeReady) - } if cfg.HyperFleetAPI.Timeout != 5*time.Second { - t.Errorf("Expected default timeout 30s, got %v", cfg.HyperFleetAPI.Timeout) + t.Errorf("Expected default timeout 5s, got %v", cfg.HyperFleetAPI.Timeout) } // Endpoint has no default - must be set in config file if cfg.HyperFleetAPI.Endpoint != "" { @@ -163,6 +177,10 @@ func TestNewSentinelConfig_Defaults(t *testing.T) { if cfg.MessageData != nil { t.Errorf("Expected nil message_data, got %v", cfg.MessageData) } + // MessageDecision has no default in NewSentinelConfig - LoadConfig applies it + if cfg.MessageDecision != nil { + t.Errorf("Expected nil message_decision in NewSentinelConfig, got %v", cfg.MessageDecision) + } } // ============================================================================ @@ -185,7 +203,7 @@ func TestValidate_MissingResourceType(t *testing.T) { func TestValidate_MissingEndpoint(t *testing.T) { cfg := NewSentinelConfig() - cfg.ResourceType = testResourceType // Set valid resource_type to test endpoint validation + cfg.ResourceType = testResourceType cfg.HyperFleetAPI.Endpoint = "" err := cfg.Validate() @@ -233,6 +251,7 @@ func TestValidate_InvalidResourceTypes(t *testing.T) { cfg.ResourceType = tt.resourceType cfg.HyperFleetAPI.Endpoint = testAPIEndpoint cfg.MessageData = map[string]interface{}{"id": "resource.id"} + cfg.MessageDecision = newTestMessageDecision() err := cfg.Validate() if tt.shouldFail && err == nil { @@ -262,24 +281,13 @@ func TestValidate_NegativeDurations(t *testing.T) { c.PollInterval = 0 }, }, - { - name: "negative max_age_not_ready", - modifier: func(c *SentinelConfig) { - c.MaxAgeNotReady = -10 * time.Second - }, - }, - { - name: "zero max_age_ready", - modifier: func(c *SentinelConfig) { - c.MaxAgeReady = 0 - }, - }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { cfg := NewSentinelConfig() cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageDecision = newTestMessageDecision() tt.modifier(cfg) err := cfg.Validate() 
@@ -345,6 +353,7 @@ func TestValidate_ValidMessageDataFlat(t *testing.T) { cfg := NewSentinelConfig() cfg.ResourceType = testResourceType cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageDecision = newTestMessageDecision() cfg.MessageData = map[string]interface{}{ "id": "resource.id", "kind": "resource.kind", @@ -360,6 +369,7 @@ func TestValidate_ValidMessageDataNested(t *testing.T) { cfg := NewSentinelConfig() cfg.ResourceType = testResourceType cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageDecision = newTestMessageDecision() cfg.MessageData = map[string]interface{}{ "origin": `"sentinel"`, "ref": map[string]interface{}{ @@ -377,6 +387,7 @@ func TestValidate_NilMessageData(t *testing.T) { cfg := NewSentinelConfig() cfg.ResourceType = testResourceType cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageDecision = newTestMessageDecision() // MessageData is nil by default — message_data is required so this must fail if err := cfg.Validate(); err == nil { @@ -385,10 +396,10 @@ func TestValidate_NilMessageData(t *testing.T) { } func TestValidate_NilLeafInMessageData(t *testing.T) { - // Mirrors YAML: `id:` — viper may drop the key, but if it doesn't the nil leaf must be rejected. cfg := NewSentinelConfig() cfg.ResourceType = testResourceType cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageDecision = newTestMessageDecision() cfg.MessageData = map[string]interface{}{ "id": nil, "kind": "resource.kind", @@ -400,10 +411,10 @@ func TestValidate_NilLeafInMessageData(t *testing.T) { } func TestValidate_EmptyStringLeafInMessageData(t *testing.T) { - // Mirrors YAML: `id: ""` — an explicitly-set empty CEL expression. 
cfg := NewSentinelConfig() cfg.ResourceType = testResourceType cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageDecision = newTestMessageDecision() cfg.MessageData = map[string]interface{}{ "id": "", "kind": "resource.kind", @@ -415,10 +426,10 @@ func TestValidate_EmptyStringLeafInMessageData(t *testing.T) { } func TestValidate_NilLeafInNestedMessageData(t *testing.T) { - // Ensures the recursive check reaches nested objects. cfg := NewSentinelConfig() cfg.ResourceType = testResourceType cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageDecision = newTestMessageDecision() cfg.MessageData = map[string]interface{}{ "ref": map[string]interface{}{ "id": nil, @@ -431,15 +442,202 @@ func TestValidate_NilLeafInNestedMessageData(t *testing.T) { } } +// ============================================================================ +// Message Decision Validation Tests +// ============================================================================ + +func TestValidate_MissingMessageDecision(t *testing.T) { + cfg := NewSentinelConfig() + cfg.ResourceType = testResourceType + cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageData = map[string]interface{}{"id": "resource.id"} + cfg.MessageDecision = nil + + if err := cfg.Validate(); err == nil { + t.Error("Expected error for nil message_decision, got nil") + } +} + +func TestValidate_EmptyResultExpression(t *testing.T) { + cfg := NewSentinelConfig() + cfg.ResourceType = testResourceType + cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageData = map[string]interface{}{"id": "resource.id"} + cfg.MessageDecision = &MessageDecisionConfig{ + Params: map[string]string{}, + Result: "", + } + + err := cfg.Validate() + if err == nil { + t.Error("Expected error for empty result expression, got nil") + } +} + +func TestValidate_EmptyParamExpression(t *testing.T) { + cfg := NewSentinelConfig() + cfg.ResourceType = testResourceType + cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageData = 
map[string]interface{}{"id": "resource.id"} + cfg.MessageDecision = &MessageDecisionConfig{ + Params: map[string]string{ + "is_ready": "", + }, + Result: "is_ready", + } + + err := cfg.Validate() + if err == nil { + t.Error("Expected error for empty param expression, got nil") + } +} + +func TestValidate_CircularDependency(t *testing.T) { + cfg := NewSentinelConfig() + cfg.ResourceType = testResourceType + cfg.HyperFleetAPI.Endpoint = testAPIEndpoint + cfg.MessageData = map[string]interface{}{"id": "resource.id"} + cfg.MessageDecision = &MessageDecisionConfig{ + Params: map[string]string{ + "a": "b", + "b": "a", + }, + Result: "a", + } + + err := cfg.Validate() + if err == nil { + t.Error("Expected error for circular dependency, got nil") + } + if !strings.Contains(err.Error(), "circular dependency") { + t.Errorf("Expected circular dependency error, got: %v", err) + } +} + +// ============================================================================ +// Topological Sort Tests +// ============================================================================ + +func TestTopologicalSort_NoDependencies(t *testing.T) { + md := &MessageDecisionConfig{ + Params: map[string]string{ + "a": `condition("Ready").status`, + "b": `condition("Available").status`, + }, + Result: "a == b", + } + + order, err := md.TopologicalSort() + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + if len(order) != 2 { + t.Fatalf("Expected 2 params, got %d", len(order)) + } +} + +func TestTopologicalSort_LinearDependency(t *testing.T) { + md := &MessageDecisionConfig{ + Params: map[string]string{ + "is_ready": `condition("Ready").status == "True"`, + "should_publish": `!is_ready`, + }, + Result: "should_publish", + } + + order, err := md.TopologicalSort() + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + if len(order) != 2 { + t.Fatalf("Expected 2 params, got %d", len(order)) + } + + // is_ready must come before should_publish + isReadyIdx := -1 + 
shouldPublishIdx := -1 + for i, name := range order { + if name == "is_ready" { + isReadyIdx = i + } + if name == "should_publish" { + shouldPublishIdx = i + } + } + if isReadyIdx >= shouldPublishIdx { + t.Errorf("Expected is_ready (%d) before should_publish (%d)", isReadyIdx, shouldPublishIdx) + } +} + +func TestTopologicalSort_CircularDependency(t *testing.T) { + md := &MessageDecisionConfig{ + Params: map[string]string{ + "a": "b && true", + "b": "c || false", + "c": "a", + }, + Result: "a", + } + + _, err := md.TopologicalSort() + if err == nil { + t.Error("Expected error for circular dependency, got nil") + } +} + +func TestTopologicalSort_EmptyParams(t *testing.T) { + md := &MessageDecisionConfig{ + Params: map[string]string{}, + Result: "true", + } + + order, err := md.TopologicalSort() + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + if len(order) != 0 { + t.Errorf("Expected 0 params, got %d", len(order)) + } +} + +// ============================================================================ +// containsIdentifier Tests +// ============================================================================ + +func TestContainsIdentifier(t *testing.T) { + tests := []struct { + name string + expr string + identifier string + want bool + }{ + {"exact match", "is_ready", "is_ready", true}, + {"in expression", "!is_ready && x", "is_ready", true}, + {"prefix match should fail", "is_ready_2", "is_ready", false}, + {"suffix match should fail", "not_is_ready", "is_ready", false}, + {"substring in middle", "foo_is_ready_bar", "is_ready", false}, + {"not present", "something_else", "is_ready", false}, + {"at start with operator", "is_ready || other", "is_ready", true}, + {"at end with operator", "other && is_ready", "is_ready", true}, + {"in parentheses", "(is_ready)", "is_ready", true}, + {"with negation", "!is_ready", "is_ready", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := containsIdentifier(tt.expr, 
tt.identifier) + if got != tt.want { + t.Errorf("containsIdentifier(%q, %q) = %v, want %v", tt.expr, tt.identifier, got, tt.want) + } + }) + } +} + // ============================================================================ // Integration-like Test with Full Config // ============================================================================ func TestLoadConfig_BlankMessageDataLeafReturnsError(t *testing.T) { - // A blank leaf (e.g. `id:`) in message_data is decoded as nil by the YAML - // parser. mapstructure then silently drops nil-valued keys during Unmarshal, - // so the key disappears from cfg.MessageData before Validate() runs. - // LoadConfig must catch this via the raw viper value. _, err := LoadConfig(filepath.Join("testdata", "message-data-blank-id.yaml")) if err == nil { t.Fatal("expected error for blank message_data leaf, got nil") @@ -471,6 +669,14 @@ func TestLoadConfig_FullWorkflow(t *testing.T) { if len(md) != 4 { t.Errorf("Expected 4 message_data fields, got %d", len(md)) } + + // Verify message_decision is loaded from file + if cfg.MessageDecision == nil { + t.Fatal("Expected message_decision to be set") + } + if len(cfg.MessageDecision.Params) != 7 { + t.Errorf("Expected 7 message_decision params, got %d", len(cfg.MessageDecision.Params)) + } } // ============================================================================ @@ -478,7 +684,6 @@ func TestLoadConfig_FullWorkflow(t *testing.T) { // ============================================================================ func TestLoadConfig_TopicFromEnvVar(t *testing.T) { - // Set environment variable (t.Setenv auto-cleans after test) t.Setenv("BROKER_TOPIC", "test-namespace-clusters") configPath := filepath.Join("testdata", "minimal.yaml") @@ -494,10 +699,8 @@ func TestLoadConfig_TopicFromEnvVar(t *testing.T) { } func TestLoadConfig_TopicEnvVarOverridesConfig(t *testing.T) { - // Set environment variable (t.Setenv auto-cleans after test) t.Setenv("BROKER_TOPIC", "env-topic") - // Create 
config with topic set yaml := ` resource_type: clusters hyperfleet_api: @@ -513,14 +716,12 @@ message_data: t.Fatalf("Expected no error, got: %v", err) } - // Environment variable should override config file if cfg.Topic != "env-topic" { t.Errorf("Expected topic 'env-topic' (from env), got '%s'", cfg.Topic) } } func TestLoadConfig_TopicFromConfigFile(t *testing.T) { - // Save and restore original value, then unset for test origValue, wasSet := os.LookupEnv("BROKER_TOPIC") if wasSet { defer func() { _ = os.Setenv("BROKER_TOPIC", origValue) }() @@ -548,7 +749,6 @@ message_data: } func TestLoadConfig_TopicEmpty(t *testing.T) { - // Save and restore original value, then unset for test origValue, wasSet := os.LookupEnv("BROKER_TOPIC") if wasSet { defer func() { _ = os.Setenv("BROKER_TOPIC", origValue) }() @@ -562,18 +762,14 @@ func TestLoadConfig_TopicEmpty(t *testing.T) { t.Fatalf("Expected no error, got: %v", err) } - // Topic should be empty when not configured if cfg.Topic != "" { t.Errorf("Expected empty topic, got '%s'", cfg.Topic) } } func TestLoadConfig_TopicEnvVarEmptyClearsConfig(t *testing.T) { - // Set environment variable to empty string (explicitly clears config value) - // t.Setenv auto-cleans after test t.Setenv("BROKER_TOPIC", "") - // Create config with topic set yaml := ` resource_type: clusters hyperfleet_api: @@ -589,7 +785,6 @@ message_data: t.Fatalf("Expected no error, got: %v", err) } - // Empty env var should clear the config value (using os.LookupEnv) if cfg.Topic != "" { t.Errorf("Expected empty topic (cleared by env var), got '%s'", cfg.Topic) } diff --git a/internal/config/testdata/full-workflow.yaml b/internal/config/testdata/full-workflow.yaml index ce8c07b..0002773 100644 --- a/internal/config/testdata/full-workflow.yaml +++ b/internal/config/testdata/full-workflow.yaml @@ -1,7 +1,5 @@ resource_type: nodepools poll_interval: 3s -max_age_not_ready: 5s -max_age_ready: 15m resource_selector: - label: cluster-id @@ -13,6 +11,17 @@ 
hyperfleet_api: endpoint: http://hyperfleet-api.hyperfleet-system.svc.cluster.local:8080 timeout: 20s +message_decision: + params: + ref_time: 'condition("Ready").last_updated_time' + is_ready: 'condition("Ready").status == "True"' + has_ref_time: 'ref_time != ""' + is_new_resource: '!is_ready && resource.generation == 1' + generation_mismatch: 'resource.generation > condition("Ready").observed_generation' + ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("15m")' + not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("5s")' + result: 'is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced' + message_data: id: "resource.id" kind: "resource.kind" diff --git a/internal/config/testdata/message-data-blank-id.yaml b/internal/config/testdata/message-data-blank-id.yaml index 7b9c69a..51b60dc 100644 --- a/internal/config/testdata/message-data-blank-id.yaml +++ b/internal/config/testdata/message-data-blank-id.yaml @@ -2,8 +2,16 @@ # it has a nil in the message_data.id property resource_type: clusters poll_interval: 2s -max_age_not_ready: 5s -max_age_ready: 2m +message_decision: + params: + ref_time: 'condition("Ready").last_updated_time' + is_ready: 'condition("Ready").status == "True"' + has_ref_time: 'ref_time != ""' + is_new_resource: '!is_ready && resource.generation == 1' + generation_mismatch: 'resource.generation > condition("Ready").observed_generation' + ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("2m")' + not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("5s")' + result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced" hyperfleet_api: endpoint: http://localhost:8000 timeout: 10s diff --git a/internal/config/testdata/valid-complete.yaml b/internal/config/testdata/valid-complete.yaml index 62dee5f..a4b8c2b 100644 --- 
a/internal/config/testdata/valid-complete.yaml +++ b/internal/config/testdata/valid-complete.yaml @@ -1,7 +1,5 @@ resource_type: clusters poll_interval: 5s -max_age_not_ready: 10s -max_age_ready: 30m resource_selector: - label: region @@ -13,6 +11,17 @@ hyperfleet_api: endpoint: https://api.hyperfleet.example.com timeout: 5s +message_decision: + params: + ref_time: 'condition("Ready").last_updated_time' + is_ready: 'condition("Ready").status == "True"' + has_ref_time: 'ref_time != ""' + is_new_resource: '!is_ready && resource.generation == 1' + generation_mismatch: 'resource.generation > condition("Ready").observed_generation' + ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")' + not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")' + result: 'is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced' + message_data: id: "resource.id" kind: "resource.kind" diff --git a/internal/engine/decision.go b/internal/engine/decision.go index e989fd1..2ef4431 100644 --- a/internal/engine/decision.go +++ b/internal/engine/decision.go @@ -2,125 +2,263 @@ package engine import ( "fmt" + "sync" "time" + "github.com/google/cel-go/cel" + "github.com/google/cel-go/common/types" + "github.com/google/cel-go/common/types/ref" "github.com/openshift-hyperfleet/hyperfleet-sentinel/internal/client" + "github.com/openshift-hyperfleet/hyperfleet-sentinel/internal/config" ) -// Decision reasons -const ( - ReasonMaxAgeExceeded = "max age exceeded" - ReasonGenerationChanged = "generation changed" - ReasonNeverProcessed = "never processed" - ReasonNilResource = "resource is nil" - ReasonZeroNow = "now time is zero" -) +// Decision represents the result of evaluating a resource +type Decision struct { + Reason string // Human-readable explanation for the decision + ShouldPublish bool // Indicates whether an event should be published for the resource +} // DecisionEngine evaluates 
whether a resource needs an event published +// using configurable CEL expressions. type DecisionEngine struct { - maxAgeNotReady time.Duration - maxAgeReady time.Duration + resultProg cel.Program + paramProgs map[string]cel.Program + conditionsLookup map[string]map[string]interface{} + paramOrder []string + mu sync.Mutex } -// NewDecisionEngine creates a new decision engine -func NewDecisionEngine(maxAgeNotReady, maxAgeReady time.Duration) *DecisionEngine { - return &DecisionEngine{ - maxAgeNotReady: maxAgeNotReady, - maxAgeReady: maxAgeReady, +// NewDecisionEngine creates a new CEL-based decision engine from a MessageDecisionConfig. +// All CEL expressions are compiled at creation time for fail-fast validation. +func NewDecisionEngine(cfg *config.MessageDecisionConfig) (*DecisionEngine, error) { + if cfg == nil { + return nil, fmt.Errorf("message_decision config is required") } -} -// Decision represents the result of evaluating a resource -type Decision struct { - Reason string // Human-readable explanation for the decision - ShouldPublish bool // Indicates whether an event should be published for the resource + // Resolve param evaluation order + paramOrder, err := cfg.TopologicalSort() + if err != nil { + return nil, fmt.Errorf("failed to resolve param dependencies: %w", err) + } + + de := &DecisionEngine{ + paramOrder: paramOrder, + } + + // Build CEL environment with all variables and the condition() function. + // The function declaration includes the implementation via FunctionBinding, + // which reads from the engine's conditionsLookup (updated per-evaluation). 
+ envOpts := []cel.EnvOption{ + cel.Variable("resource", cel.DynType), + cel.Variable("now", cel.TimestampType), + cel.Function("condition", + cel.Overload("condition_string_to_dyn", + []*cel.Type{cel.StringType}, + cel.DynType, + cel.UnaryBinding(func(val ref.Val) ref.Val { + name, ok := val.Value().(string) + if !ok { + return types.DefaultTypeAdapter.NativeToValue(zeroCondition()) + } + de.mu.Lock() + lookup := de.conditionsLookup + de.mu.Unlock() + if cond, exists := lookup[name]; exists { + return types.DefaultTypeAdapter.NativeToValue(cond) + } + return types.DefaultTypeAdapter.NativeToValue(zeroCondition()) + }), + ), + ), + } + + // Declare all param names as DynType variables for inter-param references + for name := range cfg.Params { + envOpts = append(envOpts, cel.Variable(name, cel.DynType)) + } + + env, err := cel.NewEnv(envOpts...) + if err != nil { + return nil, fmt.Errorf("failed to create CEL environment: %w", err) + } + + // Compile all param expressions + paramProgs := make(map[string]cel.Program, len(cfg.Params)) + for name, expr := range cfg.Params { + ast, issues := env.Compile(expr) + if issues != nil && issues.Err() != nil { + return nil, fmt.Errorf("failed to compile param %q expression %q: %w", name, expr, issues.Err()) + } + prg, prgErr := env.Program(ast) + if prgErr != nil { + return nil, fmt.Errorf("failed to create program for param %q: %w", name, prgErr) + } + paramProgs[name] = prg + } + + // Compile result expression + resultAST, issues := env.Compile(cfg.Result) + if issues != nil && issues.Err() != nil { + return nil, fmt.Errorf("failed to compile result expression %q: %w", cfg.Result, issues.Err()) + } + resultPrg, err := env.Program(resultAST) + if err != nil { + return nil, fmt.Errorf("failed to create result program: %w", err) + } + + de.paramProgs = paramProgs + de.resultProg = resultPrg + + return de, nil } // Evaluate determines if an event should be published for the resource. 
-// -// Decision Logic (in priority order): -// 1. Never-processed reconciliation: If status.LastUpdated is zero, publish immediately -// (resource has never been processed by any adapter) -// 2. Generation-based reconciliation: If resource.Generation > status.ObservedGeneration, -// publish immediately (spec has changed, adapter needs to reconcile) -// 3. Time-based reconciliation: If max age exceeded since last update, publish -// - Uses status.LastUpdated as reference timestamp -// -// Max Age Intervals: -// - Resources with Ready=true: maxAgeReady (default 30m) -// - Resources with Ready=false: maxAgeNotReady (default 10s) -// -// Adapter Contract: -// - Adapters MUST update status.LastUpdated on EVERY evaluation -// - Adapters MUST update status.ObservedGeneration to resource.Generation when processing -// - This prevents infinite event loops when adapters skip work due to unmet preconditions -// -// Returns a Decision indicating whether to publish and why. Returns ShouldPublish=false -// for invalid inputs (nil resource, zero now time). +// Returns a Decision indicating whether to publish and why. 
func (e *DecisionEngine) Evaluate(resource *client.Resource, now time.Time) Decision { - // Validate inputs if resource == nil { + return Decision{ShouldPublish: false, Reason: "resource is nil"} + } + if now.IsZero() { + return Decision{ShouldPublish: false, Reason: "now time is zero"} + } + + // Build resource map for CEL evaluation + resourceMap := resourceToMap(resource) + + // Update the conditions lookup for the condition() function binding + e.mu.Lock() + e.conditionsLookup = buildConditionsLookup(resource.Status.Conditions) + e.mu.Unlock() + + // Build base activation with resource and now + activation := map[string]interface{}{ + "resource": resourceMap, + "now": now, + } + + // Evaluate params in topological order + paramValues := make(map[string]interface{}, len(e.paramOrder)) + for _, name := range e.paramOrder { + prg := e.paramProgs[name] + + // Merge param values into activation for inter-param references + evalActivation := make(map[string]interface{}, len(activation)+len(paramValues)) + for k, v := range activation { + evalActivation[k] = v + } + for k, v := range paramValues { + evalActivation[k] = v + } + + out, _, err := prg.Eval(evalActivation) + if err != nil { + return Decision{ + ShouldPublish: false, + Reason: fmt.Sprintf("param %q evaluation failed: %v", name, err), + } + } + paramValues[name] = out.Value() + } + + // Evaluate result expression + resultActivation := make(map[string]interface{}, len(activation)+len(paramValues)) + for k, v := range activation { + resultActivation[k] = v + } + for k, v := range paramValues { + resultActivation[k] = v + } + + out, _, err := e.resultProg.Eval(resultActivation) + if err != nil { return Decision{ ShouldPublish: false, - Reason: ReasonNilResource, + Reason: fmt.Sprintf("result evaluation failed: %v", err), } } - if now.IsZero() { + shouldPublish, ok := out.Value().(bool) + if !ok { return Decision{ ShouldPublish: false, - Reason: ReasonZeroNow, + Reason: fmt.Sprintf("result expression did not 
return bool, got %T", out.Value()), } } - // Check if resource has never been processed by an adapter - // LastUpdated is zero means no adapter has updated the status yet - // This ensures first-time resources are published immediately - if resource.Status.LastUpdated.IsZero() { + if shouldPublish { return Decision{ ShouldPublish: true, - Reason: ReasonNeverProcessed, + Reason: "message decision matched", } } - // Check for generation mismatch - // This triggers immediate reconciliation regardless of max age - if resource.Generation > resource.Status.ObservedGeneration { - return Decision{ - ShouldPublish: true, - Reason: ReasonGenerationChanged, + return Decision{ + ShouldPublish: false, + Reason: "message decision result is false", + } +} + +// buildConditionsLookup creates a map from condition type name to condition data +// for use by the condition() CEL function. +func buildConditionsLookup(conditions []client.Condition) map[string]map[string]interface{} { + lookup := make(map[string]map[string]interface{}, len(conditions)) + for _, c := range conditions { + lookup[c.Type] = map[string]interface{}{ + "status": c.Status, + "observed_generation": int64(c.ObservedGeneration), + "last_updated_time": c.LastUpdatedTime.Format(time.RFC3339Nano), + "last_transition_time": c.LastTransitionTime.Format(time.RFC3339Nano), + "reason": c.Reason, + "message": c.Message, } } + return lookup +} - // Determine the reference timestamp for max age calculation - // At this point, we know LastUpdated is not zero (checked above) - // so we can use it directly for the max age calculation - referenceTime := resource.Status.LastUpdated +// zeroCondition returns a zero-value condition map for safe field access +// when a condition is not found. Time fields are empty strings so that +// CEL expressions can guard against missing conditions with `ref_time != ""`. 
+func zeroCondition() map[string]interface{} { + return map[string]interface{}{ + "status": "", + "observed_generation": int64(0), + "last_updated_time": "", + "last_transition_time": "", + "reason": "", + "message": "", + } +} - // Determine the appropriate max age based on resource ready status - var maxAge time.Duration - if resource.Status.Ready { - maxAge = e.maxAgeReady - } else { - maxAge = e.maxAgeNotReady +// resourceToMap converts a Resource into a plain map for CEL evaluation. +func resourceToMap(r *client.Resource) map[string]interface{} { + m := map[string]interface{}{ + "id": r.ID, + "href": r.Href, + "kind": r.Kind, + "created_time": r.CreatedTime.Format(time.RFC3339Nano), + "updated_time": r.UpdatedTime.Format(time.RFC3339Nano), + "generation": int64(r.Generation), } - // Calculate the next event time based on reference timestamp - // Adapters update LastUpdated on every check, enabling proper max age - // calculation even when resources stay in the same status - nextEventTime := referenceTime.Add(maxAge) + if len(r.Labels) > 0 { + labels := make(map[string]interface{}, len(r.Labels)) + for k, v := range r.Labels { + labels[k] = v + } + m["labels"] = labels + } - // Check if enough time has passed - if now.Before(nextEventTime) { - timeUntilNext := nextEventTime.Sub(now) - return Decision{ - ShouldPublish: false, - Reason: fmt.Sprintf("max age not exceeded (waiting %s)", timeUntilNext), + if r.OwnerReferences != nil { + m["owner_references"] = map[string]interface{}{ + "id": r.OwnerReferences.ID, + "href": r.OwnerReferences.Href, + "kind": r.OwnerReferences.Kind, } } - return Decision{ - ShouldPublish: true, - Reason: ReasonMaxAgeExceeded, + if r.Metadata != nil { + m["metadata"] = r.Metadata } + + return m } diff --git a/internal/engine/decision_test.go b/internal/engine/decision_test.go index 908fbd0..298d942 100644 --- a/internal/engine/decision_test.go +++ b/internal/engine/decision_test.go @@ -1,361 +1,392 @@ package engine import ( - "strings" 
"testing" "time" "github.com/openshift-hyperfleet/hyperfleet-sentinel/internal/client" + "github.com/openshift-hyperfleet/hyperfleet-sentinel/internal/config" ) -// Test helpers and factories +// Test helpers -// newTestResource creates a test resource with the given parameters -// This follows TRex pattern of using test factories for consistent test data -func newTestResource(ready bool, lastUpdated time.Time) *client.Resource { - return &client.Resource{ - ID: testResourceID, - Kind: testResourceKind, - Generation: 1, // Default generation - CreatedTime: time.Now().Add(-1 * time.Hour), // Default: created 1 hour ago - Status: client.ResourceStatus{ - Ready: ready, - LastUpdated: lastUpdated, - ObservedGeneration: 1, // Default: in sync with generation - }, +const ( + testResourceID = "test-cluster-1" + testResourceKind = "Cluster" +) + +// newDefaultDecisionConfig returns the default message decision config +// matching the sentinel architecture: ref_time, is_ready, is_new_resource, +// ready_and_stale, not_ready_and_debounced. +func newDefaultDecisionConfig() *config.MessageDecisionConfig { + return config.DefaultMessageDecision() +} + +// newTestDecisionEngine creates a decision engine with the default config. +func newTestDecisionEngine(t *testing.T) *DecisionEngine { + t.Helper() + cfg := newDefaultDecisionConfig() + engine, err := NewDecisionEngine(cfg) + if err != nil { + t.Fatalf("NewDecisionEngine failed: %v", err) } + return engine } -// newTestResourceWithCreatedTime creates a test resource with explicit created_time -func newTestResourceWithCreatedTime(id, kind string, ready bool, createdTime, lastUpdated time.Time) *client.Resource { +// newResourceWithCondition creates a test resource with a single "Ready" condition. 
+func newResourceWithCondition(status string, lastUpdated time.Time, generation int32) *client.Resource { return &client.Resource{ - ID: id, - Kind: kind, - Generation: 1, // Default generation - CreatedTime: createdTime, + ID: testResourceID, + Kind: testResourceKind, + Generation: generation, + CreatedTime: time.Now().Add(-1 * time.Hour), Status: client.ResourceStatus{ - Ready: ready, - LastUpdated: lastUpdated, - ObservedGeneration: 1, // Default: in sync with generation + Conditions: []client.Condition{ + { + Type: "Ready", + Status: status, + LastUpdatedTime: lastUpdated, + ObservedGeneration: generation, + }, + }, }, } } -// newTestResourceWithGeneration creates a test resource with explicit -// generation values -func newTestResourceWithGeneration( - id, kind string, - ready bool, - lastUpdated time.Time, - generation, observedGeneration int32, +// newResourceWithGenerationMismatch creates a test resource where generation > observed_generation. +func newResourceWithGenerationMismatch( + status string, lastUpdated time.Time, generation, observedGeneration int32, ) *client.Resource { return &client.Resource{ - ID: id, - Kind: kind, + ID: testResourceID, + Kind: testResourceKind, Generation: generation, - CreatedTime: time.Now().Add(-1 * time.Hour), // Default: created 1 hour ago + CreatedTime: time.Now().Add(-1 * time.Hour), Status: client.ResourceStatus{ - Ready: ready, - LastUpdated: lastUpdated, - ObservedGeneration: observedGeneration, + Conditions: []client.Condition{ + { + Type: "Ready", + Status: status, + LastUpdatedTime: lastUpdated, + ObservedGeneration: observedGeneration, + }, + }, }, } } -// newTestEngine creates a decision engine with standard test values -func newTestEngine() *DecisionEngine { - return NewDecisionEngine(testMaxAgeNotReady, testMaxAgeReady) -} - -// assertDecision verifies a decision matches expected values -func assertDecision(t *testing.T, got Decision, wantPublish bool, wantReasonContains string) { - t.Helper() - - if 
got.ShouldPublish != wantPublish { - t.Errorf("ShouldPublish = %v, want %v", got.ShouldPublish, wantPublish) - } - - if wantReasonContains != "" && !strings.Contains(got.Reason, wantReasonContains) { - t.Errorf("Reason = %q, want it to contain %q", got.Reason, wantReasonContains) - } - - if got.Reason == "" { - t.Error("Reason should never be empty") +// newResourceNoConditions creates a test resource with no conditions. +func newResourceNoConditions(generation int32) *client.Resource { + return &client.Resource{ + ID: testResourceID, + Kind: testResourceKind, + Generation: generation, + CreatedTime: time.Now().Add(-1 * time.Hour), + Status: client.ResourceStatus{}, } } -// Test constants -const ( - testMaxAgeNotReady = 10 * time.Second - testMaxAgeReady = 30 * time.Minute - testResourceID = "test-cluster-1" - testResourceKind = "Cluster" -) - func TestNewDecisionEngine(t *testing.T) { - engine := newTestEngine() - - if engine == nil { - t.Fatal("NewDecisionEngine returned nil") - } - - if engine.maxAgeNotReady != testMaxAgeNotReady { - t.Errorf("maxAgeNotReady = %v, want %v", engine.maxAgeNotReady, testMaxAgeNotReady) - } - - if engine.maxAgeReady != testMaxAgeReady { - t.Errorf("maxAgeReady = %v, want %v", engine.maxAgeReady, testMaxAgeReady) - } + t.Run("valid config", func(t *testing.T) { + engine, err := NewDecisionEngine(newDefaultDecisionConfig()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if engine == nil { + t.Fatal("engine is nil") + } + }) + + t.Run("nil config", func(t *testing.T) { + _, err := NewDecisionEngine(nil) + if err == nil { + t.Fatal("expected error for nil config") + } + }) + + t.Run("invalid CEL expression", func(t *testing.T) { + cfg := &config.MessageDecisionConfig{ + Params: map[string]string{ + "bad": "this is not valid CEL !!!", + }, + Result: "bad", + } + _, err := NewDecisionEngine(cfg) + if err == nil { + t.Fatal("expected error for invalid CEL expression") + } + }) + + t.Run("invalid result expression", func(t 
*testing.T) { + cfg := &config.MessageDecisionConfig{ + Params: map[string]string{}, + Result: "not valid !!!", + } + _, err := NewDecisionEngine(cfg) + if err == nil { + t.Fatal("expected error for invalid result expression") + } + }) + + t.Run("simple boolean result", func(t *testing.T) { + cfg := &config.MessageDecisionConfig{ + Params: map[string]string{}, + Result: "true", + } + engine, err := NewDecisionEngine(cfg) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if engine == nil { + t.Fatal("engine is nil") + } + }) } func TestDecisionEngine_Evaluate(t *testing.T) { now := time.Now() - engine := newTestEngine() + engine := newTestDecisionEngine(t) tests := []struct { - lastUpdated time.Time - now time.Time - name string - wantReasonContains string - description string - ready bool - wantShouldPublish bool + resource *client.Resource + now time.Time + name string + wantReason string + wantShouldPublish bool }{ - // Zero LastUpdated tests - should publish immediately as never processed - { - name: "zero LastUpdated - ready", - ready: true, - lastUpdated: time.Time{}, // Zero time - never processed - now: now, - wantShouldPublish: true, - wantReasonContains: ReasonNeverProcessed, - description: "Resources with zero LastUpdated should publish immediately (never processed)", - }, { - name: "zero LastUpdated - not ready", - ready: false, - lastUpdated: time.Time{}, // Zero time - never processed - now: now, - wantShouldPublish: true, - wantReasonContains: ReasonNeverProcessed, - description: "Resources with zero LastUpdated should publish immediately (never processed)", - }, - - // Not-ready resources (10s max age) - { - name: "not ready - max age exceeded", - ready: false, - lastUpdated: now.Add(-11 * time.Second), // 11s ago (> 10s max age) - now: now, - wantShouldPublish: true, - wantReasonContains: "max age exceeded", - description: "Not-ready resources with exceeded max age should publish", + name: "ready and stale - should publish", + 
resource: newResourceWithCondition("True", now.Add(-31*time.Minute), 2), + now: now, + wantShouldPublish: true, + wantReason: "message decision matched", }, { - name: "not ready - max age not exceeded", - ready: false, - lastUpdated: now.Add(-5 * time.Second), // 5s ago (< 10s max age) - now: now, - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "Not-ready resources within max age should not publish", + name: "ready and recent - should not publish", + resource: newResourceWithCondition("True", now.Add(-5*time.Minute), 2), + now: now, + wantShouldPublish: false, + wantReason: "message decision result is false", }, { - name: "not ready - max age exactly exceeded", - ready: false, - lastUpdated: now.Add(-10 * time.Second), // Exactly 10s ago - now: now, - wantShouldPublish: true, - wantReasonContains: "max age exceeded", - description: "Not-ready resources with exactly exceeded max age should publish", + name: "not ready and debounced - should publish", + resource: newResourceWithCondition("False", now.Add(-11*time.Second), 2), + now: now, + wantShouldPublish: true, + wantReason: "message decision matched", }, - - // Ready resources (30m max age) { - name: "ready - max age exceeded", - ready: true, - lastUpdated: now.Add(-31 * time.Minute), // 31m ago (> 30m max age) - now: now, - wantShouldPublish: true, - wantReasonContains: "max age exceeded", - description: "Ready resources with exceeded max age should publish", + name: "not ready and too recent - should not publish", + resource: newResourceWithCondition("False", now.Add(-3*time.Second), 2), + now: now, + wantShouldPublish: false, + wantReason: "message decision result is false", }, { - name: "ready - max age not exceeded", - ready: true, - lastUpdated: now.Add(-15 * time.Minute), // 15m ago (< 30m max age) - now: now, - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "Ready resources within max age should not publish", + name: "new 
resource (generation 1, not ready) - should publish", + resource: newResourceWithCondition("False", now, 1), + now: now, + wantShouldPublish: true, + wantReason: "message decision matched", }, { - name: "ready - max age exactly exceeded", - ready: true, - lastUpdated: now.Add(-30 * time.Minute), // Exactly 30m ago - now: now, - wantShouldPublish: true, - wantReasonContains: "max age exceeded", - description: "Ready resources with exactly exceeded max age should publish", + name: "generation mismatch (ready, recent) - should publish immediately", + resource: newResourceWithGenerationMismatch("True", now.Add(-1*time.Minute), 3, 2), + now: now, + wantShouldPublish: true, + wantReason: "message decision matched", }, - - // Edge cases { - name: "LastUpdated in future - ready", - ready: true, - lastUpdated: now.Add(1 * time.Hour), // 1 hour in the future - now: now, - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "Resources with LastUpdated in future should not publish (clock skew protection)", + name: "generation mismatch (not ready, recent) - should publish immediately", + resource: newResourceWithGenerationMismatch("False", now.Add(-1*time.Second), 5, 4), + now: now, + wantShouldPublish: true, + wantReason: "message decision matched", }, { - name: "LastUpdated in future - not ready", - ready: false, - lastUpdated: now.Add(1 * time.Minute), // 1 minute in the future - now: now, - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "Not-ready resources with LastUpdated in future should not publish", + name: "no generation mismatch (ready, recent) - should not publish", + resource: newResourceWithGenerationMismatch("True", now.Add(-1*time.Minute), 2, 2), + now: now, + wantShouldPublish: false, + wantReason: "message decision result is false", }, { - name: "LastUpdated very old - ready", - ready: true, - lastUpdated: now.Add(-24 * time.Hour), // 24 hours ago - now: now, - wantShouldPublish: true, 
- wantReasonContains: "max age exceeded", - description: "Very old resources should publish (max age long exceeded)", + name: "nil resource - should not publish", + resource: nil, + now: now, + wantShouldPublish: false, + wantReason: "resource is nil", }, { - name: "LastUpdated very recent - not ready", - ready: false, - lastUpdated: now.Add(-1 * time.Millisecond), // Just 1ms ago - now: now, - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "Very recent updates should not publish immediately", + name: "zero now time - should not publish", + resource: newResourceWithCondition("True", now, 2), + now: time.Time{}, + wantShouldPublish: false, + wantReason: "now time is zero", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - resource := newTestResource(tt.ready, tt.lastUpdated) - decision := engine.Evaluate(resource, tt.now) - - assertDecision(t, decision, tt.wantShouldPublish, tt.wantReasonContains) + decision := engine.Evaluate(tt.resource, tt.now) - // Additional context on failure - if t.Failed() { - t.Logf("Test description: %s", tt.description) + if decision.ShouldPublish != tt.wantShouldPublish { + t.Errorf("ShouldPublish = %v, want %v", decision.ShouldPublish, tt.wantShouldPublish) + } + if decision.Reason != tt.wantReason { + t.Errorf("Reason = %q, want %q", decision.Reason, tt.wantReason) } }) } } -// TestDecisionEngine_Evaluate_ZeroMaxAge tests edge case with zero max age intervals -func TestDecisionEngine_Evaluate_ZeroMaxAge(t *testing.T) { +func TestDecisionEngine_Evaluate_MissingCondition(t *testing.T) { now := time.Now() - - tests := []struct { - lastUpdated time.Time - name string - maxAgeNotReady time.Duration - maxAgeReady time.Duration - ready bool - wantShouldPublish bool - }{ - { - name: "zero maxAgeNotReady - not ready", - maxAgeNotReady: 0, - maxAgeReady: 30 * time.Minute, - ready: false, - lastUpdated: now, // Even with now, should publish due to zero max age - wantShouldPublish: true, 
- }, - { - name: "zero maxAgeReady - ready", - maxAgeNotReady: 10 * time.Second, - maxAgeReady: 0, - ready: true, - lastUpdated: now, // Even with now, should publish due to zero max age - wantShouldPublish: true, - }, - { - name: "both zero max ages - ready", - maxAgeNotReady: 0, - maxAgeReady: 0, - ready: true, - lastUpdated: now, - wantShouldPublish: true, - }, - { - name: "both zero max ages - not ready", - maxAgeNotReady: 0, - maxAgeReady: 0, - ready: false, - lastUpdated: now, - wantShouldPublish: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - engine := NewDecisionEngine(tt.maxAgeNotReady, tt.maxAgeReady) - resource := newTestResource(tt.ready, tt.lastUpdated) - decision := engine.Evaluate(resource, now) - - assertDecision(t, decision, tt.wantShouldPublish, "") - }) - } + engine := newTestDecisionEngine(t) + + // Resource with no conditions at all - condition("Ready") returns zero-value. + // Zero-value: status="" (not "True"), last_updated_time="" (empty, so has_ref_time is false and the time-based params are guarded out). + // is_ready = false; publishing depends on is_new_resource / generation_mismatch.
+ t.Run("no conditions generation 1 - new resource publishes", func(t *testing.T) { + resource := newResourceNoConditions(1) + decision := engine.Evaluate(resource, now) + + if !decision.ShouldPublish { + t.Errorf("expected ShouldPublish=true for new resource with no conditions, got false") + } + }) + + t.Run("no conditions generation 2 - generation mismatch publishes", func(t *testing.T) { + resource := newResourceNoConditions(2) + decision := engine.Evaluate(resource, now) + + // No conditions → observed_generation=0, generation=2 → generation_mismatch triggers + if !decision.ShouldPublish { + t.Errorf("expected ShouldPublish=true for resource with generation mismatch (gen=2, observed=0), got false") + } + }) } -// TestDecisionEngine_Evaluate_NegativeMaxAge tests edge case with negative max age intervals -func TestDecisionEngine_Evaluate_NegativeMaxAge(t *testing.T) { +func TestDecisionEngine_Evaluate_CustomExpressions(t *testing.T) { now := time.Now() - lastUpdated := now.Add(-5 * time.Second) - tests := []struct { - name string - maxAgeNotReady time.Duration - maxAgeReady time.Duration - ready bool - wantShouldPublish bool - }{ - { - name: "negative maxAgeNotReady", - maxAgeNotReady: -10 * time.Second, - maxAgeReady: 30 * time.Minute, - ready: false, - wantShouldPublish: true, // Negative max age means nextEventTime is in the past - }, - { - name: "negative maxAgeReady", - maxAgeNotReady: 10 * time.Second, - maxAgeReady: -10 * time.Minute, - ready: true, - wantShouldPublish: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - engine := NewDecisionEngine(tt.maxAgeNotReady, tt.maxAgeReady) - resource := newTestResource(tt.ready, lastUpdated) - decision := engine.Evaluate(resource, now) - - assertDecision(t, decision, tt.wantShouldPublish, "") - }) - } + t.Run("always true", func(t *testing.T) { + cfg := &config.MessageDecisionConfig{ + Params: map[string]string{}, + Result: "true", + } + engine, err := NewDecisionEngine(cfg) + if 
err != nil { + t.Fatalf("unexpected error: %v", err) + } + + resource := newResourceWithCondition("True", now, 2) + decision := engine.Evaluate(resource, now) + + if !decision.ShouldPublish { + t.Error("expected ShouldPublish=true for always-true result") + } + }) + + t.Run("always false", func(t *testing.T) { + cfg := &config.MessageDecisionConfig{ + Params: map[string]string{}, + Result: "false", + } + engine, err := NewDecisionEngine(cfg) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + resource := newResourceWithCondition("True", now, 2) + decision := engine.Evaluate(resource, now) + + if decision.ShouldPublish { + t.Error("expected ShouldPublish=false for always-false result") + } + }) + + t.Run("param chain with dependencies", func(t *testing.T) { + cfg := &config.MessageDecisionConfig{ + Params: map[string]string{ + "gen": "resource.generation", + "is_first": "gen == 1", + "should_pub": "is_first", + }, + Result: "should_pub", + } + engine, err := NewDecisionEngine(cfg) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // generation 1 → should publish + resource := newResourceWithCondition("False", now, 1) + decision := engine.Evaluate(resource, now) + if !decision.ShouldPublish { + t.Error("expected ShouldPublish=true for generation 1") + } + + // generation 2 → should not publish + resource2 := newResourceWithCondition("False", now, 2) + decision2 := engine.Evaluate(resource2, now) + if decision2.ShouldPublish { + t.Error("expected ShouldPublish=false for generation 2") + } + }) + + t.Run("condition function with custom condition name", func(t *testing.T) { + cfg := &config.MessageDecisionConfig{ + Params: map[string]string{ + "is_available": `condition("Available").status == "True"`, + }, + Result: "is_available", + } + engine, err := NewDecisionEngine(cfg) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + resource := &client.Resource{ + ID: testResourceID, + Kind: testResourceKind, + Generation: 1, + 
Status: client.ResourceStatus{ + Conditions: []client.Condition{ + {Type: "Available", Status: "True", LastUpdatedTime: now}, + }, + }, + } + + decision := engine.Evaluate(resource, now) + if !decision.ShouldPublish { + t.Error("expected ShouldPublish=true for Available=True condition") + } + + // Missing Available condition → zero-value → status="" → false + resource2 := &client.Resource{ + ID: testResourceID, + Kind: testResourceKind, + Generation: 1, + Status: client.ResourceStatus{ + Conditions: []client.Condition{ + {Type: "Ready", Status: "True", LastUpdatedTime: now}, + }, + }, + } + + decision2 := engine.Evaluate(resource2, now) + if decision2.ShouldPublish { + t.Error("expected ShouldPublish=false when Available condition is missing") + } + }) } -// TestDecisionEngine_Evaluate_ConsistentBehavior tests that multiple calls with same inputs produce same results func TestDecisionEngine_Evaluate_ConsistentBehavior(t *testing.T) { - engine := newTestEngine() + engine := newTestDecisionEngine(t) now := time.Now() - resource := newTestResource(true, now.Add(-31*time.Minute)) + resource := newResourceWithCondition("True", now.Add(-31*time.Minute), 2) - // Call multiple times - should get same result decision1 := engine.Evaluate(resource, now) decision2 := engine.Evaluate(resource, now) decision3 := engine.Evaluate(resource, now) @@ -363,352 +394,198 @@ func TestDecisionEngine_Evaluate_ConsistentBehavior(t *testing.T) { if decision1.ShouldPublish != decision2.ShouldPublish || decision1.ShouldPublish != decision3.ShouldPublish { t.Error("Evaluate should return consistent results for same inputs") } - if decision1.Reason != decision2.Reason || decision1.Reason != decision3.Reason { t.Error("Evaluate should return consistent reason for same inputs") } } -// TestDecisionEngine_Evaluate_InvalidInputs tests handling of invalid inputs -func TestDecisionEngine_Evaluate_InvalidInputs(t *testing.T) { - engine := newTestEngine() +func 
TestDecisionEngine_Evaluate_ReadyBoundary(t *testing.T) { now := time.Now() + engine := newTestDecisionEngine(t) + // Default ready_and_stale threshold is 30m (strictly greater than) tests := []struct { - now time.Time - resource *client.Resource name string - wantReason string + lastUpdated time.Duration wantShouldPublish bool }{ - { - name: "nil resource", - resource: nil, - now: now, - wantShouldPublish: false, - wantReason: ReasonNilResource, - }, - { - name: "zero now time", - resource: newTestResource(true, now), - now: time.Time{}, // Zero time - wantShouldPublish: false, - wantReason: ReasonZeroNow, - }, + {"exactly 30m - should not publish (> not >=)", -30 * time.Minute, false}, + {"29m59s - should not publish", -29*time.Minute - 59*time.Second, false}, + {"31m - should publish", -31 * time.Minute, true}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - decision := engine.Evaluate(tt.resource, tt.now) + resource := newResourceWithCondition("True", now.Add(tt.lastUpdated), 2) + decision := engine.Evaluate(resource, now) if decision.ShouldPublish != tt.wantShouldPublish { - t.Errorf("ShouldPublish = %v, want %v", decision.ShouldPublish, tt.wantShouldPublish) - } - - if decision.Reason != tt.wantReason { - t.Errorf("Reason = %q, want %q", decision.Reason, tt.wantReason) + t.Errorf("ShouldPublish = %v, want %v (lastUpdated offset: %v)", + decision.ShouldPublish, tt.wantShouldPublish, tt.lastUpdated) } }) } } -// TestDecisionEngine_Evaluate_NeverProcessedResources tests the never-processed reconciliation -// When lastUpdated is zero, resources are published immediately with "never processed" reason -func TestDecisionEngine_Evaluate_NeverProcessedResources(t *testing.T) { - engine := newTestEngine() +func TestDecisionEngine_Evaluate_NotReadyBoundary(t *testing.T) { now := time.Now() + engine := newTestDecisionEngine(t) + // Default not_ready_and_debounced threshold is 10s (strictly greater than) tests := []struct { - createdTime time.Time - 
lastUpdated time.Time - name string - wantReasonContains string - description string - ready bool - wantShouldPublish bool + name string + lastUpdated time.Duration + wantShouldPublish bool }{ - { - name: "never processed - not ready", - createdTime: now.Add(-1 * time.Hour), // Doesn't affect decision - lastUpdated: time.Time{}, // Zero - never processed - ready: false, - wantShouldPublish: true, - wantReasonContains: ReasonNeverProcessed, - description: "Should publish immediately (never processed)", - }, - { - name: "never processed - ready", - createdTime: now.Add(-1 * time.Hour), // Doesn't affect decision - lastUpdated: time.Time{}, // Zero - never processed - ready: true, - wantShouldPublish: true, - wantReasonContains: ReasonNeverProcessed, - description: "Should publish immediately (never processed)", - }, - { - name: "processed resource - use lastUpdated for max age", - createdTime: now.Add(-1 * time.Hour), // Irrelevant once processed - lastUpdated: now.Add(-5 * time.Second), // Updated recently - ready: false, - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "Should use lastUpdated for max age calculation (5s < 10s max age)", - }, + {"exactly 10s - should not publish (> not >=)", -10 * time.Second, false}, + {"9s - should not publish", -9 * time.Second, false}, + {"11s - should publish", -11 * time.Second, true}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - resource := newTestResourceWithCreatedTime( - testResourceID, - testResourceKind, - tt.ready, - tt.createdTime, - tt.lastUpdated, - ) + resource := newResourceWithCondition("False", now.Add(tt.lastUpdated), 2) decision := engine.Evaluate(resource, now) - assertDecision(t, decision, tt.wantShouldPublish, tt.wantReasonContains) - - // Additional context on failure - if t.Failed() { - t.Logf("Test description: %s", tt.description) - t.Logf("Created time: %v", tt.createdTime) - t.Logf("LastUpdated: %v", tt.lastUpdated) + if 
decision.ShouldPublish != tt.wantShouldPublish { + t.Errorf("ShouldPublish = %v, want %v (lastUpdated offset: %v)", + decision.ShouldPublish, tt.wantShouldPublish, tt.lastUpdated) } }) } } -// TestDecisionEngine_Evaluate_GenerationBasedReconciliation tests generation-based reconciliation -func TestDecisionEngine_Evaluate_GenerationBasedReconciliation(t *testing.T) { - engine := newTestEngine() +func TestBuildConditionsLookup(t *testing.T) { now := time.Now() - - tests := []struct { - lastUpdated time.Time - name string - wantReasonContains string - description string - generation int32 - observedGeneration int32 - ready bool - wantShouldPublish bool - }{ - // Generation mismatch tests - should publish immediately - { - name: "generation ahead by 1 - ready", - generation: 2, - observedGeneration: 1, - ready: true, - lastUpdated: now, // Even with recent update, should publish due to generation - wantShouldPublish: true, - wantReasonContains: ReasonGenerationChanged, - description: "Spec change should trigger immediate reconciliation (ready)", - }, + conditions := []client.Condition{ { - name: "generation ahead by 1 - not ready", - generation: 3, - observedGeneration: 2, - ready: false, - lastUpdated: now, // Even with recent update, should publish due to generation - wantShouldPublish: true, - wantReasonContains: ReasonGenerationChanged, - description: "Spec change should trigger immediate reconciliation (not ready)", + Type: "Ready", + Status: "True", + LastUpdatedTime: now, + LastTransitionTime: now.Add(-1 * time.Hour), + ObservedGeneration: 3, }, { - name: "generation ahead by many - ready", - generation: 10, - observedGeneration: 5, - ready: true, - lastUpdated: now, - wantShouldPublish: true, - wantReasonContains: ReasonGenerationChanged, - description: "Multiple spec changes should trigger immediate reconciliation", - }, - { - name: "generation ahead - zero observedGeneration", - generation: 1, - observedGeneration: 0, - ready: true, - lastUpdated: now, - 
wantShouldPublish: true, - wantReasonContains: ReasonGenerationChanged, - description: "Generation mismatch triggers immediate reconciliation", + Type: "Available", + Status: "False", + LastUpdatedTime: now.Add(-5 * time.Minute), + LastTransitionTime: now.Add(-10 * time.Minute), + ObservedGeneration: 2, }, + } - // Generation in sync tests - should follow normal max age logic - { - name: "generation in sync - recent update - ready", - generation: 5, - observedGeneration: 5, - ready: true, - lastUpdated: now.Add(-15 * time.Minute), // 15m ago (< 30m max age) - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "When generations match, follow normal max age logic (ready)", - }, - { - name: "generation in sync - recent update - not ready", - generation: 2, - observedGeneration: 2, - ready: false, - lastUpdated: now.Add(-5 * time.Second), // 5s ago (< 10s max age) - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "When generations match, follow normal max age logic (not ready)", - }, - { - name: "generation in sync - max age exceeded - ready", - generation: 3, - observedGeneration: 3, - ready: true, - lastUpdated: now.Add(-31 * time.Minute), // 31m ago (> 30m max age) - wantShouldPublish: true, - wantReasonContains: ReasonMaxAgeExceeded, - description: "When generations match and max age exceeded, publish (ready)", - }, - { - name: "generation in sync - max age exceeded - not ready", - generation: 1, - observedGeneration: 1, - ready: false, - lastUpdated: now.Add(-11 * time.Second), // 11s ago (> 10s max age) - wantShouldPublish: true, - wantReasonContains: ReasonMaxAgeExceeded, - description: "When generations match and max age exceeded, publish (not ready)", - }, + lookup := buildConditionsLookup(conditions) - // Edge cases - { - name: "both generation and observedGeneration are zero", - generation: 0, - observedGeneration: 0, - ready: true, - lastUpdated: now.Add(-5 * time.Minute), - 
wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "Zero generations should be treated as in sync (defensive)", - }, - { - name: "observedGeneration ahead of generation (defensive)", - generation: 5, - observedGeneration: 10, - ready: true, - lastUpdated: now.Add(-5 * time.Minute), - wantShouldPublish: false, - wantReasonContains: "max age not exceeded", - description: "ObservedGeneration > Generation shouldn't happen, but handle defensively", - }, + if len(lookup) != 2 { + t.Fatalf("expected 2 entries, got %d", len(lookup)) } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - resource := newTestResourceWithGeneration( - testResourceID, - testResourceKind, - tt.ready, - tt.lastUpdated, - tt.generation, - tt.observedGeneration, - ) - decision := engine.Evaluate(resource, now) + ready, ok := lookup["Ready"] + if !ok { + t.Fatal("missing Ready condition") + } + if ready["status"] != "True" { + t.Errorf("Ready status = %v, want True", ready["status"]) + } + if ready["observed_generation"] != int64(3) { + t.Errorf("Ready observed_generation = %v, want 3", ready["observed_generation"]) + } - assertDecision(t, decision, tt.wantShouldPublish, tt.wantReasonContains) + avail, ok := lookup["Available"] + if !ok { + t.Fatal("missing Available condition") + } + if avail["status"] != "False" { + t.Errorf("Available status = %v, want False", avail["status"]) + } +} - // Additional context on failure - if t.Failed() { - t.Logf("Test description: %s", tt.description) - t.Logf("Generation: %d, ObservedGeneration: %d", tt.generation, tt.observedGeneration) - t.Logf("Ready: %t, LastUpdated: %v", tt.ready, tt.lastUpdated) - } - }) +func TestBuildConditionsLookup_Empty(t *testing.T) { + lookup := buildConditionsLookup(nil) + if len(lookup) != 0 { + t.Errorf("expected empty map, got %d entries", len(lookup)) } } -// TestDecisionEngine_Evaluate_NeverProcessedPrecedence tests the precedence of checks -// when both "never processed" and 
"generation changed" conditions are true. -// -// For brand-new resources (generation=1, observedGeneration=0, lastUpdated=zero), -// both the LastUpdated.IsZero() check and the generation mismatch check would fire. -// This test ensures that the "never processed" check takes precedence since it runs -// first, and protects against future reordering of these checks. -func TestDecisionEngine_Evaluate_NeverProcessedPrecedence(t *testing.T) { - engine := newTestEngine() - now := time.Now() +func TestZeroCondition(t *testing.T) { + zero := zeroCondition() - tests := []struct { - lastUpdated time.Time - name string - wantReasonContains string - description string - generation int32 - observedGeneration int32 - ready bool - wantShouldPublish bool - }{ - { - name: "brand-new resource - never processed wins over generation changed", - generation: 1, - observedGeneration: 0, - ready: false, - lastUpdated: time.Time{}, // Zero - triggers "never processed" - wantShouldPublish: true, - wantReasonContains: ReasonNeverProcessed, // Should be "never processed", not "generation changed" - description: "Both checks fire, but never-processed check runs first and wins", - }, - { - name: "brand-new resource - ready - never processed wins", - generation: 1, - observedGeneration: 0, - ready: true, - lastUpdated: time.Time{}, // Zero - triggers "never processed" - wantShouldPublish: true, - wantReasonContains: ReasonNeverProcessed, - description: "Both checks fire, but never-processed check runs first and wins (ready)", - }, - { - name: "generation mismatch with non-zero lastUpdated - generation changed wins", - generation: 1, - observedGeneration: 0, - ready: true, - lastUpdated: now, // Non-zero - skips "never processed" check - wantShouldPublish: true, - wantReasonContains: ReasonGenerationChanged, // Should be "generation changed" - description: "Only generation check fires since lastUpdated is non-zero", - }, - { - name: "multiple generation changes on never-processed resource", - 
generation: 5, - observedGeneration: 0, - ready: false, - lastUpdated: time.Time{}, // Zero - triggers "never processed" - wantShouldPublish: true, - wantReasonContains: ReasonNeverProcessed, // Still "never processed", not "generation changed" - description: "Never-processed check takes precedence even with large generation gap", + if zero["status"] != "" { + t.Errorf("status = %q, want empty string", zero["status"]) + } + if zero["observed_generation"] != int64(0) { + t.Errorf("observed_generation = %v, want 0", zero["observed_generation"]) + } + if zero["last_updated_time"] != "" { + t.Errorf("last_updated_time = %q, want empty string", zero["last_updated_time"]) + } + if zero["last_transition_time"] != "" { + t.Errorf("last_transition_time = %q, want empty string", zero["last_transition_time"]) + } +} + +func TestResourceToMap(t *testing.T) { + now := time.Now() + resource := &client.Resource{ + ID: "res-1", + Href: "/api/v1/clusters/res-1", + Kind: "Cluster", + Generation: 3, + CreatedTime: now, + UpdatedTime: now, + Labels: map[string]string{"env": "prod"}, + OwnerReferences: &client.OwnerReference{ + ID: "owner-1", + Href: "/api/v1/owners/owner-1", + Kind: "Owner", }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - resource := newTestResourceWithGeneration( - testResourceID, - testResourceKind, - tt.ready, - tt.lastUpdated, - tt.generation, - tt.observedGeneration, - ) - decision := engine.Evaluate(resource, now) + m := resourceToMap(resource) - assertDecision(t, decision, tt.wantShouldPublish, tt.wantReasonContains) + if m["id"] != "res-1" { + t.Errorf("id = %v, want res-1", m["id"]) + } + if m["kind"] != "Cluster" { + t.Errorf("kind = %v, want Cluster", m["kind"]) + } + if m["generation"] != int64(3) { + t.Errorf("generation = %v, want 3", m["generation"]) + } - // Additional context on failure - if t.Failed() { - t.Logf("Test description: %s", tt.description) - t.Logf("Generation: %d, ObservedGeneration: %d", tt.generation, 
tt.observedGeneration) - t.Logf("LastUpdated.IsZero(): %v", tt.lastUpdated.IsZero()) - t.Logf("Expected reason containing: %q, got: %q", tt.wantReasonContains, decision.Reason) - } - }) + labels, ok := m["labels"].(map[string]interface{}) + if !ok { + t.Fatal("labels not found or wrong type") + } + if labels["env"] != "prod" { + t.Errorf("labels.env = %v, want prod", labels["env"]) + } + + owner, ok := m["owner_references"].(map[string]interface{}) + if !ok { + t.Fatal("owner_references not found or wrong type") + } + if owner["id"] != "owner-1" { + t.Errorf("owner_references.id = %v, want owner-1", owner["id"]) + } +} + +func TestResourceToMap_NoOptionalFields(t *testing.T) { + resource := &client.Resource{ + ID: "res-2", + Kind: "NodePool", + Generation: 1, + } + + m := resourceToMap(resource) + + if _, ok := m["labels"]; ok { + t.Error("labels should not be present when empty") + } + if _, ok := m["owner_references"]; ok { + t.Error("owner_references should not be present when nil") + } + if _, ok := m["metadata"]; ok { + t.Error("metadata should not be present when nil") } } diff --git a/internal/payload/builder.go b/internal/payload/builder.go index 5eb94be..d744a8c 100644 --- a/internal/payload/builder.go +++ b/internal/payload/builder.go @@ -136,13 +136,9 @@ func (b *Builder) BuildPayload(ctx context.Context, resource *client.Resource, r // resourceToMap converts a Resource into a plain map[string]interface{} for CEL evaluation. // Time fields are formatted as RFC3339Nano strings to match their JSON representation. +// Status data is accessed only through conditions (no flattened fields). 
func resourceToMap(r *client.Resource) map[string]interface{} { - status := map[string]interface{}{ - "ready": r.Status.Ready, - "last_transition_time": r.Status.LastTransitionTime.Format(time.RFC3339Nano), - "last_updated": r.Status.LastUpdated.Format(time.RFC3339Nano), - "observed_generation": r.Status.ObservedGeneration, - } + status := map[string]interface{}{} if len(r.Status.Conditions) > 0 { conditions := make([]interface{}, len(r.Status.Conditions)) for i, c := range r.Status.Conditions { diff --git a/internal/payload/builder_test.go b/internal/payload/builder_test.go index cb20bf9..dcc643d 100644 --- a/internal/payload/builder_test.go +++ b/internal/payload/builder_test.go @@ -118,7 +118,13 @@ func makeTestResource() *client.Resource { Href: "/api/v1/clusters/cls-abc", Generation: 3, Status: client.ResourceStatus{ - Ready: true, + Conditions: []client.Condition{ + { + Type: "Ready", + Status: "True", + LastUpdatedTime: time.Now(), + }, + }, }, Labels: map[string]string{"region": "us-east"}, CreatedTime: time.Now(), @@ -133,7 +139,13 @@ func makeTestNodePoolResource() *client.Resource { Href: "/api/v1/nodepools/np-xyz", Generation: 2, Status: client.ResourceStatus{ - Ready: true, + Conditions: []client.Condition{ + { + Type: "Ready", + Status: "True", + LastUpdatedTime: time.Now(), + }, + }, }, OwnerReferences: &client.OwnerReference{ ID: "cluster-123", @@ -191,7 +203,7 @@ func TestBuildPayload_NestedObject(t *testing.T) { func TestBuildPayload_CELConditional(t *testing.T) { buildDef := map[string]interface{}{ - "status": `resource.status.ready == true ? "Ready" : "NotReady"`, + "gen_check": `resource.generation > 2 ? 
"high" : "low"`, } b, err := NewBuilder(buildDef, logger.NewHyperFleetLogger()) if err != nil { @@ -200,8 +212,8 @@ func TestBuildPayload_CELConditional(t *testing.T) { payload := b.BuildPayload(context.Background(), makeTestResource(), "") - if payload["status"] != "Ready" { - t.Errorf("expected status 'Ready', got %v", payload["status"]) + if payload["gen_check"] != "high" { + t.Errorf("expected gen_check 'high', got %v", payload["gen_check"]) } } @@ -243,9 +255,9 @@ func TestBuildPayload_CELStringLiteral(t *testing.T) { func TestBuildPayload_MixedTypes(t *testing.T) { buildDef := map[string]interface{}{ - "id": "resource.id", - "origin": `"sentinel"`, - "status": `resource.status.ready == true ? "Ready" : "NotReady"`, + "id": "resource.id", + "origin": `"sentinel"`, + "gen_check": `resource.generation > 2 ? "high" : "low"`, "nested": map[string]interface{}{ "kind": "resource.kind", }, @@ -263,8 +275,8 @@ func TestBuildPayload_MixedTypes(t *testing.T) { if payload["origin"] != "sentinel" { t.Errorf("expected origin 'sentinel', got %v", payload["origin"]) } - if payload["status"] != "Ready" { - t.Errorf("expected status 'Ready', got %v", payload["status"]) + if payload["gen_check"] != "high" { + t.Errorf("expected gen_check 'high', got %v", payload["gen_check"]) } nested, ok := payload["nested"].(map[string]interface{}) if !ok { diff --git a/internal/sentinel/sentinel.go b/internal/sentinel/sentinel.go index 693cdb9..3c15368 100644 --- a/internal/sentinel/sentinel.go +++ b/internal/sentinel/sentinel.go @@ -2,7 +2,6 @@ package sentinel import ( "context" - "errors" "fmt" "sync" "time" @@ -21,15 +20,6 @@ import ( "go.opentelemetry.io/otel/codes" ) -const ( - // notReadyFilter selects resources whose Ready condition is False. - notReadyFilter = "status.conditions.Ready='False'" - - // staleReadyFilterFmt selects resources whose Ready condition is True but - // last_updated_time is older than the given cutoff (RFC 3339 timestamp). 
- staleReadyFilterFmt = "status.conditions.Ready='True' and status.conditions.Ready.last_updated_time<='%s'" -) - // Sentinel polls the HyperFleet API and triggers reconciliation events type Sentinel struct { lastSuccessfulPoll time.Time @@ -77,8 +67,8 @@ func (s *Sentinel) LastSuccessfulPoll() time.Time { // Start starts the polling loop func (s *Sentinel) Start(ctx context.Context) error { - s.logger.Infof(ctx, "Starting sentinel resource_type=%s poll_interval=%s max_age_not_ready=%s max_age_ready=%s", - s.config.ResourceType, s.config.PollInterval, s.config.MaxAgeNotReady, s.config.MaxAgeReady) + s.logger.Infof(ctx, "Starting sentinel resource_type=%s poll_interval=%s", + s.config.ResourceType, s.config.PollInterval) ticker := time.NewTicker(s.config.PollInterval) defer ticker.Stop() @@ -124,10 +114,13 @@ func (s *Sentinel) trigger(ctx context.Context) error { // Convert label selectors to map for filtering labelSelector := s.config.ResourceSelector.ToMap() - // Fetch resources using condition-based server-side filtering: - // Query 1: Not-ready resources (need frequent reconciliation) - // Query 2: Stale ready resources (exceeded max age) - resources, err := s.fetchFilteredResources(ctx, labelSelector) + // Fetch all resources matching label selectors. + // TODO(HYPERFLEET-805): Add optional server_filters config for server-side pre-filtering + // to reduce the result set before CEL evaluation. Currently fetches the full result set + // and evaluates each resource in-memory. At large scale, use resource_selector labels + // to shard across multiple Sentinel instances. 
+ rt := client.ResourceType(s.config.ResourceType) + resources, err := s.client.FetchResources(ctx, rt, labelSelector) if err != nil { // Record API error pollSpan.RecordError(err) @@ -212,8 +205,8 @@ func (s *Sentinel) trigger(ctx context.Context) error { // Record successful event publication metrics.UpdateEventsPublishedMetric(resourceType, resourceSelector, decision.Reason) - s.logger.Infof(eventCtx, "Published event resource_id=%s ready=%t", - resource.ID, resource.Status.Ready) + s.logger.Infof(eventCtx, "Published event resource_id=%s", + resource.ID) published++ } else { // Add decision reason to context for structured logging @@ -222,8 +215,8 @@ func (s *Sentinel) trigger(ctx context.Context) error { // Record skipped resource metrics.UpdateResourcesSkippedMetric(resourceType, resourceSelector, decision.Reason) - s.logger.Debugf(skipCtx, "Skipped resource resource_id=%s ready=%t", - resource.ID, resource.Status.Ready) + s.logger.Debugf(skipCtx, "Skipped resource resource_id=%s", + resource.ID) skipped++ } @@ -248,64 +241,6 @@ func (s *Sentinel) trigger(ctx context.Context) error { return nil } -// fetchFilteredResources makes two targeted API queries to fetch only resources -// that likely need reconciliation, reducing network traffic compared to fetching -// all resources: -// -// 1. Not-ready resources: status.conditions.Ready='False' -// 2. Stale ready resources: Ready='True' with last_updated_time older than max_age_ready -// -// Results are merged and deduplicated by resource ID. The DecisionEngine still -// evaluates the filtered set in memory (e.g., for generation-based checks). 
-func (s *Sentinel) fetchFilteredResources( - ctx context.Context, - labelSelector map[string]string, -) ([]client.Resource, error) { - rt := client.ResourceType(s.config.ResourceType) - - // Query 1: Not-ready resources - notReadyResources, err := s.client.FetchResources(ctx, rt, labelSelector, notReadyFilter) - if err != nil { - return nil, fmt.Errorf("failed to fetch not-ready resources: %w", err) - } - s.logger.Debugf(ctx, "Fetched not-ready resources count=%d", len(notReadyResources)) - - // Query 2: Stale ready resources (last_updated_time exceeded max_age_ready) - cutoff := time.Now().Add(-s.config.MaxAgeReady) - staleFilter := fmt.Sprintf(staleReadyFilterFmt, cutoff.Format(time.RFC3339)) - staleResources, err := s.client.FetchResources(ctx, rt, labelSelector, staleFilter) - if err != nil { - // Propagate context cancellation/timeout — the caller must see these - if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { - return nil, err - } - // Graceful degradation: for transient/API errors, continue with not-ready results - s.logger.Errorf(ctx, "Failed to fetch stale resources, continuing with not-ready only: %v", err) - return notReadyResources, nil - } - s.logger.Debugf(ctx, "Fetched stale ready resources count=%d", len(staleResources)) - - return mergeResources(notReadyResources, staleResources), nil -} - -// mergeResources combines two resource slices, deduplicating by resource ID. -// Resources from the first slice take precedence when duplicates are found. 
-func mergeResources(a, b []client.Resource) []client.Resource { - seen := make(map[string]struct{}, len(a)) - result := make([]client.Resource, 0, len(a)+len(b)) - - for i := range a { - seen[a[i].ID] = struct{}{} - result = append(result, a[i]) - } - for i := range b { - if _, exists := seen[b[i].ID]; !exists { - result = append(result, b[i]) - } - } - return result -} - // buildEventData builds the CloudEvent data payload for a resource using the // configured payload builder. func (s *Sentinel) buildEventData( diff --git a/internal/sentinel/sentinel_test.go b/internal/sentinel/sentinel_test.go index bcbda8b..45ddfaf 100644 --- a/internal/sentinel/sentinel_test.go +++ b/internal/sentinel/sentinel_test.go @@ -7,9 +7,6 @@ import ( "fmt" "net/http" "net/http/httptest" - "strings" - "sync" - "sync/atomic" "testing" "time" @@ -30,8 +27,7 @@ const ( testResourceKind = "Cluster" ) -// createMockCluster creates a mock cluster response matching the new API spec. -// Ready status is determined by the ready bool parameter. +// createMockCluster creates a mock cluster response matching the API spec. 
func createMockCluster( id string, generation int, @@ -39,21 +35,11 @@ func createMockCluster( ready bool, lastUpdated time.Time, ) map[string]interface{} { - // Map ready bool to condition status readyStatus := "False" if ready { readyStatus = "True" } - readyCondition := map[string]interface{}{ - "type": "Ready", - "status": readyStatus, - "created_time": "2025-01-01T09:00:00Z", - "last_transition_time": "2025-01-01T10:00:00Z", - "last_updated_time": lastUpdated.Format(time.RFC3339), - "observed_generation": observedGeneration, - } - return map[string]interface{}{ "id": id, "href": "/api/hyperfleet/v1/clusters/" + id, @@ -67,9 +53,8 @@ func createMockCluster( "spec": map[string]interface{}{}, "status": map[string]interface{}{ "conditions": []map[string]interface{}{ - readyCondition, { - "type": "Available", + "type": "Ready", "status": readyStatus, "created_time": "2025-01-01T09:00:00Z", "last_transition_time": "2025-01-01T10:00:00Z", @@ -92,6 +77,18 @@ func createMockClusterList(clusters []map[string]interface{}) map[string]interfa } } +// mockServerForResources creates a mock HTTP server that returns the given resources. 
+func mockServerForResources(t *testing.T, clusters []map[string]interface{}) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + response := createMockClusterList(clusters) + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(response); err != nil { + t.Logf("Error encoding response: %v", err) + } + })) +} + // MockPublisher implements broker.Publisher for testing type MockPublisher struct { publishError error @@ -119,7 +116,6 @@ type MockPublisherWithLogger struct { } func (m *MockPublisherWithLogger) Publish(ctx context.Context, topic string, event *cloudevents.Event) error { - // Simulate what broker does - log with the provided context m.mockLogger.Info(ctx, fmt.Sprintf("broker publishing event to topic %s", topic)) return nil } @@ -128,35 +124,28 @@ func (m *MockPublisherWithLogger) Close() error { return nil } func (m *MockPublisherWithLogger) Health(ctx context.Context) error { return nil } -// mockServerForConditionQueries creates a mock HTTP server that routes responses -// based on condition-based search parameters. Each poll cycle makes two queries: -// one for not-ready resources (Ready='False') and one for stale-ready resources (Ready='True'). -func mockServerForConditionQueries( - t *testing.T, - notReadyClusters, staleClusters []map[string]interface{}, -) *httptest.Server { +// newTestDecisionEngine creates a CEL-based decision engine with default config. 
+func newTestDecisionEngine(t *testing.T) *engine.DecisionEngine { t.Helper() - return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - search := r.URL.Query().Get("search") - - var clusters []map[string]interface{} - switch { - case strings.Contains(search, "Ready='False'"): - clusters = notReadyClusters - case strings.Contains(search, "Ready='True'"): - clusters = staleClusters - default: - t.Errorf("unexpected search query: %q", search) - w.WriteHeader(http.StatusBadRequest) - return - } + cfg := config.DefaultMessageDecision() + de, err := engine.NewDecisionEngine(cfg) + if err != nil { + t.Fatalf("NewDecisionEngine failed: %v", err) + } + return de +} - response := createMockClusterList(clusters) - w.Header().Set("Content-Type", "application/json") - if err := json.NewEncoder(w).Encode(response); err != nil { - t.Logf("Error encoding response: %v", err) - } - })) +// newTestSentinelConfig creates a config for testing (no max_age fields). +func newTestSentinelConfig() *config.SentinelConfig { + return &config.SentinelConfig{ + ResourceType: "clusters", + Topic: testTopic, + MessageDecision: config.DefaultMessageDecision(), + MessageData: map[string]interface{}{ + "id": "resource.id", + "kind": "resource.kind", + }, + } } // TestTrigger_Success tests successful event publishing @@ -164,44 +153,28 @@ func TestTrigger_Success(t *testing.T) { ctx := context.Background() now := time.Now() - // Stale ready cluster exceeds max age (31 minutes ago) - server := mockServerForConditionQueries(t, - nil, // no not-ready clusters - []map[string]interface{}{ - createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), - }, - ) + // Ready cluster with stale last_updated (> 30m) should trigger publish + server := mockServerForResources(t, []map[string]interface{}{ + createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), + }) defer server.Close() - // Setup components hyperfleetClient, _ := 
client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) mockPublisher := &MockPublisher{} log := logger.NewHyperFleetLogger() - // Create metrics with a test registry (registers metrics once via sync.Once) registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: testTopic, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Execute err = s.trigger(ctx) - // Verify if err != nil { t.Errorf("Expected no error, got %v", err) } @@ -210,15 +183,10 @@ func TestTrigger_Success(t *testing.T) { t.Errorf("Expected 1 published event, got %d", len(mockPublisher.publishedEvents)) } - if len(mockPublisher.publishedTopics) != 1 { - t.Errorf("Expected 1 topic, got %d", len(mockPublisher.publishedTopics)) - } - - if mockPublisher.publishedTopics[0] != testTopic { - t.Errorf("Expected topic '%s', got '%s'", testTopic, mockPublisher.publishedTopics[0]) + if len(mockPublisher.publishedTopics) != 1 || mockPublisher.publishedTopics[0] != testTopic { + t.Errorf("Expected topic '%s', got %v", testTopic, mockPublisher.publishedTopics) } - // Verify CloudEvent properties event := mockPublisher.publishedEvents[0] if event.Type() != "com.redhat.hyperfleet.Cluster.reconcile" { t.Errorf("Expected event type 'com.redhat.hyperfleet.Cluster.reconcile', got '%s'", event.Type()) @@ -235,44 +203,29 @@ func TestTrigger_Success(t *testing.T) { func TestTrigger_NoEventsPublished(t *testing.T) { ctx := context.Background() - server := mockServerForConditionQueries(t, - []map[string]interface{}{ - // 
Not-ready for only 1 second — within max_age_not_ready (10s), should be skipped - createMockCluster("cluster-1", 1, 1, false, time.Now().Add(-1*time.Second)), - }, - nil, // no stale clusters - ) + // Not-ready for only 1 second — within debounce (10s), should be skipped. + // Generation > 1 so is_new_resource won't fire. + server := mockServerForResources(t, []map[string]interface{}{ + createMockCluster("cluster-1", 2, 2, false, time.Now().Add(-1*time.Second)), + }) defer server.Close() - // Setup components hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) mockPublisher := &MockPublisher{} log := logger.NewHyperFleetLogger() - // Create metrics with a test registry (registers metrics once via sync.Once) registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: testTopic, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Execute err = s.trigger(ctx) - // Verify if err != nil { t.Errorf("Expected no error, got %v", err) } @@ -286,7 +239,6 @@ func TestTrigger_NoEventsPublished(t *testing.T) { func TestTrigger_FetchError(t *testing.T) { ctx := context.Background() - // Create mock server that returns 500 errors for all queries server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusInternalServerError) if _, err := w.Write([]byte(`{"error": "internal server error"}`)); err != nil { @@ -295,36 +247,23 @@ func TestTrigger_FetchError(t *testing.T) { })) defer server.Close() - // 
Setup components - hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 1*time.Second) // Short timeout - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 1*time.Second) + decisionEngine := newTestDecisionEngine(t) mockPublisher := &MockPublisher{} log := logger.NewHyperFleetLogger() - // Create metrics with a test registry (registers metrics once via sync.Once) registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: testTopic, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Execute err = s.trigger(ctx) - // Verify if err == nil { t.Error("Expected error, got nil") } @@ -339,45 +278,29 @@ func TestTrigger_PublishError(t *testing.T) { ctx := context.Background() now := time.Now() - server := mockServerForConditionQueries(t, - nil, // no not-ready clusters - []map[string]interface{}{ - createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), - }, - ) + server := mockServerForResources(t, []map[string]interface{}{ + createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), + }) defer server.Close() - // Setup components hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) mockPublisher := &MockPublisher{ publishError: errors.New("broker connection failed"), } log := logger.NewHyperFleetLogger() - // Create metrics with a test registry (registers metrics once via sync.Once) registry := prometheus.NewRegistry() 
metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: testTopic, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Execute err = s.trigger(ctx) - // Verify - trigger should succeed even if publish fails (graceful degradation) if err != nil { t.Errorf("Expected no error (graceful degradation), got %v", err) } @@ -388,65 +311,45 @@ func TestTrigger_MixedResources(t *testing.T) { ctx := context.Background() now := time.Now() - // Split resources between not-ready and stale queries - server := mockServerForConditionQueries(t, - // Not-ready query - []map[string]interface{}{ - // Should publish (not ready max age exceeded: 1min > 10s) - createMockCluster("cluster-3", 3, 3, false, now.Add(-1*time.Minute)), - // Should be skipped (empty ID) - createMockCluster("", 1, 1, false, now.Add(-15*time.Second)), - }, - // Stale query - []map[string]interface{}{ - createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), // Should publish (max age exceeded) - createMockCluster("cluster-4", 5, 4, true, now.Add(-5*time.Minute)), // Should publish (generation changed) - }, - ) + server := mockServerForResources(t, []map[string]interface{}{ + // Should publish (not ready, debounce exceeded: 1min > 10s, generation > 1) + createMockCluster("cluster-3", 3, 3, false, now.Add(-1*time.Minute)), + // Should be skipped (empty ID) + createMockCluster("", 1, 1, false, now.Add(-15*time.Second)), + // Should publish (ready, stale: 31min > 30min) + createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), + // Should not publish (ready, recent: 5min < 30min, no generation mismatch) + createMockCluster("cluster-4", 5, 5, true, 
now.Add(-5*time.Minute)), + }) defer server.Close() - // Setup components hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) mockPublisher := &MockPublisher{} log := logger.NewHyperFleetLogger() - // Create metrics with a test registry (registers metrics once via sync.Once) registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: testTopic, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Execute err = s.trigger(ctx) - // Verify if err != nil { t.Errorf("Expected no error, got %v", err) } - // Should publish for: - // - cluster-3 (not ready max age exceeded: 1min > 10s) - // - cluster-1 (ready max age exceeded: 31min > 30min) - // - cluster-4 (generation changed: 5 > 4) - if len(mockPublisher.publishedEvents) != 3 { - t.Errorf("Expected 3 published events, got %d", len(mockPublisher.publishedEvents)) + // cluster-3: not ready + debounced → publish + // cluster-1: ready + stale → publish + // cluster-4: ready + recent, not new → skip + if len(mockPublisher.publishedEvents) != 2 { + t.Errorf("Expected 2 published events, got %d", len(mockPublisher.publishedEvents)) } - // Verify topics for _, topic := range mockPublisher.publishedTopics { if topic != testTopic { t.Errorf("Expected topic '%s', got '%s'", testTopic, topic) @@ -460,32 +363,24 @@ func TestTrigger_WithMessageDataConfig(t *testing.T) { ctx := context.Background() now := time.Now() - server := mockServerForConditionQueries(t, - nil, - []map[string]interface{}{ - 
createMockCluster("cluster-xyz", 2, 2, true, now.Add(-31*time.Minute)), - }, - ) + server := mockServerForResources(t, []map[string]interface{}{ + createMockCluster("cluster-xyz", 2, 2, true, now.Add(-31*time.Minute)), + }) defer server.Close() hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) mockPublisher := &MockPublisher{} log := logger.NewHyperFleetLogger() registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: testTopic, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - "origin": `"sentinel"`, - }, + cfg := newTestSentinelConfig() + cfg.MessageData = map[string]interface{}{ + "id": "resource.id", + "kind": "resource.kind", + "origin": `"sentinel"`, } s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) @@ -514,7 +409,6 @@ func TestTrigger_WithMessageDataConfig(t *testing.T) { if data["origin"] != "sentinel" { t.Errorf("Expected origin 'sentinel', got %v", data["origin"]) } - // Hardcoded fields should NOT be present when builder is configured if _, ok := data["reason"]; ok { t.Errorf("Expected 'reason' to be absent (hardcoded field), but found it") } @@ -525,33 +419,25 @@ func TestTrigger_WithNestedMessageData(t *testing.T) { ctx := context.Background() now := time.Now() - server := mockServerForConditionQueries(t, - nil, - []map[string]interface{}{ - createMockCluster("cluster-nest", 1, 1, true, now.Add(-31*time.Minute)), - }, - ) + server := mockServerForResources(t, []map[string]interface{}{ + createMockCluster("cluster-nest", 2, 2, true, now.Add(-31*time.Minute)), + }) defer server.Close() hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - 
decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) mockPublisher := &MockPublisher{} log := logger.NewHyperFleetLogger() registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: testTopic, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "resource": map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, + cfg := newTestSentinelConfig() + cfg.MessageData = map[string]interface{}{ + "id": "resource.id", + "resource": map[string]interface{}{ + "id": "resource.id", + "kind": "resource.kind", }, } @@ -586,15 +472,7 @@ func TestTrigger_WithNestedMessageData(t *testing.T) { // TestBuildEventData_WithBuilder tests buildEventData directly with a configured builder. func TestBuildEventData_WithBuilder(t *testing.T) { - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() log := logger.NewHyperFleetLogger() s, err := NewSentinel(cfg, nil, nil, nil, log) if err != nil { @@ -607,7 +485,7 @@ func TestBuildEventData_WithBuilder(t *testing.T) { Href: "/api/v1/clusters/cls-direct", Generation: 1, } - decision := engine.Decision{ShouldPublish: true, Reason: "max_age_exceeded"} + decision := engine.Decision{ShouldPublish: true, Reason: "message decision matched"} ctx := logger.WithDecisionReason(context.Background(), decision.Reason) data := s.buildEventData(ctx, resource, decision) @@ -628,13 +506,12 @@ func TestTrigger_ContextPropagationToBroker(t *testing.T) { CapturedContexts: &capturedContexts, } - // Create mock publisher that uses our mock logger mockPublisherWithLogger := &MockPublisherWithLogger{ 
mockLogger: mockLogger, } ctx := context.Background() - ctx = logger.WithDecisionReason(ctx, "max_age_exceeded") + ctx = logger.WithDecisionReason(ctx, "message decision matched") ctx = logger.WithTopic(ctx, testTopic) ctx = logger.WithSubset(ctx, "clusters") ctx = logger.WithTraceID(ctx, "trace-123") @@ -657,8 +534,7 @@ func TestTrigger_ContextPropagationToBroker(t *testing.T) { brokerCtx := capturedContexts[0] - // Test context values propagated to broker - if reason, ok := brokerCtx.Value(logger.DecisionReasonCtxKey).(string); !ok || reason != "max_age_exceeded" { + if reason, ok := brokerCtx.Value(logger.DecisionReasonCtxKey).(string); !ok || reason != "message decision matched" { t.Errorf("decision_reason not propagated: got %v", reason) } @@ -675,271 +551,9 @@ func TestTrigger_ContextPropagationToBroker(t *testing.T) { } } -// TestMergeResources tests the mergeResources deduplication function -func TestMergeResources(t *testing.T) { - tests := []struct { - name string - a []client.Resource - b []client.Resource - wantIDs []string - wantLen int - }{ - { - name: "no overlap", - a: []client.Resource{{ID: "a1"}, {ID: "a2"}}, - b: []client.Resource{{ID: "b1"}}, - wantIDs: []string{"a1", "a2", "b1"}, - wantLen: 3, - }, - { - name: "duplicate in b removed", - a: []client.Resource{{ID: "x"}}, - b: []client.Resource{{ID: "x"}, {ID: "y"}}, - wantIDs: []string{"x", "y"}, - wantLen: 2, - }, - { - name: "both empty", - a: nil, - b: nil, - wantIDs: nil, - wantLen: 0, - }, - { - name: "a empty", - a: nil, - b: []client.Resource{{ID: "b1"}}, - wantIDs: []string{"b1"}, - wantLen: 1, - }, - { - name: "b empty", - a: []client.Resource{{ID: "a1"}}, - b: nil, - wantIDs: []string{"a1"}, - wantLen: 1, - }, - { - name: "all duplicates", - a: []client.Resource{{ID: "x"}, {ID: "y"}}, - b: []client.Resource{{ID: "x"}, {ID: "y"}}, - wantIDs: []string{"x", "y"}, - wantLen: 2, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := mergeResources(tt.a, 
tt.b) - if len(result) != tt.wantLen { - t.Errorf("mergeResources() returned %d resources, want %d", len(result), tt.wantLen) - } - for i, id := range tt.wantIDs { - if i < len(result) && result[i].ID != id { - t.Errorf("mergeResources()[%d].ID = %q, want %q", i, result[i].ID, id) - } - } - }) - } -} - -// TestMergeResources_FirstSlicePrecedence verifies that resources from the first -// slice take precedence when duplicates are found. -func TestMergeResources_FirstSlicePrecedence(t *testing.T) { - a := []client.Resource{{ID: "dup", Kind: "from-a"}} - b := []client.Resource{{ID: "dup", Kind: "from-b"}} - - result := mergeResources(a, b) - if len(result) != 1 { - t.Fatalf("Expected 1 resource, got %d", len(result)) - } - if result[0].Kind != "from-a" { - t.Errorf("Expected resource from first slice (Kind='from-a'), got Kind=%q", result[0].Kind) - } -} - -// TestTrigger_ConditionBasedQueries verifies that trigger sends two condition-based -// queries (not-ready + stale) and correctly processes results from both. 
-func TestTrigger_ConditionBasedQueries(t *testing.T) { - ctx := context.Background() - now := time.Now() - - var capturedSearchParams []string - var mu sync.Mutex - - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - search := r.URL.Query().Get("search") - mu.Lock() - capturedSearchParams = append(capturedSearchParams, search) - mu.Unlock() - - var clusters []map[string]interface{} - switch { - case strings.Contains(search, "Ready='False'"): - clusters = []map[string]interface{}{ - createMockCluster("not-ready-1", 1, 1, false, now.Add(-15*time.Second)), - } - case strings.Contains(search, "Ready='True'"): - clusters = []map[string]interface{}{ - createMockCluster("stale-1", 2, 2, true, now.Add(-31*time.Minute)), - } - default: - t.Errorf("unexpected search query: %q", search) - w.WriteHeader(http.StatusBadRequest) - return - } - - response := createMockClusterList(clusters) - w.Header().Set("Content-Type", "application/json") - if err := json.NewEncoder(w).Encode(response); err != nil { - t.Logf("Error encoding response: %v", err) - } - })) - defer server.Close() - - hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) - mockPublisher := &MockPublisher{} - log := logger.NewHyperFleetLogger() - - registry := prometheus.NewRegistry() - metrics.NewSentinelMetrics(registry, "test") - - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: "test-topic", - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } - - s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) - if err != nil { - t.Fatalf("NewSentinel failed: %v", err) - } - - if err := s.trigger(ctx); err != nil { - t.Fatalf("Expected no error, got %v", err) - } - - // Verify two queries were sent - mu.Lock() - params 
:= make([]string, len(capturedSearchParams)) - copy(params, capturedSearchParams) - mu.Unlock() - - if len(params) != 2 { - t.Fatalf("Expected 2 search queries (not-ready + stale), got %d", len(params)) - } - - hasNotReady := false - hasStale := false - for _, search := range params { - if strings.Contains(search, "Ready='False'") { - hasNotReady = true - } - if strings.Contains(search, "Ready='True'") { - hasStale = true - } - } - if !hasNotReady { - t.Error("No not-ready query (Ready='False') was sent") - } - if !hasStale { - t.Error("No stale query (Ready='True') was sent") - } - - // Both resources should trigger events - if len(mockPublisher.publishedEvents) != 2 { - t.Errorf("Expected 2 published events (1 not-ready + 1 stale), got %d", len(mockPublisher.publishedEvents)) - } -} - -// TestTrigger_StaleQueryFailure tests graceful degradation when the stale query -// fails but the not-ready query succeeds. -func TestTrigger_StaleQueryFailure(t *testing.T) { - ctx := context.Background() - now := time.Now() - - var queryCount atomic.Int32 - - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - queryCount.Add(1) - search := r.URL.Query().Get("search") - - switch { - case strings.Contains(search, "Ready='False'"): - // Not-ready query succeeds - clusters := []map[string]interface{}{ - createMockCluster("not-ready-1", 1, 1, false, now.Add(-15*time.Second)), - } - response := createMockClusterList(clusters) - w.Header().Set("Content-Type", "application/json") - if err := json.NewEncoder(w).Encode(response); err != nil { - t.Logf("Error encoding response: %v", err) - } - case strings.Contains(search, "Ready='True'"): - // Stale query fails - w.WriteHeader(http.StatusInternalServerError) - if _, err := w.Write([]byte(`{"error": "internal server error"}`)); err != nil { - t.Logf("Error writing error response: %v", err) - } - default: - t.Errorf("unexpected search query: %q", search) - w.WriteHeader(http.StatusBadRequest) - 
return - } - })) - defer server.Close() - - hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 1*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) - mockPublisher := &MockPublisher{} - log := logger.NewHyperFleetLogger() - - registry := prometheus.NewRegistry() - metrics.NewSentinelMetrics(registry, "test") - - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: "test-topic", - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } - - s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) - if err != nil { - t.Fatalf("NewSentinel failed: %v", err) - } - - // Should succeed with graceful degradation (not-ready results only) - err = s.trigger(ctx) - if err != nil { - t.Errorf("Expected no error (graceful degradation), got %v", err) - } - - // Should still publish event for the not-ready resource - if len(mockPublisher.publishedEvents) != 1 { - t.Errorf("Expected 1 published event (from not-ready query), got %d", len(mockPublisher.publishedEvents)) - } - - // Both queries should have been attempted (stale query retries on 500, so queryCount >= 2) - if queryCount.Load() < 2 { - t.Errorf("Expected at least 2 queries (not-ready + stale), got %d", queryCount.Load()) - } -} - func TestTrigger_CreatesRequiredSpans(t *testing.T) { ctx := context.Background() - // Setup in-memory trace exporter for span verification exporter := tracetest.NewInMemoryExporter() tp := trace.NewTracerProvider( trace.WithSampler(trace.AlwaysSample()), @@ -954,66 +568,49 @@ func TestTrigger_CreatesRequiredSpans(t *testing.T) { otel.SetTracerProvider(previousProvider) }() - server := mockServerForConditionQueries(t, - nil, // no not-ready clusters - []map[string]interface{}{ - createMockCluster("cluster-1", 2, 2, true, time.Now().Add(-31*time.Minute)), - }, - ) + server := mockServerForResources(t, 
[]map[string]interface{}{ + createMockCluster("cluster-1", 2, 2, true, time.Now().Add(-31*time.Minute)), + }) defer server.Close() hyperfleetClient, err := client.NewHyperFleetClient(server.URL, 10*time.Second) if err != nil { t.Fatalf("failed to create HyperFleet client: %v", err) } - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) mockPublisher := &MockPublisher{} log := logger.NewHyperFleetLogger() - // Create metrics with a test registry registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: "hyperfleet-clusters", - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessagingSystem: "rabbitmq", - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() + cfg.Topic = "hyperfleet-clusters" + cfg.MessagingSystem = "rabbitmq" s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Execute trigger err = s.trigger(ctx) if err != nil { t.Fatalf("trigger failed: %v", err) } - // Force flush spans to exporter err = tp.ForceFlush(ctx) if err != nil { t.Fatalf("force flush error: %v", err) } - // Verify spans were created spans := exporter.GetSpans() - // Build map of span names for easy checking spanNames := make(map[string]bool) for _, span := range spans { spanNames[span.Name] = true } - // Verify required spans exist requiredSpans := []string{ "sentinel.poll", "sentinel.evaluate", @@ -1026,8 +623,6 @@ func TestTrigger_CreatesRequiredSpans(t *testing.T) { } } - // Verify we got the expected number of spans - // Should have: 1 poll + 1 evaluate + 1 publish = 3 spans minimum if len(spans) < 3 { t.Errorf("Expected at least 3 spans, got %d. 
Spans: %v", len(spans), getSpanNames(spans)) } @@ -1036,12 +631,10 @@ func TestTrigger_CreatesRequiredSpans(t *testing.T) { validateSpanAttribute(t, spans, "hyperfleet-clusters publish", "messaging.operation.type", "publish") validateSpanAttribute(t, spans, "hyperfleet-clusters publish", "messaging.destination.name", cfg.Topic) - // Verify the CloudEvent was published (basic functional verification) if len(mockPublisher.publishedEvents) != 1 { t.Errorf("Expected 1 published event, got %d", len(mockPublisher.publishedEvents)) } - // Verify CloudEvent has traceparent extension if len(mockPublisher.publishedEvents) > 0 { event := mockPublisher.publishedEvents[0] extensions := event.Extensions() @@ -1071,7 +664,6 @@ func validateSpanAttribute(t *testing.T, spans []tracetest.SpanStub, spanName, a t.Errorf("Span '%s' not found", spanName) } -// Helper function for span name extraction func getSpanNames(spans []tracetest.SpanStub) []string { names := make([]string, len(spans)) for i, span := range spans { diff --git a/test/integration/integration_test.go b/test/integration/integration_test.go index 33d21c8..fbfd219 100644 --- a/test/integration/integration_test.go +++ b/test/integration/integration_test.go @@ -57,7 +57,6 @@ func createMockCluster(id string, generation int, observedGeneration int, ready // createMockClusterWithLabels creates a mock cluster response with labels func createMockClusterWithLabels(id string, generation int, observedGeneration int, ready bool, lastUpdated time.Time, labels map[string]string) map[string]interface{} { - // Map ready bool to condition status readyStatus := "False" if ready { readyStatus = "True" @@ -116,39 +115,47 @@ func createMockClusterList(clusters []map[string]interface{}) map[string]interfa } } +// newTestDecisionEngine creates a CEL-based decision engine with default config. 
+func newTestDecisionEngine(t *testing.T) *engine.DecisionEngine { + t.Helper() + de, err := engine.NewDecisionEngine(config.DefaultMessageDecision()) + if err != nil { + t.Fatalf("NewDecisionEngine failed: %v", err) + } + return de +} + +// newTestSentinelConfig creates a config for testing. +func newTestSentinelConfig() *config.SentinelConfig { + return &config.SentinelConfig{ + ResourceType: "clusters", + PollInterval: 100 * time.Millisecond, + MessageDecision: config.DefaultMessageDecision(), + MessageData: map[string]interface{}{ + "id": "resource.id", + "kind": "resource.kind", + }, + } +} + // TestIntegration_EndToEnd tests the full Sentinel workflow end-to-end with real RabbitMQ func TestIntegration_EndToEnd(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - // Get shared RabbitMQ testcontainer from helper helper := NewHelper() now := time.Now() var pollCycleCount atomic.Int32 - // Create mock HyperFleet API server that handles condition-based queries - // Each poll cycle makes 2 queries: not-ready + stale + // Single query mock - returns stale cluster on first poll only server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - search := r.URL.Query().Get("search") + cycle := pollCycleCount.Add(1) var clusters []map[string]interface{} - - // Track poll cycles (stale query marks end of a cycle) - if strings.Contains(search, "Ready='True'") { - pollCycleCount.Add(1) - } - - switch { - case strings.Contains(search, "Ready='False'"): - // Not-ready query: no not-ready clusters - clusters = nil - case strings.Contains(search, "Ready='True'"): - // Stale query: return stale cluster only on first cycle - if pollCycleCount.Load() == 1 { - clusters = []map[string]interface{}{ - createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), // Max age exceeded - } + if cycle == 1 { + clusters = []map[string]interface{}{ + createMockCluster("cluster-1", 2, 2, true, 
now.Add(-31*time.Minute)), } } @@ -158,47 +165,32 @@ func TestIntegration_EndToEnd(t *testing.T) { })) defer server.Close() - // Setup components with real RabbitMQ broker hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) log := logger.NewHyperFleetLogger() - // Create metrics with a test registry registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - PollInterval: 100 * time.Millisecond, // Short interval for testing - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() s, err := sentinel.NewSentinel(cfg, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Run Sentinel in background errChan := make(chan error, 1) go func() { errChan <- s.Start(ctx) }() - // Wait for at least 2 polling cycles time.Sleep(300 * time.Millisecond) cancel() - // Verify that Sentinel actually polled the API at least twice if pollCycleCount.Load() < 2 { t.Errorf("Expected at least 2 polling cycles, got %d", pollCycleCount.Load()) } - // Wait for Sentinel to stop select { case err := <-errChan: if err != nil && err != context.Canceled { @@ -208,8 +200,6 @@ func TestIntegration_EndToEnd(t *testing.T) { t.Fatal("Sentinel did not stop within timeout") } - // Integration test validates end-to-end workflow without errors - // Event verification requires subscriber implementation (future enhancement) t.Log("Integration test with real RabbitMQ broker completed successfully") } @@ -218,19 +208,14 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer 
cancel() - // Get shared RabbitMQ testcontainer from helper helper := NewHelper() now := time.Now() - // Track search parameters for assertion var capturedSearchParams []string var captureMu sync.Mutex - // Create mock server that returns clusters filtered by label selector and conditions. - // With condition-based queries, Sentinel sends: - // - Not-ready query: "labels.shard='1' and status.conditions.Ready='False'" - // - Stale query: "labels.shard='1' and status.conditions.Ready='True' and ..." + // Single query mock - validates label selectors in search params server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { searchParam := r.URL.Query().Get("search") captureMu.Lock() @@ -238,25 +223,16 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) { captureMu.Unlock() var filteredClusters []map[string]interface{} - - // Only return clusters matching shard:1 label AND condition filters if strings.Contains(searchParam, "labels.shard='1'") { - if strings.Contains(searchParam, "Ready='False'") { - // Not-ready query: no not-ready clusters with shard:1 - filteredClusters = nil - } else if strings.Contains(searchParam, "Ready='True'") { - // Stale query: return stale cluster with shard:1 - filteredClusters = []map[string]interface{}{ - createMockClusterWithLabels( - "cluster-shard-1", - 2, 2, true, - now.Add(-31*time.Minute), // Exceeds max_age_ready (30m) - map[string]string{"shard": "1"}, - ), - } + filteredClusters = []map[string]interface{}{ + createMockClusterWithLabels( + "cluster-shard-1", + 2, 2, true, + now.Add(-31*time.Minute), + map[string]string{"shard": "1"}, + ), } } - // shard:2 clusters are never returned since label selector doesn't match response := createMockClusterList(filteredClusters) w.Header().Set("Content-Type", "application/json") @@ -264,27 +240,16 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) { })) defer server.Close() - // Setup components with real RabbitMQ broker hyperfleetClient, 
_ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) log := logger.NewHyperFleetLogger() - // Create metrics with a test registry registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - PollInterval: 100 * time.Millisecond, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - ResourceSelector: []config.LabelSelector{ - {Label: "shard", Value: "1"}, - }, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, + cfg := newTestSentinelConfig() + cfg.ResourceSelector = []config.LabelSelector{ + {Label: "shard", Value: "1"}, } s, err := sentinel.NewSentinel(cfg, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log) @@ -292,23 +257,19 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) { t.Fatalf("NewSentinel failed: %v", err) } - // Run sentinel in goroutine and capture error errChan := make(chan error, 1) go func() { errChan <- s.Start(ctx) }() - // Wait for a few polling cycles time.Sleep(300 * time.Millisecond) cancel() - // Check Start error startErr := <-errChan if startErr != nil && startErr != context.Canceled { t.Errorf("Expected Start to return nil or context.Canceled, got: %v", startErr) } - // Validate captured search parameters captureMu.Lock() params := make([]string, len(capturedSearchParams)) copy(params, capturedSearchParams) @@ -318,45 +279,27 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) { t.Fatal("No search queries were captured — Sentinel may not have polled") } - hasNotReady := false - hasStale := false for _, search := range params { if !strings.Contains(search, "labels.shard='1'") { t.Errorf("Expected search to contain label selector \"labels.shard='1'\", got %q", search) } - if strings.Contains(search, "Ready='False'") { - hasNotReady = 
true - } - if strings.Contains(search, "Ready='True'") { - hasStale = true - } - } - if !hasNotReady { - t.Error("No not-ready query (Ready='False') was captured") - } - if !hasStale { - t.Error("No stale query (Ready='True') was captured") } t.Logf("Label selector filtering test completed with %d queries", len(params)) } // TestIntegration_TSLSyntaxMultipleLabels validates TSL syntax with multiple label selectors -// and condition-based filtering func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - // Get shared RabbitMQ testcontainer from helper helper := NewHelper() now := time.Now() - // Track the search parameters received by the mock server var receivedSearchParams []string var searchMu sync.Mutex - // Create mock server that validates TSL syntax with conditions server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { search := r.URL.Query().Get("search") searchMu.Lock() @@ -365,22 +308,15 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) { var filteredClusters []map[string]interface{} - // With condition-based queries, labels are combined with condition filters labelPrefix := "labels.env='production' and labels.region='us-east'" if strings.HasPrefix(search, labelPrefix) { - if strings.Contains(search, "Ready='False'") { - // Not-ready query - filteredClusters = nil - } else if strings.Contains(search, "Ready='True'") { - // Stale query: return matching cluster - filteredClusters = []map[string]interface{}{ - createMockClusterWithLabels( - "cluster-region-env-match", - 2, 2, true, - now.Add(-31*time.Minute), - map[string]string{"region": "us-east", "env": "production"}, - ), - } + filteredClusters = []map[string]interface{}{ + createMockClusterWithLabels( + "cluster-region-env-match", + 2, 2, true, + now.Add(-31*time.Minute), + map[string]string{"region": "us-east", "env": "production"}, + ), } } @@ -390,27 +326,17 
@@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) { })) defer server.Close() - // Setup components hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) log := logger.NewHyperFleetLogger() registry := prometheus.NewRegistry() metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - PollInterval: 100 * time.Millisecond, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - ResourceSelector: []config.LabelSelector{ - {Label: "region", Value: "us-east"}, - {Label: "env", Value: "production"}, - }, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, + cfg := newTestSentinelConfig() + cfg.ResourceSelector = []config.LabelSelector{ + {Label: "region", Value: "us-east"}, + {Label: "env", Value: "production"}, } s, err := sentinel.NewSentinel(cfg, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log) @@ -418,17 +344,14 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) { t.Fatalf("NewSentinel failed: %v", err) } - // Run sentinel errChan := make(chan error, 1) go func() { errChan <- s.Start(ctx) }() - // Wait for polling time.Sleep(300 * time.Millisecond) cancel() - // Validate that queries include both label selectors and condition filters labelPrefix := "labels.env='production' and labels.region='us-east'" searchMu.Lock() paramsCopy := make([]string, len(receivedSearchParams)) @@ -439,27 +362,12 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) { t.Fatal("No search queries were captured — Sentinel may not have polled") } - hasNotReady := false - hasStale := false for _, search := range paramsCopy { if !strings.HasPrefix(search, labelPrefix) { t.Errorf("Expected search to start with label selectors %q, got %q", labelPrefix, search) } - if strings.Contains(search, 
"Ready='False'") { - hasNotReady = true - } - if strings.Contains(search, "Ready='True'") { - hasStale = true - } - } - if !hasNotReady { - t.Error("No not-ready query (Ready='False') was captured") - } - if !hasStale { - t.Error("No stale query (Ready='True') was captured") } - // Wait for sentinel to stop startErr := <-errChan if startErr != nil && startErr != context.Canceled { t.Errorf("Expected Start to return nil or context.Canceled, got: %v", startErr) @@ -468,47 +376,13 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) { t.Logf("TSL syntax validation completed with %d queries", len(paramsCopy)) } -// TestIntegration_BrokerLoggerContext validates that OpenTelemetry trace context and Sentinel-specific context fields are properly propagated through the logging system -// during event publishing workflows. -// -// Context Propagation Flow: -// 1. OpenTelemetry creates trace_id and span_id for spans -// 2. telemetry.StartSpan() enriches context with trace IDs for logging -// 3. Sentinel adds decision context (decision_reason, topic, subset) to context -// 4. Context flows through to broker operations via logger.WithXXX() calls -// 5. 
All log entries contain both OpenTelemetry and Sentinel context fields -// -// Validated Log Fields: -// -// OpenTelemetry fields: -// - trace_id: Distributed trace identifier from active span -// - span_id: Current span identifier from active span -// -// Sentinel-specific fields: -// - decision_reason: Why the event was triggered (e.g., "max age exceeded", "generation changed") -// - topic: Message broker topic where event is published -// - subset: Resource type being monitored (e.g., "clusters") -// - component: Always "sentinel" for consistent log attribution -// -// Test Approach: -// - Captures structured JSON logs to a buffer for analysis -// - Uses real RabbitMQ broker to generate authentic broker operation logs -// - Mock server returns resources that trigger multiple event types -// - Parses JSON log entries to verify required context fields are present -// -// Success Criteria: -// - Sentinel's "Published event" logs contain all OpenTelemetry and context fields -// - Broker operation logs inherit Sentinel's component and context -// - No context fields are lost during the broker publishing workflow -// -// This test ensures distributed tracing correlation and structured logging work correctly across the Sentinel → Broker boundary for observability. 
+// TestIntegration_BrokerLoggerContext validates context propagation through logging func TestIntegration_BrokerLoggerContext(t *testing.T) { - const SENTINEL_COMPONENT = "sentinel" - const TEST_VERSION = "test" - const TEST_HOST = "testhost" - const TEST_TOPIC = "test-topic" + const sentinelComponent = "sentinel" + const testVersion = "test" + const testHost = "testhost" + const testTopic = "test-topic" - // Buffer to observe logs var logBuffer bytes.Buffer now := time.Now() var callCount atomic.Int32 @@ -517,7 +391,6 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - // Set OTLP sampler err := os.Setenv("OTEL_TRACES_SAMPLER", "always_on") if err != nil { t.Errorf("Failed to set OTEL_TRACES_SAMPLER: %v", err) @@ -529,58 +402,41 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { } }() - // Setup OpenTelemetry for integration test tp, err := telemetry.InitTraceProvider(ctx, "sentinel", "test") if err != nil { t.Fatalf("Failed to initialize OpenTelemetry: %v", err) } defer telemetry.Shutdown(context.Background(), tp) - // Get globalConfig and assign multiWriter to observe output logs globalConfig := logger.GetGlobalConfig() multiWriter := io.MultiWriter(globalConfig.Output, &logBuffer) helper := NewHelper() - cfg := &logger.LogConfig{ + logCfg := &logger.LogConfig{ Level: logger.LevelInfo, - Format: logger.FormatJSON, // JSON for easy parsing + Format: logger.FormatJSON, Output: multiWriter, - Component: SENTINEL_COMPONENT, - Version: TEST_VERSION, - Hostname: TEST_HOST, + Component: sentinelComponent, + Version: testVersion, + Hostname: testHost, OTel: logger.OTelConfig{ Enabled: true, }, } - log := logger.NewHyperFleetLoggerWithConfig(cfg) + log := logger.NewHyperFleetLoggerWithConfig(logCfg) - // Mock server returns clusters that will trigger event publishing - // With condition-based queries, each poll cycle makes 2 queries + // Single query mock - returns 
resources that trigger events server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - search := r.URL.Query().Get("search") - - var clusters []map[string]interface{} - switch { - case strings.Contains(search, "Ready='False'"): - if callCount.Add(1) >= 2 { - select { - case readyChan <- true: - default: - } - } - // Not-ready query: return not-ready cluster - clusters = []map[string]interface{}{ - createMockCluster("cluster-not-ready", 1, 1, false, now.Add(-15*time.Second)), - } - case strings.Contains(search, "Ready='True'"): - // Stale query: return stale ready cluster - clusters = []map[string]interface{}{ - createMockCluster("cluster-old", 2, 2, true, now.Add(-35*time.Minute)), + if callCount.Add(1) >= 2 { + select { + case readyChan <- true: + default: } - default: - t.Errorf("unexpected search query: %q", search) - w.WriteHeader(http.StatusBadRequest) - return + } + + clusters := []map[string]interface{}{ + createMockCluster("cluster-not-ready", 2, 2, false, now.Add(-15*time.Second)), + createMockCluster("cluster-old", 2, 2, true, now.Add(-35*time.Minute)), } response := createMockClusterList(clusters) w.Header().Set("Content-Type", "application/json") @@ -589,31 +445,20 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { defer server.Close() hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second) - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) - - sentinelConfig := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: TEST_TOPIC, - PollInterval: 100 * time.Millisecond, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - ResourceSelector: []config.LabelSelector{ - {Label: "region", Value: "us-east"}, - {Label: "env", Value: "production"}, - }, - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, + decisionEngine := newTestDecisionEngine(t) + + sentinelConfig := newTestSentinelConfig() + 
sentinelConfig.Topic = testTopic + sentinelConfig.ResourceSelector = []config.LabelSelector{ + {Label: "region", Value: "us-east"}, + {Label: "env", Value: "production"}, } - // Create Sentinel with our logger and real RabbitMQ broker s, err := sentinel.NewSentinel(sentinelConfig, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Run Sentinel errChan := make(chan error, 1) go func() { errChan <- s.Start(ctx) @@ -627,7 +472,6 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { } cancel() - // Wait for Sentinel to stop select { case err := <-errChan: if err != nil && !errors.Is(err, context.Canceled) { @@ -637,7 +481,6 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { t.Fatal("Sentinel did not stop within timeout") } - // Analyze logs logOutput := logBuffer.String() t.Logf("Captured log output:\n%s", logOutput) @@ -660,12 +503,10 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { msg, hasMsg := entry["message"].(string) component, hasComponent := entry["component"].(string) - // Look for Sentinel's own event publishing logs - if hasMsg && hasComponent && component == SENTINEL_COMPONENT && + if hasMsg && hasComponent && component == sentinelComponent && strings.Contains(msg, "Published event") { foundSentinelEventLog = true - // Verify Sentinel context fields are present if entry["decision_reason"] == nil { t.Errorf("Sentinel event log missing decision_reason: %v", entry) } @@ -675,7 +516,6 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { if entry["subset"] == nil { t.Errorf("Sentinel event log missing subset: %v", entry) } - if entry["trace_id"] == nil { t.Errorf("Sentinel event log missing trace_id: %v", entry) } @@ -687,29 +527,19 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { entry["decision_reason"], entry["topic"], entry["subset"]) } - // Look for broker operation logs (these should inherit Sentinel context) - if hasMsg && 
hasComponent && component == SENTINEL_COMPONENT && + if hasMsg && hasComponent && component == sentinelComponent && (strings.Contains(msg, "broker") || strings.Contains(msg, "publish") || strings.Contains(msg, "Creating publisher") || strings.Contains(msg, "publisher initialized")) { foundBrokerOperationLog = true - // Broker operations should have Sentinel context - if entry["component"] != SENTINEL_COMPONENT { - t.Errorf("Broker operation log missing component=%s: %v", SENTINEL_COMPONENT, entry) - } - - if entry["version"] != TEST_VERSION { - t.Errorf("Broker operation log missing version=%s: %v", TEST_VERSION, entry) + if entry["component"] != sentinelComponent { + t.Errorf("Broker operation log missing component=%s: %v", sentinelComponent, entry) } - - if entry["hostname"] != TEST_HOST { - t.Errorf("Broker operation log missing hostname=%s: %v", TEST_HOST, entry) + if entry["version"] != testVersion { + t.Errorf("Broker operation log missing version=%s: %v", testVersion, entry) } - - // Check for context inheritance (these fields should be present if context flowed through) - if entry["decision_reason"] != nil || entry["topic"] != nil || entry["subset"] != nil { - t.Logf("Broker operation inherits Sentinel context: decision_reason=%v topic=%v subset=%v", - entry["decision_reason"], entry["topic"], entry["subset"]) + if entry["hostname"] != testHost { + t.Errorf("Broker operation log missing hostname=%s: %v", testHost, entry) } t.Logf("Found broker operation log with component=sentinel: %s", msg) @@ -724,41 +554,16 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) { t.Error("No broker operation logs found - broker may not be logging") } - // Success criteria: Both Sentinel and broker logs should use component=sentinel if foundSentinelEventLog && foundBrokerOperationLog { t.Log("SUCCESS: Logger context inheritance verified - both Sentinel and broker operations log with component=sentinel") } } -// TestIntegration_EndToEndSpanHierarchy validates the 
complete OpenTelemetry span hierarchy created during Sentinel's polling and event publishing workflow. -// -// Expected Span Hierarchy: -// -// sentinel.poll (root span - one per polling cycle) -// ├── HTTP GET (API call to fetch resources) -// ├── sentinel.evaluate (one per resource evaluated) -// │ └── {topic} publish (one per event published) -// ├── sentinel.evaluate (next resource) -// │ └── {topic} publish -// └── ... -// -// The test validates: -// 1. Required spans are created: sentinel.poll, sentinel.evaluate, {topic} publish -// 2. Parent-child relationships: evaluate HTTP spans are children of poll spans, publish spans are children of evaluate spans -// 3. Multiple spans: One evaluate/publish span per resource that triggers an event -// 4. Trace continuity: All spans within a poll cycle belong to the same trace -// -// Test Setup: -// - Uses in-memory OpenTelemetry exporter to capture and analyze spans -// - Mock server returns 3 test resources that will trigger events -// - Real RabbitMQ broker for realistic message publishing -// -// Note: The test may capture multiple poll cycles. Due to context cancellation timing, only 2 poll cycles are validated +// TestIntegration_EndToEndSpanHierarchy validates the complete OpenTelemetry span hierarchy func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - // Setup in-memory trace exporter to capture spans exporter := tracetest.NewInMemoryExporter() tp := trace.NewTracerProvider( trace.WithSampler(trace.AlwaysSample()), @@ -779,36 +584,20 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { var callCount atomic.Int32 readyChan := make(chan bool, 1) - // Mock server that routes responses based on condition-based search parameters. - // Each poll cycle makes 2 queries: not-ready (Ready='False') + stale (Ready='True'). 
+ // Single query mock - returns resources that trigger different decision paths server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - callCount.Add(1) - // 2 queries per poll cycle; signal after 2 complete cycles (4+ queries) - if callCount.Load() > 4 { + if callCount.Add(1) > 2 { select { case readyChan <- true: default: } } - search := r.URL.Query().Get("search") - var clusters []map[string]interface{} - switch { - case strings.Contains(search, "Ready='False'"): - clusters = []map[string]interface{}{ - // Triggers max_age_not_ready exceeded - createMockCluster("cluster-not-ready-old", 1, 1, false, now.Add(-15*time.Second)), - } - case strings.Contains(search, "Ready='True'"): - clusters = []map[string]interface{}{ - // Triggers max_age_ready exceeded - createMockCluster("cluster-ready-old", 2, 2, true, now.Add(-35*time.Minute)), - // Triggers generation mismatch - createMockCluster("cluster-generation-mismatch", 5, 3, true, now.Add(-5*time.Minute)), - } - default: - w.WriteHeader(http.StatusBadRequest) - return + clusters := []map[string]interface{}{ + // Triggers not_ready_and_debounced + createMockCluster("cluster-not-ready-old", 2, 2, false, now.Add(-15*time.Second)), + // Triggers ready_and_stale + createMockCluster("cluster-ready-old", 2, 2, true, now.Add(-35*time.Minute)), } response := createMockClusterList(clusters) w.Header().Set("Content-Type", "application/json") @@ -816,45 +605,32 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { })) defer server.Close() - // Get shared RabbitMQ from helper helper := NewHelper() - // Setup components hyperfleetClient, clientErr := client.NewHyperFleetClient(server.URL, 10*time.Second) if clientErr != nil { t.Fatalf("failed to create HyperFleet client: %v", clientErr) } - decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute) + decisionEngine := newTestDecisionEngine(t) log := logger.NewHyperFleetLogger() registry := prometheus.NewRegistry() 
metrics.NewSentinelMetrics(registry, "test") - cfg := &config.SentinelConfig{ - ResourceType: "clusters", - Topic: "test-spans-topic", - PollInterval: 100 * time.Millisecond, - MaxAgeNotReady: 10 * time.Second, - MaxAgeReady: 30 * time.Minute, - MessagingSystem: "rabbitmq", - MessageData: map[string]interface{}{ - "id": "resource.id", - "kind": "resource.kind", - }, - } + cfg := newTestSentinelConfig() + cfg.Topic = "test-spans-topic" + cfg.MessagingSystem = "rabbitmq" s, err := sentinel.NewSentinel(cfg, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log) if err != nil { t.Fatalf("NewSentinel failed: %v", err) } - // Run Sentinel to generate spans errChan := make(chan error, 1) go func() { errChan <- s.Start(ctx) }() - // Wait for at least 2 polling cycles select { case <-readyChan: t.Log("Sentinel completed required polling cycles") @@ -863,7 +639,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { } cancel() - // Wait for Sentinel to stop select { case err := <-errChan: if err != nil && !errors.Is(err, context.Canceled) { @@ -873,7 +648,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { t.Fatal("Sentinel did not stop within timeout") } - // Force flush spans to exporter flushCtx, flushCancel := context.WithTimeout(context.Background(), 5*time.Second) defer flushCancel() @@ -882,11 +656,9 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { t.Fatalf("force flush error: %v", err) } - // Analyze captured spans spans := exporter.GetSpans() t.Logf("Captured %d spans", len(spans)) - // Build span maps for analysis spansByName := make(map[string][]tracetest.SpanStub) spansByID := make(map[string]tracetest.SpanStub) spansByParentID := make(map[string][]tracetest.SpanStub) @@ -901,7 +673,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { } } - // Print span hierarchy for debugging t.Log("Span hierarchy:") for _, span := range spans { parentInfo := "ROOT" @@ -911,7 +682,6 @@ func 
TestIntegration_EndToEndSpanHierarchy(t *testing.T) { t.Logf(" %s (%s) - %s", span.Name, span.SpanContext.SpanID().String()[:8], parentInfo) } - // Validate required spans exist requiredSpans := []string{ "sentinel.poll", "sentinel.evaluate", @@ -924,7 +694,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { } } - // Validate span hierarchy structure pollSpans := spansByName["sentinel.poll"] if len(pollSpans) == 0 { t.Fatal("No sentinel.poll spans found") @@ -935,25 +704,19 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { pollSpansToValidate = pollSpansToValidate[:2] } - // For each poll span, validate it has the expected children, evaluate only the first two poll spans for _, pollSpan := range pollSpansToValidate { pollSpanID := pollSpan.SpanContext.SpanID().String() directChildren := spansByParentID[pollSpanID] t.Logf("Poll span %s has %d direct children", pollSpanID[:8], len(directChildren)) - // Verify poll span has evaluate children (direct) hasEvaluateChild := false - hasHTTPChild := false evaluateChildCount := 0 for _, child := range directChildren { - switch { - case child.Name == "sentinel.evaluate": + if child.Name == "sentinel.evaluate" { hasEvaluateChild = true evaluateChildCount++ - case strings.Contains(child.Name, "HTTP"): - hasHTTPChild = true } } @@ -962,7 +725,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { continue } - // Verify each evaluate span has publish grandchildren publishGrandchildCount := 0 for _, child := range directChildren { if child.Name == "sentinel.evaluate" { @@ -976,26 +738,23 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { } } - t.Logf("Poll span %s: evaluate children=%d, publish grandchildren=%d, http=%t", - pollSpanID[:8], evaluateChildCount, publishGrandchildCount, hasHTTPChild) + t.Logf("Poll span %s: evaluate children=%d, publish grandchildren=%d", + pollSpanID[:8], evaluateChildCount, publishGrandchildCount) - // Only validate publish grandchildren if this poll span 
actually processed events if evaluateChildCount > 0 && publishGrandchildCount == 0 { t.Errorf("Poll span %s has %d evaluate children but no publish grandchildren", pollSpanID[:8], evaluateChildCount) } } - // Validate we have multiple evaluation spans (one per resource) evaluateSpans := spansByName["sentinel.evaluate"] - if len(evaluateSpans) < 3 { - t.Errorf("Expected at least 3 sentinel.evaluate spans (one per test resource), got %d", len(evaluateSpans)) + if len(evaluateSpans) < 2 { + t.Errorf("Expected at least 2 sentinel.evaluate spans (one per test resource), got %d", len(evaluateSpans)) } - // Validate we have multiple publish spans (one per event) publishSpans := spansByName["test-spans-topic publish"] - if len(publishSpans) < 3 { - t.Errorf("Expected at least 3 publish spans (one per test event), got %d", len(publishSpans)) + if len(publishSpans) < 2 { + t.Errorf("Expected at least 2 publish spans (one per test event), got %d", len(publishSpans)) } validateSpanAttribute(t, publishSpans, "test-spans-topic publish", "messaging.system", cfg.MessagingSystem) @@ -1019,7 +778,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { } } - // Verify trace continuity - spans should form coherent traces traceIDs := make(map[string]bool) for _, span := range spans { traceIDs[span.SpanContext.TraceID().String()] = true @@ -1030,7 +788,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) { } } -// Helper function for span name extraction func getSpanNames(spans []tracetest.SpanStub) []string { names := make([]string, len(spans)) for i, span := range spans {