diff --git a/Makefile b/Makefile
index ae5918b..254a4c3 100755
--- a/Makefile
+++ b/Makefile
@@ -249,8 +249,7 @@ test-helm: ## Test Helm charts (lint, template, validate)
--set image.repository=openshift-hyperfleet/hyperfleet-sentinel \
--set image.tag=latest \
--set config.resourceType=nodepools \
- --set config.pollInterval=10s \
- --set config.maxAgeReady=1h > /dev/null
+ --set config.pollInterval=10s > /dev/null
@echo "Custom resource selector template OK"
@echo ""
@echo "All Helm chart tests passed!"
diff --git a/README.md b/README.md
index 9923c4c..5027d78 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# hyperfleet-sentinel
-HyperFleet Sentinel Service - Kubernetes service that polls HyperFleet API, makes orchestration decisions, and publishes events. Features configurable max age intervals, horizontal sharding via SentinelConfig CRD, and broker abstraction (GCP Pub/Sub, RabbitMQ, Stub). Centralized reconciliation logic.
+HyperFleet Sentinel Service - Kubernetes service that polls HyperFleet API, makes orchestration decisions, and publishes events. Features configurable CEL-based decision logic, horizontal sharding via SentinelConfig CRD, and broker abstraction (GCP Pub/Sub, RabbitMQ, Stub). Centralized reconciliation logic.
## Development Setup
@@ -125,8 +125,7 @@ Create a configuration file based on the examples in the `configs/` directory:
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `poll_interval` | duration | `5s` | How often to poll the API for resource updates |
-| `max_age_not_ready` | duration | `10s` | Max age interval for resources not ready |
-| `max_age_ready` | duration | `30m` | Max age interval for ready resources |
+| `message_decision` | object | See below | CEL-based decision logic (params + result) |
| `hyperfleet_api.timeout` | duration | `5s` | Request timeout for API calls |
| `resource_selector` | array | `[]` | Label selectors for filtering resources (enables sharding) |
| `message_data` | map | `{}` | Template fields for CloudEvents data payload |
@@ -231,8 +230,17 @@ This uses all defaults. Broker configuration is managed via `broker.yaml` or env
```yaml
resource_type: clusters
poll_interval: 5s
-max_age_not_ready: 10s
-max_age_ready: 30m
+
+message_decision:
+ params:
+ ref_time: 'condition("Ready").last_updated_time'
+ is_ready: 'condition("Ready").status == "True"'
+ has_ref_time: 'ref_time != ""'
+ is_new_resource: '!is_ready && resource.generation == 1'
+ generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+ ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+ not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+ result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
resource_selector:
- label: shard
diff --git a/charts/README.md b/charts/README.md
index af794aa..f5aacf6 100644
--- a/charts/README.md
+++ b/charts/README.md
@@ -80,8 +80,7 @@ The following table lists the configurable parameters of the Sentinel chart and
|-----------|-------------|---------|
| `config.resourceType` | Resource type to watch | `clusters` |
| `config.pollInterval` | Polling interval | `5s` |
-| `config.maxAgeNotReady` | Max age for not ready resources | `10s` |
-| `config.maxAgeReady` | Max age for ready resources | `30m` |
+| `config.messageDecision` | CEL-based decision logic (params + result) | See values.yaml |
| `config.resourceSelector` | Resource selector for sharding | See values.yaml |
| `config.hyperfleetApi.baseUrl` | HyperFleet API base URL | `http://hyperfleet-api:8000` |
| `config.hyperfleetApi.timeout` | API timeout | `5s` |
diff --git a/charts/templates/configmap.yaml b/charts/templates/configmap.yaml
index 5efa0a3..3c3b356 100644
--- a/charts/templates/configmap.yaml
+++ b/charts/templates/configmap.yaml
@@ -9,8 +9,6 @@ data:
# Sentinel configuration
resource_type: {{ .Values.config.resourceType }}
poll_interval: {{ .Values.config.pollInterval }}
- max_age_not_ready: {{ .Values.config.maxAgeNotReady }}
- max_age_ready: {{ .Values.config.maxAgeReady }}
{{- if .Values.config.resourceSelector }}
# Resource selector for horizontal sharding
@@ -26,6 +24,18 @@ data:
endpoint: {{ .Values.config.hyperfleetApi.baseUrl }}
timeout: {{ .Values.config.hyperfleetApi.timeout }}
+ {{- if .Values.config.messageDecision }}
+ # Configurable CEL-based decision logic
+ message_decision:
+ {{- if .Values.config.messageDecision.params }}
+ params:
+ {{- range $name, $expr := .Values.config.messageDecision.params }}
+ {{ $name }}: {{ $expr | quote }}
+ {{- end }}
+ {{- end }}
+ result: {{ .Values.config.messageDecision.result | quote }}
+ {{- end }}
+
{{- if .Values.config.messageData }}
# CloudEvents data payload configuration
message_data:
diff --git a/charts/values.yaml b/charts/values.yaml
index 495a6d0..de639a9 100644
--- a/charts/values.yaml
+++ b/charts/values.yaml
@@ -84,11 +84,19 @@ config:
# How often to poll the API for resource updates
pollInterval: 5s
- # Max age interval for resources not ready
- maxAgeNotReady: 10s
-
- # Max age interval for ready resources
- maxAgeReady: 30m
+ # Configurable CEL-based decision logic
+ # Params are named CEL expressions evaluated in dependency order.
+ # Result is a CEL boolean expression that determines whether to publish an event.
+ messageDecision:
+ params:
+ ref_time: 'condition("Ready").last_updated_time'
+ is_ready: 'condition("Ready").status == "True"'
+ has_ref_time: 'ref_time != ""'
+ is_new_resource: '!is_ready && resource.generation == 1'
+ generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+ ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+ not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+ result: 'is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced'
# Resource selector for horizontal sharding
# Deploy multiple Sentinel instances with different shard values
diff --git a/cmd/sentinel/main.go b/cmd/sentinel/main.go
index 3728f94..44055d7 100755
--- a/cmd/sentinel/main.go
+++ b/cmd/sentinel/main.go
@@ -217,7 +217,11 @@ func runServe(
}
log.Info(ctx, "Initialized HyperFleet client")
- decisionEngine := engine.NewDecisionEngine(cfg.MaxAgeNotReady, cfg.MaxAgeReady)
+ decisionEngine, err := engine.NewDecisionEngine(cfg.MessageDecision)
+ if err != nil {
+ log.Errorf(ctx, "Failed to create decision engine: %v", err)
+ return fmt.Errorf("failed to create decision engine: %w", err)
+ }
// Initialize broker metrics recorder
// Broker metrics (messages_published_total, errors_total, etc.) are registered
diff --git a/configs/dev-example.yaml b/configs/dev-example.yaml
index cdccca0..40ac0cd 100644
--- a/configs/dev-example.yaml
+++ b/configs/dev-example.yaml
@@ -22,9 +22,18 @@ resource_type: clusters
# Faster polling for development - see changes quickly.
poll_interval: 2s
-# Shorter max age intervals for development.
-max_age_not_ready: 5s
-max_age_ready: 2m
+# Decision logic for publishing events (CEL expressions).
+# Shorter debounce intervals for development.
+message_decision:
+ params:
+ ref_time: 'condition("Ready").last_updated_time'
+ is_ready: 'condition("Ready").status == "True"'
+ has_ref_time: 'ref_time != ""'
+ is_new_resource: '!is_ready && resource.generation == 1'
+ generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+ ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("2m")'
+ not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("5s")'
+ result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
# Messaging system type for OpenTelemetry tracing attributes.
# Should match the actual message broker being used for accurate observability.
@@ -51,11 +60,11 @@ hyperfleet_api:
#
# Available CEL variables:
# resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...)
-# reason — the decision reason string (e.g. "max_age_exceeded")
+# reason — the decision reason string (e.g. "message decision matched")
#
# Examples:
# field: "resource.id" — field access
-# field: 'resource.status.ready ? "Ready" : "NotReady"' — conditional
+# field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"' — conditional
# field: '"constant"' — CEL string literal
# field: "reason" — decision reason
#
diff --git a/configs/gcp-pubsub-example.yaml b/configs/gcp-pubsub-example.yaml
index 6f957a3..0a81d71 100644
--- a/configs/gcp-pubsub-example.yaml
+++ b/configs/gcp-pubsub-example.yaml
@@ -18,13 +18,18 @@ resource_type: clusters
# Accepts Go duration format: ns, us/µs, ms, s, m, h.
poll_interval: 5s
-# Max age interval for resources that are not ready.
-# Resources in transitional states are re-checked more frequently.
-max_age_not_ready: 10s
-
-# Max age interval for resources that are ready and stable.
-# Stable resources are checked less frequently to reduce API load.
-max_age_ready: 30m
+# Decision logic for publishing events (CEL expressions).
+# Uses condition-based checks with configurable thresholds.
+message_decision:
+ params:
+ ref_time: 'condition("Ready").last_updated_time'
+ is_ready: 'condition("Ready").status == "True"'
+ has_ref_time: 'ref_time != ""'
+ is_new_resource: '!is_ready && resource.generation == 1'
+ generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+ ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+ not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+ result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
# Messaging system type for OpenTelemetry tracing attributes.
# Should match the actual message broker being used for accurate observability.
@@ -57,11 +62,11 @@ hyperfleet_api:
#
# Available CEL variables:
# resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...)
-# reason — the decision reason string (e.g. "max_age_exceeded")
+# reason — the decision reason string (e.g. "message decision matched")
#
# Examples:
# field: "resource.id" — field access
-# field: 'resource.status.ready ? "Ready" : "NotReady"' — conditional
+# field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"' — conditional
# field: '"constant"' — CEL string literal
# field: "reason" — decision reason
#
diff --git a/configs/rabbitmq-example.yaml b/configs/rabbitmq-example.yaml
index ff50c3a..e085d5a 100644
--- a/configs/rabbitmq-example.yaml
+++ b/configs/rabbitmq-example.yaml
@@ -18,13 +18,18 @@ resource_type: clusters
# Accepts Go duration format: ns, us/µs, ms, s, m, h.
poll_interval: 5s
-# Max age interval for resources that are not ready.
-# Resources in transitional states are re-checked more frequently.
-max_age_not_ready: 10s
-
-# Max age interval for resources that are ready and stable.
-# Stable resources are checked less frequently to reduce API load.
-max_age_ready: 30m
+# Decision logic for publishing events (CEL expressions).
+# Uses condition-based checks with configurable thresholds.
+message_decision:
+ params:
+ ref_time: 'condition("Ready").last_updated_time'
+ is_ready: 'condition("Ready").status == "True"'
+ has_ref_time: 'ref_time != ""'
+ is_new_resource: '!is_ready && resource.generation == 1'
+ generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+ ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+ not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+ result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
# Messaging system type for OpenTelemetry tracing attributes.
# Should match the actual message broker being used for accurate observability.
@@ -57,11 +62,11 @@ hyperfleet_api:
#
# Available CEL variables:
# resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...)
-# reason — the decision reason string (e.g. "max_age_exceeded")
+# reason — the decision reason string (e.g. "message decision matched")
#
# Examples:
# field: "resource.id" — field access
-# field: 'resource.status.ready ? "Ready" : "NotReady"' — conditional
+# field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"' — conditional
# field: '"constant"' — CEL string literal
# field: "reason" — decision reason
#
diff --git a/docs/alerts.md b/docs/alerts.md
index 12ae298..3621a11 100644
--- a/docs/alerts.md
+++ b/docs/alerts.md
@@ -156,11 +156,11 @@ labels:
component: sentinel
annotations:
summary: "High resource skip ratio in Sentinel"
- description: "{{ $value | humanizePercentage }} of resources are being skipped. This may indicate max_age configuration issues."
+ description: "{{ $value | humanizePercentage }} of resources are being skipped. This may indicate message_decision configuration issues."
```
-**Impact**: May indicate max age intervals too long or adapter status update issues.
+**Impact**: May indicate decision thresholds too restrictive or adapter status update issues.
-**Response**: Review max age configuration and adapter health.
+**Response**: Review message_decision configuration and adapter health.
---
diff --git a/docs/metrics.md b/docs/metrics.md
index 12bbadf..f6475cc 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -60,7 +60,7 @@ sum by (resource_type) (hyperfleet_sentinel_pending_resources)
**Labels:**
- `resource_type`: Type of resource
- `resource_selector`: Label selector
-- `reason`: Reason for publishing the event (e.g., `max_age_exceeded`, `generation_mismatch`)
+- `reason`: Reason for publishing the event (e.g., `message decision matched`)
**Use Cases:**
- Monitor event publishing rate
@@ -87,7 +87,7 @@ sum by (reason) (rate(hyperfleet_sentinel_events_published_total[5m]))
**Labels:**
- `resource_type`: Type of resource
- `resource_selector`: Label selector
-- `reason`: Reason for skipping (e.g., `within_max_age`, `generation_match`)
+- `reason`: Reason for skipping (e.g., `message decision result is false`)
**Use Cases:**
- Monitor decision engine effectiveness
diff --git a/docs/multi-instance-deployment.md b/docs/multi-instance-deployment.md
index 3cf4147..9f19160 100644
--- a/docs/multi-instance-deployment.md
+++ b/docs/multi-instance-deployment.md
@@ -206,7 +206,7 @@ Memory: 64Mi + (5 × 32Mi) + 16Mi = 240Mi
│ resource_selector: │ │ resource_selector: │ │ resource_selector: │
│ - label: region │ │ - label: region │ │ - label: region │
│ value: us-east │ │ value: us-west │ │ value: eu-west │
-│ max_age_ready=30m │ │ max_age_ready=1h │ │ max_age_ready=45m │
+│ ready_stale=30m │ │ ready_stale=1h │ │ ready_stale=45m │
└──────────┬──────────┘ └──────────┬──────────┘ └──────────┬──────────┘
│ │ │
│ ▼ │
diff --git a/docs/running-sentinel.md b/docs/running-sentinel.md
index e4ba30b..5e346f7 100644
--- a/docs/running-sentinel.md
+++ b/docs/running-sentinel.md
@@ -533,8 +533,6 @@ gcloud projects remove-iam-policy-binding ${GCP_PROJECT} \
# sentinel-config.yaml
resource_type: clusters
poll_interval: 5s
-max_age_not_ready: 10s
-max_age_ready: 30m
# Watch all clusters (no filtering)
resource_selector: []
@@ -556,8 +554,6 @@ message_data:
# sentinel-dev-config.yaml
resource_type: clusters
poll_interval: 10s # Slower polling for dev
-max_age_not_ready: 30s # Longer intervals for dev
-max_age_ready: 2h
resource_selector:
- label: environment
diff --git a/docs/sentinel-operator-guide.md b/docs/sentinel-operator-guide.md
index 817b16e..d85ea2d 100644
--- a/docs/sentinel-operator-guide.md
+++ b/docs/sentinel-operator-guide.md
@@ -142,45 +142,47 @@ Time-based reconciliation ensures **eventual consistency** by publishing events
**How It Works:**
-Sentinel uses two configurable max age intervals based on the resource's status (`Ready` condition):
+Sentinel uses a configurable CEL-based decision engine (`message_decision`) that evaluates named parameters and a boolean result expression:
-| Resource State | Default Interval | Rationale |
-|----------------|------------------|----------------------------------------------------------------------------------------|
-| **Not Ready** (`Ready` condition status = False) | `10s` | Faster reconciliation for transitional states requires more frequent checks to complete quickly |
-| **Ready** (`Ready` condition status = True) | `30m` | Lower frequency for drift detection on stable resources to avoid excessive load |
+| Default Check | CEL Expression | Rationale |
+|---------------|---------------|-----------|
+| **New resource** | `!is_ready && resource.generation == 1` | Always publish for never-processed resources |
+| **Generation mismatch** | `resource.generation > condition("Ready").observed_generation` | Immediate reconciliation when spec changes |
+| **Ready and stale** | `is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")` | Lower frequency drift detection on stable resources |
+| **Not ready and debounced** | `!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")` | Faster reconciliation for transitional states |
**Decision Logic:**
-At this point, we know `status.last_updated` is not zero, so we can use it as the reference timestamp:
+The `condition("Ready")` CEL function accesses the resource's status conditions. Named params are evaluated in dependency order (topological sort):
-1. Determine max age interval based on ready status:
- - If resource is ready (`Ready` condition status == True) → use `max_age_ready` (default: 30m)
- - If resource is not ready (`Ready` condition status == False) → use `max_age_not_ready` (default: 10s)
+1. Extract reference time: `ref_time = condition("Ready").last_updated_time`
+2. Determine readiness: `is_ready = condition("Ready").status == "True"`
+3. Guard against missing conditions: `has_ref_time = ref_time != ""`
+4. Evaluate checks: `is_new_resource`, `generation_mismatch`, `ready_and_stale`, `not_ready_and_debounced`
+5. Result: `is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced`
-2. Calculate next event time:
- - `next_event = last_updated + max_age_interval`
-
-3. Compare with current time:
- - If `now >= next_event` → **Publish event** (reason: "max age exceeded")
- - Otherwise → **Skip** (reason: "max age not exceeded")
+If the result is `true` → **Publish event** (reason: "message decision matched")
+Otherwise → **Skip** (reason: "message decision result is false")
#### 2.1.4 Complete Decision Flow
-The three reconciliation checks work together in priority order to determine when to publish events:
+The four reconciliation checks work together in priority order to determine when to publish events:
```mermaid
graph TD
- START[Poll Resource] --> CHECK1{last_updated == zero?}
- CHECK1 -->|Yes| PUB1[Publish: never processed]
- CHECK1 -->|No| CHECK2{Generation > ObservedGeneration?}
- CHECK2 -->|Yes| PUB2[Publish: generation changed]
- CHECK2 -->|No| CHECK3{Resource Ready?}
- CHECK3 -->|Yes| AGE1[Max Age = 30m]
- CHECK3 -->|No| AGE2[Max Age = 10s]
- AGE1 --> CHECK4{now >= last_updated + max_age?}
- AGE2 --> CHECK4
- CHECK4 -->|Yes| PUB3[Publish: max age exceeded]
- CHECK4 -->|No| SKIP[Skip: within max age]
+ START[Poll Resource] --> CHECK1{is_new_resource?<br/>gen==1 && !ready}
+ CHECK1 -->|Yes| PUB1[Publish: new resource]
+ CHECK1 -->|No| CHECK1B{generation_mismatch?<br/>gen > observed_gen}
+ CHECK1B -->|Yes| PUB1B[Publish: spec changed]
+ CHECK1B -->|No| CHECK2{has_ref_time?}
+ CHECK2 -->|No| SKIP[Skip: no condition data]
+ CHECK2 -->|Yes| CHECK3{is_ready?}
+ CHECK3 -->|Yes| CHECK4{"now - ref_time > 30m?"}
+ CHECK3 -->|No| CHECK5{"now - ref_time > 10s?"}
+ CHECK4 -->|Yes| PUB2[Publish: ready and stale]
+ CHECK4 -->|No| SKIP2[Skip: within threshold]
+ CHECK5 -->|Yes| PUB3[Publish: not ready, debounced]
+ CHECK5 -->|No| SKIP2
```
**Key Takeaways:**
@@ -189,6 +191,8 @@ graph TD
- **State changes override max age** - Spec changes don't wait for intervals
- **Max age is the fallback** - Ensures eventual consistency when nothing else triggers
+> **Scalability note:** The CEL decision engine evaluates all resources matching the label selector in-memory each poll cycle. At large scale (thousands of resources), use `resource_selector` to shard resources across multiple Sentinel instances. A future `server_filters` config field is planned to allow server-side pre-filtering before CEL evaluation.
+
### 2.2 Resource Filtering
Resource filtering enables **horizontal scaling** by allowing operators to distribute resources across multiple Sentinel instances using label-based selectors.
@@ -284,10 +288,8 @@ Sentinel uses YAML-based configuration with environment variable overrides for s
# Required: Resource type to watch
resource_type: clusters
-# Optional: Polling and age intervals
+# Optional: Polling interval
poll_interval: 5s
-max_age_not_ready: 10s
-max_age_ready: 30m
# Optional: Resource filtering
resource_selector:
@@ -329,8 +331,7 @@ These fields have sensible defaults and can be omitted:
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `poll_interval` | duration | `5s` | How often to poll the API for resource updates |
-| `max_age_not_ready` | duration | `10s` | Max age interval for not-ready resources |
-| `max_age_ready` | duration | `30m` | Max age interval for ready resources |
+| `message_decision` | object | See defaults | CEL-based decision logic (params + result expression) |
| `hyperfleet_api.timeout` | duration | `5s` | Request timeout for API calls |
| `resource_selector` | array | `[]` | Label selectors for filtering (empty = all resources) |
| `message_data` | map | `{}` | CEL expressions for CloudEvents payload |
@@ -385,18 +386,14 @@ message_data:
# Nested field access
cluster_id: "resource.owner_references.id"
- # Conditionals (ternary operator)
- ready_status: 'resource.status.ready ? "Ready" : "NotReady"'
+ # Conditionals (ternary operator) — use condition() function to access status conditions
+ ready_status: 'condition("Ready").status == "True" ? "Ready" : "NotReady"'
# String literals (must use quotes inside CEL expression)
source: '"hyperfleet-sentinel"'
# Numeric/boolean literals
generation: "resource.generation"
- is_ready: "resource.status.ready"
-
- # Complex conditionals with CEL filter
- ready_condition: 'resource.status.conditions.filter(c, c.type=="Ready")[0].value == "True" ? "Ready" : "NotReady"'
# Nested objects
owner_references: "resource.owner_references"
@@ -572,10 +569,9 @@ Follow this checklist to ensure successful Sentinel deployment and operation.
**Configure Reconciliation Parameters**
-- [ ] Review and adjust polling intervals:
+- [ ] Review and adjust polling and decision parameters:
- `poll_interval` - How often to poll the HyperFleet API (default: `5s`)
- - `max_age_not_ready` - Reconciliation interval for not-ready resources (default: `10s`)
- - `max_age_ready` - Reconciliation interval for ready resources (default: `30m`)
+ - `message_decision` - CEL-based decision logic with configurable thresholds
- Reference: [Optional Fields](#optional-fields)
**Design CloudEvents Payload**
@@ -692,7 +688,7 @@ For detailed deployment guidance, see [docs/running-sentinel.md](running-sentine
| Symptom | Likely Cause | Solution |
|------------------------------------------------------------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Events not published, resources not found** | Resource selector mismatch | Verify `resource_selector` matches resource labels. Empty selector watches ALL resources. Check logs: `kubectl logs -n hyperfleet-system -l app.kubernetes.io/name=sentinel` |
-| **Events not published, resources found but skipped** | Max age not exceeded | Normal behavior. Events publish when `generation > observed_generation` OR max age interval elapsed (`max_age_ready`: 30m, `max_age_not_ready`: 10s). |
+| **Events not published, resources found but skipped** | Decision result is false | Normal behavior. Events publish when the CEL decision expression evaluates to true (new resource, generation mismatch, ready and stale >30m, or not ready and debounced >10s by default). |
| **API connection errors, DNS lookup fails** | Wrong service name or namespace | Verify endpoint format: `http://..svc.cluster.local:8000`. Check API is running: `kubectl get pods -n hyperfleet-system -l app=hyperfleet-api` |
| **API returns 401 Unauthorized** | Missing authentication | Add auth headers to `hyperfleet_api` config if API requires authentication. |
| **API returns 404 Not Found** | Wrong API version in path | Verify endpoint uses correct API version: `/api/v1/clusters` or `/api/hyperfleet/v1/clusters` |
diff --git a/internal/client/client.go b/internal/client/client.go
index 95c8d97..80a868a 100644
--- a/internal/client/client.go
+++ b/internal/client/client.go
@@ -87,13 +87,10 @@ type Resource struct {
Generation int32 `json:"generation"`
}
-// ResourceStatus represents the status of a resource
+// ResourceStatus represents the status of a resource.
+// All status data is accessed through Conditions only.
type ResourceStatus struct {
- LastTransitionTime time.Time `json:"lastTransitionTime"` // Updates only when ready status changes
- LastUpdated time.Time `json:"lastUpdated"` // Updates every time an adapter checks the resource
- Conditions []Condition `json:"conditions,omitempty"`
- ObservedGeneration int32 `json:"observedGeneration"` // The generation last processed by the adapter
- Ready bool `json:"ready"` // True if Ready condition status is "True"
+ Conditions []Condition `json:"conditions,omitempty"`
}
// Condition represents a status condition
@@ -341,7 +338,7 @@ func (c *HyperFleetClient) fetchClusters(ctx context.Context, searchParam string
resource.Labels = *item.Labels
}
- // Convert conditions from OpenAPI model and extract Ready condition for ready status
+ // Convert conditions from OpenAPI model
if len(item.Status.Conditions) > 0 {
resource.Status.Conditions = make([]Condition, 0, len(item.Status.Conditions))
for _, cond := range item.Status.Conditions {
@@ -359,14 +356,6 @@ func (c *HyperFleetClient) fetchClusters(ctx context.Context, searchParam string
condition.Message = *cond.Message
}
resource.Status.Conditions = append(resource.Status.Conditions, condition)
-
- // Extract ready status and timing from the Ready condition
- if cond.Type == "Ready" {
- resource.Status.Ready = cond.Status == "True"
- resource.Status.LastTransitionTime = cond.LastTransitionTime
- resource.Status.LastUpdated = cond.LastUpdatedTime
- resource.Status.ObservedGeneration = cond.ObservedGeneration
- }
}
}
@@ -468,7 +457,7 @@ func (c *HyperFleetClient) fetchNodePools(ctx context.Context, searchParam strin
resource.OwnerReferences = ref
}
- // Convert conditions from OpenAPI model and extract Ready condition for ready status
+ // Convert conditions from OpenAPI model
if len(item.Status.Conditions) > 0 {
resource.Status.Conditions = make([]Condition, 0, len(item.Status.Conditions))
for _, cond := range item.Status.Conditions {
@@ -486,14 +475,6 @@ func (c *HyperFleetClient) fetchNodePools(ctx context.Context, searchParam strin
condition.Message = *cond.Message
}
resource.Status.Conditions = append(resource.Status.Conditions, condition)
-
- // Extract ready status and timing from the Ready condition
- if cond.Type == "Ready" {
- resource.Status.Ready = cond.Status == "True"
- resource.Status.LastTransitionTime = cond.LastTransitionTime
- resource.Status.LastUpdated = cond.LastUpdatedTime
- resource.Status.ObservedGeneration = cond.ObservedGeneration
- }
}
}
diff --git a/internal/client/client_test.go b/internal/client/client_test.go
index 32ed30c..c747532 100644
--- a/internal/client/client_test.go
+++ b/internal/client/client_test.go
@@ -106,8 +106,11 @@ func TestFetchResources_Success(t *testing.T) {
if resources[0].Generation != 5 {
t.Errorf("Expected generation 5, got %d", resources[0].Generation)
}
- if !resources[0].Status.Ready {
- t.Errorf("Expected Ready=true, got %t", resources[0].Status.Ready)
+ if len(resources[0].Status.Conditions) == 0 {
+ t.Fatal("Expected at least one condition")
+ }
+ if resources[0].Status.Conditions[0].Status != "True" {
+ t.Errorf("Expected Ready condition status 'True', got %s", resources[0].Status.Conditions[0].Status)
}
}
@@ -430,8 +433,11 @@ func TestFetchResources_NilStatus(t *testing.T) {
t.Errorf("Expected cluster-2 (valid status), got %s", resources[0].ID)
}
- if !resources[0].Status.Ready {
- t.Errorf("Expected Ready=true, got %t", resources[0].Status.Ready)
+ if len(resources[0].Status.Conditions) == 0 {
+ t.Fatal("Expected at least one condition")
+ }
+ if resources[0].Status.Conditions[0].Status != "True" {
+ t.Errorf("Expected Ready condition status 'True', got %s", resources[0].Status.Conditions[0].Status)
}
}
@@ -618,8 +624,11 @@ func TestFetchResources_NodePools(t *testing.T) {
if resources[0].Generation != 3 {
t.Errorf("Expected generation 3 for nodepool, got %d", resources[0].Generation)
}
- if resources[0].Status.ObservedGeneration != 3 {
- t.Errorf("Expected observed generation 3, got %d", resources[0].Status.ObservedGeneration)
+ if len(resources[0].Status.Conditions) == 0 {
+ t.Fatal("Expected at least one condition for nodepool")
+ }
+ if resources[0].Status.Conditions[0].ObservedGeneration != 3 {
+ t.Errorf("Expected observed generation 3, got %d", resources[0].Status.Conditions[0].ObservedGeneration)
}
if resources[0].OwnerReferences == nil {
t.Fatal("Expected OwnerReferences to be set, got nil")
diff --git a/internal/config/config.go b/internal/config/config.go
index d2390f0..616bfb5 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -24,17 +24,24 @@ type LabelSelector struct {
// LabelSelectorList is a list of label selectors
type LabelSelectorList []LabelSelector
+// MessageDecisionConfig represents configurable CEL-based decision logic.
+// Params are named CEL expressions evaluated in dependency order.
+// Result is a CEL expression that evaluates to a boolean.
+type MessageDecisionConfig struct {
+ Params map[string]string `mapstructure:"params"`
+ Result string `mapstructure:"result"`
+}
+
// SentinelConfig represents the Sentinel configuration
type SentinelConfig struct {
HyperFleetAPI *HyperFleetAPIConfig `mapstructure:"hyperfleet_api"`
MessageData map[string]interface{} `mapstructure:"message_data"`
+ MessageDecision *MessageDecisionConfig `mapstructure:"message_decision"`
ResourceType string `mapstructure:"resource_type"`
Topic string `mapstructure:"topic"`
MessagingSystem string `mapstructure:"messaging_system"`
ResourceSelector LabelSelectorList `mapstructure:"resource_selector"`
PollInterval time.Duration `mapstructure:"poll_interval"`
- MaxAgeNotReady time.Duration `mapstructure:"max_age_not_ready"`
- MaxAgeReady time.Duration `mapstructure:"max_age_ready"`
}
// HyperFleetAPIConfig defines the HyperFleet API client configuration
@@ -58,13 +65,28 @@ func (ls LabelSelectorList) ToMap() map[string]string {
return result
}
+// DefaultMessageDecision returns the default message_decision configuration
+// that replicates the previous hardcoded max_age_not_ready/max_age_ready behavior.
+func DefaultMessageDecision() *MessageDecisionConfig {
+ return &MessageDecisionConfig{
+ Params: map[string]string{
+ "ref_time": `condition("Ready").last_updated_time`,
+ "is_ready": `condition("Ready").status == "True"`,
+ "has_ref_time": `ref_time != ""`,
+ "is_new_resource": `!is_ready && resource.generation == 1`,
+ "generation_mismatch": `resource.generation > condition("Ready").observed_generation`,
+ "ready_and_stale": `is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")`,
+ "not_ready_and_debounced": `!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")`,
+ },
+ Result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced",
+ }
+}
+
// NewSentinelConfig creates a new configuration with defaults
func NewSentinelConfig() *SentinelConfig {
return &SentinelConfig{
// ResourceType is required and must be set in config file
PollInterval: 5 * time.Second,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
ResourceSelector: []LabelSelector{}, // Empty means watch all resources
HyperFleetAPI: &HyperFleetAPIConfig{
// Endpoint is required and must be set in config file
@@ -108,6 +130,11 @@ func LoadConfig(configFile string) (*SentinelConfig, error) {
}
}
+ // Apply default message_decision if not configured
+ if cfg.MessageDecision == nil {
+ cfg.MessageDecision = DefaultMessageDecision()
+ }
+
// Override topic from environment variable if explicitly provided
// Environment variable takes precedence over config file (including empty value to clear)
if topic, ok := os.LookupEnv("BROKER_TOPIC"); ok {
@@ -151,12 +178,12 @@ func (c *SentinelConfig) Validate() error {
return fmt.Errorf("poll_interval must be positive")
}
- if c.MaxAgeNotReady <= 0 {
- return fmt.Errorf("max_age_not_ready must be positive")
+ if c.MessageDecision == nil {
+ return fmt.Errorf("message_decision is required")
}
- if c.MaxAgeReady <= 0 {
- return fmt.Errorf("max_age_ready must be positive")
+ if err := c.MessageDecision.Validate(); err != nil {
+ return fmt.Errorf("message_decision: %w", err)
}
if c.MessageData == nil {
@@ -170,6 +197,137 @@ func (c *SentinelConfig) Validate() error {
return nil
}
+// Validate validates the message decision configuration.
+// It checks that a result expression exists, that every param has a
+// non-empty name and expression, and that params form no dependency cycle.
+func (md *MessageDecisionConfig) Validate() error {
+	// The result expression is the decision itself; without it there is
+	// nothing to evaluate.
+	if md.Result == "" {
+		return fmt.Errorf("result expression is required")
+	}
+
+	for name, expr := range md.Params {
+		// Reject empty names early: containsIdentifier treats "" as matching
+		// at every position, which would corrupt dependency resolution.
+		if name == "" {
+			return fmt.Errorf("param with empty name is not allowed")
+		}
+		if expr == "" {
+			return fmt.Errorf("param %q has empty expression", name)
+		}
+	}
+
+	// Check for circular dependencies
+	if _, err := md.TopologicalSort(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// TopologicalSort resolves param evaluation order based on inter-param dependencies.
+// Returns an ordered list of param names or an error if circular dependencies exist.
+func (md *MessageDecisionConfig) TopologicalSort() ([]string, error) {
+ // Build dependency graph: for each param, find which other params it references
+ deps := make(map[string][]string, len(md.Params))
+ paramNames := make(map[string]bool, len(md.Params))
+ for name := range md.Params {
+ paramNames[name] = true
+ deps[name] = nil
+ }
+
+ for name, expr := range md.Params {
+ for otherName := range md.Params {
+ if otherName != name && containsIdentifier(expr, otherName) {
+ deps[name] = append(deps[name], otherName)
+ }
+ }
+ }
+
+ // Kahn's algorithm for topological sort
+ inDegree := make(map[string]int, len(md.Params))
+ for name := range md.Params {
+ inDegree[name] = 0
+ }
+ for _, dependencies := range deps {
+ for _, dep := range dependencies {
+ inDegree[dep]++
+ }
+ }
+
+ // Wait — inDegree should count how many params depend ON each param,
+ // but we need the reverse: how many dependencies each param HAS.
+ // Actually for topological sort, we need: for each node, its in-degree
+ // is the number of edges pointing TO it. In our graph, an edge from A to B
+ // means "A depends on B" (B must be evaluated before A).
+ // So B's in-degree is the count of params that depend on B.
+ // Nodes with in-degree 0 have no dependents waiting — wrong.
+ // Actually, in topological sort, we want to process nodes with no DEPENDENCIES first.
+ // In our graph, edge A→B means "A depends on B", so A has out-degree to B.
+ // For Kahn's, we reverse: edge B→A means "B must come before A".
+ // In-degree of A = number of dependencies A has.
+
+ // Let me redo this correctly.
+ // deps[A] = [B, C] means A depends on B and C (B and C must be evaluated first)
+ // For Kahn's: in-degree of A = len(deps[A])
+ inDegree = make(map[string]int, len(md.Params))
+ for name := range md.Params {
+ inDegree[name] = len(deps[name])
+ }
+
+ var queue []string
+ for name, degree := range inDegree {
+ if degree == 0 {
+ queue = append(queue, name)
+ }
+ }
+
+ var sorted []string
+ for len(queue) > 0 {
+ node := queue[0]
+ queue = queue[1:]
+ sorted = append(sorted, node)
+
+ // For each param that depends on this node, decrease its in-degree
+ for other, dependencies := range deps {
+ for _, dep := range dependencies {
+ if dep == node {
+ inDegree[other]--
+ if inDegree[other] == 0 {
+ queue = append(queue, other)
+ }
+ }
+ }
+ }
+ }
+
+ if len(sorted) != len(md.Params) {
+ return nil, fmt.Errorf("circular dependency detected in params")
+ }
+
+ return sorted, nil
+}
+
+// containsIdentifier checks if an expression contains a reference to a param name.
+// Uses word boundary detection to avoid false positives (e.g., "is_ready" matching "is_ready_2").
+func containsIdentifier(expr, identifier string) bool {
+	for start := 0; ; {
+		rel := strings.Index(expr[start:], identifier)
+		if rel == -1 {
+			return false
+		}
+		begin := start + rel
+		end := begin + len(identifier)
+
+		// A hit only counts when it is not embedded inside a longer
+		// identifier on either side.
+		openLeft := begin == 0 || !isIdentChar(expr[begin-1])
+		openRight := end >= len(expr) || !isIdentChar(expr[end])
+		if openLeft && openRight {
+			return true
+		}
+
+		// Overlapping occurrences are possible; resume one byte past this hit.
+		start = begin + 1
+	}
+}
+
+// isIdentChar returns true if the byte is a valid identifier character (letter, digit, underscore)
+func isIdentChar(b byte) bool {
+	switch {
+	case b >= 'a' && b <= 'z':
+		return true
+	case b >= 'A' && b <= 'Z':
+		return true
+	case b >= '0' && b <= '9':
+		return true
+	default:
+		return b == '_'
+	}
+}
+
// validateMessageDataLeaves recursively checks that every leaf value in a
// message_data map is a non-empty string (CEL expression). nil values and
// empty strings are rejected early so that the error is reported at config
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 9f165ac..c39a564 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -25,6 +25,16 @@ func createTempConfigFile(t *testing.T, content string) string {
return configPath
}
+// newTestMessageDecision returns a valid message decision config for testing
+func newTestMessageDecision() *MessageDecisionConfig {
+ return &MessageDecisionConfig{
+ Params: map[string]string{
+ "is_ready": `condition("Ready").status == "True"`,
+ },
+ Result: "!is_ready",
+ }
+}
+
// ============================================================================
// Loading & Parsing Tests
// ============================================================================
@@ -46,12 +56,6 @@ func TestLoadConfig_ValidComplete(t *testing.T) {
if cfg.PollInterval != 5*time.Second {
t.Errorf("Expected poll_interval 5s, got %v", cfg.PollInterval)
}
- if cfg.MaxAgeNotReady != 10*time.Second {
- t.Errorf("Expected max_age_not_ready 10s, got %v", cfg.MaxAgeNotReady)
- }
- if cfg.MaxAgeReady != 30*time.Minute {
- t.Errorf("Expected max_age_ready 30m, got %v", cfg.MaxAgeReady)
- }
// Verify resource selector
if len(cfg.ResourceSelector) != 2 {
@@ -66,6 +70,17 @@ func TestLoadConfig_ValidComplete(t *testing.T) {
t.Errorf("Expected timeout 5s, got %v", cfg.HyperFleetAPI.Timeout)
}
+ // Verify message_decision
+ if cfg.MessageDecision == nil {
+ t.Fatal("Expected message_decision to be set")
+ }
+ if cfg.MessageDecision.Result == "" {
+ t.Error("Expected message_decision.result to be set")
+ }
+ if len(cfg.MessageDecision.Params) != 7 {
+ t.Errorf("Expected 7 message_decision params, got %d", len(cfg.MessageDecision.Params))
+ }
+
// Verify message data
if len(cfg.MessageData) != 3 {
t.Errorf("Expected 3 message_data fields, got %d", len(cfg.MessageData))
@@ -91,11 +106,16 @@ func TestLoadConfig_Minimal(t *testing.T) {
if cfg.PollInterval != 5*time.Second {
t.Errorf("Expected default poll_interval 5s, got %v", cfg.PollInterval)
}
- if cfg.MaxAgeNotReady != 10*time.Second {
- t.Errorf("Expected default max_age_not_ready 10s, got %v", cfg.MaxAgeNotReady)
+
+ // Verify default message_decision was applied
+ if cfg.MessageDecision == nil {
+ t.Fatal("Expected default message_decision to be applied")
+ }
+ if cfg.MessageDecision.Result == "" {
+ t.Error("Expected default message_decision.result to be set")
}
- if cfg.MaxAgeReady != 30*time.Minute {
- t.Errorf("Expected default max_age_ready 30m, got %v", cfg.MaxAgeReady)
+ if len(cfg.MessageDecision.Params) != 7 {
+ t.Errorf("Expected 7 default message_decision params, got %d", len(cfg.MessageDecision.Params))
}
}
@@ -144,14 +164,8 @@ func TestNewSentinelConfig_Defaults(t *testing.T) {
if cfg.PollInterval != 5*time.Second {
t.Errorf("Expected default poll_interval 5s, got %v", cfg.PollInterval)
}
- if cfg.MaxAgeNotReady != 10*time.Second {
- t.Errorf("Expected default max_age_not_ready 10s, got %v", cfg.MaxAgeNotReady)
- }
- if cfg.MaxAgeReady != 30*time.Minute {
- t.Errorf("Expected default max_age_ready 30m, got %v", cfg.MaxAgeReady)
- }
if cfg.HyperFleetAPI.Timeout != 5*time.Second {
- t.Errorf("Expected default timeout 30s, got %v", cfg.HyperFleetAPI.Timeout)
+ t.Errorf("Expected default timeout 5s, got %v", cfg.HyperFleetAPI.Timeout)
}
// Endpoint has no default - must be set in config file
if cfg.HyperFleetAPI.Endpoint != "" {
@@ -163,6 +177,10 @@ func TestNewSentinelConfig_Defaults(t *testing.T) {
if cfg.MessageData != nil {
t.Errorf("Expected nil message_data, got %v", cfg.MessageData)
}
+ // MessageDecision has no default in NewSentinelConfig - LoadConfig applies it
+ if cfg.MessageDecision != nil {
+ t.Errorf("Expected nil message_decision in NewSentinelConfig, got %v", cfg.MessageDecision)
+ }
}
// ============================================================================
@@ -185,7 +203,7 @@ func TestValidate_MissingResourceType(t *testing.T) {
func TestValidate_MissingEndpoint(t *testing.T) {
cfg := NewSentinelConfig()
- cfg.ResourceType = testResourceType // Set valid resource_type to test endpoint validation
+ cfg.ResourceType = testResourceType
cfg.HyperFleetAPI.Endpoint = ""
err := cfg.Validate()
@@ -233,6 +251,7 @@ func TestValidate_InvalidResourceTypes(t *testing.T) {
cfg.ResourceType = tt.resourceType
cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
cfg.MessageData = map[string]interface{}{"id": "resource.id"}
+ cfg.MessageDecision = newTestMessageDecision()
err := cfg.Validate()
if tt.shouldFail && err == nil {
@@ -262,24 +281,13 @@ func TestValidate_NegativeDurations(t *testing.T) {
c.PollInterval = 0
},
},
- {
- name: "negative max_age_not_ready",
- modifier: func(c *SentinelConfig) {
- c.MaxAgeNotReady = -10 * time.Second
- },
- },
- {
- name: "zero max_age_ready",
- modifier: func(c *SentinelConfig) {
- c.MaxAgeReady = 0
- },
- },
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
cfg := NewSentinelConfig()
cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageDecision = newTestMessageDecision()
tt.modifier(cfg)
err := cfg.Validate()
@@ -345,6 +353,7 @@ func TestValidate_ValidMessageDataFlat(t *testing.T) {
cfg := NewSentinelConfig()
cfg.ResourceType = testResourceType
cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageDecision = newTestMessageDecision()
cfg.MessageData = map[string]interface{}{
"id": "resource.id",
"kind": "resource.kind",
@@ -360,6 +369,7 @@ func TestValidate_ValidMessageDataNested(t *testing.T) {
cfg := NewSentinelConfig()
cfg.ResourceType = testResourceType
cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageDecision = newTestMessageDecision()
cfg.MessageData = map[string]interface{}{
"origin": `"sentinel"`,
"ref": map[string]interface{}{
@@ -377,6 +387,7 @@ func TestValidate_NilMessageData(t *testing.T) {
cfg := NewSentinelConfig()
cfg.ResourceType = testResourceType
cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageDecision = newTestMessageDecision()
// MessageData is nil by default — message_data is required so this must fail
if err := cfg.Validate(); err == nil {
@@ -385,10 +396,10 @@ func TestValidate_NilMessageData(t *testing.T) {
}
func TestValidate_NilLeafInMessageData(t *testing.T) {
- // Mirrors YAML: `id:` — viper may drop the key, but if it doesn't the nil leaf must be rejected.
cfg := NewSentinelConfig()
cfg.ResourceType = testResourceType
cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageDecision = newTestMessageDecision()
cfg.MessageData = map[string]interface{}{
"id": nil,
"kind": "resource.kind",
@@ -400,10 +411,10 @@ func TestValidate_NilLeafInMessageData(t *testing.T) {
}
func TestValidate_EmptyStringLeafInMessageData(t *testing.T) {
- // Mirrors YAML: `id: ""` — an explicitly-set empty CEL expression.
cfg := NewSentinelConfig()
cfg.ResourceType = testResourceType
cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageDecision = newTestMessageDecision()
cfg.MessageData = map[string]interface{}{
"id": "",
"kind": "resource.kind",
@@ -415,10 +426,10 @@ func TestValidate_EmptyStringLeafInMessageData(t *testing.T) {
}
func TestValidate_NilLeafInNestedMessageData(t *testing.T) {
- // Ensures the recursive check reaches nested objects.
cfg := NewSentinelConfig()
cfg.ResourceType = testResourceType
cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageDecision = newTestMessageDecision()
cfg.MessageData = map[string]interface{}{
"ref": map[string]interface{}{
"id": nil,
@@ -431,15 +442,202 @@ func TestValidate_NilLeafInNestedMessageData(t *testing.T) {
}
}
+// ============================================================================
+// Message Decision Validation Tests
+// ============================================================================
+
+func TestValidate_MissingMessageDecision(t *testing.T) {
+ cfg := NewSentinelConfig()
+ cfg.ResourceType = testResourceType
+ cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageData = map[string]interface{}{"id": "resource.id"}
+ cfg.MessageDecision = nil
+
+ if err := cfg.Validate(); err == nil {
+ t.Error("Expected error for nil message_decision, got nil")
+ }
+}
+
+func TestValidate_EmptyResultExpression(t *testing.T) {
+ cfg := NewSentinelConfig()
+ cfg.ResourceType = testResourceType
+ cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageData = map[string]interface{}{"id": "resource.id"}
+ cfg.MessageDecision = &MessageDecisionConfig{
+ Params: map[string]string{},
+ Result: "",
+ }
+
+ err := cfg.Validate()
+ if err == nil {
+ t.Error("Expected error for empty result expression, got nil")
+ }
+}
+
+func TestValidate_EmptyParamExpression(t *testing.T) {
+ cfg := NewSentinelConfig()
+ cfg.ResourceType = testResourceType
+ cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+ cfg.MessageData = map[string]interface{}{"id": "resource.id"}
+ cfg.MessageDecision = &MessageDecisionConfig{
+ Params: map[string]string{
+ "is_ready": "",
+ },
+ Result: "is_ready",
+ }
+
+ err := cfg.Validate()
+ if err == nil {
+ t.Error("Expected error for empty param expression, got nil")
+ }
+}
+
+func TestValidate_CircularDependency(t *testing.T) {
+	cfg := NewSentinelConfig()
+	cfg.ResourceType = testResourceType
+	cfg.HyperFleetAPI.Endpoint = testAPIEndpoint
+	cfg.MessageData = map[string]interface{}{"id": "resource.id"}
+	cfg.MessageDecision = &MessageDecisionConfig{
+		Params: map[string]string{
+			"a": "b",
+			"b": "a",
+		},
+		Result: "a",
+	}
+
+	err := cfg.Validate()
+	if err == nil {
+		// Fatal, not Error: err.Error() below would panic on a nil error.
+		t.Fatal("Expected error for circular dependency, got nil")
+	}
+	if !strings.Contains(err.Error(), "circular dependency") {
+		t.Errorf("Expected circular dependency error, got: %v", err)
+	}
+}
+
+// ============================================================================
+// Topological Sort Tests
+// ============================================================================
+
+func TestTopologicalSort_NoDependencies(t *testing.T) {
+ md := &MessageDecisionConfig{
+ Params: map[string]string{
+ "a": `condition("Ready").status`,
+ "b": `condition("Available").status`,
+ },
+ Result: "a == b",
+ }
+
+ order, err := md.TopologicalSort()
+ if err != nil {
+ t.Fatalf("Expected no error, got: %v", err)
+ }
+ if len(order) != 2 {
+ t.Fatalf("Expected 2 params, got %d", len(order))
+ }
+}
+
+func TestTopologicalSort_LinearDependency(t *testing.T) {
+ md := &MessageDecisionConfig{
+ Params: map[string]string{
+ "is_ready": `condition("Ready").status == "True"`,
+ "should_publish": `!is_ready`,
+ },
+ Result: "should_publish",
+ }
+
+ order, err := md.TopologicalSort()
+ if err != nil {
+ t.Fatalf("Expected no error, got: %v", err)
+ }
+ if len(order) != 2 {
+ t.Fatalf("Expected 2 params, got %d", len(order))
+ }
+
+ // is_ready must come before should_publish
+ isReadyIdx := -1
+ shouldPublishIdx := -1
+ for i, name := range order {
+ if name == "is_ready" {
+ isReadyIdx = i
+ }
+ if name == "should_publish" {
+ shouldPublishIdx = i
+ }
+ }
+ if isReadyIdx >= shouldPublishIdx {
+ t.Errorf("Expected is_ready (%d) before should_publish (%d)", isReadyIdx, shouldPublishIdx)
+ }
+}
+
+func TestTopologicalSort_CircularDependency(t *testing.T) {
+ md := &MessageDecisionConfig{
+ Params: map[string]string{
+ "a": "b && true",
+ "b": "c || false",
+ "c": "a",
+ },
+ Result: "a",
+ }
+
+ _, err := md.TopologicalSort()
+ if err == nil {
+ t.Error("Expected error for circular dependency, got nil")
+ }
+}
+
+func TestTopologicalSort_EmptyParams(t *testing.T) {
+ md := &MessageDecisionConfig{
+ Params: map[string]string{},
+ Result: "true",
+ }
+
+ order, err := md.TopologicalSort()
+ if err != nil {
+ t.Fatalf("Expected no error, got: %v", err)
+ }
+ if len(order) != 0 {
+ t.Errorf("Expected 0 params, got %d", len(order))
+ }
+}
+
+// ============================================================================
+// containsIdentifier Tests
+// ============================================================================
+
+func TestContainsIdentifier(t *testing.T) {
+ tests := []struct {
+ name string
+ expr string
+ identifier string
+ want bool
+ }{
+ {"exact match", "is_ready", "is_ready", true},
+ {"in expression", "!is_ready && x", "is_ready", true},
+ {"prefix match should fail", "is_ready_2", "is_ready", false},
+ {"suffix match should fail", "not_is_ready", "is_ready", false},
+ {"substring in middle", "foo_is_ready_bar", "is_ready", false},
+ {"not present", "something_else", "is_ready", false},
+ {"at start with operator", "is_ready || other", "is_ready", true},
+ {"at end with operator", "other && is_ready", "is_ready", true},
+ {"in parentheses", "(is_ready)", "is_ready", true},
+ {"with negation", "!is_ready", "is_ready", true},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := containsIdentifier(tt.expr, tt.identifier)
+ if got != tt.want {
+ t.Errorf("containsIdentifier(%q, %q) = %v, want %v", tt.expr, tt.identifier, got, tt.want)
+ }
+ })
+ }
+}
+
// ============================================================================
// Integration-like Test with Full Config
// ============================================================================
func TestLoadConfig_BlankMessageDataLeafReturnsError(t *testing.T) {
- // A blank leaf (e.g. `id:`) in message_data is decoded as nil by the YAML
- // parser. mapstructure then silently drops nil-valued keys during Unmarshal,
- // so the key disappears from cfg.MessageData before Validate() runs.
- // LoadConfig must catch this via the raw viper value.
_, err := LoadConfig(filepath.Join("testdata", "message-data-blank-id.yaml"))
if err == nil {
t.Fatal("expected error for blank message_data leaf, got nil")
@@ -471,6 +669,14 @@ func TestLoadConfig_FullWorkflow(t *testing.T) {
if len(md) != 4 {
t.Errorf("Expected 4 message_data fields, got %d", len(md))
}
+
+ // Verify message_decision is loaded from file
+ if cfg.MessageDecision == nil {
+ t.Fatal("Expected message_decision to be set")
+ }
+ if len(cfg.MessageDecision.Params) != 7 {
+ t.Errorf("Expected 7 message_decision params, got %d", len(cfg.MessageDecision.Params))
+ }
}
// ============================================================================
@@ -478,7 +684,6 @@ func TestLoadConfig_FullWorkflow(t *testing.T) {
// ============================================================================
func TestLoadConfig_TopicFromEnvVar(t *testing.T) {
- // Set environment variable (t.Setenv auto-cleans after test)
t.Setenv("BROKER_TOPIC", "test-namespace-clusters")
configPath := filepath.Join("testdata", "minimal.yaml")
@@ -494,10 +699,8 @@ func TestLoadConfig_TopicFromEnvVar(t *testing.T) {
}
func TestLoadConfig_TopicEnvVarOverridesConfig(t *testing.T) {
- // Set environment variable (t.Setenv auto-cleans after test)
t.Setenv("BROKER_TOPIC", "env-topic")
- // Create config with topic set
yaml := `
resource_type: clusters
hyperfleet_api:
@@ -513,14 +716,12 @@ message_data:
t.Fatalf("Expected no error, got: %v", err)
}
- // Environment variable should override config file
if cfg.Topic != "env-topic" {
t.Errorf("Expected topic 'env-topic' (from env), got '%s'", cfg.Topic)
}
}
func TestLoadConfig_TopicFromConfigFile(t *testing.T) {
- // Save and restore original value, then unset for test
origValue, wasSet := os.LookupEnv("BROKER_TOPIC")
if wasSet {
defer func() { _ = os.Setenv("BROKER_TOPIC", origValue) }()
@@ -548,7 +749,6 @@ message_data:
}
func TestLoadConfig_TopicEmpty(t *testing.T) {
- // Save and restore original value, then unset for test
origValue, wasSet := os.LookupEnv("BROKER_TOPIC")
if wasSet {
defer func() { _ = os.Setenv("BROKER_TOPIC", origValue) }()
@@ -562,18 +762,14 @@ func TestLoadConfig_TopicEmpty(t *testing.T) {
t.Fatalf("Expected no error, got: %v", err)
}
- // Topic should be empty when not configured
if cfg.Topic != "" {
t.Errorf("Expected empty topic, got '%s'", cfg.Topic)
}
}
func TestLoadConfig_TopicEnvVarEmptyClearsConfig(t *testing.T) {
- // Set environment variable to empty string (explicitly clears config value)
- // t.Setenv auto-cleans after test
t.Setenv("BROKER_TOPIC", "")
- // Create config with topic set
yaml := `
resource_type: clusters
hyperfleet_api:
@@ -589,7 +785,6 @@ message_data:
t.Fatalf("Expected no error, got: %v", err)
}
- // Empty env var should clear the config value (using os.LookupEnv)
if cfg.Topic != "" {
t.Errorf("Expected empty topic (cleared by env var), got '%s'", cfg.Topic)
}
diff --git a/internal/config/testdata/full-workflow.yaml b/internal/config/testdata/full-workflow.yaml
index ce8c07b..0002773 100644
--- a/internal/config/testdata/full-workflow.yaml
+++ b/internal/config/testdata/full-workflow.yaml
@@ -1,7 +1,5 @@
resource_type: nodepools
poll_interval: 3s
-max_age_not_ready: 5s
-max_age_ready: 15m
resource_selector:
- label: cluster-id
@@ -13,6 +11,17 @@ hyperfleet_api:
endpoint: http://hyperfleet-api.hyperfleet-system.svc.cluster.local:8080
timeout: 20s
+message_decision:
+ params:
+ ref_time: 'condition("Ready").last_updated_time'
+ is_ready: 'condition("Ready").status == "True"'
+ has_ref_time: 'ref_time != ""'
+ is_new_resource: '!is_ready && resource.generation == 1'
+ generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+ ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("15m")'
+ not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("5s")'
+ result: 'is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced'
+
message_data:
id: "resource.id"
kind: "resource.kind"
diff --git a/internal/config/testdata/message-data-blank-id.yaml b/internal/config/testdata/message-data-blank-id.yaml
index 7b9c69a..51b60dc 100644
--- a/internal/config/testdata/message-data-blank-id.yaml
+++ b/internal/config/testdata/message-data-blank-id.yaml
@@ -2,8 +2,16 @@
# it has a nil in the message_data.id property
resource_type: clusters
poll_interval: 2s
-max_age_not_ready: 5s
-max_age_ready: 2m
+message_decision:
+ params:
+ ref_time: 'condition("Ready").last_updated_time'
+ is_ready: 'condition("Ready").status == "True"'
+ has_ref_time: 'ref_time != ""'
+ is_new_resource: '!is_ready && resource.generation == 1'
+ generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+ ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("2m")'
+ not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("5s")'
+ result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
hyperfleet_api:
endpoint: http://localhost:8000
timeout: 10s
diff --git a/internal/config/testdata/valid-complete.yaml b/internal/config/testdata/valid-complete.yaml
index 62dee5f..a4b8c2b 100644
--- a/internal/config/testdata/valid-complete.yaml
+++ b/internal/config/testdata/valid-complete.yaml
@@ -1,7 +1,5 @@
resource_type: clusters
poll_interval: 5s
-max_age_not_ready: 10s
-max_age_ready: 30m
resource_selector:
- label: region
@@ -13,6 +11,17 @@ hyperfleet_api:
endpoint: https://api.hyperfleet.example.com
timeout: 5s
+message_decision:
+ params:
+ ref_time: 'condition("Ready").last_updated_time'
+ is_ready: 'condition("Ready").status == "True"'
+ has_ref_time: 'ref_time != ""'
+ is_new_resource: '!is_ready && resource.generation == 1'
+ generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+ ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+ not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+ result: 'is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced'
+
message_data:
id: "resource.id"
kind: "resource.kind"
diff --git a/internal/engine/decision.go b/internal/engine/decision.go
index e989fd1..2ef4431 100644
--- a/internal/engine/decision.go
+++ b/internal/engine/decision.go
@@ -2,125 +2,263 @@ package engine
import (
"fmt"
+ "sync"
"time"
+ "github.com/google/cel-go/cel"
+ "github.com/google/cel-go/common/types"
+ "github.com/google/cel-go/common/types/ref"
"github.com/openshift-hyperfleet/hyperfleet-sentinel/internal/client"
+ "github.com/openshift-hyperfleet/hyperfleet-sentinel/internal/config"
)
-// Decision reasons
-const (
- ReasonMaxAgeExceeded = "max age exceeded"
- ReasonGenerationChanged = "generation changed"
- ReasonNeverProcessed = "never processed"
- ReasonNilResource = "resource is nil"
- ReasonZeroNow = "now time is zero"
-)
+// Decision represents the result of evaluating a resource
+type Decision struct {
+ Reason string // Human-readable explanation for the decision
+ ShouldPublish bool // Indicates whether an event should be published for the resource
+}
// DecisionEngine evaluates whether a resource needs an event published
+// using configurable CEL expressions.
+//
+// NOTE(review): conditionsLookup is replaced at the start of every Evaluate
+// call and read by the condition() CEL binding under mu. The mutex guards the
+// field swap itself, but it looks like two concurrent Evaluate calls could
+// interleave so that one evaluation reads the other resource's conditions —
+// confirm Evaluate is only called from a single goroutine, or snapshot the
+// lookup per evaluation instead of storing it on the engine.
type DecisionEngine struct {
-	maxAgeNotReady time.Duration
-	maxAgeReady    time.Duration
+	resultProg       cel.Program                       // compiled result expression
+	paramProgs       map[string]cel.Program            // compiled param expressions, keyed by param name
+	conditionsLookup map[string]map[string]interface{} // condition name -> fields; set per evaluation
+	paramOrder       []string                          // topologically sorted param evaluation order
+	mu               sync.Mutex                        // guards conditionsLookup
}
-// NewDecisionEngine creates a new decision engine
-func NewDecisionEngine(maxAgeNotReady, maxAgeReady time.Duration) *DecisionEngine {
- return &DecisionEngine{
- maxAgeNotReady: maxAgeNotReady,
- maxAgeReady: maxAgeReady,
+// NewDecisionEngine creates a new CEL-based decision engine from a MessageDecisionConfig.
+// All CEL expressions are compiled at creation time for fail-fast validation.
+func NewDecisionEngine(cfg *config.MessageDecisionConfig) (*DecisionEngine, error) {
+ if cfg == nil {
+ return nil, fmt.Errorf("message_decision config is required")
}
-}
-// Decision represents the result of evaluating a resource
-type Decision struct {
- Reason string // Human-readable explanation for the decision
- ShouldPublish bool // Indicates whether an event should be published for the resource
+ // Resolve param evaluation order
+ paramOrder, err := cfg.TopologicalSort()
+ if err != nil {
+ return nil, fmt.Errorf("failed to resolve param dependencies: %w", err)
+ }
+
+ de := &DecisionEngine{
+ paramOrder: paramOrder,
+ }
+
+ // Build CEL environment with all variables and the condition() function.
+ // The function declaration includes the implementation via FunctionBinding,
+ // which reads from the engine's conditionsLookup (updated per-evaluation).
+ envOpts := []cel.EnvOption{
+ cel.Variable("resource", cel.DynType),
+ cel.Variable("now", cel.TimestampType),
+ cel.Function("condition",
+ cel.Overload("condition_string_to_dyn",
+ []*cel.Type{cel.StringType},
+ cel.DynType,
+ cel.UnaryBinding(func(val ref.Val) ref.Val {
+ name, ok := val.Value().(string)
+ if !ok {
+ return types.DefaultTypeAdapter.NativeToValue(zeroCondition())
+ }
+ de.mu.Lock()
+ lookup := de.conditionsLookup
+ de.mu.Unlock()
+ if cond, exists := lookup[name]; exists {
+ return types.DefaultTypeAdapter.NativeToValue(cond)
+ }
+ return types.DefaultTypeAdapter.NativeToValue(zeroCondition())
+ }),
+ ),
+ ),
+ }
+
+ // Declare all param names as DynType variables for inter-param references
+ for name := range cfg.Params {
+ envOpts = append(envOpts, cel.Variable(name, cel.DynType))
+ }
+
+ env, err := cel.NewEnv(envOpts...)
+ if err != nil {
+ return nil, fmt.Errorf("failed to create CEL environment: %w", err)
+ }
+
+ // Compile all param expressions
+ paramProgs := make(map[string]cel.Program, len(cfg.Params))
+ for name, expr := range cfg.Params {
+ ast, issues := env.Compile(expr)
+ if issues != nil && issues.Err() != nil {
+ return nil, fmt.Errorf("failed to compile param %q expression %q: %w", name, expr, issues.Err())
+ }
+ prg, prgErr := env.Program(ast)
+ if prgErr != nil {
+ return nil, fmt.Errorf("failed to create program for param %q: %w", name, prgErr)
+ }
+ paramProgs[name] = prg
+ }
+
+ // Compile result expression
+ resultAST, issues := env.Compile(cfg.Result)
+ if issues != nil && issues.Err() != nil {
+ return nil, fmt.Errorf("failed to compile result expression %q: %w", cfg.Result, issues.Err())
+ }
+ resultPrg, err := env.Program(resultAST)
+ if err != nil {
+ return nil, fmt.Errorf("failed to create result program: %w", err)
+ }
+
+ de.paramProgs = paramProgs
+ de.resultProg = resultPrg
+
+ return de, nil
}
// Evaluate determines if an event should be published for the resource.
-//
-// Decision Logic (in priority order):
-// 1. Never-processed reconciliation: If status.LastUpdated is zero, publish immediately
-// (resource has never been processed by any adapter)
-// 2. Generation-based reconciliation: If resource.Generation > status.ObservedGeneration,
-// publish immediately (spec has changed, adapter needs to reconcile)
-// 3. Time-based reconciliation: If max age exceeded since last update, publish
-// - Uses status.LastUpdated as reference timestamp
-//
-// Max Age Intervals:
-// - Resources with Ready=true: maxAgeReady (default 30m)
-// - Resources with Ready=false: maxAgeNotReady (default 10s)
-//
-// Adapter Contract:
-// - Adapters MUST update status.LastUpdated on EVERY evaluation
-// - Adapters MUST update status.ObservedGeneration to resource.Generation when processing
-// - This prevents infinite event loops when adapters skip work due to unmet preconditions
-//
-// Returns a Decision indicating whether to publish and why. Returns ShouldPublish=false
-// for invalid inputs (nil resource, zero now time).
+// Returns a Decision indicating whether to publish and why.
func (e *DecisionEngine) Evaluate(resource *client.Resource, now time.Time) Decision {
- // Validate inputs
if resource == nil {
+ return Decision{ShouldPublish: false, Reason: "resource is nil"}
+ }
+ if now.IsZero() {
+ return Decision{ShouldPublish: false, Reason: "now time is zero"}
+ }
+
+ // Build resource map for CEL evaluation
+ resourceMap := resourceToMap(resource)
+
+	// Update the conditions lookup for the condition() function binding. NOTE(review): this writes shared engine state per call, so concurrent Evaluate calls would race/stomp each other's lookup — confirm single-goroutine use or make the lookup per-evaluation.
+ e.mu.Lock()
+ e.conditionsLookup = buildConditionsLookup(resource.Status.Conditions)
+ e.mu.Unlock()
+
+ // Build base activation with resource and now
+ activation := map[string]interface{}{
+ "resource": resourceMap,
+ "now": now,
+ }
+
+ // Evaluate params in topological order
+ paramValues := make(map[string]interface{}, len(e.paramOrder))
+ for _, name := range e.paramOrder {
+ prg := e.paramProgs[name]
+
+ // Merge param values into activation for inter-param references
+ evalActivation := make(map[string]interface{}, len(activation)+len(paramValues))
+ for k, v := range activation {
+ evalActivation[k] = v
+ }
+ for k, v := range paramValues {
+ evalActivation[k] = v
+ }
+
+ out, _, err := prg.Eval(evalActivation)
+ if err != nil {
+ return Decision{
+ ShouldPublish: false,
+ Reason: fmt.Sprintf("param %q evaluation failed: %v", name, err),
+ }
+ }
+ paramValues[name] = out.Value()
+ }
+
+ // Evaluate result expression
+ resultActivation := make(map[string]interface{}, len(activation)+len(paramValues))
+ for k, v := range activation {
+ resultActivation[k] = v
+ }
+ for k, v := range paramValues {
+ resultActivation[k] = v
+ }
+
+ out, _, err := e.resultProg.Eval(resultActivation)
+ if err != nil {
return Decision{
ShouldPublish: false,
- Reason: ReasonNilResource,
+ Reason: fmt.Sprintf("result evaluation failed: %v", err),
}
}
- if now.IsZero() {
+ shouldPublish, ok := out.Value().(bool)
+ if !ok {
return Decision{
ShouldPublish: false,
- Reason: ReasonZeroNow,
+ Reason: fmt.Sprintf("result expression did not return bool, got %T", out.Value()),
}
}
- // Check if resource has never been processed by an adapter
- // LastUpdated is zero means no adapter has updated the status yet
- // This ensures first-time resources are published immediately
- if resource.Status.LastUpdated.IsZero() {
+ if shouldPublish {
return Decision{
ShouldPublish: true,
- Reason: ReasonNeverProcessed,
+ Reason: "message decision matched",
}
}
- // Check for generation mismatch
- // This triggers immediate reconciliation regardless of max age
- if resource.Generation > resource.Status.ObservedGeneration {
- return Decision{
- ShouldPublish: true,
- Reason: ReasonGenerationChanged,
+ return Decision{
+ ShouldPublish: false,
+ Reason: "message decision result is false",
+ }
+}
+
+// buildConditionsLookup creates a map from condition type name to condition data
+// for use by the condition() CEL function.
+func buildConditionsLookup(conditions []client.Condition) map[string]map[string]interface{} {
+ lookup := make(map[string]map[string]interface{}, len(conditions))
+ for _, c := range conditions {
+ lookup[c.Type] = map[string]interface{}{
+ "status": c.Status,
+ "observed_generation": int64(c.ObservedGeneration),
+ "last_updated_time": c.LastUpdatedTime.Format(time.RFC3339Nano),
+ "last_transition_time": c.LastTransitionTime.Format(time.RFC3339Nano),
+ "reason": c.Reason,
+ "message": c.Message,
}
}
+ return lookup
+}
- // Determine the reference timestamp for max age calculation
- // At this point, we know LastUpdated is not zero (checked above)
- // so we can use it directly for the max age calculation
- referenceTime := resource.Status.LastUpdated
+// zeroCondition returns a zero-value condition map for safe field access
+// when a condition is not found. Time fields are empty strings so that
+// CEL expressions can guard against missing conditions with `ref_time != ""`.
+func zeroCondition() map[string]interface{} {
+ return map[string]interface{}{
+ "status": "",
+ "observed_generation": int64(0),
+ "last_updated_time": "",
+ "last_transition_time": "",
+ "reason": "",
+ "message": "",
+ }
+}
- // Determine the appropriate max age based on resource ready status
- var maxAge time.Duration
- if resource.Status.Ready {
- maxAge = e.maxAgeReady
- } else {
- maxAge = e.maxAgeNotReady
+// resourceToMap converts a Resource into a plain map for CEL evaluation.
+func resourceToMap(r *client.Resource) map[string]interface{} {
+ m := map[string]interface{}{
+ "id": r.ID,
+ "href": r.Href,
+ "kind": r.Kind,
+ "created_time": r.CreatedTime.Format(time.RFC3339Nano),
+ "updated_time": r.UpdatedTime.Format(time.RFC3339Nano),
+ "generation": int64(r.Generation),
}
- // Calculate the next event time based on reference timestamp
- // Adapters update LastUpdated on every check, enabling proper max age
- // calculation even when resources stay in the same status
- nextEventTime := referenceTime.Add(maxAge)
+ if len(r.Labels) > 0 {
+ labels := make(map[string]interface{}, len(r.Labels))
+ for k, v := range r.Labels {
+ labels[k] = v
+ }
+ m["labels"] = labels
+ }
- // Check if enough time has passed
- if now.Before(nextEventTime) {
- timeUntilNext := nextEventTime.Sub(now)
- return Decision{
- ShouldPublish: false,
- Reason: fmt.Sprintf("max age not exceeded (waiting %s)", timeUntilNext),
+ if r.OwnerReferences != nil {
+ m["owner_references"] = map[string]interface{}{
+ "id": r.OwnerReferences.ID,
+ "href": r.OwnerReferences.Href,
+ "kind": r.OwnerReferences.Kind,
}
}
- return Decision{
- ShouldPublish: true,
- Reason: ReasonMaxAgeExceeded,
+ if r.Metadata != nil {
+ m["metadata"] = r.Metadata
}
+
+ return m
}
diff --git a/internal/engine/decision_test.go b/internal/engine/decision_test.go
index 908fbd0..298d942 100644
--- a/internal/engine/decision_test.go
+++ b/internal/engine/decision_test.go
@@ -1,361 +1,392 @@
package engine
import (
- "strings"
"testing"
"time"
"github.com/openshift-hyperfleet/hyperfleet-sentinel/internal/client"
+ "github.com/openshift-hyperfleet/hyperfleet-sentinel/internal/config"
)
-// Test helpers and factories
+// Test helpers
-// newTestResource creates a test resource with the given parameters
-// This follows TRex pattern of using test factories for consistent test data
-func newTestResource(ready bool, lastUpdated time.Time) *client.Resource {
- return &client.Resource{
- ID: testResourceID,
- Kind: testResourceKind,
- Generation: 1, // Default generation
- CreatedTime: time.Now().Add(-1 * time.Hour), // Default: created 1 hour ago
- Status: client.ResourceStatus{
- Ready: ready,
- LastUpdated: lastUpdated,
- ObservedGeneration: 1, // Default: in sync with generation
- },
+const (
+ testResourceID = "test-cluster-1"
+ testResourceKind = "Cluster"
+)
+
+// newDefaultDecisionConfig returns the default message decision config
+// matching the sentinel architecture: ref_time, is_ready, is_new_resource,
+// ready_and_stale, not_ready_and_debounced.
+func newDefaultDecisionConfig() *config.MessageDecisionConfig {
+ return config.DefaultMessageDecision()
+}
+
+// newTestDecisionEngine creates a decision engine with the default config.
+func newTestDecisionEngine(t *testing.T) *DecisionEngine {
+ t.Helper()
+ cfg := newDefaultDecisionConfig()
+ engine, err := NewDecisionEngine(cfg)
+ if err != nil {
+ t.Fatalf("NewDecisionEngine failed: %v", err)
}
+ return engine
}
-// newTestResourceWithCreatedTime creates a test resource with explicit created_time
-func newTestResourceWithCreatedTime(id, kind string, ready bool, createdTime, lastUpdated time.Time) *client.Resource {
+// newResourceWithCondition creates a test resource with a single "Ready" condition.
+func newResourceWithCondition(status string, lastUpdated time.Time, generation int32) *client.Resource {
return &client.Resource{
- ID: id,
- Kind: kind,
- Generation: 1, // Default generation
- CreatedTime: createdTime,
+ ID: testResourceID,
+ Kind: testResourceKind,
+ Generation: generation,
+ CreatedTime: time.Now().Add(-1 * time.Hour),
Status: client.ResourceStatus{
- Ready: ready,
- LastUpdated: lastUpdated,
- ObservedGeneration: 1, // Default: in sync with generation
+ Conditions: []client.Condition{
+ {
+ Type: "Ready",
+ Status: status,
+ LastUpdatedTime: lastUpdated,
+ ObservedGeneration: generation,
+ },
+ },
},
}
}
-// newTestResourceWithGeneration creates a test resource with explicit
-// generation values
-func newTestResourceWithGeneration(
- id, kind string,
- ready bool,
- lastUpdated time.Time,
- generation, observedGeneration int32,
+// newResourceWithGenerationMismatch creates a test resource where generation > observed_generation.
+func newResourceWithGenerationMismatch(
+ status string, lastUpdated time.Time, generation, observedGeneration int32,
) *client.Resource {
return &client.Resource{
- ID: id,
- Kind: kind,
+ ID: testResourceID,
+ Kind: testResourceKind,
Generation: generation,
- CreatedTime: time.Now().Add(-1 * time.Hour), // Default: created 1 hour ago
+ CreatedTime: time.Now().Add(-1 * time.Hour),
Status: client.ResourceStatus{
- Ready: ready,
- LastUpdated: lastUpdated,
- ObservedGeneration: observedGeneration,
+ Conditions: []client.Condition{
+ {
+ Type: "Ready",
+ Status: status,
+ LastUpdatedTime: lastUpdated,
+ ObservedGeneration: observedGeneration,
+ },
+ },
},
}
}
-// newTestEngine creates a decision engine with standard test values
-func newTestEngine() *DecisionEngine {
- return NewDecisionEngine(testMaxAgeNotReady, testMaxAgeReady)
-}
-
-// assertDecision verifies a decision matches expected values
-func assertDecision(t *testing.T, got Decision, wantPublish bool, wantReasonContains string) {
- t.Helper()
-
- if got.ShouldPublish != wantPublish {
- t.Errorf("ShouldPublish = %v, want %v", got.ShouldPublish, wantPublish)
- }
-
- if wantReasonContains != "" && !strings.Contains(got.Reason, wantReasonContains) {
- t.Errorf("Reason = %q, want it to contain %q", got.Reason, wantReasonContains)
- }
-
- if got.Reason == "" {
- t.Error("Reason should never be empty")
+// newResourceNoConditions creates a test resource with no conditions.
+func newResourceNoConditions(generation int32) *client.Resource {
+ return &client.Resource{
+ ID: testResourceID,
+ Kind: testResourceKind,
+ Generation: generation,
+ CreatedTime: time.Now().Add(-1 * time.Hour),
+ Status: client.ResourceStatus{},
}
}
-// Test constants
-const (
- testMaxAgeNotReady = 10 * time.Second
- testMaxAgeReady = 30 * time.Minute
- testResourceID = "test-cluster-1"
- testResourceKind = "Cluster"
-)
-
func TestNewDecisionEngine(t *testing.T) {
- engine := newTestEngine()
-
- if engine == nil {
- t.Fatal("NewDecisionEngine returned nil")
- }
-
- if engine.maxAgeNotReady != testMaxAgeNotReady {
- t.Errorf("maxAgeNotReady = %v, want %v", engine.maxAgeNotReady, testMaxAgeNotReady)
- }
-
- if engine.maxAgeReady != testMaxAgeReady {
- t.Errorf("maxAgeReady = %v, want %v", engine.maxAgeReady, testMaxAgeReady)
- }
+ t.Run("valid config", func(t *testing.T) {
+ engine, err := NewDecisionEngine(newDefaultDecisionConfig())
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if engine == nil {
+ t.Fatal("engine is nil")
+ }
+ })
+
+ t.Run("nil config", func(t *testing.T) {
+ _, err := NewDecisionEngine(nil)
+ if err == nil {
+ t.Fatal("expected error for nil config")
+ }
+ })
+
+ t.Run("invalid CEL expression", func(t *testing.T) {
+ cfg := &config.MessageDecisionConfig{
+ Params: map[string]string{
+ "bad": "this is not valid CEL !!!",
+ },
+ Result: "bad",
+ }
+ _, err := NewDecisionEngine(cfg)
+ if err == nil {
+ t.Fatal("expected error for invalid CEL expression")
+ }
+ })
+
+ t.Run("invalid result expression", func(t *testing.T) {
+ cfg := &config.MessageDecisionConfig{
+ Params: map[string]string{},
+ Result: "not valid !!!",
+ }
+ _, err := NewDecisionEngine(cfg)
+ if err == nil {
+ t.Fatal("expected error for invalid result expression")
+ }
+ })
+
+ t.Run("simple boolean result", func(t *testing.T) {
+ cfg := &config.MessageDecisionConfig{
+ Params: map[string]string{},
+ Result: "true",
+ }
+ engine, err := NewDecisionEngine(cfg)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if engine == nil {
+ t.Fatal("engine is nil")
+ }
+ })
}
func TestDecisionEngine_Evaluate(t *testing.T) {
now := time.Now()
- engine := newTestEngine()
+ engine := newTestDecisionEngine(t)
tests := []struct {
- lastUpdated time.Time
- now time.Time
- name string
- wantReasonContains string
- description string
- ready bool
- wantShouldPublish bool
+ resource *client.Resource
+ now time.Time
+ name string
+ wantReason string
+ wantShouldPublish bool
}{
- // Zero LastUpdated tests - should publish immediately as never processed
- {
- name: "zero LastUpdated - ready",
- ready: true,
- lastUpdated: time.Time{}, // Zero time - never processed
- now: now,
- wantShouldPublish: true,
- wantReasonContains: ReasonNeverProcessed,
- description: "Resources with zero LastUpdated should publish immediately (never processed)",
- },
{
- name: "zero LastUpdated - not ready",
- ready: false,
- lastUpdated: time.Time{}, // Zero time - never processed
- now: now,
- wantShouldPublish: true,
- wantReasonContains: ReasonNeverProcessed,
- description: "Resources with zero LastUpdated should publish immediately (never processed)",
- },
-
- // Not-ready resources (10s max age)
- {
- name: "not ready - max age exceeded",
- ready: false,
- lastUpdated: now.Add(-11 * time.Second), // 11s ago (> 10s max age)
- now: now,
- wantShouldPublish: true,
- wantReasonContains: "max age exceeded",
- description: "Not-ready resources with exceeded max age should publish",
+ name: "ready and stale - should publish",
+ resource: newResourceWithCondition("True", now.Add(-31*time.Minute), 2),
+ now: now,
+ wantShouldPublish: true,
+ wantReason: "message decision matched",
},
{
- name: "not ready - max age not exceeded",
- ready: false,
- lastUpdated: now.Add(-5 * time.Second), // 5s ago (< 10s max age)
- now: now,
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "Not-ready resources within max age should not publish",
+ name: "ready and recent - should not publish",
+ resource: newResourceWithCondition("True", now.Add(-5*time.Minute), 2),
+ now: now,
+ wantShouldPublish: false,
+ wantReason: "message decision result is false",
},
{
- name: "not ready - max age exactly exceeded",
- ready: false,
- lastUpdated: now.Add(-10 * time.Second), // Exactly 10s ago
- now: now,
- wantShouldPublish: true,
- wantReasonContains: "max age exceeded",
- description: "Not-ready resources with exactly exceeded max age should publish",
+ name: "not ready and debounced - should publish",
+ resource: newResourceWithCondition("False", now.Add(-11*time.Second), 2),
+ now: now,
+ wantShouldPublish: true,
+ wantReason: "message decision matched",
},
-
- // Ready resources (30m max age)
{
- name: "ready - max age exceeded",
- ready: true,
- lastUpdated: now.Add(-31 * time.Minute), // 31m ago (> 30m max age)
- now: now,
- wantShouldPublish: true,
- wantReasonContains: "max age exceeded",
- description: "Ready resources with exceeded max age should publish",
+ name: "not ready and too recent - should not publish",
+ resource: newResourceWithCondition("False", now.Add(-3*time.Second), 2),
+ now: now,
+ wantShouldPublish: false,
+ wantReason: "message decision result is false",
},
{
- name: "ready - max age not exceeded",
- ready: true,
- lastUpdated: now.Add(-15 * time.Minute), // 15m ago (< 30m max age)
- now: now,
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "Ready resources within max age should not publish",
+ name: "new resource (generation 1, not ready) - should publish",
+ resource: newResourceWithCondition("False", now, 1),
+ now: now,
+ wantShouldPublish: true,
+ wantReason: "message decision matched",
},
{
- name: "ready - max age exactly exceeded",
- ready: true,
- lastUpdated: now.Add(-30 * time.Minute), // Exactly 30m ago
- now: now,
- wantShouldPublish: true,
- wantReasonContains: "max age exceeded",
- description: "Ready resources with exactly exceeded max age should publish",
+ name: "generation mismatch (ready, recent) - should publish immediately",
+ resource: newResourceWithGenerationMismatch("True", now.Add(-1*time.Minute), 3, 2),
+ now: now,
+ wantShouldPublish: true,
+ wantReason: "message decision matched",
},
-
- // Edge cases
{
- name: "LastUpdated in future - ready",
- ready: true,
- lastUpdated: now.Add(1 * time.Hour), // 1 hour in the future
- now: now,
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "Resources with LastUpdated in future should not publish (clock skew protection)",
+ name: "generation mismatch (not ready, recent) - should publish immediately",
+ resource: newResourceWithGenerationMismatch("False", now.Add(-1*time.Second), 5, 4),
+ now: now,
+ wantShouldPublish: true,
+ wantReason: "message decision matched",
},
{
- name: "LastUpdated in future - not ready",
- ready: false,
- lastUpdated: now.Add(1 * time.Minute), // 1 minute in the future
- now: now,
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "Not-ready resources with LastUpdated in future should not publish",
+ name: "no generation mismatch (ready, recent) - should not publish",
+ resource: newResourceWithGenerationMismatch("True", now.Add(-1*time.Minute), 2, 2),
+ now: now,
+ wantShouldPublish: false,
+ wantReason: "message decision result is false",
},
{
- name: "LastUpdated very old - ready",
- ready: true,
- lastUpdated: now.Add(-24 * time.Hour), // 24 hours ago
- now: now,
- wantShouldPublish: true,
- wantReasonContains: "max age exceeded",
- description: "Very old resources should publish (max age long exceeded)",
+ name: "nil resource - should not publish",
+ resource: nil,
+ now: now,
+ wantShouldPublish: false,
+ wantReason: "resource is nil",
},
{
- name: "LastUpdated very recent - not ready",
- ready: false,
- lastUpdated: now.Add(-1 * time.Millisecond), // Just 1ms ago
- now: now,
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "Very recent updates should not publish immediately",
+ name: "zero now time - should not publish",
+ resource: newResourceWithCondition("True", now, 2),
+ now: time.Time{},
+ wantShouldPublish: false,
+ wantReason: "now time is zero",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- resource := newTestResource(tt.ready, tt.lastUpdated)
- decision := engine.Evaluate(resource, tt.now)
-
- assertDecision(t, decision, tt.wantShouldPublish, tt.wantReasonContains)
+ decision := engine.Evaluate(tt.resource, tt.now)
- // Additional context on failure
- if t.Failed() {
- t.Logf("Test description: %s", tt.description)
+ if decision.ShouldPublish != tt.wantShouldPublish {
+ t.Errorf("ShouldPublish = %v, want %v", decision.ShouldPublish, tt.wantShouldPublish)
+ }
+ if decision.Reason != tt.wantReason {
+ t.Errorf("Reason = %q, want %q", decision.Reason, tt.wantReason)
}
})
}
}
-// TestDecisionEngine_Evaluate_ZeroMaxAge tests edge case with zero max age intervals
-func TestDecisionEngine_Evaluate_ZeroMaxAge(t *testing.T) {
+func TestDecisionEngine_Evaluate_MissingCondition(t *testing.T) {
now := time.Now()
-
- tests := []struct {
- lastUpdated time.Time
- name string
- maxAgeNotReady time.Duration
- maxAgeReady time.Duration
- ready bool
- wantShouldPublish bool
- }{
- {
- name: "zero maxAgeNotReady - not ready",
- maxAgeNotReady: 0,
- maxAgeReady: 30 * time.Minute,
- ready: false,
- lastUpdated: now, // Even with now, should publish due to zero max age
- wantShouldPublish: true,
- },
- {
- name: "zero maxAgeReady - ready",
- maxAgeNotReady: 10 * time.Second,
- maxAgeReady: 0,
- ready: true,
- lastUpdated: now, // Even with now, should publish due to zero max age
- wantShouldPublish: true,
- },
- {
- name: "both zero max ages - ready",
- maxAgeNotReady: 0,
- maxAgeReady: 0,
- ready: true,
- lastUpdated: now,
- wantShouldPublish: true,
- },
- {
- name: "both zero max ages - not ready",
- maxAgeNotReady: 0,
- maxAgeReady: 0,
- ready: false,
- lastUpdated: now,
- wantShouldPublish: true,
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- engine := NewDecisionEngine(tt.maxAgeNotReady, tt.maxAgeReady)
- resource := newTestResource(tt.ready, tt.lastUpdated)
- decision := engine.Evaluate(resource, now)
-
- assertDecision(t, decision, tt.wantShouldPublish, "")
- })
- }
+ engine := newTestDecisionEngine(t)
+
+	// Resource with no conditions at all - condition("Ready") returns zero-value.
+	// Zero-value: status="" (not "True"), last_updated_time="" (empty string, per zeroCondition).
+	// is_ready = false, is_new_resource depends on generation.
+ t.Run("no conditions generation 1 - new resource publishes", func(t *testing.T) {
+ resource := newResourceNoConditions(1)
+ decision := engine.Evaluate(resource, now)
+
+ if !decision.ShouldPublish {
+ t.Errorf("expected ShouldPublish=true for new resource with no conditions, got false")
+ }
+ })
+
+ t.Run("no conditions generation 2 - generation mismatch publishes", func(t *testing.T) {
+ resource := newResourceNoConditions(2)
+ decision := engine.Evaluate(resource, now)
+
+ // No conditions → observed_generation=0, generation=2 → generation_mismatch triggers
+ if !decision.ShouldPublish {
+ t.Errorf("expected ShouldPublish=true for resource with generation mismatch (gen=2, observed=0), got false")
+ }
+ })
}
-// TestDecisionEngine_Evaluate_NegativeMaxAge tests edge case with negative max age intervals
-func TestDecisionEngine_Evaluate_NegativeMaxAge(t *testing.T) {
+func TestDecisionEngine_Evaluate_CustomExpressions(t *testing.T) {
now := time.Now()
- lastUpdated := now.Add(-5 * time.Second)
- tests := []struct {
- name string
- maxAgeNotReady time.Duration
- maxAgeReady time.Duration
- ready bool
- wantShouldPublish bool
- }{
- {
- name: "negative maxAgeNotReady",
- maxAgeNotReady: -10 * time.Second,
- maxAgeReady: 30 * time.Minute,
- ready: false,
- wantShouldPublish: true, // Negative max age means nextEventTime is in the past
- },
- {
- name: "negative maxAgeReady",
- maxAgeNotReady: 10 * time.Second,
- maxAgeReady: -10 * time.Minute,
- ready: true,
- wantShouldPublish: true,
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- engine := NewDecisionEngine(tt.maxAgeNotReady, tt.maxAgeReady)
- resource := newTestResource(tt.ready, lastUpdated)
- decision := engine.Evaluate(resource, now)
-
- assertDecision(t, decision, tt.wantShouldPublish, "")
- })
- }
+ t.Run("always true", func(t *testing.T) {
+ cfg := &config.MessageDecisionConfig{
+ Params: map[string]string{},
+ Result: "true",
+ }
+ engine, err := NewDecisionEngine(cfg)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ resource := newResourceWithCondition("True", now, 2)
+ decision := engine.Evaluate(resource, now)
+
+ if !decision.ShouldPublish {
+ t.Error("expected ShouldPublish=true for always-true result")
+ }
+ })
+
+ t.Run("always false", func(t *testing.T) {
+ cfg := &config.MessageDecisionConfig{
+ Params: map[string]string{},
+ Result: "false",
+ }
+ engine, err := NewDecisionEngine(cfg)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ resource := newResourceWithCondition("True", now, 2)
+ decision := engine.Evaluate(resource, now)
+
+ if decision.ShouldPublish {
+ t.Error("expected ShouldPublish=false for always-false result")
+ }
+ })
+
+ t.Run("param chain with dependencies", func(t *testing.T) {
+ cfg := &config.MessageDecisionConfig{
+ Params: map[string]string{
+ "gen": "resource.generation",
+ "is_first": "gen == 1",
+ "should_pub": "is_first",
+ },
+ Result: "should_pub",
+ }
+ engine, err := NewDecisionEngine(cfg)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ // generation 1 → should publish
+ resource := newResourceWithCondition("False", now, 1)
+ decision := engine.Evaluate(resource, now)
+ if !decision.ShouldPublish {
+ t.Error("expected ShouldPublish=true for generation 1")
+ }
+
+ // generation 2 → should not publish
+ resource2 := newResourceWithCondition("False", now, 2)
+ decision2 := engine.Evaluate(resource2, now)
+ if decision2.ShouldPublish {
+ t.Error("expected ShouldPublish=false for generation 2")
+ }
+ })
+
+ t.Run("condition function with custom condition name", func(t *testing.T) {
+ cfg := &config.MessageDecisionConfig{
+ Params: map[string]string{
+ "is_available": `condition("Available").status == "True"`,
+ },
+ Result: "is_available",
+ }
+ engine, err := NewDecisionEngine(cfg)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ resource := &client.Resource{
+ ID: testResourceID,
+ Kind: testResourceKind,
+ Generation: 1,
+ Status: client.ResourceStatus{
+ Conditions: []client.Condition{
+ {Type: "Available", Status: "True", LastUpdatedTime: now},
+ },
+ },
+ }
+
+ decision := engine.Evaluate(resource, now)
+ if !decision.ShouldPublish {
+ t.Error("expected ShouldPublish=true for Available=True condition")
+ }
+
+ // Missing Available condition → zero-value → status="" → false
+ resource2 := &client.Resource{
+ ID: testResourceID,
+ Kind: testResourceKind,
+ Generation: 1,
+ Status: client.ResourceStatus{
+ Conditions: []client.Condition{
+ {Type: "Ready", Status: "True", LastUpdatedTime: now},
+ },
+ },
+ }
+
+ decision2 := engine.Evaluate(resource2, now)
+ if decision2.ShouldPublish {
+ t.Error("expected ShouldPublish=false when Available condition is missing")
+ }
+ })
}
-// TestDecisionEngine_Evaluate_ConsistentBehavior tests that multiple calls with same inputs produce same results
func TestDecisionEngine_Evaluate_ConsistentBehavior(t *testing.T) {
- engine := newTestEngine()
+ engine := newTestDecisionEngine(t)
now := time.Now()
- resource := newTestResource(true, now.Add(-31*time.Minute))
+ resource := newResourceWithCondition("True", now.Add(-31*time.Minute), 2)
- // Call multiple times - should get same result
decision1 := engine.Evaluate(resource, now)
decision2 := engine.Evaluate(resource, now)
decision3 := engine.Evaluate(resource, now)
@@ -363,352 +394,198 @@ func TestDecisionEngine_Evaluate_ConsistentBehavior(t *testing.T) {
if decision1.ShouldPublish != decision2.ShouldPublish || decision1.ShouldPublish != decision3.ShouldPublish {
t.Error("Evaluate should return consistent results for same inputs")
}
-
if decision1.Reason != decision2.Reason || decision1.Reason != decision3.Reason {
t.Error("Evaluate should return consistent reason for same inputs")
}
}
-// TestDecisionEngine_Evaluate_InvalidInputs tests handling of invalid inputs
-func TestDecisionEngine_Evaluate_InvalidInputs(t *testing.T) {
- engine := newTestEngine()
+func TestDecisionEngine_Evaluate_ReadyBoundary(t *testing.T) {
now := time.Now()
+ engine := newTestDecisionEngine(t)
+ // Default ready_and_stale threshold is 30m (strictly greater than)
tests := []struct {
- now time.Time
- resource *client.Resource
name string
- wantReason string
+ lastUpdated time.Duration
wantShouldPublish bool
}{
- {
- name: "nil resource",
- resource: nil,
- now: now,
- wantShouldPublish: false,
- wantReason: ReasonNilResource,
- },
- {
- name: "zero now time",
- resource: newTestResource(true, now),
- now: time.Time{}, // Zero time
- wantShouldPublish: false,
- wantReason: ReasonZeroNow,
- },
+ {"exactly 30m - should not publish (> not >=)", -30 * time.Minute, false},
+ {"29m59s - should not publish", -29*time.Minute - 59*time.Second, false},
+ {"31m - should publish", -31 * time.Minute, true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- decision := engine.Evaluate(tt.resource, tt.now)
+ resource := newResourceWithCondition("True", now.Add(tt.lastUpdated), 2)
+ decision := engine.Evaluate(resource, now)
if decision.ShouldPublish != tt.wantShouldPublish {
- t.Errorf("ShouldPublish = %v, want %v", decision.ShouldPublish, tt.wantShouldPublish)
- }
-
- if decision.Reason != tt.wantReason {
- t.Errorf("Reason = %q, want %q", decision.Reason, tt.wantReason)
+ t.Errorf("ShouldPublish = %v, want %v (lastUpdated offset: %v)",
+ decision.ShouldPublish, tt.wantShouldPublish, tt.lastUpdated)
}
})
}
}
-// TestDecisionEngine_Evaluate_NeverProcessedResources tests the never-processed reconciliation
-// When lastUpdated is zero, resources are published immediately with "never processed" reason
-func TestDecisionEngine_Evaluate_NeverProcessedResources(t *testing.T) {
- engine := newTestEngine()
+func TestDecisionEngine_Evaluate_NotReadyBoundary(t *testing.T) {
now := time.Now()
+ engine := newTestDecisionEngine(t)
+ // Default not_ready_and_debounced threshold is 10s (strictly greater than)
tests := []struct {
- createdTime time.Time
- lastUpdated time.Time
- name string
- wantReasonContains string
- description string
- ready bool
- wantShouldPublish bool
+ name string
+ lastUpdated time.Duration
+ wantShouldPublish bool
}{
- {
- name: "never processed - not ready",
- createdTime: now.Add(-1 * time.Hour), // Doesn't affect decision
- lastUpdated: time.Time{}, // Zero - never processed
- ready: false,
- wantShouldPublish: true,
- wantReasonContains: ReasonNeverProcessed,
- description: "Should publish immediately (never processed)",
- },
- {
- name: "never processed - ready",
- createdTime: now.Add(-1 * time.Hour), // Doesn't affect decision
- lastUpdated: time.Time{}, // Zero - never processed
- ready: true,
- wantShouldPublish: true,
- wantReasonContains: ReasonNeverProcessed,
- description: "Should publish immediately (never processed)",
- },
- {
- name: "processed resource - use lastUpdated for max age",
- createdTime: now.Add(-1 * time.Hour), // Irrelevant once processed
- lastUpdated: now.Add(-5 * time.Second), // Updated recently
- ready: false,
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "Should use lastUpdated for max age calculation (5s < 10s max age)",
- },
+ {"exactly 10s - should not publish (> not >=)", -10 * time.Second, false},
+ {"9s - should not publish", -9 * time.Second, false},
+ {"11s - should publish", -11 * time.Second, true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- resource := newTestResourceWithCreatedTime(
- testResourceID,
- testResourceKind,
- tt.ready,
- tt.createdTime,
- tt.lastUpdated,
- )
+ resource := newResourceWithCondition("False", now.Add(tt.lastUpdated), 2)
decision := engine.Evaluate(resource, now)
- assertDecision(t, decision, tt.wantShouldPublish, tt.wantReasonContains)
-
- // Additional context on failure
- if t.Failed() {
- t.Logf("Test description: %s", tt.description)
- t.Logf("Created time: %v", tt.createdTime)
- t.Logf("LastUpdated: %v", tt.lastUpdated)
+ if decision.ShouldPublish != tt.wantShouldPublish {
+ t.Errorf("ShouldPublish = %v, want %v (lastUpdated offset: %v)",
+ decision.ShouldPublish, tt.wantShouldPublish, tt.lastUpdated)
}
})
}
}
-// TestDecisionEngine_Evaluate_GenerationBasedReconciliation tests generation-based reconciliation
-func TestDecisionEngine_Evaluate_GenerationBasedReconciliation(t *testing.T) {
- engine := newTestEngine()
+func TestBuildConditionsLookup(t *testing.T) {
now := time.Now()
-
- tests := []struct {
- lastUpdated time.Time
- name string
- wantReasonContains string
- description string
- generation int32
- observedGeneration int32
- ready bool
- wantShouldPublish bool
- }{
- // Generation mismatch tests - should publish immediately
- {
- name: "generation ahead by 1 - ready",
- generation: 2,
- observedGeneration: 1,
- ready: true,
- lastUpdated: now, // Even with recent update, should publish due to generation
- wantShouldPublish: true,
- wantReasonContains: ReasonGenerationChanged,
- description: "Spec change should trigger immediate reconciliation (ready)",
- },
+ conditions := []client.Condition{
{
- name: "generation ahead by 1 - not ready",
- generation: 3,
- observedGeneration: 2,
- ready: false,
- lastUpdated: now, // Even with recent update, should publish due to generation
- wantShouldPublish: true,
- wantReasonContains: ReasonGenerationChanged,
- description: "Spec change should trigger immediate reconciliation (not ready)",
+ Type: "Ready",
+ Status: "True",
+ LastUpdatedTime: now,
+ LastTransitionTime: now.Add(-1 * time.Hour),
+ ObservedGeneration: 3,
},
{
- name: "generation ahead by many - ready",
- generation: 10,
- observedGeneration: 5,
- ready: true,
- lastUpdated: now,
- wantShouldPublish: true,
- wantReasonContains: ReasonGenerationChanged,
- description: "Multiple spec changes should trigger immediate reconciliation",
- },
- {
- name: "generation ahead - zero observedGeneration",
- generation: 1,
- observedGeneration: 0,
- ready: true,
- lastUpdated: now,
- wantShouldPublish: true,
- wantReasonContains: ReasonGenerationChanged,
- description: "Generation mismatch triggers immediate reconciliation",
+ Type: "Available",
+ Status: "False",
+ LastUpdatedTime: now.Add(-5 * time.Minute),
+ LastTransitionTime: now.Add(-10 * time.Minute),
+ ObservedGeneration: 2,
},
+ }
- // Generation in sync tests - should follow normal max age logic
- {
- name: "generation in sync - recent update - ready",
- generation: 5,
- observedGeneration: 5,
- ready: true,
- lastUpdated: now.Add(-15 * time.Minute), // 15m ago (< 30m max age)
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "When generations match, follow normal max age logic (ready)",
- },
- {
- name: "generation in sync - recent update - not ready",
- generation: 2,
- observedGeneration: 2,
- ready: false,
- lastUpdated: now.Add(-5 * time.Second), // 5s ago (< 10s max age)
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "When generations match, follow normal max age logic (not ready)",
- },
- {
- name: "generation in sync - max age exceeded - ready",
- generation: 3,
- observedGeneration: 3,
- ready: true,
- lastUpdated: now.Add(-31 * time.Minute), // 31m ago (> 30m max age)
- wantShouldPublish: true,
- wantReasonContains: ReasonMaxAgeExceeded,
- description: "When generations match and max age exceeded, publish (ready)",
- },
- {
- name: "generation in sync - max age exceeded - not ready",
- generation: 1,
- observedGeneration: 1,
- ready: false,
- lastUpdated: now.Add(-11 * time.Second), // 11s ago (> 10s max age)
- wantShouldPublish: true,
- wantReasonContains: ReasonMaxAgeExceeded,
- description: "When generations match and max age exceeded, publish (not ready)",
- },
+ lookup := buildConditionsLookup(conditions)
- // Edge cases
- {
- name: "both generation and observedGeneration are zero",
- generation: 0,
- observedGeneration: 0,
- ready: true,
- lastUpdated: now.Add(-5 * time.Minute),
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "Zero generations should be treated as in sync (defensive)",
- },
- {
- name: "observedGeneration ahead of generation (defensive)",
- generation: 5,
- observedGeneration: 10,
- ready: true,
- lastUpdated: now.Add(-5 * time.Minute),
- wantShouldPublish: false,
- wantReasonContains: "max age not exceeded",
- description: "ObservedGeneration > Generation shouldn't happen, but handle defensively",
- },
+ if len(lookup) != 2 {
+ t.Fatalf("expected 2 entries, got %d", len(lookup))
}
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- resource := newTestResourceWithGeneration(
- testResourceID,
- testResourceKind,
- tt.ready,
- tt.lastUpdated,
- tt.generation,
- tt.observedGeneration,
- )
- decision := engine.Evaluate(resource, now)
+ ready, ok := lookup["Ready"]
+ if !ok {
+ t.Fatal("missing Ready condition")
+ }
+ if ready["status"] != "True" {
+ t.Errorf("Ready status = %v, want True", ready["status"])
+ }
+ if ready["observed_generation"] != int64(3) {
+ t.Errorf("Ready observed_generation = %v, want 3", ready["observed_generation"])
+ }
- assertDecision(t, decision, tt.wantShouldPublish, tt.wantReasonContains)
+ avail, ok := lookup["Available"]
+ if !ok {
+ t.Fatal("missing Available condition")
+ }
+ if avail["status"] != "False" {
+ t.Errorf("Available status = %v, want False", avail["status"])
+ }
+}
- // Additional context on failure
- if t.Failed() {
- t.Logf("Test description: %s", tt.description)
- t.Logf("Generation: %d, ObservedGeneration: %d", tt.generation, tt.observedGeneration)
- t.Logf("Ready: %t, LastUpdated: %v", tt.ready, tt.lastUpdated)
- }
- })
+func TestBuildConditionsLookup_Empty(t *testing.T) {
+ lookup := buildConditionsLookup(nil)
+ if len(lookup) != 0 {
+ t.Errorf("expected empty map, got %d entries", len(lookup))
}
}
-// TestDecisionEngine_Evaluate_NeverProcessedPrecedence tests the precedence of checks
-// when both "never processed" and "generation changed" conditions are true.
-//
-// For brand-new resources (generation=1, observedGeneration=0, lastUpdated=zero),
-// both the LastUpdated.IsZero() check and the generation mismatch check would fire.
-// This test ensures that the "never processed" check takes precedence since it runs
-// first, and protects against future reordering of these checks.
-func TestDecisionEngine_Evaluate_NeverProcessedPrecedence(t *testing.T) {
- engine := newTestEngine()
- now := time.Now()
+func TestZeroCondition(t *testing.T) {
+ zero := zeroCondition()
- tests := []struct {
- lastUpdated time.Time
- name string
- wantReasonContains string
- description string
- generation int32
- observedGeneration int32
- ready bool
- wantShouldPublish bool
- }{
- {
- name: "brand-new resource - never processed wins over generation changed",
- generation: 1,
- observedGeneration: 0,
- ready: false,
- lastUpdated: time.Time{}, // Zero - triggers "never processed"
- wantShouldPublish: true,
- wantReasonContains: ReasonNeverProcessed, // Should be "never processed", not "generation changed"
- description: "Both checks fire, but never-processed check runs first and wins",
- },
- {
- name: "brand-new resource - ready - never processed wins",
- generation: 1,
- observedGeneration: 0,
- ready: true,
- lastUpdated: time.Time{}, // Zero - triggers "never processed"
- wantShouldPublish: true,
- wantReasonContains: ReasonNeverProcessed,
- description: "Both checks fire, but never-processed check runs first and wins (ready)",
- },
- {
- name: "generation mismatch with non-zero lastUpdated - generation changed wins",
- generation: 1,
- observedGeneration: 0,
- ready: true,
- lastUpdated: now, // Non-zero - skips "never processed" check
- wantShouldPublish: true,
- wantReasonContains: ReasonGenerationChanged, // Should be "generation changed"
- description: "Only generation check fires since lastUpdated is non-zero",
- },
- {
- name: "multiple generation changes on never-processed resource",
- generation: 5,
- observedGeneration: 0,
- ready: false,
- lastUpdated: time.Time{}, // Zero - triggers "never processed"
- wantShouldPublish: true,
- wantReasonContains: ReasonNeverProcessed, // Still "never processed", not "generation changed"
- description: "Never-processed check takes precedence even with large generation gap",
+ if zero["status"] != "" {
+ t.Errorf("status = %q, want empty string", zero["status"])
+ }
+ if zero["observed_generation"] != int64(0) {
+ t.Errorf("observed_generation = %v, want 0", zero["observed_generation"])
+ }
+ if zero["last_updated_time"] != "" {
+ t.Errorf("last_updated_time = %q, want empty string", zero["last_updated_time"])
+ }
+ if zero["last_transition_time"] != "" {
+ t.Errorf("last_transition_time = %q, want empty string", zero["last_transition_time"])
+ }
+}
+
+func TestResourceToMap(t *testing.T) {
+ now := time.Now()
+ resource := &client.Resource{
+ ID: "res-1",
+ Href: "/api/v1/clusters/res-1",
+ Kind: "Cluster",
+ Generation: 3,
+ CreatedTime: now,
+ UpdatedTime: now,
+ Labels: map[string]string{"env": "prod"},
+ OwnerReferences: &client.OwnerReference{
+ ID: "owner-1",
+ Href: "/api/v1/owners/owner-1",
+ Kind: "Owner",
},
}
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- resource := newTestResourceWithGeneration(
- testResourceID,
- testResourceKind,
- tt.ready,
- tt.lastUpdated,
- tt.generation,
- tt.observedGeneration,
- )
- decision := engine.Evaluate(resource, now)
+ m := resourceToMap(resource)
- assertDecision(t, decision, tt.wantShouldPublish, tt.wantReasonContains)
+ if m["id"] != "res-1" {
+ t.Errorf("id = %v, want res-1", m["id"])
+ }
+ if m["kind"] != "Cluster" {
+ t.Errorf("kind = %v, want Cluster", m["kind"])
+ }
+ if m["generation"] != int64(3) {
+ t.Errorf("generation = %v, want 3", m["generation"])
+ }
- // Additional context on failure
- if t.Failed() {
- t.Logf("Test description: %s", tt.description)
- t.Logf("Generation: %d, ObservedGeneration: %d", tt.generation, tt.observedGeneration)
- t.Logf("LastUpdated.IsZero(): %v", tt.lastUpdated.IsZero())
- t.Logf("Expected reason containing: %q, got: %q", tt.wantReasonContains, decision.Reason)
- }
- })
+ labels, ok := m["labels"].(map[string]interface{})
+ if !ok {
+ t.Fatal("labels not found or wrong type")
+ }
+ if labels["env"] != "prod" {
+ t.Errorf("labels.env = %v, want prod", labels["env"])
+ }
+
+ owner, ok := m["owner_references"].(map[string]interface{})
+ if !ok {
+ t.Fatal("owner_references not found or wrong type")
+ }
+ if owner["id"] != "owner-1" {
+ t.Errorf("owner_references.id = %v, want owner-1", owner["id"])
+ }
+}
+
+func TestResourceToMap_NoOptionalFields(t *testing.T) {
+ resource := &client.Resource{
+ ID: "res-2",
+ Kind: "NodePool",
+ Generation: 1,
+ }
+
+ m := resourceToMap(resource)
+
+ if _, ok := m["labels"]; ok {
+ t.Error("labels should not be present when empty")
+ }
+ if _, ok := m["owner_references"]; ok {
+ t.Error("owner_references should not be present when nil")
+ }
+ if _, ok := m["metadata"]; ok {
+ t.Error("metadata should not be present when nil")
}
}
diff --git a/internal/payload/builder.go b/internal/payload/builder.go
index 5eb94be..d744a8c 100644
--- a/internal/payload/builder.go
+++ b/internal/payload/builder.go
@@ -136,13 +136,9 @@ func (b *Builder) BuildPayload(ctx context.Context, resource *client.Resource, r
// resourceToMap converts a Resource into a plain map[string]interface{} for CEL evaluation.
// Time fields are formatted as RFC3339Nano strings to match their JSON representation.
+// Status data is accessed only through conditions (no flattened fields).
func resourceToMap(r *client.Resource) map[string]interface{} {
- status := map[string]interface{}{
- "ready": r.Status.Ready,
- "last_transition_time": r.Status.LastTransitionTime.Format(time.RFC3339Nano),
- "last_updated": r.Status.LastUpdated.Format(time.RFC3339Nano),
- "observed_generation": r.Status.ObservedGeneration,
- }
+ status := map[string]interface{}{}
if len(r.Status.Conditions) > 0 {
conditions := make([]interface{}, len(r.Status.Conditions))
for i, c := range r.Status.Conditions {
diff --git a/internal/payload/builder_test.go b/internal/payload/builder_test.go
index cb20bf9..dcc643d 100644
--- a/internal/payload/builder_test.go
+++ b/internal/payload/builder_test.go
@@ -118,7 +118,13 @@ func makeTestResource() *client.Resource {
Href: "/api/v1/clusters/cls-abc",
Generation: 3,
Status: client.ResourceStatus{
- Ready: true,
+ Conditions: []client.Condition{
+ {
+ Type: "Ready",
+ Status: "True",
+ LastUpdatedTime: time.Now(),
+ },
+ },
},
Labels: map[string]string{"region": "us-east"},
CreatedTime: time.Now(),
@@ -133,7 +139,13 @@ func makeTestNodePoolResource() *client.Resource {
Href: "/api/v1/nodepools/np-xyz",
Generation: 2,
Status: client.ResourceStatus{
- Ready: true,
+ Conditions: []client.Condition{
+ {
+ Type: "Ready",
+ Status: "True",
+ LastUpdatedTime: time.Now(),
+ },
+ },
},
OwnerReferences: &client.OwnerReference{
ID: "cluster-123",
@@ -191,7 +203,7 @@ func TestBuildPayload_NestedObject(t *testing.T) {
func TestBuildPayload_CELConditional(t *testing.T) {
buildDef := map[string]interface{}{
- "status": `resource.status.ready == true ? "Ready" : "NotReady"`,
+ "gen_check": `resource.generation > 2 ? "high" : "low"`,
}
b, err := NewBuilder(buildDef, logger.NewHyperFleetLogger())
if err != nil {
@@ -200,8 +212,8 @@ func TestBuildPayload_CELConditional(t *testing.T) {
payload := b.BuildPayload(context.Background(), makeTestResource(), "")
- if payload["status"] != "Ready" {
- t.Errorf("expected status 'Ready', got %v", payload["status"])
+ if payload["gen_check"] != "high" {
+ t.Errorf("expected gen_check 'high', got %v", payload["gen_check"])
}
}
@@ -243,9 +255,9 @@ func TestBuildPayload_CELStringLiteral(t *testing.T) {
func TestBuildPayload_MixedTypes(t *testing.T) {
buildDef := map[string]interface{}{
- "id": "resource.id",
- "origin": `"sentinel"`,
- "status": `resource.status.ready == true ? "Ready" : "NotReady"`,
+ "id": "resource.id",
+ "origin": `"sentinel"`,
+ "gen_check": `resource.generation > 2 ? "high" : "low"`,
"nested": map[string]interface{}{
"kind": "resource.kind",
},
@@ -263,8 +275,8 @@ func TestBuildPayload_MixedTypes(t *testing.T) {
if payload["origin"] != "sentinel" {
t.Errorf("expected origin 'sentinel', got %v", payload["origin"])
}
- if payload["status"] != "Ready" {
- t.Errorf("expected status 'Ready', got %v", payload["status"])
+ if payload["gen_check"] != "high" {
+ t.Errorf("expected gen_check 'high', got %v", payload["gen_check"])
}
nested, ok := payload["nested"].(map[string]interface{})
if !ok {
diff --git a/internal/sentinel/sentinel.go b/internal/sentinel/sentinel.go
index 693cdb9..3c15368 100644
--- a/internal/sentinel/sentinel.go
+++ b/internal/sentinel/sentinel.go
@@ -2,7 +2,6 @@ package sentinel
import (
"context"
- "errors"
"fmt"
"sync"
"time"
@@ -21,15 +20,6 @@ import (
"go.opentelemetry.io/otel/codes"
)
-const (
- // notReadyFilter selects resources whose Ready condition is False.
- notReadyFilter = "status.conditions.Ready='False'"
-
- // staleReadyFilterFmt selects resources whose Ready condition is True but
- // last_updated_time is older than the given cutoff (RFC 3339 timestamp).
- staleReadyFilterFmt = "status.conditions.Ready='True' and status.conditions.Ready.last_updated_time<='%s'"
-)
-
// Sentinel polls the HyperFleet API and triggers reconciliation events
type Sentinel struct {
lastSuccessfulPoll time.Time
@@ -77,8 +67,8 @@ func (s *Sentinel) LastSuccessfulPoll() time.Time {
// Start starts the polling loop
func (s *Sentinel) Start(ctx context.Context) error {
- s.logger.Infof(ctx, "Starting sentinel resource_type=%s poll_interval=%s max_age_not_ready=%s max_age_ready=%s",
- s.config.ResourceType, s.config.PollInterval, s.config.MaxAgeNotReady, s.config.MaxAgeReady)
+ s.logger.Infof(ctx, "Starting sentinel resource_type=%s poll_interval=%s",
+ s.config.ResourceType, s.config.PollInterval)
ticker := time.NewTicker(s.config.PollInterval)
defer ticker.Stop()
@@ -124,10 +114,13 @@ func (s *Sentinel) trigger(ctx context.Context) error {
// Convert label selectors to map for filtering
labelSelector := s.config.ResourceSelector.ToMap()
- // Fetch resources using condition-based server-side filtering:
- // Query 1: Not-ready resources (need frequent reconciliation)
- // Query 2: Stale ready resources (exceeded max age)
- resources, err := s.fetchFilteredResources(ctx, labelSelector)
+ // Fetch all resources matching label selectors.
+ // TODO(HYPERFLEET-805): Add optional server_filters config for server-side pre-filtering
+ // to reduce the result set before CEL evaluation. Currently fetches the full result set
+ // and evaluates each resource in-memory. At large scale, use resource_selector labels
+ // to shard across multiple Sentinel instances.
+ rt := client.ResourceType(s.config.ResourceType)
+ resources, err := s.client.FetchResources(ctx, rt, labelSelector)
if err != nil {
// Record API error
pollSpan.RecordError(err)
@@ -212,8 +205,8 @@ func (s *Sentinel) trigger(ctx context.Context) error {
// Record successful event publication
metrics.UpdateEventsPublishedMetric(resourceType, resourceSelector, decision.Reason)
- s.logger.Infof(eventCtx, "Published event resource_id=%s ready=%t",
- resource.ID, resource.Status.Ready)
+ s.logger.Infof(eventCtx, "Published event resource_id=%s",
+ resource.ID)
published++
} else {
// Add decision reason to context for structured logging
@@ -222,8 +215,8 @@ func (s *Sentinel) trigger(ctx context.Context) error {
// Record skipped resource
metrics.UpdateResourcesSkippedMetric(resourceType, resourceSelector, decision.Reason)
- s.logger.Debugf(skipCtx, "Skipped resource resource_id=%s ready=%t",
- resource.ID, resource.Status.Ready)
+ s.logger.Debugf(skipCtx, "Skipped resource resource_id=%s",
+ resource.ID)
skipped++
}
@@ -248,64 +241,6 @@ func (s *Sentinel) trigger(ctx context.Context) error {
return nil
}
-// fetchFilteredResources makes two targeted API queries to fetch only resources
-// that likely need reconciliation, reducing network traffic compared to fetching
-// all resources:
-//
-// 1. Not-ready resources: status.conditions.Ready='False'
-// 2. Stale ready resources: Ready='True' with last_updated_time older than max_age_ready
-//
-// Results are merged and deduplicated by resource ID. The DecisionEngine still
-// evaluates the filtered set in memory (e.g., for generation-based checks).
-func (s *Sentinel) fetchFilteredResources(
- ctx context.Context,
- labelSelector map[string]string,
-) ([]client.Resource, error) {
- rt := client.ResourceType(s.config.ResourceType)
-
- // Query 1: Not-ready resources
- notReadyResources, err := s.client.FetchResources(ctx, rt, labelSelector, notReadyFilter)
- if err != nil {
- return nil, fmt.Errorf("failed to fetch not-ready resources: %w", err)
- }
- s.logger.Debugf(ctx, "Fetched not-ready resources count=%d", len(notReadyResources))
-
- // Query 2: Stale ready resources (last_updated_time exceeded max_age_ready)
- cutoff := time.Now().Add(-s.config.MaxAgeReady)
- staleFilter := fmt.Sprintf(staleReadyFilterFmt, cutoff.Format(time.RFC3339))
- staleResources, err := s.client.FetchResources(ctx, rt, labelSelector, staleFilter)
- if err != nil {
- // Propagate context cancellation/timeout — the caller must see these
- if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
- return nil, err
- }
- // Graceful degradation: for transient/API errors, continue with not-ready results
- s.logger.Errorf(ctx, "Failed to fetch stale resources, continuing with not-ready only: %v", err)
- return notReadyResources, nil
- }
- s.logger.Debugf(ctx, "Fetched stale ready resources count=%d", len(staleResources))
-
- return mergeResources(notReadyResources, staleResources), nil
-}
-
-// mergeResources combines two resource slices, deduplicating by resource ID.
-// Resources from the first slice take precedence when duplicates are found.
-func mergeResources(a, b []client.Resource) []client.Resource {
- seen := make(map[string]struct{}, len(a))
- result := make([]client.Resource, 0, len(a)+len(b))
-
- for i := range a {
- seen[a[i].ID] = struct{}{}
- result = append(result, a[i])
- }
- for i := range b {
- if _, exists := seen[b[i].ID]; !exists {
- result = append(result, b[i])
- }
- }
- return result
-}
-
// buildEventData builds the CloudEvent data payload for a resource using the
// configured payload builder.
func (s *Sentinel) buildEventData(
diff --git a/internal/sentinel/sentinel_test.go b/internal/sentinel/sentinel_test.go
index bcbda8b..45ddfaf 100644
--- a/internal/sentinel/sentinel_test.go
+++ b/internal/sentinel/sentinel_test.go
@@ -7,9 +7,6 @@ import (
"fmt"
"net/http"
"net/http/httptest"
- "strings"
- "sync"
- "sync/atomic"
"testing"
"time"
@@ -30,8 +27,7 @@ const (
testResourceKind = "Cluster"
)
-// createMockCluster creates a mock cluster response matching the new API spec.
-// Ready status is determined by the ready bool parameter.
+// createMockCluster creates a mock cluster response matching the API spec.
func createMockCluster(
id string,
generation int,
@@ -39,21 +35,11 @@ func createMockCluster(
ready bool,
lastUpdated time.Time,
) map[string]interface{} {
- // Map ready bool to condition status
readyStatus := "False"
if ready {
readyStatus = "True"
}
- readyCondition := map[string]interface{}{
- "type": "Ready",
- "status": readyStatus,
- "created_time": "2025-01-01T09:00:00Z",
- "last_transition_time": "2025-01-01T10:00:00Z",
- "last_updated_time": lastUpdated.Format(time.RFC3339),
- "observed_generation": observedGeneration,
- }
-
return map[string]interface{}{
"id": id,
"href": "/api/hyperfleet/v1/clusters/" + id,
@@ -67,9 +53,8 @@ func createMockCluster(
"spec": map[string]interface{}{},
"status": map[string]interface{}{
"conditions": []map[string]interface{}{
- readyCondition,
{
- "type": "Available",
+ "type": "Ready",
"status": readyStatus,
"created_time": "2025-01-01T09:00:00Z",
"last_transition_time": "2025-01-01T10:00:00Z",
@@ -92,6 +77,18 @@ func createMockClusterList(clusters []map[string]interface{}) map[string]interfa
}
}
+// mockServerForResources creates a mock HTTP server that returns the given resources.
+func mockServerForResources(t *testing.T, clusters []map[string]interface{}) *httptest.Server {
+ t.Helper()
+ return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ response := createMockClusterList(clusters)
+ w.Header().Set("Content-Type", "application/json")
+ if err := json.NewEncoder(w).Encode(response); err != nil {
+ t.Logf("Error encoding response: %v", err)
+ }
+ }))
+}
+
// MockPublisher implements broker.Publisher for testing
type MockPublisher struct {
publishError error
@@ -119,7 +116,6 @@ type MockPublisherWithLogger struct {
}
func (m *MockPublisherWithLogger) Publish(ctx context.Context, topic string, event *cloudevents.Event) error {
- // Simulate what broker does - log with the provided context
m.mockLogger.Info(ctx, fmt.Sprintf("broker publishing event to topic %s", topic))
return nil
}
@@ -128,35 +124,28 @@ func (m *MockPublisherWithLogger) Close() error { return nil }
func (m *MockPublisherWithLogger) Health(ctx context.Context) error { return nil }
-// mockServerForConditionQueries creates a mock HTTP server that routes responses
-// based on condition-based search parameters. Each poll cycle makes two queries:
-// one for not-ready resources (Ready='False') and one for stale-ready resources (Ready='True').
-func mockServerForConditionQueries(
- t *testing.T,
- notReadyClusters, staleClusters []map[string]interface{},
-) *httptest.Server {
+// newTestDecisionEngine creates a CEL-based decision engine with default config.
+func newTestDecisionEngine(t *testing.T) *engine.DecisionEngine {
t.Helper()
- return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- search := r.URL.Query().Get("search")
-
- var clusters []map[string]interface{}
- switch {
- case strings.Contains(search, "Ready='False'"):
- clusters = notReadyClusters
- case strings.Contains(search, "Ready='True'"):
- clusters = staleClusters
- default:
- t.Errorf("unexpected search query: %q", search)
- w.WriteHeader(http.StatusBadRequest)
- return
- }
+ cfg := config.DefaultMessageDecision()
+ de, err := engine.NewDecisionEngine(cfg)
+ if err != nil {
+ t.Fatalf("NewDecisionEngine failed: %v", err)
+ }
+ return de
+}
- response := createMockClusterList(clusters)
- w.Header().Set("Content-Type", "application/json")
- if err := json.NewEncoder(w).Encode(response); err != nil {
- t.Logf("Error encoding response: %v", err)
- }
- }))
+// newTestSentinelConfig creates a config for testing (no max_age fields).
+func newTestSentinelConfig() *config.SentinelConfig {
+ return &config.SentinelConfig{
+ ResourceType: "clusters",
+ Topic: testTopic,
+ MessageDecision: config.DefaultMessageDecision(),
+ MessageData: map[string]interface{}{
+ "id": "resource.id",
+ "kind": "resource.kind",
+ },
+ }
}
// TestTrigger_Success tests successful event publishing
@@ -164,44 +153,28 @@ func TestTrigger_Success(t *testing.T) {
ctx := context.Background()
now := time.Now()
- // Stale ready cluster exceeds max age (31 minutes ago)
- server := mockServerForConditionQueries(t,
- nil, // no not-ready clusters
- []map[string]interface{}{
- createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)),
- },
- )
+ // Ready cluster with stale last_updated (> 30m) should trigger publish
+ server := mockServerForResources(t, []map[string]interface{}{
+ createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)),
+ })
defer server.Close()
- // Setup components
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
mockPublisher := &MockPublisher{}
log := logger.NewHyperFleetLogger()
- // Create metrics with a test registry (registers metrics once via sync.Once)
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: testTopic,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Execute
err = s.trigger(ctx)
- // Verify
if err != nil {
t.Errorf("Expected no error, got %v", err)
}
@@ -210,15 +183,10 @@ func TestTrigger_Success(t *testing.T) {
t.Errorf("Expected 1 published event, got %d", len(mockPublisher.publishedEvents))
}
- if len(mockPublisher.publishedTopics) != 1 {
- t.Errorf("Expected 1 topic, got %d", len(mockPublisher.publishedTopics))
- }
-
- if mockPublisher.publishedTopics[0] != testTopic {
- t.Errorf("Expected topic '%s', got '%s'", testTopic, mockPublisher.publishedTopics[0])
+ if len(mockPublisher.publishedTopics) != 1 || mockPublisher.publishedTopics[0] != testTopic {
+ t.Errorf("Expected topic '%s', got %v", testTopic, mockPublisher.publishedTopics)
}
- // Verify CloudEvent properties
event := mockPublisher.publishedEvents[0]
if event.Type() != "com.redhat.hyperfleet.Cluster.reconcile" {
t.Errorf("Expected event type 'com.redhat.hyperfleet.Cluster.reconcile', got '%s'", event.Type())
@@ -235,44 +203,29 @@ func TestTrigger_Success(t *testing.T) {
func TestTrigger_NoEventsPublished(t *testing.T) {
ctx := context.Background()
- server := mockServerForConditionQueries(t,
- []map[string]interface{}{
- // Not-ready for only 1 second — within max_age_not_ready (10s), should be skipped
- createMockCluster("cluster-1", 1, 1, false, time.Now().Add(-1*time.Second)),
- },
- nil, // no stale clusters
- )
+ // Not-ready for only 1 second — within debounce (10s), should be skipped.
+ // Generation > 1 so is_new_resource won't fire.
+ server := mockServerForResources(t, []map[string]interface{}{
+ createMockCluster("cluster-1", 2, 2, false, time.Now().Add(-1*time.Second)),
+ })
defer server.Close()
- // Setup components
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
mockPublisher := &MockPublisher{}
log := logger.NewHyperFleetLogger()
- // Create metrics with a test registry (registers metrics once via sync.Once)
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: testTopic,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Execute
err = s.trigger(ctx)
- // Verify
if err != nil {
t.Errorf("Expected no error, got %v", err)
}
@@ -286,7 +239,6 @@ func TestTrigger_NoEventsPublished(t *testing.T) {
func TestTrigger_FetchError(t *testing.T) {
ctx := context.Background()
- // Create mock server that returns 500 errors for all queries
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
if _, err := w.Write([]byte(`{"error": "internal server error"}`)); err != nil {
@@ -295,36 +247,23 @@ func TestTrigger_FetchError(t *testing.T) {
}))
defer server.Close()
- // Setup components
- hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 1*time.Second) // Short timeout
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 1*time.Second)
+ decisionEngine := newTestDecisionEngine(t)
mockPublisher := &MockPublisher{}
log := logger.NewHyperFleetLogger()
- // Create metrics with a test registry (registers metrics once via sync.Once)
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: testTopic,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Execute
err = s.trigger(ctx)
- // Verify
if err == nil {
t.Error("Expected error, got nil")
}
@@ -339,45 +278,29 @@ func TestTrigger_PublishError(t *testing.T) {
ctx := context.Background()
now := time.Now()
- server := mockServerForConditionQueries(t,
- nil, // no not-ready clusters
- []map[string]interface{}{
- createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)),
- },
- )
+ server := mockServerForResources(t, []map[string]interface{}{
+ createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)),
+ })
defer server.Close()
- // Setup components
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
mockPublisher := &MockPublisher{
publishError: errors.New("broker connection failed"),
}
log := logger.NewHyperFleetLogger()
- // Create metrics with a test registry (registers metrics once via sync.Once)
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: testTopic,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Execute
err = s.trigger(ctx)
- // Verify - trigger should succeed even if publish fails (graceful degradation)
if err != nil {
t.Errorf("Expected no error (graceful degradation), got %v", err)
}
@@ -388,65 +311,45 @@ func TestTrigger_MixedResources(t *testing.T) {
ctx := context.Background()
now := time.Now()
- // Split resources between not-ready and stale queries
- server := mockServerForConditionQueries(t,
- // Not-ready query
- []map[string]interface{}{
- // Should publish (not ready max age exceeded: 1min > 10s)
- createMockCluster("cluster-3", 3, 3, false, now.Add(-1*time.Minute)),
- // Should be skipped (empty ID)
- createMockCluster("", 1, 1, false, now.Add(-15*time.Second)),
- },
- // Stale query
- []map[string]interface{}{
- createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), // Should publish (max age exceeded)
- createMockCluster("cluster-4", 5, 4, true, now.Add(-5*time.Minute)), // Should publish (generation changed)
- },
- )
+ server := mockServerForResources(t, []map[string]interface{}{
+ // Should publish (not ready, debounce exceeded: 1min > 10s, generation > 1)
+ createMockCluster("cluster-3", 3, 3, false, now.Add(-1*time.Minute)),
+ // Should be skipped (empty ID)
+ createMockCluster("", 1, 1, false, now.Add(-15*time.Second)),
+ // Should publish (ready, stale: 31min > 30min)
+ createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)),
+ // Should not publish (ready, recent: 5min < 30min, no generation mismatch)
+ createMockCluster("cluster-4", 5, 5, true, now.Add(-5*time.Minute)),
+ })
defer server.Close()
- // Setup components
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
mockPublisher := &MockPublisher{}
log := logger.NewHyperFleetLogger()
- // Create metrics with a test registry (registers metrics once via sync.Once)
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: testTopic,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Execute
err = s.trigger(ctx)
- // Verify
if err != nil {
t.Errorf("Expected no error, got %v", err)
}
- // Should publish for:
- // - cluster-3 (not ready max age exceeded: 1min > 10s)
- // - cluster-1 (ready max age exceeded: 31min > 30min)
- // - cluster-4 (generation changed: 5 > 4)
- if len(mockPublisher.publishedEvents) != 3 {
- t.Errorf("Expected 3 published events, got %d", len(mockPublisher.publishedEvents))
+	// cluster-3: not ready + debounce exceeded (1min > 10s) → publish
+ // cluster-1: ready + stale → publish
+ // cluster-4: ready + recent, not new → skip
+ if len(mockPublisher.publishedEvents) != 2 {
+ t.Errorf("Expected 2 published events, got %d", len(mockPublisher.publishedEvents))
}
- // Verify topics
for _, topic := range mockPublisher.publishedTopics {
if topic != testTopic {
t.Errorf("Expected topic '%s', got '%s'", testTopic, topic)
@@ -460,32 +363,24 @@ func TestTrigger_WithMessageDataConfig(t *testing.T) {
ctx := context.Background()
now := time.Now()
- server := mockServerForConditionQueries(t,
- nil,
- []map[string]interface{}{
- createMockCluster("cluster-xyz", 2, 2, true, now.Add(-31*time.Minute)),
- },
- )
+ server := mockServerForResources(t, []map[string]interface{}{
+ createMockCluster("cluster-xyz", 2, 2, true, now.Add(-31*time.Minute)),
+ })
defer server.Close()
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
mockPublisher := &MockPublisher{}
log := logger.NewHyperFleetLogger()
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: testTopic,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- "origin": `"sentinel"`,
- },
+ cfg := newTestSentinelConfig()
+ cfg.MessageData = map[string]interface{}{
+ "id": "resource.id",
+ "kind": "resource.kind",
+ "origin": `"sentinel"`,
}
s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
@@ -514,7 +409,6 @@ func TestTrigger_WithMessageDataConfig(t *testing.T) {
if data["origin"] != "sentinel" {
t.Errorf("Expected origin 'sentinel', got %v", data["origin"])
}
- // Hardcoded fields should NOT be present when builder is configured
if _, ok := data["reason"]; ok {
t.Errorf("Expected 'reason' to be absent (hardcoded field), but found it")
}
@@ -525,33 +419,25 @@ func TestTrigger_WithNestedMessageData(t *testing.T) {
ctx := context.Background()
now := time.Now()
- server := mockServerForConditionQueries(t,
- nil,
- []map[string]interface{}{
- createMockCluster("cluster-nest", 1, 1, true, now.Add(-31*time.Minute)),
- },
- )
+ server := mockServerForResources(t, []map[string]interface{}{
+ createMockCluster("cluster-nest", 2, 2, true, now.Add(-31*time.Minute)),
+ })
defer server.Close()
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
mockPublisher := &MockPublisher{}
log := logger.NewHyperFleetLogger()
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: testTopic,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "resource": map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
+ cfg := newTestSentinelConfig()
+ cfg.MessageData = map[string]interface{}{
+ "id": "resource.id",
+ "resource": map[string]interface{}{
+ "id": "resource.id",
+ "kind": "resource.kind",
},
}
@@ -586,15 +472,7 @@ func TestTrigger_WithNestedMessageData(t *testing.T) {
// TestBuildEventData_WithBuilder tests buildEventData directly with a configured builder.
func TestBuildEventData_WithBuilder(t *testing.T) {
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
log := logger.NewHyperFleetLogger()
s, err := NewSentinel(cfg, nil, nil, nil, log)
if err != nil {
@@ -607,7 +485,7 @@ func TestBuildEventData_WithBuilder(t *testing.T) {
Href: "/api/v1/clusters/cls-direct",
Generation: 1,
}
- decision := engine.Decision{ShouldPublish: true, Reason: "max_age_exceeded"}
+ decision := engine.Decision{ShouldPublish: true, Reason: "message decision matched"}
ctx := logger.WithDecisionReason(context.Background(), decision.Reason)
data := s.buildEventData(ctx, resource, decision)
@@ -628,13 +506,12 @@ func TestTrigger_ContextPropagationToBroker(t *testing.T) {
CapturedContexts: &capturedContexts,
}
- // Create mock publisher that uses our mock logger
mockPublisherWithLogger := &MockPublisherWithLogger{
mockLogger: mockLogger,
}
ctx := context.Background()
- ctx = logger.WithDecisionReason(ctx, "max_age_exceeded")
+ ctx = logger.WithDecisionReason(ctx, "message decision matched")
ctx = logger.WithTopic(ctx, testTopic)
ctx = logger.WithSubset(ctx, "clusters")
ctx = logger.WithTraceID(ctx, "trace-123")
@@ -657,8 +534,7 @@ func TestTrigger_ContextPropagationToBroker(t *testing.T) {
brokerCtx := capturedContexts[0]
- // Test context values propagated to broker
- if reason, ok := brokerCtx.Value(logger.DecisionReasonCtxKey).(string); !ok || reason != "max_age_exceeded" {
+ if reason, ok := brokerCtx.Value(logger.DecisionReasonCtxKey).(string); !ok || reason != "message decision matched" {
t.Errorf("decision_reason not propagated: got %v", reason)
}
@@ -675,271 +551,9 @@ func TestTrigger_ContextPropagationToBroker(t *testing.T) {
}
}
-// TestMergeResources tests the mergeResources deduplication function
-func TestMergeResources(t *testing.T) {
- tests := []struct {
- name string
- a []client.Resource
- b []client.Resource
- wantIDs []string
- wantLen int
- }{
- {
- name: "no overlap",
- a: []client.Resource{{ID: "a1"}, {ID: "a2"}},
- b: []client.Resource{{ID: "b1"}},
- wantIDs: []string{"a1", "a2", "b1"},
- wantLen: 3,
- },
- {
- name: "duplicate in b removed",
- a: []client.Resource{{ID: "x"}},
- b: []client.Resource{{ID: "x"}, {ID: "y"}},
- wantIDs: []string{"x", "y"},
- wantLen: 2,
- },
- {
- name: "both empty",
- a: nil,
- b: nil,
- wantIDs: nil,
- wantLen: 0,
- },
- {
- name: "a empty",
- a: nil,
- b: []client.Resource{{ID: "b1"}},
- wantIDs: []string{"b1"},
- wantLen: 1,
- },
- {
- name: "b empty",
- a: []client.Resource{{ID: "a1"}},
- b: nil,
- wantIDs: []string{"a1"},
- wantLen: 1,
- },
- {
- name: "all duplicates",
- a: []client.Resource{{ID: "x"}, {ID: "y"}},
- b: []client.Resource{{ID: "x"}, {ID: "y"}},
- wantIDs: []string{"x", "y"},
- wantLen: 2,
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- result := mergeResources(tt.a, tt.b)
- if len(result) != tt.wantLen {
- t.Errorf("mergeResources() returned %d resources, want %d", len(result), tt.wantLen)
- }
- for i, id := range tt.wantIDs {
- if i < len(result) && result[i].ID != id {
- t.Errorf("mergeResources()[%d].ID = %q, want %q", i, result[i].ID, id)
- }
- }
- })
- }
-}
-
-// TestMergeResources_FirstSlicePrecedence verifies that resources from the first
-// slice take precedence when duplicates are found.
-func TestMergeResources_FirstSlicePrecedence(t *testing.T) {
- a := []client.Resource{{ID: "dup", Kind: "from-a"}}
- b := []client.Resource{{ID: "dup", Kind: "from-b"}}
-
- result := mergeResources(a, b)
- if len(result) != 1 {
- t.Fatalf("Expected 1 resource, got %d", len(result))
- }
- if result[0].Kind != "from-a" {
- t.Errorf("Expected resource from first slice (Kind='from-a'), got Kind=%q", result[0].Kind)
- }
-}
-
-// TestTrigger_ConditionBasedQueries verifies that trigger sends two condition-based
-// queries (not-ready + stale) and correctly processes results from both.
-func TestTrigger_ConditionBasedQueries(t *testing.T) {
- ctx := context.Background()
- now := time.Now()
-
- var capturedSearchParams []string
- var mu sync.Mutex
-
- server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- search := r.URL.Query().Get("search")
- mu.Lock()
- capturedSearchParams = append(capturedSearchParams, search)
- mu.Unlock()
-
- var clusters []map[string]interface{}
- switch {
- case strings.Contains(search, "Ready='False'"):
- clusters = []map[string]interface{}{
- createMockCluster("not-ready-1", 1, 1, false, now.Add(-15*time.Second)),
- }
- case strings.Contains(search, "Ready='True'"):
- clusters = []map[string]interface{}{
- createMockCluster("stale-1", 2, 2, true, now.Add(-31*time.Minute)),
- }
- default:
- t.Errorf("unexpected search query: %q", search)
- w.WriteHeader(http.StatusBadRequest)
- return
- }
-
- response := createMockClusterList(clusters)
- w.Header().Set("Content-Type", "application/json")
- if err := json.NewEncoder(w).Encode(response); err != nil {
- t.Logf("Error encoding response: %v", err)
- }
- }))
- defer server.Close()
-
- hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
- mockPublisher := &MockPublisher{}
- log := logger.NewHyperFleetLogger()
-
- registry := prometheus.NewRegistry()
- metrics.NewSentinelMetrics(registry, "test")
-
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: "test-topic",
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
-
- s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
- if err != nil {
- t.Fatalf("NewSentinel failed: %v", err)
- }
-
- if err := s.trigger(ctx); err != nil {
- t.Fatalf("Expected no error, got %v", err)
- }
-
- // Verify two queries were sent
- mu.Lock()
- params := make([]string, len(capturedSearchParams))
- copy(params, capturedSearchParams)
- mu.Unlock()
-
- if len(params) != 2 {
- t.Fatalf("Expected 2 search queries (not-ready + stale), got %d", len(params))
- }
-
- hasNotReady := false
- hasStale := false
- for _, search := range params {
- if strings.Contains(search, "Ready='False'") {
- hasNotReady = true
- }
- if strings.Contains(search, "Ready='True'") {
- hasStale = true
- }
- }
- if !hasNotReady {
- t.Error("No not-ready query (Ready='False') was sent")
- }
- if !hasStale {
- t.Error("No stale query (Ready='True') was sent")
- }
-
- // Both resources should trigger events
- if len(mockPublisher.publishedEvents) != 2 {
- t.Errorf("Expected 2 published events (1 not-ready + 1 stale), got %d", len(mockPublisher.publishedEvents))
- }
-}
-
-// TestTrigger_StaleQueryFailure tests graceful degradation when the stale query
-// fails but the not-ready query succeeds.
-func TestTrigger_StaleQueryFailure(t *testing.T) {
- ctx := context.Background()
- now := time.Now()
-
- var queryCount atomic.Int32
-
- server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- queryCount.Add(1)
- search := r.URL.Query().Get("search")
-
- switch {
- case strings.Contains(search, "Ready='False'"):
- // Not-ready query succeeds
- clusters := []map[string]interface{}{
- createMockCluster("not-ready-1", 1, 1, false, now.Add(-15*time.Second)),
- }
- response := createMockClusterList(clusters)
- w.Header().Set("Content-Type", "application/json")
- if err := json.NewEncoder(w).Encode(response); err != nil {
- t.Logf("Error encoding response: %v", err)
- }
- case strings.Contains(search, "Ready='True'"):
- // Stale query fails
- w.WriteHeader(http.StatusInternalServerError)
- if _, err := w.Write([]byte(`{"error": "internal server error"}`)); err != nil {
- t.Logf("Error writing error response: %v", err)
- }
- default:
- t.Errorf("unexpected search query: %q", search)
- w.WriteHeader(http.StatusBadRequest)
- return
- }
- }))
- defer server.Close()
-
- hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 1*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
- mockPublisher := &MockPublisher{}
- log := logger.NewHyperFleetLogger()
-
- registry := prometheus.NewRegistry()
- metrics.NewSentinelMetrics(registry, "test")
-
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: "test-topic",
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
-
- s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
- if err != nil {
- t.Fatalf("NewSentinel failed: %v", err)
- }
-
- // Should succeed with graceful degradation (not-ready results only)
- err = s.trigger(ctx)
- if err != nil {
- t.Errorf("Expected no error (graceful degradation), got %v", err)
- }
-
- // Should still publish event for the not-ready resource
- if len(mockPublisher.publishedEvents) != 1 {
- t.Errorf("Expected 1 published event (from not-ready query), got %d", len(mockPublisher.publishedEvents))
- }
-
- // Both queries should have been attempted (stale query retries on 500, so queryCount >= 2)
- if queryCount.Load() < 2 {
- t.Errorf("Expected at least 2 queries (not-ready + stale), got %d", queryCount.Load())
- }
-}
-
func TestTrigger_CreatesRequiredSpans(t *testing.T) {
ctx := context.Background()
- // Setup in-memory trace exporter for span verification
exporter := tracetest.NewInMemoryExporter()
tp := trace.NewTracerProvider(
trace.WithSampler(trace.AlwaysSample()),
@@ -954,66 +568,49 @@ func TestTrigger_CreatesRequiredSpans(t *testing.T) {
otel.SetTracerProvider(previousProvider)
}()
- server := mockServerForConditionQueries(t,
- nil, // no not-ready clusters
- []map[string]interface{}{
- createMockCluster("cluster-1", 2, 2, true, time.Now().Add(-31*time.Minute)),
- },
- )
+ server := mockServerForResources(t, []map[string]interface{}{
+ createMockCluster("cluster-1", 2, 2, true, time.Now().Add(-31*time.Minute)),
+ })
defer server.Close()
hyperfleetClient, err := client.NewHyperFleetClient(server.URL, 10*time.Second)
if err != nil {
t.Fatalf("failed to create HyperFleet client: %v", err)
}
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
mockPublisher := &MockPublisher{}
log := logger.NewHyperFleetLogger()
- // Create metrics with a test registry
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: "hyperfleet-clusters",
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessagingSystem: "rabbitmq",
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
+ cfg.Topic = "hyperfleet-clusters"
+ cfg.MessagingSystem = "rabbitmq"
s, err := NewSentinel(cfg, hyperfleetClient, decisionEngine, mockPublisher, log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Execute trigger
err = s.trigger(ctx)
if err != nil {
t.Fatalf("trigger failed: %v", err)
}
- // Force flush spans to exporter
err = tp.ForceFlush(ctx)
if err != nil {
t.Fatalf("force flush error: %v", err)
}
- // Verify spans were created
spans := exporter.GetSpans()
- // Build map of span names for easy checking
spanNames := make(map[string]bool)
for _, span := range spans {
spanNames[span.Name] = true
}
- // Verify required spans exist
requiredSpans := []string{
"sentinel.poll",
"sentinel.evaluate",
@@ -1026,8 +623,6 @@ func TestTrigger_CreatesRequiredSpans(t *testing.T) {
}
}
- // Verify we got the expected number of spans
- // Should have: 1 poll + 1 evaluate + 1 publish = 3 spans minimum
if len(spans) < 3 {
t.Errorf("Expected at least 3 spans, got %d. Spans: %v", len(spans), getSpanNames(spans))
}
@@ -1036,12 +631,10 @@ func TestTrigger_CreatesRequiredSpans(t *testing.T) {
validateSpanAttribute(t, spans, "hyperfleet-clusters publish", "messaging.operation.type", "publish")
validateSpanAttribute(t, spans, "hyperfleet-clusters publish", "messaging.destination.name", cfg.Topic)
- // Verify the CloudEvent was published (basic functional verification)
if len(mockPublisher.publishedEvents) != 1 {
t.Errorf("Expected 1 published event, got %d", len(mockPublisher.publishedEvents))
}
- // Verify CloudEvent has traceparent extension
if len(mockPublisher.publishedEvents) > 0 {
event := mockPublisher.publishedEvents[0]
extensions := event.Extensions()
@@ -1071,7 +664,6 @@ func validateSpanAttribute(t *testing.T, spans []tracetest.SpanStub, spanName, a
t.Errorf("Span '%s' not found", spanName)
}
-// Helper function for span name extraction
func getSpanNames(spans []tracetest.SpanStub) []string {
names := make([]string, len(spans))
for i, span := range spans {
diff --git a/test/integration/integration_test.go b/test/integration/integration_test.go
index 33d21c8..fbfd219 100644
--- a/test/integration/integration_test.go
+++ b/test/integration/integration_test.go
@@ -57,7 +57,6 @@ func createMockCluster(id string, generation int, observedGeneration int, ready
// createMockClusterWithLabels creates a mock cluster response with labels
func createMockClusterWithLabels(id string, generation int, observedGeneration int, ready bool, lastUpdated time.Time, labels map[string]string) map[string]interface{} {
- // Map ready bool to condition status
readyStatus := "False"
if ready {
readyStatus = "True"
@@ -116,39 +115,47 @@ func createMockClusterList(clusters []map[string]interface{}) map[string]interfa
}
}
+// newTestDecisionEngine creates a CEL-based decision engine with default config.
+func newTestDecisionEngine(t *testing.T) *engine.DecisionEngine {
+ t.Helper()
+ de, err := engine.NewDecisionEngine(config.DefaultMessageDecision())
+ if err != nil {
+ t.Fatalf("NewDecisionEngine failed: %v", err)
+ }
+ return de
+}
+
+// newTestSentinelConfig creates a SentinelConfig pre-populated with test defaults.
+func newTestSentinelConfig() *config.SentinelConfig {
+ return &config.SentinelConfig{
+ ResourceType: "clusters",
+ PollInterval: 100 * time.Millisecond,
+ MessageDecision: config.DefaultMessageDecision(),
+ MessageData: map[string]interface{}{
+ "id": "resource.id",
+ "kind": "resource.kind",
+ },
+ }
+}
+
// TestIntegration_EndToEnd tests the full Sentinel workflow end-to-end with real RabbitMQ
func TestIntegration_EndToEnd(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
- // Get shared RabbitMQ testcontainer from helper
helper := NewHelper()
now := time.Now()
var pollCycleCount atomic.Int32
- // Create mock HyperFleet API server that handles condition-based queries
- // Each poll cycle makes 2 queries: not-ready + stale
+ // Single query mock - returns stale cluster on first poll only
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- search := r.URL.Query().Get("search")
+ cycle := pollCycleCount.Add(1)
var clusters []map[string]interface{}
-
- // Track poll cycles (stale query marks end of a cycle)
- if strings.Contains(search, "Ready='True'") {
- pollCycleCount.Add(1)
- }
-
- switch {
- case strings.Contains(search, "Ready='False'"):
- // Not-ready query: no not-ready clusters
- clusters = nil
- case strings.Contains(search, "Ready='True'"):
- // Stale query: return stale cluster only on first cycle
- if pollCycleCount.Load() == 1 {
- clusters = []map[string]interface{}{
- createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)), // Max age exceeded
- }
+ if cycle == 1 {
+ clusters = []map[string]interface{}{
+ createMockCluster("cluster-1", 2, 2, true, now.Add(-31*time.Minute)),
}
}
@@ -158,47 +165,32 @@ func TestIntegration_EndToEnd(t *testing.T) {
}))
defer server.Close()
- // Setup components with real RabbitMQ broker
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
log := logger.NewHyperFleetLogger()
- // Create metrics with a test registry
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- PollInterval: 100 * time.Millisecond, // Short interval for testing
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
s, err := sentinel.NewSentinel(cfg, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Run Sentinel in background
errChan := make(chan error, 1)
go func() {
errChan <- s.Start(ctx)
}()
- // Wait for at least 2 polling cycles
time.Sleep(300 * time.Millisecond)
cancel()
- // Verify that Sentinel actually polled the API at least twice
if pollCycleCount.Load() < 2 {
t.Errorf("Expected at least 2 polling cycles, got %d", pollCycleCount.Load())
}
- // Wait for Sentinel to stop
select {
case err := <-errChan:
if err != nil && err != context.Canceled {
@@ -208,8 +200,6 @@ func TestIntegration_EndToEnd(t *testing.T) {
t.Fatal("Sentinel did not stop within timeout")
}
- // Integration test validates end-to-end workflow without errors
- // Event verification requires subscriber implementation (future enhancement)
t.Log("Integration test with real RabbitMQ broker completed successfully")
}
@@ -218,19 +208,14 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
- // Get shared RabbitMQ testcontainer from helper
helper := NewHelper()
now := time.Now()
- // Track search parameters for assertion
var capturedSearchParams []string
var captureMu sync.Mutex
- // Create mock server that returns clusters filtered by label selector and conditions.
- // With condition-based queries, Sentinel sends:
- // - Not-ready query: "labels.shard='1' and status.conditions.Ready='False'"
- // - Stale query: "labels.shard='1' and status.conditions.Ready='True' and ..."
+ // Single query mock - validates label selectors in search params
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
searchParam := r.URL.Query().Get("search")
captureMu.Lock()
@@ -238,25 +223,16 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) {
captureMu.Unlock()
var filteredClusters []map[string]interface{}
-
- // Only return clusters matching shard:1 label AND condition filters
if strings.Contains(searchParam, "labels.shard='1'") {
- if strings.Contains(searchParam, "Ready='False'") {
- // Not-ready query: no not-ready clusters with shard:1
- filteredClusters = nil
- } else if strings.Contains(searchParam, "Ready='True'") {
- // Stale query: return stale cluster with shard:1
- filteredClusters = []map[string]interface{}{
- createMockClusterWithLabels(
- "cluster-shard-1",
- 2, 2, true,
- now.Add(-31*time.Minute), // Exceeds max_age_ready (30m)
- map[string]string{"shard": "1"},
- ),
- }
+ filteredClusters = []map[string]interface{}{
+ createMockClusterWithLabels(
+ "cluster-shard-1",
+ 2, 2, true,
+ now.Add(-31*time.Minute),
+ map[string]string{"shard": "1"},
+ ),
}
}
- // shard:2 clusters are never returned since label selector doesn't match
response := createMockClusterList(filteredClusters)
w.Header().Set("Content-Type", "application/json")
@@ -264,27 +240,16 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) {
}))
defer server.Close()
- // Setup components with real RabbitMQ broker
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
log := logger.NewHyperFleetLogger()
- // Create metrics with a test registry
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- PollInterval: 100 * time.Millisecond,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- ResourceSelector: []config.LabelSelector{
- {Label: "shard", Value: "1"},
- },
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
+ cfg := newTestSentinelConfig()
+ cfg.ResourceSelector = []config.LabelSelector{
+ {Label: "shard", Value: "1"},
}
s, err := sentinel.NewSentinel(cfg, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log)
@@ -292,23 +257,19 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Run sentinel in goroutine and capture error
errChan := make(chan error, 1)
go func() {
errChan <- s.Start(ctx)
}()
- // Wait for a few polling cycles
time.Sleep(300 * time.Millisecond)
cancel()
- // Check Start error
startErr := <-errChan
if startErr != nil && startErr != context.Canceled {
t.Errorf("Expected Start to return nil or context.Canceled, got: %v", startErr)
}
- // Validate captured search parameters
captureMu.Lock()
params := make([]string, len(capturedSearchParams))
copy(params, capturedSearchParams)
@@ -318,45 +279,27 @@ func TestIntegration_LabelSelectorFiltering(t *testing.T) {
t.Fatal("No search queries were captured — Sentinel may not have polled")
}
- hasNotReady := false
- hasStale := false
for _, search := range params {
if !strings.Contains(search, "labels.shard='1'") {
t.Errorf("Expected search to contain label selector \"labels.shard='1'\", got %q", search)
}
- if strings.Contains(search, "Ready='False'") {
- hasNotReady = true
- }
- if strings.Contains(search, "Ready='True'") {
- hasStale = true
- }
- }
- if !hasNotReady {
- t.Error("No not-ready query (Ready='False') was captured")
- }
- if !hasStale {
- t.Error("No stale query (Ready='True') was captured")
}
t.Logf("Label selector filtering test completed with %d queries", len(params))
}
// TestIntegration_TSLSyntaxMultipleLabels validates TSL syntax with multiple label selectors
-// and condition-based filtering
func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
- // Get shared RabbitMQ testcontainer from helper
helper := NewHelper()
now := time.Now()
- // Track the search parameters received by the mock server
var receivedSearchParams []string
var searchMu sync.Mutex
- // Create mock server that validates TSL syntax with conditions
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
search := r.URL.Query().Get("search")
searchMu.Lock()
@@ -365,22 +308,15 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) {
var filteredClusters []map[string]interface{}
- // With condition-based queries, labels are combined with condition filters
labelPrefix := "labels.env='production' and labels.region='us-east'"
if strings.HasPrefix(search, labelPrefix) {
- if strings.Contains(search, "Ready='False'") {
- // Not-ready query
- filteredClusters = nil
- } else if strings.Contains(search, "Ready='True'") {
- // Stale query: return matching cluster
- filteredClusters = []map[string]interface{}{
- createMockClusterWithLabels(
- "cluster-region-env-match",
- 2, 2, true,
- now.Add(-31*time.Minute),
- map[string]string{"region": "us-east", "env": "production"},
- ),
- }
+ filteredClusters = []map[string]interface{}{
+ createMockClusterWithLabels(
+ "cluster-region-env-match",
+ 2, 2, true,
+ now.Add(-31*time.Minute),
+ map[string]string{"region": "us-east", "env": "production"},
+ ),
}
}
@@ -390,27 +326,17 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) {
}))
defer server.Close()
- // Setup components
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
log := logger.NewHyperFleetLogger()
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- PollInterval: 100 * time.Millisecond,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- ResourceSelector: []config.LabelSelector{
- {Label: "region", Value: "us-east"},
- {Label: "env", Value: "production"},
- },
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
+ cfg := newTestSentinelConfig()
+ cfg.ResourceSelector = []config.LabelSelector{
+ {Label: "region", Value: "us-east"},
+ {Label: "env", Value: "production"},
}
s, err := sentinel.NewSentinel(cfg, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log)
@@ -418,17 +344,14 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Run sentinel
errChan := make(chan error, 1)
go func() {
errChan <- s.Start(ctx)
}()
- // Wait for polling
time.Sleep(300 * time.Millisecond)
cancel()
- // Validate that queries include both label selectors and condition filters
labelPrefix := "labels.env='production' and labels.region='us-east'"
searchMu.Lock()
paramsCopy := make([]string, len(receivedSearchParams))
@@ -439,27 +362,12 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) {
t.Fatal("No search queries were captured — Sentinel may not have polled")
}
- hasNotReady := false
- hasStale := false
for _, search := range paramsCopy {
if !strings.HasPrefix(search, labelPrefix) {
t.Errorf("Expected search to start with label selectors %q, got %q", labelPrefix, search)
}
- if strings.Contains(search, "Ready='False'") {
- hasNotReady = true
- }
- if strings.Contains(search, "Ready='True'") {
- hasStale = true
- }
- }
- if !hasNotReady {
- t.Error("No not-ready query (Ready='False') was captured")
- }
- if !hasStale {
- t.Error("No stale query (Ready='True') was captured")
}
- // Wait for sentinel to stop
startErr := <-errChan
if startErr != nil && startErr != context.Canceled {
t.Errorf("Expected Start to return nil or context.Canceled, got: %v", startErr)
@@ -468,47 +376,13 @@ func TestIntegration_TSLSyntaxMultipleLabels(t *testing.T) {
t.Logf("TSL syntax validation completed with %d queries", len(paramsCopy))
}
-// TestIntegration_BrokerLoggerContext validates that OpenTelemetry trace context and Sentinel-specific context fields are properly propagated through the logging system
-// during event publishing workflows.
-//
-// Context Propagation Flow:
-// 1. OpenTelemetry creates trace_id and span_id for spans
-// 2. telemetry.StartSpan() enriches context with trace IDs for logging
-// 3. Sentinel adds decision context (decision_reason, topic, subset) to context
-// 4. Context flows through to broker operations via logger.WithXXX() calls
-// 5. All log entries contain both OpenTelemetry and Sentinel context fields
-//
-// Validated Log Fields:
-//
-// OpenTelemetry fields:
-// - trace_id: Distributed trace identifier from active span
-// - span_id: Current span identifier from active span
-//
-// Sentinel-specific fields:
-// - decision_reason: Why the event was triggered (e.g., "max age exceeded", "generation changed")
-// - topic: Message broker topic where event is published
-// - subset: Resource type being monitored (e.g., "clusters")
-// - component: Always "sentinel" for consistent log attribution
-//
-// Test Approach:
-// - Captures structured JSON logs to a buffer for analysis
-// - Uses real RabbitMQ broker to generate authentic broker operation logs
-// - Mock server returns resources that trigger multiple event types
-// - Parses JSON log entries to verify required context fields are present
-//
-// Success Criteria:
-// - Sentinel's "Published event" logs contain all OpenTelemetry and context fields
-// - Broker operation logs inherit Sentinel's component and context
-// - No context fields are lost during the broker publishing workflow
-//
-// This test ensures distributed tracing correlation and structured logging work correctly across the Sentinel → Broker boundary for observability.
+// TestIntegration_BrokerLoggerContext validates context propagation through logging
func TestIntegration_BrokerLoggerContext(t *testing.T) {
- const SENTINEL_COMPONENT = "sentinel"
- const TEST_VERSION = "test"
- const TEST_HOST = "testhost"
- const TEST_TOPIC = "test-topic"
+ const sentinelComponent = "sentinel"
+ const testVersion = "test"
+ const testHost = "testhost"
+ const testTopic = "test-topic"
- // Buffer to observe logs
var logBuffer bytes.Buffer
now := time.Now()
var callCount atomic.Int32
@@ -517,7 +391,6 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
- // Set OTLP sampler
err := os.Setenv("OTEL_TRACES_SAMPLER", "always_on")
if err != nil {
t.Errorf("Failed to set OTEL_TRACES_SAMPLER: %v", err)
@@ -529,58 +402,41 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
}
}()
- // Setup OpenTelemetry for integration test
tp, err := telemetry.InitTraceProvider(ctx, "sentinel", "test")
if err != nil {
t.Fatalf("Failed to initialize OpenTelemetry: %v", err)
}
defer telemetry.Shutdown(context.Background(), tp)
- // Get globalConfig and assign multiWriter to observe output logs
globalConfig := logger.GetGlobalConfig()
multiWriter := io.MultiWriter(globalConfig.Output, &logBuffer)
helper := NewHelper()
- cfg := &logger.LogConfig{
+ logCfg := &logger.LogConfig{
Level: logger.LevelInfo,
- Format: logger.FormatJSON, // JSON for easy parsing
+ Format: logger.FormatJSON,
Output: multiWriter,
- Component: SENTINEL_COMPONENT,
- Version: TEST_VERSION,
- Hostname: TEST_HOST,
+ Component: sentinelComponent,
+ Version: testVersion,
+ Hostname: testHost,
OTel: logger.OTelConfig{
Enabled: true,
},
}
- log := logger.NewHyperFleetLoggerWithConfig(cfg)
+ log := logger.NewHyperFleetLoggerWithConfig(logCfg)
- // Mock server returns clusters that will trigger event publishing
- // With condition-based queries, each poll cycle makes 2 queries
+	// Mock server handles a single combined query per poll cycle (no more
+	// separate Ready='False'/'True' condition routing) and returns resources
+	// that trigger event publishing
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- search := r.URL.Query().Get("search")
-
- var clusters []map[string]interface{}
- switch {
- case strings.Contains(search, "Ready='False'"):
- if callCount.Add(1) >= 2 {
- select {
- case readyChan <- true:
- default:
- }
- }
- // Not-ready query: return not-ready cluster
- clusters = []map[string]interface{}{
- createMockCluster("cluster-not-ready", 1, 1, false, now.Add(-15*time.Second)),
- }
- case strings.Contains(search, "Ready='True'"):
- // Stale query: return stale ready cluster
- clusters = []map[string]interface{}{
- createMockCluster("cluster-old", 2, 2, true, now.Add(-35*time.Minute)),
+ if callCount.Add(1) >= 2 {
+ select {
+ case readyChan <- true:
+ default:
}
- default:
- t.Errorf("unexpected search query: %q", search)
- w.WriteHeader(http.StatusBadRequest)
- return
+ }
+
+ clusters := []map[string]interface{}{
+ createMockCluster("cluster-not-ready", 2, 2, false, now.Add(-15*time.Second)),
+ createMockCluster("cluster-old", 2, 2, true, now.Add(-35*time.Minute)),
}
response := createMockClusterList(clusters)
w.Header().Set("Content-Type", "application/json")
@@ -589,31 +445,20 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
defer server.Close()
hyperfleetClient, _ := client.NewHyperFleetClient(server.URL, 10*time.Second)
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
-
- sentinelConfig := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: TEST_TOPIC,
- PollInterval: 100 * time.Millisecond,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- ResourceSelector: []config.LabelSelector{
- {Label: "region", Value: "us-east"},
- {Label: "env", Value: "production"},
- },
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
+ decisionEngine := newTestDecisionEngine(t)
+
+ sentinelConfig := newTestSentinelConfig()
+ sentinelConfig.Topic = testTopic
+ sentinelConfig.ResourceSelector = []config.LabelSelector{
+ {Label: "region", Value: "us-east"},
+ {Label: "env", Value: "production"},
}
- // Create Sentinel with our logger and real RabbitMQ broker
s, err := sentinel.NewSentinel(sentinelConfig, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Run Sentinel
errChan := make(chan error, 1)
go func() {
errChan <- s.Start(ctx)
@@ -627,7 +472,6 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
}
cancel()
- // Wait for Sentinel to stop
select {
case err := <-errChan:
if err != nil && !errors.Is(err, context.Canceled) {
@@ -637,7 +481,6 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
t.Fatal("Sentinel did not stop within timeout")
}
- // Analyze logs
logOutput := logBuffer.String()
t.Logf("Captured log output:\n%s", logOutput)
@@ -660,12 +503,10 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
msg, hasMsg := entry["message"].(string)
component, hasComponent := entry["component"].(string)
- // Look for Sentinel's own event publishing logs
- if hasMsg && hasComponent && component == SENTINEL_COMPONENT &&
+ if hasMsg && hasComponent && component == sentinelComponent &&
strings.Contains(msg, "Published event") {
foundSentinelEventLog = true
- // Verify Sentinel context fields are present
if entry["decision_reason"] == nil {
t.Errorf("Sentinel event log missing decision_reason: %v", entry)
}
@@ -675,7 +516,6 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
if entry["subset"] == nil {
t.Errorf("Sentinel event log missing subset: %v", entry)
}
-
if entry["trace_id"] == nil {
t.Errorf("Sentinel event log missing trace_id: %v", entry)
}
@@ -687,29 +527,19 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
entry["decision_reason"], entry["topic"], entry["subset"])
}
- // Look for broker operation logs (these should inherit Sentinel context)
- if hasMsg && hasComponent && component == SENTINEL_COMPONENT &&
+ if hasMsg && hasComponent && component == sentinelComponent &&
(strings.Contains(msg, "broker") || strings.Contains(msg, "publish") ||
strings.Contains(msg, "Creating publisher") || strings.Contains(msg, "publisher initialized")) {
foundBrokerOperationLog = true
- // Broker operations should have Sentinel context
- if entry["component"] != SENTINEL_COMPONENT {
- t.Errorf("Broker operation log missing component=%s: %v", SENTINEL_COMPONENT, entry)
- }
-
- if entry["version"] != TEST_VERSION {
- t.Errorf("Broker operation log missing version=%s: %v", TEST_VERSION, entry)
+ if entry["component"] != sentinelComponent {
+ t.Errorf("Broker operation log missing component=%s: %v", sentinelComponent, entry)
}
-
- if entry["hostname"] != TEST_HOST {
- t.Errorf("Broker operation log missing hostname=%s: %v", TEST_HOST, entry)
+ if entry["version"] != testVersion {
+ t.Errorf("Broker operation log missing version=%s: %v", testVersion, entry)
}
-
- // Check for context inheritance (these fields should be present if context flowed through)
- if entry["decision_reason"] != nil || entry["topic"] != nil || entry["subset"] != nil {
- t.Logf("Broker operation inherits Sentinel context: decision_reason=%v topic=%v subset=%v",
- entry["decision_reason"], entry["topic"], entry["subset"])
+ if entry["hostname"] != testHost {
+ t.Errorf("Broker operation log missing hostname=%s: %v", testHost, entry)
}
t.Logf("Found broker operation log with component=sentinel: %s", msg)
@@ -724,41 +554,16 @@ func TestIntegration_BrokerLoggerContext(t *testing.T) {
t.Error("No broker operation logs found - broker may not be logging")
}
- // Success criteria: Both Sentinel and broker logs should use component=sentinel
if foundSentinelEventLog && foundBrokerOperationLog {
t.Log("SUCCESS: Logger context inheritance verified - both Sentinel and broker operations log with component=sentinel")
}
}
-// TestIntegration_EndToEndSpanHierarchy validates the complete OpenTelemetry span hierarchy created during Sentinel's polling and event publishing workflow.
-//
-// Expected Span Hierarchy:
-//
-// sentinel.poll (root span - one per polling cycle)
-// ├── HTTP GET (API call to fetch resources)
-// ├── sentinel.evaluate (one per resource evaluated)
-// │ └── {topic} publish (one per event published)
-// ├── sentinel.evaluate (next resource)
-// │ └── {topic} publish
-// └── ...
-//
-// The test validates:
-// 1. Required spans are created: sentinel.poll, sentinel.evaluate, {topic} publish
-// 2. Parent-child relationships: evaluate HTTP spans are children of poll spans, publish spans are children of evaluate spans
-// 3. Multiple spans: One evaluate/publish span per resource that triggers an event
-// 4. Trace continuity: All spans within a poll cycle belong to the same trace
-//
-// Test Setup:
-// - Uses in-memory OpenTelemetry exporter to capture and analyze spans
-// - Mock server returns 3 test resources that will trigger events
-// - Real RabbitMQ broker for realistic message publishing
-//
-// Note: The test may capture multiple poll cycles. Due to context cancellation timing, only 2 poll cycles are validated
+// TestIntegration_EndToEndSpanHierarchy validates the complete OpenTelemetry span hierarchy
func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
- // Setup in-memory trace exporter to capture spans
exporter := tracetest.NewInMemoryExporter()
tp := trace.NewTracerProvider(
trace.WithSampler(trace.AlwaysSample()),
@@ -779,36 +584,20 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
var callCount atomic.Int32
readyChan := make(chan bool, 1)
- // Mock server that routes responses based on condition-based search parameters.
- // Each poll cycle makes 2 queries: not-ready (Ready='False') + stale (Ready='True').
+	// Mock server handles a single combined query per poll cycle; the returned
+	// resources exercise different decision paths in the CEL-based engine
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- callCount.Add(1)
- // 2 queries per poll cycle; signal after 2 complete cycles (4+ queries)
- if callCount.Load() > 4 {
+ if callCount.Add(1) > 2 {
select {
case readyChan <- true:
default:
}
}
- search := r.URL.Query().Get("search")
- var clusters []map[string]interface{}
- switch {
- case strings.Contains(search, "Ready='False'"):
- clusters = []map[string]interface{}{
- // Triggers max_age_not_ready exceeded
- createMockCluster("cluster-not-ready-old", 1, 1, false, now.Add(-15*time.Second)),
- }
- case strings.Contains(search, "Ready='True'"):
- clusters = []map[string]interface{}{
- // Triggers max_age_ready exceeded
- createMockCluster("cluster-ready-old", 2, 2, true, now.Add(-35*time.Minute)),
- // Triggers generation mismatch
- createMockCluster("cluster-generation-mismatch", 5, 3, true, now.Add(-5*time.Minute)),
- }
- default:
- w.WriteHeader(http.StatusBadRequest)
- return
+ clusters := []map[string]interface{}{
+ // Triggers not_ready_and_debounced
+ createMockCluster("cluster-not-ready-old", 2, 2, false, now.Add(-15*time.Second)),
+ // Triggers ready_and_stale
+ createMockCluster("cluster-ready-old", 2, 2, true, now.Add(-35*time.Minute)),
}
response := createMockClusterList(clusters)
w.Header().Set("Content-Type", "application/json")
@@ -816,45 +605,32 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
}))
defer server.Close()
- // Get shared RabbitMQ from helper
helper := NewHelper()
- // Setup components
hyperfleetClient, clientErr := client.NewHyperFleetClient(server.URL, 10*time.Second)
if clientErr != nil {
t.Fatalf("failed to create HyperFleet client: %v", clientErr)
}
- decisionEngine := engine.NewDecisionEngine(10*time.Second, 30*time.Minute)
+ decisionEngine := newTestDecisionEngine(t)
log := logger.NewHyperFleetLogger()
registry := prometheus.NewRegistry()
metrics.NewSentinelMetrics(registry, "test")
- cfg := &config.SentinelConfig{
- ResourceType: "clusters",
- Topic: "test-spans-topic",
- PollInterval: 100 * time.Millisecond,
- MaxAgeNotReady: 10 * time.Second,
- MaxAgeReady: 30 * time.Minute,
- MessagingSystem: "rabbitmq",
- MessageData: map[string]interface{}{
- "id": "resource.id",
- "kind": "resource.kind",
- },
- }
+ cfg := newTestSentinelConfig()
+ cfg.Topic = "test-spans-topic"
+ cfg.MessagingSystem = "rabbitmq"
s, err := sentinel.NewSentinel(cfg, hyperfleetClient, decisionEngine, helper.RabbitMQ.Publisher(), log)
if err != nil {
t.Fatalf("NewSentinel failed: %v", err)
}
- // Run Sentinel to generate spans
errChan := make(chan error, 1)
go func() {
errChan <- s.Start(ctx)
}()
- // Wait for at least 2 polling cycles
select {
case <-readyChan:
t.Log("Sentinel completed required polling cycles")
@@ -863,7 +639,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
}
cancel()
- // Wait for Sentinel to stop
select {
case err := <-errChan:
if err != nil && !errors.Is(err, context.Canceled) {
@@ -873,7 +648,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
t.Fatal("Sentinel did not stop within timeout")
}
- // Force flush spans to exporter
flushCtx, flushCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer flushCancel()
@@ -882,11 +656,9 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
t.Fatalf("force flush error: %v", err)
}
- // Analyze captured spans
spans := exporter.GetSpans()
t.Logf("Captured %d spans", len(spans))
- // Build span maps for analysis
spansByName := make(map[string][]tracetest.SpanStub)
spansByID := make(map[string]tracetest.SpanStub)
spansByParentID := make(map[string][]tracetest.SpanStub)
@@ -901,7 +673,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
}
}
- // Print span hierarchy for debugging
t.Log("Span hierarchy:")
for _, span := range spans {
parentInfo := "ROOT"
@@ -911,7 +682,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
t.Logf(" %s (%s) - %s", span.Name, span.SpanContext.SpanID().String()[:8], parentInfo)
}
- // Validate required spans exist
requiredSpans := []string{
"sentinel.poll",
"sentinel.evaluate",
@@ -924,7 +694,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
}
}
- // Validate span hierarchy structure
pollSpans := spansByName["sentinel.poll"]
if len(pollSpans) == 0 {
t.Fatal("No sentinel.poll spans found")
@@ -935,25 +704,19 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
pollSpansToValidate = pollSpansToValidate[:2]
}
- // For each poll span, validate it has the expected children, evaluate only the first two poll spans
for _, pollSpan := range pollSpansToValidate {
pollSpanID := pollSpan.SpanContext.SpanID().String()
directChildren := spansByParentID[pollSpanID]
t.Logf("Poll span %s has %d direct children", pollSpanID[:8], len(directChildren))
- // Verify poll span has evaluate children (direct)
hasEvaluateChild := false
- hasHTTPChild := false
evaluateChildCount := 0
for _, child := range directChildren {
- switch {
- case child.Name == "sentinel.evaluate":
+ if child.Name == "sentinel.evaluate" {
hasEvaluateChild = true
evaluateChildCount++
- case strings.Contains(child.Name, "HTTP"):
- hasHTTPChild = true
}
}
@@ -962,7 +725,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
continue
}
- // Verify each evaluate span has publish grandchildren
publishGrandchildCount := 0
for _, child := range directChildren {
if child.Name == "sentinel.evaluate" {
@@ -976,26 +738,23 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
}
}
- t.Logf("Poll span %s: evaluate children=%d, publish grandchildren=%d, http=%t",
- pollSpanID[:8], evaluateChildCount, publishGrandchildCount, hasHTTPChild)
+ t.Logf("Poll span %s: evaluate children=%d, publish grandchildren=%d",
+ pollSpanID[:8], evaluateChildCount, publishGrandchildCount)
- // Only validate publish grandchildren if this poll span actually processed events
if evaluateChildCount > 0 && publishGrandchildCount == 0 {
t.Errorf("Poll span %s has %d evaluate children but no publish grandchildren",
pollSpanID[:8], evaluateChildCount)
}
}
- // Validate we have multiple evaluation spans (one per resource)
evaluateSpans := spansByName["sentinel.evaluate"]
- if len(evaluateSpans) < 3 {
- t.Errorf("Expected at least 3 sentinel.evaluate spans (one per test resource), got %d", len(evaluateSpans))
+ if len(evaluateSpans) < 2 {
+ t.Errorf("Expected at least 2 sentinel.evaluate spans (one per test resource), got %d", len(evaluateSpans))
}
- // Validate we have multiple publish spans (one per event)
publishSpans := spansByName["test-spans-topic publish"]
- if len(publishSpans) < 3 {
- t.Errorf("Expected at least 3 publish spans (one per test event), got %d", len(publishSpans))
+ if len(publishSpans) < 2 {
+ t.Errorf("Expected at least 2 publish spans (one per test event), got %d", len(publishSpans))
}
validateSpanAttribute(t, publishSpans, "test-spans-topic publish", "messaging.system", cfg.MessagingSystem)
@@ -1019,7 +778,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
}
}
- // Verify trace continuity - spans should form coherent traces
traceIDs := make(map[string]bool)
for _, span := range spans {
traceIDs[span.SpanContext.TraceID().String()] = true
@@ -1030,7 +788,6 @@ func TestIntegration_EndToEndSpanHierarchy(t *testing.T) {
}
}
-// Helper function for span name extraction
func getSpanNames(spans []tracetest.SpanStub) []string {
names := make([]string, len(spans))
for i, span := range spans {