openshift-hyperfleet · openshift-merge-bot · Mar 24, 2026 · Mar 23, 2026 · Mar 24, 2026
diff --git a/Makefile b/Makefile
@@ -249,8 +249,7 @@ test-helm: ## Test Helm charts (lint, template, validate)
 		--set image.repository=openshift-hyperfleet/hyperfleet-sentinel \
 		--set image.tag=latest \
 		--set config.resourceType=nodepools \
-		--set config.pollInterval=10s \
-		--set config.maxAgeReady=1h > /dev/null
+		--set config.pollInterval=10s > /dev/null
 	@echo "Custom resource selector template OK"
 	@echo ""
 	@echo "All Helm chart tests passed!"

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # hyperfleet-sentinel
 
-HyperFleet Sentinel Service - Kubernetes service that polls HyperFleet API, makes orchestration decisions, and publishes events. Features configurable max age intervals, horizontal sharding via SentinelConfig CRD, and broker abstraction (GCP Pub/Sub, RabbitMQ, Stub). Centralized reconciliation logic.
+HyperFleet Sentinel Service - a Kubernetes service that polls the HyperFleet API, makes orchestration decisions, and publishes events. It features configurable CEL-based decision logic, horizontal sharding via the SentinelConfig CRD, a broker abstraction (GCP Pub/Sub, RabbitMQ, Stub), and centralized reconciliation logic.
 
 ## Development Setup
 
@@ -125,8 +125,7 @@ Create a configuration file based on the examples in the `configs/` directory:
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
 | `poll_interval` | duration | `5s` | How often to poll the API for resource updates |
-| `max_age_not_ready` | duration | `10s` | Max age interval for resources not ready |
-| `max_age_ready` | duration | `30m` | Max age interval for ready resources |
+| `message_decision` | object | See below | CEL-based decision logic (params + result) |
 | `hyperfleet_api.timeout` | duration | `5s` | Request timeout for API calls |
 | `resource_selector` | array | `[]` | Label selectors for filtering resources (enables sharding) |
 | `message_data` | map | `{}` | Template fields for CloudEvents data payload |
@@ -231,8 +230,17 @@ This uses all defaults. Broker configuration is managed via `broker.yaml` or env
 ```yaml
 resource_type: clusters
 poll_interval: 5s
-max_age_not_ready: 10s
-max_age_ready: 30m
+
+message_decision:
+  params:
+    ref_time: 'condition("Ready").last_updated_time'
+    is_ready: 'condition("Ready").status == "True"'
+    has_ref_time: 'ref_time != ""'
+    is_new_resource: '!is_ready && resource.generation == 1'
+    generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+    ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+    not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+  result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
 
 resource_selector:
   - label: shard

diff --git a/charts/README.md b/charts/README.md
@@ -80,8 +80,7 @@ The following table lists the configurable parameters of the Sentinel chart and
 |-----------|-------------|---------|
 | `config.resourceType` | Resource type to watch | `clusters` |
 | `config.pollInterval` | Polling interval | `5s` |
-| `config.maxAgeNotReady` | Max age for not ready resources | `10s` |
-| `config.maxAgeReady` | Max age for ready resources | `30m` |
+| `config.messageDecision` | CEL-based decision logic (params + result) | See values.yaml |
 | `config.resourceSelector` | Resource selector for sharding | See values.yaml |
 | `config.hyperfleetApi.baseUrl` | HyperFleet API base URL | `http://hyperfleet-api:8000` |
 | `config.hyperfleetApi.timeout` | API timeout | `5s` |

diff --git a/charts/templates/configmap.yaml b/charts/templates/configmap.yaml
@@ -9,8 +9,6 @@ data:
     # Sentinel configuration
     resource_type: {{ .Values.config.resourceType }}
     poll_interval: {{ .Values.config.pollInterval }}
-    max_age_not_ready: {{ .Values.config.maxAgeNotReady }}
-    max_age_ready: {{ .Values.config.maxAgeReady }}
 
     {{- if .Values.config.resourceSelector }}
     # Resource selector for horizontal sharding
@@ -26,6 +24,18 @@ data:
       endpoint: {{ .Values.config.hyperfleetApi.baseUrl }}
       timeout: {{ .Values.config.hyperfleetApi.timeout }}
 
+    {{- if .Values.config.messageDecision }}
+    # Configurable CEL-based decision logic
+    message_decision:
+      {{- if .Values.config.messageDecision.params }}
+      params:
+        {{- range $name, $expr := .Values.config.messageDecision.params }}
+        {{ $name }}: {{ $expr | quote }}
+        {{- end }}
+      {{- end }}
+      result: {{ .Values.config.messageDecision.result | quote }}
+    {{- end }}
+
     {{- if .Values.config.messageData }}
     # CloudEvents data payload configuration
     message_data:

diff --git a/charts/values.yaml b/charts/values.yaml
@@ -84,11 +84,19 @@ config:
   # How often to poll the API for resource updates
   pollInterval: 5s
 
-  # Max age interval for resources not ready
-  maxAgeNotReady: 10s
-
-  # Max age interval for ready resources
-  maxAgeReady: 30m
+  # Configurable CEL-based decision logic
+  # Params are named CEL expressions evaluated in dependency order.
+  # Result is a CEL boolean expression that determines whether to publish an event.
+  messageDecision:
+    params:
+      ref_time: 'condition("Ready").last_updated_time'
+      is_ready: 'condition("Ready").status == "True"'
+      has_ref_time: 'ref_time != ""'
+      is_new_resource: '!is_ready && resource.generation == 1'
+      generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+      ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+      not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+    result: 'is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced'
 
   # Resource selector for horizontal sharding
   # Deploy multiple Sentinel instances with different shard values

diff --git a/cmd/sentinel/main.go b/cmd/sentinel/main.go
@@ -217,7 +217,11 @@ func runServe(
 	}
 	log.Info(ctx, "Initialized HyperFleet client")
 
-	decisionEngine := engine.NewDecisionEngine(cfg.MaxAgeNotReady, cfg.MaxAgeReady)
+	decisionEngine, err := engine.NewDecisionEngine(cfg.MessageDecision)
+	if err != nil {
+		log.Errorf(ctx, "Failed to create decision engine: %v", err)
+		return fmt.Errorf("failed to create decision engine: %w", err)
+	}
 
 	// Initialize broker metrics recorder
 	// Broker metrics (messages_published_total, errors_total, etc.) are registered

diff --git a/configs/dev-example.yaml b/configs/dev-example.yaml
@@ -22,9 +22,18 @@ resource_type: clusters
 # Faster polling for development - see changes quickly.
 poll_interval: 2s
 
-# Shorter max age intervals for development.
-max_age_not_ready: 5s
-max_age_ready: 2m
+# Decision logic for publishing events (CEL expressions).
+# Shorter debounce intervals for development.
+message_decision:
+  params:
+    ref_time: 'condition("Ready").last_updated_time'
+    is_ready: 'condition("Ready").status == "True"'
+    has_ref_time: 'ref_time != ""'
+    is_new_resource: '!is_ready && resource.generation == 1'
+    generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+    ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("2m")'
+    not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("5s")'
+  result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
 
 # Messaging system type for OpenTelemetry tracing attributes.
 # Should match the actual message broker being used for accurate observability.
@@ -51,11 +60,11 @@ hyperfleet_api:
 #
 # Available CEL variables:
 #   resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...)
-#   reason   — the decision reason string (e.g. "max_age_exceeded")
+#   reason   — the decision reason string (e.g. "message decision matched")
 #
 # Examples:
 #   field: "resource.id"                                    — field access
-#   field: 'resource.status.ready ? "Ready" : "NotReady"'  — conditional
+#   field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"'  — conditional
 #   field: '"constant"'                                     — CEL string literal
 #   field: "reason"                                         — decision reason
 #

diff --git a/configs/gcp-pubsub-example.yaml b/configs/gcp-pubsub-example.yaml
@@ -18,13 +18,18 @@ resource_type: clusters
 # Accepts Go duration format: ns, us/µs, ms, s, m, h.
 poll_interval: 5s
 
-# Max age interval for resources that are not ready.
-# Resources in transitional states are re-checked more frequently.
-max_age_not_ready: 10s
-
-# Max age interval for resources that are ready and stable.
-# Stable resources are checked less frequently to reduce API load.
-max_age_ready: 30m
+# Decision logic for publishing events (CEL expressions).
+# Uses condition-based checks with configurable thresholds.
+message_decision:
+  params:
+    ref_time: 'condition("Ready").last_updated_time'
+    is_ready: 'condition("Ready").status == "True"'
+    has_ref_time: 'ref_time != ""'
+    is_new_resource: '!is_ready && resource.generation == 1'
+    generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+    ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+    not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+  result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
 
 # Messaging system type for OpenTelemetry tracing attributes.
 # Should match the actual message broker being used for accurate observability.
@@ -57,11 +62,11 @@ hyperfleet_api:
 #
 # Available CEL variables:
 #   resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...)
-#   reason   — the decision reason string (e.g. "max_age_exceeded")
+#   reason   — the decision reason string (e.g. "message decision matched")
 #
 # Examples:
 #   field: "resource.id"                                    — field access
-#   field: 'resource.status.ready ? "Ready" : "NotReady"'  — conditional
+#   field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"'  — conditional
 #   field: '"constant"'                                     — CEL string literal
 #   field: "reason"                                         — decision reason
 #

diff --git a/configs/rabbitmq-example.yaml b/configs/rabbitmq-example.yaml
@@ -18,13 +18,18 @@ resource_type: clusters
 # Accepts Go duration format: ns, us/µs, ms, s, m, h.
 poll_interval: 5s
 
-# Max age interval for resources that are not ready.
-# Resources in transitional states are re-checked more frequently.
-max_age_not_ready: 10s
-
-# Max age interval for resources that are ready and stable.
-# Stable resources are checked less frequently to reduce API load.
-max_age_ready: 30m
+# Decision logic for publishing events (CEL expressions).
+# Uses condition-based checks with configurable thresholds.
+message_decision:
+  params:
+    ref_time: 'condition("Ready").last_updated_time'
+    is_ready: 'condition("Ready").status == "True"'
+    has_ref_time: 'ref_time != ""'
+    is_new_resource: '!is_ready && resource.generation == 1'
+    generation_mismatch: 'resource.generation > condition("Ready").observed_generation'
+    ready_and_stale: 'is_ready && has_ref_time && now - timestamp(ref_time) > duration("30m")'
+    not_ready_and_debounced: '!is_ready && has_ref_time && now - timestamp(ref_time) > duration("10s")'
+  result: "is_new_resource || generation_mismatch || ready_and_stale || not_ready_and_debounced"
 
 # Messaging system type for OpenTelemetry tracing attributes.
 # Should match the actual message broker being used for accurate observability.
@@ -57,11 +62,11 @@ hyperfleet_api:
 #
 # Available CEL variables:
 #   resource — the HyperFleet resource (id, kind, href, generation, status, labels, ...)
-#   reason   — the decision reason string (e.g. "max_age_exceeded")
+#   reason   — the decision reason string (e.g. "message decision matched")
 #
 # Examples:
 #   field: "resource.id"                                    — field access
-#   field: 'resource.status.ready ? "Ready" : "NotReady"'  — conditional
+#   field: 'condition("Ready").status == "True" ? "Ready" : "NotReady"'  — conditional
 #   field: '"constant"'                                     — CEL string literal
 #   field: "reason"                                         — decision reason
 #

diff --git a/docs/alerts.md b/docs/alerts.md
@@ -156,11 +156,11 @@ labels:
   component: sentinel
 annotations:
   summary: "High resource skip ratio in Sentinel"
-  description: "{{ $value | humanizePercentage }} of resources are being skipped. This may indicate max_age configuration issues."
+  description: "{{ $value | humanizePercentage }} of resources are being skipped. This may indicate message_decision configuration issues."
 ```
-**Impact**: May indicate max age intervals too long or adapter status update issues.
+**Impact**: May indicate that the decision thresholds are too restrictive, or that there are adapter status update issues.
 
-**Response**: Review max age configuration and adapter health.
+**Response**: Review the `message_decision` configuration and adapter health.
 
 ---
 

diff --git a/docs/metrics.md b/docs/metrics.md
@@ -60,7 +60,7 @@ sum by (resource_type) (hyperfleet_sentinel_pending_resources)
 **Labels:**
 - `resource_type`: Type of resource
 - `resource_selector`: Label selector
-- `reason`: Reason for publishing the event (e.g., `max_age_exceeded`, `generation_mismatch`)
+- `reason`: Reason for publishing the event (e.g., `message decision matched`)
 
 **Use Cases:**
 - Monitor event publishing rate
@@ -87,7 +87,7 @@ sum by (reason) (rate(hyperfleet_sentinel_events_published_total[5m]))
 **Labels:**
 - `resource_type`: Type of resource
 - `resource_selector`: Label selector
-- `reason`: Reason for skipping (e.g., `within_max_age`, `generation_match`)
+- `reason`: Reason for skipping (e.g., `message decision result is false`)
 
 **Use Cases:**
 - Monitor decision engine effectiveness

diff --git a/docs/multi-instance-deployment.md b/docs/multi-instance-deployment.md
@@ -206,7 +206,7 @@ Memory: 64Mi + (5 × 32Mi) + 16Mi = 240Mi
 │  resource_selector: │  │  resource_selector: │  │  resource_selector: │
 │  - label: region    │  │  - label: region    │  │  - label: region    │
 │    value: us-east   │  │    value: us-west   │  │    value: eu-west   │
-│  max_age_ready=30m  │  │  max_age_ready=1h   │  │  max_age_ready=45m  │
+│  ready_stale=30m    │  │  ready_stale=1h     │  │  ready_stale=45m    │
 └──────────┬──────────┘  └──────────┬──────────┘  └──────────┬──────────┘
            │                        │                        │
            │                        ▼                        │

diff --git a/docs/running-sentinel.md b/docs/running-sentinel.md
@@ -533,8 +533,6 @@ gcloud projects remove-iam-policy-binding ${GCP_PROJECT} \
 # sentinel-config.yaml
 resource_type: clusters
 poll_interval: 5s
-max_age_not_ready: 10s
-max_age_ready: 30m
 
 # Watch all clusters (no filtering)
 resource_selector: []
@@ -556,8 +554,6 @@ message_data:
 # sentinel-dev-config.yaml
 resource_type: clusters
 poll_interval: 10s      # Slower polling for dev
-max_age_not_ready: 30s  # Longer intervals for dev
-max_age_ready: 2h
 
 resource_selector:
   - label: environment