From 1c957c7214d3bb0efd6609af4e8d546d2bcc6f5c Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 25 Mar 2026 12:03:59 -0400 Subject: [PATCH 1/5] feat: genesis ceremony controller orchestration Add group-level genesis ceremony support to SeiNodeGroup and per-node genesis configuration to SeiNode/ValidatorSpec. The node controller builds the full Init plan upfront (no deferred steps) and the group controller coordinates assembly, peer collection, and static peer injection. New CRD types: GenesisCeremonyConfig, GenesisCeremonyNodeConfig, GenesisS3Destination, GenesisS3Source. New group status fields: AssemblyPlan, GenesisHash, GenesisS3URI. Depends on seictl genesis ceremony client types (sei-protocol/seictl#40). --- api/v1alpha1/seinodegroup_types.go | 94 ++++- api/v1alpha1/validator_types.go | 50 +++ internal/controller/node/controller.go | 7 +- internal/controller/node/job.go | 92 ++++- internal/controller/node/planner.go | 14 + internal/controller/node/planner_bootstrap.go | 54 ++- internal/controller/node/planner_validator.go | 12 + internal/controller/node/pre_init.go | 25 ++ internal/controller/node/task_builders.go | 33 ++ internal/controller/nodegroup/controller.go | 18 + internal/controller/nodegroup/genesis.go | 325 ++++++++++++++++++ internal/controller/nodegroup/nodes.go | 25 ++ 12 files changed, 725 insertions(+), 24 deletions(-) create mode 100644 internal/controller/nodegroup/genesis.go diff --git a/api/v1alpha1/seinodegroup_types.go b/api/v1alpha1/seinodegroup_types.go index e9e6e1c..59cd4cd 100644 --- a/api/v1alpha1/seinodegroup_types.go +++ b/api/v1alpha1/seinodegroup_types.go @@ -26,6 +26,12 @@ type SeiNodeGroupSpec struct { // +kubebuilder:default=Delete DeletionPolicy DeletionPolicy `json:"deletionPolicy,omitempty"` + // Genesis configures genesis ceremony orchestration for this group. + // When set, the controller generates GenesisCeremonyNodeConfig for each + // child SeiNode and coordinates assembly of the final genesis.json. 
+ // +optional + Genesis *GenesisCeremonyConfig `json:"genesis,omitempty"` + // Networking controls how the group is exposed to traffic. // Networking resources are shared across all replicas. // +optional @@ -37,6 +43,56 @@ type SeiNodeGroupSpec struct { Monitoring *MonitoringConfig `json:"monitoring,omitempty"` } +// GenesisCeremonyConfig configures genesis ceremony orchestration for a node group. +type GenesisCeremonyConfig struct { + // ChainID for the new network. + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=64 + // +kubebuilder:validation:Pattern=`^[a-z0-9][a-z0-9-]*[a-z0-9]$` + ChainID string `json:"chainId"` + + // StakingAmount is the amount each validator self-delegates in its gentx. + // +optional + // +kubebuilder:default="10000000usei" + StakingAmount string `json:"stakingAmount,omitempty"` + + // AccountBalance is the initial balance for each validator's genesis account. + // +optional + // +kubebuilder:default="1000000000000000000000usei,1000000000000000000000uusdc,1000000000000000000000uatom" + AccountBalance string `json:"accountBalance,omitempty"` + + // Accounts adds non-validator genesis accounts (e.g. for load test funding). + // +optional + Accounts []GenesisAccount `json:"accounts,omitempty"` + + // Overrides is a flat map of dotted key paths merged on top of sei-config's + // GenesisDefaults(). Applied BEFORE gentx generation. + // +optional + Overrides map[string]string `json:"overrides,omitempty"` + + // GenesisS3 configures where genesis artifacts are stored. + // When omitted, inferred: bucket = "sei-genesis-artifacts", + // prefix = "//", region from PlatformConfig. + // +optional + GenesisS3 *GenesisS3Destination `json:"genesisS3,omitempty"` + + // MaxCeremonyDuration is the maximum time from group creation to genesis + // assembly completion. Default: "15m". 
+ // +optional + MaxCeremonyDuration *metav1.Duration `json:"maxCeremonyDuration,omitempty"` +} + +// GenesisAccount represents a non-validator genesis account to fund. +type GenesisAccount struct { + // Address is the bech32-encoded account address. + // +kubebuilder:validation:MinLength=1 + Address string `json:"address"` + + // Balance is the initial balance in coin notation (e.g. "1000000usei"). + // +kubebuilder:validation:MinLength=1 + Balance string `json:"balance"` +} + // SeiNodeTemplate wraps a SeiNodeSpec for use in the group template. type SeiNodeTemplate struct { // Metadata allows setting labels and annotations on child SeiNodes. @@ -65,16 +121,17 @@ type SeiNodeTemplateMeta struct { // --------------------------------------------------------------------------- // SeiNodeGroupPhase represents the high-level lifecycle state. -// +kubebuilder:validation:Enum=Pending;Initializing;Ready;Degraded;Failed;Terminating +// +kubebuilder:validation:Enum=Pending;GenesisCeremony;Initializing;Ready;Degraded;Failed;Terminating type SeiNodeGroupPhase string const ( - GroupPhasePending SeiNodeGroupPhase = "Pending" - GroupPhaseInitializing SeiNodeGroupPhase = "Initializing" - GroupPhaseReady SeiNodeGroupPhase = "Ready" - GroupPhaseDegraded SeiNodeGroupPhase = "Degraded" - GroupPhaseFailed SeiNodeGroupPhase = "Failed" - GroupPhaseTerminating SeiNodeGroupPhase = "Terminating" + GroupPhasePending SeiNodeGroupPhase = "Pending" + GroupPhaseGenesisCeremony SeiNodeGroupPhase = "GenesisCeremony" + GroupPhaseInitializing SeiNodeGroupPhase = "Initializing" + GroupPhaseReady SeiNodeGroupPhase = "Ready" + GroupPhaseDegraded SeiNodeGroupPhase = "Degraded" + GroupPhaseFailed SeiNodeGroupPhase = "Failed" + GroupPhaseTerminating SeiNodeGroupPhase = "Terminating" ) // SeiNodeGroupStatus defines the observed state of a SeiNodeGroup. 
@@ -98,6 +155,18 @@ type SeiNodeGroupStatus struct { // +optional Nodes []GroupNodeStatus `json:"nodes,omitempty"` + // AssemblyPlan tracks the genesis assembly task on index 0's sidecar. + // +optional + AssemblyPlan *TaskPlan `json:"assemblyPlan,omitempty"` + + // GenesisHash is the SHA-256 hex digest of the assembled genesis.json. + // +optional + GenesisHash string `json:"genesisHash,omitempty"` + + // GenesisS3URI is the S3 URI of the uploaded genesis. + // +optional + GenesisS3URI string `json:"genesisS3URI,omitempty"` + // NetworkingStatus reports the observed state of networking resources. // +optional NetworkingStatus *NetworkingStatus `json:"networkingStatus,omitempty"` @@ -131,11 +200,12 @@ type NetworkingStatus struct { // Status condition types for SeiNodeGroup. const ( - ConditionNodesReady = "NodesReady" - ConditionExternalServiceReady = "ExternalServiceReady" - ConditionRouteReady = "RouteReady" - ConditionIsolationReady = "IsolationReady" - ConditionServiceMonitorReady = "ServiceMonitorReady" + ConditionNodesReady = "NodesReady" + ConditionExternalServiceReady = "ExternalServiceReady" + ConditionRouteReady = "RouteReady" + ConditionIsolationReady = "IsolationReady" + ConditionServiceMonitorReady = "ServiceMonitorReady" + ConditionGenesisCeremonyComplete = "GenesisCeremonyComplete" ) // +kubebuilder:object:root=true diff --git a/api/v1alpha1/validator_types.go b/api/v1alpha1/validator_types.go index 6827132..8f0cd17 100644 --- a/api/v1alpha1/validator_types.go +++ b/api/v1alpha1/validator_types.go @@ -11,4 +11,54 @@ type ValidatorSpec struct { // When absent the node block-syncs from genesis. // +optional Snapshot *SnapshotSource `json:"snapshot,omitempty"` + + // GenesisCeremony indicates this validator participates in a group genesis + // ceremony. Set by the SeiNodeGroup controller — not intended for direct use. 
+ // +optional + GenesisCeremony *GenesisCeremonyNodeConfig `json:"genesisCeremony,omitempty"` +} + +// GenesisCeremonyNodeConfig holds per-node genesis ceremony parameters. +// Populated by the SeiNodeGroup controller when genesis is configured. +type GenesisCeremonyNodeConfig struct { + // ChainID of the genesis network. + // +kubebuilder:validation:MinLength=1 + ChainID string `json:"chainId"` + + // StakingAmount is the self-delegation amount for this validator's gentx. + // +kubebuilder:validation:MinLength=1 + StakingAmount string `json:"stakingAmount"` + + // AccountBalance is the initial coin balance to fund this validator's + // genesis account. The node's own address is discovered during identity + // generation — no cross-node coordination needed. + // +kubebuilder:validation:MinLength=1 + AccountBalance string `json:"accountBalance"` + + // GenesisParams is a JSON string of genesis parameter overrides merged + // on top of sei-config's GenesisDefaults(). Applied before gentx generation. + // +optional + GenesisParams string `json:"genesisParams,omitempty"` + + // Index is the node's ordinal within the group (0-based). + // +kubebuilder:validation:Minimum=0 + Index int32 `json:"index"` + + // ArtifactS3 is the S3 location where this node uploads its genesis artifacts. + ArtifactS3 GenesisS3Destination `json:"artifactS3"` +} + +// GenesisS3Destination configures where genesis artifacts are stored in S3. +type GenesisS3Destination struct { + // Bucket is the S3 bucket name. + // +kubebuilder:validation:MinLength=1 + Bucket string `json:"bucket"` + + // Prefix is an optional key prefix within the bucket. + // +optional + Prefix string `json:"prefix,omitempty"` + + // Region is the AWS region for S3 access. 
+ // +kubebuilder:validation:MinLength=1 + Region string `json:"region"` } diff --git a/internal/controller/node/controller.go b/internal/controller/node/controller.go index e6ecc0a..e82595c 100644 --- a/internal/controller/node/controller.go +++ b/internal/controller/node/controller.go @@ -137,11 +137,16 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } // reconcilePending creates all plans up front and selects the starting phase. +// The Init plan is always built here — for genesis ceremony nodes the S3 +// source is deterministic and discover-peers is unconditionally included. +// Plan execution order is the single source of truth for orchestration. func (r *SeiNodeReconciler) reconcilePending(ctx context.Context, node *seiv1alpha1.SeiNode, planner NodePlanner) (ctrl.Result, error) { patch := client.MergeFrom(node.DeepCopy()) node.Status.PreInitPlan = buildPreInitPlan(node, planner) - if needsPreInit(node) { + if isGenesisCeremonyNode(node) { + node.Status.InitPlan = buildGenesisInitPlan() + } else if needsPreInit(node) { node.Status.InitPlan = buildPostBootstrapInitPlan(node) } else { node.Status.InitPlan = planner.BuildPlan(node) diff --git a/internal/controller/node/job.go b/internal/controller/node/job.go index 198e0f4..f87b377 100644 --- a/internal/controller/node/job.go +++ b/internal/controller/node/job.go @@ -23,7 +23,14 @@ func preInitJobName(node *seiv1alpha1.SeiNode) string { func generatePreInitJob(node *seiv1alpha1.SeiNode, platform PlatformConfig) *batchv1.Job { labels := preInitLabelsForNode(node) - snap := snapshotSourceFor(node) + + var podSpec corev1.PodSpec + if isGenesisCeremonyNode(node) { + podSpec = buildGenesisPreInitPodSpec(node, platform) + } else { + snap := snapshotSourceFor(node) + podSpec = buildPreInitPodSpec(node, snap, platform) + } return &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ @@ -41,7 +48,7 @@ func generatePreInitJob(node *seiv1alpha1.SeiNode, platform PlatformConfig) *bat 
"karpenter.sh/do-not-disrupt": "true", }, }, - Spec: buildPreInitPodSpec(node, snap, platform), + Spec: podSpec, }, }, } @@ -72,8 +79,15 @@ func generatePreInitService(node *seiv1alpha1.SeiNode) *corev1.Service { // preInitSidecarURL returns the in-cluster DNS URL for the pre-init Job's sidecar. func preInitSidecarURL(node *seiv1alpha1.SeiNode) string { + return PreInitSidecarURL(node.Name, node.Namespace, sidecarPort(node)) +} + +// PreInitSidecarURL builds the in-cluster DNS URL for a pre-init Job's sidecar +// given the node name, namespace, and port. Exported for use by the group controller. +func PreInitSidecarURL(nodeName, namespace string, port int32) string { + jobName := fmt.Sprintf("%s-pre-init", nodeName) return fmt.Sprintf("http://%s.%s.%s.svc.cluster.local:%d", - preInitPodHostname, preInitJobName(node), node.Namespace, sidecarPort(node)) + preInitPodHostname, jobName, namespace, port) } // preInitWaitCommand returns a shell command that waits for the sidecar @@ -99,6 +113,78 @@ func preInitWaitCommand(port int32, haltHeight int64) (command []string, args [] return []string{"/bin/bash", "-c"}, []string{script} } +// buildGenesisPreInitPodSpec constructs a PodSpec for the genesis ceremony PreInit Job. +// Unlike the snapshot PreInit, the main container runs "sleep infinity" as a keepalive +// (the sidecar does the real work), and an init container copies the seid binary from +// the node image to the PVC so sidecar tasks can invoke it. 
+func buildGenesisPreInitPodSpec(node *seiv1alpha1.SeiNode, platform PlatformConfig) corev1.PodSpec { + serviceName := preInitJobName(node) + + dataVolume := corev1.Volume{ + Name: "data", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: nodeDataPVCClaimName(node), + }, + }, + } + + port := sidecarPort(node) + sidecarContainer := corev1.Container{ + Name: "sei-sidecar", + Image: sidecarImage(node), + Command: []string{"seictl", "serve"}, + RestartPolicy: ptr.To(corev1.ContainerRestartPolicyAlways), + Env: []corev1.EnvVar{ + {Name: "SEI_CHAIN_ID", Value: node.Spec.ChainID}, + {Name: "SEI_SIDECAR_PORT", Value: fmt.Sprintf("%d", port)}, + {Name: "SEI_HOME", Value: dataDir}, + }, + Ports: []corev1.ContainerPort{ + {Name: "sidecar", ContainerPort: port, Protocol: corev1.ProtocolTCP}, + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "data", MountPath: dataDir}, + }, + } + if node.Spec.Sidecar != nil && node.Spec.Sidecar.Resources != nil { + sidecarContainer.Resources = *node.Spec.Sidecar.Resources + } + + copySeid := corev1.Container{ + Name: "copy-seid", + Image: node.Spec.Image, + Command: []string{"sh", "-c", fmt.Sprintf("mkdir -p %s/bin && cp $(which seid) %s/bin/seid", dataDir, dataDir)}, + VolumeMounts: []corev1.VolumeMount{ + {Name: "data", MountPath: dataDir}, + }, + } + + keepalive := corev1.Container{ + Name: "keepalive", + Image: node.Spec.Image, + Command: []string{"sleep", "infinity"}, + VolumeMounts: []corev1.VolumeMount{ + {Name: "data", MountPath: dataDir}, + }, + } + + return corev1.PodSpec{ + Hostname: preInitPodHostname, + Subdomain: serviceName, + ServiceAccountName: platform.ServiceAccount, + ShareProcessNamespace: ptr.To(true), + RestartPolicy: corev1.RestartPolicyNever, + TerminationGracePeriodSeconds: ptr.To(int64(5)), + Tolerations: []corev1.Toleration{ + {Key: platform.TolerationKey, Value: platform.TolerationVal, Effect: corev1.TaintEffectNoSchedule}, + }, + Volumes: 
[]corev1.Volume{dataVolume}, + InitContainers: []corev1.Container{copySeid, sidecarContainer}, + Containers: []corev1.Container{keepalive}, + } +} + func buildPreInitPodSpec(node *seiv1alpha1.SeiNode, snap *seiv1alpha1.SnapshotSource, platform PlatformConfig) corev1.PodSpec { serviceName := preInitJobName(node) diff --git a/internal/controller/node/planner.go b/internal/controller/node/planner.go index 1f6dc02..0d24a35 100644 --- a/internal/controller/node/planner.go +++ b/internal/controller/node/planner.go @@ -80,11 +80,19 @@ func peersFor(node *seiv1alpha1.SeiNode) []seiv1alpha1.PeerSource { // needsPreInit returns true when the node requires a PreInitPlan Job. func needsPreInit(node *seiv1alpha1.SeiNode) bool { + if isGenesisCeremonyNode(node) { + return true + } snap := snapshotSourceFor(node) return snap != nil && snap.BootstrapImage != "" && snap.S3 != nil && snap.S3.TargetHeight > 0 } +// isGenesisCeremonyNode returns true when the node participates in a group genesis ceremony. +func isGenesisCeremonyNode(node *seiv1alpha1.SeiNode) bool { + return node.Spec.Validator != nil && node.Spec.Validator.GenesisCeremony != nil +} + // snapshotGeneration extracts the SnapshotGenerationConfig from the populated // mode sub-spec. Returns nil when the mode doesn't support it. 
func snapshotGeneration(node *seiv1alpha1.SeiNode) *seiv1alpha1.SnapshotGenerationConfig { @@ -197,6 +205,12 @@ func buildSharedTask( return sidecar.MarkReadyTask{}, nil case taskAwaitCondition: return awaitConditionTask(node) + case taskGenerateIdentity: + return generateIdentityTaskBuilder(node), nil + case taskGenerateGentx: + return generateGentxTaskBuilder(node), nil + case taskUploadGenesisArtifacts: + return uploadGenesisArtifactsTaskBuilder(node), nil default: return nil, fmt.Errorf("buildSharedTask: unhandled task type %q", taskType) } diff --git a/internal/controller/node/planner_bootstrap.go b/internal/controller/node/planner_bootstrap.go index 844ca12..d915b1b 100644 --- a/internal/controller/node/planner_bootstrap.go +++ b/internal/controller/node/planner_bootstrap.go @@ -8,21 +8,40 @@ import ( seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" ) -const taskAwaitCondition = sidecar.TaskTypeAwaitCondition - -// buildPreInitPlan constructs the task plan for the PreInit Job. For nodes -// that require bootstrap infrastructure (e.g. S3 snapshot with a bootstrap -// image), it builds the standard bootstrap sequence. The seid process -// handles the halt-height natively (via --halt-height), so no -// await-condition task is needed. For all other nodes it returns an empty -// plan that resolves trivially. +const ( + taskAwaitCondition = sidecar.TaskTypeAwaitCondition + taskGenerateIdentity = sidecar.TaskTypeGenerateIdentity + taskGenerateGentx = sidecar.TaskTypeGenerateGentx + taskUploadGenesisArtifacts = sidecar.TaskTypeUploadGenesisArtifacts +) + +// buildPreInitPlan constructs the task plan for the PreInit Job. For genesis +// ceremony nodes, it builds the 3-task identity/gentx/upload sequence. For +// snapshot-bootstrap nodes, it builds the standard bootstrap sequence. For +// all other nodes it returns an empty plan that resolves trivially. 
func buildPreInitPlan(node *seiv1alpha1.SeiNode, planner NodePlanner) *seiv1alpha1.TaskPlan { + if isGenesisCeremonyNode(node) { + return buildGenesisPreInitPlan() + } if !needsPreInit(node) { return &seiv1alpha1.TaskPlan{Phase: seiv1alpha1.TaskPlanActive, Tasks: []seiv1alpha1.PlannedTask{}} } return planner.BuildPlan(node) } +// buildGenesisPreInitPlan returns a 3-task plan for genesis ceremony PreInit: +// generate validator identity, generate gentx, upload artifacts to S3. +func buildGenesisPreInitPlan() *seiv1alpha1.TaskPlan { + return &seiv1alpha1.TaskPlan{ + Phase: seiv1alpha1.TaskPlanActive, + Tasks: []seiv1alpha1.PlannedTask{ + {Type: taskGenerateIdentity, Status: seiv1alpha1.PlannedTaskPending}, + {Type: taskGenerateGentx, Status: seiv1alpha1.PlannedTaskPending}, + {Type: taskUploadGenesisArtifacts, Status: seiv1alpha1.PlannedTaskPending}, + }, + } +} + // buildPostBootstrapInitPlan constructs a reduced InitPlan for nodes that // completed a PreInit Job. The PVC already contains blockchain data synced // to the target height, so snapshot-restore and configure-state-sync are @@ -44,6 +63,25 @@ func buildPostBootstrapInitPlan(node *seiv1alpha1.SeiNode) *seiv1alpha1.TaskPlan return &seiv1alpha1.TaskPlan{Phase: seiv1alpha1.TaskPlanActive, Tasks: tasks} } +// buildGenesisInitPlan constructs the Init plan for genesis ceremony nodes. +// Unlike buildPostBootstrapInitPlan, discover-peers is unconditionally +// included: peers are not yet set at plan-creation time but will be pushed +// by the group controller before the node transitions to Initializing. 
+func buildGenesisInitPlan() *seiv1alpha1.TaskPlan { + prog := []string{ + taskConfigureGenesis, + taskConfigApply, + taskDiscoverPeers, + taskConfigValidate, + taskMarkReady, + } + tasks := make([]seiv1alpha1.PlannedTask, len(prog)) + for i, t := range prog { + tasks[i] = seiv1alpha1.PlannedTask{Type: t, Status: seiv1alpha1.PlannedTaskPending} + } + return &seiv1alpha1.TaskPlan{Phase: seiv1alpha1.TaskPlanActive, Tasks: tasks} +} + // awaitConditionTask builds the sidecar task for the await-condition step. func awaitConditionTask(node *seiv1alpha1.SeiNode) (sidecar.TaskBuilder, error) { snap := snapshotSourceFor(node) diff --git a/internal/controller/node/planner_validator.go b/internal/controller/node/planner_validator.go index a3d7356..7ea1255 100644 --- a/internal/controller/node/planner_validator.go +++ b/internal/controller/node/planner_validator.go @@ -19,6 +19,18 @@ func (p *validatorPlanner) Validate(node *seiv1alpha1.SeiNode) error { if node.Spec.Validator == nil { return fmt.Errorf("validator sub-spec is nil") } + if gc := node.Spec.Validator.GenesisCeremony; gc != nil { + if gc.ChainID == "" { + return fmt.Errorf("validator: genesisCeremony.chainId is required") + } + if gc.StakingAmount == "" { + return fmt.Errorf("validator: genesisCeremony.stakingAmount is required") + } + if gc.ArtifactS3.Bucket == "" || gc.ArtifactS3.Region == "" { + return fmt.Errorf("validator: genesisCeremony.artifactS3 bucket and region are required") + } + return nil + } if snap := node.Spec.Validator.Snapshot; snap != nil && snap.BootstrapImage != "" { if snap.S3 == nil || snap.S3.TargetHeight <= 0 { return fmt.Errorf("validator: bootstrapImage requires s3 with targetHeight > 0") diff --git a/internal/controller/node/pre_init.go b/internal/controller/node/pre_init.go index 100cd3d..8268b11 100644 --- a/internal/controller/node/pre_init.go +++ b/internal/controller/node/pre_init.go @@ -3,6 +3,7 @@ package node import ( "context" "fmt" + "time" sidecar 
"github.com/sei-protocol/seictl/sidecar/client" batchv1 "k8s.io/api/batch/v1" @@ -24,6 +25,9 @@ func (r *SeiNodeReconciler) reconcilePreInitializing(ctx context.Context, node * plan := node.Status.PreInitPlan if plan.Phase == seiv1alpha1.TaskPlanComplete { + if isGenesisCeremonyNode(node) { + return r.handleGenesisPreInitComplete(ctx, node) + } if len(plan.Tasks) > 0 { job := &batchv1.Job{} key := types.NamespacedName{Name: preInitJobName(node), Namespace: node.Namespace} @@ -102,6 +106,9 @@ func (r *SeiNodeReconciler) reconcilePreInitializing(ctx context.Context, node * } if plan.Phase == seiv1alpha1.TaskPlanComplete { + if isGenesisCeremonyNode(node) { + return r.handleGenesisPreInitComplete(ctx, node) + } if !isJobComplete(job) { log.FromContext(ctx).Info("sidecar tasks complete, waiting for seid to reach halt-height", "job", job.Name) return ctrl.Result{RequeueAfter: taskPollInterval}, nil @@ -120,6 +127,24 @@ func (r *SeiNodeReconciler) reconcilePreInitializing(ctx context.Context, node * return result, nil } +// handleGenesisPreInitComplete is called when a genesis ceremony node's PreInit +// plan completes. The genesis PreInit Job runs "sleep infinity" — it never +// completes on its own. The Init plan was already built in reconcilePending +// (the S3 URI is deterministic). We wait for the group controller to push +// static peers (the last step of finalizeGenesis after assembly + nodeID +// collection), then clean up PreInit resources and transition. 
+func (r *SeiNodeReconciler) handleGenesisPreInitComplete(ctx context.Context, node *seiv1alpha1.SeiNode) (ctrl.Result, error) { + if len(peersFor(node)) == 0 { + log.FromContext(ctx).Info("genesis PreInit complete, waiting for group to push peers") + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + + if err := r.cleanupPreInit(ctx, node); err != nil { + return ctrl.Result{}, fmt.Errorf("cleaning up genesis pre-init resources: %w", err) + } + return r.transitionPhase(ctx, node, seiv1alpha1.PhaseInitializing) +} + // ensurePreInitService creates the headless Service for pre-init Job pod DNS. func (r *SeiNodeReconciler) ensurePreInitService(ctx context.Context, node *seiv1alpha1.SeiNode) error { key := types.NamespacedName{Name: preInitJobName(node), Namespace: node.Namespace} diff --git a/internal/controller/node/task_builders.go b/internal/controller/node/task_builders.go index 394cbe6..3b3e594 100644 --- a/internal/controller/node/task_builders.go +++ b/internal/controller/node/task_builders.go @@ -60,6 +60,39 @@ func configureGenesisBuilder(node *seiv1alpha1.SeiNode) sidecar.TaskBuilder { } } +// generateIdentityTaskBuilder builds the sidecar task for the generate-identity step. +func generateIdentityTaskBuilder(node *seiv1alpha1.SeiNode) sidecar.TaskBuilder { + gc := node.Spec.Validator.GenesisCeremony + return sidecar.GenerateIdentityTask{ + ChainID: gc.ChainID, + Moniker: node.Name, + } +} + +// generateGentxTaskBuilder builds the sidecar task for the generate-gentx step. +// The node's own address is discovered at runtime by the handler — only the +// balance amount and staking parameters need to be forwarded. 
+func generateGentxTaskBuilder(node *seiv1alpha1.SeiNode) sidecar.TaskBuilder { + gc := node.Spec.Validator.GenesisCeremony + return sidecar.GenerateGentxTask{ + ChainID: gc.ChainID, + StakingAmount: gc.StakingAmount, + AccountBalance: gc.AccountBalance, + GenesisParams: gc.GenesisParams, + } +} + +// uploadGenesisArtifactsTaskBuilder builds the sidecar task for the upload-genesis-artifacts step. +func uploadGenesisArtifactsTaskBuilder(node *seiv1alpha1.SeiNode) sidecar.TaskBuilder { + gc := node.Spec.Validator.GenesisCeremony + return sidecar.UploadGenesisArtifactsTask{ + S3Bucket: gc.ArtifactS3.Bucket, + S3Prefix: gc.ArtifactS3.Prefix, + S3Region: gc.ArtifactS3.Region, + NodeName: node.Name, + } +} + // resultExportScheduledTask returns a result-export task builder with a // cron schedule if the node is a replayer with result export enabled. // Returns nil when not applicable. diff --git a/internal/controller/nodegroup/controller.go b/internal/controller/nodegroup/controller.go index 7b79157..f619cf8 100644 --- a/internal/controller/nodegroup/controller.go +++ b/internal/controller/nodegroup/controller.go @@ -35,6 +35,9 @@ type SeiNodeGroupReconciler struct { // It is auto-injected into every AuthorizationPolicy to ensure the // controller can always reach the seictl sidecar. ControllerSA string + + // BuildSidecarClientFn overrides sidecar client construction for testing. 
+ BuildSidecarClientFn func(node *seiv1alpha1.SeiNode) interface{} } // +kubebuilder:rbac:groups=sei.io,resources=seinodegroups,verbs=get;list;watch;create;update;patch;delete @@ -81,6 +84,21 @@ func (r *SeiNodeGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, fmt.Errorf("reconciling SeiNodes: %w", err) } + if group.Spec.Genesis != nil { + result, err := r.reconcileGenesisAssembly(ctx, group) + if err != nil { + logger.Error(err, "reconciling genesis assembly") + observability.ReconcileErrorsTotal.WithLabelValues(controllerName, ns, name).Inc() + return ctrl.Result{}, fmt.Errorf("reconciling genesis assembly: %w", err) + } + if result.RequeueAfter > 0 || result.Requeue { + if err := r.updateStatus(ctx, group, statusBase); err != nil { + return ctrl.Result{}, fmt.Errorf("updating status: %w", err) + } + return result, nil + } + } + if err := timeSubstep("reconcileNetworking", func() error { return r.reconcileNetworking(ctx, group) }); err != nil { diff --git a/internal/controller/nodegroup/genesis.go b/internal/controller/nodegroup/genesis.go new file mode 100644 index 0000000..a32a9d0 --- /dev/null +++ b/internal/controller/nodegroup/genesis.go @@ -0,0 +1,325 @@ +package nodegroup + +import ( + "context" + "encoding/json" + "fmt" + "sort" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + nodecontroller "github.com/sei-protocol/sei-k8s-controller/internal/controller/node" + sidecar "github.com/sei-protocol/seictl/sidecar/client" +) + +const ( + defaultGenesisBucket = "sei-genesis-artifacts" + defaultMaxCeremonyDuration = 15 * time.Minute + defaultSidecarPort = int32(7777) + defaultP2PPort = int32(26656) +) + +// SidecarStatusClient abstracts the sidecar HTTP API for testability. 
+type SidecarStatusClient interface { + GetNodeID(ctx context.Context) (string, error) +} + +// sidecarSubmitter is the subset of sidecar.SidecarClient that can submit and poll tasks. +type sidecarSubmitter interface { + SubmitTask(ctx context.Context, task sidecar.TaskRequest) (interface{ String() string }, error) + GetTask(ctx context.Context, id interface{ String() string }) (*sidecar.TaskResult, error) +} + +// reconcileGenesisAssembly is the top-level genesis coordination entry point. +// It runs only when spec.genesis is set. +func (r *SeiNodeGroupReconciler) reconcileGenesisAssembly(ctx context.Context, group *seiv1alpha1.SeiNodeGroup) (ctrl.Result, error) { + if group.Spec.Genesis == nil { + return ctrl.Result{}, nil + } + + nodes, err := r.listChildSeiNodes(ctx, group) + if err != nil { + return ctrl.Result{}, err + } + sort.Slice(nodes, func(i, j int) bool { return nodes[i].Name < nodes[j].Name }) + + if int32(len(nodes)) < group.Spec.Replicas { + log.FromContext(ctx).Info("waiting for all SeiNodes to be created", + "have", len(nodes), "want", group.Spec.Replicas) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + + maxDuration := defaultMaxCeremonyDuration + if group.Spec.Genesis.MaxCeremonyDuration != nil { + maxDuration = group.Spec.Genesis.MaxCeremonyDuration.Duration + } + if !group.CreationTimestamp.IsZero() && time.Since(group.CreationTimestamp.Time) > maxDuration { + r.Recorder.Event(group, corev1.EventTypeWarning, "GenesisCeremonyTimeout", + "Genesis ceremony exceeded maximum duration") + return r.setGenesisCondition(ctx, group, metav1.ConditionFalse, + "Timeout", "genesis ceremony exceeded maximum duration") + } + + for i := range nodes { + n := &nodes[i] + if n.Status.Phase == seiv1alpha1.PhaseFailed { + msg := fmt.Sprintf("SeiNode %s is in Failed phase", n.Name) + r.Recorder.Event(group, corev1.EventTypeWarning, "GenesisPreInitFailed", msg) + return r.setGenesisCondition(ctx, group, metav1.ConditionFalse, "PreInitFailed", msg) + } 
+ } + + allPreInitComplete := true + for i := range nodes { + plan := nodes[i].Status.PreInitPlan + if plan == nil || plan.Phase != seiv1alpha1.TaskPlanComplete { + allPreInitComplete = false + break + } + } + if !allPreInitComplete { + log.FromContext(ctx).Info("waiting for all SeiNodes PreInit to complete") + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + + if group.Status.AssemblyPlan == nil { + return r.startAssembly(ctx, group) + } + + if group.Status.AssemblyPlan.Phase == seiv1alpha1.TaskPlanComplete { + if group.Status.GenesisS3URI != "" { + return ctrl.Result{}, nil + } + return r.finalizeGenesis(ctx, group, nodes) + } + + if group.Status.AssemblyPlan.Phase == seiv1alpha1.TaskPlanFailed { + return r.setGenesisCondition(ctx, group, metav1.ConditionFalse, + "AssemblyFailed", "genesis assembly task failed") + } + + return r.driveAssemblyPlan(ctx, group, nodes) +} + +func (r *SeiNodeGroupReconciler) startAssembly(ctx context.Context, group *seiv1alpha1.SeiNodeGroup) (ctrl.Result, error) { + plan := &seiv1alpha1.TaskPlan{ + Phase: seiv1alpha1.TaskPlanActive, + Tasks: []seiv1alpha1.PlannedTask{ + {Type: sidecar.TaskTypeAssembleGenesis, Status: seiv1alpha1.PlannedTaskPending}, + }, + } + + patch := client.MergeFrom(group.DeepCopy()) + group.Status.AssemblyPlan = plan + group.Status.Phase = seiv1alpha1.GroupPhaseGenesisCeremony + if err := r.Status().Patch(ctx, group, patch); err != nil { + return ctrl.Result{}, fmt.Errorf("setting assembly plan: %w", err) + } + + r.Recorder.Event(group, corev1.EventTypeNormal, "GenesisAssemblyStarted", + "All PreInit complete, starting genesis assembly") + return ctrl.Result{RequeueAfter: 2 * time.Second}, nil +} + +func (r *SeiNodeGroupReconciler) driveAssemblyPlan(ctx context.Context, group *seiv1alpha1.SeiNodeGroup, nodes []seiv1alpha1.SeiNode) (ctrl.Result, error) { + if len(nodes) == 0 { + return ctrl.Result{}, fmt.Errorf("no nodes available for assembly") + } + + assemblerNode := &nodes[0] + sc := 
r.buildPreInitSidecarClient(assemblerNode) + if sc == nil { + log.FromContext(ctx).Info("assembler sidecar not reachable yet") + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + + plan := group.Status.AssemblyPlan + task := &plan.Tasks[0] + + switch task.Status { + case seiv1alpha1.PlannedTaskPending: + s3 := genesisS3Config(group) + nodeParams := make([]sidecar.GenesisNodeParam, len(nodes)) + for i := range nodes { + nodeParams[i] = sidecar.GenesisNodeParam{Name: nodes[i].Name} + } + builder := sidecar.AssembleAndUploadGenesisTask{ + S3Bucket: s3.Bucket, + S3Prefix: s3.Prefix, + S3Region: s3.Region, + ChainID: group.Spec.Genesis.ChainID, + Nodes: nodeParams, + } + req := builder.ToTaskRequest() + id, err := sc.(sidecarSubmitter).SubmitTask(ctx, req) + if err != nil { + return ctrl.Result{}, fmt.Errorf("submitting assembly task: %w", err) + } + patch := client.MergeFrom(group.DeepCopy()) + task.Status = seiv1alpha1.PlannedTaskSubmitted + task.TaskID = id.String() + if err := r.Status().Patch(ctx, group, patch); err != nil { + return ctrl.Result{}, fmt.Errorf("recording assembly task submission: %w", err) + } + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + + case seiv1alpha1.PlannedTaskSubmitted: + return r.pollAssemblyTask(ctx, group, sc, task) + } + + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil +} + +// TODO(PR-3): poll sidecar GetTask, mark plan Complete/Failed based on result. 
+func (r *SeiNodeGroupReconciler) pollAssemblyTask(ctx context.Context, group *seiv1alpha1.SeiNodeGroup, sc interface{}, task *seiv1alpha1.PlannedTask) (ctrl.Result, error) { + log.FromContext(ctx).Info("polling assembly task", "taskID", task.TaskID) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil +} + +func (r *SeiNodeGroupReconciler) finalizeGenesis(ctx context.Context, group *seiv1alpha1.SeiNodeGroup, nodes []seiv1alpha1.SeiNode) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + peerList, err := r.collectPeers(ctx, nodes) + if err != nil { + return ctrl.Result{}, fmt.Errorf("collecting peers: %w", err) + } + + for i := range nodes { + if err := r.setStaticPeers(ctx, &nodes[i], peerList); err != nil { + return ctrl.Result{}, fmt.Errorf("pushing peers to %s: %w", nodes[i].Name, err) + } + } + + s3 := genesisS3Config(group) + genesisURI := fmt.Sprintf("s3://%s/%sgenesis.json", s3.Bucket, s3.Prefix) + + patch := client.MergeFrom(group.DeepCopy()) + group.Status.GenesisS3URI = genesisURI + if err := r.Status().Patch(ctx, group, patch); err != nil { + return ctrl.Result{}, fmt.Errorf("recording genesis S3 URI: %w", err) + } + + logger.Info("genesis ceremony complete", "genesisURI", genesisURI, "peers", len(peerList)) + r.Recorder.Eventf(group, corev1.EventTypeNormal, "GenesisCeremonyComplete", + "Genesis assembled and distributed to %d nodes", len(nodes)) + + return r.setGenesisCondition(ctx, group, metav1.ConditionTrue, + "CeremonyComplete", "genesis assembled and distributed") +} + +func (r *SeiNodeGroupReconciler) collectPeers(ctx context.Context, nodes []seiv1alpha1.SeiNode) ([]string, error) { + var peers []string + for i := range nodes { + n := &nodes[i] + sc := r.buildPreInitSidecarClient(n) + if sc == nil { + return nil, fmt.Errorf("cannot reach sidecar on %s", n.Name) + } + + getter, ok := sc.(SidecarStatusClient) + if !ok { + return nil, fmt.Errorf("sidecar client for %s does not support GetNodeID", n.Name) + } + nodeID, err := 
getter.GetNodeID(ctx) + if err != nil { + return nil, fmt.Errorf("getting node ID for %s: %w", n.Name, err) + } + + port := defaultP2PPort + dns := fmt.Sprintf("%s.%s.svc.cluster.local", n.Name, n.Namespace) + peers = append(peers, fmt.Sprintf("%s@%s:%d", nodeID, dns, port)) + } + return peers, nil +} + +func (r *SeiNodeGroupReconciler) setStaticPeers(ctx context.Context, node *seiv1alpha1.SeiNode, peers []string) error { + if node.Spec.Validator == nil { + return fmt.Errorf("node %s has no validator spec", node.Name) + } + patch := client.MergeFrom(node.DeepCopy()) + node.Spec.Validator.Peers = []seiv1alpha1.PeerSource{ + {Static: &seiv1alpha1.StaticPeerSource{Addresses: peers}}, + } + return r.Patch(ctx, node, patch) +} + +func (r *SeiNodeGroupReconciler) buildPreInitSidecarClient(node *seiv1alpha1.SeiNode) interface{} { + if r.BuildSidecarClientFn != nil { + return r.BuildSidecarClientFn(node) + } + port := defaultSidecarPort + if node.Spec.Sidecar != nil && node.Spec.Sidecar.Port != 0 { + port = node.Spec.Sidecar.Port + } + url := nodecontroller.PreInitSidecarURL(node.Name, node.Namespace, port) + c, err := sidecar.NewSidecarClient(url) + if err != nil { + return nil + } + return c +} + +func (r *SeiNodeGroupReconciler) setGenesisCondition(ctx context.Context, group *seiv1alpha1.SeiNodeGroup, status metav1.ConditionStatus, reason, message string) (ctrl.Result, error) { + patch := client.MergeFrom(group.DeepCopy()) + + found := false + for i := range group.Status.Conditions { + if group.Status.Conditions[i].Type == seiv1alpha1.ConditionGenesisCeremonyComplete { + group.Status.Conditions[i].Status = status + group.Status.Conditions[i].Reason = reason + group.Status.Conditions[i].Message = message + group.Status.Conditions[i].LastTransitionTime = metav1.Now() + found = true + break + } + } + if !found { + group.Status.Conditions = append(group.Status.Conditions, metav1.Condition{ + Type: seiv1alpha1.ConditionGenesisCeremonyComplete, + Status: status, + Reason: 
reason, + Message: message, + LastTransitionTime: metav1.Now(), + }) + } + + if err := r.Status().Patch(ctx, group, patch); err != nil { + return ctrl.Result{}, fmt.Errorf("setting genesis condition: %w", err) + } + return ctrl.Result{}, nil +} + +// genesisS3Config returns the S3 destination for genesis artifacts, applying +// defaults when the user omits the field. +func genesisS3Config(group *seiv1alpha1.SeiNodeGroup) seiv1alpha1.GenesisS3Destination { + gc := group.Spec.Genesis + if gc.GenesisS3 != nil { + dest := *gc.GenesisS3 + if dest.Prefix == "" { + dest.Prefix = fmt.Sprintf("%s/%s/", gc.ChainID, group.Name) + } + return dest + } + return seiv1alpha1.GenesisS3Destination{ + Bucket: defaultGenesisBucket, + Prefix: fmt.Sprintf("%s/%s/", gc.ChainID, group.Name), + Region: "us-east-2", + } +} + +func marshalOverrides(overrides map[string]string) string { + if len(overrides) == 0 { + return "" + } + data, err := json.Marshal(overrides) + if err != nil { + return "" + } + return string(data) +} diff --git a/internal/controller/nodegroup/nodes.go b/internal/controller/nodegroup/nodes.go index a132a1e..2b6eae0 100644 --- a/internal/controller/nodegroup/nodes.go +++ b/internal/controller/nodegroup/nodes.go @@ -96,6 +96,31 @@ func generateSeiNode(group *seiv1alpha1.SeiNodeGroup, ordinal int) *seiv1alpha1. 
} spec.PodLabels[groupLabel] = group.Name + if gc := group.Spec.Genesis; gc != nil && spec.Validator != nil { + if spec.ChainID == "" { + spec.ChainID = gc.ChainID + } + s3 := genesisS3Config(group) + spec.Validator.GenesisCeremony = &seiv1alpha1.GenesisCeremonyNodeConfig{ + ChainID: gc.ChainID, + StakingAmount: gc.StakingAmount, + AccountBalance: gc.AccountBalance, + GenesisParams: marshalOverrides(gc.Overrides), + Index: int32(ordinal), + ArtifactS3: seiv1alpha1.GenesisS3Destination{ + Bucket: s3.Bucket, + Prefix: s3.Prefix, + Region: s3.Region, + }, + } + spec.Genesis = seiv1alpha1.GenesisConfiguration{ + S3: &seiv1alpha1.GenesisS3Source{ + URI: fmt.Sprintf("s3://%s/%sgenesis.json", s3.Bucket, s3.Prefix), + Region: s3.Region, + }, + } + } + return &seiv1alpha1.SeiNode{ ObjectMeta: metav1.ObjectMeta{ Name: seiNodeName(group, ordinal), From 1b76cda3fadafbf01ba8ae52dea72a71c1694153 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 25 Mar 2026 12:05:28 -0400 Subject: [PATCH 2/5] chore: regenerate deepcopy for genesis ceremony types --- api/v1alpha1/zz_generated.deepcopy.go | 98 +++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index edf5cf8..5b6280f 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -256,6 +256,74 @@ func (in *GatewayRouteConfig) DeepCopy() *GatewayRouteConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GenesisAccount) DeepCopyInto(out *GenesisAccount) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GenesisAccount. 
+func (in *GenesisAccount) DeepCopy() *GenesisAccount { + if in == nil { + return nil + } + out := new(GenesisAccount) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GenesisCeremonyConfig) DeepCopyInto(out *GenesisCeremonyConfig) { + *out = *in + if in.Accounts != nil { + in, out := &in.Accounts, &out.Accounts + *out = make([]GenesisAccount, len(*in)) + copy(*out, *in) + } + if in.Overrides != nil { + in, out := &in.Overrides, &out.Overrides + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.GenesisS3 != nil { + in, out := &in.GenesisS3, &out.GenesisS3 + *out = new(GenesisS3Destination) + **out = **in + } + if in.MaxCeremonyDuration != nil { + in, out := &in.MaxCeremonyDuration, &out.MaxCeremonyDuration + *out = new(metav1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GenesisCeremonyConfig. +func (in *GenesisCeremonyConfig) DeepCopy() *GenesisCeremonyConfig { + if in == nil { + return nil + } + out := new(GenesisCeremonyConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GenesisCeremonyNodeConfig) DeepCopyInto(out *GenesisCeremonyNodeConfig) { + *out = *in + out.ArtifactS3 = in.ArtifactS3 +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GenesisCeremonyNodeConfig. +func (in *GenesisCeremonyNodeConfig) DeepCopy() *GenesisCeremonyNodeConfig { + if in == nil { + return nil + } + out := new(GenesisCeremonyNodeConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *GenesisConfiguration) DeepCopyInto(out *GenesisConfiguration) { *out = *in @@ -296,6 +364,21 @@ func (in *GenesisPVCSource) DeepCopy() *GenesisPVCSource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GenesisS3Destination) DeepCopyInto(out *GenesisS3Destination) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GenesisS3Destination. +func (in *GenesisS3Destination) DeepCopy() *GenesisS3Destination { + if in == nil { + return nil + } + out := new(GenesisS3Destination) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GenesisS3Source) DeepCopyInto(out *GenesisS3Source) { *out = *in @@ -652,6 +735,11 @@ func (in *SeiNodeGroupList) DeepCopyObject() runtime.Object { func (in *SeiNodeGroupSpec) DeepCopyInto(out *SeiNodeGroupSpec) { *out = *in in.Template.DeepCopyInto(&out.Template) + if in.Genesis != nil { + in, out := &in.Genesis, &out.Genesis + *out = new(GenesisCeremonyConfig) + (*in).DeepCopyInto(*out) + } if in.Networking != nil { in, out := &in.Networking, &out.Networking *out = new(NetworkingConfig) @@ -682,6 +770,11 @@ func (in *SeiNodeGroupStatus) DeepCopyInto(out *SeiNodeGroupStatus) { *out = make([]GroupNodeStatus, len(*in)) copy(*out, *in) } + if in.AssemblyPlan != nil { + in, out := &in.AssemblyPlan, &out.AssemblyPlan + *out = new(TaskPlan) + (*in).DeepCopyInto(*out) + } if in.NetworkingStatus != nil { in, out := &in.NetworkingStatus, &out.NetworkingStatus *out = new(NetworkingStatus) @@ -1228,6 +1321,11 @@ func (in *ValidatorSpec) DeepCopyInto(out *ValidatorSpec) { *out = new(SnapshotSource) (*in).DeepCopyInto(*out) } + if in.GenesisCeremony != nil { + in, out := &in.GenesisCeremony, &out.GenesisCeremony + *out = new(GenesisCeremonyNodeConfig) + **out = **in + } } // DeepCopy is an 
autogenerated deepcopy function, copying the receiver, creating a new ValidatorSpec. From a342413fdf870967c128767c2281b299370c6e2e Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 25 Mar 2026 12:09:07 -0400 Subject: [PATCH 3/5] fix: resolve lint issues in genesis ceremony code - Fix gofmt alignment in SeiNodeGroupPhase and genesis constants - Fix goimports grouping (seictl is external, not local) - Replace interface{} with any (modernize) - Remove deprecated result.Requeue usage - Suppress unused params in pollAssemblyTask stub (PR-3) --- api/v1alpha1/seinodegroup_types.go | 12 ++++++------ internal/controller/nodegroup/controller.go | 4 ++-- internal/controller/nodegroup/genesis.go | 14 +++++++------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/api/v1alpha1/seinodegroup_types.go b/api/v1alpha1/seinodegroup_types.go index 59cd4cd..7cd09f7 100644 --- a/api/v1alpha1/seinodegroup_types.go +++ b/api/v1alpha1/seinodegroup_types.go @@ -125,13 +125,13 @@ type SeiNodeTemplateMeta struct { type SeiNodeGroupPhase string const ( - GroupPhasePending SeiNodeGroupPhase = "Pending" + GroupPhasePending SeiNodeGroupPhase = "Pending" GroupPhaseGenesisCeremony SeiNodeGroupPhase = "GenesisCeremony" - GroupPhaseInitializing SeiNodeGroupPhase = "Initializing" - GroupPhaseReady SeiNodeGroupPhase = "Ready" - GroupPhaseDegraded SeiNodeGroupPhase = "Degraded" - GroupPhaseFailed SeiNodeGroupPhase = "Failed" - GroupPhaseTerminating SeiNodeGroupPhase = "Terminating" + GroupPhaseInitializing SeiNodeGroupPhase = "Initializing" + GroupPhaseReady SeiNodeGroupPhase = "Ready" + GroupPhaseDegraded SeiNodeGroupPhase = "Degraded" + GroupPhaseFailed SeiNodeGroupPhase = "Failed" + GroupPhaseTerminating SeiNodeGroupPhase = "Terminating" ) // SeiNodeGroupStatus defines the observed state of a SeiNodeGroup. 
diff --git a/internal/controller/nodegroup/controller.go b/internal/controller/nodegroup/controller.go index f619cf8..66321d8 100644 --- a/internal/controller/nodegroup/controller.go +++ b/internal/controller/nodegroup/controller.go @@ -37,7 +37,7 @@ type SeiNodeGroupReconciler struct { ControllerSA string // BuildSidecarClientFn overrides sidecar client construction for testing. - BuildSidecarClientFn func(node *seiv1alpha1.SeiNode) interface{} + BuildSidecarClientFn func(node *seiv1alpha1.SeiNode) any } // +kubebuilder:rbac:groups=sei.io,resources=seinodegroups,verbs=get;list;watch;create;update;patch;delete @@ -91,7 +91,7 @@ func (r *SeiNodeGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request observability.ReconcileErrorsTotal.WithLabelValues(controllerName, ns, name).Inc() return ctrl.Result{}, fmt.Errorf("reconciling genesis assembly: %w", err) } - if result.RequeueAfter > 0 || result.Requeue { + if result.RequeueAfter > 0 { if err := r.updateStatus(ctx, group, statusBase); err != nil { return ctrl.Result{}, fmt.Errorf("updating status: %w", err) } diff --git a/internal/controller/nodegroup/genesis.go b/internal/controller/nodegroup/genesis.go index a32a9d0..fa6828c 100644 --- a/internal/controller/nodegroup/genesis.go +++ b/internal/controller/nodegroup/genesis.go @@ -7,6 +7,7 @@ import ( "sort" "time" + sidecar "github.com/sei-protocol/seictl/sidecar/client" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" @@ -15,14 +16,13 @@ import ( seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" nodecontroller "github.com/sei-protocol/sei-k8s-controller/internal/controller/node" - sidecar "github.com/sei-protocol/seictl/sidecar/client" ) const ( - defaultGenesisBucket = "sei-genesis-artifacts" - defaultMaxCeremonyDuration = 15 * time.Minute - defaultSidecarPort = int32(7777) - defaultP2PPort = int32(26656) + defaultGenesisBucket = "sei-genesis-artifacts" + 
defaultMaxCeremonyDuration = 15 * time.Minute + defaultSidecarPort = int32(7777) + defaultP2PPort = int32(26656) ) // SidecarStatusClient abstracts the sidecar HTTP API for testability. @@ -177,7 +177,7 @@ func (r *SeiNodeGroupReconciler) driveAssemblyPlan(ctx context.Context, group *s } // TODO(PR-3): poll sidecar GetTask, mark plan Complete/Failed based on result. -func (r *SeiNodeGroupReconciler) pollAssemblyTask(ctx context.Context, group *seiv1alpha1.SeiNodeGroup, sc interface{}, task *seiv1alpha1.PlannedTask) (ctrl.Result, error) { +func (r *SeiNodeGroupReconciler) pollAssemblyTask(ctx context.Context, _ *seiv1alpha1.SeiNodeGroup, _ any, task *seiv1alpha1.PlannedTask) (ctrl.Result, error) { log.FromContext(ctx).Info("polling assembly task", "taskID", task.TaskID) return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } @@ -249,7 +249,7 @@ func (r *SeiNodeGroupReconciler) setStaticPeers(ctx context.Context, node *seiv1 return r.Patch(ctx, node, patch) } -func (r *SeiNodeGroupReconciler) buildPreInitSidecarClient(node *seiv1alpha1.SeiNode) interface{} { +func (r *SeiNodeGroupReconciler) buildPreInitSidecarClient(node *seiv1alpha1.SeiNode) any { if r.BuildSidecarClientFn != nil { return r.BuildSidecarClientFn(node) } From 72d5f1201b13d2707c804bd44b00ee0407d98266 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 25 Mar 2026 12:16:04 -0400 Subject: [PATCH 4/5] refactor: simplify group lifecycle and rename AssemblyPlan to InitPlan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove GroupPhaseGenesisCeremony — genesis work runs during the existing Initializing phase, keyed off spec.genesis != nil - Rename AssemblyPlan to InitPlan for consistency with node-level naming and to avoid scoping the field to assembly-only workloads - Move design docs from tide/ to .tide/, remove tide/ and docs/ --- .tide/genesis-execution-plan.md | 419 ++++++++++ .tide/network-deployments.md | 799 +++++++++++++++++++ 
.tide/network-upgrades.md | 489 ++++++++++++ .tide/observability.md | 926 +++++++++++++++++++++++ .tide/seinodegroup-scheduling.md | 699 +++++++++++++++++ .tide/validator-migration.md | 411 ++++++++++ api/v1alpha1/seinodegroup_types.go | 19 +- api/v1alpha1/zz_generated.deepcopy.go | 4 +- internal/controller/nodegroup/genesis.go | 23 +- 9 files changed, 3765 insertions(+), 24 deletions(-) create mode 100644 .tide/genesis-execution-plan.md create mode 100644 .tide/network-deployments.md create mode 100644 .tide/network-upgrades.md create mode 100644 .tide/observability.md create mode 100644 .tide/seinodegroup-scheduling.md create mode 100644 .tide/validator-migration.md diff --git a/.tide/genesis-execution-plan.md b/.tide/genesis-execution-plan.md new file mode 100644 index 0000000..037cc98 --- /dev/null +++ b/.tide/genesis-execution-plan.md @@ -0,0 +1,419 @@ +# Execution Plan: Genesis Network Configuration + +**Source:** [network-deployments.md](./network-deployments.md) +**Created:** 2026-03-25 + +--- + +## Work Streams + +Two parallel work streams plus a platform prerequisite. + +``` +PR 0: Terraform (platform) + ├── S3 bucket: sei-genesis-artifacts + └── IAM: s3:GetObject + s3:PutObject on sei-genesis-artifacts/* + +Stream A (seictl) Stream B (sei-k8s-controller) +───────────────── ───────────────────────────── +PR 1: S3ObjectClient PR 5: CRD types + manifests + ↓ ↓ +PR 2: Genesis task handlers PR 6: PreInit Job + planner + ↓ ↓ +PR 3: Assembly handler PR 7: PreInit gate + ↓ ↓ +PR 4: /v0/node-id endpoint PR 8: Group coordination + ↓ ↓ + └──────────── PR 9: E2E ────────────┘ +``` + +--- + +## Prerequisite: Platform + +### PR 0 — S3 bucket + IAM policy (Terraform) + +**Scope:** Create the genesis artifact bucket and grant the `seid-node` ServiceAccount read+write access. 
+ +**Files:** +- `platform/terraform/aws/189176372795/us-east-2/dev/eks.tf` — new IAM statement + S3 bucket +- Equivalent for any other environments (staging, prod) as needed + +**Changes:** + +S3 bucket: +```hcl +resource "aws_s3_bucket" "genesis_artifacts" { + bucket = "sei-genesis-artifacts" + tags = local.tags +} +``` + +IAM statement added to `aws_iam_policy.seid_node`: +```hcl +{ + Sid = "GenesisArtifacts" + Effect = "Allow" + Action = ["s3:GetObject", "s3:PutObject"] + Resource = "arn:aws:s3:::sei-genesis-artifacts/*" +} +``` + +**Why this is a prerequisite:** +The current `seid-node` IAM policy only has `s3:PutObject` on `sei-node-mvp/shadow-results/*` (for result export). Genesis handlers upload to a different bucket entirely. Without this change, `upload-genesis-artifacts` and `assemble-and-upload-genesis` fail with `AccessDenied` at runtime even if the code is correct. + +**Acceptance:** +- [ ] Bucket exists in target account +- [ ] `seid-node` SA can `PutObject` and `GetObject` to `sei-genesis-artifacts/*` +- [ ] Existing snapshot download and result export permissions unchanged + +**Dependencies:** None — can land immediately +**Estimated effort:** Small (< 1 day, mostly review + apply) + +--- + +## Stream A: seictl (Sidecar) + +### PR 1 — S3ObjectClient interface + +**Scope:** New `S3ObjectClient` interface with Get+Put for small files. 
+ +**Files:** +- `seictl/sidecar/tasks/s3.go` (new) — `S3ObjectClient` interface, `S3ObjectClientFactory` type, `DefaultS3ObjectClientFactory` using `s3.NewFromConfig` +- `seictl/sidecar/tasks/s3_test.go` (new) — factory returns client satisfying interface + +**Changes:** +- Define `S3ObjectClient` with `GetObject` + `PutObject` (both from `aws-sdk-go-v2/service/s3`) +- `DefaultS3ObjectClientFactory` loads config with region, returns `*s3.Client` (natively satisfies both) +- Existing `S3GetObjectAPI` / `S3ClientFactory` in `genesis.go` are left in place (no migration) + +**Acceptance:** +- [ ] Interface compiles with `*s3.Client` as implementation +- [ ] Factory returns working client in unit test with mocked config + +**Estimated effort:** Small (< 1 day) + +--- + +### PR 2 — Genesis task handlers (generate-identity, generate-gentx, upload-genesis-artifacts) + +**Scope:** Three new task handlers + TaskType constants + client-side TaskBuilders. + +**Files:** +- `seictl/sidecar/engine/types.go` — add 3 `TaskType` constants +- `seictl/sidecar/tasks/genesis_ceremony.go` (new) — `IdentityGenerator`, `GentxGenerator`, `ArtifactUploader` structs + `Handler()` methods +- `seictl/sidecar/tasks/genesis_ceremony_test.go` (new) — unit tests per design spec +- `seictl/sidecar/client/tasks.go` — add `GenerateIdentityTask`, `GenerateGentxTask`, `UploadGenesisArtifactsTask` builders +- `seictl/cmd/serve.go` — register handlers in engine `handlers` map + +**Pre-implementation: seid CLI contract verification** + +Before writing handler code, run the exact command sequence manually against the target seid Docker image and document: +1. `seid init` — directory structure, output format, `node_key.json` location +2. `seid keys add --output json` — JSON schema for address/pubkey extraction +3. `seid add-genesis-account` — exact positional arg syntax for the target SDK version +4. `seid gentx` — flag names (e.g., `--amount` vs positional), keyring flags +5. 
`seid collect-gentxs` — expected input directory, output genesis.json location + +This becomes the test fixture and the contract the handlers code against. The seid binary comes from the node image (version-matched via `copy-seid` init container), but the handler code must know the output format at compile time. + +Key concern: `seid add-genesis-account` syntax changed in Cosmos SDK v0.47+ (moved to `genesis add-genesis-account` subcommand). Verify which pattern the target seid version uses. + +**Handler details:** + +`generate-identity`: +1. Check marker `.sei-sidecar-identity-done` → return nil if exists +2. Run `seid init --chain-id --home --moniker ` +3. Run `seid keys add --keyring-backend file --home ` +4. Parse key output, read `config/node_key.json` for nodeId +5. Write `$SEI_HOME/.genesis/identity.json` with `{nodeId, accountAddress, consensusPubkey}` +6. Write marker + +`generate-gentx`: +1. Check marker `.sei-sidecar-gentx-done` → return nil if exists +2. Load `GenesisDefaults()` from sei-config, merge with `genesisParams` overrides +3. Write patched genesis to `config/genesis.json` +4. For each `accountBalance`, run `seid add-genesis-account` +5. Run `seid gentx --chain-id --keyring-backend file` +6. Write `$SEI_HOME/.genesis/gentx.json` (copy of gentx file) +7. Write marker + +`upload-genesis-artifacts`: +1. Check marker `.sei-sidecar-upload-artifacts-done` → return nil if exists +2. Read `.genesis/identity.json` and `.genesis/gentx.json` +3. `PutObject` to `s3://///identity.json` +4. `PutObject` to `s3://///gentx.json` +5. 
Write marker + +**Acceptance:** +- [ ] `TestGenerateIdentity_HappyPath` — identity.json written, marker created +- [ ] `TestGenerateIdentity_Idempotent` — marker exists, no seid calls +- [ ] `TestGenerateGentx_HappyPath` — gentx.json written, genesis patched +- [ ] `TestGenerateGentx_Idempotent` — marker exists, no seid calls +- [ ] `TestUploadGenesisArtifacts_HappyPath` — 2 PutObject calls at correct keys +- [ ] `TestUploadGenesisArtifacts_Idempotent` — marker exists, no S3 calls +- [ ] All three TaskBuilders serialize/validate correctly + +**Dependencies:** PR 1 (S3ObjectClient for upload handler) +**Estimated effort:** Medium (2-3 days) + +--- + +### PR 3 — Assembly handler (assemble-and-upload-genesis) + +**Scope:** Group-level assembly task that runs on node 0's sidecar. + +**Files:** +- `seictl/sidecar/engine/types.go` — add `TaskAssembleAndUploadGenesis` constant +- `seictl/sidecar/tasks/genesis_ceremony.go` — add `GenesisAssembler` struct + `Handler()` +- `seictl/sidecar/tasks/genesis_ceremony_test.go` — assembly tests +- `seictl/sidecar/client/tasks.go` — add `AssembleAndUploadGenesisTask` builder with `GenesisNodeParam` +- `seictl/cmd/serve.go` — register handler + +**Handler details:** +1. Check marker `.sei-sidecar-assemble-genesis-done` → return nil if exists +2. For each node in `nodes[]` param: + - `GetObject` from `s3://///identity.json` + - `GetObject` from `s3://///gentx.json` + - Write gentx to `$SEI_HOME/config/gentx/` + - Run `seid add-genesis-account` for each validator's account +3. Run `seid collect-gentxs --home ` +4. Hash genesis.json (SHA-256) +5. `PutObject` assembled genesis to `s3:////genesis.json` +6. 
Write marker + +**Acceptance:** +- [ ] `TestAssembleAndUploadGenesis_HappyPath` — genesis.json uploaded, marker created +- [ ] `TestAssembleAndUploadGenesis_Idempotent` — marker exists, no S3/seid calls + +**Dependencies:** PR 1 (S3ObjectClient), PR 2 (identity.json format) +**Estimated effort:** Medium (2-3 days) + +--- + +### PR 4 — /v0/node-id endpoint + SidecarStatusClient extension + +**Scope:** Read-only HTTP endpoint + client interface addition. + +**Files:** +- `seictl/sidecar/server/server.go` — extend `Server` to accept `homeDir`, add `handleNodeID` +- `seictl/sidecar/server/server_test.go` — endpoint tests +- `seictl/cmd/serve.go` — pass `homeDir` to `NewServer` +- `seictl/sidecar/client/client.go` — add `GetNodeID(ctx) (string, error)` method +- `sei-k8s-controller/internal/controller/node/plan_execution.go` — add `GetNodeID` to `SidecarStatusClient` interface + +**Implementation:** +- `NewServer(addr, eng, homeDir)` — thread homeDir +- Handler reads `$homeDir/config/node_key.json`, derives node ID via CometBFT `p2p.LoadNodeKey()` (or equivalent Ed25519 pubkey → address derivation) +- Returns `{"nodeId": "abc123..."}` + +**Acceptance:** +- [ ] `TestNodeIDEndpoint_ReturnsNodeId` — node_key.json on disk, returns 200 with correct nodeId +- [ ] `TestNodeIDEndpoint_NoKeyFile` — no file, returns error +- [ ] `SidecarStatusClient` compiles with new method +- [ ] Client `GetNodeID` makes correct HTTP call + +**Dependencies:** None (can start in parallel with PR 2-3) +**Estimated effort:** Small (1 day) + +--- + +## Stream B: sei-k8s-controller + +### PR 5 — CRD types + manifests + +**Scope:** New Go types, validation, generated manifests. 
+ +**Files:** +- `api/v1alpha1/seinodegroup_types.go` — add `GenesisCeremonyConfig`, `GenesisS3Destination`, `GenesisAccount` to `SeiNodeGroupSpec`; add `AssemblyPlan`, `GenesisHash`, `GenesisS3URI` to status +- `api/v1alpha1/seinode_types.go` — add `GenesisCeremonyNodeConfig`, `AccountBalance` to `ValidatorSpec` +- `api/v1alpha1/common_types.go` — no changes (existing `GenesisS3Source` reused) +- Run `make manifests generate` + +**Acceptance:** +- [ ] `make manifests` succeeds +- [ ] `make generate` succeeds +- [ ] CRD YAML includes new fields with correct validation markers +- [ ] CEL rules unchanged on `PeerSource` + +**Dependencies:** None +**Estimated effort:** Small (1 day) + +--- + +### PR 6 — Genesis PreInit Job + planner + +**Scope:** Branch in `generatePreInitJob`, genesis PreInit plan, task builders. + +**Files:** +- `internal/controller/node/job.go` — `generateGenesisPreInitJob` (branch from `generatePreInitJob`) +- `internal/controller/node/planner_validator.go` — `isGenesisCeremonyNode`, update `Validate` +- `internal/controller/node/planner_bootstrap.go` — `buildGenesisPreInitPlan` +- `internal/controller/node/planner.go` — update `needsPreInit` for genesis +- `internal/controller/node/task_builders.go` — `generateIdentityTask`, `generateGentxTask`, `uploadGenesisArtifactsTask` +- `internal/controller/node/controller.go` — genesis branch in `reconcilePending` (defer Init plan) +- Tests: `resources_test.go`, `plan_execution_test.go` + +**Key implementation details:** + +`generateGenesisPreInitJob`: +- Init container: `copy-seid` — copies seid binary from node image to PVC at `$dataDir/bin/seid` +- Sidecar container: `seictl serve` (restartable init container) +- Main container: `sleep infinity` keepalive +- Same IRSA service account as snapshot PreInit +- No `snapshotSourceFor` call + +`reconcilePending` genesis branch: +```go +if isGenesisCeremonyNode(node) { + node.Status.PreInitPlan = buildGenesisPreInitPlan() + // Init plan deferred — peers and 
genesis source not yet available +} else { ... } +``` + +**Acceptance:** +- [ ] `TestNeedsPreInit_GenesisCeremony` — returns true +- [ ] `TestBuildGenesisPreInitPlan` — 3 tasks in correct order +- [ ] `TestReconcilePending_GenesisDefersInitPlan` — PreInit set, Init nil +- [ ] `TestGenerateGenesisPreInitJob_NoPanic` — no nil dereference from missing snapshot +- [ ] Genesis PreInit Job pod spec has copy-seid init container, sleep infinity main, sidecar + +**Dependencies:** PR 5 (CRD types) +**Estimated effort:** Medium (2-3 days) + +--- + +### PR 7 — PreInit gate (handlePreInitComplete) + +**Scope:** Unified PreInit completion handler that bypasses `isJobComplete` for genesis. + +**Files:** +- `internal/controller/node/pre_init.go` — extract `handlePreInitComplete`, refactor both `isJobComplete` checkpoints +- Tests: `plan_execution_test.go` + +**Key implementation details:** + +In `reconcilePreInitializing`, when `plan.Phase == Complete`: +- If `isGenesisCeremonyNode(node)` → route to `handlePreInitComplete` (skip `isJobComplete`) +- Otherwise → existing halt-height wait logic + +`handlePreInitComplete` for genesis: +1. If `node.Spec.Genesis.S3 == nil` → requeue 10s (waiting for group) +2. If set → `buildPostBootstrapInitPlan(node)` (static peers are on `spec.validator.peers`) +3. `cleanupPreInit` → delete Job + headless service +4. Transition to `PhaseInitializing` + +Must handle BOTH code paths in `reconcilePreInitializing` — the early return at line 26-33 AND the post-`executePlan` check at line 104-108. 
+ +**Acceptance:** +- [ ] `TestPreInitGate_WaitsForGenesisSource` — requeue when genesis.s3 nil +- [ ] `TestPreInitGate_ProceedsWhenS3Set` — builds Init plan, cleans up, transitions +- [ ] `TestPreInitGate_InitPlanHasDiscoverPeers` — static peers → discover-peers in Init plan +- [ ] Snapshot PreInit flow unchanged (no regression) + +**Dependencies:** PR 6 (PreInit Job + planner) +**Estimated effort:** Medium (1-2 days) + +--- + +### PR 8 — Group coordination + +**Scope:** Full genesis orchestration in the SeiNodeGroup controller. + +**Files:** +- `internal/controller/nodegroup/nodes.go` — update `generateSeiNode` with genesis injection +- `internal/controller/nodegroup/genesis.go` (new) — `reconcileGenesisAssembly`, `collectPeers`, `setGenesisSource`, `buildAssemblyPlan`, `executeAssemblyPlan`, `genesisS3Config` +- `internal/controller/nodegroup/controller.go` — add `BuildSidecarClientFn`, call `reconcileGenesisAssembly` in reconcile loop +- `internal/controller/node/plan_execution.go` — export `PreInitSidecarURL` +- Tests: `nodes_test.go`, `genesis_test.go` (new) + +**Key implementation details:** + +`generateSeiNode` genesis injection: +- Set `spec.chainId` from `genesis.chainId` if template omits it +- Populate full `GenesisCeremonyNodeConfig` (chainId, stakingAmount, accountBalances, genesisParams, index, artifactS3) + +`reconcileGenesisAssembly`: +1. Sort nodes by name +2. Timeout check +3. Early failure detection (PreInit Failed) +4. Wait for all PreInit complete +5. Build/resume `AssemblyPlan` in group status +6. Execute assembly plan against node 0's sidecar +7. `collectPeers` — query `/v0/node-id` on each sidecar, build `nodeId@dns:port` +8. `setGenesisSource` — patch each SeiNode with `spec.genesis.s3` + `spec.validator.peers` (static) +9. 
Record completion in group status + +**Acceptance:** +- [ ] `TestGenerateSeiNode_InjectsChainId` +- [ ] `TestGenerateSeiNode_InjectsGenesisCeremonyConfig` +- [ ] `TestGroupAssembly_WaitsForPreInit` — 2 of 4 complete → requeue +- [ ] `TestGroupAssembly_SubmitsOnAllComplete` — assembly plan created +- [ ] `TestCollectPeers_BuildsPeerList` — 4 entries in nodeId@dns:port +- [ ] `TestSetGenesisSource_StaticPeers` — genesis.s3 + peers set correctly +- [ ] `TestGroupAssembly_Timeout` — error on expiry +- [ ] `TestGroupAssembly_FailsOnPreInitFailure` — GenesisFailed condition + +**Dependencies:** PR 5 (CRD types), PR 6 (planner), PR 7 (gate). Also requires PR 4 (node-id endpoint) for the `SidecarStatusClient.GetNodeID` interface. +**Estimated effort:** Large (3-4 days) + +--- + +### PR 9 — Integration (E2E) + +**Scope:** End-to-end integration test + sei-config dependency. + +**Files:** +- `sei-config` — `GenesisDefaults()` function (may be its own PR in sei-config repo) +- Integration test in sei-k8s-controller or a dedicated test harness + +**Acceptance:** +- [ ] `TestGenesisNetworkE2E` — SeiNodeGroup replicas=4, all reach Running, produce blocks +- [ ] `TestGenesisNetworkDeletion` — clean deletion of all resources + +**Dependencies:** All prior PRs merged +**Estimated effort:** Medium (2-3 days) + +--- + +## Dependency Graph + +``` +PR 0: Terraform (land first — blocks E2E testing) + +sei-config: GenesisDefaults() (can start immediately, needed by PR 2) + +Stream A: PR 1 → PR 2 → PR 3 + \ + PR 4 (parallel) ────→ PR 9 (E2E, requires PR 0 applied) + / +Stream B: PR 5 → PR 6 → PR 7 → PR 8 +``` + +**Critical path:** PR 0 (Terraform) + PR 5 → PR 6 → PR 7 → PR 8 → PR 9 +**Parallelism:** Stream A and Stream B run concurrently. PR 0 and PR 4 can start immediately. 
+ +--- + +## Estimated Timeline + +| Week | Stream A (seictl) | Stream B (controller) | +|------|-------------------|----------------------| +| 1 | PR 1 (S3ObjectClient) + PR 2 (task handlers) | PR 5 (CRD types) + PR 6 (PreInit Job) | +| 2 | PR 3 (assembly handler) + PR 4 (node-id endpoint) | PR 7 (PreInit gate) + PR 8 (group coordination) | +| 3 | — | PR 9 (E2E integration) | + +**Total:** ~3 weeks with two parallel contributors, ~4-5 weeks single-threaded. + +--- + +## Risk Register + +| Risk | Impact | Likelihood | Mitigation | +|------|--------|------------|-----------| +| `seid` CLI syntax differs from expected contract | Handler fails to parse output or execute commands | Medium | **Pre-implementation CLI verification** (added to PR 2). Run exact commands against target seid image. Use `--output json` where available. Key concern: `add-genesis-account` syntax changed in Cosmos SDK v0.47+ (`genesis add-genesis-account` subcommand). | +| IAM policy missing `s3:PutObject` on genesis bucket | `upload-genesis-artifacts` and `assemble-and-upload-genesis` fail with AccessDenied | **Confirmed** — current policy lacks this | **PR 0 (Terraform)** creates `sei-genesis-artifacts` bucket + IAM grant. Must land before E2E testing. 
| +| PVC RWO handoff timing on cleanup | StatefulSet pod can't mount PVC after PreInit Job deletion | Low | `cleanupPreInit` already polls for Job full deletion before proceeding; no change needed | +| Large genesis (many validators) exceeds ceremony timeout | Genesis ceremony fails | Low | `maxCeremonyDuration` is configurable; default 15m is generous for ≤50 validators | +| sei-config `GenesisDefaults()` not ready when PR 2 starts | PR 2 blocked on external dependency | Low | Use inline defaults initially; swap to sei-config when ready (the function is small) | +| Pod identity namespace binding (`default` only) | Genesis nodes in non-default namespace get no AWS credentials | Low | Current setup uses `default` for all workloads; document the constraint | diff --git a/.tide/network-deployments.md b/.tide/network-deployments.md new file mode 100644 index 0000000..f5635af --- /dev/null +++ b/.tide/network-deployments.md @@ -0,0 +1,799 @@ +# Component: Genesis Network Configuration on SeiNodeGroup + +**Date:** 2026-03-22 +**Status:** Implementation-Ready (cross-review round 4 resolved) +**Last revised:** 2026-03-25 + +--- + +## Owner + +Platform / Infrastructure (joint: Kubernetes Specialist + Platform Engineer) + +## Phase + +Pre-Tide — Operational prerequisite. Spinning up isolated validator networks is required for image qualification, load testing, and (in a future iteration) upgrade rehearsal. + +## Purpose + +Add an optional `genesis` field to `SeiNodeGroupSpec` that bootstraps a self-contained Sei validator network. The design preserves the task engine as a pure sequencer — tasks succeed or fail, data flows through the filesystem and S3, and the controller sequences execution without routing data. + +**Scope:** Genesis ceremony and validator network bootstrap only. + +**Architecture principles (this iteration):** +1. **Engine stays stateless.** Tasks return success/fail only. No `TaskResult.Result` field. No inter-task output dependencies. +2. 
**State lives on PVC + S3.** Per-node genesis artifacts (identity, gentx) flow through the PVC and are uploaded to S3. Assembly reads from S3. Final genesis is distributed via S3. +3. **Controller is a sequencer.** It builds plans, submits tasks, and polls for completion. The one exception: after assembly, the group controller queries each node's sidecar `/v0/node-id` endpoint to build the static peer list. This is a read-only metadata query, not task output routing. + +--- + +## Dependencies + +- **seid binary** (accessible from sidecar in PreInit Job — via init container copy) +- **seictl sidecar** — three new task types: `generate-identity`, `generate-gentx`, `upload-genesis-artifacts`. One group-level task: `assemble-and-upload-genesis`. One new HTTP endpoint: `GET /v0/node-id`. No engine changes required. +- **sei-config** — `GenesisDefaults()` function for test-network genesis parameter defaults +- **S3** — artifact store for cross-node data flow; genesis distribution via existing `configure-genesis` S3 path +- **SeiNode controller** — PreInit gate for genesis nodes; Init plan deferred to gate-clear time +- **SeiNodeGroup controller** — thin coordination: observe PreInit completion, trigger assembly on node 0, collect node IDs, set genesis source + static peers on each SeiNode + +**Explicit exclusions:** +- `TaskResult.Result` / typed task results — separate design deliverable +- `TaskExecutionInterface` — separate design deliverable (informs but does not block genesis) +- `UpgradePolicy` — separate design iteration +- EFS / ReadWriteMany storage — not needed (S3 is the cross-node channel) +- PreInit/Init phase merge — deferred until TaskExecutionInterface provides the abstraction to make the phase boundary invisible (see Deferred section) + +--- + +## Interface Specification + +### CRD Changes: `SeiNodeGroupSpec` + +```go +type SeiNodeGroupSpec struct { + Replicas int32 `json:"replicas"` + Template SeiNodeTemplate `json:"template"` + DeletionPolicy 
DeletionPolicy `json:"deletionPolicy,omitempty"` + Networking *NetworkingConfig `json:"networking,omitempty"` + Monitoring *MonitoringConfig `json:"monitoring,omitempty"` + + // Genesis enables bootstrapping a new validator network from scratch. + // Requires template.spec.validator to be set. + // +optional + Genesis *GenesisCeremonyConfig `json:"genesis,omitempty"` +} +``` + +### `GenesisCeremonyConfig` + +```go +type GenesisCeremonyConfig struct { + // ChainID for the new network. + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=64 + // +kubebuilder:validation:Pattern=`^[a-z0-9][a-z0-9-]*[a-z0-9]$` + ChainID string `json:"chainId"` + + // StakingAmount is the amount each validator self-delegates in its gentx. + // Default: "10000000usei" + // +optional + StakingAmount string `json:"stakingAmount,omitempty"` + + // AccountBalance is the initial balance for each validator's genesis account. + // Default: "1000000000000000000000usei,1000000000000000000000uusdc,1000000000000000000000uatom" + // +optional + AccountBalance string `json:"accountBalance,omitempty"` + + // Accounts adds non-validator genesis accounts (e.g. for load test funding). + // +optional + Accounts []GenesisAccount `json:"accounts,omitempty"` + + // Overrides is a flat map of dotted key paths merged on top of sei-config's + // GenesisDefaults(). Applied BEFORE gentx generation. + // +optional + Overrides map[string]string `json:"overrides,omitempty"` + + // GenesisS3 configures where genesis artifacts are stored. + // When omitted, inferred: bucket = "sei-genesis-artifacts", + // prefix = "//", region from PlatformConfig. + // +optional + GenesisS3 *GenesisS3Destination `json:"genesisS3,omitempty"` + + // MaxCeremonyDuration is the maximum time from group creation to genesis + // assembly completion. Default: "15m". 
+ // +optional + MaxCeremonyDuration *metav1.Duration `json:"maxCeremonyDuration,omitempty"` +} + +type GenesisS3Destination struct { + Bucket string `json:"bucket"` + Prefix string `json:"prefix,omitempty"` + Region string `json:"region"` +} + +type GenesisAccount struct { + Address string `json:"address"` + Balance string `json:"balance"` +} +``` + +### CRD Changes: `ValidatorSpec` + +```go +type ValidatorSpec struct { + // ... existing fields (Peers, Snapshot, etc.) ... + + // GenesisCeremony indicates this validator participates in a group genesis. + // +optional + GenesisCeremony *GenesisCeremonyNodeConfig `json:"genesisCeremony,omitempty"` +} + +type GenesisCeremonyNodeConfig struct { + ChainID string `json:"chainId"` + StakingAmount string `json:"stakingAmount"` + AccountBalances []AccountBalance `json:"accountBalances"` + GenesisParams string `json:"genesisParams,omitempty"` + Index int32 `json:"index"` + + // ArtifactS3 is the S3 location where this node uploads its genesis artifacts. + ArtifactS3 GenesisS3Destination `json:"artifactS3"` +} + +type AccountBalance struct { + Address string `json:"address"` + Amount string `json:"amount"` +} +``` + +### CRD Changes: `PeerSource` + +No new peer source types. Genesis peers are pushed by the group controller as `StaticPeerSource` entries in `nodeId@host:port` format. The existing CEL validation rule is unchanged: + +```go +// +kubebuilder:validation:XValidation:rule="(has(self.ec2Tags) ? 1 : 0) + (has(self.static) ? 1 : 0) == 1",message="exactly one of ec2Tags or static must be set" +``` + +### `SeiNodeGroupStatus` Additions + +```go +type SeiNodeGroupStatus struct { + // ... existing fields ... + + // AssemblyPlan tracks the genesis assembly task on index 0's sidecar. + // +optional + AssemblyPlan *TaskPlan `json:"assemblyPlan,omitempty"` + + // GenesisHash is the SHA-256 hex digest of the assembled genesis.json. 
+	// +optional
+	GenesisHash string `json:"genesisHash,omitempty"`
+
+	// GenesisS3URI is the S3 URI of the uploaded genesis.
+	// +optional
+	GenesisS3URI string `json:"genesisS3URI,omitempty"`
+}
+```
+
+### New Sidecar Task Types
+
+Four new task types. No engine changes — all use the existing `func(ctx, params) error` handler signature.
+
+| Task Type | Params | PVC Side Effects | S3 Side Effects |
+|---|---|---|---|
+| `generate-identity` | `chainId`, `moniker`, `seidPath` | Writes key material, `$SEI_HOME/.genesis/identity.json` | None |
+| `generate-gentx` | `stakingAmount`, `chainId`, `accountBalances[]`, `genesisParams`, `seidPath` | Writes gentx, `$SEI_HOME/.genesis/gentx.json` | None |
+| `upload-genesis-artifacts` | `s3Bucket`, `s3Prefix`, `s3Region`, `nodeName` | Reads `.genesis/identity.json`, `.genesis/gentx.json` | Uploads to `s3://<bucket>/<prefix>/<nodeName>/` |
+| `assemble-and-upload-genesis` | `s3Bucket`, `s3Prefix`, `s3Region`, `chainId`, `seidPath`, `nodes[]` (see `GenesisNodeParam` below) | Writes assembled genesis locally for seid | Reads `<nodeName>/identity.json` + `<nodeName>/gentx.json` per node, uploads `genesis.json` |
+
+All tasks return `error` only. Data flows through PVC files and S3. The engine never routes data between tasks.
+
+**`GenesisNodeParam`** (wire format for `nodes[]` in `assemble-and-upload-genesis`):
+```go
+type GenesisNodeParam struct {
+	Name string `json:"name"` // S3 key segment: <prefix>/<name>/identity.json
+}
+```
+
+No S3 ListObjects required — node names are passed as task params, making key construction deterministic.
+
+**`identity.json`** (written by `generate-identity`, read by `assemble-and-upload-genesis`):
+```json
+{"nodeId": "abc123...", "accountAddress": "sei1...", "consensusPubkey": "..."}
+```
+
+### New Sidecar HTTP Endpoint: `GET /v0/node-id`
+
+A read-only metadata endpoint on the sidecar HTTP server. Returns the node's Tendermint node ID by reading `$SEI_HOME/config/node_key.json` from the PVC. Available after `generate-identity` has run.
+ +```json +{"nodeId": "abc123def456..."} +``` + +This is called by the group controller after assembly — not by peer sidecars. The sidecar HTTP server is already running during PreInit (it's how the controller submits tasks). + +**Implementation notes:** +- `Server` currently holds `(addr, engine, mux)` but not `homeDir`. The constructor must be extended to accept `homeDir` (or an `IdentityProvider` interface) so the endpoint can locate `node_key.json`. +- CometBFT's `node_key.json` stores a `priv_key` object, not a literal `nodeId` field. The node ID is **derived** from the Ed25519 public key (same derivation as `seid tendermint show-node-id`). Use CometBFT's `p2p.LoadNodeKey()` or equivalent to extract the ID. +- The derived ID must match the `nodeId` field in `identity.json` (both sourced from the same key material written by `generate-identity`). +- Invariant: sidecar `homeDir` == task `homeDir` for PreInit (same PVC mount, same `--home` / `SEI_HOME`). + +**Controller client:** `SidecarStatusClient` needs a new `GetNodeID(ctx) (string, error)` method. The concrete `SidecarClient` implements it as `GET /v0/node-id` with JSON parsing. All mocks/fakes in tests must be updated. + +### sei-config: `GenesisDefaults()` + +Unchanged from prior revision — test-network genesis parameter defaults in sei-config. + +--- + +## State Model + +``` +┌──────────────────────────────────────────────────────────────────────────────┐ +│ Data flows through PVC (local) and S3 (cross-node). Engine is a sequencer. │ +│ │ +│ Each SeiNode PreInit plan (3 tasks, all filesystem ops): │ +│ ┌──────────────────────────────────────────────────────────────────┐ │ +│ │ generate-identity PVC: keys, identity.json │ │ +│ │ ↓ │ │ +│ │ generate-gentx PVC: gentx.json, patched genesis │ │ +│ │ ↓ │ │ +│ │ upload-genesis-artifacts PVC→S3: identity.json, gentx.json │ │ +│ └──────────────────────────────────────────────────────────────────┘ │ +│ │ +│ SeiNodeGroup coordination (reconcile loop): │ +│ 1. 
Create SeiNodes → PreInit plans run independently │ +│ 2. Watch: all SeiNodes PreInit plans complete? │ +│ 3. Submit assemble-and-upload-genesis to node 0's sidecar │ +│ S3→PVC→seid→S3: reads all artifacts, assembles, uploads genesis.json │ +│ 4. Query GET /v0/node-id on each node's sidecar → build peer list │ +│ 5. Set spec.genesis.s3 + spec.validator.peers (static) on each SeiNode │ +│ → PreInit gate clears │ +│ │ +│ Each SeiNode Init plan (standard — no new peer source types): │ +│ ┌──────────────────────────────────────────────────────────────────┐ │ +│ │ configure-genesis S3→PVC: downloads assembled genesis.json │ │ +│ │ ↓ │ │ +│ │ config-apply Writes node config via sei-config │ │ +│ │ ↓ │ │ +│ │ discover-peers Static peers: writes persistent_peers │ │ +│ │ (already in nodeId@host:port format) │ │ +│ │ ↓ │ │ +│ │ config-validate → mark-ready │ │ +│ └──────────────────────────────────────────────────────────────────┘ │ +│ │ +│ Controller reads nodeIds via sidecar HTTP (not task output). │ +│ Peers flow through existing StaticPeerSource. No new sidecar peer logic. │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Internal Design + +### 1. Validator Planner: Genesis PreInit + +```go +func needsPreInit(node *seiv1alpha1.SeiNode) bool { + // ... existing snapshot check ... 
+ if node.Spec.Validator != nil && node.Spec.Validator.GenesisCeremony != nil { + return true + } + return false +} + +func buildGenesisPreInitPlan() *seiv1alpha1.TaskPlan { + return &seiv1alpha1.TaskPlan{ + Phase: seiv1alpha1.TaskPlanActive, + Tasks: []seiv1alpha1.PlannedTask{ + {Type: taskGenerateIdentity, Status: seiv1alpha1.PlannedTaskPending}, + {Type: taskGenerateGentx, Status: seiv1alpha1.PlannedTaskPending}, + {Type: taskUploadGenesisArtifacts, Status: seiv1alpha1.PlannedTaskPending}, + }, + } +} +``` + +In `reconcilePending`, Init plan is deferred for genesis nodes (peers and genesis source not yet available): + +```go +if isGenesisCeremonyNode(node) { + node.Status.PreInitPlan = buildGenesisPreInitPlan() +} else { + node.Status.PreInitPlan = buildPreInitPlan(node, planner) + if needsPreInit(node) { + node.Status.InitPlan = buildPostBootstrapInitPlan(node) + } else { + node.Status.InitPlan = planner.BuildPlan(node) + } +} +``` + +### 2. Genesis PreInit Job + +The existing `generatePreInitJob` / `buildPreInitPodSpec` assumes a snapshot source (dereferences `snap.S3.TargetHeight`). For genesis nodes, `snapshotSourceFor` returns nil — this would nil-panic. Genesis nodes branch to `generateGenesisPreInitJob` which builds a different pod spec: + +```go +func generatePreInitJob(node *seiv1alpha1.SeiNode, platform PlatformConfig) *batchv1.Job { + if isGenesisCeremonyNode(node) { + return generateGenesisPreInitJob(node, platform) + } + // ... existing snapshot path (unchanged) ... +} +``` + +Genesis PreInit Job differences from snapshot PreInit: +- **No `snapshotSourceFor` call** — genesis has no snapshot +- **Init container:** `copy-seid` copies binary to PVC (genesis tasks need seid) +- **Main container:** `sleep infinity` keepalive (not `seid start --halt-height`) +- **Same IRSA service account** — S3 upload needs credentials +- Pod stays alive for group-submitted assembly task on node 0 + +### 3. 
Genesis Task Builders + +```go +func generateIdentityTask(node *seiv1alpha1.SeiNode) sidecar.TaskBuilder { + gc := node.Spec.Validator.GenesisCeremony + return sidecar.GenerateIdentityTask{ + ChainID: gc.ChainID, + Moniker: node.Name, + SeidPath: filepath.Join(dataDir, "bin", "seid"), + } +} + +func generateGentxTask(node *seiv1alpha1.SeiNode) sidecar.TaskBuilder { + gc := node.Spec.Validator.GenesisCeremony + return sidecar.GenerateGentxTask{ + ChainID: gc.ChainID, + StakingAmount: gc.StakingAmount, + AccountBalances: gc.AccountBalances, + GenesisParams: gc.GenesisParams, + SeidPath: filepath.Join(dataDir, "bin", "seid"), + } +} + +func uploadGenesisArtifactsTask(node *seiv1alpha1.SeiNode) sidecar.TaskBuilder { + gc := node.Spec.Validator.GenesisCeremony + return sidecar.UploadGenesisArtifactsTask{ + S3Bucket: gc.ArtifactS3.Bucket, + S3Prefix: gc.ArtifactS3.Prefix, + S3Region: gc.ArtifactS3.Region, + NodeName: node.Name, + } +} +``` + +### 4. PreInit Gate for Genesis Nodes + +The existing `reconcilePreInitializing` has two places where `plan.Phase == Complete` leads to `isJobComplete(job)` checks (lines 26-33 and 104-108 in `pre_init.go`). For genesis nodes with `sleep infinity`, `isJobComplete` is permanently false — this would deadlock the node in a requeue loop with a misleading "waiting for halt-height" log. + +`handlePreInitComplete` replaces BOTH inline completion guards for genesis nodes. 
It must be the only exit path when the plan is complete: + +```go +func (r *SeiNodeReconciler) handlePreInitComplete(ctx context.Context, node *seiv1alpha1.SeiNode, planner NodePlanner) (ctrl.Result, error) { + if isGenesisCeremonyNode(node) { + if node.Spec.Genesis.S3 == nil { + log.FromContext(ctx).Info("genesis PreInit complete, waiting for group to set genesis source") + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + patch := client.MergeFrom(node.DeepCopy()) + node.Status.InitPlan = buildPostBootstrapInitPlan(node) + if err := r.Status().Patch(ctx, node, patch); err != nil { + return ctrl.Result{}, fmt.Errorf("building genesis Init plan: %w", err) + } + } else { + // Snapshot: wait for halt-height (existing) + // ... + } + + if err := r.cleanupPreInit(ctx, node); err != nil { + return ctrl.Result{}, fmt.Errorf("cleaning up pre-init: %w", err) + } + return r.transitionPhase(ctx, node, seiv1alpha1.PhaseInitializing) +} +``` + +### 5. SeiNodeGroup Genesis Coordination + +Thin coordination layer. After assembly, the group controller collects node IDs via the sidecar HTTP endpoint and pushes static peers to each SeiNode. + +```go +type SeiNodeGroupReconciler struct { + client.Client + Scheme *runtime.Scheme + Recorder record.EventRecorder + ControllerSA string + + BuildSidecarClientFn func(baseURL string) SidecarStatusClient +} +``` + +**Genesis injection in `generateSeiNode`:** Beyond ChainID, the full `GenesisCeremonyNodeConfig` must be injected. The current `generateSeiNode` does a plain `DeepCopy` of the template — no genesis logic exists. 
+ +```go +if group.Spec.Genesis != nil { + if spec.ChainID == "" { + spec.ChainID = group.Spec.Genesis.ChainID + } + if spec.Validator == nil { + spec.Validator = &seiv1alpha1.ValidatorSpec{} + } + s3Cfg := r.genesisS3Config(group) + spec.Validator.GenesisCeremony = &seiv1alpha1.GenesisCeremonyNodeConfig{ + ChainID: group.Spec.Genesis.ChainID, + StakingAmount: valueOrDefault(group.Spec.Genesis.StakingAmount, "10000000usei"), + AccountBalances: buildAccountBalances(group.Spec.Genesis), + GenesisParams: buildGenesisParamsJSON(group.Spec.Genesis.Overrides), + Index: int32(ordinal), + ArtifactS3: s3Cfg, + } +} +``` + +**Reconcile:** + +```go +func (r *SeiNodeGroupReconciler) reconcileGenesisAssembly(ctx context.Context, group *SeiNodeGroup) (bool, ctrl.Result, error) { + nodes, err := r.listChildSeiNodes(ctx, group) + if err != nil { + return false, ctrl.Result{}, err + } + + // Timeout check + if group.Spec.Genesis.MaxCeremonyDuration != nil { + if time.Since(group.CreationTimestamp.Time) > group.Spec.Genesis.MaxCeremonyDuration.Duration { + return false, ctrl.Result{}, fmt.Errorf("genesis ceremony timed out") + } + } + + // Early failure detection — surface errors before timeout + for _, node := range nodes { + if node.Status.PreInitPlan != nil && node.Status.PreInitPlan.Phase == seiv1alpha1.TaskPlanFailed { + r.Recorder.Eventf(group, corev1.EventTypeWarning, "GenesisPreInitFailed", + "Node %s PreInit failed", node.Name) + meta.SetStatusCondition(&group.Status.Conditions, metav1.Condition{ + Type: "GenesisFailed", Status: metav1.ConditionTrue, + Reason: "PreInitFailed", Message: fmt.Sprintf("node %s PreInit failed", node.Name), + }) + return false, ctrl.Result{}, fmt.Errorf("node %s PreInit failed", node.Name) + } + } + + // Wait for ALL PreInit plans to complete (all artifacts uploaded to S3) + for _, node := range nodes { + if node.Status.PreInitPlan == nil || node.Status.PreInitPlan.Phase != seiv1alpha1.TaskPlanComplete { + return false, ctrl.Result{RequeueAfter: 
10 * time.Second}, nil
+		}
+	}
+
+	// Sort by name so nodes[0] is deterministically the index-0 node
+	// ("<group>-0" sorts first lexicographically for any replica count).
+	// listChildSeiNodes returns in API list order which is not guaranteed.
+	sort.Slice(nodes, func(i, j int) bool { return nodes[i].Name < nodes[j].Name })
+
+	// Build or resume the assembly plan (targets node 0's PreInit sidecar).
+	// Capture the patch base BEFORE mutating status — taking MergeFrom after
+	// the assignment would yield an empty patch and the plan would never be
+	// persisted (breaking resume-on-restart).
+	if group.Status.AssemblyPlan == nil {
+		patch := client.MergeFrom(group.DeepCopy())
+		group.Status.AssemblyPlan = buildAssemblyPlan()
+		if err := r.Status().Patch(ctx, group, patch); err != nil {
+			return false, ctrl.Result{}, err
+		}
+	}
+
+	// Execute assembly plan against node 0's sidecar
+	sc := r.BuildSidecarClientFn(PreInitSidecarURL(&nodes[0]))
+	result, err := r.executeAssemblyPlan(ctx, group, sc, nodes)
+	if err != nil {
+		return false, result, err
+	}
+	if group.Status.AssemblyPlan.Phase != seiv1alpha1.TaskPlanComplete {
+		return false, result, nil
+	}
+
+	// Assembly complete — S3 now has genesis.json.
+	// Collect node IDs from each node's sidecar to build the peer list.
+	peers, err := r.collectPeers(ctx, group, nodes)
+	if err != nil {
+		return false, ctrl.Result{}, fmt.Errorf("collecting peer node IDs: %w", err)
+	}
+
+	// Set each SeiNode's genesis source + static peers. This unblocks the PreInit gate.
+ s3Cfg := r.genesisS3Config(group) + genesisURI := fmt.Sprintf("s3://%s/%sgenesis.json", s3Cfg.Bucket, s3Cfg.Prefix) + + for i := range nodes { + if err := r.setGenesisSource(ctx, &nodes[i], genesisURI, s3Cfg.Region, peers); err != nil { + return false, ctrl.Result{}, err + } + } + + // Record completion + patch := client.MergeFrom(group.DeepCopy()) + group.Status.GenesisS3URI = genesisURI + if err := r.Status().Patch(ctx, group, patch); err != nil { + return false, ctrl.Result{}, err + } + + return true, ctrl.Result{}, nil +} +``` + +**`collectPeers`** queries each node's sidecar for its Tendermint node ID and builds the peer list: + +```go +func (r *SeiNodeGroupReconciler) collectPeers(ctx context.Context, group *SeiNodeGroup, nodes []seiv1alpha1.SeiNode) ([]string, error) { + peers := make([]string, 0, len(nodes)) + for i := range nodes { + sc := r.BuildSidecarClientFn(PreInitSidecarURL(&nodes[i])) + nodeID, err := sc.GetNodeID(ctx) + if err != nil { + return nil, fmt.Errorf("getting node ID for %s: %w", nodes[i].Name, err) + } + dns := fmt.Sprintf("%s-0.%s.%s.svc.cluster.local:%d", + nodes[i].Name, nodes[i].Name, nodes[i].Namespace, p2pPort) + peers = append(peers, fmt.Sprintf("%s@%s", nodeID, dns)) + } + return peers, nil +} +``` + +**`setGenesisSource`** sets two fields using existing types — no new CRD peer source variants: + +```go +func (r *SeiNodeGroupReconciler) setGenesisSource(ctx context.Context, node *seiv1alpha1.SeiNode, genesisURI, region string, peers []string) error { + patch := client.MergeFrom(node.DeepCopy()) + node.Spec.Genesis = &seiv1alpha1.GenesisConfiguration{ + S3: &seiv1alpha1.GenesisS3Source{URI: genesisURI, Region: region}, + } + node.Spec.Validator.Peers = []seiv1alpha1.PeerSource{ + {Static: &seiv1alpha1.StaticPeerSource{Addresses: peers}}, + } + return r.Patch(ctx, node, patch) +} +``` + +The Init plan's `discover-peers` task receives these static peers through the existing `discoverPeersTask` builder — zero changes to the 
sidecar's peer discovery handler.
+
+### 6. S3 Client Interface for Genesis
+
+Genesis tasks need both read (GetObject) and write (PutObject) on small files (< 10KB). The existing `S3GetObjectAPI` (in `tasks/genesis.go`) handles reads; the `UploaderFactory` / `transfermanager` is designed for large multi-part uploads and is overkill for genesis artifacts.
+
+New interface for genesis use:
+
+```go
+type S3ObjectClient interface {
+	GetObject(ctx context.Context, params *s3.GetObjectInput, optFns ...func(*s3.Options)) (*s3.GetObjectOutput, error)
+	PutObject(ctx context.Context, params *s3.PutObjectInput, optFns ...func(*s3.Options)) (*s3.PutObjectOutput, error)
+}
+
+type S3ObjectClientFactory func(ctx context.Context, region string) (S3ObjectClient, error)
+```
+
+`*s3.Client` natively satisfies this interface. The existing `DefaultS3ClientFactory` returns `s3.NewFromConfig(cfg)` — just widen the return type. Reserve `transfermanager` for snapshot-sized uploads. All four genesis handlers receive `S3ObjectClientFactory` via constructor injection.
+
+### 7. S3 Artifact Convention
+
+All genesis artifacts flow through a single S3 prefix:
+
+```
+s3://<bucket>/<prefix>/
+  ├── <node-0-name>/
+  │   ├── identity.json
+  │   └── gentx.json
+  ├── <node-1-name>/
+  │   ├── identity.json
+  │   └── gentx.json
+  └── genesis.json   (assembled, uploaded by assemble-and-upload-genesis)
+```
+
+Default convention: `bucket = "sei-genesis-artifacts"`, `prefix = "<namespace>/<groupName>/"`. Override via `genesis.genesisS3`. Genesis artifacts are kept separate from snapshot buckets — different lifecycle, access patterns, and IAM scoping.
+ +--- + +## Error Handling + +| Error Case | Detection | Response | +|---|---|---| +| `template.spec.validator` not set with genesis | CEL validation | API server rejects | +| PreInit task fails | SeiNode.status.preInitPlan.phase = Failed | Group emits event, sets condition `GenesisFailed` | +| S3 artifact upload fails | `upload-genesis-artifacts` task error | SeiNode PreInit plan fails, group detects | +| Assembly task fails | AssemblyPlan task error | Group emits event, sets condition `GenesisFailed` | +| Node ID collection fails | `collectPeers` HTTP error | Group retries on next reconcile (sidecars still alive) | +| Ceremony timeout | `time.Since(creation) > maxDuration` | Group sets condition `GenesisCeremonyTimedOut` | +| Controller restart during assembly | AssemblyPlan persisted in status | Resumes poll from current task state | + +### Marker File Convention + +Each handler writes a marker file on completion for idempotency. Marker names MUST NOT collide with existing markers (particularly `.sei-sidecar-genesis-done` used by `configure-genesis` in Init): + +| Handler | Marker | +|---|---| +| `generate-identity` | `.sei-sidecar-identity-done` | +| `generate-gentx` | `.sei-sidecar-gentx-done` | +| `upload-genesis-artifacts` | `.sei-sidecar-upload-artifacts-done` | +| `assemble-and-upload-genesis` | `.sei-sidecar-assemble-genesis-done` | + +PreInit and Init share the same PVC — distinct prefixes prevent cross-phase collisions. + +### Invariant: `ensureSeiNode` Spec Sync + +`ensureSeiNode` syncs Labels, Annotations, Image, Entrypoint, Sidecar, PodLabels only. It MUST NOT sync fields under `spec.genesis`, `spec.validator.peers`, or `spec.validator.genesisCeremony` for genesis-managed nodes — these are set by `setGenesisSource` on a separate code path. 
+ +--- + +## Test Specification + +### Unit Tests — Sidecar (seictl) + +| Test | Setup | Action | Expected | +|---|---|---|---| +| `TestGenerateIdentity_HappyPath` | Temp SEI_HOME, seid at seidPath | Handle | identity.json written, marker file created | +| `TestGenerateIdentity_Idempotent` | SEI_HOME with marker | Handle | No seid commands, no error | +| `TestGenerateGentx_HappyPath` | SEI_HOME with identity | Handle with 2 accounts | gentx.json written, genesis patched | +| `TestGenerateGentx_Idempotent` | SEI_HOME with marker | Handle | No seid commands, no error | +| `TestUploadGenesisArtifacts_HappyPath` | identity.json + gentx.json on PVC, mock S3 | Handle | Two S3 PutObject calls at correct keys | +| `TestUploadGenesisArtifacts_Idempotent` | Marker file exists | Handle | No S3 calls | +| `TestAssembleAndUploadGenesis_HappyPath` | Mock S3 with 4 nodes' artifacts | Handle | genesis.json uploaded, marker created | +| `TestAssembleAndUploadGenesis_Idempotent` | Marker file exists | Handle | No S3 calls, no error | +| `TestNodeIDEndpoint_ReturnsNodeId` | node_key.json on PVC | GET /v0/node-id | 200 with nodeId | +| `TestNodeIDEndpoint_NoKeyFile` | Empty PVC | GET /v0/node-id | 404 or 500 | + +### Unit Tests — Controller (sei-k8s-controller) + +| Test | Setup | Action | Expected | +|---|---|---|---| +| `TestNeedsPreInit_GenesisCeremony` | SeiNode with genesisCeremony | needsPreInit | true | +| `TestBuildGenesisPreInitPlan` | Genesis node | buildGenesisPreInitPlan | 3 tasks: generate-identity, generate-gentx, upload-genesis-artifacts | +| `TestReconcilePending_GenesisDefersInitPlan` | Genesis node | reconcilePending | PreInit plan set, Init plan nil | +| `TestPreInitGate_WaitsForGenesisSource` | Genesis node, PreInit complete, genesis.s3 nil | handlePreInitComplete | Requeue 10s | +| `TestPreInitGate_ProceedsWhenS3Set` | Genesis node, PreInit complete, genesis.s3 set | handlePreInitComplete | Builds Init plan, cleans up, transitions | +| 
`TestPreInitGate_InitPlanHasDiscoverPeers` | Genesis node with static peers set | handlePreInitComplete | Init plan includes discover-peers with static source | +| `TestGenerateSeiNode_InjectsChainId` | Group with genesis, template without chainId | generateSeiNode | chainId injected | +| `TestGenerateSeiNode_InjectsGenesisCeremonyConfig` | Group with genesis | generateSeiNode | Full GenesisCeremonyNodeConfig populated | +| `TestGroupAssembly_WaitsForPreInit` | 4 nodes, 2 complete | reconcileGenesisAssembly | Requeue | +| `TestGroupAssembly_SubmitsOnAllComplete` | 4 nodes all complete, mock sidecar | reconcileGenesisAssembly | Assembly plan created, task submitted | +| `TestCollectPeers_BuildsPeerList` | 4 mock sidecars returning nodeIds | collectPeers | 4 entries in nodeId@dns:port format | +| `TestSetGenesisSource_StaticPeers` | SeiNode | setGenesisSource | genesis.s3 set, peers set as StaticPeerSource | +| `TestGroupAssembly_Timeout` | Created 20min ago, max 15m | reconcileGenesisAssembly | Error | +| `TestGroupAssembly_FailsOnPreInitFailure` | 1 node with PreInit Failed | reconcileGenesisAssembly | GenesisFailed condition, error | + +### Integration Tests + +| Test | Setup | Action | Expected | +|---|---|---|---| +| `TestGenesisNetworkE2E` | SeiNodeGroup replicas=4 | Wait for Running | 4 validators producing blocks | +| `TestGenesisNetworkDeletion` | Running network | Delete group | All resources cleaned up | + +--- + +## Deployment + +### Implementation Order + +1. **sei-config: `GenesisDefaults()`** — Test-network genesis parameter defaults. +2. **seictl: `S3ObjectClient` interface** — Combined Get+Put for small files. Widen `DefaultS3ClientFactory` return type. +3. **seictl: `generate-identity` handler** — Runs seid init + keys add, writes identity.json, marker file. Standard `func(ctx, params) error`. +4. **seictl: `generate-gentx` handler** — Patches genesis, adds accounts, creates gentx, writes gentx.json, marker file. +5. 
**seictl: `upload-genesis-artifacts` handler** — Reads PVC files, uploads to S3 via `S3ObjectClient.PutObject`. +6. **seictl: `assemble-and-upload-genesis` handler** — Downloads all artifacts from S3 via `S3ObjectClient.GetObject`, runs seid collect-gentxs, uploads genesis.json via PutObject. +7. **seictl: `GET /v0/node-id` endpoint** — Thread `homeDir` into `Server`, derive node ID from `node_key.json` via CometBFT key loading, return as JSON. Add `GetNodeID` to `SidecarStatusClient` interface + concrete client implementation. +8. **sei-k8s-controller: CRD types** — `GenesisCeremonyConfig`, `GenesisCeremonyNodeConfig`, status fields. Run `make manifests`. +9. **sei-k8s-controller: Export `PreInitSidecarURL`** — From `node` package. +10. **sei-k8s-controller: Genesis PreInit Job** — `generateGenesisPreInitJob` branch in `generatePreInitJob` (avoids `snapshotSourceFor` nil-panic). +11. **sei-k8s-controller: Validator planner** — `needsPreInit`, `buildGenesisPreInitPlan`, task builders. +12. **sei-k8s-controller: PreInit gate** — `handlePreInitComplete` replaces both `isJobComplete` checkpoints for genesis nodes, deferred Init plan, guard both code paths. +13. **sei-k8s-controller: Group coordination** — `generateSeiNode` full genesis injection, `reconcileGenesisAssembly` with early failure detection, `AssemblyPlan`, `collectPeers`, `setGenesisSource`. + +No seictl engine changes. No `TaskResult.Result` migration. No new peer source types. Steps 1-7 (seictl) and 8-13 (controller) can proceed in parallel. + +--- + +## Decision Log + +| # | Decision | Rationale | Reversibility | +|---|---|---|---| +| 1 | Engine stays stateless (no TaskResult.Result) | Tasks return success/fail only. Data flows through PVC + S3. Preserves the engine as a pure sequencer. Avoids inter-task output coupling and workflow-engine complexity. | Two-way | +| 2 | Dedicated genesis S3 bucket | Genesis artifacts in `sei-genesis-artifacts` (not snapshot buckets). 
Different lifecycle, simpler IAM: single bucket with Get+Put. Sidecar already has SDK + IRSA. | Two-way | +| 3 | Controller-pushed static peers | Group controller queries `/v0/node-id` on each sidecar after assembly, builds `nodeId@dns:port` peer list, pushes as `StaticPeerSource`. No new peer source types, no sidecar peer discovery changes, no S3 for peer data. Reuses existing `StaticPeerSource` + `discoverPeersTask` + `writePeersToConfig`. | Two-way | +| 4 | PreInit gate with deferred Init plan | Init plan built when gate clears (peers available). Prevents missing `discover-peers`. | Two-way | +| 5 | S3 URIs are deterministic (convention-based) | Controller constructs URIs from group name + genesis config. Never reads task output. | Two-way | +| 6 | Genesis as PreInit tasks | Reuses existing Job + sidecar infrastructure. | Two-way | +| 7 | Copy seid to PVC via init container | Sidecar needs seid. Future: `TaskExecutionInterface` eliminates this. | Two-way | +| 8 | `GenesisCeremonyConfig` naming | Distinguishes from existing `GenesisConfiguration`. | Two-way | +| 9 | ChainID injection in `generateSeiNode` | UX: users set `genesis.chainId`, not template.spec.chainId. | Two-way | +| 10 | `handlePreInitComplete` unified exit path | Both code paths share gate logic. No misleading halt-height log. | Two-way | +| 11 | Assembly as group `AssemblyPlan` | Task submission tracked in group status. Resumable on restart. 
| Two-way | + +--- + +## Cross-Review Resolution Log (Round 4) + +### Kubernetes Specialist (5 findings) + +| # | Finding | Status | Resolution | +|---|---------|--------|-----------| +| 1 | PreInit sidecar liveness at `collectPeers` | OK | Genesis gate keeps all sidecars alive — `cleanupPreInit` only runs when `spec.genesis.s3` is set | +| 2 | `setGenesisSource` uses `r.Update` — stale resourceVersion risk | Low | Changed to `r.Patch(ctx, node, patch)` with `client.MergeFrom` | +| 3 | StatefulSet DNS shape `-0...svc` | OK | Correct for 1-pod-per-SeiNode StatefulSets | +| 4 | `buildPostBootstrapInitPlan` handles static peers | OK | Fixed state model diagram to match actual plan order (config-apply before discover-peers) | +| 5 | `listChildSeiNodes` unsorted — `nodes[0]` not deterministic | Medium | Added `sort.Slice` by name before assembly | + +### Platform Engineer (4 findings) + +| # | Finding | Status | Resolution | +|---|---------|--------|-----------| +| 1 | `Server` lacks `homeDir`; `node_key.json` stores private key, not nodeId | Spec gap | Added implementation notes: thread homeDir into Server, derive ID via CometBFT key loading, invariant on matching identity.json | +| 2 | `SidecarStatusClient` missing `GetNodeID` method | Gap | Documented as required interface addition + concrete client implementation | +| 3 | No orphaned `network-info.json` references | Clean | Confirmed | +| 4 | No orphaned `S3NetworkInfo` references | Clean | Confirmed | + +--- + +## Deferred (Do Not Build) + +| Feature | Rationale | +|---|---| +| **TaskResult.Result / typed task results** | Separate design deliverable — engine result routing is a fundamentally different problem | +| **TaskExecutionInterface** | Separate design deliverable — abstracts sidecar vs exec vs Job execution. Informs but doesn't block genesis. | +| **PreInit/Init phase merge** | Currently the phase boundary is Job→StatefulSet. Merging requires seid startup gating and changes the pod lifecycle model. 
Natural follow-up when TaskExecutionInterface provides the abstraction to make the boundary invisible. The current separation works, is well-tested, and doesn't create bloat. | +| **UpgradePolicy** | Separate design iteration | +| **Sidecar SQLite datastore** | Long-term persistence for task state | +| **Scale-up after genesis** | Requires on-chain governance | +| **Non-validator node types** | Start with validators | +| **Genesis from existing chain state** | New networks only | + +--- + +## Sample Manifests + +### Minimal Test Network + +```yaml +apiVersion: sei.io/v1alpha1 +kind: SeiNodeGroup +metadata: + name: loadtest +spec: + replicas: 4 + genesis: + chainId: loadtest-1 + template: + spec: + image: 189176372795.dkr.ecr.us-east-2.amazonaws.com/sei/sei-chain:v5.0.0 + entrypoint: + command: ["seid"] + args: ["start", "--keyring-backend", "file", "--home", "/sei"] + validator: {} +``` + +### Custom Parameters + +```yaml +apiVersion: sei.io/v1alpha1 +kind: SeiNodeGroup +metadata: + name: custom-net +spec: + replicas: 4 + genesis: + chainId: custom-1 + stakingAmount: "50000000usei" + accountBalance: "5000000000000usei,5000000000000uusdc" + maxCeremonyDuration: "20m" + overrides: + "staking.params.unbonding_time": "300s" + "gov.voting_params.voting_period": "120s" + accounts: + - address: "sei1loadtestfundingaddress..." 
+ balance: "999999999999usei" + genesisS3: + bucket: "sei-genesis-artifacts" + prefix: "custom-1/custom-net/" + region: "us-east-2" + template: + spec: + image: 189176372795.dkr.ecr.us-east-2.amazonaws.com/sei/sei-chain:v5.0.0 + entrypoint: + command: ["seid"] + args: ["start", "--keyring-backend", "file", "--home", "/sei"] + validator: {} + overrides: + "p2p.max_num_outbound_peers": "10" +``` diff --git a/.tide/network-upgrades.md b/.tide/network-upgrades.md new file mode 100644 index 0000000..9abbdef --- /dev/null +++ b/.tide/network-upgrades.md @@ -0,0 +1,489 @@ +# Component: Automated Network Upgrades + +**Date:** 2026-03-22 +**Status:** Draft — Technical Direction + +--- + +## Owner + +Platform / Infrastructure + +## Phase + +Pre-Tide — Operational prerequisite. Network upgrades are the most coordination-intensive event in a Cosmos chain's lifecycle. Automating them via the controller is required before the Kubernetes infrastructure can be considered production-grade. + +## Purpose + +Enable the `sei-k8s-controller` to orchestrate governance-driven software upgrades across managed `SeiNode` and `SeiNodeGroup` resources. The orchestration must handle both **major** (hard stop at a block height) and **minor** (rolling, pre-height) upgrades with zero missed blocks beyond the mandatory consensus halt, and zero risk of running an incompatible binary at the wrong height. + +--- + +## Current State + +### Cosmos SDK Upgrade Flow + +The `x/upgrade` module is the standard mechanism for coordinated chain upgrades: + +1. A `SoftwareUpgradeProposal` is submitted via governance, specifying a **name**, **height**, and **info** (JSON metadata including `upgradeType`). +2. Validators vote. If the proposal passes, the upgrade `Plan` is stored on-chain. +3. 
At block height H, the upgrade module's `BeginBlocker` checks the plan:
+   - If **no handler** is registered for the plan name → the node **panics** (writes `upgrade-info.json` to disk, logs `UPGRADE "<name>" NEEDED`, then crashes). This is the halt signal.
+   - If a **handler IS registered** → the node runs migrations (`applyUpgrade()`) and continues. No halt.
+4. The operator's job: replace the binary before the node restarts so the handler is present.
+
+| Upgrade Type | Behavior | Operator Window |
+|---|---|---|
+| **Major** (`upgradeType: "major"`) | Running the new binary before height H causes a panic (`BINARY UPDATED BEFORE TRIGGER`). Must swap at exactly height H. | Height H only |
+| **Minor** (`upgradeType: "minor"`) | Running the new binary before height H is allowed (no pre-height panic). | Any time before or at height H |
+
+### Halt Mechanisms
+
+| Mechanism | Trigger Point | Exit Behavior | Use Case |
+|---|---|---|---|
+| Upgrade module panic | `BeginBlocker` at height H | Non-zero exit (panic) | Standard governance upgrade |
+| `--halt-height` flag | `Commit` after block H committed | Clean exit 0 (SIGINT/SIGTERM to self) | Controlled stops for snapshots, exports |
+| `--halt-time` flag | `Commit` when block time ≥ threshold | Clean exit 0 | Time-based stops |
+
+**Key insight:** `--halt-height` triggers during **Commit** (block is persisted), producing a clean exit. The upgrade panic triggers during **BeginBlocker** (before the block is processed), producing a crash. Using `halt-height = H - 1` to stop cleanly, then swapping the image and restarting at height H is operationally cleaner than letting the upgrade panic fire.
+
+### Sei-Specific Upgrade Patterns
+
+- **Handler registration**: All upgrade names are embedded in `app/tags` (104 entries from `1.0.2beta` through `v6.4.0`). Each binary "knows" all historical upgrades. New versions append to this file. 
+- **Store migrations**: `SetStoreUpgradeHandlers()` reads `upgrade-info.json` from disk to determine which KV stores to add/delete. +- **Hard forks**: Separate from governance — `HardForkManager` runs at exact heights for specific chain IDs. Not relevant to automated upgrades. +- **Operations today** (`sei-infra`): `upgrade-scheduler.py` estimates target height, generates `seid tx gov submit-proposal software-upgrade` commands. Monitoring alerts on `cosmos_upgrade_plan_height`. Manual binary swap on EC2. + +### sei-k8s-controller Today + +| Capability | Status | +|---|---| +| `spec.image` on SeiNode | Exists — controls container image | +| `spec.entrypoint` on SeiNode | Exists — controls seid start command/args | +| StatefulSet reconciliation in Running phase | **NOT IMPLEMENTED** — image changes don't roll pods after phase=Running | +| SeiNodeGroup image push to children | Exists — pushes `image`, `entrypoint`, `sidecar` to child SeiNodes | +| `halt-height` on running pods | **NOT IMPLEMENTED** — only used in pre-init Jobs for snapshot bootstrap | +| Chain height in status | **NOT IMPLEMENTED** — `SeiNodePool.status.nodeStatuses.blockHeight` field exists but is never populated | +| Upgrade plan detection | **NOT IMPLEMENTED** — no chain RPC queries | +| Coordinated rollout across nodes | **NOT IMPLEMENTED** — SeiNodeGroup updates all children simultaneously | + +--- + +## Dependencies + +- **On-chain**: `x/upgrade` module state (pending plan, plan height, plan name/info) +- **seid RPC**: `/abci_query` for upgrade plan, `/status` for current block height +- **Container registry**: New image must be available before upgrade height +- **seictl sidecar**: Current StatusClient already queries `/status` for height (result-export uses this) + +**Explicit exclusions:** +- Cosmovisor — replaced by controller-managed image swap +- Manual operator intervention for the binary swap — automated by the controller +- Hard fork management — separate mechanism, out of scope + +--- 
+ +## Interface Specification + +### New CRD Fields + +#### `SeiNodeSpec` additions + +```go +// UpgradePolicy controls how the controller handles on-chain software upgrades. +// +optional +type UpgradePolicy struct { + // Strategy determines the upgrade orchestration mode. + // "Auto" — controller detects on-chain proposals, halts at upgrade height, + // swaps image, and restarts automatically. + // "Manual" — controller detects proposals and reports them in status, + // but waits for the operator to update spec.image. + // Default: "Manual" + // +optional + Strategy UpgradeStrategy `json:"strategy,omitempty"` + + // ImageMap maps upgrade plan names to container images. + // When strategy is "Auto", the controller looks up the plan name here + // to determine the target image. If the plan name is not found, + // the controller falls back to "Manual" behavior for that upgrade. + // +optional + ImageMap map[string]string `json:"imageMap,omitempty"` +} + +type UpgradeStrategy string + +const ( + UpgradeStrategyManual UpgradeStrategy = "Manual" + UpgradeStrategyAuto UpgradeStrategy = "Auto" +) +``` + +#### `SeiNodeStatus` additions + +```go +// UpgradeStatus reports the current state of any pending or in-progress upgrade. +// +optional +type UpgradeStatus struct { + // PendingPlan is the currently scheduled on-chain upgrade plan, if any. + // +optional + PendingPlan *UpgradePlanInfo `json:"pendingPlan,omitempty"` + + // Phase is the current upgrade orchestration state. + // Empty when no upgrade is in progress. + // +optional + Phase UpgradePhase `json:"phase,omitempty"` + + // TargetImage is the image the controller will use after the upgrade. + // Set when strategy is "Auto" and the plan name is found in imageMap. 
+ // +optional + TargetImage string `json:"targetImage,omitempty"` +} + +type UpgradePlanInfo struct { + Name string `json:"name"` + Height int64 `json:"height"` + Info string `json:"info,omitempty"` +} + +type UpgradePhase string + +const ( + UpgradePhaseNone UpgradePhase = "" + UpgradePhasePending UpgradePhase = "Pending" // Plan detected, waiting for height + UpgradePhaseHalting UpgradePhase = "Halting" // halt-height set, waiting for clean stop + UpgradePhaseSwapping UpgradePhase = "Swapping" // Pod stopped, updating image + UpgradePhaseMigrating UpgradePhase = "Migrating" // New binary running, executing migrations + UpgradePhaseComplete UpgradePhase = "Complete" // Upgrade finished, back to normal +) +``` + +#### `SeiNodeStatus` — observed chain state + +```go +// ChainStatus reports the node's observed chain state, populated from sidecar RPC queries. +// +optional +type ChainStatus struct { + // BlockHeight is the latest committed block height. + // +optional + BlockHeight int64 `json:"blockHeight,omitempty"` + + // BlockTime is the timestamp of the latest committed block. + // +optional + BlockTime *metav1.Time `json:"blockTime,omitempty"` +} +``` + +### SeiNodeGroup Upgrade Coordination + +```go +// SeiNodeGroupSpec additions +type GroupUpgradePolicy struct { + // Strategy inherited by all child SeiNodes unless overridden. + // +optional + Strategy UpgradeStrategy `json:"strategy,omitempty"` + + // ImageMap inherited by all child SeiNodes. + // +optional + ImageMap map[string]string `json:"imageMap,omitempty"` + + // MinorStrategy controls how minor upgrades are rolled out across the group. + // "Rolling" — update nodes one at a time, waiting for each to become healthy. + // "AllAtOnce" — update all nodes simultaneously. 
+ // Default: "Rolling" + // +optional + MinorStrategy GroupRolloutStrategy `json:"minorStrategy,omitempty"` +} + +type GroupRolloutStrategy string + +const ( + GroupRolloutRolling GroupRolloutStrategy = "Rolling" + GroupRolloutAllAtOnce GroupRolloutStrategy = "AllAtOnce" +) +``` + +### Controller → Sidecar Interface + +The sidecar already queries `/status` for result-export. Extend the status polling to also query the upgrade module: + +``` +GET /abci_query?path="custom/upgrade/plan" +``` + +Response: the pending `Plan` proto, or empty if no plan is scheduled. + +The controller reads this from the sidecar's status report (extend `ConfigStatus` or add a new status endpoint). Alternatively, the controller can query the node's RPC directly. + +### Error Conditions + +| Error | Cause | Detection | Response | +|---|---|---|---| +| Image not found in registry | `imageMap` points to non-existent tag | Pod `ImagePullBackOff` | Emit event, set upgrade phase to `Pending`, do not proceed | +| Plan name not in `imageMap` | New upgrade proposed but operator hasn't updated map | Plan detected, lookup misses | Set upgrade phase to `Pending`, emit warning event, fallback to Manual | +| Node crashes before halt-height | Unrelated crash during upgrade window | Pod restart, height < plan height | Controller re-applies halt-height on restart | +| Node doesn't halt at expected height | `halt-height` not applied or race condition | Height advances past H-1 without clean exit | Let upgrade panic fire (fallback), controller detects non-zero exit and swaps image | +| Upgrade migrations fail | New binary panics during `applyUpgrade()` | Pod CrashLoopBackOff at height H | Emit event, set phase to Failed, require manual intervention | +| Chain halt (< 2/3 voting power) | Not enough validators upgrade | Block production stops | Out of scope — monitoring/alerting concern | + +--- + +## State Model + +### Upgrade State Machine (per SeiNode) + +```mermaid +stateDiagram-v2 + [*] --> Running + 
Running --> Pending : Plan detected on-chain + + state Pending { + direction LR + [*] --> CheckStrategy + CheckStrategy --> AutoResolved : Auto + imageMap hit → targetImage + CheckStrategy --> WaitForOperator : Manual or no imageMap entry + } + + Pending --> Halting : Major upgrade\nset halt-height = H-1 + Pending --> Swapping_Minor : Minor upgrade\nswap image directly + + state "Halting" as Halting + note right of Halting + halt-height = H-1 + Waiting for clean exit + end note + + Halting --> Swapping_Major : Pod exits 0 at height H-1 + + state "Swapping (Major)" as Swapping_Major + note right of Swapping_Major + Update spec.image + Remove halt-height + Write upgrade-info.json + Restart pod + end note + + Swapping_Major --> Migrating : Pod starts with new binary + + state "Migrating" as Migrating + note right of Migrating + BeginBlocker at height H + applyUpgrade() runs + Store migrations execute + end note + + state "Swapping (Minor)" as Swapping_Minor + note left of Swapping_Minor + Rolling image update + Group: one node at a time + end note + + Migrating --> Complete : Height > H + Swapping_Minor --> Complete : Pod healthy with new image + + state "Complete" as Complete + note right of Complete + Clear upgrade status + Resume normal reconciliation + end note + + Complete --> Running + + Halting --> Running : Plan cancelled on-chain\nremove halt-height +``` + +### State Storage + +| State | Location | Source of Truth | +|---|---|---| +| Pending upgrade plan | On-chain (`x/upgrade` store) | Chain | +| Observed plan + phase | `SeiNode.status.upgrade` | Controller | +| Target image | `SeiNode.status.upgrade.targetImage` (resolved from `spec.upgradePolicy.imageMap`) | CRD | +| Current block height | `SeiNode.status.chain.blockHeight` | Chain via RPC | +| Image map | `SeiNode.spec.upgradePolicy.imageMap` or `SeiNodeGroup.spec.upgradePolicy.imageMap` | Operator-managed | + +--- + +## Internal Design + +### 1. 
Upgrade Detection (reconcileRunning extension)
+
+Every reconciliation cycle for a Running node:
+
+```
+1. Query node RPC for current block height → update status.chain.blockHeight
+2. Query node RPC for pending upgrade plan
+3. If plan exists and status.upgrade.phase == "":
+   a. Set status.upgrade.pendingPlan = {name, height, info}
+   b. Parse plan.Info for upgradeType
+   c. If spec.upgradePolicy.strategy == "Auto":
+      - Look up plan.Name in spec.upgradePolicy.imageMap
+      - If found: set status.upgrade.targetImage, phase = "Pending"
+      - If not found: phase = "Pending", emit warning event
+   d. If strategy == "Manual": phase = "Pending", emit event
+4. Requeue with appropriate interval (shorter as height approaches)
+```
+
+### 2. Major Upgrade Orchestration
+
+When `status.upgrade.phase == "Pending"` and upgrade is major:
+
+```
+1. Calculate blocks remaining = plan.Height - status.chain.blockHeight
+2. If blocks remaining <= haltWindow (configurable, default 100):
+   a. Inject --halt-height=<H-1> into the pod
+   b. Set phase = "Halting"
+   c. Requeue frequently (every 5s)
+3. When pod exits cleanly (exit 0) at height H-1:
+   a. Update spec.image to status.upgrade.targetImage
+   b. Remove halt-height from entrypoint
+   c. Set phase = "Swapping"
+   d. Reconcile StatefulSet (trigger pod replacement)
+4. When new pod is Running and height >= H:
+   a. Set phase = "Complete"
+   b. Clear upgrade status on next reconcile
+```
+
+**Injecting halt-height:** The cleanest mechanism is patching `spec.entrypoint.args` to append `--halt-height=<H-1>`. When the upgrade completes, the controller removes it. An alternative is a dedicated `spec.haltHeight` field.
+
+### 3. Minor Upgrade Orchestration
+
+When upgrade is minor and strategy is Auto:
+
+```
+1. Update spec.image to targetImage immediately (no halt needed)
+2. Reconcile StatefulSet (triggers rolling restart)
+3. For SeiNodeGroup with minorStrategy=Rolling:
+   a. Update one child SeiNode's image at a time
+   b. 
Wait for it to reach Running and height > plan.Height + c. Proceed to next node +4. Set phase = "Complete" when all nodes are updated +``` + +### 4. Prerequisite: StatefulSet Reconciliation in Running Phase + +The current controller does **not** reconcile the StatefulSet after `phase=Running`. This must be fixed: + +```go +func (r *SeiNodeReconciler) reconcileRunning(ctx context.Context, node *seiv1alpha1.SeiNode) (ctrl.Result, error) { + // Existing: reconcileRuntimeTasks(...) + + // NEW: reconcile StatefulSet spec to pick up image/entrypoint changes + if err := r.reconcileNodeStatefulSet(ctx, node); err != nil { + return ctrl.Result{}, err + } + + // NEW: upgrade detection and orchestration + if err := r.reconcileUpgrade(ctx, node); err != nil { + return ctrl.Result{}, err + } + + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil +} +``` + +### 5. SeiNodeGroup Coordination + +For major upgrades, all nodes must halt at the same height. The group controller: + +``` +1. Detects upgrade from any child's status.upgrade.pendingPlan +2. Pushes imageMap and strategy to all children (if not already set) +3. All children independently halt at H-1 and swap +4. Group reports upgrade complete when all children are past height H +``` + +For minor upgrades with Rolling strategy: + +``` +1. Group maintains an ordered list of children +2. Updates one child's image at a time +3. Waits for health check before proceeding +4. 
Respects PodDisruptionBudget if configured +``` + +--- + +## Error Handling + +| Error Case | Detection | Controller Response | Operator Action | +|---|---|---|---| +| Upgrade plan detected, no imageMap entry | Plan name lookup miss | Emit `UpgradeImageNotMapped` event, stay in Pending | Add entry to `imageMap` with target image | +| Image pull failure | Pod enters `ImagePullBackOff` | Emit `UpgradeImagePullFailed` event, stay in Swapping | Fix image reference in `imageMap` | +| Node doesn't halt cleanly | Pod still running past H-1 | Increase requeue frequency, let upgrade panic fire as fallback | None — controller handles crash restart | +| Upgrade migration panic | Pod `CrashLoopBackOff` at height H | Emit `UpgradeMigrationFailed` event, set phase to Failed | Investigate migration failure, may need `--unsafe-skip-upgrades` | +| Chain halted (consensus failure) | No height advancement across all nodes | Emit `ChainHalted` event | Coordinate with other validators off-band | +| Partial group upgrade | Some children complete, others stuck | Group status shows mixed phases | Investigate stuck nodes individually | +| Manual strategy — operator slow | Height approaching, no image update | Emit escalating warning events | Update `spec.image` before height H | + +--- + +## Test Specification + +### Unit Tests + +| Test | Setup | Action | Expected | +|---|---|---|---| +| `TestUpgradeDetection_PlanFound` | Mock RPC returns pending plan at height 1000 | Reconcile running node | `status.upgrade.pendingPlan` populated, phase=Pending | +| `TestUpgradeDetection_NoPlan` | Mock RPC returns no plan | Reconcile running node | `status.upgrade` empty | +| `TestMajorUpgrade_HaltHeightInjected` | Node at height 900, plan at 1000, haltWindow=100 | Reconcile | `entrypoint.args` includes `--halt-height=999` | +| `TestMajorUpgrade_HaltHeightNotYet` | Node at height 800, plan at 1000, haltWindow=100 | Reconcile | No halt-height injection, phase stays Pending | +| 
`TestMajorUpgrade_PodExitedCleanly` | Phase=Halting, pod terminated with exit 0 | Reconcile | Image updated to targetImage, phase=Swapping | +| `TestMinorUpgrade_ImmediateSwap` | Minor plan detected, Auto strategy, imageMap hit | Reconcile | Image updated immediately, no halt-height | +| `TestImageMapMiss` | Plan name "v7.0.0" not in imageMap | Reconcile | Warning event emitted, phase=Pending, no swap | +| `TestManualStrategy_NoAutoSwap` | Manual strategy, plan detected | Reconcile | Phase=Pending, image NOT updated, event emitted | +| `TestStatefulSetReconciledInRunning` | Phase=Running, spec.image changed | Reconcile | StatefulSet updated with new image | + +### Integration Tests + +| Test | Setup | Action | Expected | +|---|---|---|---| +| `TestMajorUpgradeE2E` | SeiNode running at height H-10, plan at H, Auto strategy with imageMap | Wait for height approach | Node halts at H-1, restarts with new image, resumes at H | +| `TestGroupMajorUpgrade` | SeiNodeGroup with 3 nodes, all detect same plan | Wait | All nodes halt at H-1, all swap, all resume | +| `TestGroupMinorRolling` | SeiNodeGroup with 3 nodes, minor plan, Rolling | Trigger | Nodes updated one at a time, each healthy before next | +| `TestUpgradePanicFallback` | halt-height injection fails, node crashes at H | Reconcile after crash | Controller detects crash, swaps image, pod restarts successfully | + +--- + +## Deployment + +### Implementation Order + +1. **Phase 1 — Foundation**: Populate `status.chain.blockHeight` from sidecar RPC. Reconcile StatefulSet in Running phase. These are prerequisites with value independent of upgrades. +2. **Phase 2 — Detection**: Add `UpgradePolicy` and `UpgradeStatus` CRD fields. Implement upgrade plan detection via RPC. Manual strategy only. +3. **Phase 3 — Major Auto**: Implement halt-height injection, image swap, and the full major upgrade state machine. +4. **Phase 4 — Minor + Group**: Implement minor rolling upgrades and SeiNodeGroup coordination. 
+ +### Testnet vs Mainnet + +| Aspect | Testnet | Mainnet | +|---|---|---| +| Strategy default | Auto | Manual | +| haltWindow | 50 blocks | 200 blocks | +| imageMap population | Automated from CI/CD pipeline | Reviewed and approved by operator | +| Monitoring | Standard alerts | Enhanced: `UpgradeApproaching`, `UpgradeStalled`, `ChainHalted` | + +--- + +## Decision Log + +| # | Decision | Rationale | Reversibility | +|---|---|---|---| +| 1 | Use `halt-height` for clean stops instead of letting the upgrade panic fire | Clean exit (code 0) is operationally safer than a panic crash. Controller can distinguish "halted for upgrade" from "crashed". Kubernetes restart policies don't interfere. | Two-way door: can always fall back to letting the panic fire | +| 2 | `imageMap` on CRD rather than auto-discovering images | Container image tags are not derivable from on-chain plan names. The mapping is organizational convention, not protocol. Explicit mapping prevents deploying the wrong image. | Two-way door: can add auto-discovery later as a convenience layer | +| 3 | Manual strategy as default | Mainnet upgrades are high-stakes events. Auto mode should be opt-in after operators gain confidence. | Two-way door: change the default | +| 4 | Reconcile StatefulSet in Running phase | Required for upgrades but also fixes a general gap where spec changes don't propagate to running pods. | Two-way door: scope can be narrowed to upgrade-only fields | +| 5 | Major upgrades halt at H-1, not H | `halt-height` triggers during Commit, so block H-1 is committed and persisted. On restart at block H, the upgrade handler runs in BeginBlocker. This is the standard Cosmovisor-equivalent flow. | One-way door (operationally): changing halt target after deployment requires careful coordination. But the flag value itself is a two-way door. | + +--- + +## Deferred (Do Not Build) + +| Feature | Rationale | +|---|---| +| **Cosmovisor integration** | Controller replaces Cosmovisor's role. 
No need for both. | +| **Automatic image building** | Out of scope — CI/CD concern. Controller consumes pre-built images. | +| **Governance proposal submission** | Controller is an observer/executor, not a governance participant. | +| **Hard fork management** | Separate mechanism with chain-ID-specific logic. Not governance-driven. | +| **Cross-chain upgrade coordination** | Single chain per SeiNodeGroup. Multi-chain is future work. | +| **Rollback automation** | Cosmos upgrades are forward-only. Rollback requires governance cancel + manual intervention. | +| **Auto-discovery of image from plan.Info URL** | Cosmovisor supports download URLs in plan.Info, but pulling arbitrary binaries into containers is an anti-pattern in Kubernetes. Use imageMap instead. | diff --git a/.tide/observability.md b/.tide/observability.md new file mode 100644 index 0000000..96a72d6 --- /dev/null +++ b/.tide/observability.md @@ -0,0 +1,926 @@ +# Component: Controller & Sidecar Observability + +## Owner +Platform Engineering (controller) / Sidecar team (seictl) + +## Phase +Phase 1: Metrics registration, events, annotation-based scraping, controller self-monitoring +Phase 2: Grafana dashboards, sidecar metrics, kube-prometheus-stack migration, alerting +Phase 3 (deferred): Distributed tracing, cost attribution + +## Purpose +Make the sei-k8s-controller and its seictl sidecar operationally ergonomic by wiring +Prometheus metrics, Kubernetes events, structured logging, self-monitoring, dashboards, +and alerting throughout the system. Today the controller emits zero custom metrics, only +the SeiNodeGroup reconciler produces Kubernetes events, the sidecar has no `/metrics` +endpoint, and no dashboards or alert rules exist. Operators have no signal beyond +`kubectl logs` and manual status inspection to understand fleet health. 
+ +## Dependencies + +### External systems consumed +- **Prometheus** (community chart, annotation-based scraping) — TSDB for metrics storage and PromQL evaluation +- **Grafana** — Dashboard rendering +- **Loki** — Log aggregation (structured JSON logs from all pods via Promtail) +- **controller-runtime metrics** — built-in workqueue, reconcile duration, REST client metrics + +### Phase 2 dependencies (not yet present on dev) +- **Prometheus Operator** — ServiceMonitor/PodMonitor CRD reconciliation (requires migration from community chart to kube-prometheus-stack) +- **Alertmanager** — Alert routing (currently disabled on dev; `alertmanager.enabled: false`) +- **Grafana sidecar** — Dashboard auto-discovery from ConfigMaps (currently disabled) + +### Internal Tide components consumed +- **SeiNodeGroup controller** — already has EventRecorder; metrics hook points in reconcile loop +- **SeiNode controller** — needs EventRecorder; metrics hook points in reconcile/plan execution +- **SeiNodePool controller** — needs EventRecorder; metrics hook points in genesis lifecycle +- **seictl sidecar** — task execution engine; needs `/metrics` HTTP handler (Phase 2) + +### Explicit exclusions +- Does NOT depend on any tracing backend (Jaeger, Tempo) — deferred to Phase 3 +- Does NOT depend on external APM tools (Datadog, New Relic) +- Does NOT modify CRD spec fields — observability is orthogonal to user-facing API +- Does NOT depend on Prometheus Operator in Phase 1 (annotation-based scraping only) + +--- + +## Interface Specification + +### 1. Controller Prometheus Metrics + +All metrics use the `sei_controller_` prefix. Labels follow controller-runtime conventions. 
+ +#### Reconciliation Metrics + +```go +// Gauges — per-resource state (cleaned up on deletion via §1e) +sei_controller_seinode_phase{namespace, name, phase} gauge // 1.0 for current phase, 0.0 for others +sei_controller_seinodegroup_phase{namespace, name, phase} gauge // same pattern +sei_controller_seinodepool_phase{namespace, name, phase} gauge // same pattern +sei_controller_seinodegroup_replicas{namespace, name, type} gauge // type=desired|ready|initializing|failed +sei_controller_seinodepool_replicas{namespace, name, type} gauge // type=total|ready +sei_controller_seinodegroup_condition{namespace, name, type, status} gauge // 1.0 when condition matches + +// Counters +sei_controller_reconcile_errors_total{controller, namespace, name} counter // reconcile errors beyond what controller-runtime tracks +sei_controller_seinode_phase_transitions_total{namespace, name, from, to} counter // phase state machine transitions + +// Histograms — aggregate across resources for meaningful percentiles +sei_controller_seinode_init_duration_seconds{namespace, chain_id} histogram // time from Pending to Running (no `name` label — aggregates across all nodes for a chain) +sei_controller_seinode_last_init_duration_seconds{namespace, name} gauge // per-node init duration for debugging (set once when node reaches Running) +sei_controller_seinodegroup_reconcile_substep_duration_seconds{controller, substep} histogram // per-substep timing +``` + +**Histogram bucket selection:** Custom buckets for all histograms to cover the full range of +controller operations, including slow API server writes and sidecar interactions: + +```go +var reconcileBuckets = []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60} +``` + +This extends `prometheus.DefBuckets` (max 10s) to 60s, covering controller-runtime's default +reconcile timeout and slow network conditions. 
+ +#### Sidecar Interaction Metrics + +```go +sei_controller_sidecar_request_duration_seconds{namespace, node, method, route, status_code} histogram +sei_controller_sidecar_task_submit_total{namespace, node, task_type} counter +sei_controller_sidecar_task_complete_total{namespace, node, task_type, result} counter // result=success|failure +sei_controller_sidecar_unreachable_total{namespace, node} counter +``` + +**Route normalization:** The `route` label uses fixed route templates, not raw paths. +Bounded allowlist of known routes: + +```go +var knownRoutes = map[string]string{ + "/v0/tasks": "/v0/tasks", + "/v0/healthz": "/v0/healthz", + "/v0/status": "/v0/status", +} +// Any path starting with "/v0/tasks/" maps to "/v0/tasks/:id" +// Any unrecognized path maps to "other" +``` + +#### Networking Resource Metrics + +```go +sei_controller_networking_resource_reconcile_total{namespace, group, resource_type, result} counter // resource_type=Service|HTTPRoute|AuthorizationPolicy, result=created|updated|unchanged|error +sei_controller_networking_crd_available{crd} gauge // 1.0 if CRD is installed, 0.0 if not +``` + +**Registration:** All metrics registered in a `metrics.go` file per controller package using +`prometheus.NewGaugeVec` / `NewCounterVec` / `NewHistogramVec` from `github.com/prometheus/client_golang`. +Registered with controller-runtime's default registry (`metrics.Registry`). + +**Cardinality budget:** Phase labels are bounded enums (6 phases for SeiNode, 6 for SeiNodeGroup, +4 for SeiNodePool). `namespace` × `name` cardinality is bounded by cluster size (target: <100 +SeiNodes, <20 SeiNodeGroups, <10 SeiNodePools). `from` × `to` transition pairs are bounded by +the state machine (max 30 combinations). `route` is bounded by the allowlist (4 values + "other"). + +The 0/1-per-phase gauge pattern follows the kube-state-metrics convention (e.g., `kube_pod_status_phase`) +where each enum value is a separate time series with value 0 or 1. 
This enables simple `== 1` +queries and alerting without cardinality issues from multi-valued labels. + +### 2. Kubernetes Events + +#### SeiNode Events (new — EventRecorder to be added) + +| Reason | Type | When | +|--------|------|------| +| `PhaseTransition` | Normal | Phase changes (e.g. Initializing → Running) | +| `PreInitStarted` | Normal | PreInit Job created | +| `PreInitComplete` | Normal | PreInit Job succeeded | +| `PreInitFailed` | Warning | PreInit Job failed | +| `InitPlanStarted` | Normal | Init task sequence begins | +| `InitPlanComplete` | Normal | All init tasks done, node is ready | +| `InitTaskFailed` | Warning | Individual init task failed | +| `SidecarUnreachable` | Warning | Sidecar HTTP calls fail (debounced: once per 5 min) | +| `ConfigDriftDetected` | Warning | ConfigStatus.DriftDetected transitions to true | +| `SnapshotScheduled` | Normal | Snapshot generation task submitted | +| `SnapshotComplete` | Normal | Snapshot upload complete | +| `SnapshotFailed` | Warning | Snapshot task failed | + +**RBAC requirement:** Add the following kubebuilder marker to `SeiNodeReconciler`: +```go +// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch +``` +Then run `make manifests` to regenerate the ClusterRole. + +#### SeiNodePool Events (new — EventRecorder to be added) + +| Reason | Type | When | +|--------|------|------| +| `GenesisStarted` | Normal | Genesis preparation Job created | +| `GenesisComplete` | Normal | Genesis Job succeeded, nodes being created | +| `GenesisFailed` | Warning | Genesis Job failed | +| `NodeCreated` | Normal | Child SeiNode created | +| `NodeDeleted` | Normal | Child SeiNode deleted | +| `PoolReady` | Normal | All nodes in Running phase | + +**RBAC requirement:** Add the following kubebuilder marker to `SeiNodePoolReconciler`: +```go +// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch +``` +Then run `make manifests` to regenerate the ClusterRole. 
+ +#### SeiNodeGroup Events (existing, extended) + +Add to existing event set: + +| Reason | Type | When | +|--------|------|------| +| `GroupReady` | Normal | Phase transitions to Ready (all nodes Running) | +| `GroupDegraded` | Warning | Phase transitions to Degraded (some nodes not Running) | +| `ScaleUp` | Normal | Replicas increased, new SeiNodes being created | +| `ScaleDown` | Normal | Replicas decreased, excess SeiNodes being deleted | + +### 3. Controller Self-Monitoring (Phase 1 — Annotation-Based) + +The dev cluster runs the community Prometheus chart with annotation-based pod discovery, **not** +Prometheus Operator. Phase 1 uses annotations for scraping; Phase 2 migrates to ServiceMonitor. + +#### Phase 1: Annotation-based scraping + +Switch the controller's metrics endpoint to HTTP on a non-public port for in-cluster scraping. +This avoids the complexity of bearer token auth and TLS for annotation-based discovery: + +```go +// cmd/main.go — Phase 1 change +flag.BoolVar(&secureMetrics, "metrics-secure", false, // changed from true + "If set, the metrics endpoint is served securely via HTTPS.") +``` + +The metrics bind address stays at `:8443`. Add annotations to the controller Deployment pod template: + +```yaml +# config/manager/manager.yaml — pod template metadata +template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + prometheus.io/scrape: "true" + prometheus.io/port: "8443" + prometheus.io/path: "/metrics" +``` + +Also declare the metrics port in the container spec (currently `ports: []`): + +```yaml +ports: + - name: metrics + containerPort: 8443 + protocol: TCP +``` + +The existing `extraScrapeConfigs` in the Prometheus Helm values discovers pods with +`prometheus.io/scrape: "true"` and honors the `port` and `path` annotations. 
+ +#### Phase 2: ServiceMonitor-based scraping (after kube-prometheus-stack migration) + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: sei-k8s-controller-manager + namespace: sei-k8s-controller-system + labels: + control-plane: controller-manager +spec: + namespaceSelector: + matchNames: + - sei-k8s-controller-system + selector: + matchLabels: + control-plane: controller-manager + endpoints: + - port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + authorization: + type: Bearer + credentials: + name: prometheus-sa-token + key: token + interval: 30s +``` + +Note: Uses `authorization` block instead of deprecated `bearerTokenFile` (deprecated in +Prometheus Operator v0.59+). + +**RBAC prerequisite for Phase 2:** When re-enabling secure metrics, deploy: + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: sei-controller-metrics-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: sei-controller-metrics-reader # already generated by controller-gen +subjects: + - kind: ServiceAccount + name: prometheus-server # verify actual SA name on target cluster + namespace: monitoring +``` + +### 4. Sidecar Metrics Endpoint (seictl — Phase 2) + +The seictl sidecar exposes a Prometheus-compatible `/metrics` endpoint on its existing HTTP server port (7777). 
+
+```go
+// Sidecar metrics — sei_sidecar_ prefix
+sei_sidecar_task_execution_duration_seconds{task_type} histogram // custom buckets: reconcileBuckets
+sei_sidecar_task_executions_total{task_type, result} counter // result=success|failure
+sei_sidecar_active_tasks gauge
+sei_sidecar_config_applies_total{result} counter
+sei_sidecar_config_generation gauge // monotonically increasing integer
+sei_sidecar_health_checks_total{result} counter // result=ok|degraded
+sei_sidecar_process_start_time_seconds gauge // for uptime calculation
+```
+
+Note: `sei_sidecar_config_generation` (renamed from `config_version`) is a monotonically increasing
+integer representing the config schema generation. If a string identifier is needed later, use an
+info-style metric: `sei_sidecar_config_info{version="..."}` with constant value 1.
+
+#### Sidecar scraping — Phase 1 (annotation-based)
+
+Add annotations to the sidecar container's pod template in `generateNodeStatefulSet`:
+
+```go
+// internal/controller/node/resources.go — StatefulSet pod template annotations
+podAnnotations := map[string]string{
+    "prometheus.io/scrape": "true",
+    "prometheus.io/port":   "7777",
+    "prometheus.io/path":   "/metrics",
+}
+```
+
+This works immediately with the existing annotation-based Prometheus discovery.
+
+#### Sidecar scraping — Phase 2 (PodMonitor after kube-prometheus-stack migration)
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: sei-sidecar
+  namespace: monitoring
+spec:
+  namespaceSelector:
+    any: true # select pods in all namespaces (an empty matchNames list would NOT do this)
+  selector:
+    matchExpressions:
+      - key: sei.io/node
+        operator: Exists # label present on all SeiNode StatefulSet pods; value varies per node
+  podMetricsEndpoints:
+    - port: seictl
+      path: /metrics
+      interval: 30s
+```
+
+Verify that `sei.io/node` label is set on StatefulSet pod templates by checking
+`generateNodeStatefulSet` output in `resources.go`.
+
+**Network policy consideration:** SeiNodePool-generated NetworkPolicies may restrict ingress
+to node pods. 
When deploying the PodMonitor, ensure the pool NetworkPolicy includes an ingress
+rule allowing traffic from the `monitoring` namespace on port 7777, or deploy a supplementary
+NetworkPolicy:
+
+```yaml
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-prometheus-sidecar-scrape
+spec:
+  podSelector:
+    matchExpressions:
+      - key: sei.io/node
+        operator: Exists # match label presence; the label value is per-node, not empty
+  ingress:
+    - from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: monitoring
+      ports:
+        - port: 7777
+          protocol: TCP
+```
+
+### 5. Structured Logging Standards
+
+All controllers and the sidecar follow these conventions:
+
+```
+Logger hierarchy:
+  controller.seinode      — SeiNode reconciler
+  controller.seinodegroup — SeiNodeGroup reconciler
+  controller.seinodepool  — SeiNodePool reconciler
+  sidecar.engine          — seictl task engine
+  sidecar.task.<type>     — per-task-type logger
+
+Required fields on every reconcile log line:
+  "namespace"   string
+  "name"        string
+  "reconcileID" string (from context, controller-runtime provides this)
+
+Required fields on phase transitions:
+  "fromPhase" string
+  "toPhase"   string
+
+Required fields on sidecar interactions:
+  "taskType" string
+  "taskID"   string (when known)
+  "duration" string (for completed calls)
+```
+
+Eliminate all `log.Log.Info(...)` (global logger) usage in favor of `log.FromContext(ctx)`.
+
+Promtail is deployed with default scrape configs (collects all pod logs from all namespaces).
+Loki has `allow_structured_metadata: true`. JSON structured logs from the controller and sidecar
+will be ingested automatically via Promtail without additional configuration.
+
+### 6. Grafana Dashboards
+
+#### Phase 1: Manual provisioning
+
+Since the Grafana sidecar for ConfigMap auto-discovery is not enabled on dev, Phase 1 uses
+Grafana's built-in `dashboardProviders` and `dashboardsConfigMaps` Helm values.
+
+Dashboard JSON files are committed to the controller repo at `config/monitoring/dashboards/`
+for version control alongside the code. 
+ +Deployment requires a platform repo change to the Grafana Helm values: + +```yaml +# platform/clusters/dev/monitoring/grafana.yaml — add to values: +dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: sei-controller + orgId: 1 + folder: Sei Controller + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/sei-controller + +dashboardsConfigMaps: + sei-controller: sei-controller-dashboards +``` + +A ConfigMap named `sei-controller-dashboards` is deployed to the `monitoring` namespace +(NOT bundled with the controller — see Deployment section). + +#### Phase 2: Sidecar auto-discovery (after Grafana sidecar enablement) + +Enable in platform repo: +```yaml +sidecar: + dashboards: + enabled: true + label: grafana_dashboard + labelValue: "1" + searchNamespace: ALL +``` + +Then dashboards can be deployed as labeled ConfigMaps to any namespace. + +#### Dashboard 1: Fleet Overview + +Panels: +- SeiNodeGroup count by phase (pie/stat) +- SeiNode count by phase (pie/stat) +- SeiNodePool count by phase (pie/stat) +- Replicas desired vs ready per group (timeseries) +- Reconciliation rate and error rate (timeseries) +- SeiNode initialization duration P50/P95/P99 (timeseries) +- Networking resource status (table: group, HTTPRoute status, Service LB IP, AuthorizationPolicy) +- Recent Kubernetes events (logs panel via Loki, filtered to sei.io resources) + +#### Dashboard 2: Controller Health + +Panels: +- Reconcile duration by controller (P50/P95/P99 heatmap, from controller-runtime metrics) +- Work queue depth and processing time (timeseries) +- Sidecar request latency by node (heatmap) +- Sidecar unreachable rate (timeseries with threshold annotation) +- Controller memory and CPU (from cAdvisor) +- Active reconciles / goroutines +- REST client request rate and errors (from controller-runtime client metrics) + +### 7. 
Alerting + +#### Phase 1: No alerting infrastructure + +Alertmanager is explicitly disabled on dev (`alertmanager.enabled: false`). Phase 1 focuses +on metrics and dashboards. Operators use Grafana dashboards for visual monitoring. + +#### Phase 2: PrometheusRule (after kube-prometheus-stack migration + Alertmanager enablement) + +**Prerequisites:** +1. Migrate monitoring stack to kube-prometheus-stack (enables Prometheus Operator) +2. Enable Alertmanager with a receiver (Slack, PagerDuty, etc.) + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: sei-controller-alerts +spec: + groups: + - name: sei-controller + rules: + - alert: SeiNodeGroupDegraded + expr: sei_controller_seinodegroup_phase{phase="Degraded"} == 1 + for: 10m + labels: + severity: warning + annotations: + summary: "SeiNodeGroup {{ $labels.namespace }}/{{ $labels.name }} is degraded" + + - alert: SeiNodeGroupFailed + expr: sei_controller_seinodegroup_phase{phase="Failed"} == 1 + for: 5m + labels: + severity: critical + annotations: + summary: "SeiNodeGroup {{ $labels.namespace }}/{{ $labels.name }} has failed" + + - alert: SeiNodeStuckInitializing + expr: sei_controller_seinode_phase{phase="Initializing"} == 1 + for: 30m + labels: + severity: warning + annotations: + summary: "SeiNode {{ $labels.namespace }}/{{ $labels.name }} stuck initializing for 30m" + + - alert: SeiNodeStuckPending + expr: sei_controller_seinode_phase{phase="Pending"} == 1 + for: 15m + labels: + severity: warning + annotations: + summary: "SeiNode {{ $labels.namespace }}/{{ $labels.name }} stuck pending for 15m" + + - alert: SeiNodePoolStuckPending + expr: sei_controller_seinodepool_phase{phase="Pending"} == 1 + for: 15m + labels: + severity: warning + annotations: + summary: "SeiNodePool {{ $labels.namespace }}/{{ $labels.name }} stuck pending for 15m" + + - alert: SidecarUnreachableHigh + expr: rate(sei_controller_sidecar_unreachable_total[5m]) > 0.1 + for: 10m + labels: + severity: warning 
+ annotations: + summary: "Sidecar for {{ $labels.namespace }}/{{ $labels.node }} unreachable" + + - alert: ControllerReconcileErrors + expr: rate(sei_controller_reconcile_errors_total[5m]) > 0 + for: 15m + labels: + severity: warning + annotations: + summary: "Controller {{ $labels.controller }} has sustained reconcile errors" + + - alert: ControllerHighReconcileLatency + expr: histogram_quantile(0.99, rate(controller_runtime_reconcile_duration_seconds_bucket[5m])) > 30 + for: 10m + labels: + severity: warning + annotations: + summary: "Controller reconcile P99 latency exceeds 30s" +``` + +--- + +## State Model + +### Metrics state +- **Location:** In-process Prometheus registry (controller-runtime `metrics.Registry`) +- **Lifecycle:** Created at controller startup, updated during reconcile loops, cleaned up on resource deletion (§1e), scraped by Prometheus +- **Source of truth:** Live process memory; Prometheus TSDB is the persistent store +- **Staleness:** If gauge cleanup is missed (crash between delete and cleanup), Prometheus staleness mechanism (default 5m) handles stale series as a fallback + +### Event state +- **Location:** Kubernetes API server (Events) +- **Lifecycle:** Created by EventRecorder, garbage-collected by K8s (default 1h TTL) +- **Source of truth:** etcd via API server + +### Dashboard/Alert state +- **Location:** ConfigMaps (dashboards), PrometheusRule CRDs (alerts, Phase 2) +- **Lifecycle:** Deployed via Kustomize/Helm, managed by Flux GitOps +- **Source of truth:** Git repository + +--- + +## Internal Design + +### Phase 1 Implementation (controller metrics + events + annotation-based scraping) + +#### 1a. 
Metrics registration pattern + +Each controller package gets a `metrics.go` file: + +```go +// internal/controller/nodegroup/metrics.go +package nodegroup + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var reconcileBuckets = []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60} + +var ( + groupPhaseGauge = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "sei_controller_seinodegroup_phase", + Help: "Current phase of each SeiNodeGroup (1=active, 0=inactive)", + }, + []string{"namespace", "name", "phase"}, + ) + + groupReplicasGauge = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "sei_controller_seinodegroup_replicas", + Help: "Replica counts for each SeiNodeGroup", + }, + []string{"namespace", "name", "type"}, + ) + + reconcileSubstepDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "sei_controller_seinodegroup_reconcile_substep_duration_seconds", + Help: "Duration of individual reconcile substeps", + Buckets: reconcileBuckets, + }, + []string{"controller", "substep"}, + ) +) + +func init() { + metrics.Registry.MustRegister( + groupPhaseGauge, + groupReplicasGauge, + reconcileSubstepDuration, + ) +} +``` + +#### 1b. Metrics emission in reconcile loop + +```go +func (r *SeiNodeGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // ... existing logic ... 
+ + // After status update, emit phase gauge + emitPhaseGauge(groupPhaseGauge, group.Namespace, group.Name, string(group.Status.Phase), + allGroupPhases) + + // Emit replica gauges + groupReplicasGauge.WithLabelValues(group.Namespace, group.Name, "desired").Set(float64(group.Spec.Replicas)) + groupReplicasGauge.WithLabelValues(group.Namespace, group.Name, "ready").Set(float64(group.Status.ReadyReplicas)) + + return ctrl.Result{}, nil +} + +// emitPhaseGauge sets 1.0 for the current phase and 0.0 for all others, +// giving Prometheus a clean gauge that works with instant queries. +func emitPhaseGauge(gauge *prometheus.GaugeVec, ns, name, current string, allPhases []string) { + for _, p := range allPhases { + val := 0.0 + if p == current { + val = 1.0 + } + gauge.WithLabelValues(ns, name, p).Set(val) + } +} +``` + +#### 1c. Substep timing helper + +```go +func (r *SeiNodeGroupReconciler) timeSubstep(ctx context.Context, name string, fn func() error) error { + start := time.Now() + err := fn() + reconcileSubstepDuration.WithLabelValues("seinodegroup", name).Observe(time.Since(start).Seconds()) + return err +} + +// Usage in reconcile: +if err := r.timeSubstep(ctx, "ensureNodes", func() error { + return r.ensureNodes(ctx, group) +}); err != nil { + return ctrl.Result{}, err +} +``` + +#### 1d. EventRecorder for SeiNode and SeiNodePool controllers + +```go +// cmd/main.go — add recorders +nodeRecorder := mgr.GetEventRecorderFor("seinode-controller") +if err := (&nodecontroller.SeiNodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Platform: platform, + Recorder: nodeRecorder, +}).SetupWithManager(mgr); err != nil { ... } + +poolRecorder := mgr.GetEventRecorderFor("seinodepool-controller") +if err := (&nodepoolcontroller.SeiNodePoolReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: poolRecorder, +}).SetupWithManager(mgr); err != nil { ... 
} +``` + +Emit events at phase transitions: + +```go +func (r *SeiNodeReconciler) transitionPhase(ctx context.Context, node *SeiNode, to SeiNodePhase) { + from := node.Status.Phase + if from == to { + return + } + + // Metric + nodePhaseTransitions.WithLabelValues(node.Namespace, node.Name, string(from), string(to)).Inc() + + // Event + if r.Recorder != nil { + r.Recorder.Eventf(node, corev1.EventTypeNormal, "PhaseTransition", + "Phase changed from %s to %s", from, to) + } + + // Log + log.FromContext(ctx).Info("phase transition", "fromPhase", from, "toPhase", to) + + node.Status.Phase = to +} +``` + +**RBAC markers:** Both `SeiNodeReconciler` and `SeiNodePoolReconciler` must add: +```go +// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch +``` +Then run `make manifests` to regenerate the ClusterRole. Without these markers, `Recorder.Eventf()` +calls will silently fail with 403 errors (the event recorder swallows RBAC errors by default). + +#### 1e. Gauge cleanup on resource deletion + +Each controller implements a `cleanupMetrics` helper called at the beginning of `handleDeletion`, +before any status patch: + +```go +// internal/controller/nodegroup/metrics.go + +func cleanupGroupMetrics(namespace, name string) { + for _, phase := range allGroupPhases { + groupPhaseGauge.DeleteLabelValues(namespace, name, phase) + } + for _, typ := range []string{"desired", "ready", "initializing", "failed"} { + groupReplicasGauge.DeleteLabelValues(namespace, name, typ) + } + for _, cond := range allConditionTypes { + for _, status := range []string{"True", "False", "Unknown"} { + groupConditionGauge.DeleteLabelValues(namespace, name, cond, status) + } + } +} +``` + +```go +// internal/controller/nodegroup/controller.go — in handleDeletion: + +func (r *SeiNodeGroupReconciler) handleDeletion(ctx context.Context, group *SeiNodeGroup) (ctrl.Result, error) { + cleanupGroupMetrics(group.Namespace, group.Name) + // ... existing finalizer logic ... 
+} +``` + +Same pattern for SeiNode (`cleanupNodeMetrics`) and SeiNodePool (`cleanupPoolMetrics`). + +For counters and histograms with per-resource labels (e.g., `sidecar_unreachable_total`), +`DeleteLabelValues` is semantically acceptable because the resource identity is gone — the +counter for that label set will never be incremented again. + +**Fallback:** If the controller crashes between resource deletion and gauge cleanup, Prometheus' +built-in staleness mechanism (default 5m) marks the series as stale. No phantom resources will +persist beyond that window. + +### Phase 2 Implementation (dashboards, alerts, sidecar metrics, kube-prometheus-stack) + +#### 2a. kube-prometheus-stack migration (prerequisite) + +Replace the community Prometheus chart with `kube-prometheus-stack` in the platform repo. +This enables: +- Prometheus Operator (reconciles ServiceMonitor, PodMonitor, PrometheusRule CRDs) +- Alertmanager (alert routing) +- Grafana sidecar (dashboard auto-discovery) +- Built-in kube-state-metrics (replaces standalone deployment) + +This is a platform repo change owned by the platform team. Tracked separately. + +#### 2b. Dashboard deployment + +**Phase 1 path:** Dashboard JSON committed to `config/monitoring/dashboards/` in the controller +repo. A ConfigMap `sei-controller-dashboards` is deployed to the `monitoring` namespace via +a **separate** Kustomization in the platform repo (NOT bundled with the controller's +`config/default`, which has `namespace: sei-k8s-controller-system` applied). + +```yaml +# platform/clusters/dev/monitoring/kustomization.yaml — add: +resources: + # ... existing resources ... + - github.com/sei-protocol/sei-k8s-controller/config/monitoring/dashboards?ref=main +``` + +The dashboards kustomization sets `namespace: monitoring` explicitly. + +**Phase 2 path:** After Grafana sidecar enablement, dashboards can be deployed as labeled +ConfigMaps to any namespace with `searchNamespace: ALL`. + +#### 2c. 
Sidecar `/metrics` endpoint +In the seictl repo, register a `promhttp.Handler()` on the existing HTTP mux at `/metrics`. +Register sidecar-specific metrics with the default registry. The port (7777) is already +exposed by the per-node headless Service. Phase 1 annotation-based scraping picks it up +immediately; Phase 2 PodMonitor provides explicit scrape configuration. + +#### 2d. PrometheusRule +Deployed after kube-prometheus-stack migration enables Prometheus Operator. +Committed to `config/monitoring/prometheus-rule.yaml` in the controller repo. + +--- + +## Error Handling + +| Error | Detection | Surface | Operator Action | +|-------|-----------|---------|----------------| +| Metrics registration collision | `MustRegister` panics at startup | Controller crash, pod restart | Fix duplicate metric name in code | +| Prometheus scrape failure (auth) | `up == 0` for controller target | Grafana dashboard gap | Check pod annotations, Prometheus config, RBAC | +| PrometheusRule rejected (Phase 2) | Apply error | Log + condition | Fix rule syntax | +| Dashboard ConfigMap too large | Apply error (etcd 1.5MB limit) | Log | Split dashboard or compress | +| Cardinality explosion | Prometheus OOM or slow queries | Grafana timeout | Review label cardinality, add metric limits | +| Sidecar /metrics unreachable | Prometheus scrape failure | `up == 0` for sidecar target | Check sidecar health, pod networking, NetworkPolicy | +| Event RBAC missing | Silent 403 (recorder swallows errors) | Events not appearing for SeiNode/Pool | Add RBAC markers, run `make manifests` | + +--- + +## Test Specification + +### Unit Tests + +| Test | Setup | Action | Expected | +|------|-------|--------|----------| +| `TestPhaseGaugeEmission` | Create SeiNodeGroup, set phase to Ready | Call `emitPhaseGauge` | Ready=1.0, all others=0.0 | +| `TestReplicaGaugeValues` | Group with 3 replicas, 2 ready | Reconcile | desired=3, ready=2 | +| `TestSubstepTimingRecorded` | Mock substep taking 100ms | Call 
`timeSubstep` | Histogram has observation ≥ 0.1s, bucket 0.25 incremented | +| `TestPhaseTransitionCounter` | SeiNode transitions Pending→Initializing | Call `transitionPhase` | Counter incremented with from=Pending, to=Initializing | +| `TestPhaseTransitionEvent` | Same as above | Call `transitionPhase` | EventRecorder received Normal/PhaseTransition event | +| `TestSidecarUnreachableCounter` | Mock HTTP client returning connection error | Reconcile SeiNode | `sidecar_unreachable_total` incremented | +| `TestNetworkingResourceMetrics` | Reconcile HTTPRoute successfully | Check counter | `networking_resource_reconcile_total{result=created}` = 1 | +| `TestMetricsCleanupOnDeletion` | Delete SeiNodeGroup | Call `cleanupGroupMetrics` then check gauges | All gauges for deleted group removed via `DeleteLabelValues` | +| `TestRouteNormalization` | HTTP call to `/v0/tasks/abc-123` | Check histogram label | `route` = `/v0/tasks/:id` | +| `TestInitDurationHistogramAggregates` | 3 SeiNodes in same chain complete init | Check histogram | 3 observations in `sei_controller_seinode_init_duration_seconds{chain_id="pacific-1"}` | +| `TestPoolPhaseGauge` | SeiNodePool in Running phase | Call `emitPhaseGauge` | Running=1.0, all others=0.0 | + +### Integration Tests + +| Test | Setup | Action | Expected | +|------|-------|--------|----------| +| `TestControllerMetricsScrape` | Deploy controller with annotations | `curl http://:8443/metrics` | Returns Prometheus text format with `sei_controller_` metrics | +| `TestSidecarMetricsScrape` | Deploy SeiNode with sidecar (Phase 2) | `curl http://:7777/metrics` | Returns `sei_sidecar_` metrics | +| `TestDashboardConfigMapLoads` | Apply dashboard ConfigMap, add to Grafana values | Check Grafana API | Dashboard appears in Grafana | +| `TestPrometheusDiscoversPods` | Deploy controller with annotations | Query Prometheus `/api/v1/targets` | Controller pod appears as active target | + +--- + +## Deployment + +### Phase 1 (controller repo changes 
only) + +**No infrastructure changes required.** Works with existing annotation-based Prometheus. + +| File | Change | +|------|--------| +| `internal/controller/nodegroup/metrics.go` | New — metric registration for SeiNodeGroup | +| `internal/controller/node/metrics.go` | New — metric registration for SeiNode | +| `internal/controller/nodepool/metrics.go` | New — metric registration for SeiNodePool | +| `internal/controller/nodegroup/controller.go` | Add metric emission, substep timing, cleanup | +| `internal/controller/node/controller.go` | Add Recorder field, metric emission, phase transition helper, cleanup, RBAC marker | +| `internal/controller/nodepool/controller.go` | Add Recorder field, metric emission, cleanup, RBAC marker | +| `cmd/main.go` | Wire EventRecorders for SeiNode and SeiNodePool; switch `metrics-secure` default to false | +| `config/manager/manager.yaml` | Add `prometheus.io/*` annotations, declare metrics port | +| Various `*_test.go` | New test cases per Test Specification | + +Run `make manifests` after adding RBAC markers to regenerate ClusterRole. 
+ +### Phase 2 prerequisites (platform repo changes) + +| Task | Owner | Repo | +|------|-------|------| +| Migrate to kube-prometheus-stack | Platform team | platform | +| Enable Alertmanager with receiver | Platform team | platform | +| Enable Grafana sidecar | Platform team | platform | +| Deploy dashboard ConfigMap to monitoring namespace | Platform team | platform (references controller repo) | + +### Phase 2 (controller + seictl repo changes) + +| File | Change | +|------|--------| +| `config/monitoring/service-monitor.yaml` | New — ServiceMonitor for controller | +| `config/monitoring/prometheus-rule.yaml` | New — PrometheusRule with alerts | +| `config/monitoring/dashboards/*.json` | New — Grafana dashboard JSON | +| `config/monitoring/sidecar-pod-monitor.yaml` | New — PodMonitor for sidecar | +| `config/monitoring/networkpolicy.yaml` | New — allow Prometheus→sidecar on 7777 | +| `config/default/kustomization.yaml` | Add `../monitoring` (ServiceMonitor + rule only, not dashboards) | +| seictl: `sidecar/server/server.go` | Add `promhttp.Handler()` on `/metrics` | +| seictl: `sidecar/metrics/metrics.go` | New — sidecar metric registration | + +### Dev vs Prod differences +- Phase 1 (annotation-based) is identical across environments +- Phase 2: dev uses `kube-prometheus-stack` defaults; prod may have different scrape intervals, alert `for` durations, and Alertmanager receivers +- Dashboard ConfigMaps are identical across environments (parameterized by `$namespace` Grafana variable) + +--- + +## Deferred (Do Not Build) + +| Feature | Rationale | +|---------|-----------| +| **Distributed tracing (OpenTelemetry)** | Adds significant complexity; reconcile loops are not RPC-heavy enough to warrant trace propagation yet. Revisit when cross-service debugging is a real pain point. | +| **Custom metrics CRD** | No need for user-configurable metrics; the fixed metric set covers operational needs. 
| +| **Cost attribution metrics** | Requires cloud billing API integration; not a controller concern. | +| **Log-based alerting (Loki alerts)** | Phase 2 Prometheus alerting covers critical paths; Loki alerts can be added ad-hoc without design. | +| **Sidecar tracing** | Same rationale as controller tracing; sidecar tasks are self-contained. | +| **Multi-cluster metrics aggregation** | Single cluster today; Thanos/Cortex federation is an infrastructure concern. | +| **SLA/SLO reporting** | Premature; need baseline data first. Revisit after 30 days of metrics collection. | +| **Secure metrics in Phase 1** | Annotation-based scraping with in-cluster HTTP is simpler and sufficient. Re-enable TLS+auth when ServiceMonitor-based scraping is available in Phase 2. | + +--- + +## Decision Log + +| # | Decision | Rationale | Reversibility | +|---|----------|-----------|---------------| +| 1 | Use `sei_controller_` prefix for all controller metrics | Avoids collision with controller-runtime `controller_runtime_` prefix; groups all custom metrics under one namespace | Two-way: prefix is internal, can rename with a migration | +| 2 | Phase gauge with 0/1 per phase instead of info-style label | Follows kube-state-metrics convention (`kube_pod_status_phase`). Enables simple `== 1` queries and alerting without cardinality issues from multi-valued labels | Two-way: can switch to enum/info metric later | +| 3 | EventRecorder per controller, not shared | Follows controller-runtime convention; event `source.component` field correctly identifies which controller emitted | Two-way: trivial to consolidate | +| 4 | Phase 1 uses annotation-based scraping, not ServiceMonitor | Dev cluster runs community Prometheus chart (not kube-prometheus-stack). ServiceMonitor CRDs exist but no Operator reconciles them. 
Annotations work immediately with existing `extraScrapeConfigs` | Two-way: migrate to ServiceMonitor in Phase 2 | +| 5 | Switch metrics endpoint to HTTP (not HTTPS) in Phase 1 | Annotation-based scraping has no built-in bearer token support. In-cluster HTTP on a non-public port is the standard pattern for controllers scraped by cluster Prometheus. Re-enable TLS+auth in Phase 2 with ServiceMonitor | Two-way: flag-based, can switch back per-environment | +| 6 | Dashboards deployed to monitoring namespace via platform repo reference, not bundled with controller | Controller Kustomization applies `namespace: sei-k8s-controller-system` to all resources. Dashboard ConfigMaps must be in the monitoring namespace for Grafana to read them | Two-way: can change deployment strategy | +| 7 | PodMonitor for sidecar instead of ServiceMonitor (Phase 2) | Sidecar metrics port (7777) is on the pod, not the external Service; PodMonitor directly targets pods by label | Two-way: could use additional ServiceMonitor port instead | +| 8 | Sidecar metrics on same port (7777) as API | Avoids adding another port/listener; `/metrics` is a standard path alongside `/v0/healthz` and `/v0/tasks` | Two-way: can split to separate port later | +| 9 | Custom histogram buckets extending to 60s | `prometheus.DefBuckets` max at 10s; controller substeps and sidecar interactions can exceed this under load. 60s covers controller-runtime reconcile timeout | Two-way: bucket boundaries can be adjusted | +| 10 | Drop `name` label from init duration histogram | Per-resource histograms with N=1 observations produce meaningless percentiles. Aggregate by `chain_id` for statistical value; use per-node gauge for debugging | Two-way: can add `name` label back if needed | +| 11 | Alerting deferred to Phase 2 | Alertmanager is disabled on dev. No destination for fired alerts. 
Enable after kube-prometheus-stack migration | Two-way: PrometheusRule can be deployed at any time | +| 12 | Normalize sidecar HTTP paths to route templates | Raw `path` label has unbounded cardinality risk from parameterized paths (`/v0/tasks/{id}`). Bounded allowlist of 5 route templates caps cardinality | Two-way: can expand allowlist | diff --git a/.tide/seinodegroup-scheduling.md b/.tide/seinodegroup-scheduling.md new file mode 100644 index 0000000..2f586d4 --- /dev/null +++ b/.tide/seinodegroup-scheduling.md @@ -0,0 +1,699 @@ +# Component: SeiNodeGroup Scheduling and Topology + +## Owner + +Kubernetes Specialist + Platform Engineer (joint ownership) + +## Phase + +- **Phase 1**: Implicit AZ spread for grouped nodes (no API change) +- **Phase 2**: Explicit `placement` field on SeiNodeGroup CRD +- **Phase 3**: Full scheduling override (future, deferred) + +## Purpose + +SeiNodeGroups serving RPC traffic via HTTPRoute must survive availability zone failures without manual intervention. Today, all scheduling is hardcoded in `PlatformConfig` (Karpenter nodepool affinity + toleration) with no topology spread constraints and no pod anti-affinity. A 3-replica group could land entirely in one AZ, making the NLB-backed HTTPRoute a single point of failure at the zone level. + +This component adds topology-aware scheduling to the SeiNode controller so that pods belonging to the same SeiNodeGroup are automatically spread across availability zones. It also introduces a PodDisruptionBudget per group so voluntary disruptions (node upgrades, Karpenter consolidation) cannot take down all replicas simultaneously. + +**Business need**: Production RPC availability during AZ failures for SeiNodeGroups with `replicas > 1`. 
+ +## Dependencies + +- **SeiNodeGroup controller** (`sei-node-controller-networking/internal/controller/nodegroup/`) — creates child SeiNode resources with `sei.io/group` pod label +- **SeiNode controller** (`sei-node-controller-networking/internal/controller/node/`) — translates SeiNodeSpec into StatefulSet pod spec via `buildNodePodSpec()`. **Note**: The production binary is `sei-node-controller-networking`, which already has `PodLabels` on `SeiNodeSpec` and merges them into pod template labels via `resourceLabelsForNode`. The `sei-k8s-controller` repo's version does NOT have `PodLabels` support. All Phase 1 changes target the networking controller repo. +- **Karpenter NodePool** — provisions nodes with `karpenter.sh/nodepool` label; respects topology spread constraints when choosing which zone to provision into +- **EBS CSI driver** — creates gp3 RWO PVCs that are zone-bound after initial provisioning. **Prerequisite**: The StorageClass must use `volumeBindingMode: WaitForFirstConsumer` for topology-aware scheduling to work. Immediate binding defeats the spread by binding PVCs to a random zone before the scheduler evaluates topology spread constraints. +- **Does NOT depend on**: Istio, Gateway API, or any networking component. Scheduling is orthogonal to traffic routing. + +## Interface Specification + +### Phase 1: Implicit AZ Spread (no CRD change) + +No new API types. The SeiNode controller detects that a node belongs to a group via the `sei.io/group` pod label and injects a topology spread constraint automatically. + +**Target repo**: `sei-node-controller-networking` (the production binary where `SeiNodeSpec.PodLabels` and `resourceLabelsForNode` merging already exist). 
+ +**Modified function signature** (no change — internal behavior only): + +```go +// resources.go +func buildNodePodSpec(node *seiv1alpha1.SeiNode, platform PlatformConfig) corev1.PodSpec +``` + +**Injected topology spread constraint** (when `node.Spec.PodLabels["sei.io/group"]` is non-empty): + +```go +corev1.TopologySpreadConstraint{ + MaxSkew: 1, + TopologyKey: "topology.kubernetes.io/zone", + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "sei.io/group": node.Spec.PodLabels["sei.io/group"], + }, + }, +} +``` + +Design decisions for Phase 1: + +| Decision | Value | Rationale | +|----------|-------|-----------| +| `MaxSkew` | `1` | Strictest spread — at most 1 more pod in any zone than the least-populated zone | +| `TopologyKey` | `topology.kubernetes.io/zone` | Standard Kubernetes zone label; Karpenter and EKS both populate it | +| `WhenUnsatisfiable` | `ScheduleAnyway` | Prefer spread but don't block scheduling if zones are asymmetric. A node stuck Pending is worse than imperfect spread for Phase 1. Phase 2 lets users choose `DoNotSchedule`. | +| `LabelSelector` | `sei.io/group: {groupName}` | Scoped to the group, not all sei-nodes. Two different groups don't interfere with each other's spread. | + +**Karpenter interaction note**: With `ScheduleAnyway`, Karpenter treats the constraint as a soft preference during batch provisioning. Combined with `karpenter.sh/do-not-disrupt: "true"` on all sei-node pods, the initial placement is permanent — Karpenter will never rebalance pods across zones after creation. Operators should verify AZ distribution after initial group creation with `kubectl get pods -l sei.io/group={name} -o wide` and consider recreating pods if the spread is poor. Phase 2's `DoNotSchedule` option provides stronger guarantees. 
+ +**PodDisruptionBudget** (Phase 1 — created by SeiNodeGroup controller): + +```yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {group-name} + namespace: {group-namespace} +spec: + maxUnavailable: 1 + selector: + matchLabels: + sei.io/group: {group-name} +``` + +The PDB uses `maxUnavailable: 1` (not `minAvailable`) to cap simultaneous disruptions at exactly 1 pod regardless of group size. This is the standard practice for availability-critical workloads — with `minAvailable: 1` a 5-replica group would allow 4 simultaneous disruptions, creating a capacity cliff behind the NLB. + +The PDB is created by the SeiNodeGroup controller (not the SeiNode controller) because it spans all pods in the group. It is only created when `replicas > 1` (a PDB on a single-replica group would block all voluntary disruptions). + +**Scale-up transient**: When scaling from 1 to N replicas, the PDB is created immediately but only 1 pod may be Ready. With `maxUnavailable: 1`, `disruptionsAllowed` may be 0 until new pods complete their multi-hour startup (snapshot restore, state sync). This blocks voluntary disruptions (Karpenter consolidation, node drain) during the scale-up window. This is acceptable — protecting running workloads during scale-up is the safer default. Document this in operational runbooks. + +### Phase 2: Explicit Placement Field (CRD change) + +New types added to `api/v1alpha1/`: + +```go + +// placement_types.go + +// PlacementConfig controls how pods in a SeiNodeGroup are scheduled +// across the cluster topology. +type PlacementConfig struct { + // TopologySpread controls how pods are distributed across failure + // domains. When nil, the controller applies a default AZ spread + // with MaxSkew=1 and WhenUnsatisfiable=ScheduleAnyway. + // +optional + TopologySpread *TopologySpreadConfig `json:"topologySpread,omitempty"` + + // PodDisruptionBudget controls voluntary disruption tolerance. 
+ // When nil, the controller creates a PDB with maxUnavailable=1 + // for groups with replicas > 1. + // +optional + PodDisruptionBudget *PDBConfig `json:"podDisruptionBudget,omitempty"` +} + +// TopologySpreadConfig configures topology spread constraints for the group. +type TopologySpreadConfig struct { + // MaxSkew is the maximum difference in pod count between any two + // topology domains. Defaults to 1. + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:default=1 + // +optional + MaxSkew int32 `json:"maxSkew,omitempty"` + + // WhenUnsatisfiable controls what happens when spread cannot be + // satisfied. "ScheduleAnyway" (default) applies a soft preference. + // "DoNotSchedule" makes it a hard requirement — pods will stay + // Pending if no valid zone exists. + // +kubebuilder:validation:Enum=DoNotSchedule;ScheduleAnyway + // +kubebuilder:default=ScheduleAnyway + // +optional + WhenUnsatisfiable corev1.UnsatisfiableConstraintAction `json:"whenUnsatisfiable,omitempty"` + + // Disabled suppresses the default topology spread constraint. + // Use this for workloads where AZ locality is preferred over + // spread (e.g., low-latency validator sets that benefit from + // same-zone communication). + // +optional + Disabled bool `json:"disabled,omitempty"` +} + +// PDBConfig controls the PodDisruptionBudget for the group. +type PDBConfig struct { + // MaxUnavailable is the maximum number of pods that can be + // simultaneously unavailable during voluntary disruptions. + // Defaults to 1. Only integer values are supported; percentage- + // based values are not supported due to ambiguity at small + // replica counts. + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:default=1 + // +optional + MaxUnavailable *int32 `json:"maxUnavailable,omitempty"` + + // Disabled suppresses PDB creation entirely. 
+ // +optional + Disabled bool `json:"disabled,omitempty"` +} +``` + +**SeiNodeGroupSpec change:** + +```go +type SeiNodeGroupSpec struct { + Replicas int32 `json:"replicas"` + Template SeiNodeTemplate `json:"template"` + DeletionPolicy DeletionPolicy `json:"deletionPolicy,omitempty"` + Networking *NetworkingConfig `json:"networking,omitempty"` + Monitoring *MonitoringConfig `json:"monitoring,omitempty"` + // NEW — Phase 2 + Placement *PlacementConfig `json:"placement,omitempty"` +} +``` + +**SeiNodeSpec change** (to carry topology config down to the SeiNode controller): + +```go +type SeiNodeSpec struct { + // ... existing fields ... + + // Scheduling holds pod scheduling directives passed down from the + // SeiNodeGroup. Not intended for direct user configuration on + // standalone SeiNodes — set via SeiNodeGroup.spec.placement instead. + // +optional + Scheduling *SchedulingConfig `json:"scheduling,omitempty"` +} + +// SchedulingConfig holds pod-level scheduling directives. +type SchedulingConfig struct { + // TopologySpreadConstraints are injected into the StatefulSet pod spec. + // +optional + TopologySpreadConstraints []corev1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"` +} +``` + +**One-way door**: The CRD field names `placement`, `topologySpread`, and `scheduling` become part of the API contract once controllers depend on them. The names are chosen to align with Kubernetes terminology (`placement` is the user-facing group concept; `scheduling` is the lower-level pod concept passed through). + +### Cross-Component Interface: SeiNodeGroup → SeiNode → StatefulSet + +``` +SeiNodeGroup.spec.placement + ↓ (SeiNodeGroup controller: generateSeiNode) +SeiNode.spec.scheduling.topologySpreadConstraints + ↓ (SeiNode controller: buildNodePodSpec) +StatefulSet.spec.template.spec.topologySpreadConstraints +``` + +In Phase 1, the SeiNode controller infers the constraint from `sei.io/group` in `PodLabels`. 
In Phase 2, the SeiNodeGroup controller explicitly sets `SeiNode.spec.scheduling.topologySpreadConstraints` from the resolved placement config, and the SeiNode controller passes it through to the pod spec. + +**Phase 2 invariant**: `buildNodePodSpec` MUST check `node.Spec.Scheduling` FIRST and skip the Phase 1 `sei.io/group` label inference when `Scheduling` is non-nil. This prevents duplicate topology spread constraints during the Phase 1→2 transition. + +**Phase 2 rollout window**: During the brief period where the new SeiNodeGroup controller writes `spec.scheduling` but the old SeiNode controller ignores it, the system falls back to Phase 1 inference. This is safe and produces identical behavior. + +### Status Condition: ConditionPDBReady + +A new status condition `PDBReady` is added to `SeiNodeGroup`, following the pattern of `ExternalServiceReady`, `RouteReady`, `IsolationReady`, and `ServiceMonitorReady`: + +```go +const ConditionPDBReady = "PDBReady" +``` + +- Set to `True` after successful PDB reconciliation +- Set to `False` with reason `ReconcileError` on PDB SSA failure +- Removed via `removeCondition` when `replicas <= 1` or `placement.podDisruptionBudget.disabled == true` + +## State Model + +### Topology Spread State + +| State | Location | Source of Truth | +|-------|----------|-----------------| +| Desired spread config | `SeiNodeGroup.spec.placement` | User / GitOps | +| Resolved per-node constraints | `SeiNode.spec.scheduling` | SeiNodeGroup controller | +| Applied pod constraints | `StatefulSet.spec.template.spec.topologySpreadConstraints` | SeiNode controller | +| Actual pod zone | `pod.spec.nodeName` → node's `topology.kubernetes.io/zone` label | Kubernetes scheduler | + +### PDB State + +| State | Location | Source of Truth | +|-------|----------|-----------------| +| Desired PDB config | `SeiNodeGroup.spec.placement.podDisruptionBudget` | User / GitOps | +| PDB resource | `PodDisruptionBudget/{group-name}` in group namespace | SeiNodeGroup 
controller | +| PDB health condition | `SeiNodeGroup.status.conditions[type=PDBReady]` | SeiNodeGroup controller | +| Disruption budget | `PDB.status.disruptionsAllowed` | Kubernetes PDB controller | + +### State Transitions + +PDB lifecycle: + +``` +SeiNodeGroup created (replicas > 1) → PDB created (maxUnavailable=1), ConditionPDBReady=True +SeiNodeGroup scaled to 1 → PDB deleted, ConditionPDBReady removed +SeiNodeGroup deleted (DeletionPolicy=Delete)→ PDB cascade-deleted via owner reference +SeiNodeGroup deleted (DeletionPolicy=Retain)→ PDB orphaned (owner reference removed), survives with retained pods +``` + +## Internal Design + +### Phase 1 Changes + +#### Target Repository + +All Phase 1 changes target `sei-node-controller-networking`, which is the production binary. This repo already has: +- `SeiNodeSpec.PodLabels` field +- `resourceLabelsForNode` that merges `PodLabels` into pod template labels +- `sei.io/group` set in `PodLabels` by `generateSeiNode` + +Existing StatefulSets created by this controller already have `sei.io/group` in both their selector and pod template labels, so adding `TopologySpreadConstraints` to the pod template does NOT change the selector. SSA will update the pod template and trigger a RollingUpdate of existing pods. **Operator note**: For seid pods with multi-hour startup (snapshot restore), coordinate the topology spread rollout during a maintenance window or accept that pods will restart on next reconcile cycle (30s). + +#### SeiNode Controller: `buildNodePodSpec` (resources.go) + +After building the existing pod spec, check if the node belongs to a group and inject the topology spread constraint: + +```go +func buildNodePodSpec(node *seiv1alpha1.SeiNode, platform PlatformConfig) corev1.PodSpec { + // ... existing spec construction ... 
+ + if groupName := node.Spec.PodLabels[seiv1alpha1.LabelGroup]; groupName != "" { + spec.TopologySpreadConstraints = []corev1.TopologySpreadConstraint{{ + MaxSkew: 1, + TopologyKey: "topology.kubernetes.io/zone", + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + seiv1alpha1.LabelGroup: groupName, + }, + }, + }} + } + + return spec +} +``` + +#### Shared Label Constants: api/v1alpha1/constants.go + +Both controllers need the same label values. Export them from the shared API package: + +```go +// api/v1alpha1/constants.go +const ( + LabelGroup = "sei.io/group" + LabelGroupOrdinal = "sei.io/group-ordinal" + LabelNode = "sei.io/node" +) +``` + +The existing unexported constants in `nodegroup/labels.go` (`groupLabel`, `groupOrdinalLabel`, `nodeLabel`) must be updated to reference these exported constants to prevent drift. + +#### SeiNodeGroup Controller: PDB reconciliation (pdb.go — new file) + +```go +func (r *SeiNodeGroupReconciler) reconcilePDB(ctx context.Context, group *seiv1alpha1.SeiNodeGroup) error { + if group.Spec.Replicas <= 1 { + removeCondition(group, ConditionPDBReady) + return r.deletePDB(ctx, group) + } + + desired := &policyv1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{ + Name: group.Name, + Namespace: group.Namespace, + Labels: resourceLabels(group), + Annotations: managedByAnnotations(), + }, + Spec: policyv1.PodDisruptionBudgetSpec{ + MaxUnavailable: ptr.To(intstr.FromInt32(1)), + Selector: &metav1.LabelSelector{ + MatchLabels: groupSelector(group), + }, + }, + } + if err := ctrl.SetControllerReference(group, desired, r.Scheme); err != nil { + return fmt.Errorf("setting owner reference: %w", err) + } + desired.SetGroupVersionKind(policyv1.SchemeGroupVersion.WithKind("PodDisruptionBudget")) + //nolint:staticcheck // migrating to typed ApplyConfiguration is a separate effort + if err := r.Patch(ctx, desired, client.Apply, fieldOwner, client.ForceOwnership); err != nil { + 
setCondition(group, ConditionPDBReady, metav1.ConditionFalse, + "ReconcileError", fmt.Sprintf("Failed to reconcile PDB: %v", err)) + return err + } + setCondition(group, ConditionPDBReady, metav1.ConditionTrue, "Reconciled", "PDB is up to date") + return nil +} + +func (r *SeiNodeGroupReconciler) deletePDB(ctx context.Context, group *seiv1alpha1.SeiNodeGroup) error { + pdb := &policyv1.PodDisruptionBudget{} + err := r.Get(ctx, types.NamespacedName{Name: group.Name, Namespace: group.Namespace}, pdb) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + if metav1.IsControlledBy(pdb, group) { + return r.Delete(ctx, pdb) + } + return nil +} +``` + +#### SeiNodeGroup Controller: handleDeletion — PDB under Retain policy + +The PDB must be handled in `handleDeletion` for `DeletionPolicy=Retain`. Without this, the owner reference causes cascade deletion, removing disruption protection from retained pods: + +```go +func (r *SeiNodeGroupReconciler) handleDeletion(ctx context.Context, group *seiv1alpha1.SeiNodeGroup) (ctrl.Result, error) { + // ... existing status patch ... + + if policy == seiv1alpha1.DeletionPolicyRetain { + // ... existing orphan calls ... + if err := r.orphanPDB(ctx, group); err != nil { + return ctrl.Result{}, fmt.Errorf("orphaning PDB: %w", err) + } + } else { + // ... existing delete calls ... + // PDB is cascade-deleted via owner reference, no explicit delete needed + } + + // ... existing finalizer removal ... 
+} + +func (r *SeiNodeGroupReconciler) orphanPDB(ctx context.Context, group *seiv1alpha1.SeiNodeGroup) error { + pdb := &policyv1.PodDisruptionBudget{} + err := r.Get(ctx, types.NamespacedName{Name: group.Name, Namespace: group.Namespace}, pdb) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + if !metav1.IsControlledBy(pdb, group) { + return nil + } + patch := client.MergeFrom(pdb.DeepCopy()) + removeOwnerRef(pdb, group) + return r.Patch(ctx, pdb, patch) +} +``` + +The `reconcilePDB` call is added to `Reconcile()` in `controller.go` between `reconcileSeiNodes` and `reconcileNetworking`. + +RBAC marker addition: + +```go +// +kubebuilder:rbac:groups=policy,resources=poddisruptionbudgets,verbs=get;list;watch;create;update;patch;delete +``` + +`SetupWithManager` addition: + +```go +Owns(&policyv1.PodDisruptionBudget{}). +``` + +#### SeiNodeGroup Controller: nodes.go update field propagation + +Add `scheduling` to the narrow set of fields that `ensureSeiNode` propagates on update. In Phase 1 this is a no-op (both sides are nil), but prepares for Phase 2: + +```go +// In ensureSeiNode, after PodLabels comparison: +// Use apiequality.Semantic.DeepEqual (not reflect.DeepEqual) to correctly +// handle nil vs empty slice equivalence after DeepCopy / JSON round-trip. +if !apiequality.Semantic.DeepEqual(existing.Spec.Scheduling, desired.Spec.Scheduling) { + existing.Spec.Scheduling = desired.Spec.Scheduling + updated = true +} +``` + +Import: `apiequality "k8s.io/apimachinery/pkg/api/equality"` + +### Phase 2 Changes (summary) + +1. Add `PlacementConfig`, `TopologySpreadConfig`, `PDBConfig` types to `api/v1alpha1/placement_types.go` +2. Add `Scheduling *SchedulingConfig` to `SeiNodeSpec` +3. Add `Placement *PlacementConfig` to `SeiNodeGroupSpec` +4. In `generateSeiNode`, resolve placement config into `SeiNode.spec.scheduling.topologySpreadConstraints` +5. 
In `buildNodePodSpec`: check `node.Spec.Scheduling` FIRST; if non-nil, use directly and skip `sei.io/group` inference. This prevents duplicate constraints. +6. In `reconcilePDB`, read `maxUnavailable` from `placement.podDisruptionBudget` if set +7. Add `scheduling` to `ensureSeiNode` field propagation (already wired in Phase 1, now active) +8. Regenerate CRDs with `controller-gen` +9. Add unit tests for all new paths +10. Add webhook validation: `maxUnavailable` cannot exceed `replicas - 1`; warn if `DoNotSchedule` + `replicas > number_of_cluster_zones` + +## Error Handling + +| Error Case | Detection | Surface | Operator Action | +|-----------|-----------|---------|-----------------| +| Pod stuck Pending due to topology constraint | Pod has no node assignment after startup timeout | `kubectl describe pod` shows `FailedScheduling` with topology spread violation | Scale the Karpenter NodePool to allow more zones, or relax spread to `ScheduleAnyway` | +| PDB blocks node drain | Karpenter or kubectl drain blocked | `kubectl get pdb` shows `disruptionsAllowed: 0` | Wait for more replicas to be ready, or temporarily delete the PDB for emergency maintenance | +| PDB blocks during scale-up | Scaling from 1 to N, only 1 pod Ready | `kubectl get pdb` shows `disruptionsAllowed: 0` until new pods Ready | Expected behavior — disruptions blocked until new pods complete multi-hour startup. Document in runbooks. | +| PVC zone mismatch on reschedule | Pod scheduled to zone A, PVC bound to zone B | Pod Pending with `volume node affinity conflict` | Fundamental Kubernetes limitation — RWO PVCs are zone-pinned. Requires PVC migration or recreation. | +| PVC zone pinning after scale-down/up | RetainOnDelete PVC persists zone-bound; scale-up forces pod back to old zone | Pod lands in original PVC zone regardless of current spread | Spread may degrade over scale-down/up cycles with RetainOnDelete. Delete PVCs explicitly if AZ rebalancing is needed. 
| +| Topology spread ignored by scheduler | Cluster has only 1 zone available | All pods land in same zone; no error surfaced | Add nodes in additional zones to the Karpenter NodePool | +| StorageClass uses Immediate binding | PVC binds before scheduler evaluates topology | Pods may cluster in one zone despite spread constraint | Ensure StorageClass uses `volumeBindingMode: WaitForFirstConsumer` | +| Suboptimal initial spread (ScheduleAnyway) | Karpenter batch-provisions to one zone | `kubectl get pods -o wide` shows imbalanced AZ distribution | Permanent with `do-not-disrupt`; recreate pods or use `DoNotSchedule` in Phase 2 | +| SSA triggers pod rolling restart | Adding TopologySpreadConstraints to pod template | Existing pods restart on next reconcile (30s) | Coordinate Phase 1 rollout during maintenance window for groups with multi-hour startup | + +## Test Specification + +### Phase 1 Unit Tests + +#### Test: TopologySpread_GroupedNode_InjectsConstraint + +- **Setup**: Create SeiNode with `PodLabels: {"sei.io/group": "rpc-group"}` +- **Action**: Call `generateNodeStatefulSet(node, DefaultPlatformConfig())` +- **Expected**: `sts.Spec.Template.Spec.TopologySpreadConstraints` has length 1, with `MaxSkew=1`, `TopologyKey="topology.kubernetes.io/zone"`, `WhenUnsatisfiable=ScheduleAnyway`, and `LabelSelector` matching `sei.io/group: rpc-group` + +#### Test: TopologySpread_StandaloneNode_NoConstraint + +- **Setup**: Create SeiNode with no `sei.io/group` in PodLabels +- **Action**: Call `generateNodeStatefulSet(node, DefaultPlatformConfig())` +- **Expected**: `sts.Spec.Template.Spec.TopologySpreadConstraints` is nil + +#### Test: TopologySpread_EmptyGroupLabel_NoConstraint + +- **Setup**: Create SeiNode with `PodLabels: {"sei.io/group": ""}` +- **Action**: Call `generateNodeStatefulSet(node, DefaultPlatformConfig())` +- **Expected**: `sts.Spec.Template.Spec.TopologySpreadConstraints` is nil + +#### Test: PDB_MultiReplica_Created + +- **Setup**: Create SeiNodeGroup with 
`replicas: 3` +- **Action**: Reconcile and list PodDisruptionBudgets in the namespace +- **Expected**: PDB exists with name matching group name, `maxUnavailable: 1`, selector `sei.io/group: {name}`, owner reference to the group, `ConditionPDBReady=True` + +#### Test: PDB_SingleReplica_NotCreated + +- **Setup**: Create SeiNodeGroup with `replicas: 1` +- **Action**: Reconcile and list PodDisruptionBudgets +- **Expected**: No PDB exists for this group, no `ConditionPDBReady` in status + +#### Test: PDB_ScaleDownToOne_Deleted + +- **Setup**: Create SeiNodeGroup with `replicas: 3`, reconcile (PDB created). Then update to `replicas: 1`. +- **Action**: Reconcile again +- **Expected**: PDB is deleted, `ConditionPDBReady` removed from status + +#### Test: PDB_GroupDeletion_Cascades + +- **Setup**: Create SeiNodeGroup with `replicas: 3`, reconcile (PDB created). Delete the group with `DeletionPolicy=Delete`. +- **Action**: Observe PDB +- **Expected**: PDB is garbage collected via owner reference + +#### Test: PDB_GroupDeletion_RetainOrphans + +- **Setup**: Create SeiNodeGroup with `replicas: 3`, `DeletionPolicy=Retain`, reconcile (PDB created). Delete the group. 
+- **Action**: Observe PDB +- **Expected**: PDB survives with owner reference removed, continues protecting retained pods + +### Phase 2 Unit Tests + +#### Test: Placement_CustomMaxSkew + +- **Setup**: SeiNodeGroup with `placement.topologySpread.maxSkew: 2` +- **Action**: Reconcile, inspect generated SeiNode's `spec.scheduling` +- **Expected**: TopologySpreadConstraint has `MaxSkew=2` + +#### Test: Placement_DoNotSchedule + +- **Setup**: SeiNodeGroup with `placement.topologySpread.whenUnsatisfiable: DoNotSchedule` +- **Action**: Reconcile, inspect pod spec +- **Expected**: TopologySpreadConstraint has `WhenUnsatisfiable=DoNotSchedule` + +#### Test: Placement_Disabled + +- **Setup**: SeiNodeGroup with `placement.topologySpread.disabled: true` +- **Action**: Reconcile, inspect pod spec +- **Expected**: No topology spread constraints on the pod + +#### Test: Placement_CustomPDB + +- **Setup**: SeiNodeGroup with `replicas: 5`, `placement.podDisruptionBudget.maxUnavailable: 2` +- **Action**: Reconcile, inspect PDB +- **Expected**: PDB has `maxUnavailable: 2` + +#### Test: Placement_PDBDisabled + +- **Setup**: SeiNodeGroup with `placement.podDisruptionBudget.disabled: true` +- **Action**: Reconcile +- **Expected**: No PDB created, no `ConditionPDBReady` + +#### Test: Placement_Nil_DefaultBehavior + +- **Setup**: SeiNodeGroup with no `placement` field, `replicas: 3` +- **Action**: Reconcile +- **Expected**: Default topology spread (MaxSkew=1, ScheduleAnyway) and default PDB (maxUnavailable=1) + +#### Test: Phase1_To_Phase2_Transition_NoDuplicateConstraints + +- **Setup**: Create SeiNode with both `PodLabels: {"sei.io/group": "rpc"}` AND explicit `Scheduling.TopologySpreadConstraints` set +- **Action**: Call `generateNodeStatefulSet(node, DefaultPlatformConfig())` +- **Expected**: Uses `Scheduling.TopologySpreadConstraints` directly; no duplicate constraints from label inference + +### Sample Manifest (Phase 2) + +```yaml +apiVersion: sei.io/v1alpha1 +kind: SeiNodeGroup 
+metadata: + name: pacific-1-rpc + namespace: default +spec: + replicas: 3 + placement: + topologySpread: + maxSkew: 1 + whenUnsatisfiable: DoNotSchedule + podDisruptionBudget: + maxUnavailable: 1 + template: + spec: + chainId: pacific-1 + image: ghcr.io/sei-protocol/sei:v6.3.0 + fullNode: + peers: + - ec2Tags: + region: eu-central-1 + tags: + ChainIdentifier: pacific-1 + networking: + service: + type: ClusterIP + ports: [rpc, evm-rpc, evm-ws, grpc, metrics] + gateway: + parentRef: + name: sei-gateway + namespace: istio-system + hostnames: + - rpc.pacific-1.sei.io +``` + +## Deployment + +### Phase 1 + +- All changes target `sei-node-controller-networking` (the production binary) +- SeiNode controller: `resources.go` (inject topology spread) +- SeiNodeGroup controller: new `pdb.go`, `controller.go` (add `reconcilePDB`, `orphanPDB`, `Owns` for PDB) +- Shared: `api/v1alpha1/constants.go` (exported label constants), `nodegroup/labels.go` (reference shared constants) +- No CRD regeneration required +- **Migration**: Existing StatefulSets created by this controller already have `sei.io/group` in their selector and pod template labels. Adding `TopologySpreadConstraints` only changes the pod template, which SSA updates cleanly. However, this triggers a RollingUpdate of existing pods. For seid pods with multi-hour startup, coordinate the rollout during a maintenance window. +- **Rollout**: Deploy updated controller image. Existing StatefulSets are updated via Server-Side Apply on next reconcile cycle (30s). 
+ +### Phase 2 + +- CRD changes: run `controller-gen` to regenerate `sei.io_seinodegroups.yaml` and `sei.io_seinodes.yaml` +- Requires CRD apply before controller deploy (new fields must exist before controller writes them) +- **Rollout order**: CRDs → controller image +- Backward compatible — `placement: nil` produces identical behavior to Phase 1 defaults +- During the brief rollout window, the SeiNodeGroup controller may write `spec.scheduling` before the SeiNode controller is updated to read it. This is safe — the old controller falls back to Phase 1 label inference. + +## Deferred (Do Not Build) + +| Feature | Rationale | +|---------|-----------| +| Custom `topologyKey` (e.g., `kubernetes.io/hostname` for per-node spread) | AZ spread covers the critical failure domain. Per-node spread is Karpenter's job via resource packing. | +| Pod affinity (co-locate with other workloads) | No business need identified. Sei-node pods run on dedicated node pools. | +| Custom tolerations / node selectors on CRD | Covered by `PlatformConfig` env vars. Per-group overrides add API surface without clear need. | +| Rack-aware topology | EKS doesn't expose rack topology. Revisit if moving to bare metal. | +| Weighted spread across zones | `MaxSkew=1` is sufficient. Weighted policies add complexity without proportional value. | +| Automatic PVC migration on zone failure | Fundamental Kubernetes limitation. Requires snapshot + restore workflow, not a scheduling feature. | +| Multi-cluster scheduling | Out of scope for single-cluster SeiNodeGroup. | +| Percentage-based PDB values | Ambiguous at small replica counts (50% of 3 = 1.5). Integer-only simplifies validation and avoids confusion. | + +## Decision Log + +| # | Decision | Rationale | Reversibility | +|---|----------|-----------|---------------| +| 1 | `ScheduleAnyway` as Phase 1 default | Pods stuck Pending is worse than imperfect spread for initial rollout. Users can opt into `DoNotSchedule` in Phase 2. 
| Two-way: can change default in Phase 2 without migration | +| 2 | PDB owned by SeiNodeGroup (not SeiNode) | PDB spans all pods in the group; creating per-SeiNode PDBs would be incorrect. Owner reference on the group provides cascade cleanup. | Two-way: ownership can be changed by deleting and recreating PDB | +| 3 | `placement` at SeiNodeGroup level, `scheduling` at SeiNode level | Clean separation: group-level intent (placement) vs. pod-level mechanics (scheduling). Mirrors K8s concepts: Deployment has strategy, Pod has scheduling. | One-way: CRD field names become API contract. Names chosen carefully. | +| 4 | Infer topology spread from `sei.io/group` label in Phase 1 | No CRD change, no migration, immediate value. Phase 2 makes it explicit. | Two-way: Phase 2 overrides the inference path | +| 5 | `maxUnavailable: 1` PDB default (not `minAvailable`) | Caps simultaneous disruptions at 1 regardless of group size. `minAvailable: 1` on a 5-replica group allows 4 simultaneous disruptions. | Two-way: config value | +| 6 | PDB orphaned under DeletionPolicy=Retain | Consistent with how SeiNodes and networking resources are orphaned. Retained pods need disruption protection. | Two-way: PDB can be manually deleted after group deletion | +| 7 | `apiequality.Semantic.DeepEqual` for scheduling field comparison | Handles nil vs empty slice equivalence correctly. `reflect.DeepEqual` treats `nil != []T{}`, causing spurious updates on every reconcile after JSON round-trip. | Two-way: implementation detail | +| 8 | Target `sei-node-controller-networking` for all changes | This is the production binary with `PodLabels` support. `sei-k8s-controller`'s `SeiNodeSpec` lacks `PodLabels`. 
| Two-way: can port to other repo later if needed | + +## Cross-Review Resolution Log + +### Blocking Concerns Resolved + +| Source | Concern | Resolution | +|--------|---------|------------| +| kubernetes-specialist | `sei-k8s-controller` SeiNodeSpec has no PodLabels field — code won't compile | Retargeted all changes to `sei-node-controller-networking` where PodLabels already exists. Added to Dependencies and Deployment sections. | +| kubernetes-specialist | Existing StatefulSet selector mismatch on migration | Clarified that the networking controller already includes `sei.io/group` in selectors. No selector change needed — only pod template changes. Documented RollingUpdate side effect. | +| platform-engineer | PDB not handled in handleDeletion for Retain policy | Added `orphanPDB` to the Retain path in handleDeletion. Added test case `PDB_GroupDeletion_RetainOrphans`. | +| platform-engineer | `reflect.DeepEqual` nil vs empty slice causes spurious updates | Changed to `apiequality.Semantic.DeepEqual`. Added to Decision Log. | + +### Non-Blocking Suggestions Incorporated + +| Source | Suggestion | Disposition | +|--------|-----------|-------------| +| kubernetes-specialist | Change PDB from `minAvailable:1` to `maxUnavailable:1` | **Adopted**. Changed throughout. Better caps simultaneous disruptions. | +| kubernetes-specialist | Document Karpenter `do-not-disrupt` interaction | **Adopted**. Added note to Phase 1 Interface Specification. | +| kubernetes-specialist | WaitForFirstConsumer StorageClass prerequisite | **Adopted**. Added to Dependencies and Error Handling. | +| kubernetes-specialist | Scale-up transient PDB blocking | **Adopted**. Added note to PDB section and Error Handling table. | +| kubernetes-specialist | RetainOnDelete PVC zone pinning | **Adopted**. Added to Error Handling table. | +| kubernetes-specialist | Observability for zone spread violations | **Deferred to Phase 2**. Could emit events when actual pod distribution violates MaxSkew. 
| +| kubernetes-specialist | DoNotSchedule as Phase 2 default | **Not adopted**. Keeping ScheduleAnyway default with DoNotSchedule available. Operators who need hard guarantees opt in explicitly. | +| kubernetes-specialist | SSA triggers rolling restart | **Adopted**. Added to Deployment section with maintenance window guidance. | +| platform-engineer | Add ConditionPDBReady status condition | **Adopted**. Added to Interface Specification, State Model, all PDB test cases. | +| platform-engineer | Add `//nolint:staticcheck` on SSA Patch | **Adopted**. Added to PDB reconciliation code. | +| platform-engineer | Update labels.go to reference shared constants | **Adopted**. Added to Internal Design. | +| platform-engineer | Phase 1→2 transition test case | **Adopted**. Added `Phase1_To_Phase2_Transition_NoDuplicateConstraints` test. | +| platform-engineer | Document int-only PDB restriction | **Adopted**. Added to PDBConfig godoc and Deferred table. | +| platform-engineer | Sample manifest | **Adopted**. Added sample Kustomize manifest for Phase 2. | + +## Phased Execution Plan + +### Phase 1 — Ship immediately (no API change) + +**Estimated effort**: 1-2 days + +1. Add exported label constants to `api/v1alpha1/constants.go` (`LabelGroup`, `LabelGroupOrdinal`, `LabelNode`) +2. Update `nodegroup/labels.go` to reference shared constants +3. Modify `buildNodePodSpec` in `resources.go` to inject topology spread when `sei.io/group` is present in PodLabels +4. Add unit tests for topology spread injection (3 test cases) +5. Add `pdb.go` to `internal/controller/nodegroup/` with `reconcilePDB`, `deletePDB`, `orphanPDB` +6. Add `ConditionPDBReady` constant to status condition types +7. Wire `reconcilePDB` into `controller.go` reconcile loop, `orphanPDB` into `handleDeletion` Retain path +8. Add RBAC markers and `Owns(&policyv1.PodDisruptionBudget{})` to `SetupWithManager` +9. Add unit tests for PDB lifecycle (5 test cases including Retain orphan) +10. 
Add `apiequality.Semantic.DeepEqual` comparison for `Scheduling` field in `ensureSeiNode` (no-op in Phase 1) + +### Phase 2 — Fast follow (CRD change) + +**Estimated effort**: 2-3 days + +1. Add `placement_types.go` with `PlacementConfig`, `TopologySpreadConfig`, `PDBConfig` +2. Add `Scheduling *SchedulingConfig` to `SeiNodeSpec` +3. Add `Placement *PlacementConfig` to `SeiNodeGroupSpec` +4. Update `generateSeiNode` to resolve placement → scheduling +5. Update `buildNodePodSpec` to check `node.Spec.Scheduling` FIRST, skip label inference when non-nil +6. Update `reconcilePDB` to read from placement config +7. Regenerate CRDs with `controller-gen` +8. Add webhook validation for `maxUnavailable` bounds +9. Add unit tests for all new paths (7 test cases including Phase 1→2 transition) + +### Phase 3 — Future (deferred) + +Full scheduling override: affinity, tolerations, node selector exposed on the CRD. Only build when a concrete use case requires per-group scheduling beyond topology spread. diff --git a/.tide/validator-migration.md b/.tide/validator-migration.md new file mode 100644 index 0000000..b041731 --- /dev/null +++ b/.tide/validator-migration.md @@ -0,0 +1,411 @@ +# Component: Validator Migration (sei-infra → sei-k8s-controller) + +**Date:** 2026-03-22 +**Status:** Draft — Technical Direction + +--- + +## Owner + +Platform / Infrastructure + +## Phase + +Pre-Tide — Operational prerequisite. Validator availability is a hard requirement for chain liveness; this migration must complete before any Tide workloads depend on the chain. + +## Purpose + +Migrate Sei's consensus-participating validators from the current EC2/Terraform infrastructure (`sei-infra`) onto the Kubernetes-native `SeiNode` controller. The migration must preserve validator uptime and signing continuity with **zero risk of double-signing or slashing**. 
+ +**Key constraint:** At a coordinated block height, validators must begin signing blocks on the new infrastructure and must under no circumstance sign blocks on both old and new infrastructure simultaneously. + +--- + +## Current State + +### sei-infra Validators (EC2) + +| Property | Value | +|----------|-------| +| Infrastructure | EC2 instances managed by Terraform | +| Chains | pacific-1 (2 validators), arctic-1 (30 validators) | +| Instance types | m5.4xlarge, c5.4xlarge, r5.4xlarge | +| Storage | 200 GB root + variable data (gp3) | +| Bootstrap | `init_configure.sh` → `seid init`, mode=validator, genesis ceremony via S3 | +| Key management | `priv_validator_key.json` generated on-host, backed up to S3 | +| Key artifacts | `priv_validator_key.json`, `priv_validator_state.json`, `node_key.json` | +| Peer discovery | `persistent_peers.sh` resolves peers from S3-published node IDs | +| Monitoring | Prometheus with EC2 service discovery | + +### sei-k8s-controller Validators (Kubernetes) + +| Property | Value | +|----------|-------| +| CRD | `ValidatorSpec` with `Peers` and `Snapshot` fields | +| Planner | Sets `mode = "validator"` via sei-config; shares full-node bootstrap | +| Key management | None — no key import, creation, or reference in the spec | +| Sentry/double-sign | Not implemented | +| Sample manifests | None | + +--- + +## Threat Model + +### T1: Double Signing (Slashing — Catastrophic) + +If the old and new validator instances both have the same `priv_validator_key.json` and are running simultaneously, they will sign conflicting blocks at the same height. This triggers on-chain slashing (typically 5% stake for Cosmos SDK chains) and is **irreversible**. + +**Mitigation:** Hard cutover at a coordinated block height. The old instance must be stopped (and confirmed stopped) before the new instance starts signing. The `priv_validator_state.json` must be transferred to prevent signing at any previously-signed height. 
+ +### T2: Missed Blocks (Jailing — Recoverable but Costly) + +If the new instance takes too long to start signing after the old one stops, the validator misses blocks and may be jailed for downtime. Jailing is recoverable (unjail tx), but extended downtime damages reputation and may result in delegation loss. + +**Mitigation:** Minimize the cutover window. Pre-sync the new instance to near-tip before stopping the old one. Target cutover window < 30 seconds. + +### T3: State Corruption During Transfer + +If `priv_validator_state.json` is not transferred or is stale, the new instance may re-sign a height it already signed on the old instance (→ T1), or refuse to sign because its state is ahead of the chain. + +**Mitigation:** Stop old instance → copy latest `priv_validator_state.json` → start new instance. The state file is the single source of truth for what has been signed. + +### T4: Network Partition During Cutover + +If the new instance cannot reach peers immediately after cutover, it will miss blocks even though it's ready to sign. + +**Mitigation:** Pre-configure peers and verify connectivity before cutover. The new instance should be fully synced and peered before the signing key is activated. + +--- + +## Migration Strategy: Staged Cutover + +The migration uses a **pre-sync + hard cutover** pattern. The new Kubernetes validator syncs to near-tip without signing keys, then at a coordinated moment the old instance is stopped and signing state is transferred to the new instance. + +### Phase 1: Pre-Sync (No Signing Keys) + +Deploy a `SeiNode` with `spec.validator` and state sync (or S3 snapshot) to bootstrap and sync to near-tip. **The node does NOT have `priv_validator_key.json` — it runs as a non-signing full node in validator mode.** + +This phase can run for as long as needed. The old validator continues signing normally. There is zero risk because the new instance has no signing key. 
+ +```mermaid +graph LR + subgraph Old["Old EC2 Validator"] + A1[Signing blocks] + A2[Has priv_key] + A3[Active validator] + end + subgraph New["New K8s Validator"] + B1[Syncing to tip] + B2[NO priv_key] + B3[Observer only] + end +``` + +**Exit criteria for Phase 1:** +- New instance is synced to within ~10 blocks of tip +- New instance has stable peer connections +- New instance is healthy (RPC responding, not crash-looping) + +### Phase 2: Hard Cutover (Coordinated) + +This is the critical window. It must be executed as a single atomic sequence: + +``` +1. STOP old EC2 validator (confirm process is dead) +2. COPY priv_validator_key.json from old → new +3. COPY priv_validator_state.json from old → new +4. START signing on new K8s validator (restart seid to pick up keys) +``` + +```mermaid +graph LR + subgraph Old["Old EC2 Validator"] + A1[STOPPED] + A2[priv_key removed] + end + subgraph New["New K8s Validator"] + B1[Picks up keys] + B2[Resumes signing] + B3[Active validator] + end +``` + +**Timing:** The cutover window (step 1 → step 4) should be < 30 seconds to minimize missed blocks. This is achievable because: +- Step 1: SSH kill + confirm (~5s) +- Step 2-3: S3 copy or direct transfer (~5s) +- Step 4: seid reads keys on startup, already synced to near-tip (~10-20s) + +### Phase 3: Verification + +After cutover: +- Confirm new instance is signing blocks (check `signing_info` on-chain) +- Confirm old instance is stopped and stays stopped +- Monitor for missed blocks over next 100 blocks +- Remove old EC2 instance from Terraform state (do NOT destroy yet — keep as cold backup) + +### Phase 4: Cleanup + +After 1000+ blocks with stable signing on new infrastructure: +- Destroy old EC2 instance +- Remove `priv_validator_key.json` from any S3 backups (or rotate) + +--- + +## Controller Changes Required + +### 1. Key Import Mechanism + +The controller needs a way to inject `priv_validator_key.json` and `priv_validator_state.json` into the validator's data directory. 
Options: + +**Option A: Kubernetes Secret + Volume Mount** + +```yaml +spec: + validator: + signingKey: + secretRef: + name: pacific-1-validator-0-keys + # Contains: priv_validator_key.json, priv_validator_state.json +``` + +The controller mounts the Secret as a volume into the seid container at the expected config path. The Secret is created out-of-band (manually or via External Secrets Operator from AWS Secrets Manager). + +- Pro: Standard K8s pattern, integrates with external secret stores +- Pro: Keys never pass through the controller binary +- Con: Secret lifecycle management is manual + +**Option B: Sidecar Task (`import-validator-keys`)** + +A new sidecar task that downloads keys from S3/Secrets Manager and writes them to the data directory during init. + +- Pro: Consistent with existing sidecar task model +- Con: Keys transit through sidecar process +- Con: New task type to implement and test + +**Recommendation: Option A** — simpler, more secure (keys never in controller or sidecar memory), standard Kubernetes pattern. The Secret can be sourced from AWS Secrets Manager via the CSI driver or External Secrets Operator. + +### 2. ValidatorSpec Extension + +```go +type ValidatorSpec struct { + Peers []PeerSource `json:"peers,omitempty"` + Snapshot *SnapshotSource `json:"snapshot,omitempty"` + + // SigningKey references a Secret containing priv_validator_key.json + // and priv_validator_state.json. When set, the controller mounts + // the Secret into the seid container's config directory. + // +optional + SigningKey *SigningKeySource `json:"signingKey,omitempty"` +} + +type SigningKeySource struct { + // SecretRef is a reference to a Kubernetes Secret in the same namespace + // containing the validator's signing key material. + SecretRef corev1.LocalObjectReference `json:"secretRef"` +} +``` + +### 3. 
Resource Generation Changes
+
+When `SigningKey` is set, the StatefulSet pod spec must:
+- Mount the referenced Secret as a volume
+- Map `priv_validator_key.json` → `$HOME/.sei/config/priv_validator_key.json`
+- Map `priv_validator_state.json` → `$HOME/.sei/data/priv_validator_state.json`
+
+The init container (`seid init`) must not overwrite these files if they already exist from the Secret mount.
+
+### 4. Pre-Sync Mode (No Signing Key)
+
+When `SigningKey` is nil, the validator runs as a non-signing observer. This is the Phase 1 state. The node syncs and peers normally but does not participate in consensus.
+
+When `SigningKey` is added (via `kubectl apply` with the updated manifest), the controller updates the StatefulSet to mount the Secret. The pod restarts and picks up the keys.
+
+**This two-step apply is the cutover mechanism:**
+1. Deploy with `signingKey: null` → syncs to tip
+2. Stop old instance, create Secret with keys, apply with `signingKey.secretRef` → pod restarts with keys
+
+---
+
+## Cutover Runbook (per validator)
+
+### Prerequisites
+
+- [ ] New K8s validator SeiNode deployed and synced to within 10 blocks of tip
+- [ ] New K8s validator has stable peer connections (check sidecar logs)
+- [ ] K8s Secret for signing keys does NOT exist yet
+- [ ] Old EC2 validator is running and signing normally
+- [ ] Operator has SSH access to old EC2 instance
+- [ ] Operator has kubectl access to target cluster
+
+### Execution
+
+```bash
+# 1. Confirm new node is synced (pod name carries the StatefulSet ordinal suffix)
+kubectl exec pacific-1-validator-0-0 -c seid -- seid status | jq '.SyncInfo'
+# Verify: catching_up = false, latest_block_height near tip
+
+# 2. Stop old EC2 validator (THE POINT OF NO RETURN)
+ssh validator-0.ec2 'sudo systemctl stop seid && sudo systemctl disable seid'
+# Verify: process is dead
+ssh validator-0.ec2 'pgrep seid' # Should return nothing
+
+# 3. 
Copy signing state from old instance
+ssh validator-0.ec2 'cat /sei/config/priv_validator_key.json' > /tmp/priv_validator_key.json
+ssh validator-0.ec2 'cat /sei/data/priv_validator_state.json' > /tmp/priv_validator_state.json
+
+# 4. Create K8s Secret with signing keys
+kubectl create secret generic pacific-1-validator-0-keys \
+  --from-file=priv_validator_key.json=/tmp/priv_validator_key.json \
+  --from-file=priv_validator_state.json=/tmp/priv_validator_state.json
+
+# 5. Update SeiNode manifest to reference the Secret
+# (apply updated YAML with signingKey.secretRef)
+kubectl apply -f pacific-1-validator-0.yaml
+
+# 6. Wait for pod restart and verify signing
+kubectl logs pacific-1-validator-0-0 -c seid -f | grep "signed proposal"
+
+# 7. Clean up local key copies
+rm /tmp/priv_validator_key.json /tmp/priv_validator_state.json
+```
+
+### Rollback
+
+If the new instance fails to start signing within 60 seconds:
+
+```bash
+# Stop the new K8s validator FIRST and confirm its pod is fully terminated
+kubectl delete seinode pacific-1-validator-0
+# Delete the K8s Secret so no future pod can mount the signing keys
+kubectl delete secret pacific-1-validator-0-keys
+# Only after the new pod is confirmed gone, re-enable the old EC2 validator
+ssh validator-0.ec2 'sudo systemctl enable seid && sudo systemctl start seid'
+```
+
+The old instance will resume from its last signed state. Since the new instance either never signed (no key mounted yet) or signed only at heights after the old one stopped — and is confirmed terminated before the old one restarts — there is no double-sign risk in this rollback path. 
+
+---
+
+## State Model
+
+```mermaid
+stateDiagram-v2
+    [*] --> Phase1
+    state "Phase 1: Pre-Sync (no keys)" as Phase1
+    Phase1 --> Phase2 : Synced to tip + peers stable
+
+    state "Phase 2: Cutover (< 30s)" as Phase2
+    Phase2 --> Phase3 : Old stopped + keys transferred
+
+    state "Phase 3: Verify (100 blocks)" as Phase3
+    Phase3 --> Phase4 : Stable signing confirmed
+
+    state "Phase 4: Cleanup (destroy)" as Phase4
+    Phase4 --> [*]
+```
+
+### Source of truth for signing state
+
+| Artifact | Location | Owner |
+|----------|----------|-------|
+| `priv_validator_key.json` | K8s Secret (sourced from AWS Secrets Manager) | Operator |
+| `priv_validator_state.json` | K8s Secret (initial), then seid updates on-disk | seid process |
+| Chain validator set | On-chain | Consensus |
+
+**Note on `priv_validator_state.json`:** This file is updated by seid after every signed block. The Secret provides the initial state at cutover time. After the first signed block on the new instance, the on-disk copy diverges from the Secret. This is expected — the Secret serves as the bootstrap, not the ongoing source of truth. If the pod restarts, seid reads from the on-disk copy in the PVC, not the Secret mount. Because Kubernetes volume mounts have no precedence ordering, the controller should mount the Secret at a separate staging path and copy it into the PVC on first start (copy-on-first-use), so restarts read the live PVC copy.
+
+---
+
+## Error Handling
+
+| Error | Detection | Response |
+|-------|-----------|----------|
+| New instance not synced at cutover time | `catching_up = true` in seid status | Abort cutover, wait for sync |
+| Old instance not fully stopped | `pgrep seid` returns PID | Kill -9, verify again |
+| Secret mount fails | Pod stuck in `ContainerCreating` | Check Secret exists, check RBAC |
+| seid crashes on startup with keys | Pod in `CrashLoopBackOff` | Check logs, rollback to old instance |
+| Double-sign detected | On-chain evidence, slashing event | **Incident.** Should never happen with this procedure. Investigate root cause. 
| +| Missed blocks during cutover | `signing_info` shows missed blocks | Expected for cutover window. Unjail if jailed. | + +--- + +## Test Specification + +### T1: Pre-sync mode (no signing key) + +**Setup:** Deploy SeiNode with `spec.validator` and `snapshot.stateSync: {}`, no `signingKey`. +**Action:** Wait for node to sync. +**Expected:** Node syncs to tip, does not sign blocks, no `priv_validator_key.json` on disk. + +### T2: Key injection via Secret mount + +**Setup:** Create Secret with test `priv_validator_key.json` and `priv_validator_state.json`. Apply SeiNode with `signingKey.secretRef`. +**Action:** Pod starts. +**Expected:** Key files are present at expected paths. seid loads them without error. + +### T3: Cutover simulation (testnet) + +**Setup:** Two validator instances on arctic-1 — one EC2, one K8s (pre-synced, no keys). +**Action:** Execute cutover runbook. +**Expected:** < 5 missed blocks during cutover. New instance signs blocks. No double-sign evidence on-chain. + +### T4: Rollback simulation + +**Setup:** Same as T3, but simulate failure (kill new instance after key transfer). +**Action:** Execute rollback procedure. +**Expected:** Old instance resumes signing. No double-sign evidence. + +### T5: `priv_validator_state.json` continuity + +**Setup:** New instance receives state file showing last signed height H. +**Action:** New instance starts, chain is at height H+5. +**Expected:** Instance catches up and signs H+6 (or whatever the next proposal is). Does NOT attempt to sign H or earlier. + +--- + +## Deployment + +### Testnet first (arctic-1) + +Arctic-1 has 30 validators. Migrate one validator at a time, starting with the least-staked. Validate the procedure and tooling before touching pacific-1. + +### Mainnet (pacific-1) + +Pacific-1 has 2 validators. Migrate one at a time with the full team on-call. Keep the second EC2 validator running as backup until the first K8s validator has been stable for 24+ hours. 
+ +--- + +## Deferred (Do Not Build) + +| Feature | Rationale | +|---------|-----------| +| HSM / remote signer integration | Not in current sei-infra; add after migration is stable | +| Sentry node topology (private validator + public sentries) | Not in current sei-infra; add as a follow-up | +| Automated cutover orchestration | Manual cutover is safer for first migration; automate after confidence | +| `priv_validator_state.json` auto-sync to Secret | Complex, seid owns this file at runtime; PVC is the source of truth | +| Double-sign protection (Horcrux / TMKMS) | Valuable but not required for migration parity with sei-infra | +| Validator registration (`create-validator` tx) | Only needed for new validators; migrated validators are already registered | + +--- + +## Decision Log + +| # | Decision | Rationale | Reversibility | +|---|----------|-----------|---------------| +| 1 | Use K8s Secret for key injection (not sidecar task) | Keys never transit controller/sidecar memory; standard K8s pattern; integrates with CSI/ESO | Two-way: can switch to sidecar task later | +| 2 | Manual cutover (not automated) | Validator migration is safety-critical; human judgment for first migration reduces blast radius | Two-way: automate after first successful migration | +| 3 | Pre-sync without keys (not with keys + disabled signing) | Eliminates any possibility of accidental signing during sync phase; no code needed to "disable" signing | Two-way: could add a signing-disabled flag later | +| 4 | Testnet-first rollout | Arctic-1 has 30 validators with lower stakes; validates tooling before pacific-1 | One-way in the sense that we learn from it | +| 5 | One validator at a time | Limits blast radius; chain continues with N-1 validators during cutover | Two-way: could parallelize after confidence | + +--- + +## Open Questions + +1. **`priv_validator_state.json` mount strategy:** The Secret mount provides initial state, but seid updates this file on every signed block. 
Should the controller copy from Secret → PVC on first use, or should the Secret be mounted read-only with seid configured to write to the PVC path? This affects pod restart behavior.
+
+2. **Monitoring during cutover:** What metrics/alerts should fire during the cutover window? We need to detect missed blocks in real-time, not after the fact.
+
+3. **Pacific-1 validator count:** With only 2 validators, migrating one leaves a single point of failure. Should we stand up a temporary third validator before migration to provide a safety margin?
+
+4. **Key rotation post-migration:** Should we rotate `priv_validator_key.json` after migration to ensure the old EC2 backup can never accidentally sign? Note that stock Cosmos SDK `MsgEditValidator` cannot change a validator's consensus pubkey (it edits description/commission only), so rotation would require chain-level consensus key rotation support or unbonding and re-creating the validator.
diff --git a/api/v1alpha1/seinodegroup_types.go b/api/v1alpha1/seinodegroup_types.go
index 7cd09f7..2088865 100644
--- a/api/v1alpha1/seinodegroup_types.go
+++ b/api/v1alpha1/seinodegroup_types.go
@@ -121,17 +121,16 @@ type SeiNodeTemplateMeta struct {
 
 // ---------------------------------------------------------------------------
 
 // SeiNodeGroupPhase represents the high-level lifecycle state. 
-// +kubebuilder:validation:Enum=Pending;GenesisCeremony;Initializing;Ready;Degraded;Failed;Terminating +// +kubebuilder:validation:Enum=Pending;Initializing;Ready;Degraded;Failed;Terminating type SeiNodeGroupPhase string const ( - GroupPhasePending SeiNodeGroupPhase = "Pending" - GroupPhaseGenesisCeremony SeiNodeGroupPhase = "GenesisCeremony" - GroupPhaseInitializing SeiNodeGroupPhase = "Initializing" - GroupPhaseReady SeiNodeGroupPhase = "Ready" - GroupPhaseDegraded SeiNodeGroupPhase = "Degraded" - GroupPhaseFailed SeiNodeGroupPhase = "Failed" - GroupPhaseTerminating SeiNodeGroupPhase = "Terminating" + GroupPhasePending SeiNodeGroupPhase = "Pending" + GroupPhaseInitializing SeiNodeGroupPhase = "Initializing" + GroupPhaseReady SeiNodeGroupPhase = "Ready" + GroupPhaseDegraded SeiNodeGroupPhase = "Degraded" + GroupPhaseFailed SeiNodeGroupPhase = "Failed" + GroupPhaseTerminating SeiNodeGroupPhase = "Terminating" ) // SeiNodeGroupStatus defines the observed state of a SeiNodeGroup. @@ -155,9 +154,9 @@ type SeiNodeGroupStatus struct { // +optional Nodes []GroupNodeStatus `json:"nodes,omitempty"` - // AssemblyPlan tracks the genesis assembly task on index 0's sidecar. + // InitPlan tracks group-level initialization tasks (e.g. genesis assembly). // +optional - AssemblyPlan *TaskPlan `json:"assemblyPlan,omitempty"` + InitPlan *TaskPlan `json:"initPlan,omitempty"` // GenesisHash is the SHA-256 hex digest of the assembled genesis.json. 
// +optional diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 5b6280f..36e2eae 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -770,8 +770,8 @@ func (in *SeiNodeGroupStatus) DeepCopyInto(out *SeiNodeGroupStatus) { *out = make([]GroupNodeStatus, len(*in)) copy(*out, *in) } - if in.AssemblyPlan != nil { - in, out := &in.AssemblyPlan, &out.AssemblyPlan + if in.InitPlan != nil { + in, out := &in.InitPlan, &out.InitPlan *out = new(TaskPlan) (*in).DeepCopyInto(*out) } diff --git a/internal/controller/nodegroup/genesis.go b/internal/controller/nodegroup/genesis.go index fa6828c..a6e221b 100644 --- a/internal/controller/nodegroup/genesis.go +++ b/internal/controller/nodegroup/genesis.go @@ -88,23 +88,23 @@ func (r *SeiNodeGroupReconciler) reconcileGenesisAssembly(ctx context.Context, g return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } - if group.Status.AssemblyPlan == nil { + if group.Status.InitPlan == nil { return r.startAssembly(ctx, group) } - if group.Status.AssemblyPlan.Phase == seiv1alpha1.TaskPlanComplete { + if group.Status.InitPlan.Phase == seiv1alpha1.TaskPlanComplete { if group.Status.GenesisS3URI != "" { return ctrl.Result{}, nil } return r.finalizeGenesis(ctx, group, nodes) } - if group.Status.AssemblyPlan.Phase == seiv1alpha1.TaskPlanFailed { + if group.Status.InitPlan.Phase == seiv1alpha1.TaskPlanFailed { return r.setGenesisCondition(ctx, group, metav1.ConditionFalse, "AssemblyFailed", "genesis assembly task failed") } - return r.driveAssemblyPlan(ctx, group, nodes) + return r.driveInitPlan(ctx, group, nodes) } func (r *SeiNodeGroupReconciler) startAssembly(ctx context.Context, group *seiv1alpha1.SeiNodeGroup) (ctrl.Result, error) { @@ -116,10 +116,9 @@ func (r *SeiNodeGroupReconciler) startAssembly(ctx context.Context, group *seiv1 } patch := client.MergeFrom(group.DeepCopy()) - group.Status.AssemblyPlan = plan - group.Status.Phase = 
seiv1alpha1.GroupPhaseGenesisCeremony + group.Status.InitPlan = plan if err := r.Status().Patch(ctx, group, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("setting assembly plan: %w", err) + return ctrl.Result{}, fmt.Errorf("setting init plan: %w", err) } r.Recorder.Event(group, corev1.EventTypeNormal, "GenesisAssemblyStarted", @@ -127,7 +126,7 @@ func (r *SeiNodeGroupReconciler) startAssembly(ctx context.Context, group *seiv1 return ctrl.Result{RequeueAfter: 2 * time.Second}, nil } -func (r *SeiNodeGroupReconciler) driveAssemblyPlan(ctx context.Context, group *seiv1alpha1.SeiNodeGroup, nodes []seiv1alpha1.SeiNode) (ctrl.Result, error) { +func (r *SeiNodeGroupReconciler) driveInitPlan(ctx context.Context, group *seiv1alpha1.SeiNodeGroup, nodes []seiv1alpha1.SeiNode) (ctrl.Result, error) { if len(nodes) == 0 { return ctrl.Result{}, fmt.Errorf("no nodes available for assembly") } @@ -139,7 +138,7 @@ func (r *SeiNodeGroupReconciler) driveAssemblyPlan(ctx context.Context, group *s return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } - plan := group.Status.AssemblyPlan + plan := group.Status.InitPlan task := &plan.Tasks[0] switch task.Status { @@ -170,15 +169,15 @@ func (r *SeiNodeGroupReconciler) driveAssemblyPlan(ctx context.Context, group *s return ctrl.Result{RequeueAfter: 5 * time.Second}, nil case seiv1alpha1.PlannedTaskSubmitted: - return r.pollAssemblyTask(ctx, group, sc, task) + return r.pollInitTask(ctx, group, sc, task) } return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } // TODO(PR-3): poll sidecar GetTask, mark plan Complete/Failed based on result. 
-func (r *SeiNodeGroupReconciler) pollAssemblyTask(ctx context.Context, _ *seiv1alpha1.SeiNodeGroup, _ any, task *seiv1alpha1.PlannedTask) (ctrl.Result, error) { - log.FromContext(ctx).Info("polling assembly task", "taskID", task.TaskID) +func (r *SeiNodeGroupReconciler) pollInitTask(ctx context.Context, _ *seiv1alpha1.SeiNodeGroup, _ any, task *seiv1alpha1.PlannedTask) (ctrl.Result, error) { + log.FromContext(ctx).Info("polling init task", "taskID", task.TaskID) return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } From a694e6c7c65ea0cca7506eb86c2b03475b2cfc38 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Wed, 25 Mar 2026 13:02:36 -0400 Subject: [PATCH 5/5] refactor: remove genesis PreInit, merge all tasks into InitPlan Remove the genesis-specific PreInit Job (sleep infinity + copy-seid), handleGenesisPreInitComplete, and all isGenesisCeremonyNode branches in pre_init.go and job.go. Genesis nodes now skip PreInit entirely. The full genesis InitPlan includes: generate-identity, generate-gentx, upload-genesis-artifacts, await-genesis-assembly (barrier), then the standard configure-genesis, config-apply, discover-peers, config-validate, mark-ready sequence. WIP: executePlan still uses the old submit/poll model. Next step is the TaskExecutionInterface refactor to support the await barrier natively. 
--- internal/controller/node/job.go | 81 +------------------ internal/controller/node/planner.go | 5 +- internal/controller/node/planner_bootstrap.go | 40 ++++----- internal/controller/node/pre_init.go | 25 ------ internal/controller/node/task_builders.go | 9 +++ 5 files changed, 27 insertions(+), 133 deletions(-) diff --git a/internal/controller/node/job.go b/internal/controller/node/job.go index f87b377..f5df7e7 100644 --- a/internal/controller/node/job.go +++ b/internal/controller/node/job.go @@ -24,13 +24,8 @@ func preInitJobName(node *seiv1alpha1.SeiNode) string { func generatePreInitJob(node *seiv1alpha1.SeiNode, platform PlatformConfig) *batchv1.Job { labels := preInitLabelsForNode(node) - var podSpec corev1.PodSpec - if isGenesisCeremonyNode(node) { - podSpec = buildGenesisPreInitPodSpec(node, platform) - } else { - snap := snapshotSourceFor(node) - podSpec = buildPreInitPodSpec(node, snap, platform) - } + snap := snapshotSourceFor(node) + podSpec := buildPreInitPodSpec(node, snap, platform) return &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ @@ -113,78 +108,6 @@ func preInitWaitCommand(port int32, haltHeight int64) (command []string, args [] return []string{"/bin/bash", "-c"}, []string{script} } -// buildGenesisPreInitPodSpec constructs a PodSpec for the genesis ceremony PreInit Job. -// Unlike the snapshot PreInit, the main container runs "sleep infinity" as a keepalive -// (the sidecar does the real work), and an init container copies the seid binary from -// the node image to the PVC so sidecar tasks can invoke it. 
-func buildGenesisPreInitPodSpec(node *seiv1alpha1.SeiNode, platform PlatformConfig) corev1.PodSpec { - serviceName := preInitJobName(node) - - dataVolume := corev1.Volume{ - Name: "data", - VolumeSource: corev1.VolumeSource{ - PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ - ClaimName: nodeDataPVCClaimName(node), - }, - }, - } - - port := sidecarPort(node) - sidecarContainer := corev1.Container{ - Name: "sei-sidecar", - Image: sidecarImage(node), - Command: []string{"seictl", "serve"}, - RestartPolicy: ptr.To(corev1.ContainerRestartPolicyAlways), - Env: []corev1.EnvVar{ - {Name: "SEI_CHAIN_ID", Value: node.Spec.ChainID}, - {Name: "SEI_SIDECAR_PORT", Value: fmt.Sprintf("%d", port)}, - {Name: "SEI_HOME", Value: dataDir}, - }, - Ports: []corev1.ContainerPort{ - {Name: "sidecar", ContainerPort: port, Protocol: corev1.ProtocolTCP}, - }, - VolumeMounts: []corev1.VolumeMount{ - {Name: "data", MountPath: dataDir}, - }, - } - if node.Spec.Sidecar != nil && node.Spec.Sidecar.Resources != nil { - sidecarContainer.Resources = *node.Spec.Sidecar.Resources - } - - copySeid := corev1.Container{ - Name: "copy-seid", - Image: node.Spec.Image, - Command: []string{"sh", "-c", fmt.Sprintf("mkdir -p %s/bin && cp $(which seid) %s/bin/seid", dataDir, dataDir)}, - VolumeMounts: []corev1.VolumeMount{ - {Name: "data", MountPath: dataDir}, - }, - } - - keepalive := corev1.Container{ - Name: "keepalive", - Image: node.Spec.Image, - Command: []string{"sleep", "infinity"}, - VolumeMounts: []corev1.VolumeMount{ - {Name: "data", MountPath: dataDir}, - }, - } - - return corev1.PodSpec{ - Hostname: preInitPodHostname, - Subdomain: serviceName, - ServiceAccountName: platform.ServiceAccount, - ShareProcessNamespace: ptr.To(true), - RestartPolicy: corev1.RestartPolicyNever, - TerminationGracePeriodSeconds: ptr.To(int64(5)), - Tolerations: []corev1.Toleration{ - {Key: platform.TolerationKey, Value: platform.TolerationVal, Effect: corev1.TaintEffectNoSchedule}, - }, - Volumes: 
[]corev1.Volume{dataVolume}, - InitContainers: []corev1.Container{copySeid, sidecarContainer}, - Containers: []corev1.Container{keepalive}, - } -} - func buildPreInitPodSpec(node *seiv1alpha1.SeiNode, snap *seiv1alpha1.SnapshotSource, platform PlatformConfig) corev1.PodSpec { serviceName := preInitJobName(node) diff --git a/internal/controller/node/planner.go b/internal/controller/node/planner.go index 0d24a35..d189331 100644 --- a/internal/controller/node/planner.go +++ b/internal/controller/node/planner.go @@ -80,9 +80,6 @@ func peersFor(node *seiv1alpha1.SeiNode) []seiv1alpha1.PeerSource { // needsPreInit returns true when the node requires a PreInitPlan Job. func needsPreInit(node *seiv1alpha1.SeiNode) bool { - if isGenesisCeremonyNode(node) { - return true - } snap := snapshotSourceFor(node) return snap != nil && snap.BootstrapImage != "" && snap.S3 != nil && snap.S3.TargetHeight > 0 @@ -211,6 +208,8 @@ func buildSharedTask( return generateGentxTaskBuilder(node), nil case taskUploadGenesisArtifacts: return uploadGenesisArtifactsTaskBuilder(node), nil + case taskAwaitGenesisAssembly: + return awaitGenesisAssemblyTaskBuilder(node), nil default: return nil, fmt.Errorf("buildSharedTask: unhandled task type %q", taskType) } diff --git a/internal/controller/node/planner_bootstrap.go b/internal/controller/node/planner_bootstrap.go index d915b1b..6dedc1b 100644 --- a/internal/controller/node/planner_bootstrap.go +++ b/internal/controller/node/planner_bootstrap.go @@ -9,39 +9,23 @@ import ( ) const ( - taskAwaitCondition = sidecar.TaskTypeAwaitCondition - taskGenerateIdentity = sidecar.TaskTypeGenerateIdentity - taskGenerateGentx = sidecar.TaskTypeGenerateGentx - taskUploadGenesisArtifacts = sidecar.TaskTypeUploadGenesisArtifacts + taskAwaitCondition = sidecar.TaskTypeAwaitCondition + taskGenerateIdentity = sidecar.TaskTypeGenerateIdentity + taskGenerateGentx = sidecar.TaskTypeGenerateGentx + taskUploadGenesisArtifacts = sidecar.TaskTypeUploadGenesisArtifacts + 
taskAwaitGenesisAssembly = sidecar.TaskTypeAwaitGenesisAssembly ) -// buildPreInitPlan constructs the task plan for the PreInit Job. For genesis -// ceremony nodes, it builds the 3-task identity/gentx/upload sequence. For +// buildPreInitPlan constructs the task plan for the PreInit Job. For // snapshot-bootstrap nodes, it builds the standard bootstrap sequence. For // all other nodes it returns an empty plan that resolves trivially. func buildPreInitPlan(node *seiv1alpha1.SeiNode, planner NodePlanner) *seiv1alpha1.TaskPlan { - if isGenesisCeremonyNode(node) { - return buildGenesisPreInitPlan() - } if !needsPreInit(node) { return &seiv1alpha1.TaskPlan{Phase: seiv1alpha1.TaskPlanActive, Tasks: []seiv1alpha1.PlannedTask{}} } return planner.BuildPlan(node) } -// buildGenesisPreInitPlan returns a 3-task plan for genesis ceremony PreInit: -// generate validator identity, generate gentx, upload artifacts to S3. -func buildGenesisPreInitPlan() *seiv1alpha1.TaskPlan { - return &seiv1alpha1.TaskPlan{ - Phase: seiv1alpha1.TaskPlanActive, - Tasks: []seiv1alpha1.PlannedTask{ - {Type: taskGenerateIdentity, Status: seiv1alpha1.PlannedTaskPending}, - {Type: taskGenerateGentx, Status: seiv1alpha1.PlannedTaskPending}, - {Type: taskUploadGenesisArtifacts, Status: seiv1alpha1.PlannedTaskPending}, - }, - } -} - // buildPostBootstrapInitPlan constructs a reduced InitPlan for nodes that // completed a PreInit Job. The PVC already contains blockchain data synced // to the target height, so snapshot-restore and configure-state-sync are @@ -63,12 +47,16 @@ func buildPostBootstrapInitPlan(node *seiv1alpha1.SeiNode) *seiv1alpha1.TaskPlan return &seiv1alpha1.TaskPlan{Phase: seiv1alpha1.TaskPlanActive, Tasks: tasks} } -// buildGenesisInitPlan constructs the Init plan for genesis ceremony nodes. 
-// Unlike buildPostBootstrapInitPlan, discover-peers is unconditionally -// included: peers are not yet set at plan-creation time but will be pushed -// by the group controller before the node transitions to Initializing. +// buildGenesisInitPlan constructs the full Init plan for genesis ceremony +// nodes. Per-node artifact generation runs first, then await-genesis-assembly +// blocks until the group controller has assembled and uploaded genesis.json. +// After that the standard configure/apply/peers/validate/ready sequence runs. func buildGenesisInitPlan() *seiv1alpha1.TaskPlan { prog := []string{ + taskGenerateIdentity, + taskGenerateGentx, + taskUploadGenesisArtifacts, + taskAwaitGenesisAssembly, taskConfigureGenesis, taskConfigApply, taskDiscoverPeers, diff --git a/internal/controller/node/pre_init.go b/internal/controller/node/pre_init.go index 8268b11..100cd3d 100644 --- a/internal/controller/node/pre_init.go +++ b/internal/controller/node/pre_init.go @@ -3,7 +3,6 @@ package node import ( "context" "fmt" - "time" sidecar "github.com/sei-protocol/seictl/sidecar/client" batchv1 "k8s.io/api/batch/v1" @@ -25,9 +24,6 @@ func (r *SeiNodeReconciler) reconcilePreInitializing(ctx context.Context, node * plan := node.Status.PreInitPlan if plan.Phase == seiv1alpha1.TaskPlanComplete { - if isGenesisCeremonyNode(node) { - return r.handleGenesisPreInitComplete(ctx, node) - } if len(plan.Tasks) > 0 { job := &batchv1.Job{} key := types.NamespacedName{Name: preInitJobName(node), Namespace: node.Namespace} @@ -106,9 +102,6 @@ func (r *SeiNodeReconciler) reconcilePreInitializing(ctx context.Context, node * } if plan.Phase == seiv1alpha1.TaskPlanComplete { - if isGenesisCeremonyNode(node) { - return r.handleGenesisPreInitComplete(ctx, node) - } if !isJobComplete(job) { log.FromContext(ctx).Info("sidecar tasks complete, waiting for seid to reach halt-height", "job", job.Name) return ctrl.Result{RequeueAfter: taskPollInterval}, nil @@ -127,24 +120,6 @@ func (r 
*SeiNodeReconciler) reconcilePreInitializing(ctx context.Context, node * return result, nil } -// handleGenesisPreInitComplete is called when a genesis ceremony node's PreInit -// plan completes. The genesis PreInit Job runs "sleep infinity" — it never -// completes on its own. The Init plan was already built in reconcilePending -// (the S3 URI is deterministic). We wait for the group controller to push -// static peers (the last step of finalizeGenesis after assembly + nodeID -// collection), then clean up PreInit resources and transition. -func (r *SeiNodeReconciler) handleGenesisPreInitComplete(ctx context.Context, node *seiv1alpha1.SeiNode) (ctrl.Result, error) { - if len(peersFor(node)) == 0 { - log.FromContext(ctx).Info("genesis PreInit complete, waiting for group to push peers") - return ctrl.Result{RequeueAfter: 10 * time.Second}, nil - } - - if err := r.cleanupPreInit(ctx, node); err != nil { - return ctrl.Result{}, fmt.Errorf("cleaning up genesis pre-init resources: %w", err) - } - return r.transitionPhase(ctx, node, seiv1alpha1.PhaseInitializing) -} - // ensurePreInitService creates the headless Service for pre-init Job pod DNS. 
func (r *SeiNodeReconciler) ensurePreInitService(ctx context.Context, node *seiv1alpha1.SeiNode) error { key := types.NamespacedName{Name: preInitJobName(node), Namespace: node.Namespace} diff --git a/internal/controller/node/task_builders.go b/internal/controller/node/task_builders.go index 3b3e594..81bac0e 100644 --- a/internal/controller/node/task_builders.go +++ b/internal/controller/node/task_builders.go @@ -93,6 +93,15 @@ func uploadGenesisArtifactsTaskBuilder(node *seiv1alpha1.SeiNode) sidecar.TaskBu } } +func awaitGenesisAssemblyTaskBuilder(node *seiv1alpha1.SeiNode) sidecar.TaskBuilder { + gc := node.Spec.Validator.GenesisCeremony + return sidecar.AwaitGenesisAssemblyTask{ + S3Bucket: gc.ArtifactS3.Bucket, + S3Prefix: gc.ArtifactS3.Prefix, + S3Region: gc.ArtifactS3.Region, + } +} + // resultExportScheduledTask returns a result-export task builder with a // cron schedule if the node is a replayer with result export enabled. // Returns nil when not applicable.