From e5ea8295d177dfb5bb3db4ee2fd5db99d8a7d695 Mon Sep 17 00:00:00 2001 From: Lyn Nagara Date: Fri, 15 May 2026 11:14:33 -0700 Subject: [PATCH 1/4] locator: Don't crash when on startup when control plane is unreachable In sentry devservices, the proxy and ingest router must be available even when the Sentry application (control plane) isn't up and running yet AND no backup file exists from a previous run. Now the locator can start up even with the initial load fails: this is retried periodically in line with the incremental load loop. The devservices healthcheck now hits the /health endpoint instead of the /ready endpoint. /ready is unchanged and still reports 503 until routing data is loaded, so it can still be used to gate the kubernetes service from routing traffic before control plane data is loaded. --- devservices/proxy.yaml | 2 +- locator/src/control_plane.rs | 6 +-- locator/src/locator.rs | 98 ++++++++++++++++++++++++++++-------- 3 files changed, 82 insertions(+), 24 deletions(-) diff --git a/devservices/proxy.yaml b/devservices/proxy.yaml index dcf1c4c..d6424f2 100644 --- a/devservices/proxy.yaml +++ b/devservices/proxy.yaml @@ -29,7 +29,7 @@ proxy: resolver: cell_from_organization cell_to_upstream: "--monolith--": sentry-dev-monolith - # default: sentry-dev-monolith + default: sentry-dev-monolith metrics: statsd_host: "127.0.0.1" diff --git a/locator/src/control_plane.rs b/locator/src/control_plane.rs index fc0af7a..85e6349 100644 --- a/locator/src/control_plane.rs +++ b/locator/src/control_plane.rs @@ -46,8 +46,8 @@ pub enum ControlPlaneError { ReqwestError(#[from] reqwest::Error), #[error("invalid URL: {0}")] InvalidUrl(String), - #[error("control plane unavailable")] - ControlPlaneRetriesExceeded, + #[error("control plane returned unsuccessful status: {0}")] + ControlPlaneStatus(StatusCode), #[error("missing cursor in response")] MissingCursor, } @@ -201,7 +201,7 @@ impl ControlPlane { retries += 1; continue; } else { - return Err(ControlPlaneError::ControlPlaneRetriesExceeded); + return Err(ControlPlaneError::ControlPlaneStatus(response.status())); } } diff --git a/locator/src/locator.rs b/locator/src/locator.rs index f726c74..17ea238 100644 --- a/locator/src/locator.rs +++ b/locator/src/locator.rs @@ -1,6 +1,6 @@ use crate::config::LocatorDataType; use crate::control_plane::ControlPlane; -use crate::types::RouteData; +use crate::types::{Cell, RouteData}; use std::sync::Arc; use std::time::Instant; @@ -130,7 +130,10 @@ struct RouteDataWithTimestamp { /// This struct is used internally by the Locator. struct IdToCell { control_plane: ControlPlane, - locality_to_default_cell: HashMap, + // Pre-built default cells keyed by locality. Built once at startup so + // default-fallback lookups don't allocate, and stored separately from + // `data.cells` so they survive snapshot reloads. + locality_to_default_cell: HashMap>, data: RwLock, // Keeps track of recently failed lookups to avoid repeated queries against // non-existent or recently deleted organizations/project keys from adding load to the system. @@ -166,9 +169,15 @@ impl IdToCell { last_updated: None, }; + let locality_to_default_cell = locality_to_default_cell + .unwrap_or_default() + .into_iter() + .map(|(locality, id)| (locality.clone(), Arc::new(Cell { id, locality }))) + .collect(); + IdToCell { control_plane: ControlPlane::new(data_type, control_plane_url, localities), - locality_to_default_cell: locality_to_default_cell.unwrap_or_default(), + locality_to_default_cell, data: RwLock::new(data), negative_cache: NegativeCache::new(), update_lock: Semaphore::new(1), @@ -186,8 +195,14 @@ impl IdToCell { // Returns `Ok(Cell)` if found, or a default applies. // Returns an error if locality is passed and the id/locality pair is not valid. // Or if a locality is passed but no default cell is found for that locality + + // Before initial load: skip data/refresh paths and serve via default + // if one applies, otherwise NotReady. if !self.ready.load(Ordering::Relaxed) { - return Err(LocatorError::NotReady); + return locality + .and_then(|loc| self.locality_to_default_cell.get(loc)) + .map(|cell| cell.id.clone()) + .ok_or(LocatorError::NotReady); } let start_lookup = Instant::now(); @@ -241,19 +256,9 @@ impl IdToCell { maybe_cell }; - // If no cell is found, apply locality default - let maybe_cell = if maybe_cell.is_none() { - if let Some(loc) = locality { - let read_guard = self.data.read().await; - self.locality_to_default_cell - .get(loc) - .and_then(|default_cell_id| read_guard.data.cells.get(default_cell_id).cloned()) - } else { - None - } - } else { - maybe_cell - }; + // If no cell is found, apply the locality default + let maybe_cell = maybe_cell + .or_else(|| locality.and_then(|loc| self.locality_to_default_cell.get(loc).cloned())); let cell = maybe_cell.ok_or(LocatorError::NoCell)?; @@ -274,8 +279,16 @@ impl IdToCell { /// command is received. The loop runs indefinitely until the Shutdown /// command is received. pub async fn start(&self, mut rx: mpsc::Receiver) -> Result<(), LoadError> { - self.load_snapshot().await?; - self.ready.store(true, Ordering::Relaxed); + // With defaults configured, a failed initial load is non-fatal: + // the process stays up, /ready returns 503, and the periodic loop + // retries. Without defaults, fail fast as before. + match self.load_snapshot().await { + Ok(()) => self.ready.store(true, Ordering::Relaxed), + Err(err) if !self.locality_to_default_cell.is_empty() => { + tracing::warn!("Initial snapshot load failed: {err:?}; will retry"); + } + Err(err) => return Err(err), + } // Once a snapshot is loaded, the worker periodically requests incremental results // until the Shutdown command is received. @@ -284,7 +297,12 @@ impl IdToCell { loop { tokio::select! { _ = tokio::time::sleep(self.refresh_interval) => { - let _ = self.load_incremental().await; + match self.load_incremental().await { + // Flip ready on first success — handles the case where + // initial snapshot failed but a later incremental succeeded. + Ok(()) => self.ready.store(true, Ordering::Relaxed), + Err(err) => tracing::warn!("Incremental load failed: {err:?}; will retry"), + } } Some(cmd) = rx.recv() => { match cmd { @@ -518,4 +536,44 @@ mod tests { Err(LocatorError::NoCell) ); } + + #[tokio::test] + async fn test_locator_both_unavailable_with_defaults() { + // Cold devservices boot: control plane down, no backup file. With + // defaults configured the locator must stay up and serve via default. + let dir = tempfile::tempdir().unwrap(); + let provider = Arc::new(FilesystemRouteProvider::new( + dir.path().to_str().unwrap(), + "backup.bin", + config::Compression::None, + )); + + let locator = Locator::new( + LocatorDataType::Organization, + "http://invalid-control-plane:8000".to_string(), + provider, + None, + Some(HashMap::from([("--monolith--".into(), "--monolith--".into())])), + ); + + tokio::time::sleep(Duration::from_millis(100)).await; + + assert!(!locator.is_ready()); + + // With matching locality: default applies. + assert_eq!( + locator.lookup("anything", Some("--monolith--")).await, + Ok("--monolith--".into()) + ); + + // No matching default → NotReady (data hasn't loaded). + assert_eq!( + locator.lookup("anything", None).await, + Err(LocatorError::NotReady) + ); + assert_eq!( + locator.lookup("anything", Some("other")).await, + Err(LocatorError::NotReady) + ); + } } From a23b2e2c5f50989bb1dbf24ebd6d0351d2e0a2f9 Mon Sep 17 00:00:00 2001 From: Lyn Nagara Date: Fri, 15 May 2026 16:28:30 -0700 Subject: [PATCH 2/4] fix url --- locator/src/locator.rs | 5 ++++- synapse/src/healthcheck.rs | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/locator/src/locator.rs b/locator/src/locator.rs index 17ea238..39fcd8a 100644 --- a/locator/src/locator.rs +++ b/locator/src/locator.rs @@ -553,7 +553,10 @@ mod tests { "http://invalid-control-plane:8000".to_string(), provider, None, - Some(HashMap::from([("--monolith--".into(), "--monolith--".into())])), + Some(HashMap::from([( + "--monolith--".into(), + "--monolith--".into(), + )])), ); tokio::time::sleep(Duration::from_millis(100)).await; diff --git a/synapse/src/healthcheck.rs b/synapse/src/healthcheck.rs index 26b2bcd..5464f82 100644 --- a/synapse/src/healthcheck.rs +++ b/synapse/src/healthcheck.rs @@ -12,7 +12,7 @@ pub fn run(config_path: &Path) -> Result<(), CliError> { .ok_or(CliError::InvalidConfig( "Missing proxy or ingest-router config", ))?; - let response = reqwest::blocking::get(format!("http://localhost:{port}/ready")) + let response = reqwest::blocking::get(format!("http://localhost:{port}/health")) .map_err(|e| CliError::HealthcheckFailed(e.to_string()))?; if !response.status().is_success() { return Err(CliError::HealthcheckFailed(format!( From 4aab3aff31b4b1a53e58fbf2907b4a155cc0e50c Mon Sep 17 00:00:00 2001 From: Lyn Nagara Date: Mon, 18 May 2026 12:58:40 -0700 Subject: [PATCH 3/4] check store value before writing --- locator/src/locator.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/locator/src/locator.rs b/locator/src/locator.rs index 39fcd8a..a4eb5cb 100644 --- a/locator/src/locator.rs +++ b/locator/src/locator.rs @@ -300,7 +300,11 @@ impl IdToCell { match self.load_incremental().await { // Flip ready on first success — handles the case where // initial snapshot failed but a later incremental succeeded. - Ok(()) => self.ready.store(true, Ordering::Relaxed), + Ok(()) => { + if !self.ready.load(Ordering::Relaxed) { + self.ready.store(true, Ordering::Relaxed); + } + } Err(err) => tracing::warn!("Incremental load failed: {err:?}; will retry"), } } From a7a6ed6785c498d44b0bfd377054bd04fc95c3db Mon Sep 17 00:00:00 2001 From: Lyn Nagara Date: Mon, 18 May 2026 13:17:55 -0700 Subject: [PATCH 4/4] rewrite to better handle and write initial snapshot --- locator/src/locator.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/locator/src/locator.rs b/locator/src/locator.rs index a4eb5cb..976843a 100644 --- a/locator/src/locator.rs +++ b/locator/src/locator.rs @@ -297,15 +297,18 @@ impl IdToCell { loop { tokio::select! { _ = tokio::time::sleep(self.refresh_interval) => { - match self.load_incremental().await { - // Flip ready on first success — handles the case where - // initial snapshot failed but a later incremental succeeded. - Ok(()) => { - if !self.ready.load(Ordering::Relaxed) { - self.ready.store(true, Ordering::Relaxed); - } + // If the initial snapshot failed, keep retrying it (which also writes + // the backup file on success) until we're ready. Only then move to + // incremental updates for steady state. + if self.ready.load(Ordering::Relaxed) { + if let Err(err) = self.load_incremental().await { + tracing::warn!("Incremental load failed: {err:?}; will retry"); + } + } else { + match self.load_snapshot().await { + Ok(()) => self.ready.store(true, Ordering::Relaxed), + Err(err) => tracing::warn!("Snapshot retry failed: {err:?}; will retry"), } - Err(err) => tracing::warn!("Incremental load failed: {err:?}; will retry"), } } Some(cmd) = rx.recv() => {