From 5f971f6d2d8d14eb6d77326dec077169f856c02f Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Thu, 21 May 2026 19:59:44 +0200
Subject: [PATCH 01/45] feat(ewma): add base EWMA crate

Introduce linkerd-ewma, a general-purpose exponentially-weighted moving
average crate. The crate provides five public methods on an Ewma struct:
new (initializes with INFINITY sentinel), get (returns stored value),
add (blends a new sample using exponential decay), add_peak (replaces
stored value when the new sample exceeds it), and add_rate (derives a
rate from the inverse of the elapsed interval and feeds it through add).

This is being added in spite of tower::PeakEwma because this is not
limited to middleware-based RTT computing. We specifically plan to
use this implementation for a load biasing feature and a
success-rate circuit breaker policy, which would otherwise not be
possible.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 Cargo.lock              |   7 +++
 Cargo.toml              |   1 +
 linkerd/ewma/Cargo.toml |  13 +++++
 linkerd/ewma/src/lib.rs | 125 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 146 insertions(+)
 create mode 100644 linkerd/ewma/Cargo.toml
 create mode 100644 linkerd/ewma/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 4578b0ee69..728172de74 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1687,6 +1687,13 @@ dependencies = [
  "pin-project",
 ]
 
+[[package]]
+name = "linkerd-ewma"
+version = "0.1.0"
+dependencies = [
+ "tokio",
+]
+
 [[package]]
 name = "linkerd-exp-backoff"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 7ac6863ca9..cd7cbd45e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,7 @@ members = [
     "linkerd/error",
     "linkerd/errno",
     "linkerd/error-respond",
+    "linkerd/ewma",
     "linkerd/exp-backoff",
     "linkerd/http/access-log",
     "linkerd/http/body-eos",
diff --git a/linkerd/ewma/Cargo.toml b/linkerd/ewma/Cargo.toml
new file mode 100644
index 0000000000..8933c4e5c7
--- /dev/null
+++ b/linkerd/ewma/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "linkerd-ewma"
+version = { workspace = true }
+authors = { workspace = true }
+license = { workspace = true }
+edition = { workspace = true }
+publish = { workspace = true }
+
+[dependencies]
+tokio = { version = "1", features = ["time"] }
+
+[dev-dependencies]
+tokio = { version = "1", features = ["macros", "rt", "test-util"] }
diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
new file mode 100644
index 0000000000..a69601d8b9
--- /dev/null
+++ b/linkerd/ewma/src/lib.rs
@@ -0,0 +1,125 @@
+#![deny(rust_2018_idioms, clippy::disallowed_methods, clippy::disallowed_types)]
+#![forbid(unsafe_code)]
+
+use tokio::time;
+
+/// An exponentially-weighted moving average.
+#[derive(Debug)]
+pub struct Ewma {
+    value: f64,
+    decay: f64,
+    timestamp: time::Instant,
+}
+
+// === impl Ewma ===
+
+impl Ewma {
+    #[must_use]
+    pub fn new(decay: time::Duration, timestamp: time::Instant) -> Self {
+        Self {
+            decay: decay.as_secs_f64(),
+            timestamp,
+            value: f64::INFINITY,
+        }
+    }
+
+    /// Returns the current value of the average.
+    pub fn get(&self) -> f64 {
+        self.value
+    }
+
+    /// Updates the weighted moving average with a new value and timestamp.
+    ///
+    /// Precondition: `value` must not be NaN. Passing NaN poisons the EWMA
+    /// irreversibly (all subsequent reads return NaN).
+    pub fn add(&mut self, value: f64, ts: time::Instant) {
+        debug_assert!(!value.is_nan(), "EWMA input value must not be NaN");
+        if ts <= self.timestamp {
+            return;
+        }
+        if self.value == f64::INFINITY {
+            self.value = value;
+            self.timestamp = ts;
+            return;
+        }
+
+        self.value = {
+            let elapsed = ts.saturating_duration_since(self.timestamp);
+            let alpha = 1.0 - (-elapsed.as_secs_f64() / self.decay).exp();
+            self.value * (1.0 - alpha) + value * alpha
+        };
+
+        self.timestamp = ts;
+    }
+
+    pub fn add_peak(&mut self, value: f64, ts: time::Instant) {
+        if self.value < value {
+            self.value = value;
+            self.timestamp = ts;
+            return;
+        }
+        self.add(value, ts)
+    }
+
+    /// Computes 1/elapsed since the last update and feeds it through `add()`.
+    pub fn add_rate(&mut self, ts: time::Instant) {
+        if ts <= self.timestamp {
+            return;
+        }
+        let elapsed = ts.saturating_duration_since(self.timestamp);
+        if !elapsed.is_zero() {
+            self.add(1.0 / elapsed.as_secs_f64(), ts);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tokio::time::{Duration, Instant};
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_new() {
+        let now = Instant::now();
+        let ewma = Ewma::new(Duration::from_secs(10), now);
+        assert_eq!(ewma.get(), f64::INFINITY);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_add() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+        ewma.add(1.0, now + Duration::from_secs(1));
+        assert_eq!(ewma.get(), 1.0);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_add_rate() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+        ewma.add_rate(now + Duration::from_secs(1));
+        assert_eq!(ewma.get(), 1.0);
+        ewma.add_rate(now + Duration::from_secs(3));
+        assert_eq!(ewma.get(), 0.9093653765389909);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_add_peak() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+        ewma.add_peak(1.0, now + Duration::from_secs(1));
+        assert_eq!(ewma.get(), 1.0);
+        ewma.add_peak(2.0, now + Duration::from_secs_f64(1.5));
+        assert_eq!(ewma.get(), 2.0);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_decay() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+        ewma.add(1.0, now + Duration::from_secs(1));
+        assert_eq!(ewma.get(), 1.0);
+        ewma.add(0.0, now + Duration::from_secs(11));
+        assert_eq!(ewma.get(), 0.36787944117144233);
+    }
+}

From 6cc60229d11a929145de46c53e4e17e3fca45554 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Thu, 21 May 2026 20:00:02 +0200
Subject: [PATCH 02/45] feat(ewma): add success-rate tracking extensions

Extend linkerd-ewma with the API surface needed for success-rate circuit
breaking. A MIN_DECAY constant (1 ms) is now applied in both constructors
so that a zero-duration decay never produces division-by-zero or NaN
results in downstream arithmetic.

New methods: new_with_value sets an explicit initial sample instead of the
INFINITY sentinel, reset overwrites both value and timestamp for breaker
recovery, and get_at projects the stored value forward through exponential
decay without mutating internal state.

Also add_peak is now decay-aware: it projects the stored value to the
candidate timestamp before deciding whether to replace it, and it
unconditionally replaces INFINITY so that the first real sample always
takes effect even at the construction timestamp.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 357 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 354 insertions(+), 3 deletions(-)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index a69601d8b9..872fcfc518 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -3,6 +3,11 @@
 
 use tokio::time;
 
+/// Minimum decay duration to prevent division-by-zero in EWMA computations.
+/// Chosen as the smallest Duration that is strictly positive without overriding
+/// validated configs from the control plane (CP should reject decay=0).
+pub const MIN_DECAY: time::Duration = time::Duration::from_millis(1);
+
 /// An exponentially-weighted moving average.
 #[derive(Debug)]
 pub struct Ewma {
@@ -17,17 +22,62 @@ impl Ewma {
     #[must_use]
     pub fn new(decay: time::Duration, timestamp: time::Instant) -> Self {
         Self {
-            decay: decay.as_secs_f64(),
+            decay: decay.max(MIN_DECAY).as_secs_f64(),
             timestamp,
             value: f64::INFINITY,
         }
     }
 
+    /// Creates an EWMA with a specific initial value.
+    ///
+    /// This constructor allows setting an initial value, useful for
+    /// success rate tracking where you want to start at 100% (1.0)
+    /// success rate.
+    #[must_use]
+    pub fn new_with_value(decay: time::Duration, timestamp: time::Instant, initial: f64) -> Self {
+        debug_assert!(!initial.is_nan(), "EWMA initial value must not be NaN");
+        Self {
+            decay: decay.max(MIN_DECAY).as_secs_f64(),
+            timestamp,
+            value: initial,
+        }
+    }
+
+    /// Resets the EWMA to a new value and timestamp.
+    ///
+    /// This overwrites the current value and timestamp, useful for
+    /// resetting success rate tracking after recovery from a tripped state.
+    ///
+    /// Precondition: `ts` should be <= the timestamps passed to subsequent
+    /// `add()` calls. If it is not, those `add()` calls are silently
+    /// dropped because `add()` discards samples whose timestamp is at or
+    /// before the stored one.
+    pub fn reset(&mut self, value: f64, ts: time::Instant) {
+        debug_assert!(!value.is_nan(), "EWMA reset value must not be NaN");
+        self.value = value;
+        self.timestamp = ts;
+    }
+
     /// Returns the current value of the average.
     pub fn get(&self) -> f64 {
         self.value
     }
 
+    /// Returns the decayed value projected to the given time, without modifying stored state.
+    ///
+    /// Instead of returning the raw stored value, this applies exponential decay based
+    /// on elapsed time since the last update. This is required for load balancing where
+    /// stale measurements should lose influence over time.
+    pub fn get_at(&self, now: time::Instant) -> f64 {
+        debug_assert!(!self.value.is_nan(), "EWMA value must not be NaN");
+
+        if self.value.is_infinite() || now <= self.timestamp {
+            return self.value;
+        }
+        let elapsed = now.saturating_duration_since(self.timestamp);
+        self.value * (-elapsed.as_secs_f64() / self.decay).exp()
+    }
+
     /// Updates the weighted moving average with a new value and timestamp.
     ///
     /// Precondition: `value` must not be NaN. Passing NaN poisons the EWMA
@@ -52,8 +102,16 @@ impl Ewma {
         self.timestamp = ts;
     }
 
+    /// Updates the EWMA with a peak value, replacing the current value if the
+    /// new value exceeds the decayed projection.
+    ///
+    /// When replacement occurs, the stored timestamp is set to `ts`, which
+    /// may be earlier than the previously stored timestamp. This resets the
+    /// decay reference point, so subsequent projections via `get_at()` measure
+    /// elapsed time from `ts`.
     pub fn add_peak(&mut self, value: f64, ts: time::Instant) {
-        if self.value < value {
+        debug_assert!(!value.is_nan(), "EWMA peak value must not be NaN");
+        if self.value.is_infinite() || self.get_at(ts) < value {
             self.value = value;
             self.timestamp = ts;
             return;
@@ -78,6 +136,9 @@ mod tests {
     use super::*;
     use tokio::time::{Duration, Instant};
 
+    // Literal value for exp(-1.0), since it's not const
+    const EXP_NEG1: f64 = 0.36787944117144233;
+
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_new() {
         let now = Instant::now();
@@ -120,6 +181,296 @@ mod tests {
         ewma.add(1.0, now + Duration::from_secs(1));
         assert_eq!(ewma.get(), 1.0);
         ewma.add(0.0, now + Duration::from_secs(11));
-        assert_eq!(ewma.get(), 0.36787944117144233);
+        assert_eq!(ewma.get(), EXP_NEG1);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_new_with_value() {
+        let now = Instant::now();
+        let ewma = Ewma::new_with_value(Duration::from_secs(10), now, 1.0);
+
+        assert_eq!(ewma.get(), 1.0);
+
+        // Verify this behaves like a normal EWMA after initialization
+        let mut ewma = Ewma::new_with_value(Duration::from_secs(10), now, 1.0);
+
+        ewma.add(0.0, now + Duration::from_secs(10));
+        // After one decay period value decays towards zero.
+        assert_eq!(ewma.get(), EXP_NEG1);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_add_peak_from_infinity_same_timestamp() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        assert_eq!(ewma.get(), f64::INFINITY);
+
+        // Same timestamp as construction. The first real value should always
+        // take effect regardless of timestamp.
+        ewma.add_peak(0.5, now);
+        assert_eq!(ewma.get(), 0.5);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_add_peak_replaces_decayed_value() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        // Set initial peak of 10.0 at t=1s
+        ewma.add_peak(10.0, now + Duration::from_secs(1));
+        assert_eq!(ewma.get(), 10.0);
+
+        // After 25s of decay (t=26s), the decayed projection is:
+        // 10.0 * exp(-25/10) = 10.0 * exp(-2.5) = 0.8208...
+        // Since 0.8208 < 1.0, the new value should replace the stale peak.
+        ewma.add_peak(1.0, now + Duration::from_secs(26));
+        assert_eq!(ewma.get(), 1.0);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_zero_decay_clamped_in_new() {
+        let now = Instant::now();
+        let zero = Ewma::new(Duration::ZERO, now);
+        let min = Ewma::new(Duration::from_millis(1), now);
+
+        // Both should produce identical behavior since ZERO gets clamped to 1ms
+        assert_eq!(zero.get(), min.get());
+
+        // After adding a value, get_at should produce finite, identical results
+        let mut zero = Ewma::new(Duration::ZERO, now);
+        let mut min = Ewma::new(Duration::from_millis(1), now);
+        zero.add(5.0, now + Duration::from_secs(1));
+        min.add(5.0, now + Duration::from_secs(1));
+
+        let projected_zero = zero.get_at(now + Duration::from_secs(2));
+        let projected_min = min.get_at(now + Duration::from_secs(2));
+
+        assert!(
+            projected_zero.is_finite(),
+            "get_at must be finite with clamped decay"
+        );
+        assert!(
+            !projected_zero.is_nan(),
+            "get_at must not be NaN with clamped decay"
+        );
+        assert_eq!(projected_zero, projected_min);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_zero_decay_clamped_in_new_with_value() {
+        let now = Instant::now();
+        let zero = Ewma::new_with_value(Duration::ZERO, now, 1.0);
+        let min = Ewma::new_with_value(Duration::from_millis(1), now, 1.0);
+
+        let projected_zero = zero.get_at(now + Duration::from_secs(1));
+        let projected_min = min.get_at(now + Duration::from_secs(1));
+        assert!(
+            projected_zero.is_finite(),
+            "get_at must be finite with clamped decay"
+        );
+        assert!(
+            !projected_zero.is_nan(),
+            "get_at must not be NaN with clamped decay"
+        );
+        assert_eq!(projected_zero, projected_min);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_zero_decay_add_is_safe() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::ZERO, now);
+
+        ewma.add(5.0, now + Duration::from_secs(1));
+        ewma.add(0.5, now + Duration::from_secs(2));
+
+        let val = ewma.get();
+        assert!(
+            val.is_finite(),
+            "add() result must be finite with clamped decay"
+        );
+        assert!(
+            !val.is_nan(),
+            "add() result must not be NaN with clamped decay"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_reset() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        // Set state
+        ewma.add(0.5, now + Duration::from_secs(1));
+        assert_eq!(ewma.get(), 0.5);
+
+        // Reset to a new value
+        ewma.reset(1.0, now + Duration::from_secs(2));
+        assert_eq!(ewma.get(), 1.0);
+
+        // Verify EWMA continues working after reset.
+        ewma.add(0.0, now + Duration::from_secs(12));
+        assert_eq!(ewma.get(), EXP_NEG1);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_get_at_same_timestamp() {
+        let now = Instant::now();
+        let decay = Duration::from_secs(10);
+        let add_at = now + Duration::from_secs(1);
+        let read_at = add_at;
+
+        let mut ewma = Ewma::new(decay, now);
+        ewma.add(0.5, add_at);
+
+        assert_eq!(ewma.get_at(read_at), 0.5);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_get_at_past_timestamp() {
+        let now = Instant::now();
+        let decay = Duration::from_secs(10);
+        let add_at = now + Duration::from_secs(1);
+        let read_at = now + Duration::from_millis(500);
+
+        let mut ewma = Ewma::new(decay, now);
+        ewma.add(0.5, add_at);
+
+        assert_eq!(ewma.get_at(read_at), 0.5);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_get_at_infinity() {
+        let now = Instant::now();
+        let decay = Duration::from_secs(10);
+        let probe_same = now;
+        let probe_near = now + Duration::from_secs(1);
+        let probe_far = now + Duration::from_secs(100);
+
+        // A new Ewma without addding values should project INFINITY
+        // at every timestamp.
+        let ewma = Ewma::new(decay, now);
+        assert_eq!(ewma.get_at(probe_same), f64::INFINITY);
+        assert_eq!(ewma.get_at(probe_near), f64::INFINITY);
+        assert_eq!(ewma.get_at(probe_far), f64::INFINITY);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_get_at_decay() {
+        let now = Instant::now();
+        let decay = Duration::from_secs(10);
+        let add_at = now + Duration::from_secs(1);
+        let read_at = now + Duration::from_secs(11);
+
+        let mut ewma = Ewma::new(decay, now);
+        ewma.add(1.0, add_at);
+
+        // Verify that get_at applies value * exp(-elapsed/decay) correctly
+        // without changing internal state.
+        assert_eq!(ewma.get_at(read_at), EXP_NEG1);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_get_at_large_elapsed() {
+        let now = Instant::now();
+        let decay = Duration::from_secs(10);
+        let add_at = now + Duration::from_secs(1);
+        let read_at = now + Duration::from_secs(3600);
+
+        let mut ewma = Ewma::new(decay, now);
+        ewma.add(1.0, add_at);
+
+        let result = ewma.get_at(read_at);
+        assert!(
+            result.is_finite(),
+            "get_at at large elapsed must be finite, got {result}"
+        );
+        assert!(
+            result < 1e-10,
+            "get_at at large elapsed must be near zero, got {result}"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_get_at_non_mutation() {
+        let now = Instant::now();
+        let decay = Duration::from_secs(10);
+        let first_add_at = now + Duration::from_secs(1);
+        let read_at = now + Duration::from_secs(6);
+
+        let mut ewma = Ewma::new(decay, now);
+        ewma.add(1.0, first_add_at);
+
+        // Take internal state before the read
+        let value_before = ewma.value;
+        let timestamp_before = ewma.timestamp;
+
+        // Read must not mutate
+        let _ = ewma.get_at(read_at);
+
+        assert_eq!(ewma.value, value_before);
+        assert_eq!(ewma.timestamp, timestamp_before);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_get_at_after_reset() {
+        const EWMA_VAL: f64 = 0.5;
+        const DECAY_AT_15S_VAL: f64 = EWMA_VAL * EXP_NEG1;
+
+        let now = Instant::now();
+        let decay = Duration::from_secs(10);
+        let first_add_at = now + Duration::from_secs(1);
+        let reset_at = now + Duration::from_secs(5);
+        let immediate_read_at = reset_at;
+        let decay_read_at = now + Duration::from_secs(15);
+
+        let mut ewma = Ewma::new(decay, now);
+        // Use a large value here to make sure we detect any issues with
+        // reset not working properly, since we'd likely see a wildly
+        // different value in the assert below.
+        ewma.add(100.0, first_add_at);
+
+        // Reset replaces both value and timestamp
+        ewma.reset(EWMA_VAL, reset_at);
+
+        // Must return the freshly-reset value
+        assert_eq!(ewma.get_at(immediate_read_at), EWMA_VAL);
+
+        // Read one decay period (10s) after the reset (so 15s decay).
+        assert_eq!(ewma.get_at(decay_read_at), DECAY_AT_15S_VAL);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_get_at_between_adds() {
+        const EXP_NEG0_5: f64 = 0.6065306597126334;
+
+        let now = Instant::now();
+        let decay = Duration::from_secs(10);
+        let first_add_at = now + Duration::from_secs(1);
+        let read_at = now + Duration::from_secs(6);
+        let second_add_at = now + Duration::from_secs(11);
+
+        let mut ewma = Ewma::new(decay, now);
+        ewma.add(1.0, first_add_at);
+
+        assert_eq!(ewma.get_at(read_at), EXP_NEG0_5);
+
+        ewma.add(0.0, second_add_at);
+
+        // Final state after the second add must be exp(-1.0),
+        // ensuring get_at() doesn't change the internal state.
+        assert_eq!(ewma.get(), EXP_NEG1);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    #[cfg(debug_assertions)]
+    #[should_panic(expected = "NaN")]
+    async fn test_get_at_debug_asserts_nan() {
+        let now = Instant::now();
+        // Inject NaN via new_with_value
+        let ewma = Ewma::new_with_value(Duration::from_secs(10), now, f64::NAN);
+
+        // Should trigger a debug_assert
+        let _ = ewma.get_at(now);
     }
 }

From 03485467c6ab10c6ba89628fcd158e97d4233909 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Thu, 7 May 2026 14:09:55 +0200
Subject: [PATCH 03/45] feat(classify): add Retry-After and gRPC pushback
 parsers

Add a retry_after module to linkerd-http-classify with shared parsing
functions for extracting backoff hints from HTTP and gRPC responses.

parse_retry_after handles 429/503 responses with both delay-seconds and
HTTP-date formats per RFC 7231, capping the returned duration at a
caller-specified maximum. parse_grpc_retry_pushback reads the
grpc-retry-pushback-ms header per the gRPC A6 spec, rejecting negative
values and capping positive ones.

We use the httpdate crate for the actual RFC 7231 HTTP-date parsing.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 Cargo.lock                               |   1 +
 Cargo.toml                               |   1 +
 linkerd/http/classify/Cargo.toml         |   1 +
 linkerd/http/classify/src/lib.rs         |   1 +
 linkerd/http/classify/src/retry_after.rs | 320 +++++++++++++++++++++++
 5 files changed, 324 insertions(+)
 create mode 100644 linkerd/http/classify/src/retry_after.rs

diff --git a/Cargo.lock b/Cargo.lock
index 728172de74..278b7031f7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1758,6 +1758,7 @@ dependencies = [
  "futures",
  "http",
  "http-body",
+ "httpdate",
  "linkerd-error",
  "linkerd-http-box",
  "linkerd-stack",
diff --git a/Cargo.toml b/Cargo.toml
index cd7cbd45e4..f10ee0c3f6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -106,6 +106,7 @@ drain = { version = "0.2", default-features = false }
 h2 = { version = "0.4" }
 http = { version = "1" }
 http-body = { version = "1" }
+httpdate = { version = "1.0" }
 hyper = { version = "1", default-features = false }
 prometheus-client = { version = "0.23" }
 prost = { version = "0.14" }
diff --git a/linkerd/http/classify/Cargo.toml b/linkerd/http/classify/Cargo.toml
index c7a7911e40..e5c23eb0b7 100644
--- a/linkerd/http/classify/Cargo.toml
+++ b/linkerd/http/classify/Cargo.toml
@@ -10,6 +10,7 @@ publish = { workspace = true }
 futures = { version = "0.3", default-features = false }
 http = { workspace = true }
 http-body = { workspace = true }
+httpdate.workspace = true
 pin-project = "1"
 tokio = { version = "1", default-features = false }
 tracing = { workspace = true }
diff --git a/linkerd/http/classify/src/lib.rs b/linkerd/http/classify/src/lib.rs
index e25f2b7701..fcc9c5ef70 100644
--- a/linkerd/http/classify/src/lib.rs
+++ b/linkerd/http/classify/src/lib.rs
@@ -12,6 +12,7 @@ pub use self::{
 mod channel;
 pub mod gate;
 mod insert;
+pub mod retry_after;
 
 /// Determines how a request's response should be classified.
 pub trait Classify {
diff --git a/linkerd/http/classify/src/retry_after.rs b/linkerd/http/classify/src/retry_after.rs
new file mode 100644
index 0000000000..9b00ec4303
--- /dev/null
+++ b/linkerd/http/classify/src/retry_after.rs
@@ -0,0 +1,320 @@
+//! Shared parsing for HTTP Retry-After headers and gRPC retry-pushback-ms trailers.
+//!
+//! This module provides pure parsing functions with no classification or store logic.
+//! Both the load-biaser and the circuit breaker can use these functions to extract
+//! backoff hints from HTTP and gRPC responses.
+
+use http::{HeaderMap, StatusCode};
+use std::time::Duration;
+
+/// Parse the Retry-After header from a 429 or 503 response.
+///
+/// Supports two formats per RFC 7231:
+/// - delay-seconds: `Retry-After: 120` -> 120 seconds
+/// - HTTP-date: `Retry-After: Wed, 21 Oct 2025 07:28:00 GMT` -> duration from now
+///
+/// Returns `None` for:
+/// - Non-429/503 responses
+/// - Missing Retry-After header
+/// - Invalid header formats
+///
+/// The returned duration is capped at `max` to prevent abuse.
+pub fn parse_retry_after(
+    status: StatusCode,
+    headers: &HeaderMap,
+    max: Duration,
+) -> Option<Duration> {
+    // Only parse for 429 and 503 responses
+    if status != StatusCode::TOO_MANY_REQUESTS && status != StatusCode::SERVICE_UNAVAILABLE {
+        return None;
+    }
+
+    let value = headers.get(http::header::RETRY_AFTER)?;
+    let s = value.to_str().ok()?;
+
+    parse_retry_after_value(s, max)
+}
+
+/// Parse a Retry-After header value string.
+///
+/// Tries delay-seconds first (most common), then HTTP-date format.
+/// The returned duration is capped at `max`.
+fn parse_retry_after_value(s: &str, max: Duration) -> Option<Duration> {
+    let s = s.trim();
+    // Try delay-seconds first (most common format)
+    if let Ok(secs) = s.parse::<u64>() {
+        let duration = Duration::from_secs(secs);
+        tracing::debug!(?duration, "Parsed Retry-After delay-seconds");
+        return Some(duration.min(max));
+    }
+
+    // Try HTTP-date format
+    if let Ok(datetime) = httpdate::parse_http_date(s) {
+        let now = std::time::SystemTime::now();
+        match datetime.duration_since(now) {
+            Ok(duration) => {
+                tracing::debug!(?duration, "Parsed Retry-After HTTP-date");
+                return Some(duration.min(max));
+            }
+            Err(_) => {
+                tracing::debug!("Retry-After HTTP-date is in the past");
+                return Some(Duration::ZERO);
+            }
+        }
+    }
+
+    tracing::debug!(%s, "Failed to parse Retry-After header");
+    None
+}
+
+/// The grpc-retry-pushback-ms header/trailer name.
+const GRPC_RETRY_PUSHBACK_MS: &str = "grpc-retry-pushback-ms";
+
+/// Parse grpc-retry-pushback-ms from headers or trailers.
+///
+/// Per gRPC A6 spec:
+/// - Positive i64: retry after this many milliseconds
+/// - Negative i64: do not retry (returns `None`)
+///
+/// This function does **not** check grpc-status; that is a classification
+/// concern left to the caller.
+///
+/// Returns `None` for:
+/// - Missing header/trailer
+/// - Negative values (interpreted as "do not retry")
+/// - Invalid formats
+///
+/// The returned duration is capped at `max` to prevent abuse.
+pub fn parse_grpc_retry_pushback(headers: &HeaderMap, max: Duration) -> Option<Duration> {
+    let value = headers.get(GRPC_RETRY_PUSHBACK_MS)?;
+    let s = value.to_str().ok()?;
+
+    // Parse as i64 to handle potential negative values
+    let ms: i64 = match s.trim().parse() {
+        Ok(v) => v,
+        Err(_) => {
+            tracing::debug!(%s, "Failed to parse grpc-retry-pushback-ms");
+            return None;
+        }
+    };
+
+    // Negative values mean "do not retry".
+    if ms < 0 {
+        tracing::debug!(ms, "Ignoring negative grpc-retry-pushback-ms");
+        return None;
+    }
+
+    let duration = Duration::from_millis(ms as u64);
+    tracing::debug!(?duration, "Parsed grpc-retry-pushback-ms");
+    Some(duration.min(max))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use http::header::RETRY_AFTER;
+    use http::HeaderValue;
+    use std::time::SystemTime;
+
+    const MAX: Duration = Duration::from_secs(300);
+
+    // === Retry-After tests ===
+
+    #[test]
+    fn parse_delay_seconds() {
+        let mut headers = HeaderMap::new();
+        headers.insert(http::header::RETRY_AFTER, HeaderValue::from_static("120"));
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, Some(Duration::from_secs(120)));
+    }
+
+    #[test]
+    fn parse_delay_seconds_zero() {
+        let mut headers = HeaderMap::new();
+        headers.insert(http::header::RETRY_AFTER, HeaderValue::from_static("0"));
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, Some(Duration::ZERO));
+    }
+
+    #[test]
+    fn caps_at_max() {
+        let mut headers = HeaderMap::new();
+        headers.insert(http::header::RETRY_AFTER, HeaderValue::from_static("3600"));
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, Some(MAX));
+    }
+
+    #[test]
+    fn parses_503() {
+        let mut headers = HeaderMap::new();
+        headers.insert(http::header::RETRY_AFTER, HeaderValue::from_static("120"));
+
+        let result = parse_retry_after(StatusCode::SERVICE_UNAVAILABLE, &headers, MAX);
+        assert_eq!(result, Some(Duration::from_secs(120)));
+    }
+
+    #[test]
+    fn ignores_other_status() {
+        let mut headers = HeaderMap::new();
+        headers.insert(http::header::RETRY_AFTER, HeaderValue::from_static("120"));
+
+        let result = parse_retry_after(StatusCode::OK, &headers, MAX);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn ignores_missing_header() {
+        let headers = HeaderMap::new();
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn ignores_invalid_value() {
+        let mut headers = HeaderMap::new();
+        headers.insert(
+            http::header::RETRY_AFTER,
+            HeaderValue::from_static("not-a-number"),
+        );
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn retry_after_http_date_in_past() {
+        let mut headers = HeaderMap::new();
+        // Use a date in the past
+        headers.insert(
+            RETRY_AFTER,
+            "Wed, 01 Jan 2020 00:00:00 GMT".parse().unwrap(),
+        );
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, Some(Duration::ZERO));
+    }
+
+    #[test]
+    fn retry_after_http_date_in_future() {
+        let target = SystemTime::now() + Duration::from_secs(60);
+        let date_str = httpdate::fmt_http_date(target);
+        let mut headers = HeaderMap::new();
+        headers.insert(RETRY_AFTER, date_str.parse().unwrap());
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        let dur = result.expect("should parse future HTTP-date");
+
+        // httpdate truncates sub-seconds (1s resolution), and we can have
+        // some differences for clock time under CI load or NTP adjustment
+        // so test parsing correctness within a wide enough time window.
+        assert!(
+            dur >= Duration::from_secs(55) && dur <= Duration::from_secs(65),
+            "expected ~60s, got {:?}",
+            dur,
+        );
+    }
+
+    #[test]
+    fn retry_after_http_date_caps_at_max() {
+        let target = SystemTime::now() + Duration::from_secs(600);
+        let date_str = httpdate::fmt_http_date(target);
+        let mut headers = HeaderMap::new();
+        headers.insert(RETRY_AFTER, date_str.parse().unwrap());
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, Some(MAX));
+    }
+
+    // === gRPC pushback tests ===
+
+    const MAX_GRPC: Duration = Duration::from_millis(300_000);
+
+    #[test]
+    fn parse_grpc_pushback_positive() {
+        let mut headers = HeaderMap::new();
+        headers.insert("grpc-retry-pushback-ms", HeaderValue::from_static("5000"));
+
+        let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
+        assert_eq!(result, Some(Duration::from_millis(5000)));
+    }
+
+    #[test]
+    fn parse_grpc_pushback_zero() {
+        let mut headers = HeaderMap::new();
+        headers.insert("grpc-retry-pushback-ms", HeaderValue::from_static("0"));
+
+        let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
+        assert_eq!(result, Some(Duration::ZERO));
+    }
+
+    #[test]
+    fn parse_grpc_pushback_negative() {
+        let mut headers = HeaderMap::new();
+        headers.insert("grpc-retry-pushback-ms", HeaderValue::from_static("-1"));
+
+        let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn parse_grpc_pushback_caps_at_max() {
+        let mut headers = HeaderMap::new();
+        headers.insert("grpc-retry-pushback-ms", HeaderValue::from_static("999999"));
+
+        let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
+        assert_eq!(result, Some(MAX_GRPC));
+    }
+
+    #[test]
+    fn parse_grpc_pushback_missing() {
+        let headers = HeaderMap::new();
+
+        let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn parse_grpc_pushback_invalid() {
+        let mut headers = HeaderMap::new();
+        headers.insert(
+            "grpc-retry-pushback-ms",
+            HeaderValue::from_static("not-a-number"),
+        );
+
+        let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
+        assert_eq!(result, None);
+    }
+
+    // === Whitespace handling tests ===
+
+    #[test]
+    fn retry_after_trailing_whitespace() {
+        let mut headers = HeaderMap::new();
+        headers.insert(RETRY_AFTER, "120 ".parse().unwrap());
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, Some(Duration::from_secs(120)));
+    }
+
+    #[test]
+    fn retry_after_leading_whitespace() {
+        let mut headers = HeaderMap::new();
+        headers.insert(RETRY_AFTER, " 120".parse().unwrap());
+
+        let result = parse_retry_after(StatusCode::TOO_MANY_REQUESTS, &headers, MAX);
+        assert_eq!(result, Some(Duration::from_secs(120)));
+    }
+
+    #[test]
+    fn grpc_pushback_whitespace() {
+        let mut headers = HeaderMap::new();
+        headers.insert("grpc-retry-pushback-ms", " 5000 ".parse().unwrap());
+
+        let result = parse_grpc_retry_pushback(&headers, MAX);
+        assert_eq!(result, Some(Duration::from_millis(5000)));
+    }
+}

From 8a7fbb2837ee1e4fb6a21c6edf7d6322af8e05e6 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Thu, 21 May 2026 20:07:52 +0200
Subject: [PATCH 04/45] feat(load-biaser): add load biasing crate with RTT
 tracking and failure penalties

Introduce the linkerd-load-biaser crate, which wraps any tower::Service to
provide per-endpoint load metrics for P2C balancing. The crate tracks request
latency via EWMA and injects penalties when failure responses are detected,
steering traffic away from unhealthy endpoints.

Penalty injection covers HTTP 429/503/5xx and gRPC RESOURCE_EXHAUSTED/UNAVAILABLE
trailers-only responses (not streaming gRPC failures since we can only
access headers here). For responses with backoff hints, Retry-After on
HTTP 429/503 or grpc-retry-pushback-ms on gRPC trailers-only errors, the
penalty is amplified so that the EWMA value remains meaningful through
the server-requested backoff window. The amplification is clamped to
prevent infinity from permanently disabling the endpoint.

The load metric is computed as `max(rtt * (pending + 1), penalty)`, where
`rtt` is the peak-EWMA latency, and `pending` is the number of in-flight
requests. This is returned via tower::load::Load for direct P2C
integration.

The load biaser is disabled by default, preserving RTT-only behavior
(PeakEwma equivalent), unless explicitly activated.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 Cargo.lock                     |  18 +
 Cargo.toml                     |   1 +
 linkerd/load-biaser/Cargo.toml |  29 ++
 linkerd/load-biaser/src/lib.rs | 597 +++++++++++++++++++++++++++++++++
 4 files changed, 645 insertions(+)
 create mode 100644 linkerd/load-biaser/Cargo.toml
 create mode 100644 linkerd/load-biaser/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 278b7031f7..743cfe1206 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1987,6 +1987,24 @@ dependencies = [
  "tokio-util",
 ]
 
+[[package]]
+name = "linkerd-load-biaser"
+version = "0.1.0"
+dependencies = [
+ "futures",
+ "http",
+ "linkerd-ewma",
+ "linkerd-http-classify",
+ "linkerd-stack",
+ "parking_lot",
+ "pin-project",
+ "tokio",
+ "tokio-test",
+ "tower",
+ "tower-service",
+ "tracing",
+]
+
 [[package]]
 name = "linkerd-meshtls"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index f10ee0c3f6..bc2dfc2e95 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -43,6 +43,7 @@ members = [
     "linkerd/identity",
     "linkerd/idle-cache",
     "linkerd/io",
+    "linkerd/load-biaser",
     "linkerd/meshtls",
     "linkerd/meshtls/verifier",
     "linkerd/metrics",
diff --git a/linkerd/load-biaser/Cargo.toml b/linkerd/load-biaser/Cargo.toml
new file mode 100644
index 0000000000..a8fa0fb29a
--- /dev/null
+++ b/linkerd/load-biaser/Cargo.toml
@@ -0,0 +1,29 @@
+[package]
+name = "linkerd-load-biaser"
+version = { workspace = true }
+authors = { workspace = true }
+license = { workspace = true }
+edition = { workspace = true }
+publish = { workspace = true }
+
+[features]
+default = []
+tokio-test = ["dep:tokio-test"]
+
+[dependencies]
+linkerd-ewma = { path = "../ewma" }
+futures = { version = "0.3", default-features = false }
+http = { workspace = true }
+linkerd-http-classify = { path = "../http/classify" }
+linkerd-stack = { path = "../stack" }
+parking_lot = "0.12"
+pin-project = "1"
+tokio = { version = "1", features = ["io-util", "net", "time"] }
+tokio-test = { version = "0.4", optional = true }
+tower = { workspace = true, features = ["load"] }
+tower-service = { workspace = true }
+tracing = { workspace = true }
+
+[dev-dependencies]
+tokio = { version = "1", features = ["macros", "rt", "time"] }
+tokio-test = "0.4"
diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
new file mode 100644
index 0000000000..7254ca67ef
--- /dev/null
+++ b/linkerd/load-biaser/src/lib.rs
@@ -0,0 +1,597 @@
+//! Load tracking with response failure awareness.
+//!
+//! This module provides a `LoadBiaser` wrapper that tracks request latency (RTT)
+//! and detects failure responses (HTTP 429, 503, 5xx), applying artificial
+//! penalties. Unlike Tower's `PeakEwma`, this implementation uses
+//! `linkerd_ewma::Ewma` and returns `f64` metrics directly, enabling
+//! integration with P2C load balancing.
+//!
+//! This can wrap any service (`Load` trait not required) and it tracks RTT via
+//! EWMA, which is updated when responses complete, and pending requests for load
+//! calculation.
+//!
+//! When a failure response is detected, a penalty is applied and the EWMA jumps
+//! to a high value. The load is calculated as `max(rtt * (pending + 1), penalty)`
+//! (so at least RTT with no pending requests).
+//!
+//! Both RTT and penalty decay over time via the EWMA.
+//!
+//! For this type to work responses must implement the `ResponseFailureHint`
+//! trait, which classifies responses into failure categories (rate-limited,
+//! service unavailable, internal error). For non-HTTP responses the default
+//! implementation returns no failure hint, so only RTT tracking occurs.
+//!
+//! For gRPC, `failure_hint()` detects `RESOURCE_EXHAUSTED` (code 8) and
+//! `UNAVAILABLE` (code 14) from the `grpc-status` header in trailers-only
+//! (unary) responses. Streaming gRPC puts `grpc-status` in trailers which
+//! are not visible at response-head time; the circuit breaker handles
+//! streaming gRPC failures via `GrpcRetryPushbackClassifyEos`.
+
+#![deny(rust_2018_idioms, clippy::disallowed_methods, clippy::disallowed_types)]
+#![forbid(unsafe_code)]
+
+use futures::ready;
+use linkerd_ewma::{Ewma, MIN_DECAY};
+use linkerd_stack::NewService;
+use parking_lot::RwLock;
+use pin_project::pin_project;
+use std::{
+    future::Future,
+    marker::PhantomData,
+    pin::Pin,
+    sync::{
+        atomic::{AtomicU32, Ordering},
+        Arc,
+    },
+    task::{Context, Poll},
+    time::Duration,
+};
+use tokio::time::Instant;
+use tower::load::Load;
+use tower_service::Service;
+
+/// Default maximum duration for Retry-After hints.
+pub const DEFAULT_RETRY_AFTER_MAX_DURATION: Duration = Duration::from_secs(300);
+
+/// Classification of response failures for load biasing.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum FailureHint {
+    /// HTTP 429 Too Many Requests
+    RateLimited,
+    /// HTTP 503 Service Unavailable
+    ServiceUnavailable,
+    /// HTTP 5xx (other than 503)
+    InternalError,
+}
+
+/// Cached Retry-After hint stored in HTTP response extensions.
+///
+/// Stores the **uncapped** parsed value. Each consumer applies its own cap
+/// via `rate_limit_hint(max)`, so different callers (e.g. load biaser vs
+/// circuit breaker) can use different maximums from the same cached value.
+#[derive(Clone, Copy, Debug)]
+pub struct CachedRateLimitHint(pub Duration);
+
+/// Trait for extracting failure hints from responses.
+///
+/// This allows the load biaser to classify responses and apply appropriate
+/// penalties. Default implementations return None (no failure detected),
+/// which is appropriate for non-HTTP transports.
+///
+/// The trait splits rate limit hint access into two methods to avoid
+/// requiring `&mut self` on the read path:
+/// - `attach_parsed_rate_limit_hint(&mut self, max)`: parse and cache (needs `&mut`)
+/// - `rate_limit_hint(&self, max)`: read cached value or parse on-read (only needs `&self`)
+///
+/// # Stack ordering and caching
+///
+/// In the current proxy stack `RetryAfterClassify::start()` (the circuit breaker's
+/// classifier) calls `rate_limit_hint()` before `LoadBiaserFuture::poll()` calls
+/// `attach_parsed_rate_limit_hint()`, because responses flow inner-to-outer. This
+/// means the cache is always cold for the circuit breaker path.
+pub trait ResponseFailureHint {
+    /// Returns a failure hint if the response indicates a failure condition.
+    fn failure_hint(&self) -> Option<FailureHint> {
+        None
+    }
+
+    /// Parse and cache the raw (uncapped) rate limit hint from this response.
+    ///
+    /// The `_max` parameter is accepted for API symmetry with `rate_limit_hint(max)`
+    /// but is intentionally unused. The raw uncapped value is cached so that each
+    /// consumer can apply their own cap via `rate_limit_hint(max)`.
+    fn attach_parsed_rate_limit_hint(&mut self, _max: Duration) {}
+
+    /// Returns the rate limit hint if available.
+    ///
+    /// Checks cached value first (from a previous `attach_parsed_rate_limit_hint` call).
+    /// If no cached value, attempts to parse the header directly (capping at `max`).
+    /// Returns `None` only if the header is absent or unparseable.
+    fn rate_limit_hint(&self, _max: Duration) -> Option<Duration> {
+        None
+    }
+}
+
+/// HTTP responses classify failures by status code and parse Retry-After hints.
+///
+/// For gRPC responses (which arrive as HTTP 200 with `grpc-status` in headers
+/// for trailers-only/unary errors), the `grpc-status` header is checked when
+/// the HTTP status is 200:
+/// - gRPC status 8 (RESOURCE_EXHAUSTED) -> `RateLimited`
+/// - gRPC status 14 (UNAVAILABLE) -> `ServiceUnavailable`
+/// - Other non-zero gRPC status codes -> `InternalError`
+impl<B> ResponseFailureHint for http::Response<B> {
+    fn failure_hint(&self) -> Option<FailureHint> {
+        let status = self.status();
+        if status == http::StatusCode::TOO_MANY_REQUESTS {
+            Some(FailureHint::RateLimited)
+        } else if status == http::StatusCode::SERVICE_UNAVAILABLE {
+            Some(FailureHint::ServiceUnavailable)
+        } else if status.is_server_error() {
+            Some(FailureHint::InternalError)
+        } else if status == http::StatusCode::OK {
+            // gRPC trailers-only responses: grpc-status appears in headers.
+            // Note: for streaming gRPC, grpc-status is in trailers (not headers)
+            // and will not be detected here. The circuit breaker handles streaming
+            // gRPC failures via GrpcRetryPushbackClassifyEos.
+            self.headers()
+                .get("grpc-status")
+                .and_then(|v| v.to_str().ok())
+                .and_then(|s| s.parse::<u16>().ok())
+                .and_then(|code| match code {
+                    0 => None,                                   // OK
+                    8 => Some(FailureHint::RateLimited),         // RESOURCE_EXHAUSTED
+                    14 => Some(FailureHint::ServiceUnavailable), // UNAVAILABLE
+                    _ => Some(FailureHint::InternalError),       // Other non-zero
+                })
+        } else {
+            None
+        }
+    }
+
+    fn attach_parsed_rate_limit_hint(&mut self, _max: Duration) {
+        // Store the uncapped value. Each consumer applies their own cap via
+        // rate_limit_hint(max).
+        if let Some(d) = linkerd_http_classify::retry_after::parse_retry_after(
+            self.status(),
+            self.headers(),
+            Duration::MAX,
+        ) {
+            self.extensions_mut().insert(CachedRateLimitHint(d));
+            return;
+        }
+        // Try gRPC retry-pushback-ms (for trailers-only responses)
+        if self.status() == http::StatusCode::OK {
+            if let Some(d) = linkerd_http_classify::retry_after::parse_grpc_retry_pushback(
+                self.headers(),
+                Duration::MAX,
+            ) {
+                self.extensions_mut().insert(CachedRateLimitHint(d));
+            }
+        }
+    }
+
+    fn rate_limit_hint(&self, max: Duration) -> Option<Duration> {
+        // Check cache first (from previous attach call), apply caller's cap
+        if let Some(cached) = self.extensions().get::<CachedRateLimitHint>() {
+            return Some(cached.0.min(max));
+        }
+        // Parse on-read as fallback (header present but attach wasn't called)
+        if let Some(d) = linkerd_http_classify::retry_after::parse_retry_after(
+            self.status(),
+            self.headers(),
+            max,
+        ) {
+            return Some(d);
+        }
+        // Try gRPC pushback
+        if self.status() == http::StatusCode::OK {
+            if let Some(d) =
+                linkerd_http_classify::retry_after::parse_grpc_retry_pushback(self.headers(), max)
+            {
+                return Some(d);
+            }
+        }
+        // No header or unparseable
+        None
+    }
+}
+
+/// Connection tuples (Connection, Metadata) used by TCP/TLS paths never indicate failures.
+/// This is meant for MakeConnection services that return `(I, M)` tuples.
+impl<C, M> ResponseFailureHint for (C, M) {}
+
+/// TCP streams never indicate failures (no HTTP status codes).
+impl ResponseFailureHint for tokio::net::TcpStream {}
+
+/// Duplex streams (used in testing) never indicate failures.
+impl ResponseFailureHint for tokio::io::DuplexStream {}
+
+/// Mock IO streams never indicate failures.
+#[cfg(feature = "tokio-test")]
+impl ResponseFailureHint for tokio_test::io::Mock {}
+
+/// Amplification factor for Retry-After penalties.
+///
+/// When a 429 or 503 response includes a Retry-After header, the load biaser injects
+/// an amplified penalty so it remains meaningful through the server's requested
+/// avoidance window. The injected value is:
+///
+///   `penalty_secs * RETRY_AFTER_PENALTY_FACTOR * exp(retry_after / penalty_decay)`
+///
+/// which decays via EWMA to `penalty_secs * RETRY_AFTER_PENALTY_FACTOR` at
+/// exactly `t = retry_after`. The factor controls how aggressively the endpoint
+/// is avoided near the Retry-After deadline:
+///
+/// At 1.0 the endpoint is fully avoided (~0% traffic) through the window and 46s
+/// beyond -- too aggressive for early-recovery discovery. At 0.1, traffic leaks
+/// well before the deadline. We use 0.5: the penalty at deadline is penalty_secs/2
+/// (roughly 50x healthy load in a 5-endpoint pool), which lets occasional probes
+/// through in the second half while still exceeding a plain failure penalty for
+/// RA >= ~7s. For pools with 100+ endpoints the factor barely matters because P2C
+/// random pair selection already makes the penalized endpoint unlikely to be drawn.
+///
+/// Sending occasional probe traffic during the window is valuable: the endpoint
+/// may recover early, or a fresh 429 or 503 with a different RA provides updated
+/// information. The circuit breaker (when enabled) provides a strict backoff
+/// window, ie. `max(backoff, retry_after)`.
+const RETRY_AFTER_PENALTY_FACTOR: f64 = 0.5;
+
+/// Configuration for LoadBiaser behavior.
+#[derive(Clone, Debug)]
+pub struct LoadBiaserConfig {
+    /// Default RTT to use when no measurements are available
+    pub default_rtt: Duration,
+
+    /// Decay duration for the RTT EWMA.
+    /// Controls how quickly RTT estimates adapt to changing latency
+    pub rtt_decay: Duration,
+
+    /// The penalty value to inject on failure responses (429, 503, 5xx) in seconds
+    pub penalty_secs: f64,
+
+    /// Decay duration for the penalty EWMA.
+    /// Controls how quickly the penalty decays after a failure response
+    pub penalty_decay: Duration,
+
+    /// Whether load biasing penalties are enabled. When false, only RTT tracking
+    /// is active (PeakEwma equivalent).
+    pub enabled: bool,
+
+    /// Maximum Retry-After duration to honor. Clamped to this value.
+    pub max_duration: Duration,
+}
+
+impl Default for LoadBiaserConfig {
+    fn default() -> Self {
+        Self {
+            default_rtt: Duration::from_secs(1),
+            rtt_decay: Duration::from_secs(10),
+            // 5 second penalty on failure responses (429, 503, 5xx)
+            penalty_secs: 5.0,
+            // 10 second decay for penalty - mostly gone after ~30 seconds
+            penalty_decay: Duration::from_secs(10),
+            enabled: false,
+            max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
+        }
+    }
+}
+
+/// Shared per-endpoint state behind a single `Arc`.
+///
+/// Combines the EWMA trackers (under a mutex for atomic RTT+penalty reads),
+/// the in-flight request counter, and the immutable config fields that every
+/// response future needs. One allocation per endpoint instead of two, and
+/// futures carry a single `Arc` clone instead of copying config fields.
+#[derive(Debug)]
+struct SharedState {
+    /// RTT and penalty EWMAs; read-locked in load(), write-locked in poll()
+    metrics: RwLock<LoadMetrics>,
+    /// Count of in-flight requests (up on call, down on response)
+    pending: AtomicU32,
+    /// Penalty value to inject on failure responses (in seconds)
+    penalty_secs: f64,
+    /// Whether penalty injection is enabled
+    enabled: bool,
+    /// Maximum Retry-After duration to honor (clamped)
+    max_duration: Duration,
+    /// Decay duration for the penalty EWMA (used to amplify Retry-After penalties)
+    penalty_decay: Duration,
+}
+
+#[derive(Debug)]
+struct LoadMetrics {
+    /// EWMA RTT tracking, updated on each response
+    rtt: Ewma,
+    /// EWMA penalty tracking, updated on failure responses (429, 503, 5xx)
+    penalty: Ewma,
+}
+
+/// A service wrapper that tracks RTT and biases load metrics based on failure responses.
+///
+/// `LoadBiaser` provides load metrics for P2C load balancing by tracking request latency
+/// (RTT) via EWMA, in-flight requests, and by injecting penalties when failure responses
+/// (429, 503, 5xx) are detected.
+///
+/// The `load()` method returns `max(rtt * (pending + 1), penalty)`, causing P2C to
+/// prefer endpoints with lower latency and fewer in-flight requests, while avoiding
+/// rate-limited endpoints.
+#[derive(Debug)]
+pub struct LoadBiaser<S> {
+    inner: S,
+    /// Per-endpoint metrics, pending counter, and penalty config
+    shared: Arc<SharedState>,
+    config: LoadBiaserConfig,
+}
+
+/// A `NewService` implementation that creates `LoadBiaser` wrappers.
+#[derive(Debug)]
+pub struct NewLoadBiaser<N, Req> {
+    inner: N,
+    config: LoadBiaserConfig,
+    _marker: PhantomData<fn(Req)>,
+}
+
+/// Response future that tracks RTT and checks for failure responses.
+///
+/// When the inner future completes we store the RTT based on elapsed time since the
+/// request started, decrement the pending counter, and if the response indicates a
+/// failure (using the `ResponseFailureHint` trait) we inject a penalty.
+#[pin_project(PinnedDrop)]
+pub struct LoadBiaserFuture<F, Rsp> {
+    #[pin]
+    inner: F,
+    /// Request start instant for RTT calculation
+    start: Instant,
+    /// Shared endpoint state (metrics, pending counter, penalty config)
+    shared: Arc<SharedState>,
+    /// Whether we've already decremented pending
+    completed: bool,
+    /// Marker for the response type (`ResponseFailureHint` bound)
+    _response: PhantomData<fn() -> Rsp>,
+}
+
+impl<S> LoadBiaser<S> {
+    /// Creates a new `LoadBiaser` wrapping the given service.
+    pub fn new(inner: S, mut config: LoadBiaserConfig) -> Self {
+        if config.penalty_secs.is_nan() || config.penalty_secs < 0.0 {
+            tracing::warn!(
+                penalty_secs = config.penalty_secs,
+                "penalty_secs is NaN or negative, clamping to 0.0"
+            );
+            config.penalty_secs = 0.0;
+        }
+        if config.penalty_decay < MIN_DECAY {
+            tracing::warn!(
+                penalty_decay = ?config.penalty_decay,
+                min = ?MIN_DECAY,
+                "penalty_decay below minimum, will be clamped by EWMA constructor"
+            );
+        }
+        if config.rtt_decay < MIN_DECAY {
+            tracing::warn!(
+                rtt_decay = ?config.rtt_decay,
+                min = ?MIN_DECAY,
+                "rtt_decay below minimum, will be clamped by EWMA constructor"
+            );
+        }
+        let now = Instant::now();
+        let shared = Arc::new(SharedState {
+            metrics: RwLock::new(LoadMetrics {
+                // Initialize RTT with default_rtt (not INFINITY) so that fresh
+                // endpoints have comparable load to unmeasured ones. This matches
+                // Tower's PeakEwma behavior and prevents P2C from permanently
+                // preferring endpoints whose first request happened to be fast.
+                rtt: Ewma::new_with_value(config.rtt_decay, now, config.default_rtt.as_secs_f64()),
+                penalty: Ewma::new(config.penalty_decay, now),
+            }),
+            pending: AtomicU32::new(0),
+            penalty_secs: config.penalty_secs,
+            enabled: config.enabled,
+            max_duration: config.max_duration,
+            penalty_decay: config.penalty_decay,
+        });
+        Self {
+            inner,
+            shared,
+            config,
+        }
+    }
+
+    /// Returns a reference to the inner service.
+    pub fn get_ref(&self) -> &S {
+        &self.inner
+    }
+}
+
+impl<S: Clone> Clone for LoadBiaser<S> {
+    fn clone(&self) -> Self {
+        Self {
+            inner: self.inner.clone(),
+            shared: self.shared.clone(),
+            config: self.config.clone(),
+        }
+    }
+}
+
+impl<S> Load for LoadBiaser<S> {
+    type Metric = f64;
+
+    fn load(&self) -> Self::Metric {
+        let pending = self.shared.pending.load(Ordering::Acquire);
+        let now = Instant::now();
+
+        let (rtt, penalty_val) = {
+            let metrics = self.shared.metrics.read();
+            (metrics.rtt.get_at(now), metrics.penalty.get_at(now))
+        };
+
+        let penalty = if penalty_val.is_infinite() {
+            0.0
+        } else {
+            penalty_val
+        };
+
+        // Load = RTT * (pending + 1), minimum will be `penalty`
+        // The +1 ensures idle endpoints have some load based on RTT
+        let base = rtt * f64::from(pending.saturating_add(1));
+        let load = f64::max(base, penalty);
+
+        tracing::trace!(
+            rtt_secs = rtt,
+            pending = pending,
+            penalty_secs = penalty,
+            load = load,
+            "LoadBiaser::load"
+        );
+
+        load
+    }
+}
+
+impl<S, Req> Service<Req> for LoadBiaser<S>
+where
+    S: Service<Req>,
+    S::Response: ResponseFailureHint,
+{
+    type Response = S::Response;
+    type Error = S::Error;
+    type Future = LoadBiaserFuture<S::Future, S::Response>;
+
+    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+        self.inner.poll_ready(cx)
+    }
+
+    fn call(&mut self, req: Req) -> Self::Future {
+        let prev = self.shared.pending.fetch_add(1, Ordering::AcqRel);
+        debug_assert!(prev < u32::MAX, "pending counter overflow");
+
+        LoadBiaserFuture {
+            inner: self.inner.call(req),
+            start: Instant::now(),
+            shared: self.shared.clone(),
+            completed: false,
+            _response: PhantomData,
+        }
+    }
+}
+
+impl<N, Req> NewLoadBiaser<N, Req> {
+    /// Creates a new `NewLoadBiaser` with the given configuration.
+    pub fn new(config: LoadBiaserConfig, inner: N) -> Self {
+        Self {
+            inner,
+            config,
+            _marker: PhantomData,
+        }
+    }
+}
+
+impl<N: Clone, Req> Clone for NewLoadBiaser<N, Req> {
+    fn clone(&self) -> Self {
+        Self {
+            inner: self.inner.clone(),
+            config: self.config.clone(),
+            _marker: PhantomData,
+        }
+    }
+}
+
+impl<T, N, S, Req> NewService<T> for NewLoadBiaser<N, Req>
+where
+    N: NewService<T, Service = S>,
+{
+    type Service = LoadBiaser<S>;
+
+    fn new_service(&self, target: T) -> LoadBiaser<S> {
+        LoadBiaser::new(self.inner.new_service(target), self.config.clone())
+    }
+}
+
+impl<F, Rsp, E> Future for LoadBiaserFuture<F, Rsp>
+where
+    F: Future<Output = Result<Rsp, E>>,
+    Rsp: ResponseFailureHint,
+{
+    type Output = F::Output;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let this = self.project();
+        let mut result = ready!(this.inner.poll(cx));
+        let shared = &**this.shared;
+
+        let now = Instant::now();
+        let elapsed = now.saturating_duration_since(*this.start).as_secs_f64();
+
+        // Parse rate limit hint while we have &mut access to the response (when load
+        // biasing is enabled).
+        if shared.enabled {
+            if let Ok(ref mut resp) = result {
+                resp.attach_parsed_rate_limit_hint(shared.max_duration);
+            }
+        }
+
+        {
+            let mut metrics = shared.metrics.write();
+            if let Ok(ref resp) = result {
+                // Update RTT on all HTTP responses (including 429, 503, 5xx).
+                // Only transport-level errors (connection refused, resets) skip
+                // this update, because broken endpoints fail fast and would bias
+                // P2C toward sending more traffic to the broken endpoint.
+                metrics.rtt.add_peak(elapsed, now);
+
+                if shared.enabled {
+                    if let Some(hint) = resp.failure_hint() {
+                        let base_penalty = shared.penalty_secs;
+
+                        // For rate-limited and service-unavailable responses, amplify
+                        // the penalty using the Retry-After hint so it remains meaningful
+                        // through the server's requested avoidance window.
+                        let penalty_val = match hint {
+                            FailureHint::RateLimited | FailureHint::ServiceUnavailable => {
+                                match resp.rate_limit_hint(shared.max_duration) {
+                                    Some(ra) if ra.as_secs_f64() > 0.0 => {
+                                        let decay_secs =
+                                            shared.penalty_decay.max(MIN_DECAY).as_secs_f64();
+                                        let amplified = base_penalty
+                                            * RETRY_AFTER_PENALTY_FACTOR
+                                            * (ra.as_secs_f64() / decay_secs).exp();
+                                        amplified.min(1e12)
+                                    }
+                                    _ => base_penalty,
+                                }
+                            }
+                            FailureHint::InternalError => base_penalty,
+                        };
+
+                        tracing::debug!(
+                            penalty_secs = penalty_val,
+                            rtt_secs = elapsed,
+                            ?hint,
+                            "Detected failure response - injecting load penalty"
+                        );
+                        metrics.penalty.add_peak(penalty_val, now);
+                    }
+                }
+            }
+        }
+
+        shared.pending.fetch_sub(1, Ordering::Release);
+        *this.completed = true;
+
+        Poll::Ready(result)
+    }
+}
+
+#[pin_project::pinned_drop]
+impl<F, Rsp> PinnedDrop for LoadBiaserFuture<F, Rsp> {
+    fn drop(self: Pin<&mut Self>) {
+        let this = self.project();
+        // Only decrement if we haven't already done so in poll().
+        // Avoids leaking the pending count upon cancellation.
+        if !*this.completed {
+            this.shared.pending.fetch_sub(1, Ordering::Release);
+        }
+    }
+}
+

From c40a0af477be2cbd1bc330c356b8aa9cea47e302 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Thu, 21 May 2026 20:08:06 +0200
Subject: [PATCH 05/45] test(load-biaser): add unit tests for load biasing
 lifecycle

These cover the complete load biasing lifecycle, including penalty
injection, hint parsing, cancellation safety via PinnedDrop, and
backwards-compatible behavior when disabled (ie. RTT-only behavior
equivalent to PeakEwma).

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 985 +++++++++++++++++++++++++++++++++
 1 file changed, 985 insertions(+)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 7254ca67ef..c90848c62c 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -595,3 +595,988 @@ impl<F, Rsp> PinnedDrop for LoadBiaserFuture<F, Rsp> {
     }
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::convert::Infallible;
+    use tokio::time;
+
+    impl<S> LoadBiaser<S> {
+        pub fn get_rtt(&self) -> f64 {
+            self.shared.metrics.read().rtt.get()
+        }
+
+        pub fn get_penalty(&self) -> f64 {
+            self.shared.metrics.read().penalty.get()
+        }
+
+        pub fn get_pending(&self) -> u32 {
+            self.shared.pending.load(Ordering::Acquire)
+        }
+
+        pub fn inject_penalty(&self, penalty_secs: f64) {
+            self.shared
+                .metrics
+                .write()
+                .penalty
+                .add_peak(penalty_secs, Instant::now());
+        }
+
+        pub fn inject_rtt(&self, rtt_secs: f64) {
+            self.shared
+                .metrics
+                .write()
+                .rtt
+                .add_peak(rtt_secs, Instant::now());
+        }
+    }
+
+    // Mock service for testing returning a specific HTTP status.
+    #[derive(Clone)]
+    struct MockService {
+        status: http::StatusCode,
+    }
+
+    impl MockService {
+        fn new(status: http::StatusCode) -> Self {
+            Self { status }
+        }
+    }
+
+    impl Service<()> for MockService {
+        type Response = http::Response<&'static str>;
+        type Error = Infallible;
+        type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
+
+        fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+            Poll::Ready(Ok(()))
+        }
+
+        fn call(&mut self, _: ()) -> Self::Future {
+            let resp = http::Response::builder()
+                .status(self.status)
+                .body("test")
+                .unwrap();
+            futures::future::ready(Ok(resp))
+        }
+    }
+
+    // Mock service that always returns an error.
+    #[derive(Clone)]
+    struct ErrorService;
+
+    impl Service<()> for ErrorService {
+        type Response = http::Response<&'static str>;
+        type Error = &'static str;
+        type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
+
+        fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+            Poll::Ready(Ok(()))
+        }
+
+        fn call(&mut self, _: ()) -> Self::Future {
+            futures::future::ready(Err("connection refused"))
+        }
+    }
+
+    fn test_config() -> LoadBiaserConfig {
+        LoadBiaserConfig {
+            default_rtt: Duration::from_millis(100), // 0.1s default
+            rtt_decay: Duration::from_secs(10),
+            penalty_secs: 5.0,
+            penalty_decay: Duration::from_secs(10),
+            enabled: true,
+            max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
+        }
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_load_uses_default_rtt_initially() {
+        let inner = MockService::new(http::StatusCode::OK);
+        let biaser = LoadBiaser::new(inner, test_config());
+
+        // Initial load should be default_rtt * (pending + 1) = 0.1 * 1 = 0.1
+        let load = biaser.load();
+        assert!(
+            (load - 0.1).abs() < 0.001,
+            "initial load should be ~0.1 (default RTT): {}",
+            load
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_pending_increases_load() {
+        let inner = MockService::new(http::StatusCode::OK);
+        let biaser = LoadBiaser::new(inner, test_config());
+
+        // Inject a known RTT
+        time::sleep(Duration::from_millis(1)).await;
+        biaser.inject_rtt(0.05); // 50ms
+
+        // RTT * (0 + 1) = 0.05
+        let load_idle = biaser.load();
+
+        // Start a request (not awaited yet)
+        // Increment pending to simulate in-flight requests
+        biaser.shared.pending.fetch_add(1, Ordering::AcqRel);
+
+        // RTT * (1 + 1) = 0.05 * 2 = 0.1
+        let load_one_pending = biaser.load();
+
+        // Increment again
+        biaser.shared.pending.fetch_add(1, Ordering::AcqRel);
+
+        // RTT * (2 + 1) = 0.05 * 3 = 0.15
+        let load_two_pending = biaser.load();
+
+        assert!(
+            load_one_pending > load_idle,
+            "load should increase with pending: {} > {}",
+            load_one_pending,
+            load_idle
+        );
+        assert!(
+            load_two_pending > load_one_pending,
+            "load should increase more: {} > {}",
+            load_two_pending,
+            load_one_pending
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_rtt_tracked_after_request() {
+        let inner = MockService::new(http::StatusCode::OK);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        // Advance time so EWMA accepts updates
+        time::sleep(Duration::from_millis(1)).await;
+
+        // RTT should start at default_rtt (0.1s) before any requests
+        let initial_rtt = biaser.get_rtt();
+        assert!(
+            (initial_rtt - 0.1).abs() < 0.01,
+            "RTT should start at default_rtt (0.1s), got: {initial_rtt}"
+        );
+
+        // Make a request (will record RTT)
+        let _ = biaser.call(()).await;
+
+        // RTT should now reflect the actual request latency
+        let rtt = biaser.get_rtt();
+        assert!(
+            rtt < initial_rtt,
+            "RTT should decrease after a fast request"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_429_injects_penalty() {
+        let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Penalty should be infinite (none) before we get requests.
+        assert!(biaser.get_penalty().is_infinite());
+
+        // Make a request that returns 429.
+        let _ = biaser.call(()).await;
+
+        // A penalty should be injected (5 seconds) after seeing a 429
+        let penalty = biaser.get_penalty();
+        assert!(
+            (penalty - 5.0).abs() < 0.1,
+            "penalty should be ~5s after 429: {}",
+            penalty
+        );
+
+        // Load should be at least the penalty.
+        let load = biaser.load();
+        assert!(load >= 4.9, "load should be high after 429: {}", load);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_200_does_not_inject_penalty() {
+        let inner = MockService::new(http::StatusCode::OK);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Make a request that returns 200.
+        let _ = biaser.call(()).await;
+
+        // Penalty should still be infinite (none).
+        assert!(
+            biaser.get_penalty().is_infinite(),
+            "penalty should not be injected for 200"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_penalty_decays_over_time() {
+        let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
+        // Use a custom config with default_rtt=1ms (0.001s) so load() tracks penalty directly.
+        // inject_rtt(0.001) would be a no-op here: add_peak() only replaces when the new
+        // value exceeds the decayed current value, so 0.001 < 0.1 (test_config default)
+        // would fail the peak check and fall through to add() which no-ops at ts==self.timestamp.
+        let config = LoadBiaserConfig {
+            default_rtt: Duration::from_millis(1),
+            ..test_config()
+        };
+        let mut biaser = LoadBiaser::new(inner, config);
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Trigger penalty via 429 response
+        let _ = biaser.call(()).await;
+
+        // After call completes, pending is back to 0 (LoadBiaserFuture::poll decrements
+        // before returning Poll::Ready), so load = max(rtt * 1, penalty).
+        assert_eq!(
+            biaser.get_pending(),
+            0,
+            "pending should be 0 after call completes"
+        );
+
+        // load() calls get_at(now) which projects the decayed penalty.
+        // Immediately after injection: penalty ~5.0, rtt ~0.001, so load ~5.0
+        let load_before = biaser.load();
+        assert!(
+            load_before > 4.0,
+            "load before decay should be dominated by the injected 429 penalty: {}",
+            load_before
+        );
+
+        // Advance time by one decay period (10s).
+        // Penalty decays: 5.0 * e^(-10/10) ~ 1.839
+        time::sleep(Duration::from_secs(10)).await;
+
+        // load() projects the decayed penalty at the new timestamp.
+        // No EWMA mutation needed since this is pure projection.
+        let load_after = biaser.load();
+        assert!(
+            load_after > 1.0,
+            "load after one decay period should still be dominated by decayed penalty: {}",
+            load_after
+        );
+        assert!(
+            load_after < 2.5,
+            "load after one decay period should reflect substantial decay (5.0 * e^-1 ~ 1.839): {}",
+            load_after
+        );
+        assert!(
+            load_after < load_before,
+            "penalty should decay over time as observed through load(): {} < {}",
+            load_after,
+            load_before
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_load_is_max_of_rtt_based_and_penalty() {
+        let inner = MockService::new(http::StatusCode::OK);
+        let biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Inject a high RTT (10 seconds)
+        biaser.inject_rtt(10.0);
+
+        // Inject a lower penalty (1 second)
+        biaser.inject_penalty(1.0);
+
+        // Load should be RTT-based since it's higher: 10 * 1 = 10
+        let load = biaser.load();
+        assert!(
+            (load - 10.0).abs() < 0.1,
+            "load should be RTT-based when higher: {}",
+            load
+        );
+
+        // Now inject a very high penalty
+        biaser.inject_penalty(20.0);
+
+        // Load should be penalty since it's higher
+        let load = biaser.load();
+        assert!(
+            (load - 20.0).abs() < 0.1,
+            "load should be penalty when higher: {}",
+            load
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_clone_shares_state() {
+        let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
+        let mut biaser1 = LoadBiaser::new(inner.clone(), test_config());
+        let biaser2 = biaser1.clone();
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Trigger penalty on biaser1
+        let _ = biaser1.call(()).await;
+
+        // biaser2 should see the same penalty (shared state)
+        assert_eq!(biaser1.get_penalty(), biaser2.get_penalty());
+        assert_eq!(biaser1.load(), biaser2.load());
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_pending_decremented_on_completion() {
+        let inner = MockService::new(http::StatusCode::OK);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        assert_eq!(biaser.get_pending(), 0, "pending should start at 0");
+
+        // Start a request, pending increments
+        let fut = biaser.call(());
+        assert_eq!(
+            biaser.get_pending(),
+            1,
+            "pending should be 1 during request"
+        );
+
+        // Complete the request, pending decrements
+        let _ = fut.await;
+        assert_eq!(
+            biaser.get_pending(),
+            0,
+            "pending should be 0 after completion"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_rtt_not_updated_on_error() {
+        let inner = ErrorService;
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // RTT should start at default_rtt before any requests
+        let initial_rtt = biaser.get_rtt();
+        assert!(
+            (initial_rtt - 0.1).abs() < 0.01,
+            "RTT should start at default_rtt (0.1s), got: {initial_rtt}"
+        );
+
+        // Make a request that returns an error
+        let _ = biaser.call(()).await;
+
+        // RTT should remain at default_rtt because error responses don't
+        // update RTT. This prevents P2C from routing more traffic to broken
+        // endpoints that fail fast (appearing "fast" to the load metric).
+        let rtt_after_error = biaser.get_rtt();
+        assert!(
+            (rtt_after_error - initial_rtt).abs() < 0.01,
+            "RTT should not change on error responses: {rtt_after_error} vs {initial_rtt}"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_pending_decremented_on_cancellation() {
+        use tokio::sync::oneshot;
+
+        // Build a service whose response future blocks on a oneshot receiver.
+        // This lets us drop the future mid-flight to test PinnedDrop.
+        struct DelayedService {
+            rx: Option<oneshot::Receiver<http::Response<&'static str>>>,
+        }
+
+        impl Service<()> for DelayedService {
+            type Response = http::Response<&'static str>;
+            type Error = Infallible;
+            type Future = DelayedFuture;
+
+            fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+                Poll::Ready(Ok(()))
+            }
+
+            fn call(&mut self, _: ()) -> Self::Future {
+                DelayedFuture {
+                    rx: self.rx.take().expect("called more than once"),
+                }
+            }
+        }
+
+        struct DelayedFuture {
+            rx: oneshot::Receiver<http::Response<&'static str>>,
+        }
+
+        impl std::future::Future for DelayedFuture {
+            type Output = Result<http::Response<&'static str>, Infallible>;
+
+            fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+                match Pin::new(&mut self.rx).poll(cx) {
+                    Poll::Ready(Ok(resp)) => Poll::Ready(Ok(resp)),
+                    Poll::Ready(Err(_)) => {
+                        // Sender dropped. Return a default response
+                        let resp = http::Response::builder()
+                            .status(http::StatusCode::OK)
+                            .body("cancelled")
+                            .unwrap();
+
+                        Poll::Ready(Ok(resp))
+                    }
+                    Poll::Pending => Poll::Pending,
+                }
+            }
+        }
+
+        let (tx, rx) = oneshot::channel();
+        let inner = DelayedService { rx: Some(rx) };
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        assert_eq!(biaser.get_pending(), 0);
+
+        // Initiate the request, pending increments in Service::call()
+        let fut = biaser.call(());
+        assert_eq!(biaser.get_pending(), 1);
+
+        // Drop the future without completing it. The oneshot receiver
+        // never receives a value, so the inner future is still pending.
+        // PinnedDrop must decrement the pending count.
+        drop(fut);
+
+        assert_eq!(
+            biaser.get_pending(),
+            0,
+            "PinnedDrop should decrement pending on cancellation"
+        );
+
+        // tx is unused. Dropping it here is fine, it just closes the channel.
+        drop(tx);
+    }
+
+    #[test]
+    fn default_max_duration_matches_client_policy() {
+        // Enforces the invariant at LoadBiaserConfig::default():
+        // max_duration must match the local DEFAULT_RETRY_AFTER_MAX_DURATION constant.
+        // Production code reads the constant directly via EwmaConfig::to_load_biaser_config;
+        // this assertion keeps the test-only default() in sync.
+        assert_eq!(
+            LoadBiaserConfig::default().max_duration,
+            DEFAULT_RETRY_AFTER_MAX_DURATION,
+            "LoadBiaserConfig::default().max_duration must match \
+             DEFAULT_RETRY_AFTER_MAX_DURATION (300s)"
+        );
+    }
+
+    // Mock service returning an HTTP error with a Retry-After header.
+    // Parameterized by status code to avoid duplicating 429 and 503 variants.
+    #[derive(Clone)]
+    struct RetryAfterService {
+        status: http::StatusCode,
+        retry_after_secs: u64,
+    }
+
+    impl Service<()> for RetryAfterService {
+        type Response = http::Response<&'static str>;
+        type Error = Infallible;
+        type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
+
+        fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+            Poll::Ready(Ok(()))
+        }
+
+        fn call(&mut self, _: ()) -> Self::Future {
+            let resp = http::Response::builder()
+                .status(self.status)
+                .header(http::header::RETRY_AFTER, self.retry_after_secs.to_string())
+                .body("retry-after response")
+                .unwrap();
+
+            futures::future::ready(Ok(resp))
+        }
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_429_with_retry_after_uses_adaptive_penalty() {
+        let inner = RetryAfterService {
+            status: http::StatusCode::TOO_MANY_REQUESTS,
+            retry_after_secs: 30,
+        };
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Make a request that returns 429 with Retry-After: 30
+        let _ = biaser.call(()).await;
+
+        // Amplified: penalty_secs * FACTOR * exp(RA/decay) = 5.0 * 0.5 * e^3
+        let expected = 5.0_f64 * RETRY_AFTER_PENALTY_FACTOR * (30.0_f64 / 10.0_f64).exp();
+        let penalty = biaser.get_penalty();
+
+        assert!(
+            (penalty - expected).abs() < 1.0,
+            "penalty should be ~{expected:.1} (amplified), got: {}",
+            penalty
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_429_with_retry_after_clamped_to_max() {
+        let inner = RetryAfterService {
+            status: http::StatusCode::TOO_MANY_REQUESTS,
+            retry_after_secs: 600,
+        };
+        let config = LoadBiaserConfig {
+            max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
+            ..test_config()
+        };
+        let mut biaser = LoadBiaser::new(inner, config);
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Make a request that returns 429 with Retry-After: 600, clamped to 300s
+        let _ = biaser.call(()).await;
+
+        // Amplified: penalty_secs * FACTOR * exp(clamped_RA/decay) = 5.0 * 0.5 * e^30
+        // exceeds the 1e12 penalty clamp, so the actual penalty is clamped.
+        let unclamped = 5.0_f64 * RETRY_AFTER_PENALTY_FACTOR * (300.0_f64 / 10.0_f64).exp();
+        let expected = unclamped.min(1e12);
+        let penalty = biaser.get_penalty();
+
+        assert!(
+            (penalty - expected).abs() / expected < 0.01,
+            "penalty should be ~{expected:.0} (amplified, clamped RA=300), got: {}",
+            penalty
+        );
+    }
+
+    #[test]
+    fn test_rate_limit_hint_parses_retry_after() {
+        let mut resp = http::Response::builder()
+            .status(http::StatusCode::TOO_MANY_REQUESTS)
+            .header(http::header::RETRY_AFTER, "45")
+            .body("rate limited")
+            .unwrap();
+        let max = Duration::from_secs(60);
+
+        resp.attach_parsed_rate_limit_hint(max);
+
+        assert_eq!(resp.rate_limit_hint(max), Some(Duration::from_secs(45)));
+    }
+
+    #[test]
+    fn test_rate_limit_hint_none_for_200() {
+        let mut resp = http::Response::builder()
+            .status(http::StatusCode::OK)
+            .header(http::header::RETRY_AFTER, "45")
+            .body("ok")
+            .unwrap();
+        let max = Duration::from_secs(60);
+
+        resp.attach_parsed_rate_limit_hint(max);
+
+        assert_eq!(resp.rate_limit_hint(max), None);
+    }
+
+    #[test]
+    fn test_rate_limit_hint_none_without_header() {
+        let mut resp = http::Response::builder()
+            .status(http::StatusCode::TOO_MANY_REQUESTS)
+            .body("rate limited")
+            .unwrap();
+        let max = Duration::from_secs(60);
+
+        resp.attach_parsed_rate_limit_hint(max);
+
+        assert_eq!(resp.rate_limit_hint(max), None);
+    }
+
+    #[test]
+    fn test_rate_limit_hint_on_read_without_attach() {
+        let resp = http::Response::builder()
+            .status(http::StatusCode::TOO_MANY_REQUESTS)
+            .header(http::header::RETRY_AFTER, "45")
+            .body("rate limited")
+            .unwrap();
+
+        // No attach call, test on-read fallback
+        assert_eq!(
+            resp.rate_limit_hint(Duration::from_secs(60)),
+            Some(Duration::from_secs(45))
+        );
+    }
+
+    #[test]
+    fn test_rate_limit_hint_on_read_caps_at_max() {
+        let resp = http::Response::builder()
+            .status(http::StatusCode::TOO_MANY_REQUESTS)
+            .header(http::header::RETRY_AFTER, "300")
+            .body("rate limited")
+            .unwrap();
+
+        // On-read parse caps at caller's max
+        assert_eq!(
+            resp.rate_limit_hint(Duration::from_secs(60)),
+            Some(Duration::from_secs(60))
+        );
+    }
+
+    // Test multiple caps
+
+    const SMALL_CAP: Duration = Duration::from_secs(60);
+    const DEFAULT_CAP: Duration = DEFAULT_RETRY_AFTER_MAX_DURATION;
+    const LARGE_CAP: Duration = Duration::from_secs(1800);
+
+    // Constructs a 429 response whose `Retry-After` header uses the integer
+    // seconds form (delay-seconds per RFC 7231).
+    fn build_http_retry_after_response(retry_after_secs: u64) -> http::Response<&'static str> {
+        http::Response::builder()
+            .status(http::StatusCode::TOO_MANY_REQUESTS)
+            .header(http::header::RETRY_AFTER, retry_after_secs.to_string())
+            .body("rate limited")
+            .unwrap()
+    }
+
+    // Constructs a 200 OK trailers-only gRPC error response using
+    // `grpc-status: 8` (RESOURCE_EXHAUSTED) and `grpc-retry-pushback-ms: <ms>`.
+    fn build_grpc_pushback_response(pushback_ms: u64) -> http::Response<&'static str> {
+        http::Response::builder()
+            .status(http::StatusCode::OK)
+            .header("grpc-status", "8")
+            .header("grpc-retry-pushback-ms", pushback_ms.to_string())
+            .body("grpc error")
+            .unwrap()
+    }
+
+    // Verifies the cached-path `.min(max)` clamp in `rate_limit_hint()` clamps HTTP
+    // Retry-After hints to the caller-supplied cap consistently across cap
+    // magnitudes (60s / 300s / 1800s) and directions (below-cap / over-cap).
+    #[test]
+    fn test_rate_limit_hint_multi_cap_http() {
+        struct Row {
+            cap: Duration,
+            header_value: u64, // Retry-After in integer seconds
+        }
+
+        let rows = [
+            Row {
+                cap: SMALL_CAP,
+                header_value: 30,
+            }, // below
+            Row {
+                cap: SMALL_CAP,
+                header_value: 120,
+            }, // over
+            Row {
+                cap: DEFAULT_CAP,
+                header_value: 120,
+            }, // below
+            Row {
+                cap: DEFAULT_CAP,
+                header_value: 600,
+            }, // over
+            Row {
+                cap: LARGE_CAP,
+                header_value: 900,
+            }, // below
+            Row {
+                cap: LARGE_CAP,
+                header_value: 3600,
+            }, // over
+        ];
+
+        for Row { cap, header_value } in rows {
+            let mut resp = build_http_retry_after_response(header_value);
+            resp.attach_parsed_rate_limit_hint(Duration::MAX);
+
+            // Remove the source header after attach
+            resp.headers_mut().remove(http::header::RETRY_AFTER);
+            let parsed = Duration::from_secs(header_value);
+            let expected = parsed.min(cap);
+
+            assert_eq!(
+                resp.rate_limit_hint(cap),
+                Some(expected),
+                "cap={cap:?}, header={header_value}s, expected={expected:?}",
+            );
+        }
+    }
+
+    // Verifies the cached-path `.min(max)` clamp in `rate_limit_hint()` clamps gRPC
+    // retry-pushback-ms hints to the caller-supplied cap consistently across
+    // cap magnitudes (60s / 300s / 1800s) and directions (below-cap / over-cap).
+    #[test]
+    fn test_rate_limit_hint_multi_cap_grpc() {
+        struct Row {
+            cap: Duration,
+            header_value: u64, // grpc-retry-pushback in milliseconds
+        }
+
+        let rows = [
+            Row {
+                cap: SMALL_CAP,
+                header_value: 30_000,
+            }, // below (30s)
+            Row {
+                cap: SMALL_CAP,
+                header_value: 120_000,
+            }, // over  (120s)
+            Row {
+                cap: DEFAULT_CAP,
+                header_value: 120_000,
+            }, // below (120s)
+            Row {
+                cap: DEFAULT_CAP,
+                header_value: 600_000,
+            }, // over  (600s)
+            Row {
+                cap: LARGE_CAP,
+                header_value: 900_000,
+            }, // below (900s)
+            Row {
+                cap: LARGE_CAP,
+                header_value: 3_600_000,
+            }, // over  (3600s)
+        ];
+
+        for Row { cap, header_value } in rows {
+            let mut resp = build_grpc_pushback_response(header_value);
+            resp.attach_parsed_rate_limit_hint(Duration::MAX);
+
+            // Remove the source header after attach
+            resp.headers_mut().remove("grpc-retry-pushback-ms");
+            let parsed = Duration::from_millis(header_value);
+            let expected = parsed.min(cap);
+
+            assert_eq!(
+                resp.rate_limit_hint(cap),
+                Some(expected),
+                "cap={cap:?}, header={header_value}ms, expected={expected:?}",
+            );
+        }
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_503_injects_full_penalty() {
+        let inner = MockService::new(http::StatusCode::SERVICE_UNAVAILABLE);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        let _ = biaser.call(()).await;
+        let penalty = biaser.get_penalty();
+
+        // test_config has penalty_secs = 5.0
+        assert!(
+            (penalty - 5.0).abs() < 0.1,
+            "503 should inject full penalty (~5.0s), got: {penalty}"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_503_with_retry_after_uses_adaptive_penalty() {
+        let inner = RetryAfterService {
+            status: http::StatusCode::SERVICE_UNAVAILABLE,
+            retry_after_secs: 30,
+        };
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Make a request that returns 503 with Retry-After: 30
+        let _ = biaser.call(()).await;
+
+        // Same amplification formula as 429:
+        // penalty_secs * FACTOR * exp(RA/decay) = 5.0 * 0.5 * e^3
+        let expected = 5.0_f64 * RETRY_AFTER_PENALTY_FACTOR * (30.0_f64 / 10.0_f64).exp();
+        let penalty = biaser.get_penalty();
+        assert!(
+            (penalty - expected).abs() < 1.0,
+            "503+RA penalty should be ~{expected:.1} (amplified), got: {penalty}"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_503_with_retry_after_clamped_to_max() {
+        let inner = RetryAfterService {
+            status: http::StatusCode::SERVICE_UNAVAILABLE,
+            retry_after_secs: 600,
+        };
+        let config = LoadBiaserConfig {
+            max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
+            ..test_config()
+        };
+        let mut biaser = LoadBiaser::new(inner, config);
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Make a request that returns 503 with Retry-After: 600, clamped to 300s
+        let _ = biaser.call(()).await;
+
+        // Amplified with clamped RA:
+        // penalty_secs * FACTOR * exp(clamped_RA/decay) = 5.0 * 0.5 * e^30
+        // exceeds the 1e12 penalty clamp, so the actual penalty is clamped.
+        let unclamped = 5.0_f64 * RETRY_AFTER_PENALTY_FACTOR * (300.0_f64 / 10.0_f64).exp();
+        let expected = unclamped.min(1e12);
+        let penalty = biaser.get_penalty();
+        assert!(
+            (penalty - expected).abs() / expected < 0.01,
+            "503+RA penalty should be ~{expected:.0} (amplified, clamped RA=300), got: {penalty}"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_429_and_503_retry_after_produce_identical_penalty() {
+        // 429 path: RetryAfterService returns TOO_MANY_REQUESTS with Retry-After: 30
+        let inner_429 = RetryAfterService {
+            status: http::StatusCode::TOO_MANY_REQUESTS,
+            retry_after_secs: 30,
+        };
+        let mut biaser_429 = LoadBiaser::new(inner_429, test_config());
+
+        // 503 path: RetryAfterService returns SERVICE_UNAVAILABLE with Retry-After: 30
+        let inner_503 = RetryAfterService {
+            status: http::StatusCode::SERVICE_UNAVAILABLE,
+            retry_after_secs: 30,
+        };
+        let mut biaser_503 = LoadBiaser::new(inner_503, test_config());
+
+        // Bootstrap EWMA timestamps
+        time::sleep(Duration::from_millis(1)).await;
+        let _ = biaser_429.call(()).await;
+
+        time::sleep(Duration::from_millis(1)).await;
+        let _ = biaser_503.call(()).await;
+
+        let load_429 = biaser_429.load();
+        let load_503 = biaser_503.load();
+
+        assert!(
+            (load_429 - load_503).abs() / load_429.max(1e-9) < 0.05,
+            "429+RA and 503+RA should produce similar load; 429={load_429}, 503={load_503}"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_500_injects_penalty() {
+        let inner = MockService::new(http::StatusCode::INTERNAL_SERVER_ERROR);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        let _ = biaser.call(()).await;
+        let penalty = biaser.get_penalty();
+
+        // test_config has penalty_secs = 5.0
+        assert!(
+            (penalty - 5.0).abs() < 0.1,
+            "500 should inject full penalty (~5.0s), got: {penalty}"
+        );
+    }
+
+    // Mock service that returns HTTP 200 with a `grpc-status` header,
+    // simulating a gRPC trailers-only error response.
+    #[derive(Clone)]
+    struct GrpcErrorService {
+        grpc_status: u16,
+    }
+
+    impl Service<()> for GrpcErrorService {
+        type Response = http::Response<&'static str>;
+        type Error = Infallible;
+        type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
+
+        fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
+            Poll::Ready(Ok(()))
+        }
+
+        fn call(&mut self, _: ()) -> Self::Future {
+            let resp = http::Response::builder()
+                .status(http::StatusCode::OK)
+                .header("grpc-status", self.grpc_status.to_string())
+                .body("grpc error")
+                .unwrap();
+
+            futures::future::ready(Ok(resp))
+        }
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_grpc_resource_exhausted_injects_penalty() {
+        let inner = GrpcErrorService { grpc_status: 8 }; // RESOURCE_EXHAUSTED
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        let _ = biaser.call(()).await;
+        // gRPC RESOURCE_EXHAUSTED maps to FailureHint::RateLimited,
+        // which injects the full penalty_secs (5.0).
+        let penalty = biaser.get_penalty();
+
+        assert!(
+            (penalty - 5.0).abs() < 0.1,
+            "gRPC RESOURCE_EXHAUSTED should inject full penalty (~5s), got: {penalty}"
+        );
+    }
+
+    async fn assert_disabled_no_penalty<S>(mut biaser: LoadBiaser<S>, label: &str)
+    where
+        S: Service<(), Response = http::Response<&'static str>, Error = Infallible>,
+    {
+        time::sleep(Duration::from_millis(1)).await;
+        let _ = biaser.call(()).await;
+        assert!(
+            biaser.get_penalty().is_infinite(),
+            "penalty should not be injected when disabled for {label}: {}",
+            biaser.get_penalty()
+        );
+    }
+
+    fn disabled_config() -> LoadBiaserConfig {
+        LoadBiaserConfig {
+            enabled: false,
+            ..test_config()
+        }
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_429_disabled_no_penalty() {
+        let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
+        assert_disabled_no_penalty(LoadBiaser::new(inner, disabled_config()), "429").await;
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_503_disabled_no_penalty() {
+        let inner = MockService::new(http::StatusCode::SERVICE_UNAVAILABLE);
+        assert_disabled_no_penalty(LoadBiaser::new(inner, disabled_config()), "503").await;
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_500_disabled_no_penalty() {
+        let inner = MockService::new(http::StatusCode::INTERNAL_SERVER_ERROR);
+        assert_disabled_no_penalty(LoadBiaser::new(inner, disabled_config()), "500").await;
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_grpc_resource_exhausted_disabled_no_penalty() {
+        let inner = GrpcErrorService { grpc_status: 8 };
+        assert_disabled_no_penalty(
+            LoadBiaser::new(inner, disabled_config()),
+            "gRPC RESOURCE_EXHAUSTED (status 8)",
+        )
+        .await;
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_grpc_unavailable_disabled_no_penalty() {
+        let inner = GrpcErrorService { grpc_status: 14 };
+        assert_disabled_no_penalty(
+            LoadBiaser::new(inner, disabled_config()),
+            "gRPC UNAVAILABLE (status 14)",
+        )
+        .await;
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_grpc_internal_disabled_no_penalty() {
+        let inner = GrpcErrorService { grpc_status: 13 };
+        assert_disabled_no_penalty(
+            LoadBiaser::new(inner, disabled_config()),
+            "gRPC INTERNAL (status 13)",
+        )
+        .await;
+    }
+}

From 342f7d93898b5c327fff41f987b1457137bbe652 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <alex@flawedcode.org>
Date: Tue, 26 May 2026 13:40:54 +0200
Subject: [PATCH 06/45] Update linkerd/http/classify/Cargo.toml

Co-authored-by: katelyn martin <kate@buoyant.io>
---
 linkerd/http/classify/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linkerd/http/classify/Cargo.toml b/linkerd/http/classify/Cargo.toml
index e5c23eb0b7..2e36d20264 100644
--- a/linkerd/http/classify/Cargo.toml
+++ b/linkerd/http/classify/Cargo.toml
@@ -10,7 +10,7 @@ publish = { workspace = true }
 futures = { version = "0.3", default-features = false }
 http = { workspace = true }
 http-body = { workspace = true }
-httpdate.workspace = true
+httpdate = { workspace = true }
 pin-project = "1"
 tokio = { version = "1", default-features = false }
 tracing = { workspace = true }

From 11ee334f91edd5de2d27578d4f3895bae537f16c Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <alex@flawedcode.org>
Date: Tue, 26 May 2026 13:41:57 +0200
Subject: [PATCH 07/45] Update linkerd/load-biaser/Cargo.toml

Co-authored-by: katelyn martin <kate@buoyant.io>
---
 linkerd/load-biaser/Cargo.toml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/linkerd/load-biaser/Cargo.toml b/linkerd/load-biaser/Cargo.toml
index a8fa0fb29a..4bb2360b62 100644
--- a/linkerd/load-biaser/Cargo.toml
+++ b/linkerd/load-biaser/Cargo.toml
@@ -11,11 +11,8 @@ default = []
 tokio-test = ["dep:tokio-test"]
 
 [dependencies]
-linkerd-ewma = { path = "../ewma" }
 futures = { version = "0.3", default-features = false }
 http = { workspace = true }
-linkerd-http-classify = { path = "../http/classify" }
-linkerd-stack = { path = "../stack" }
 parking_lot = "0.12"
 pin-project = "1"
 tokio = { version = "1", features = ["io-util", "net", "time"] }
@@ -24,6 +21,10 @@ tower = { workspace = true, features = ["load"] }
 tower-service = { workspace = true }
 tracing = { workspace = true }
 
+linkerd-ewma = { path = "../ewma" }
+linkerd-http-classify = { path = "../http/classify" }
+linkerd-stack = { path = "../stack" }
+
 [dev-dependencies]
 tokio = { version = "1", features = ["macros", "rt", "time"] }
 tokio-test = "0.4"

From 4f5b302ffa145445f584ade6c06e3d4e47ba3092 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 11:17:11 +0200
Subject: [PATCH 08/45] refactor(load-biaser): remove unused max parameter from
 attach_parsed_rate_limit_hint

The _max parameter was accepted for API symmetry with rate_limit_hint(max) but
intentionally unused: the method always caches the uncapped raw value so each
consumer can apply its own cap via rate_limit_hint(max). Removing the parameter
for now since we probably won't need it in the future, and if so we can
always put it back in place.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index c90848c62c..75e138c219 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -80,7 +80,7 @@ pub struct CachedRateLimitHint(pub Duration);
 ///
 /// The trait splits rate limit hint access into two methods to avoid
 /// requiring `&mut self` on the read path:
-/// - `attach_parsed_rate_limit_hint(&mut self, max)`: parse and cache (needs `&mut`)
+/// - `attach_parsed_rate_limit_hint(&mut self)`: parse and cache (needs `&mut`)
 /// - `rate_limit_hint(&self, max)`: read cached value or parse on-read (only needs `&self`)
 ///
 /// # Stack ordering and caching
@@ -97,10 +97,9 @@ pub trait ResponseFailureHint {
 
     /// Parse and cache the raw (uncapped) rate limit hint from this response.
     ///
-    /// The `_max` parameter is accepted for API symmetry with `rate_limit_hint(max)`
-    /// but is intentionally unused. The raw uncapped value is cached so that each
-    /// consumer can apply their own cap via `rate_limit_hint(max)`.
-    fn attach_parsed_rate_limit_hint(&mut self, _max: Duration) {}
+    /// The raw uncapped value is cached so that each consumer can apply their
+    /// own cap via `rate_limit_hint(max)`.
+    fn attach_parsed_rate_limit_hint(&mut self) {}
 
     /// Returns the rate limit hint if available.
     ///
@@ -149,7 +148,7 @@ impl<B> ResponseFailureHint for http::Response<B> {
         }
     }
 
-    fn attach_parsed_rate_limit_hint(&mut self, _max: Duration) {
+    fn attach_parsed_rate_limit_hint(&mut self) {
         // Store the uncapped value. Each consumer applies their own cap via
         // rate_limit_hint(max).
         if let Some(d) = linkerd_http_classify::retry_after::parse_retry_after(
@@ -527,7 +526,7 @@ where
         // biasing is enabled).
         if shared.enabled {
             if let Ok(ref mut resp) = result {
-                resp.attach_parsed_rate_limit_hint(shared.max_duration);
+                resp.attach_parsed_rate_limit_hint();
             }
         }
 
@@ -1152,7 +1151,7 @@ mod tests {
             .unwrap();
         let max = Duration::from_secs(60);
 
-        resp.attach_parsed_rate_limit_hint(max);
+        resp.attach_parsed_rate_limit_hint();
 
         assert_eq!(resp.rate_limit_hint(max), Some(Duration::from_secs(45)));
     }
@@ -1166,7 +1165,7 @@ mod tests {
             .unwrap();
         let max = Duration::from_secs(60);
 
-        resp.attach_parsed_rate_limit_hint(max);
+        resp.attach_parsed_rate_limit_hint();
 
         assert_eq!(resp.rate_limit_hint(max), None);
     }
@@ -1179,7 +1178,7 @@ mod tests {
             .unwrap();
         let max = Duration::from_secs(60);
 
-        resp.attach_parsed_rate_limit_hint(max);
+        resp.attach_parsed_rate_limit_hint();
 
         assert_eq!(resp.rate_limit_hint(max), None);
     }
@@ -1280,7 +1279,7 @@ mod tests {
 
         for Row { cap, header_value } in rows {
             let mut resp = build_http_retry_after_response(header_value);
-            resp.attach_parsed_rate_limit_hint(Duration::MAX);
+            resp.attach_parsed_rate_limit_hint();
 
             // Remove the source header after attach
             resp.headers_mut().remove(http::header::RETRY_AFTER);
@@ -1334,7 +1333,7 @@ mod tests {
 
         for Row { cap, header_value } in rows {
             let mut resp = build_grpc_pushback_response(header_value);
-            resp.attach_parsed_rate_limit_hint(Duration::MAX);
+            resp.attach_parsed_rate_limit_hint();
 
             // Remove the source header after attach
             resp.headers_mut().remove("grpc-retry-pushback-ms");

From b4d0693765a150c0f9df3223c1fa07a0edf55d6f Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 11:25:01 +0200
Subject: [PATCH 09/45] refactor(load-biaser): encapsulate CachedRateLimitHint
 with constructor and accessor

Make the inner Duration field private and provide CachedRateLimitHint::new() for
construction and duration_capped(max) for reads. This prevents consumers from
bypassing the per-caller cap that rate_limit_hint(max) enforces, since the cached
value is intentionally uncapped.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 75e138c219..beda6a9368 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -70,7 +70,19 @@ pub enum FailureHint {
 /// via `rate_limit_hint(max)`, so different callers (e.g. load biaser vs
 /// circuit breaker) can use different maximums from the same cached value.
 #[derive(Clone, Copy, Debug)]
-pub struct CachedRateLimitHint(pub Duration);
+pub struct CachedRateLimitHint(Duration);
+
+impl CachedRateLimitHint {
+    /// Wraps a parsed, uncapped rate limit duration for caching.
+    pub fn new(d: Duration) -> Self {
+        Self(d)
+    }
+
+    /// Returns the cached duration, capped at `max`.
+    pub fn duration_capped(&self, max: Duration) -> Duration {
+        self.0.min(max)
+    }
+}
 
 /// Trait for extracting failure hints from responses.
 ///
@@ -156,7 +168,7 @@ impl<B> ResponseFailureHint for http::Response<B> {
             self.headers(),
             Duration::MAX,
         ) {
-            self.extensions_mut().insert(CachedRateLimitHint(d));
+            self.extensions_mut().insert(CachedRateLimitHint::new(d));
             return;
         }
         // Try gRPC retry-pushback-ms (for trailers-only responses)
@@ -165,7 +177,7 @@ impl<B> ResponseFailureHint for http::Response<B> {
                 self.headers(),
                 Duration::MAX,
             ) {
-                self.extensions_mut().insert(CachedRateLimitHint(d));
+                self.extensions_mut().insert(CachedRateLimitHint::new(d));
             }
         }
     }
@@ -173,7 +185,7 @@ impl<B> ResponseFailureHint for http::Response<B> {
     fn rate_limit_hint(&self, max: Duration) -> Option<Duration> {
         // Check cache first (from previous attach call), apply caller's cap
         if let Some(cached) = self.extensions().get::<CachedRateLimitHint>() {
-            return Some(cached.0.min(max));
+            return Some(cached.duration_capped(max));
         }
         // Parse on-read as fallback (header present but attach wasn't called)
         if let Some(d) = linkerd_http_classify::retry_after::parse_retry_after(

From 3b1d46863406d80b6596652032611f7d7e2af668 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 11:31:50 +0200
Subject: [PATCH 10/45] docs(ewma): add crate-level doc comment

Explain why a standalone EWMA crate exists instead of using Tower's
RttEstimate: it is private, mutates on read, and cannot support the
penalty dimension that failure-aware load balancing requires.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index 872fcfc518..221ca98391 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -1,3 +1,10 @@
+//! Exponentially weighted moving average (EWMA) with time-based decay.
+//!
+//! A standalone EWMA implementation that supports non-mutating time-projected
+//! reads via [`Ewma::get_at`] and dual-metric tracking (RTT + penalty) under a
+//! single lock. Tower's internal `RttEstimate` is private, mutates on read, and
+//! cannot support the penalty dimension that failure-aware load balancing needs.
+
 #![deny(rust_2018_idioms, clippy::disallowed_methods, clippy::disallowed_types)]
 #![forbid(unsafe_code)]
 

From 6ed442b36037fac4c3d76f12bf311c488df83db6 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 11:35:08 +0200
Subject: [PATCH 11/45] build(ewma): disable default tokio features

The crate only uses tokio::time, so disable the default feature set to
avoid pulling unnecessary features into the dependency declaration.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linkerd/ewma/Cargo.toml b/linkerd/ewma/Cargo.toml
index 8933c4e5c7..d97f75bff7 100644
--- a/linkerd/ewma/Cargo.toml
+++ b/linkerd/ewma/Cargo.toml
@@ -7,7 +7,7 @@ edition = { workspace = true }
 publish = { workspace = true }
 
 [dependencies]
-tokio = { version = "1", features = ["time"] }
+tokio = { version = "1", default-features = false, features = ["time"] }
 
 [dev-dependencies]
 tokio = { version = "1", features = ["macros", "rt", "test-util"] }

From 380c8f2cabb4ba0c87e4415ab61b15b4fe187b64 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 11:37:41 +0200
Subject: [PATCH 12/45] build(load-biaser): declare tokio sync feature in
 dev-dependencies

The cancellation test uses tokio::sync::oneshot which requires the sync
feature. This compiled only because workspace feature unification pulled
it in from other crates.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linkerd/load-biaser/Cargo.toml b/linkerd/load-biaser/Cargo.toml
index 4bb2360b62..68737d512f 100644
--- a/linkerd/load-biaser/Cargo.toml
+++ b/linkerd/load-biaser/Cargo.toml
@@ -26,5 +26,5 @@ linkerd-http-classify = { path = "../http/classify" }
 linkerd-stack = { path = "../stack" }
 
 [dev-dependencies]
-tokio = { version = "1", features = ["macros", "rt", "time"] }
+tokio = { version = "1", features = ["macros", "rt", "sync", "time"] }
 tokio-test = "0.4"

From b74b52abd334fd8238bc299415f1be016a3428a0 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 11:42:40 +0200
Subject: [PATCH 13/45] refactor(classify): use GRPC_RETRY_PUSHBACK_MS constant
 in tests

Replace raw string literals with the module-level constant for
consistency with how HTTP tests use http::header::RETRY_AFTER.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/http/classify/src/retry_after.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/linkerd/http/classify/src/retry_after.rs b/linkerd/http/classify/src/retry_after.rs
index 9b00ec4303..7d2b0d4915 100644
--- a/linkerd/http/classify/src/retry_after.rs
+++ b/linkerd/http/classify/src/retry_after.rs
@@ -236,7 +236,7 @@ mod tests {
     #[test]
     fn parse_grpc_pushback_positive() {
         let mut headers = HeaderMap::new();
-        headers.insert("grpc-retry-pushback-ms", HeaderValue::from_static("5000"));
+        headers.insert(GRPC_RETRY_PUSHBACK_MS, HeaderValue::from_static("5000"));
 
         let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
         assert_eq!(result, Some(Duration::from_millis(5000)));
@@ -245,7 +245,7 @@ mod tests {
     #[test]
     fn parse_grpc_pushback_zero() {
         let mut headers = HeaderMap::new();
-        headers.insert("grpc-retry-pushback-ms", HeaderValue::from_static("0"));
+        headers.insert(GRPC_RETRY_PUSHBACK_MS, HeaderValue::from_static("0"));
 
         let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
         assert_eq!(result, Some(Duration::ZERO));
@@ -254,7 +254,7 @@ mod tests {
     #[test]
     fn parse_grpc_pushback_negative() {
         let mut headers = HeaderMap::new();
-        headers.insert("grpc-retry-pushback-ms", HeaderValue::from_static("-1"));
+        headers.insert(GRPC_RETRY_PUSHBACK_MS, HeaderValue::from_static("-1"));
 
         let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
         assert_eq!(result, None);
@@ -263,7 +263,7 @@ mod tests {
     #[test]
     fn parse_grpc_pushback_caps_at_max() {
         let mut headers = HeaderMap::new();
-        headers.insert("grpc-retry-pushback-ms", HeaderValue::from_static("999999"));
+        headers.insert(GRPC_RETRY_PUSHBACK_MS, HeaderValue::from_static("999999"));
 
         let result = parse_grpc_retry_pushback(&headers, MAX_GRPC);
         assert_eq!(result, Some(MAX_GRPC));
@@ -312,7 +312,7 @@ mod tests {
     #[test]
     fn grpc_pushback_whitespace() {
         let mut headers = HeaderMap::new();
-        headers.insert("grpc-retry-pushback-ms", " 5000 ".parse().unwrap());
+        headers.insert(GRPC_RETRY_PUSHBACK_MS, " 5000 ".parse().unwrap());
 
         let result = parse_grpc_retry_pushback(&headers, MAX);
         assert_eq!(result, Some(Duration::from_millis(5000)));

From ae6de471768041adcb9aa04083259752413ef6ef Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 12:06:39 +0200
Subject: [PATCH 14/45] fix(ewma): fix typo in test comment

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index 221ca98391..09394e8e68 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -354,7 +354,7 @@ mod tests {
         let probe_near = now + Duration::from_secs(1);
         let probe_far = now + Duration::from_secs(100);
 
-        // A new Ewma without addding values should project INFINITY
+        // A new Ewma without adding values should project INFINITY
         // at every timestamp.
         let ewma = Ewma::new(decay, now);
         assert_eq!(ewma.get_at(probe_same), f64::INFINITY);

From 30234220fb7e7c75cc53e3b92af77b013bc872d8 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 12:18:28 +0200
Subject: [PATCH 15/45] refactor(load-biaser): add #[must_use] to
 LoadBiaser::new

Consistent with Ewma::new which already has this attribute.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index beda6a9368..ac79840b36 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -364,6 +364,7 @@ pub struct LoadBiaserFuture<F, Rsp> {
 
 impl<S> LoadBiaser<S> {
     /// Creates a new `LoadBiaser` wrapping the given service.
+    #[must_use]
     pub fn new(inner: S, mut config: LoadBiaserConfig) -> Self {
         if config.penalty_secs.is_nan() || config.penalty_secs < 0.0 {
             tracing::warn!(

From 02d9f9d704318e5932eb176c613f0782dd3550d0 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 15:29:07 +0200
Subject: [PATCH 16/45] fix(load-biaser): ensure we deal with gRPC before
 checking gRPC status

Inspect the grpc-status header only on HTTP 200 responses whose
content-type starts with application/grpc. Without this a non-gRPC
upstream that happens to include a grpc-status header would be
considered a gRPC failure and penalized by the load biaser.

The same check is applied to the gRPC retry-pushback-ms parsing in
the ReponseFailureHint trait implementation.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index ac79840b36..de6a2a8044 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -123,14 +123,22 @@ pub trait ResponseFailureHint {
     }
 }
 
+fn is_grpc(headers: &http::HeaderMap) -> bool {
+    headers
+        .get(http::header::CONTENT_TYPE)
+        .and_then(|v| v.to_str().ok())
+        .is_some_and(|ct| ct.starts_with("application/grpc"))
+}
+
 /// HTTP responses classify failures by status code and parse Retry-After hints.
 ///
-/// For gRPC responses (which arrive as HTTP 200 with `grpc-status` in headers
-/// for trailers-only/unary errors), the `grpc-status` header is checked when
-/// the HTTP status is 200:
+/// For gRPC trailers-only responses (HTTP 200 with `content-type: application/grpc`
+/// and `grpc-status` in headers), the `grpc-status` header is inspected:
 /// - gRPC status 8 (RESOURCE_EXHAUSTED) -> `RateLimited`
 /// - gRPC status 14 (UNAVAILABLE) -> `ServiceUnavailable`
 /// - Other non-zero gRPC status codes -> `InternalError`
+///
+/// Non-gRPC HTTP 200 responses are not inspected for `grpc-status`.
 impl<B> ResponseFailureHint for http::Response<B> {
     fn failure_hint(&self) -> Option<FailureHint> {
         let status = self.status();
@@ -140,8 +148,12 @@ impl<B> ResponseFailureHint for http::Response<B> {
             Some(FailureHint::ServiceUnavailable)
         } else if status.is_server_error() {
             Some(FailureHint::InternalError)
-        } else if status == http::StatusCode::OK {
+        } else if status == http::StatusCode::OK && is_grpc(self.headers()) {
             // gRPC trailers-only responses: grpc-status appears in headers.
+            // We inspect grpc-status when content-type confirms this is a gRPC
+            // response so that we avoid misclassifying non-gRPC HTTP 200
+            // responses that somehow happen to include a grpc-status header.
+            //
             // Note: for streaming gRPC, grpc-status is in trailers (not headers)
             // and will not be detected here. The circuit breaker handles streaming
             // gRPC failures via GrpcRetryPushbackClassifyEos.
@@ -172,7 +184,7 @@ impl<B> ResponseFailureHint for http::Response<B> {
             return;
         }
         // Try gRPC retry-pushback-ms (for trailers-only responses)
-        if self.status() == http::StatusCode::OK {
+        if self.status() == http::StatusCode::OK && is_grpc(self.headers()) {
             if let Some(d) = linkerd_http_classify::retry_after::parse_grpc_retry_pushback(
                 self.headers(),
                 Duration::MAX,
@@ -196,7 +208,7 @@ impl<B> ResponseFailureHint for http::Response<B> {
             return Some(d);
         }
         // Try gRPC pushback
-        if self.status() == http::StatusCode::OK {
+        if self.status() == http::StatusCode::OK && is_grpc(self.headers()) {
             if let Some(d) =
                 linkerd_http_classify::retry_after::parse_grpc_retry_pushback(self.headers(), max)
             {
@@ -1247,6 +1259,7 @@ mod tests {
     fn build_grpc_pushback_response(pushback_ms: u64) -> http::Response<&'static str> {
         http::Response::builder()
             .status(http::StatusCode::OK)
+            .header(http::header::CONTENT_TYPE, "application/grpc")
             .header("grpc-status", "8")
             .header("grpc-retry-pushback-ms", pushback_ms.to_string())
             .body("grpc error")
@@ -1498,6 +1511,7 @@ mod tests {
         fn call(&mut self, _: ()) -> Self::Future {
             let resp = http::Response::builder()
                 .status(http::StatusCode::OK)
+                .header(http::header::CONTENT_TYPE, "application/grpc")
                 .header("grpc-status", self.grpc_status.to_string())
                 .body("grpc error")
                 .unwrap();

From d7f58f2c7e78786c9c61f30fbda22b98620f82e0 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 15:34:52 +0200
Subject: [PATCH 17/45] fix(load-biaser): restrict gRPC failure classification
 to server errors

Up until now we mapped every non-zero gRPC status code to
FailureHint::InternalError, penalizing client errors like CANCELLED,
INVALID_ARGUMENT, NOT_FOUND, etc. These don't indicate server
health issues and should not steer traffic away from the endpoint.

Restrict penalty injection to server-side error codes that indicate
endpoint problems: UNKNOWN (2), DEADLINE_EXCEEDED (4), INTERNAL (13),
and DATA_LOSS (15), alongside the existing RESOURCE_EXHAUSTED (8)
and UNAVAILABLE (14) statuses.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index de6a2a8044..e72bf88027 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -136,7 +136,9 @@ fn is_grpc(headers: &http::HeaderMap) -> bool {
 /// and `grpc-status` in headers), the `grpc-status` header is inspected:
 /// - gRPC status 8 (RESOURCE_EXHAUSTED) -> `RateLimited`
 /// - gRPC status 14 (UNAVAILABLE) -> `ServiceUnavailable`
-/// - Other non-zero gRPC status codes -> `InternalError`
+/// - gRPC status 2 (UNKNOWN), 4 (DEADLINE_EXCEEDED), 13 (INTERNAL),
+///   15 (DATA_LOSS) -> `InternalError`
+/// - Others (client errors, ie. CANCELLED, NOT_FOUND, etc) -> ignored
 ///
 /// Non-gRPC HTTP 200 responses are not inspected for `grpc-status`.
 impl<B> ResponseFailureHint for http::Response<B> {
@@ -165,7 +167,8 @@ impl<B> ResponseFailureHint for http::Response<B> {
                     0 => None,                                   // OK
                     8 => Some(FailureHint::RateLimited),         // RESOURCE_EXHAUSTED
                     14 => Some(FailureHint::ServiceUnavailable), // UNAVAILABLE
-                    _ => Some(FailureHint::InternalError),       // Other non-zero
+                    2 | 4 | 13 | 15 => Some(FailureHint::InternalError),
+                    _ => None, // Client errors (CANCELLED, INVALID_ARGUMENT, etc.)
                 })
         } else {
             None

From 12adaca5396b8e8aeb9b8c97eb7d0d19896c04dc Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 15:49:31 +0200
Subject: [PATCH 18/45] test(load-biaser): add tests for extended gRPC status
 classification

Ensure only those gRPC status codes indicating server-side errors
inject penalties.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 66 ++++++++++++++++++++++++++++++----
 1 file changed, 59 insertions(+), 7 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index e72bf88027..047ca99caa 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -1495,6 +1495,20 @@ mod tests {
         );
     }
 
+    #[test]
+    fn failure_hint_non_grpc_200_with_grpc_status_header() {
+        let resp = http::Response::builder()
+            .status(http::StatusCode::OK)
+            .header("grpc-status", "14")
+            .body("")
+            .unwrap();
+        assert_eq!(
+            resp.failure_hint(),
+            None,
+            "non-gRPC HTTP 200 with grpc-status header should not be classified"
+        );
+    }
+
     // Mock service that returns HTTP 200 with a `grpc-status` header,
     // simulating a gRPC trailers-only error response.
     #[derive(Clone)]
@@ -1523,24 +1537,62 @@ mod tests {
         }
     }
 
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_grpc_resource_exhausted_injects_penalty() {
-        let inner = GrpcErrorService { grpc_status: 8 }; // RESOURCE_EXHAUSTED
+    async fn assert_grpc_injects_penalty(grpc_status: u16, label: &str) {
+        let inner = GrpcErrorService { grpc_status };
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
-
         let _ = biaser.call(()).await;
-        // gRPC RESOURCE_EXHAUSTED maps to FailureHint::RateLimited,
-        // which injects the full penalty_secs (5.0).
         let penalty = biaser.get_penalty();
 
         assert!(
             (penalty - 5.0).abs() < 0.1,
-            "gRPC RESOURCE_EXHAUSTED should inject full penalty (~5s), got: {penalty}"
+            "gRPC {label} (status {grpc_status}) should inject full penalty (~5s), got: {penalty}"
         );
     }
 
+    async fn assert_grpc_no_penalty(grpc_status: u16, label: &str) {
+        let inner = GrpcErrorService { grpc_status };
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+        let _ = biaser.call(()).await;
+
+        assert!(
+            biaser.get_penalty().is_infinite(),
+            "gRPC {label} (status {grpc_status}) should not inject penalty"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_grpc_server_errors_inject_penalty() {
+        assert_grpc_injects_penalty(8, "RESOURCE_EXHAUSTED").await;
+        assert_grpc_injects_penalty(14, "UNAVAILABLE").await;
+        assert_grpc_injects_penalty(13, "INTERNAL").await;
+        assert_grpc_injects_penalty(2, "UNKNOWN").await;
+        assert_grpc_injects_penalty(4, "DEADLINE_EXCEEDED").await;
+        assert_grpc_injects_penalty(15, "DATA_LOSS").await;
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_grpc_client_errors_no_penalty() {
+        assert_grpc_no_penalty(1, "CANCELLED").await;
+        assert_grpc_no_penalty(3, "INVALID_ARGUMENT").await;
+        assert_grpc_no_penalty(5, "NOT_FOUND").await;
+        assert_grpc_no_penalty(6, "ALREADY_EXISTS").await;
+        assert_grpc_no_penalty(7, "PERMISSION_DENIED").await;
+        assert_grpc_no_penalty(9, "FAILED_PRECONDITION").await;
+        assert_grpc_no_penalty(10, "ABORTED").await;
+        assert_grpc_no_penalty(11, "OUT_OF_RANGE").await;
+        assert_grpc_no_penalty(12, "UNIMPLEMENTED").await;
+        assert_grpc_no_penalty(16, "UNAUTHENTICATED").await;
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_grpc_ok_no_penalty() {
+        assert_grpc_no_penalty(0, "OK").await;
+    }
+
     async fn assert_disabled_no_penalty<S>(mut biaser: LoadBiaser<S>, label: &str)
     where
         S: Service<(), Response = http::Response<&'static str>, Error = Infallible>,

From 10b32c6cf27652a87df235d314cfadaaec2b3e9a Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 16:03:40 +0200
Subject: [PATCH 19/45] test(load-biaser): ensure repeated failures maintain
 penalty level

Verify that consecutive 429 responses at 1s intervals keep the
penalty at the configured level, confirming the EWMA peak resets
the decayed value rather than accumulating.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 38 ++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 047ca99caa..58851e9555 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -899,6 +899,44 @@ mod tests {
         );
     }
 
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_sequential_failures_maintain_penalty() {
+        let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
+        let config = LoadBiaserConfig {
+            default_rtt: Duration::from_millis(1),
+            ..test_config()
+        };
+        let mut biaser = LoadBiaser::new(inner, config);
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        let _ = biaser.call(()).await;
+        let penalty_after_first = biaser.get_penalty();
+        assert!(
+            (penalty_after_first - 5.0).abs() < 0.1,
+            "first 429 should inject ~5s penalty, got: {penalty_after_first}"
+        );
+
+        // Second 429 arrives 1s later. The penalty has decayed slightly but
+        // add_peak replaces the decayed value with the fresh penalty.
+        time::sleep(Duration::from_secs(1)).await;
+        let _ = biaser.call(()).await;
+        let penalty_after_second = biaser.get_penalty();
+        assert!(
+            (penalty_after_second - 5.0).abs() < 0.1,
+            "second 429 should maintain penalty at ~5s, got: {penalty_after_second}"
+        );
+
+        // Third 429 after another second. Still at ~5s.
+        time::sleep(Duration::from_secs(1)).await;
+        let _ = biaser.call(()).await;
+        let penalty_after_third = biaser.get_penalty();
+        assert!(
+            (penalty_after_third - 5.0).abs() < 0.1,
+            "third 429 should maintain penalty at ~5s, got: {penalty_after_third}"
+        );
+    }
+
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_load_is_max_of_rtt_based_and_penalty() {
         let inner = MockService::new(http::StatusCode::OK);

From 51c7b2e6b5f8a02603e4f2c1cb4475ffd006f12e Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 16:21:34 +0200
Subject: [PATCH 20/45] feat(ewma): expose last update timestamp

Add a `last_update()` getter that returns the timestamp of the most
recent EWMA update. Callers that need to detect staleness (ie. idle
periods where the EWMA has decayed to the point that a single sample
dominates) can compare this against the current time to detect this
exact circumstance (and, for example, require more samples before
taking decisions).

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index 09394e8e68..b6d6e81d6f 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -70,6 +70,11 @@ impl Ewma {
         self.value
     }
 
+    /// Returns the timestamp of the most recent update.
+    pub fn last_update(&self) -> time::Instant {
+        self.timestamp
+    }
+
     /// Returns the decayed value projected to the given time, without modifying stored state.
     ///
     /// Instead of returning the raw stored value, this applies exponential decay based

From 8fe67af70a30cb67d89c9fef1bd6f1816401462b Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 25 May 2026 16:55:02 +0200
Subject: [PATCH 21/45] test(ewma): verify last_update across construction,
 changes, and reads

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 59 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index b6d6e81d6f..8d907f1d13 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -485,4 +485,63 @@ mod tests {
         // Should trigger a debug_assert
         let _ = ewma.get_at(now);
     }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_last_update_tracks_construction() {
+        let now = Instant::now();
+        let ewma = Ewma::new(Duration::from_secs(10), now);
+        assert_eq!(ewma.last_update(), now);
+
+        let ewma = Ewma::new_with_value(Duration::from_secs(10), now, 1.0);
+        assert_eq!(ewma.last_update(), now);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_last_update_advances_on_add() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        let t1 = now + Duration::from_secs(1);
+        ewma.add(1.0, t1);
+        assert_eq!(ewma.last_update(), t1);
+
+        let t2 = now + Duration::from_secs(5);
+        ewma.add(0.5, t2);
+        assert_eq!(ewma.last_update(), t2);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_last_update_advances_on_reset() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        ewma.add(1.0, now + Duration::from_secs(1));
+        let reset_at = now + Duration::from_secs(5);
+        ewma.reset(1.0, reset_at);
+        assert_eq!(ewma.last_update(), reset_at);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_last_update_unchanged_by_read() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        let t1 = now + Duration::from_secs(1);
+        ewma.add(1.0, t1);
+        let _ = ewma.get_at(now + Duration::from_secs(10));
+        assert_eq!(ewma.last_update(), t1);
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_last_update_unchanged_by_stale_add() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        let t1 = now + Duration::from_secs(5);
+        ewma.add(1.0, t1);
+
+        // Stale timestamp (before last update) is silently dropped.
+        ewma.add(0.0, now + Duration::from_secs(2));
+        assert_eq!(ewma.last_update(), t1);
+    }
 }

From 98aafb8b3decb72e426f931d122a394d1517143a Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <alex@flawedcode.org>
Date: Tue, 2 Jun 2026 16:00:28 +0200
Subject: [PATCH 22/45] Apply suggestions from code review

Co-authored-by: katelyn martin <git@katelyn.world>
---
 linkerd/http/classify/src/retry_after.rs | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/linkerd/http/classify/src/retry_after.rs b/linkerd/http/classify/src/retry_after.rs
index 7d2b0d4915..fd7e1901d5 100644
--- a/linkerd/http/classify/src/retry_after.rs
+++ b/linkerd/http/classify/src/retry_after.rs
@@ -72,9 +72,14 @@ const GRPC_RETRY_PUSHBACK_MS: &str = "grpc-retry-pushback-ms";
 
 /// Parse grpc-retry-pushback-ms from headers or trailers.
 ///
-/// Per gRPC A6 spec:
-/// - Positive i64: retry after this many milliseconds
-/// - Negative i64: do not retry (returns `None`)
+/// Per the [gRPC A6 spec]:
+///
+/// > The value is to be an ASCII encoded signed 32-bit integer with no unnecessary leading zeros
+/// > that represents how many milliseconds to wait before sending a retry. If the value for
+/// > pushback is negative or unparseble, then it will be seen as the server asking the client not
+/// > to retry at all.
+///
+/// [gRPC A6 spec]: https://github.com/grpc/proposal/blob/master/A6-client-retries.md#pushback
 ///
 /// This function does **not** check grpc-status; that is a classification
 /// concern left to the caller.

From d7c03b51b98641337ee1042925e0c2051b66a7d4 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Fri, 5 Jun 2026 13:33:32 +0200
Subject: [PATCH 23/45] fix(ewma,http-classify): address feedback

- Drop unused add_rate, last_update
- Correct MIN_DECAY enforcement comment
- Note on ignoring negative do-not-retry pushbacks

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs                  | 92 ++----------------------
 linkerd/http/classify/src/retry_after.rs |  7 +-
 2 files changed, 9 insertions(+), 90 deletions(-)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index 8d907f1d13..66a57cab6b 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -10,9 +10,10 @@
 
 use tokio::time;
 
-/// Minimum decay duration to prevent division-by-zero in EWMA computations.
-/// Chosen as the smallest Duration that is strictly positive without overriding
-/// validated configs from the control plane (CP should reject decay=0).
+/// Smallest decay duration the estimator will use.
+///
+/// The constructors clamp any supplied decay up to this value, which avoids
+/// a division by zero when computing `elapsed / decay`.
 pub const MIN_DECAY: time::Duration = time::Duration::from_millis(1);
 
 /// An exponentially-weighted moving average.
@@ -70,11 +71,6 @@ impl Ewma {
         self.value
     }
 
-    /// Returns the timestamp of the most recent update.
-    pub fn last_update(&self) -> time::Instant {
-        self.timestamp
-    }
-
     /// Returns the decayed value projected to the given time, without modifying stored state.
     ///
     /// Instead of returning the raw stored value, this applies exponential decay based
@@ -130,17 +126,6 @@ impl Ewma {
         }
         self.add(value, ts)
     }
-
-    /// Computes 1/elapsed since the last update and feeds it through `add()`.
-    pub fn add_rate(&mut self, ts: time::Instant) {
-        if ts <= self.timestamp {
-            return;
-        }
-        let elapsed = ts.saturating_duration_since(self.timestamp);
-        if !elapsed.is_zero() {
-            self.add(1.0 / elapsed.as_secs_f64(), ts);
-        }
-    }
 }
 
 #[cfg(test)]
@@ -166,16 +151,6 @@ mod tests {
         assert_eq!(ewma.get(), 1.0);
     }
 
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_add_rate() {
-        let now = Instant::now();
-        let mut ewma = Ewma::new(Duration::from_secs(10), now);
-        ewma.add_rate(now + Duration::from_secs(1));
-        assert_eq!(ewma.get(), 1.0);
-        ewma.add_rate(now + Duration::from_secs(3));
-        assert_eq!(ewma.get(), 0.9093653765389909);
-    }
-
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_add_peak() {
         let now = Instant::now();
@@ -485,63 +460,4 @@ mod tests {
         // Should trigger a debug_assert
         let _ = ewma.get_at(now);
     }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_last_update_tracks_construction() {
-        let now = Instant::now();
-        let ewma = Ewma::new(Duration::from_secs(10), now);
-        assert_eq!(ewma.last_update(), now);
-
-        let ewma = Ewma::new_with_value(Duration::from_secs(10), now, 1.0);
-        assert_eq!(ewma.last_update(), now);
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_last_update_advances_on_add() {
-        let now = Instant::now();
-        let mut ewma = Ewma::new(Duration::from_secs(10), now);
-
-        let t1 = now + Duration::from_secs(1);
-        ewma.add(1.0, t1);
-        assert_eq!(ewma.last_update(), t1);
-
-        let t2 = now + Duration::from_secs(5);
-        ewma.add(0.5, t2);
-        assert_eq!(ewma.last_update(), t2);
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_last_update_advances_on_reset() {
-        let now = Instant::now();
-        let mut ewma = Ewma::new(Duration::from_secs(10), now);
-
-        ewma.add(1.0, now + Duration::from_secs(1));
-        let reset_at = now + Duration::from_secs(5);
-        ewma.reset(1.0, reset_at);
-        assert_eq!(ewma.last_update(), reset_at);
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_last_update_unchanged_by_read() {
-        let now = Instant::now();
-        let mut ewma = Ewma::new(Duration::from_secs(10), now);
-
-        let t1 = now + Duration::from_secs(1);
-        ewma.add(1.0, t1);
-        let _ = ewma.get_at(now + Duration::from_secs(10));
-        assert_eq!(ewma.last_update(), t1);
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_last_update_unchanged_by_stale_add() {
-        let now = Instant::now();
-        let mut ewma = Ewma::new(Duration::from_secs(10), now);
-
-        let t1 = now + Duration::from_secs(5);
-        ewma.add(1.0, t1);
-
-        // Stale timestamp (before last update) is silently dropped.
-        ewma.add(0.0, now + Duration::from_secs(2));
-        assert_eq!(ewma.last_update(), t1);
-    }
 }
diff --git a/linkerd/http/classify/src/retry_after.rs b/linkerd/http/classify/src/retry_after.rs
index fd7e1901d5..d61f1082fa 100644
--- a/linkerd/http/classify/src/retry_after.rs
+++ b/linkerd/http/classify/src/retry_after.rs
@@ -86,7 +86,7 @@ const GRPC_RETRY_PUSHBACK_MS: &str = "grpc-retry-pushback-ms";
 ///
 /// Returns `None` for:
 /// - Missing header/trailer
-/// - Negative values (interpreted as "do not retry")
+/// - Negative values (meaning "do not retry", we can't honor that)
 /// - Invalid formats
 ///
 /// The returned duration is capped at `max` to prevent abuse.
@@ -103,7 +103,10 @@ pub fn parse_grpc_retry_pushback(headers: &HeaderMap, max: Duration) -> Option<D
         }
     };
 
-    // Negative values mean "do not retry".
+    // A negative pushback means "do not retry". However in this context
+    // we can't honor it, since users are either steering traffic away from
+    // an endpoint, or computing a backoff time giving it room to recover.
+    // We ignore the hint, as it's not really actionable.
     if ms < 0 {
         tracing::debug!(ms, "Ignoring negative grpc-retry-pushback-ms");
         return None;

From d16388a00a9d6d111c870726db141c4bf59b1bee Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Fri, 5 Jun 2026 13:33:33 +0200
Subject: [PATCH 24/45] refactor(load-biaser): address feedback to use a single
 penalized EWMA RTT

We now now keep a single RTT EWMA and a load of `rtt * (pending + 1)`,
exactly like Tower's PeakEwma.

A success records its measured RTT, while a failure now records a
computed effective RTT through the same peak-EWMA logic, using the
Retry-After or grpc-retry-pushback hint when present, or otherwise
penalizing the RTT with a base value.

In-flight requests are now counted the way Tower's PeakEwma counts them,
using Arc's strong count and measuring on cancellation.

Finally an explicit completion tracker can use `PendingUntilFirstData`
for measurement to more closely match previous behavior.

`linkerd-ewma` is still a separate crate because we feed it a penalty
value rather than a measured RTT, and since Tower's `RttEstimate` is
private (at the moment) and advances its decay clock on read, it can't
accept an injected observation nor be read under a shared lock.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 992 +++++++++++++--------------------
 1 file changed, 398 insertions(+), 594 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 58851e9555..c15303be3e 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -6,26 +6,20 @@
 //! `linkerd_ewma::Ewma` and returns `f64` metrics directly, enabling
 //! integration with P2C load balancing.
 //!
-//! This can wrap any service (`Load` trait not required) and it tracks RTT via
-//! EWMA, which is updated when responses complete, and pending requests for load
-//! calculation.
+//! The load metric is `rtt * (pending + 1)`, exactly PeakEwma. The difference
+//! is what counts as an observation. A success records its measured RTT. A
+//! failure records a *computed*, penalized RTT instead: when present, it'll be
+//! the server's Retry-After or grpc-retry-pushback hint (capped at
+//! `max_duration`), otherwise a base penalty.
 //!
-//! When a failure response is detected, a penalty is applied and the EWMA jumps
-//! to a high value. The load is calculated as `max(rtt * (pending + 1), penalty)`
-//! (so at least RTT with no pending requests).
+//! Responses are classified through the [`ResponseFailureHint`] trait, which
+//! are only relevant for HTTP, as other transports have no hint.
 //!
-//! Both RTT and penalty decay over time via the EWMA.
-//!
-//! For this type to work responses must implement the `ResponseFailureHint`
-//! trait, which classifies responses into failure categories (rate-limited,
-//! service unavailable, internal error). For non-HTTP responses the default
-//! implementation returns no failure hint, so only RTT tracking occurs.
-//!
-//! For gRPC, `failure_hint()` detects `RESOURCE_EXHAUSTED` (code 8) and
-//! `UNAVAILABLE` (code 14) from the `grpc-status` header in trailers-only
-//! (unary) responses. Streaming gRPC puts `grpc-status` in trailers which
-//! are not visible at response-head time; the circuit breaker handles
-//! streaming gRPC failures via `GrpcRetryPushbackClassifyEos`.
+//! In-flight requests are counted the way Tower's PeakEwma counts them, using
+//! a [`Handle`] holding a clone of the shared state, and the strong count of
+//! that `Arc` (minus the service's own reference) is the pending count. The
+//! handle records a measurement when it drops, so a cancelled request still
+//! contributes its elapsed time.
 
 #![deny(rust_2018_idioms, clippy::disallowed_methods, clippy::disallowed_types)]
 #![forbid(unsafe_code)]
@@ -39,15 +33,12 @@ use std::{
     future::Future,
     marker::PhantomData,
     pin::Pin,
-    sync::{
-        atomic::{AtomicU32, Ordering},
-        Arc,
-    },
+    sync::Arc,
     task::{Context, Poll},
     time::Duration,
 };
 use tokio::time::Instant;
-use tower::load::Load;
+use tower::load::{CompleteOnResponse, Load, TrackCompletion};
 use tower_service::Service;
 
 /// Default maximum duration for Retry-After hints.
@@ -237,32 +228,6 @@ impl ResponseFailureHint for tokio::io::DuplexStream {}
 #[cfg(feature = "tokio-test")]
 impl ResponseFailureHint for tokio_test::io::Mock {}
 
-/// Amplification factor for Retry-After penalties.
-///
-/// When a 429 or 503 response includes a Retry-After header, the load biaser injects
-/// an amplified penalty so it remains meaningful through the server's requested
-/// avoidance window. The injected value is:
-///
-///   `penalty_secs * RETRY_AFTER_PENALTY_FACTOR * exp(retry_after / penalty_decay)`
-///
-/// which decays via EWMA to `penalty_secs * RETRY_AFTER_PENALTY_FACTOR` at
-/// exactly `t = retry_after`. The factor controls how aggressively the endpoint
-/// is avoided near the Retry-After deadline:
-///
-/// At 1.0 the endpoint is fully avoided (~0% traffic) through the window and 46s
-/// beyond -- too aggressive for early-recovery discovery. At 0.1, traffic leaks
-/// well before the deadline. We use 0.5: the penalty at deadline is penalty_secs/2
-/// (roughly 50x healthy load in a 5-endpoint pool), which lets occasional probes
-/// through in the second half while still exceeding a plain failure penalty for
-/// RA >= ~7s. For pools with 100+ endpoints the factor barely matters because P2C
-/// random pair selection already makes the penalized endpoint unlikely to be drawn.
-///
-/// Sending occasional probe traffic during the window is valuable: the endpoint
-/// may recover early, or a fresh 429 or 503 with a different RA provides updated
-/// information. The circuit breaker (when enabled) provides a strict backoff
-/// window, ie. `max(backoff, retry_after)`.
-const RETRY_AFTER_PENALTY_FACTOR: f64 = 0.5;
-
 /// Configuration for LoadBiaser behavior.
 #[derive(Clone, Debug)]
 pub struct LoadBiaserConfig {
@@ -273,17 +238,10 @@ pub struct LoadBiaserConfig {
     /// Controls how quickly RTT estimates adapt to changing latency
     pub rtt_decay: Duration,
 
-    /// The penalty value to inject on failure responses (429, 503, 5xx) in seconds
+    /// The base penalty to inject on failure responses (429, 503, 5xx)
+    /// in seconds,  when the server sends no backoff hint.
     pub penalty_secs: f64,
 
-    /// Decay duration for the penalty EWMA.
-    /// Controls how quickly the penalty decays after a failure response
-    pub penalty_decay: Duration,
-
-    /// Whether load biasing penalties are enabled. When false, only RTT tracking
-    /// is active (PeakEwma equivalent).
-    pub enabled: bool,
-
     /// Maximum Retry-After duration to honor. Clamped to this value.
     pub max_duration: Duration,
 }
@@ -295,9 +253,6 @@ impl Default for LoadBiaserConfig {
             rtt_decay: Duration::from_secs(10),
             // 5 second penalty on failure responses (429, 503, 5xx)
             penalty_secs: 5.0,
-            // 10 second decay for penalty - mostly gone after ~30 seconds
-            penalty_decay: Duration::from_secs(10),
-            enabled: false,
             max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
         }
     }
@@ -305,123 +260,148 @@ impl Default for LoadBiaserConfig {
 
 /// Shared per-endpoint state behind a single `Arc`.
 ///
-/// Combines the EWMA trackers (under a mutex for atomic RTT+penalty reads),
-/// the in-flight request counter, and the immutable config fields that every
-/// response future needs. One allocation per endpoint instead of two, and
-/// futures carry a single `Arc` clone instead of copying config fields.
+/// Holds the RTT EWMA and the immutable config fields every response future
+/// needs. The strong count of the `Arc` wrapping this state is also used as
+/// the in-flight request count: each [`Handle`] clones it, so subtracting the
+/// service's own reference gives the pending count.
 #[derive(Debug)]
 struct SharedState {
-    /// RTT and penalty EWMAs; read-locked in load(), write-locked in poll()
-    metrics: RwLock<LoadMetrics>,
-    /// Count of in-flight requests (up on call, down on response)
-    pending: AtomicU32,
-    /// Penalty value to inject on failure responses (in seconds)
+    /// RwLocked RTT EWMA. Read in load(), written when measuring.
+    rtt: RwLock<Ewma>,
+    /// Penalty value to inject on failure responses (in seconds) when no
+    /// hint is present.
     penalty_secs: f64,
-    /// Whether penalty injection is enabled
-    enabled: bool,
     /// Maximum Retry-After duration to honor (clamped)
     max_duration: Duration,
-    /// Decay duration for the penalty EWMA (used to amplify Retry-After penalties)
-    penalty_decay: Duration,
 }
 
-#[derive(Debug)]
-struct LoadMetrics {
-    /// EWMA RTT tracking, updated on each response
-    rtt: Ewma,
-    /// EWMA penalty tracking, updated on failure responses (429, 503, 5xx)
-    penalty: Ewma,
+impl SharedState {
+    /// Translates a classified failure into an effective RTT measurement,
+    /// taking into account Retry-After/grpc-retry-pushback hints.
+    fn effective_rtt<R: ResponseFailureHint>(&self, hint: FailureHint, resp: &R) -> f64 {
+        let from_hint = match hint {
+            FailureHint::RateLimited | FailureHint::ServiceUnavailable => resp
+                .rate_limit_hint(self.max_duration)
+                .map(|d| d.as_secs_f64()),
+            FailureHint::InternalError => None,
+        };
+
+        // A hint, if present, is honored verbatim (capped).
+        from_hint.unwrap_or(self.penalty_secs)
+    }
 }
 
-/// A service wrapper that tracks RTT and biases load metrics based on failure responses.
+/// A service wrapper that tracks RTT and biases load metrics on failure responses.
 ///
 /// `LoadBiaser` provides load metrics for P2C load balancing by tracking request latency
 /// (RTT) via EWMA, in-flight requests, and by injecting penalties when failure responses
-/// (429, 503, 5xx) are detected.
+/// (429, 503, 5xx) are detected. The load metric is `rtt * (pending + 1)`, identical
+/// to Tower's PeakEwma.
 ///
-/// The `load()` method returns `max(rtt * (pending + 1), penalty)`, causing P2C to
-/// prefer endpoints with lower latency and fewer in-flight requests, while avoiding
-/// rate-limited endpoints.
+/// Successes record their measured RTT, and failures record a computed RTT where a
+/// penalty has been injected.
+///
+/// The completion type `C` controls when a successful request's RTT is
+/// measured, mirroring PeakEwma. The default drops the handle when the response
+/// future resolves. `hyper_balance::PendingUntilFirstData` defers it to the
+/// first response body frame, matching the proxy's HTTP balancer.
 #[derive(Debug)]
-pub struct LoadBiaser<S> {
+pub struct LoadBiaser<S, C = CompleteOnResponse> {
     inner: S,
-    /// Per-endpoint metrics, pending counter, and penalty config
+    /// Per-endpoint RTT state and failure config.
     shared: Arc<SharedState>,
-    config: LoadBiaserConfig,
+    completion: C,
 }
 
 /// A `NewService` implementation that creates `LoadBiaser` wrappers.
 #[derive(Debug)]
-pub struct NewLoadBiaser<N, Req> {
+pub struct NewLoadBiaser<N, Req, C = CompleteOnResponse> {
     inner: N,
+    completion: C,
     config: LoadBiaserConfig,
     _marker: PhantomData<fn(Req)>,
 }
 
-/// Response future that tracks RTT and checks for failure responses.
+/// Tracks one in-flight request and records a measurement when it drops.
 ///
-/// When the inner future completes we store the RTT based on elapsed time since the
-/// request started, decrement the pending counter, and if the response indicates a
-/// failure (using the `ResponseFailureHint` trait) we inject a penalty.
-#[pin_project(PinnedDrop)]
-pub struct LoadBiaserFuture<F, Rsp> {
+/// It clones the shared `Arc`, so while it lives the endpoint's strong count
+/// is incremented by one. On drop it records the elapsed time as an RTT
+/// measurement, so a cancelled request still measures latency.
+/// A failure disables the handle first, because it needs to record its own
+/// penalty-injected measurement.
+#[derive(Debug)]
+pub struct Handle {
+    shared: Arc<SharedState>,
+    sent_at: Instant,
+    enabled: bool,
+}
+
+impl Handle {
+    /// Records the measured elapsed time, then prevents the drop from recording
+    /// again.
+    fn record_elapsed(&mut self, now: Instant) {
+        if self.enabled {
+            let elapsed = now.saturating_duration_since(self.sent_at).as_secs_f64();
+            self.shared.rtt.write().add_peak(elapsed, now);
+            self.enabled = false;
+        }
+    }
+
+    /// Records a computed effective RTT and disables the handle so its drop does
+    /// not also record the measurement of the failure.
+    fn record_effective_rtt(&mut self, value: f64, now: Instant) {
+        self.shared.rtt.write().add_peak(value, now);
+        self.enabled = false;
+    }
+}
+
+impl Drop for Handle {
+    fn drop(&mut self) {
+        // Ensure we take a measurement if the request was cancelled (enabled
+        // should be true).
+        self.record_elapsed(Instant::now());
+    }
+}
+
+/// Response future that records a measurement and checks for failure responses.
+///
+/// On resolution a failure injects a computed RTT and disables the handle.
+/// A success leaves the handle for the completion tracker, which decides when
+/// to record the measurement.
+#[pin_project]
+pub struct LoadBiaserFuture<F, Rsp, C> {
     #[pin]
     inner: F,
-    /// Request start instant for RTT calculation
-    start: Instant,
-    /// Shared endpoint state (metrics, pending counter, penalty config)
-    shared: Arc<SharedState>,
-    /// Whether we've already decremented pending
-    completed: bool,
+    /// Handle whose drop records the RTT measurement.
+    handle: Option<Handle>,
+    completion: C,
     /// Marker for the response type (`ResponseFailureHint` bound)
     _response: PhantomData<fn() -> Rsp>,
 }
 
-impl<S> LoadBiaser<S> {
-    /// Creates a new `LoadBiaser` wrapping the given service.
+impl<S, C> LoadBiaser<S, C> {
+    /// Creates a new `LoadBiaser` with an explicit completion tracker.
     #[must_use]
-    pub fn new(inner: S, mut config: LoadBiaserConfig) -> Self {
-        if config.penalty_secs.is_nan() || config.penalty_secs < 0.0 {
-            tracing::warn!(
-                penalty_secs = config.penalty_secs,
-                "penalty_secs is NaN or negative, clamping to 0.0"
-            );
-            config.penalty_secs = 0.0;
-        }
-        if config.penalty_decay < MIN_DECAY {
-            tracing::warn!(
-                penalty_decay = ?config.penalty_decay,
-                min = ?MIN_DECAY,
-                "penalty_decay below minimum, will be clamped by EWMA constructor"
-            );
-        }
-        if config.rtt_decay < MIN_DECAY {
-            tracing::warn!(
-                rtt_decay = ?config.rtt_decay,
-                min = ?MIN_DECAY,
-                "rtt_decay below minimum, will be clamped by EWMA constructor"
-            );
-        }
+    pub fn with_completion(inner: S, completion: C, config: LoadBiaserConfig) -> Self {
+        let config = sanitize(config);
         let now = Instant::now();
         let shared = Arc::new(SharedState {
-            metrics: RwLock::new(LoadMetrics {
-                // Initialize RTT with default_rtt (not INFINITY) so that fresh
-                // endpoints have comparable load to unmeasured ones. This matches
-                // Tower's PeakEwma behavior and prevents P2C from permanently
-                // preferring endpoints whose first request happened to be fast.
-                rtt: Ewma::new_with_value(config.rtt_decay, now, config.default_rtt.as_secs_f64()),
-                penalty: Ewma::new(config.penalty_decay, now),
-            }),
-            pending: AtomicU32::new(0),
+            // Initialize RTT with default_rtt (not INFINITY) so that fresh
+            // endpoints have comparable load to unmeasured ones. This matches
+            // Tower's PeakEwma behavior and prevents P2C from permanently
+            // preferring endpoints whose first request happened to be fast.
+            rtt: RwLock::new(Ewma::new_with_value(
+                config.rtt_decay,
+                now,
+                config.default_rtt.as_secs_f64(),
+            )),
             penalty_secs: config.penalty_secs,
-            enabled: config.enabled,
             max_duration: config.max_duration,
-            penalty_decay: config.penalty_decay,
         });
         Self {
             inner,
             shared,
-            config,
+            completion,
         }
     }
 
@@ -429,45 +409,59 @@ impl<S> LoadBiaser<S> {
     pub fn get_ref(&self) -> &S {
         &self.inner
     }
-}
 
-impl<S: Clone> Clone for LoadBiaser<S> {
-    fn clone(&self) -> Self {
-        Self {
-            inner: self.inner.clone(),
+    fn handle(&self) -> Handle {
+        Handle {
             shared: self.shared.clone(),
-            config: self.config.clone(),
+            sent_at: Instant::now(),
+            enabled: true,
         }
     }
 }
 
-impl<S> Load for LoadBiaser<S> {
+impl<S> LoadBiaser<S, CompleteOnResponse> {
+    /// Creates a new `LoadBiaser` recording RTT when the response resolves.
+    #[must_use]
+    pub fn new(inner: S, config: LoadBiaserConfig) -> Self {
+        Self::with_completion(inner, CompleteOnResponse::default(), config)
+    }
+}
+
+fn sanitize(mut config: LoadBiaserConfig) -> LoadBiaserConfig {
+    if config.penalty_secs.is_nan() || config.penalty_secs < 0.0 {
+        tracing::warn!(
+            penalty_secs = config.penalty_secs,
+            "penalty_secs is NaN or negative, clamping to 0.0"
+        );
+        config.penalty_secs = 0.0;
+    }
+    if config.rtt_decay < MIN_DECAY {
+        tracing::warn!(
+            rtt_decay = ?config.rtt_decay,
+            min = ?MIN_DECAY,
+            "rtt_decay below minimum, will be clamped by EWMA constructor"
+        );
+    }
+    config
+}
+
+impl<S, C> Load for LoadBiaser<S, C> {
     type Metric = f64;
 
     fn load(&self) -> Self::Metric {
-        let pending = self.shared.pending.load(Ordering::Acquire);
+        // Pending is the strong count of the Arc minus the service's own ref.
+        let pending = (Arc::strong_count(&self.shared) as u32).saturating_sub(1);
         let now = Instant::now();
 
-        let (rtt, penalty_val) = {
-            let metrics = self.shared.metrics.read();
-            (metrics.rtt.get_at(now), metrics.penalty.get_at(now))
-        };
-
-        let penalty = if penalty_val.is_infinite() {
-            0.0
-        } else {
-            penalty_val
-        };
+        let rtt = self.shared.rtt.read().get_at(now);
 
-        // Load = RTT * (pending + 1), minimum will be `penalty`
-        // The +1 ensures idle endpoints have some load based on RTT
-        let base = rtt * f64::from(pending.saturating_add(1));
-        let load = f64::max(base, penalty);
+        // Load = RTT * (pending + 1). The +1 keeps an idle endpoint's cost tied
+        // to its RTT so a slow-but-quiet endpoint is not treated as free.
+        let load = rtt * f64::from(pending.saturating_add(1));
 
         tracing::trace!(
             rtt_secs = rtt,
             pending = pending,
-            penalty_secs = penalty,
             load = load,
             "LoadBiaser::load"
         );
@@ -476,148 +470,114 @@ impl<S> Load for LoadBiaser<S> {
     }
 }
 
-impl<S, Req> Service<Req> for LoadBiaser<S>
+impl<S, C, Req> Service<Req> for LoadBiaser<S, C>
 where
     S: Service<Req>,
     S::Response: ResponseFailureHint,
+    C: TrackCompletion<Handle, S::Response> + Clone,
 {
-    type Response = S::Response;
+    type Response = C::Output;
     type Error = S::Error;
-    type Future = LoadBiaserFuture<S::Future, S::Response>;
+    type Future = LoadBiaserFuture<S::Future, S::Response, C>;
 
     fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
         self.inner.poll_ready(cx)
     }
 
     fn call(&mut self, req: Req) -> Self::Future {
-        let prev = self.shared.pending.fetch_add(1, Ordering::AcqRel);
-        debug_assert!(prev < u32::MAX, "pending counter overflow");
-
         LoadBiaserFuture {
             inner: self.inner.call(req),
-            start: Instant::now(),
-            shared: self.shared.clone(),
-            completed: false,
+            handle: Some(self.handle()),
+            completion: self.completion.clone(),
             _response: PhantomData,
         }
     }
 }
 
-impl<N, Req> NewLoadBiaser<N, Req> {
-    /// Creates a new `NewLoadBiaser` with the given configuration.
-    pub fn new(config: LoadBiaserConfig, inner: N) -> Self {
+impl<N, Req, C> NewLoadBiaser<N, Req, C> {
+    /// Creates a `NewLoadBiaser` with an explicit completion tracker.
+    pub fn with_completion(config: LoadBiaserConfig, completion: C, inner: N) -> Self {
         Self {
             inner,
+            completion,
             config,
             _marker: PhantomData,
         }
     }
 }
 
-impl<N: Clone, Req> Clone for NewLoadBiaser<N, Req> {
+impl<N, Req> NewLoadBiaser<N, Req, CompleteOnResponse> {
+    /// Creates a new `NewLoadBiaser` with the given configuration.
+    pub fn new(config: LoadBiaserConfig, inner: N) -> Self {
+        Self::with_completion(config, CompleteOnResponse::default(), inner)
+    }
+}
+
+impl<N: Clone, Req, C: Clone> Clone for NewLoadBiaser<N, Req, C> {
     fn clone(&self) -> Self {
         Self {
             inner: self.inner.clone(),
+            completion: self.completion.clone(),
             config: self.config.clone(),
             _marker: PhantomData,
         }
     }
 }
 
-impl<T, N, S, Req> NewService<T> for NewLoadBiaser<N, Req>
+impl<T, N, S, Req, C> NewService<T> for NewLoadBiaser<N, Req, C>
 where
     N: NewService<T, Service = S>,
+    C: Clone,
 {
-    type Service = LoadBiaser<S>;
+    type Service = LoadBiaser<S, C>;
 
-    fn new_service(&self, target: T) -> LoadBiaser<S> {
-        LoadBiaser::new(self.inner.new_service(target), self.config.clone())
+    fn new_service(&self, target: T) -> LoadBiaser<S, C> {
+        LoadBiaser::with_completion(
+            self.inner.new_service(target),
+            self.completion.clone(),
+            self.config.clone(),
+        )
     }
 }
 
-impl<F, Rsp, E> Future for LoadBiaserFuture<F, Rsp>
+impl<F, Rsp, E, C> Future for LoadBiaserFuture<F, Rsp, C>
 where
     F: Future<Output = Result<Rsp, E>>,
     Rsp: ResponseFailureHint,
+    C: TrackCompletion<Handle, Rsp>,
 {
-    type Output = F::Output;
+    type Output = Result<C::Output, E>;
 
     fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
         let this = self.project();
-        let mut result = ready!(this.inner.poll(cx));
-        let shared = &**this.shared;
-
+        let result = ready!(this.inner.poll(cx));
         let now = Instant::now();
-        let elapsed = now.saturating_duration_since(*this.start).as_secs_f64();
-
-        // Parse rate limit hint while we have &mut access to the response (when load
-        // biasing is enabled).
-        if shared.enabled {
-            if let Ok(ref mut resp) = result {
-                resp.attach_parsed_rate_limit_hint();
-            }
-        }
-
-        {
-            let mut metrics = shared.metrics.write();
-            if let Ok(ref resp) = result {
-                // Update RTT on all HTTP responses (including 429, 503, 5xx).
-                // Only transport-level errors (connection refused, resets) skip
-                // this update, because broken endpoints fail fast and would bias
-                // P2C toward sending more traffic to the broken endpoint.
-                metrics.rtt.add_peak(elapsed, now);
-
-                if shared.enabled {
-                    if let Some(hint) = resp.failure_hint() {
-                        let base_penalty = shared.penalty_secs;
-
-                        // For rate-limited and service-unavailable responses, amplify
-                        // the penalty using the Retry-After hint so it remains meaningful
-                        // through the server's requested avoidance window.
-                        let penalty_val = match hint {
-                            FailureHint::RateLimited | FailureHint::ServiceUnavailable => {
-                                match resp.rate_limit_hint(shared.max_duration) {
-                                    Some(ra) if ra.as_secs_f64() > 0.0 => {
-                                        let decay_secs =
-                                            shared.penalty_decay.max(MIN_DECAY).as_secs_f64();
-                                        let amplified = base_penalty
-                                            * RETRY_AFTER_PENALTY_FACTOR
-                                            * (ra.as_secs_f64() / decay_secs).exp();
-                                        amplified.min(1e12)
-                                    }
-                                    _ => base_penalty,
-                                }
-                            }
-                            FailureHint::InternalError => base_penalty,
-                        };
-
-                        tracing::debug!(
-                            penalty_secs = penalty_val,
-                            rtt_secs = elapsed,
-                            ?hint,
-                            "Detected failure response - injecting load penalty"
-                        );
-                        metrics.penalty.add_peak(penalty_val, now);
-                    }
+        let mut handle = this.handle.take().expect("polled after completion");
+
+        match result {
+            Ok(mut resp) => {
+                if let Some(hint) = resp.failure_hint() {
+                    // Cache the uncapped hint while we hold &mut, then record the
+                    // failure as a penalized effective RTT.
+                    resp.attach_parsed_rate_limit_hint();
+                    let value = handle.shared.effective_rtt(hint, &resp);
+                    tracing::debug!(rtt_secs = value, ?hint, "Failure recorded as effective RTT");
+                    handle.record_effective_rtt(value, now);
                 }
-            }
-        }
 
-        shared.pending.fetch_sub(1, Ordering::Release);
-        *this.completed = true;
+                // The completion tracker will drop the handle now or at first body
+                // frame, and that drop will record the measured RTT.
+                let output = this.completion.track_completion(handle, resp);
 
-        Poll::Ready(result)
-    }
-}
+                Poll::Ready(Ok(output))
+            }
+            Err(e) => {
+                // A transport error (connection refused, reset) or cancellation with
+                // no response to classify. Record its elapsed time.
+                drop(handle);
 
-#[pin_project::pinned_drop]
-impl<F, Rsp> PinnedDrop for LoadBiaserFuture<F, Rsp> {
-    fn drop(self: Pin<&mut Self>) {
-        let this = self.project();
-        // Only decrement if we haven't already done so in poll().
-        // Avoids leaking the pending count upon cancellation.
-        if !*this.completed {
-            this.shared.pending.fetch_sub(1, Ordering::Release);
+                Poll::Ready(Err(e))
+            }
         }
     }
 }
@@ -628,33 +588,25 @@ mod tests {
     use std::convert::Infallible;
     use tokio::time;
 
-    impl<S> LoadBiaser<S> {
+    impl<S, C> LoadBiaser<S, C> {
         pub fn get_rtt(&self) -> f64 {
-            self.shared.metrics.read().rtt.get()
-        }
-
-        pub fn get_penalty(&self) -> f64 {
-            self.shared.metrics.read().penalty.get()
+            self.shared.rtt.read().get()
         }
 
         pub fn get_pending(&self) -> u32 {
-            self.shared.pending.load(Ordering::Acquire)
+            (Arc::strong_count(&self.shared) as u32).saturating_sub(1)
         }
 
-        pub fn inject_penalty(&self, penalty_secs: f64) {
-            self.shared
-                .metrics
-                .write()
-                .penalty
-                .add_peak(penalty_secs, Instant::now());
+        pub fn inject_rtt(&self, rtt_secs: f64) {
+            self.shared.rtt.write().add_peak(rtt_secs, Instant::now());
         }
+    }
 
-        pub fn inject_rtt(&self, rtt_secs: f64) {
-            self.shared
-                .metrics
-                .write()
-                .rtt
-                .add_peak(rtt_secs, Instant::now());
+    impl Handle {
+        /// Disable a handle so dropping it records nothing. Lets a test hold a
+        /// handle just to raise the pending count.
+        fn disable(mut self) {
+            self.enabled = false;
         }
     }
 
@@ -688,31 +640,11 @@ mod tests {
         }
     }
 
-    // Mock service that always returns an error.
-    #[derive(Clone)]
-    struct ErrorService;
-
-    impl Service<()> for ErrorService {
-        type Response = http::Response<&'static str>;
-        type Error = &'static str;
-        type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
-
-        fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
-            Poll::Ready(Ok(()))
-        }
-
-        fn call(&mut self, _: ()) -> Self::Future {
-            futures::future::ready(Err("connection refused"))
-        }
-    }
-
     fn test_config() -> LoadBiaserConfig {
         LoadBiaserConfig {
             default_rtt: Duration::from_millis(100), // 0.1s default
             rtt_decay: Duration::from_secs(10),
             penalty_secs: 5.0,
-            penalty_decay: Duration::from_secs(10),
-            enabled: true,
             max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
         }
     }
@@ -743,17 +675,12 @@ mod tests {
         // RTT * (0 + 1) = 0.05
         let load_idle = biaser.load();
 
-        // Start a request (not awaited yet)
-        // Increment pending to simulate in-flight requests
-        biaser.shared.pending.fetch_add(1, Ordering::AcqRel);
-
-        // RTT * (1 + 1) = 0.05 * 2 = 0.1
+        // Hold a handle to simulate an in-flight request: the strong count of
+        // the shared Arc increments per live handle like during a call.
+        let h1 = biaser.handle();
         let load_one_pending = biaser.load();
 
-        // Increment again
-        biaser.shared.pending.fetch_add(1, Ordering::AcqRel);
-
-        // RTT * (2 + 1) = 0.05 * 3 = 0.15
+        let h2 = biaser.handle();
         let load_two_pending = biaser.load();
 
         assert!(
@@ -768,6 +695,9 @@ mod tests {
             load_two_pending,
             load_one_pending
         );
+
+        h1.disable();
+        h2.disable();
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
@@ -796,6 +726,78 @@ mod tests {
         );
     }
 
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_fast_failure_more_expensive_than_healthy() {
+        // A fast 503 should be more expensive than a fast 200
+        let mut failing = LoadBiaser::new(
+            MockService::new(http::StatusCode::SERVICE_UNAVAILABLE),
+            test_config(),
+        );
+        let mut healthy = LoadBiaser::new(MockService::new(http::StatusCode::OK), test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        let _ = failing.call(()).await;
+        let _ = healthy.call(()).await;
+
+        let load_failing = failing.load();
+        let load_healthy = healthy.load();
+
+        assert!(
+            load_failing > load_healthy,
+            "fast failure should read higher load than healthy: {load_failing} > {load_healthy}"
+        );
+        // The failure records the base penalty (5s)
+        assert!(
+            load_failing > 4.0,
+            "503 should record the base effective RTT (~5s): {load_failing}"
+        );
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_retry_after_hint_load_grows_monotonically() {
+        let hints = [0u64, 1, 5, 30, 120, 300];
+        let mut prev = f64::NEG_INFINITY;
+
+        for secs in hints {
+            let inner = RetryAfterService {
+                status: http::StatusCode::TOO_MANY_REQUESTS,
+                retry_after_secs: secs,
+            };
+            let mut biaser = LoadBiaser::new(inner, test_config());
+
+            time::sleep(Duration::from_millis(1)).await;
+
+            let _ = biaser.call(()).await;
+            let load = biaser.load();
+            assert!(
+                load >= prev,
+                "load must be monotonic in Retry-After: hint={secs}s gave {load}, previous {prev}"
+            );
+            prev = load;
+        }
+    }
+
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_faster_healthy_is_cheaper_than_slower_healthy() {
+        let mut fast = LoadBiaser::new(MockService::new(http::StatusCode::OK), test_config());
+        let slow = LoadBiaser::new(MockService::new(http::StatusCode::OK), test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        let _ = fast.call(()).await;
+
+        fast.inject_rtt(0.01);
+        slow.inject_rtt(2.0);
+
+        assert!(
+            fast.load() < slow.load(),
+            "lower RTT must rank cheaper: fast={} slow={}",
+            fast.load(),
+            slow.load()
+        );
+    }
+
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_429_injects_penalty() {
         let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
@@ -803,18 +805,14 @@ mod tests {
 
         time::sleep(Duration::from_millis(1)).await;
 
-        // Penalty should be infinite (none) before we get requests.
-        assert!(biaser.get_penalty().is_infinite());
-
-        // Make a request that returns 429.
+        // Make a request that returns 429 (no hint)
         let _ = biaser.call(()).await;
 
-        // A penalty should be injected (5 seconds) after seeing a 429
-        let penalty = biaser.get_penalty();
+        // The 429 records the penalty (5s)
+        let rtt = biaser.get_rtt();
         assert!(
-            (penalty - 5.0).abs() < 0.1,
-            "penalty should be ~5s after 429: {}",
-            penalty
+            (rtt - 5.0).abs() < 0.1,
+            "429 without a hint should record ~5s RTT: {rtt}"
         );
 
         // Load should be at least the penalty.
@@ -832,20 +830,15 @@ mod tests {
         // Make a request that returns 200.
         let _ = biaser.call(()).await;
 
-        // Penalty should still be infinite (none).
-        assert!(
-            biaser.get_penalty().is_infinite(),
-            "penalty should not be injected for 200"
-        );
+        // A success records only its measured (near-zero) RTT, well below
+        // the 5s penalty base.
+        let rtt = biaser.get_rtt();
+        assert!(rtt < 1.0, "200 should record a low RTT, got: {rtt}");
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_penalty_decays_over_time() {
         let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
-        // Use a custom config with default_rtt=1ms (0.001s) so load() tracks penalty directly.
-        // inject_rtt(0.001) would be a no-op here: add_peak() only replaces when the new
-        // value exceeds the decayed current value, so 0.001 < 0.1 (test_config default)
-        // would fail the peak check and fall through to add() which no-ops at ts==self.timestamp.
         let config = LoadBiaserConfig {
             default_rtt: Duration::from_millis(1),
             ..test_config()
@@ -854,53 +847,44 @@ mod tests {
 
         time::sleep(Duration::from_millis(1)).await;
 
-        // Trigger penalty via 429 response
+        // Trigger penalty via 429 response.
         let _ = biaser.call(()).await;
 
-        // After call completes, pending is back to 0 (LoadBiaserFuture::poll decrements
-        // before returning Poll::Ready), so load = max(rtt * 1, penalty).
+        // After call completes, pending is back to 0
         assert_eq!(
             biaser.get_pending(),
             0,
             "pending should be 0 after call completes"
         );
 
-        // load() calls get_at(now) which projects the decayed penalty.
-        // Immediately after injection: penalty ~5.0, rtt ~0.001, so load ~5.0
+        // Immediately after: effective RTT ~5.0, so load ~5.0.
         let load_before = biaser.load();
         assert!(
             load_before > 4.0,
-            "load before decay should be dominated by the injected 429 penalty: {}",
+            "load before decay should reflect the 429 penalized RTT: {}",
             load_before
         );
 
         // Advance time by one decay period (10s).
-        // Penalty decays: 5.0 * e^(-10/10) ~ 1.839
+        // Penalty RTT decays: 5.0 * e^(-10/10) ~ 1.839
         time::sleep(Duration::from_secs(10)).await;
 
-        // load() projects the decayed penalty at the new timestamp.
-        // No EWMA mutation needed since this is pure projection.
         let load_after = biaser.load();
         assert!(
-            load_after > 1.0,
-            "load after one decay period should still be dominated by decayed penalty: {}",
-            load_after
-        );
-        assert!(
-            load_after < 2.5,
-            "load after one decay period should reflect substantial decay (5.0 * e^-1 ~ 1.839): {}",
+            load_after > 1.0 && load_after < 2.5,
+            "load after one decay period should reflect 5.0 * e^-1 ~ 1.839: {}",
             load_after
         );
         assert!(
             load_after < load_before,
-            "penalty should decay over time as observed through load(): {} < {}",
+            "effective RTT should decay over time: {} < {}",
             load_after,
             load_before
         );
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_sequential_failures_maintain_penalty() {
+    async fn test_sequential_failures_maintain_load() {
         let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
         let config = LoadBiaserConfig {
             default_rtt: Duration::from_millis(1),
@@ -911,89 +895,39 @@ mod tests {
         time::sleep(Duration::from_millis(1)).await;
 
         let _ = biaser.call(()).await;
-        let penalty_after_first = biaser.get_penalty();
+        let rtt_first = biaser.get_rtt();
         assert!(
-            (penalty_after_first - 5.0).abs() < 0.1,
-            "first 429 should inject ~5s penalty, got: {penalty_after_first}"
+            (rtt_first - 5.0).abs() < 0.1,
+            "first 429 should record ~5s, got: {rtt_first}"
         );
 
-        // Second 429 arrives 1s later. The penalty has decayed slightly but
-        // add_peak replaces the decayed value with the fresh penalty.
+        // A second 429 one second later: add_peak replaces the slightly decayed
+        // value with the new 5s measurement.
         time::sleep(Duration::from_secs(1)).await;
         let _ = biaser.call(()).await;
-        let penalty_after_second = biaser.get_penalty();
+        let rtt_second = biaser.get_rtt();
         assert!(
-            (penalty_after_second - 5.0).abs() < 0.1,
-            "second 429 should maintain penalty at ~5s, got: {penalty_after_second}"
+            (rtt_second - 5.0).abs() < 0.1,
+            "second 429 should maintain ~5s, got: {rtt_second}"
         );
 
-        // Third 429 after another second. Still at ~5s.
         time::sleep(Duration::from_secs(1)).await;
         let _ = biaser.call(()).await;
-        let penalty_after_third = biaser.get_penalty();
+        let rtt_third = biaser.get_rtt();
         assert!(
-            (penalty_after_third - 5.0).abs() < 0.1,
-            "third 429 should maintain penalty at ~5s, got: {penalty_after_third}"
+            (rtt_third - 5.0).abs() < 0.1,
+            "third 429 should maintain ~5s, got: {rtt_third}"
         );
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_load_is_max_of_rtt_based_and_penalty() {
-        let inner = MockService::new(http::StatusCode::OK);
-        let biaser = LoadBiaser::new(inner, test_config());
-
-        time::sleep(Duration::from_millis(1)).await;
-
-        // Inject a high RTT (10 seconds)
-        biaser.inject_rtt(10.0);
-
-        // Inject a lower penalty (1 second)
-        biaser.inject_penalty(1.0);
-
-        // Load should be RTT-based since it's higher: 10 * 1 = 10
-        let load = biaser.load();
-        assert!(
-            (load - 10.0).abs() < 0.1,
-            "load should be RTT-based when higher: {}",
-            load
-        );
-
-        // Now inject a very high penalty
-        biaser.inject_penalty(20.0);
-
-        // Load should be penalty since it's higher
-        let load = biaser.load();
-        assert!(
-            (load - 20.0).abs() < 0.1,
-            "load should be penalty when higher: {}",
-            load
-        );
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_clone_shares_state() {
-        let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
-        let mut biaser1 = LoadBiaser::new(inner.clone(), test_config());
-        let biaser2 = biaser1.clone();
-
-        time::sleep(Duration::from_millis(1)).await;
-
-        // Trigger penalty on biaser1
-        let _ = biaser1.call(()).await;
-
-        // biaser2 should see the same penalty (shared state)
-        assert_eq!(biaser1.get_penalty(), biaser2.get_penalty());
-        assert_eq!(biaser1.load(), biaser2.load());
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_pending_decremented_on_completion() {
+    async fn test_pending_counted_via_strong_count() {
         let inner = MockService::new(http::StatusCode::OK);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         assert_eq!(biaser.get_pending(), 0, "pending should start at 0");
 
-        // Start a request, pending increments
+        // Start a request: the future holds a handle, raising the count.
         let fut = biaser.call(());
         assert_eq!(
             biaser.get_pending(),
@@ -1001,7 +935,7 @@ mod tests {
             "pending should be 1 during request"
         );
 
-        // Complete the request, pending decrements
+        // Complete the request: the handle drops, lowering the count.
         let _ = fut.await;
         assert_eq!(
             biaser.get_pending(),
@@ -1011,38 +945,11 @@ mod tests {
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_rtt_not_updated_on_error() {
-        let inner = ErrorService;
-        let mut biaser = LoadBiaser::new(inner, test_config());
-
-        time::sleep(Duration::from_millis(1)).await;
-
-        // RTT should start at default_rtt before any requests
-        let initial_rtt = biaser.get_rtt();
-        assert!(
-            (initial_rtt - 0.1).abs() < 0.01,
-            "RTT should start at default_rtt (0.1s), got: {initial_rtt}"
-        );
-
-        // Make a request that returns an error
-        let _ = biaser.call(()).await;
-
-        // RTT should remain at default_rtt because error responses don't
-        // update RTT. This prevents P2C from routing more traffic to broken
-        // endpoints that fail fast (appearing "fast" to the load metric).
-        let rtt_after_error = biaser.get_rtt();
-        assert!(
-            (rtt_after_error - initial_rtt).abs() < 0.01,
-            "RTT should not change on error responses: {rtt_after_error} vs {initial_rtt}"
-        );
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_pending_decremented_on_cancellation() {
+    async fn test_cancellation_records_measurement() {
         use tokio::sync::oneshot;
 
         // Build a service whose response future blocks on a oneshot receiver.
-        // This lets us drop the future mid-flight to test PinnedDrop.
+        // This lets us drop the future mid-flight.
         struct DelayedService {
             rx: Option<oneshot::Receiver<http::Response<&'static str>>>,
         }
@@ -1074,12 +981,10 @@ mod tests {
                 match Pin::new(&mut self.rx).poll(cx) {
                     Poll::Ready(Ok(resp)) => Poll::Ready(Ok(resp)),
                     Poll::Ready(Err(_)) => {
-                        // Sender dropped. Return a default response
                         let resp = http::Response::builder()
                             .status(http::StatusCode::OK)
                             .body("cancelled")
                             .unwrap();
-
                         Poll::Ready(Ok(resp))
                     }
                     Poll::Pending => Poll::Pending,
@@ -1097,15 +1002,24 @@ mod tests {
         let fut = biaser.call(());
         assert_eq!(biaser.get_pending(), 1);
 
-        // Drop the future without completing it. The oneshot receiver
-        // never receives a value, so the inner future is still pending.
-        // PinnedDrop must decrement the pending count.
+        // Advance time so the cancelled request has measurable elapsed time.
+        time::sleep(Duration::from_secs(3)).await;
+
+        let rtt_before = biaser.get_rtt();
+
+        // Drop the future without completing it. The handle's drop should
+        // records the elapsed wait as an RTT measurement.
         drop(fut);
 
         assert_eq!(
             biaser.get_pending(),
             0,
-            "PinnedDrop should decrement pending on cancellation"
+            "dropping the future should release the handle and decrease pending"
+        );
+        let rtt_after = biaser.get_rtt();
+        assert!(
+            rtt_after > rtt_before,
+            "cancellation should record the elapsed wait: {rtt_after} > {rtt_before}"
         );
 
         // tx is unused. Dropping it here is fine, it just closes the channel.
@@ -1155,7 +1069,7 @@ mod tests {
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_429_with_retry_after_uses_adaptive_penalty() {
+    async fn test_429_with_retry_after_records_hint() {
         let inner = RetryAfterService {
             status: http::StatusCode::TOO_MANY_REQUESTS,
             retry_after_secs: 30,
@@ -1167,44 +1081,32 @@ mod tests {
         // Make a request that returns 429 with Retry-After: 30
         let _ = biaser.call(()).await;
 
-        // Amplified: penalty_secs * FACTOR * exp(RA/decay) = 5.0 * 0.5 * e^3
-        let expected = 5.0_f64 * RETRY_AFTER_PENALTY_FACTOR * (30.0_f64 / 10.0_f64).exp();
-        let penalty = biaser.get_penalty();
-
+        // The hint should be recorded as the effective RTT (30s)
+        let rtt = biaser.get_rtt();
         assert!(
-            (penalty - expected).abs() < 1.0,
-            "penalty should be ~{expected:.1} (amplified), got: {}",
-            penalty
+            (rtt - 30.0).abs() < 0.5,
+            "429 with Retry-After: 30 should record ~30s effective RTT, got: {rtt}"
         );
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_429_with_retry_after_clamped_to_max() {
+    async fn test_429_with_retry_after_capped_at_max() {
         let inner = RetryAfterService {
             status: http::StatusCode::TOO_MANY_REQUESTS,
             retry_after_secs: 600,
         };
-        let config = LoadBiaserConfig {
-            max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
-            ..test_config()
-        };
-        let mut biaser = LoadBiaser::new(inner, config);
+        let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
 
         // Make a request that returns 429 with Retry-After: 600, clamped to 300s
         let _ = biaser.call(()).await;
 
-        // Amplified: penalty_secs * FACTOR * exp(clamped_RA/decay) = 5.0 * 0.5 * e^30
-        // exceeds the 1e12 penalty clamp, so the actual penalty is clamped.
-        let unclamped = 5.0_f64 * RETRY_AFTER_PENALTY_FACTOR * (300.0_f64 / 10.0_f64).exp();
-        let expected = unclamped.min(1e12);
-        let penalty = biaser.get_penalty();
-
+        // Retry-After: 600 is capped to max_duration (300s).
+        let rtt = biaser.get_rtt();
         assert!(
-            (penalty - expected).abs() / expected < 0.01,
-            "penalty should be ~{expected:.0} (amplified, clamped RA=300), got: {}",
-            penalty
+            (rtt - 300.0).abs() < 0.5,
+            "429 with Retry-After: 600 should cap to 300s, got: {rtt}"
         );
     }
 
@@ -1416,24 +1318,24 @@ mod tests {
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_503_injects_full_penalty() {
+    async fn test_503_records_base_penalized_effective_rtt() {
         let inner = MockService::new(http::StatusCode::SERVICE_UNAVAILABLE);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
 
         let _ = biaser.call(()).await;
-        let penalty = biaser.get_penalty();
+        let rtt = biaser.get_rtt();
 
         // test_config has penalty_secs = 5.0
         assert!(
-            (penalty - 5.0).abs() < 0.1,
-            "503 should inject full penalty (~5.0s), got: {penalty}"
+            (rtt - 5.0).abs() < 0.1,
+            "503 should record the base penalized effective RTT (~5.0s), got: {rtt}"
         );
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_503_with_retry_after_uses_adaptive_penalty() {
+    async fn test_503_with_retry_after_records_hint() {
         let inner = RetryAfterService {
             status: http::StatusCode::SERVICE_UNAVAILABLE,
             retry_after_secs: 30,
@@ -1445,48 +1347,17 @@ mod tests {
         // Make a request that returns 503 with Retry-After: 30
         let _ = biaser.call(()).await;
 
-        // Same amplification formula as 429:
-        // penalty_secs * FACTOR * exp(RA/decay) = 5.0 * 0.5 * e^3
-        let expected = 5.0_f64 * RETRY_AFTER_PENALTY_FACTOR * (30.0_f64 / 10.0_f64).exp();
-        let penalty = biaser.get_penalty();
-        assert!(
-            (penalty - expected).abs() < 1.0,
-            "503+RA penalty should be ~{expected:.1} (amplified), got: {penalty}"
-        );
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_503_with_retry_after_clamped_to_max() {
-        let inner = RetryAfterService {
-            status: http::StatusCode::SERVICE_UNAVAILABLE,
-            retry_after_secs: 600,
-        };
-        let config = LoadBiaserConfig {
-            max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
-            ..test_config()
-        };
-        let mut biaser = LoadBiaser::new(inner, config);
-
-        time::sleep(Duration::from_millis(1)).await;
-
-        // Make a request that returns 503 with Retry-After: 600, clamped to 300s
-        let _ = biaser.call(()).await;
+        // 503 honors the hint the same way 429 does.
+        let rtt = biaser.get_rtt();
 
-        // Amplified with clamped RA:
-        // penalty_secs * FACTOR * exp(clamped_RA/decay) = 5.0 * 0.5 * e^30
-        // exceeds the 1e12 penalty clamp, so the actual penalty is clamped.
-        let unclamped = 5.0_f64 * RETRY_AFTER_PENALTY_FACTOR * (300.0_f64 / 10.0_f64).exp();
-        let expected = unclamped.min(1e12);
-        let penalty = biaser.get_penalty();
         assert!(
-            (penalty - expected).abs() / expected < 0.01,
-            "503+RA penalty should be ~{expected:.0} (amplified, clamped RA=300), got: {penalty}"
+            (rtt - 30.0).abs() < 0.5,
+            "503 with Retry-After: 30 should record ~30s, got: {rtt}"
         );
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_429_and_503_retry_after_produce_identical_penalty() {
-        // 429 path: RetryAfterService returns TOO_MANY_REQUESTS with Retry-After: 30
+    async fn test_429_and_503_retry_after_produce_identical_load() {
         let inner_429 = RetryAfterService {
             status: http::StatusCode::TOO_MANY_REQUESTS,
             retry_after_secs: 30,
@@ -1517,19 +1388,19 @@ mod tests {
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_500_injects_penalty() {
+    async fn test_500_records_base_penalized_effective_rtt() {
         let inner = MockService::new(http::StatusCode::INTERNAL_SERVER_ERROR);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
 
         let _ = biaser.call(()).await;
-        let penalty = biaser.get_penalty();
+        let rtt = biaser.get_rtt();
 
         // test_config has penalty_secs = 5.0
         assert!(
-            (penalty - 5.0).abs() < 0.1,
-            "500 should inject full penalty (~5.0s), got: {penalty}"
+            (rtt - 5.0).abs() < 0.1,
+            "500 should record the base penalized effective RTT (~5.0s), got: {rtt}"
         );
     }
 
@@ -1575,21 +1446,21 @@ mod tests {
         }
     }
 
-    async fn assert_grpc_injects_penalty(grpc_status: u16, label: &str) {
+    async fn assert_grpc_records_penalized_effective_rtt(grpc_status: u16, label: &str) {
         let inner = GrpcErrorService { grpc_status };
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
         let _ = biaser.call(()).await;
-        let penalty = biaser.get_penalty();
+        let rtt = biaser.get_rtt();
 
         assert!(
-            (penalty - 5.0).abs() < 0.1,
-            "gRPC {label} (status {grpc_status}) should inject full penalty (~5s), got: {penalty}"
+            (rtt - 5.0).abs() < 0.1,
+            "gRPC {label} (status {grpc_status}) should record the base penalized effective RTT (~5s), got: {rtt}"
         );
     }
 
-    async fn assert_grpc_no_penalty(grpc_status: u16, label: &str) {
+    async fn assert_grpc_records_low_rtt(grpc_status: u16, label: &str) {
         let inner = GrpcErrorService { grpc_status };
         let mut biaser = LoadBiaser::new(inner, test_config());
 
@@ -1597,105 +1468,38 @@ mod tests {
         let _ = biaser.call(()).await;
 
         assert!(
-            biaser.get_penalty().is_infinite(),
-            "gRPC {label} (status {grpc_status}) should not inject penalty"
-        );
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_grpc_server_errors_inject_penalty() {
-        assert_grpc_injects_penalty(8, "RESOURCE_EXHAUSTED").await;
-        assert_grpc_injects_penalty(14, "UNAVAILABLE").await;
-        assert_grpc_injects_penalty(13, "INTERNAL").await;
-        assert_grpc_injects_penalty(2, "UNKNOWN").await;
-        assert_grpc_injects_penalty(4, "DEADLINE_EXCEEDED").await;
-        assert_grpc_injects_penalty(15, "DATA_LOSS").await;
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_grpc_client_errors_no_penalty() {
-        assert_grpc_no_penalty(1, "CANCELLED").await;
-        assert_grpc_no_penalty(3, "INVALID_ARGUMENT").await;
-        assert_grpc_no_penalty(5, "NOT_FOUND").await;
-        assert_grpc_no_penalty(6, "ALREADY_EXISTS").await;
-        assert_grpc_no_penalty(7, "PERMISSION_DENIED").await;
-        assert_grpc_no_penalty(9, "FAILED_PRECONDITION").await;
-        assert_grpc_no_penalty(10, "ABORTED").await;
-        assert_grpc_no_penalty(11, "OUT_OF_RANGE").await;
-        assert_grpc_no_penalty(12, "UNIMPLEMENTED").await;
-        assert_grpc_no_penalty(16, "UNAUTHENTICATED").await;
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_grpc_ok_no_penalty() {
-        assert_grpc_no_penalty(0, "OK").await;
-    }
-
-    async fn assert_disabled_no_penalty<S>(mut biaser: LoadBiaser<S>, label: &str)
-    where
-        S: Service<(), Response = http::Response<&'static str>, Error = Infallible>,
-    {
-        time::sleep(Duration::from_millis(1)).await;
-        let _ = biaser.call(()).await;
-        assert!(
-            biaser.get_penalty().is_infinite(),
-            "penalty should not be injected when disabled for {label}: {}",
-            biaser.get_penalty()
+            biaser.get_rtt() < 1.0,
+            "gRPC {label} (status {grpc_status}) should record only the measured RTT: {}",
+            biaser.get_rtt()
         );
     }
 
-    fn disabled_config() -> LoadBiaserConfig {
-        LoadBiaserConfig {
-            enabled: false,
-            ..test_config()
-        }
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_429_disabled_no_penalty() {
-        let inner = MockService::new(http::StatusCode::TOO_MANY_REQUESTS);
-        assert_disabled_no_penalty(LoadBiaser::new(inner, disabled_config()), "429").await;
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_503_disabled_no_penalty() {
-        let inner = MockService::new(http::StatusCode::SERVICE_UNAVAILABLE);
-        assert_disabled_no_penalty(LoadBiaser::new(inner, disabled_config()), "503").await;
-    }
-
-    #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_500_disabled_no_penalty() {
-        let inner = MockService::new(http::StatusCode::INTERNAL_SERVER_ERROR);
-        assert_disabled_no_penalty(LoadBiaser::new(inner, disabled_config()), "500").await;
-    }
-
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_grpc_resource_exhausted_disabled_no_penalty() {
-        let inner = GrpcErrorService { grpc_status: 8 };
-        assert_disabled_no_penalty(
-            LoadBiaser::new(inner, disabled_config()),
-            "gRPC RESOURCE_EXHAUSTED (status 8)",
-        )
-        .await;
+    async fn test_grpc_server_errors_record_effective_rtt() {
+        assert_grpc_records_penalized_effective_rtt(8, "RESOURCE_EXHAUSTED").await;
+        assert_grpc_records_penalized_effective_rtt(14, "UNAVAILABLE").await;
+        assert_grpc_records_penalized_effective_rtt(13, "INTERNAL").await;
+        assert_grpc_records_penalized_effective_rtt(2, "UNKNOWN").await;
+        assert_grpc_records_penalized_effective_rtt(4, "DEADLINE_EXCEEDED").await;
+        assert_grpc_records_penalized_effective_rtt(15, "DATA_LOSS").await;
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_grpc_unavailable_disabled_no_penalty() {
-        let inner = GrpcErrorService { grpc_status: 14 };
-        assert_disabled_no_penalty(
-            LoadBiaser::new(inner, disabled_config()),
-            "gRPC UNAVAILABLE (status 14)",
-        )
-        .await;
+    async fn test_grpc_client_errors_record_low_rtt() {
+        assert_grpc_records_low_rtt(1, "CANCELLED").await;
+        assert_grpc_records_low_rtt(3, "INVALID_ARGUMENT").await;
+        assert_grpc_records_low_rtt(5, "NOT_FOUND").await;
+        assert_grpc_records_low_rtt(6, "ALREADY_EXISTS").await;
+        assert_grpc_records_low_rtt(7, "PERMISSION_DENIED").await;
+        assert_grpc_records_low_rtt(9, "FAILED_PRECONDITION").await;
+        assert_grpc_records_low_rtt(10, "ABORTED").await;
+        assert_grpc_records_low_rtt(11, "OUT_OF_RANGE").await;
+        assert_grpc_records_low_rtt(12, "UNIMPLEMENTED").await;
+        assert_grpc_records_low_rtt(16, "UNAUTHENTICATED").await;
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
-    async fn test_grpc_internal_disabled_no_penalty() {
-        let inner = GrpcErrorService { grpc_status: 13 };
-        assert_disabled_no_penalty(
-            LoadBiaser::new(inner, disabled_config()),
-            "gRPC INTERNAL (status 13)",
-        )
-        .await;
+    async fn test_grpc_ok_records_low_rtt() {
+        assert_grpc_records_low_rtt(0, "OK").await;
     }
 }

From 2589a469ebdf3823027b660d573e0169d366e2a7 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Fri, 5 Jun 2026 13:33:33 +0200
Subject: [PATCH 25/45] refactor(load-biaser): measure RTT to first response
 data frame by default

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 Cargo.lock                     |  4 ++
 linkerd/load-biaser/Cargo.toml |  4 ++
 linkerd/load-biaser/src/lib.rs | 84 ++++++++++++++++++++++++----------
 3 files changed, 67 insertions(+), 25 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 743cfe1206..9b2247ea76 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1991,8 +1991,12 @@ dependencies = [
 name = "linkerd-load-biaser"
 version = "0.1.0"
 dependencies = [
+ "bytes",
  "futures",
  "http",
+ "http-body",
+ "http-body-util",
+ "hyper-balance",
  "linkerd-ewma",
  "linkerd-http-classify",
  "linkerd-stack",
diff --git a/linkerd/load-biaser/Cargo.toml b/linkerd/load-biaser/Cargo.toml
index 68737d512f..b281c4983c 100644
--- a/linkerd/load-biaser/Cargo.toml
+++ b/linkerd/load-biaser/Cargo.toml
@@ -21,10 +21,14 @@ tower = { workspace = true, features = ["load"] }
 tower-service = { workspace = true }
 tracing = { workspace = true }
 
+hyper-balance = { path = "../../hyper-balance" }
 linkerd-ewma = { path = "../ewma" }
 linkerd-http-classify = { path = "../http/classify" }
 linkerd-stack = { path = "../stack" }
 
 [dev-dependencies]
+bytes = { workspace = true }
+http-body = { workspace = true }
+http-body-util = { workspace = true }
 tokio = { version = "1", features = ["macros", "rt", "sync", "time"] }
 tokio-test = "0.4"
diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index c15303be3e..de17ec5d00 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -25,6 +25,7 @@
 #![forbid(unsafe_code)]
 
 use futures::ready;
+use hyper_balance::PendingUntilFirstData;
 use linkerd_ewma::{Ewma, MIN_DECAY};
 use linkerd_stack::NewService;
 use parking_lot::RwLock;
@@ -38,7 +39,7 @@ use std::{
     time::Duration,
 };
 use tokio::time::Instant;
-use tower::load::{CompleteOnResponse, Load, TrackCompletion};
+use tower::load::{Load, TrackCompletion};
 use tower_service::Service;
 
 /// Default maximum duration for Retry-After hints.
@@ -302,11 +303,11 @@ impl SharedState {
 /// penalty has been injected.
 ///
 /// The completion type `C` controls when a successful request's RTT is
-/// measured, mirroring PeakEwma. The default drops the handle when the response
-/// future resolves. `hyper_balance::PendingUntilFirstData` defers it to the
-/// first response body frame, matching the proxy's HTTP balancer.
+/// measured, mirroring PeakEwma. The default is
+/// `hyper_balance::PendingUntilFirstData`, which records the RTT at the first
+/// response data frame.
 #[derive(Debug)]
-pub struct LoadBiaser<S, C = CompleteOnResponse> {
+pub struct LoadBiaser<S, C = PendingUntilFirstData> {
     inner: S,
     /// Per-endpoint RTT state and failure config.
     shared: Arc<SharedState>,
@@ -315,7 +316,7 @@ pub struct LoadBiaser<S, C = CompleteOnResponse> {
 
 /// A `NewService` implementation that creates `LoadBiaser` wrappers.
 #[derive(Debug)]
-pub struct NewLoadBiaser<N, Req, C = CompleteOnResponse> {
+pub struct NewLoadBiaser<N, Req, C = PendingUntilFirstData> {
     inner: N,
     completion: C,
     config: LoadBiaserConfig,
@@ -419,11 +420,11 @@ impl<S, C> LoadBiaser<S, C> {
     }
 }
 
-impl<S> LoadBiaser<S, CompleteOnResponse> {
-    /// Creates a new `LoadBiaser` recording RTT when the response resolves.
+impl<S> LoadBiaser<S, PendingUntilFirstData> {
+    /// Creates a new `LoadBiaser` recording RTT at the first response data frame.
     #[must_use]
     pub fn new(inner: S, config: LoadBiaserConfig) -> Self {
-        Self::with_completion(inner, CompleteOnResponse::default(), config)
+        Self::with_completion(inner, PendingUntilFirstData::default(), config)
     }
 }
 
@@ -506,10 +507,10 @@ impl<N, Req, C> NewLoadBiaser<N, Req, C> {
     }
 }
 
-impl<N, Req> NewLoadBiaser<N, Req, CompleteOnResponse> {
+impl<N, Req> NewLoadBiaser<N, Req, PendingUntilFirstData> {
     /// Creates a new `NewLoadBiaser` with the given configuration.
     pub fn new(config: LoadBiaserConfig, inner: N) -> Self {
-        Self::with_completion(config, CompleteOnResponse::default(), inner)
+        Self::with_completion(config, PendingUntilFirstData::default(), inner)
     }
 }
 
@@ -585,9 +586,33 @@ where
 #[cfg(test)]
 mod tests {
     use super::*;
+    use bytes::Bytes;
+    use http_body_util::{BodyExt, Full};
     use std::convert::Infallible;
     use tokio::time;
 
+    /// Response body used by mock services.
+    ///
+    /// Reports `is_end_stream() == false` until polled, causing
+    /// `PendingUntilFirstData` to record the RTT on success once the
+    /// body reaches its first frame or is dropped.
+    type TestBody = Full<Bytes>;
+
+    /// Test body with a small payload.
+    fn body() -> TestBody {
+        Full::new(Bytes::from_static(b"test"))
+    }
+
+    /// Drives a response body to its first frame.
+    async fn drive_to_first_frame<B>(resp: http::Response<B>)
+    where
+        B: http_body::Body + Unpin,
+        B::Error: std::fmt::Debug,
+    {
+        let mut b = resp.into_body();
+        let _ = b.frame().await;
+    }
+
     impl<S, C> LoadBiaser<S, C> {
         pub fn get_rtt(&self) -> f64 {
             self.shared.rtt.read().get()
@@ -623,7 +648,7 @@ mod tests {
     }
 
     impl Service<()> for MockService {
-        type Response = http::Response<&'static str>;
+        type Response = http::Response<TestBody>;
         type Error = Infallible;
         type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
 
@@ -634,7 +659,7 @@ mod tests {
         fn call(&mut self, _: ()) -> Self::Future {
             let resp = http::Response::builder()
                 .status(self.status)
-                .body("test")
+                .body(body())
                 .unwrap();
             futures::future::ready(Ok(resp))
         }
@@ -935,12 +960,21 @@ mod tests {
             "pending should be 1 during request"
         );
 
-        // Complete the request: the handle drops, lowering the count.
-        let _ = fut.await;
+        // Resolve the response future. The default tracker causes the handle to
+        // consider the request in-flight until the body yields the first frame.
+        let resp = fut.await.unwrap();
+        assert_eq!(
+            biaser.get_pending(),
+            1,
+            "pending should stay 1 while until the body yields it first frame"
+        );
+
+        // When the body produces its first frame the handle drops
+        drive_to_first_frame(resp).await;
         assert_eq!(
             biaser.get_pending(),
             0,
-            "pending should be 0 after completion"
+            "pending should be 0 after the body yields its first frame"
         );
     }
 
@@ -951,11 +985,11 @@ mod tests {
         // Build a service whose response future blocks on a oneshot receiver.
         // This lets us drop the future mid-flight.
         struct DelayedService {
-            rx: Option<oneshot::Receiver<http::Response<&'static str>>>,
+            rx: Option<oneshot::Receiver<http::Response<TestBody>>>,
         }
 
         impl Service<()> for DelayedService {
-            type Response = http::Response<&'static str>;
+            type Response = http::Response<TestBody>;
             type Error = Infallible;
             type Future = DelayedFuture;
 
@@ -971,11 +1005,11 @@ mod tests {
         }
 
         struct DelayedFuture {
-            rx: oneshot::Receiver<http::Response<&'static str>>,
+            rx: oneshot::Receiver<http::Response<TestBody>>,
         }
 
         impl std::future::Future for DelayedFuture {
-            type Output = Result<http::Response<&'static str>, Infallible>;
+            type Output = Result<http::Response<TestBody>, Infallible>;
 
             fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
                 match Pin::new(&mut self.rx).poll(cx) {
@@ -983,7 +1017,7 @@ mod tests {
                     Poll::Ready(Err(_)) => {
                         let resp = http::Response::builder()
                             .status(http::StatusCode::OK)
-                            .body("cancelled")
+                            .body(body())
                             .unwrap();
                         Poll::Ready(Ok(resp))
                     }
@@ -1049,7 +1083,7 @@ mod tests {
     }
 
     impl Service<()> for RetryAfterService {
-        type Response = http::Response<&'static str>;
+        type Response = http::Response<TestBody>;
         type Error = Infallible;
         type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
 
@@ -1061,7 +1095,7 @@ mod tests {
             let resp = http::Response::builder()
                 .status(self.status)
                 .header(http::header::RETRY_AFTER, self.retry_after_secs.to_string())
-                .body("retry-after response")
+                .body(body())
                 .unwrap();
 
             futures::future::ready(Ok(resp))
@@ -1426,7 +1460,7 @@ mod tests {
     }
 
     impl Service<()> for GrpcErrorService {
-        type Response = http::Response<&'static str>;
+        type Response = http::Response<TestBody>;
         type Error = Infallible;
         type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
 
@@ -1439,7 +1473,7 @@ mod tests {
                 .status(http::StatusCode::OK)
                 .header(http::header::CONTENT_TYPE, "application/grpc")
                 .header("grpc-status", self.grpc_status.to_string())
-                .body("grpc error")
+                .body(body())
                 .unwrap();
 
             futures::future::ready(Ok(resp))

From ee9e7228a621205f8c7bca9e7190a8faef2f29b2 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Fri, 5 Jun 2026 13:33:33 +0200
Subject: [PATCH 26/45] refactor(load-biaser): store the failure penalty as
 integer milliseconds

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index de17ec5d00..c3a0661ac8 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -45,6 +45,9 @@ use tower_service::Service;
 /// Default maximum duration for Retry-After hints.
 pub const DEFAULT_RETRY_AFTER_MAX_DURATION: Duration = Duration::from_secs(300);
 
+/// Default base failure penalty, in whole milliseconds.
+const DEFAULT_PENALTY_MS: u32 = 5_000;
+
 /// Classification of response failures for load biasing.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum FailureHint {
@@ -230,7 +233,7 @@ impl ResponseFailureHint for tokio::io::DuplexStream {}
 impl ResponseFailureHint for tokio_test::io::Mock {}
 
 /// Configuration for LoadBiaser behavior.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, PartialEq, Eq, Hash)]
 pub struct LoadBiaserConfig {
     /// Default RTT to use when no measurements are available
     pub default_rtt: Duration,
@@ -240,8 +243,8 @@ pub struct LoadBiaserConfig {
     pub rtt_decay: Duration,
 
     /// The base penalty to inject on failure responses (429, 503, 5xx)
-    /// in seconds,  when the server sends no backoff hint.
-    pub penalty_secs: f64,
+    /// in milliseconds, when the server sends no backoff hint.
+    pub penalty_ms: u32,
 
     /// Maximum Retry-After duration to honor. Clamped to this value.
     pub max_duration: Duration,
@@ -253,7 +256,7 @@ impl Default for LoadBiaserConfig {
             default_rtt: Duration::from_secs(1),
             rtt_decay: Duration::from_secs(10),
             // 5 second penalty on failure responses (429, 503, 5xx)
-            penalty_secs: 5.0,
+            penalty_ms: DEFAULT_PENALTY_MS,
             max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
         }
     }
@@ -271,7 +274,7 @@ struct SharedState {
     rtt: RwLock<Ewma>,
     /// Penalty value to inject on failure responses (in seconds) when no
     /// hint is present.
-    penalty_secs: f64,
+    penalty_ms: u32,
     /// Maximum Retry-After duration to honor (clamped)
     max_duration: Duration,
 }
@@ -288,7 +291,7 @@ impl SharedState {
         };
 
         // A hint, if present, is honored verbatim (capped).
-        from_hint.unwrap_or(self.penalty_secs)
+        from_hint.unwrap_or_else(|| Duration::from_millis(u64::from(self.penalty_ms)).as_secs_f64())
     }
 }
 
@@ -396,7 +399,7 @@ impl<S, C> LoadBiaser<S, C> {
                 now,
                 config.default_rtt.as_secs_f64(),
             )),
-            penalty_secs: config.penalty_secs,
+            penalty_ms: config.penalty_ms,
             max_duration: config.max_duration,
         });
         Self {
@@ -428,14 +431,7 @@ impl<S> LoadBiaser<S, PendingUntilFirstData> {
     }
 }
 
-fn sanitize(mut config: LoadBiaserConfig) -> LoadBiaserConfig {
-    if config.penalty_secs.is_nan() || config.penalty_secs < 0.0 {
-        tracing::warn!(
-            penalty_secs = config.penalty_secs,
-            "penalty_secs is NaN or negative, clamping to 0.0"
-        );
-        config.penalty_secs = 0.0;
-    }
+fn sanitize(config: LoadBiaserConfig) -> LoadBiaserConfig {
     if config.rtt_decay < MIN_DECAY {
         tracing::warn!(
             rtt_decay = ?config.rtt_decay,
@@ -669,7 +665,7 @@ mod tests {
         LoadBiaserConfig {
             default_rtt: Duration::from_millis(100), // 0.1s default
             rtt_decay: Duration::from_secs(10),
-            penalty_secs: 5.0,
+            penalty_ms: 5000,
             max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
         }
     }
@@ -1361,7 +1357,7 @@ mod tests {
         let _ = biaser.call(()).await;
         let rtt = biaser.get_rtt();
 
-        // test_config has penalty_secs = 5.0
+        // test_config has penalty_ms = 5000
         assert!(
             (rtt - 5.0).abs() < 0.1,
             "503 should record the base penalized effective RTT (~5.0s), got: {rtt}"
@@ -1431,7 +1427,7 @@ mod tests {
         let _ = biaser.call(()).await;
         let rtt = biaser.get_rtt();
 
-        // test_config has penalty_secs = 5.0
+        // test_config has penalty_ms = 5000
         assert!(
             (rtt - 5.0).abs() < 0.1,
             "500 should record the base penalized effective RTT (~5.0s), got: {rtt}"

From 48e1cc3cf7606f3a870ea71aa2699ac3cfb444c2 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Fri, 5 Jun 2026 13:33:33 +0200
Subject: [PATCH 27/45] refactor(load-biaser): name the default RTT and decay
 durations

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index c3a0661ac8..65e53c5004 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -48,6 +48,12 @@ pub const DEFAULT_RETRY_AFTER_MAX_DURATION: Duration = Duration::from_secs(300);
 /// Default base failure penalty, in whole milliseconds.
 const DEFAULT_PENALTY_MS: u32 = 5_000;
 
+/// Default RTT seed used before any measurement is recorded.
+const DEFAULT_RTT: Duration = Duration::from_secs(1);
+
+/// Default decay window for the RTT EWMA.
+const DEFAULT_RTT_DECAY: Duration = Duration::from_secs(10);
+
 /// Classification of response failures for load biasing.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum FailureHint {
@@ -253,9 +259,8 @@ pub struct LoadBiaserConfig {
 impl Default for LoadBiaserConfig {
     fn default() -> Self {
         Self {
-            default_rtt: Duration::from_secs(1),
-            rtt_decay: Duration::from_secs(10),
-            // 5 second penalty on failure responses (429, 503, 5xx)
+            default_rtt: DEFAULT_RTT,
+            rtt_decay: DEFAULT_RTT_DECAY,
             penalty_ms: DEFAULT_PENALTY_MS,
             max_duration: DEFAULT_RETRY_AFTER_MAX_DURATION,
         }

From 163df42b1a6667aeb393cd81edb58fe33d1cae55 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Fri, 5 Jun 2026 13:33:33 +0200
Subject: [PATCH 28/45] docs(load-biaser): name the gRPC status codes in the
 failure classifier

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 65e53c5004..6fd35126cc 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -168,6 +168,7 @@ impl<B> ResponseFailureHint for http::Response<B> {
                     0 => None,                                   // OK
                     8 => Some(FailureHint::RateLimited),         // RESOURCE_EXHAUSTED
                     14 => Some(FailureHint::ServiceUnavailable), // UNAVAILABLE
+                    // UNKNOWN, DEADLINE_EXCEEDED, INTERNAL, DATA_LOSS
                     2 | 4 | 13 | 15 => Some(FailureHint::InternalError),
                     _ => None, // Client errors (CANCELLED, INVALID_ARGUMENT, etc.)
                 })

From 761a019773c8752a48472e0cb5bb7f315fcf7fda Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Fri, 5 Jun 2026 20:00:34 +0200
Subject: [PATCH 29/45] docs(ewma): reword stale crate comment

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index 66a57cab6b..ff9f49e6ab 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -1,9 +1,11 @@
 //! Exponentially weighted moving average (EWMA) with time-based decay.
 //!
-//! A standalone EWMA implementation that supports non-mutating time-projected
-//! reads via [`Ewma::get_at`] and dual-metric tracking (RTT + penalty) under a
-//! single lock. Tower's internal `RttEstimate` is private, mutates on read, and
-//! cannot support the penalty dimension that failure-aware load balancing needs.
+//! A standalone EWMA that supports non-mutating, time-projected reads via
+//! [`Ewma::get_at`] and records externally supplied peak values via
+//! [`Ewma::add_peak`]. Tower's internal `RttEstimate` is private, mutates on
+//! read, and samples only elapsed request time, so it cannot provide the
+//! read-only load projection or the injected failure penalties that
+//! failure-aware load balancing needs.
 
 #![deny(rust_2018_idioms, clippy::disallowed_methods, clippy::disallowed_types)]
 #![forbid(unsafe_code)]

From 73ce450efc6b9fb49b55108ad2f7df36531b731e Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 30/45] fix(http-classify): parse grpc pushback as i32

The gRPC A6 spec defines grpc-retry-pushback-ms as an i32.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/http/classify/src/retry_after.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/linkerd/http/classify/src/retry_after.rs b/linkerd/http/classify/src/retry_after.rs
index d61f1082fa..d2c3eacbe6 100644
--- a/linkerd/http/classify/src/retry_after.rs
+++ b/linkerd/http/classify/src/retry_after.rs
@@ -94,8 +94,8 @@ pub fn parse_grpc_retry_pushback(headers: &HeaderMap, max: Duration) -> Option<D
     let value = headers.get(GRPC_RETRY_PUSHBACK_MS)?;
     let s = value.to_str().ok()?;
 
-    // Parse as i64 to handle potential negative values
-    let ms: i64 = match s.trim().parse() {
+    // Parse as i32 per the gRPC A6 spec, which defines a signed 32-bit value.
+    let ms: i32 = match s.trim().parse() {
         Ok(v) => v,
         Err(_) => {
             tracing::debug!(%s, "Failed to parse grpc-retry-pushback-ms");

From 9e19b9a09117047af9ecbbdf5487504ffea4ec08 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 31/45] refactor(load-biaser): consolidate mocks into
 MockService

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 217 ++++++++++++---------------------
 1 file changed, 75 insertions(+), 142 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 6fd35126cc..fb18cc6f37 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -637,33 +637,88 @@ mod tests {
         }
     }
 
-    // Mock service for testing returning a specific HTTP status.
+    // Mock service for testing returning a specific HTTP status, optionally
+    // after a delay and with extra response headers.
     #[derive(Clone)]
     struct MockService {
         status: http::StatusCode,
+        delay: Duration,
+        headers: Vec<(http::HeaderName, http::HeaderValue)>,
     }
 
     impl MockService {
         fn new(status: http::StatusCode) -> Self {
-            Self { status }
+            Self {
+                status,
+                delay: Duration::ZERO,
+                headers: Vec::new(),
+            }
+        }
+
+        fn with_delay(status: http::StatusCode, delay: Duration) -> Self {
+            Self {
+                status,
+                delay,
+                headers: Vec::new(),
+            }
+        }
+
+        // HTTP error carrying a Retry-After header, in seconds.
+        fn retry_after(status: http::StatusCode, retry_after_secs: u64) -> Self {
+            Self {
+                status,
+                delay: Duration::ZERO,
+                headers: vec![(
+                    http::header::RETRY_AFTER,
+                    http::HeaderValue::from(retry_after_secs),
+                )],
+            }
+        }
+
+        // HTTP 200 with a grpc-status header, simulating a gRPC trailers-only
+        // error response.
+        fn grpc_status(grpc_status: u16) -> Self {
+            Self {
+                status: http::StatusCode::OK,
+                delay: Duration::ZERO,
+                headers: vec![
+                    (
+                        http::header::CONTENT_TYPE,
+                        http::HeaderValue::from_static("application/grpc"),
+                    ),
+                    (
+                        http::HeaderName::from_static("grpc-status"),
+                        http::HeaderValue::from(grpc_status),
+                    ),
+                ],
+            }
         }
     }
 
     impl Service<()> for MockService {
         type Response = http::Response<TestBody>;
         type Error = Infallible;
-        type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
+        type Future = Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send>>;
 
         fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
             Poll::Ready(Ok(()))
         }
 
         fn call(&mut self, _: ()) -> Self::Future {
-            let resp = http::Response::builder()
-                .status(self.status)
-                .body(body())
-                .unwrap();
-            futures::future::ready(Ok(resp))
+            let status = self.status;
+            let delay = self.delay;
+            let headers = self.headers.clone();
+            Box::pin(async move {
+                if !delay.is_zero() {
+                    time::sleep(delay).await;
+                }
+                let mut builder = http::Response::builder().status(status);
+                for (name, value) in headers {
+                    builder = builder.header(name, value);
+                }
+                let resp = builder.body(body()).unwrap();
+                Ok(resp)
+            })
         }
     }
 
@@ -787,10 +842,7 @@ mod tests {
         let mut prev = f64::NEG_INFINITY;
 
         for secs in hints {
-            let inner = RetryAfterService {
-                status: http::StatusCode::TOO_MANY_REQUESTS,
-                retry_after_secs: secs,
-            };
+            let inner = MockService::retry_after(http::StatusCode::TOO_MANY_REQUESTS, secs);
             let mut biaser = LoadBiaser::new(inner, test_config());
 
             time::sleep(Duration::from_millis(1)).await;
@@ -982,54 +1034,9 @@ mod tests {
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_cancellation_records_measurement() {
-        use tokio::sync::oneshot;
-
-        // Build a service whose response future blocks on a oneshot receiver.
-        // This lets us drop the future mid-flight.
-        struct DelayedService {
-            rx: Option<oneshot::Receiver<http::Response<TestBody>>>,
-        }
-
-        impl Service<()> for DelayedService {
-            type Response = http::Response<TestBody>;
-            type Error = Infallible;
-            type Future = DelayedFuture;
-
-            fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
-                Poll::Ready(Ok(()))
-            }
-
-            fn call(&mut self, _: ()) -> Self::Future {
-                DelayedFuture {
-                    rx: self.rx.take().expect("called more than once"),
-                }
-            }
-        }
-
-        struct DelayedFuture {
-            rx: oneshot::Receiver<http::Response<TestBody>>,
-        }
-
-        impl std::future::Future for DelayedFuture {
-            type Output = Result<http::Response<TestBody>, Infallible>;
-
-            fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
-                match Pin::new(&mut self.rx).poll(cx) {
-                    Poll::Ready(Ok(resp)) => Poll::Ready(Ok(resp)),
-                    Poll::Ready(Err(_)) => {
-                        let resp = http::Response::builder()
-                            .status(http::StatusCode::OK)
-                            .body(body())
-                            .unwrap();
-                        Poll::Ready(Ok(resp))
-                    }
-                    Poll::Pending => Poll::Pending,
-                }
-            }
-        }
-
-        let (tx, rx) = oneshot::channel();
-        let inner = DelayedService { rx: Some(rx) };
+        // A request that stays in flight. The long delay keeps the future
+        // pending. The test drops it to exercise the cancellation path.
+        let inner = MockService::with_delay(http::StatusCode::OK, Duration::from_secs(3600));
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         assert_eq!(biaser.get_pending(), 0);
@@ -1057,9 +1064,6 @@ mod tests {
             rtt_after > rtt_before,
             "cancellation should record the elapsed wait: {rtt_after} > {rtt_before}"
         );
-
-        // tx is unused. Dropping it here is fine, it just closes the channel.
-        drop(tx);
     }
 
     #[test]
@@ -1076,40 +1080,9 @@ mod tests {
         );
     }
 
-    // Mock service returning an HTTP error with a Retry-After header.
-    // Parameterized by status code to avoid duplicating 429 and 503 variants.
-    #[derive(Clone)]
-    struct RetryAfterService {
-        status: http::StatusCode,
-        retry_after_secs: u64,
-    }
-
-    impl Service<()> for RetryAfterService {
-        type Response = http::Response<TestBody>;
-        type Error = Infallible;
-        type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
-
-        fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
-            Poll::Ready(Ok(()))
-        }
-
-        fn call(&mut self, _: ()) -> Self::Future {
-            let resp = http::Response::builder()
-                .status(self.status)
-                .header(http::header::RETRY_AFTER, self.retry_after_secs.to_string())
-                .body(body())
-                .unwrap();
-
-            futures::future::ready(Ok(resp))
-        }
-    }
-
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_429_with_retry_after_records_hint() {
-        let inner = RetryAfterService {
-            status: http::StatusCode::TOO_MANY_REQUESTS,
-            retry_after_secs: 30,
-        };
+        let inner = MockService::retry_after(http::StatusCode::TOO_MANY_REQUESTS, 30);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
@@ -1127,10 +1100,7 @@ mod tests {
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_429_with_retry_after_capped_at_max() {
-        let inner = RetryAfterService {
-            status: http::StatusCode::TOO_MANY_REQUESTS,
-            retry_after_secs: 600,
-        };
+        let inner = MockService::retry_after(http::StatusCode::TOO_MANY_REQUESTS, 600);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
@@ -1372,10 +1342,7 @@ mod tests {
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_503_with_retry_after_records_hint() {
-        let inner = RetryAfterService {
-            status: http::StatusCode::SERVICE_UNAVAILABLE,
-            retry_after_secs: 30,
-        };
+        let inner = MockService::retry_after(http::StatusCode::SERVICE_UNAVAILABLE, 30);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
@@ -1394,17 +1361,11 @@ mod tests {
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_429_and_503_retry_after_produce_identical_load() {
-        let inner_429 = RetryAfterService {
-            status: http::StatusCode::TOO_MANY_REQUESTS,
-            retry_after_secs: 30,
-        };
+        let inner_429 = MockService::retry_after(http::StatusCode::TOO_MANY_REQUESTS, 30);
         let mut biaser_429 = LoadBiaser::new(inner_429, test_config());
 
-        // 503 path: RetryAfterService returns SERVICE_UNAVAILABLE with Retry-After: 30
-        let inner_503 = RetryAfterService {
-            status: http::StatusCode::SERVICE_UNAVAILABLE,
-            retry_after_secs: 30,
-        };
+        // 503 path: SERVICE_UNAVAILABLE with Retry-After: 30
+        let inner_503 = MockService::retry_after(http::StatusCode::SERVICE_UNAVAILABLE, 30);
         let mut biaser_503 = LoadBiaser::new(inner_503, test_config());
 
         // Bootstrap EWMA timestamps
@@ -1454,36 +1415,8 @@ mod tests {
         );
     }
 
-    // Mock service that returns HTTP 200 with a `grpc-status` header,
-    // simulating a gRPC trailers-only error response.
-    #[derive(Clone)]
-    struct GrpcErrorService {
-        grpc_status: u16,
-    }
-
-    impl Service<()> for GrpcErrorService {
-        type Response = http::Response<TestBody>;
-        type Error = Infallible;
-        type Future = futures::future::Ready<Result<Self::Response, Self::Error>>;
-
-        fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
-            Poll::Ready(Ok(()))
-        }
-
-        fn call(&mut self, _: ()) -> Self::Future {
-            let resp = http::Response::builder()
-                .status(http::StatusCode::OK)
-                .header(http::header::CONTENT_TYPE, "application/grpc")
-                .header("grpc-status", self.grpc_status.to_string())
-                .body(body())
-                .unwrap();
-
-            futures::future::ready(Ok(resp))
-        }
-    }
-
     async fn assert_grpc_records_penalized_effective_rtt(grpc_status: u16, label: &str) {
-        let inner = GrpcErrorService { grpc_status };
+        let inner = MockService::grpc_status(grpc_status);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;
@@ -1497,7 +1430,7 @@ mod tests {
     }
 
     async fn assert_grpc_records_low_rtt(grpc_status: u16, label: &str) {
-        let inner = GrpcErrorService { grpc_status };
+        let inner = MockService::grpc_status(grpc_status);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         time::sleep(Duration::from_millis(1)).await;

From a1173734c7fa51b596f0814b4ce240ad19e7f777 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 32/45] refactor(load-biaser): apply ResponseFailureHint
 feedback

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index fb18cc6f37..2fdf83524c 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -88,8 +88,7 @@ impl CachedRateLimitHint {
 /// Trait for extracting failure hints from responses.
 ///
 /// This allows the load biaser to classify responses and apply appropriate
-/// penalties. Default implementations return None (no failure detected),
-/// which is appropriate for non-HTTP transports.
+/// penalties.
 ///
 /// The trait splits rate limit hint access into two methods to avoid
 /// requiring `&mut self` on the read path:
@@ -104,24 +103,20 @@ impl CachedRateLimitHint {
 /// means the cache is always cold for the circuit breaker path.
 pub trait ResponseFailureHint {
     /// Returns a failure hint if the response indicates a failure condition.
-    fn failure_hint(&self) -> Option<FailureHint> {
-        None
-    }
+    fn failure_hint(&self) -> Option<FailureHint>;
 
     /// Parse and cache the raw (uncapped) rate limit hint from this response.
     ///
     /// The raw uncapped value is cached so that each consumer can apply their
     /// own cap via `rate_limit_hint(max)`.
-    fn attach_parsed_rate_limit_hint(&mut self) {}
+    fn attach_parsed_rate_limit_hint(&mut self);
 
     /// Returns the rate limit hint if available.
     ///
     /// Checks cached value first (from a previous `attach_parsed_rate_limit_hint` call).
     /// If no cached value, attempts to parse the header directly (capping at `max`).
     /// Returns `None` only if the header is absent or unparseable.
-    fn rate_limit_hint(&self, _max: Duration) -> Option<Duration> {
-        None
-    }
+    fn rate_limit_hint(&self, max: Duration) -> Option<Duration>;
 }
 
 fn is_grpc(headers: &http::HeaderMap) -> bool {
@@ -225,20 +220,6 @@ impl<B> ResponseFailureHint for http::Response<B> {
     }
 }
 
-/// Connection tuples (Connection, Metadata) used by TCP/TLS paths never indicate failures.
-/// This is meant for MakeConnection services that return `(I, M)` tuples.
-impl<C, M> ResponseFailureHint for (C, M) {}
-
-/// TCP streams never indicate failures (no HTTP status codes).
-impl ResponseFailureHint for tokio::net::TcpStream {}
-
-/// Duplex streams (used in testing) never indicate failures.
-impl ResponseFailureHint for tokio::io::DuplexStream {}
-
-/// Mock IO streams never indicate failures.
-#[cfg(feature = "tokio-test")]
-impl ResponseFailureHint for tokio_test::io::Mock {}
-
 /// Configuration for LoadBiaser behavior.
 #[derive(Clone, Debug, PartialEq, Eq, Hash)]
 pub struct LoadBiaserConfig {

From e3f47cb8d18ad84645353ea1f8e4a1c5f5ca1246 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 33/45] refactor(load-biaser): remove unused get_ref

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 2fdf83524c..5ae23dbce1 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -396,11 +396,6 @@ impl<S, C> LoadBiaser<S, C> {
         }
     }
 
-    /// Returns a reference to the inner service.
-    pub fn get_ref(&self) -> &S {
-        &self.inner
-    }
-
     fn handle(&self) -> Handle {
         Handle {
             shared: self.shared.clone(),

From b19fa511cdcb804d63d56ee906e23b4b0b9a9f59 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 34/45] test(load-biaser): assert load values in pending test

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 5ae23dbce1..4c4fdb9d97 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -726,20 +726,23 @@ mod tests {
         let inner = MockService::new(http::StatusCode::OK);
         let biaser = LoadBiaser::new(inner, test_config());
 
-        // Inject a known RTT
+        // Inject a known RTT below the default
         time::sleep(Duration::from_millis(1)).await;
         biaser.inject_rtt(0.05); // 50ms
 
-        // RTT * (0 + 1) = 0.05
+        // No pending handles. Load = RTT * (0 + 1) = 0.05.
         let load_idle = biaser.load();
+        assert_eq!(load_idle, biaser.get_rtt());
 
         // Hold a handle to simulate an in-flight request: the strong count of
         // the shared Arc increments per live handle like during a call.
         let h1 = biaser.handle();
         let load_one_pending = biaser.load();
+        assert_eq!(load_one_pending, biaser.get_rtt() * 2.0);
 
         let h2 = biaser.handle();
         let load_two_pending = biaser.load();
+        assert_eq!(load_two_pending, biaser.get_rtt() * 3.0);
 
         assert!(
             load_one_pending > load_idle,

From e212159cf418b4a5aa248a91e55c22ea77869f73 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 35/45] build(load-biaser): drop orphaned test deps

We can now trim our tokio flags and drop the tokio-test dependency.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 Cargo.lock                     | 1 -
 linkerd/load-biaser/Cargo.toml | 7 ++-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9b2247ea76..376f63229e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2003,7 +2003,6 @@ dependencies = [
  "parking_lot",
  "pin-project",
  "tokio",
- "tokio-test",
  "tower",
  "tower-service",
  "tracing",
diff --git a/linkerd/load-biaser/Cargo.toml b/linkerd/load-biaser/Cargo.toml
index b281c4983c..2dbc94d28d 100644
--- a/linkerd/load-biaser/Cargo.toml
+++ b/linkerd/load-biaser/Cargo.toml
@@ -8,15 +8,13 @@ publish = { workspace = true }
 
 [features]
 default = []
-tokio-test = ["dep:tokio-test"]
 
 [dependencies]
 futures = { version = "0.3", default-features = false }
 http = { workspace = true }
 parking_lot = "0.12"
 pin-project = "1"
-tokio = { version = "1", features = ["io-util", "net", "time"] }
-tokio-test = { version = "0.4", optional = true }
+tokio = { version = "1", features = ["time"] }
 tower = { workspace = true, features = ["load"] }
 tower-service = { workspace = true }
 tracing = { workspace = true }
@@ -30,5 +28,4 @@ linkerd-stack = { path = "../stack" }
 bytes = { workspace = true }
 http-body = { workspace = true }
 http-body-util = { workspace = true }
-tokio = { version = "1", features = ["macros", "rt", "sync", "time"] }
-tokio-test = "0.4"
+tokio = { version = "1", features = ["macros", "rt", "sync", "test-util", "time"] }

From c3bc7c6e11f9576d783ac83602cff56b9c5e0f49 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 36/45] docs(load-biaser): correct stale comments

penalty_ms on SharedState is millisecond. Remove references
to types that are not integrated yet.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 4c4fdb9d97..5709cd3e30 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -94,13 +94,6 @@ impl CachedRateLimitHint {
 /// requiring `&mut self` on the read path:
 /// - `attach_parsed_rate_limit_hint(&mut self)`: parse and cache (needs `&mut`)
 /// - `rate_limit_hint(&self, max)`: read cached value or parse on-read (only needs `&self`)
-///
-/// # Stack ordering and caching
-///
-/// In the current proxy stack `RetryAfterClassify::start()` (the circuit breaker's
-/// classifier) calls `rate_limit_hint()` before `LoadBiaserFuture::poll()` calls
-/// `attach_parsed_rate_limit_hint()`, because responses flow inner-to-outer. This
-/// means the cache is always cold for the circuit breaker path.
 pub trait ResponseFailureHint {
     /// Returns a failure hint if the response indicates a failure condition.
     fn failure_hint(&self) -> Option<FailureHint>;
@@ -153,8 +146,8 @@ impl<B> ResponseFailureHint for http::Response<B> {
             // responses that somehow happen to include a grpc-status header.
             //
             // Note: for streaming gRPC, grpc-status is in trailers (not headers)
-            // and will not be detected here. The circuit breaker handles streaming
-            // gRPC failures via GrpcRetryPushbackClassifyEos.
+            // and is not detected here; such responses record only their
+            // measured RTT and are not penalized.
             self.headers()
                 .get("grpc-status")
                 .and_then(|v| v.to_str().ok())
@@ -259,7 +252,7 @@ impl Default for LoadBiaserConfig {
 struct SharedState {
     /// RwLocked RTT EWMA. Read in load(), written when measuring.
     rtt: RwLock<Ewma>,
-    /// Penalty value to inject on failure responses (in seconds) when no
+    /// Penalty value to inject on failure responses (in milliseconds) when no
     /// hint is present.
     penalty_ms: u32,
     /// Maximum Retry-After duration to honor (clamped)
@@ -1047,10 +1040,9 @@ mod tests {
 
     #[test]
     fn default_max_duration_matches_client_policy() {
-        // Enforces the invariant at LoadBiaserConfig::default():
-        // max_duration must match the local DEFAULT_RETRY_AFTER_MAX_DURATION constant.
-        // Production code reads the constant directly via EwmaConfig::to_load_biaser_config;
-        // this assertion keeps the test-only default() in sync.
+        // Ensure that LoadBiaserConfig::default().max_duration matches
+        // DEFAULT_RETRY_AFTER_MAX_DURATION constant, keeping the default() in
+        // sync with that constant.
         assert_eq!(
             LoadBiaserConfig::default().max_duration,
             DEFAULT_RETRY_AFTER_MAX_DURATION,

From 841abfa9470a430b15a17d5e4a298889081d1922 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 37/45] refactor(ewma): use is_infinite instead of
 f64::INFINITY

Conveys meaning without coupling type nor constant.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index ff9f49e6ab..6ef37d2ffc 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -97,7 +97,7 @@ impl Ewma {
         if ts <= self.timestamp {
             return;
         }
-        if self.value == f64::INFINITY {
+        if self.value.is_infinite() {
             self.value = value;
             self.timestamp = ts;
             return;

From cfad958eee77e7bd08e85b2d5075b7eaaf5d4fe8 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 38/45] test(ewma): test add_peak when providing a lower
 measurement

Add a test exercising the case of a sample at or below the still
undecayed peak. It should not replace the peak, but compute the
value blending in the new sample.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index 6ef37d2ffc..8904ab05ac 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -217,6 +217,27 @@ mod tests {
         assert_eq!(ewma.get(), 1.0);
     }
 
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_add_peak_retains_higher_value() {
+        let now = Instant::now();
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        ewma.add_peak(10.0, now + Duration::from_secs(1));
+        assert_eq!(ewma.get(), 10.0);
+
+        // A lower sample arrives shortly after. The decayed projection
+        // would be close to 9.9, well above 1.0. Now add_peak() should add
+        // the measurement, but it should only slightly move the EWMA,
+        // leaving the moving average still close to 10.
+        ewma.add_peak(1.0, now + Duration::from_millis(1100));
+        let v = ewma.get();
+
+        assert!(
+            v > 9.0 && v < 10.0,
+            "expected EWMA value between 9 and 10, got {v}"
+        );
+    }
+
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_zero_decay_clamped_in_new() {
         let now = Instant::now();

From 1d592a972d2bd6d31a25a360ee7409687dc3b3ad Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 39/45] test(ewma): add() ignores old timestamp measurements

Ensure that add() discards a sample whose timestamp is at or before
the stored one.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/ewma/src/lib.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/linkerd/ewma/src/lib.rs b/linkerd/ewma/src/lib.rs
index 8904ab05ac..8642b1d908 100644
--- a/linkerd/ewma/src/lib.rs
+++ b/linkerd/ewma/src/lib.rs
@@ -153,6 +153,25 @@ mod tests {
         assert_eq!(ewma.get(), 1.0);
     }
 
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_add_ignores_stale_timestamps() {
+        let now = Instant::now();
+        let t1 = now + Duration::from_secs(2);
+        let mut ewma = Ewma::new(Duration::from_secs(10), now);
+
+        // Store an initial value at t1, replacing the inf sentinel.
+        ewma.add(1.0, t1);
+        assert_eq!(ewma.get(), 1.0);
+
+        // A measurement at the stored timestamp is discarded.
+        ewma.add(99.0, t1);
+        assert_eq!(ewma.get(), 1.0);
+
+        // Also discarded if the timestamp is before the stored one.
+        ewma.add(99.0, t1 - Duration::from_secs(1));
+        assert_eq!(ewma.get(), 1.0);
+    }
+
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_add_peak() {
         let now = Instant::now();

From aa9e42dbc4c50ad7043a57bfdbc0501bcd4aeebd Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 40/45] feat(load-biaser): floor Retry-After effect on RTT to
 penalty base

A hint below the base penalty such as 0 records a low effective RTT
and can make a failing endpoint look healthier than it should.

Ensure a failure's recorded measurement is at least the base penalty,
so that retry hints take effect only when they exceed that penalty.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 5709cd3e30..5cbbd869ee 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -223,8 +223,8 @@ pub struct LoadBiaserConfig {
     /// Controls how quickly RTT estimates adapt to changing latency
     pub rtt_decay: Duration,
 
-    /// The base penalty to inject on failure responses (429, 503, 5xx)
-    /// in milliseconds, when the server sends no backoff hint.
+    /// Base penalty for failure responses (429, 503, 5xx), in milliseconds.
+    /// Also the floor for the recorded effective RTT.
     pub penalty_ms: u32,
 
     /// Maximum Retry-After duration to honor. Clamped to this value.
@@ -252,8 +252,7 @@ impl Default for LoadBiaserConfig {
 struct SharedState {
     /// RwLocked RTT EWMA. Read in load(), written when measuring.
     rtt: RwLock<Ewma>,
-    /// Penalty value to inject on failure responses (in milliseconds) when no
-    /// hint is present.
+    /// Base penalty (milliseconds) and floor for the recorded effective RTT.
     penalty_ms: u32,
     /// Maximum Retry-After duration to honor (clamped)
     max_duration: Duration,
@@ -270,8 +269,10 @@ impl SharedState {
             FailureHint::InternalError => None,
         };
 
-        // A hint, if present, is honored verbatim (capped).
-        from_hint.unwrap_or_else(|| Duration::from_millis(u64::from(self.penalty_ms)).as_secs_f64())
+        let base = Duration::from_millis(u64::from(self.penalty_ms)).as_secs_f64();
+        // Honor a hint only above the base penalty. This keeps a failing
+        // endpoint from looking healthier than a hintless one.
+        from_hint.map(|h| h.max(base)).unwrap_or(base)
     }
 }
 
@@ -1069,6 +1070,21 @@ mod tests {
         );
     }
 
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_small_retry_after_floored_at_base_penalty() {
+        // A Retry-After below the base penalty must not lower the effective RTT.
+        let inner = MockService::retry_after(http::StatusCode::TOO_MANY_REQUESTS, 1);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+        time::sleep(Duration::from_millis(1)).await;
+        let _ = biaser.call(()).await;
+        // penalty_ms = 5000; a 1s hint is floored to the 5s base.
+        let rtt = biaser.get_rtt();
+        assert!(
+            (rtt - 5.0).abs() < 0.1,
+            "1s hint should floor to 5s base, got: {rtt}"
+        );
+    }
+
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_429_with_retry_after_capped_at_max() {
         let inner = MockService::retry_after(http::StatusCode::TOO_MANY_REQUESTS, 600);

From d2ea6bacc973421a497d5deca657da2d1b5e2e61 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 41/45] test(load-biaser): measure a real success RTT

test_rtt_tracked_after_request resolved instantly under paused time and
only checked that the RTT moved minimally. Drive a request that takes
a measurable delay and assert the recorded RTT reflects it.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 5cbbd869ee..2fc36ecc9d 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -757,27 +757,31 @@ mod tests {
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_rtt_tracked_after_request() {
-        let inner = MockService::new(http::StatusCode::OK);
+        // Drive one request that takes a measurable delay and assert the RTT
+        // recorded matches the delay.
+        let delay = Duration::from_millis(250);
+        let inner = MockService::with_delay(http::StatusCode::OK, delay);
         let mut biaser = LoadBiaser::new(inner, test_config());
 
         // Advance time so EWMA accepts updates
         time::sleep(Duration::from_millis(1)).await;
 
-        // RTT should start at default_rtt (0.1s) before any requests
+        // RTT should start at default_rtt before any requests
         let initial_rtt = biaser.get_rtt();
         assert!(
             (initial_rtt - 0.1).abs() < 0.01,
             "RTT should start at default_rtt (0.1s), got: {initial_rtt}"
         );
 
-        // Make a request (will record RTT)
-        let _ = biaser.call(()).await;
+        // First frame drops the handle and records the measurement.
+        let resp = biaser.call(()).await.unwrap();
+        drive_to_first_frame(resp).await;
 
-        // RTT should now reflect the actual request latency
+        // The recorded RTT should match the request delay (250ms).
         let rtt = biaser.get_rtt();
         assert!(
-            rtt < initial_rtt,
-            "RTT should decrease after a fast request"
+            (rtt - delay.as_secs_f64()).abs() < 0.005,
+            "RTT should reflect the 250ms request delay, got: {rtt}"
         );
     }
 

From 0fa734e83c0ed1ab495b13e366d8306fcc4a963e Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Mon, 8 Jun 2026 15:24:11 +0200
Subject: [PATCH 42/45] test(load-biaser): count concurrent in-flight

Existing tests raised the pending count with disabled handles or a
single request. Try now with two concurrent requests and assert
the strong count reports two pending, then assert the count falls
when they resolve.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 52 ++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 2fc36ecc9d..54e072c829 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -1009,6 +1009,58 @@ mod tests {
         );
     }
 
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_two_concurrent_requests_counted() {
+        // Two concurrent in-flight requests each hold a handle. The strong count
+        // should report two pending while both are unresolved.
+        let delay = Duration::from_millis(200);
+        let inner = MockService::with_delay(http::StatusCode::OK, delay);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        assert_eq!(biaser.get_pending(), 0, "pending should start at 0");
+        let load_idle = biaser.load();
+
+        // Start two requests without resolving them
+        let fut1 = biaser.call(());
+        let fut2 = biaser.call(());
+        tokio::pin!(fut1);
+        tokio::pin!(fut2);
+
+        // Poll each once, ensure they are pending
+        let mut cx = Context::from_waker(std::task::Waker::noop());
+        assert!(fut1.as_mut().poll(&mut cx).is_pending());
+        assert!(fut2.as_mut().poll(&mut cx).is_pending());
+
+        assert_eq!(
+            biaser.get_pending(),
+            2,
+            "two concurrent requests should report two pending"
+        );
+
+        // Load = RTT * (pending + 1) = RTT * 3, triple load.
+        let load_two = biaser.load();
+        assert_eq!(
+            load_two,
+            load_idle * 3.0,
+            "two pending should triple the idle load: {load_two} vs {load_idle}"
+        );
+
+        // Advance past the delay, then drive the body to drop the handles.
+        time::sleep(Duration::from_millis(250)).await;
+        let resp1 = fut1.await.unwrap();
+        let resp2 = fut2.await.unwrap();
+        drive_to_first_frame(resp1).await;
+        drive_to_first_frame(resp2).await;
+
+        assert_eq!(
+            biaser.get_pending(),
+            0,
+            "pending should fall back to 0 once both requests resolve"
+        );
+    }
+
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_cancellation_records_measurement() {
         // A request that stays in flight. The long delay keeps the future

From 7ae6284436f43d40b4cc68b03000eb72528e13a2 Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Tue, 9 Jun 2026 11:29:39 +0200
Subject: [PATCH 43/45] test(load-biaser): assert load values by making
 inject_rtt set exact RTT

The inject_rtt test helper was using add_peak, which replaces the RTT
estimate when the measurement exceeds the current decayed projection.
The EWMA is seeded with default_rtt (0.1s in tests), so injecting a
lower value such as 0.05 left the estimate near 0.1. Reset the
estimate instead, and also assert the actual values in the comments.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 54e072c829..34ee4d4372 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -594,8 +594,9 @@ mod tests {
             (Arc::strong_count(&self.shared) as u32).saturating_sub(1)
         }
 
+        /// Sets the RTT EWMA to an exact value for tests.
         pub fn inject_rtt(&self, rtt_secs: f64) {
-            self.shared.rtt.write().add_peak(rtt_secs, Instant::now());
+            self.shared.rtt.write().reset(rtt_secs, Instant::now());
         }
     }
 
@@ -720,9 +721,10 @@ mod tests {
         let inner = MockService::new(http::StatusCode::OK);
         let biaser = LoadBiaser::new(inner, test_config());
 
-        // Inject a known RTT below the default
+        // Inject a known RTT below the default.
         time::sleep(Duration::from_millis(1)).await;
         biaser.inject_rtt(0.05); // 50ms
+        assert_eq!(biaser.get_rtt(), 0.05, "inject_rtt sets the RTT exactly");
 
         // No pending handles. Load = RTT * (0 + 1) = 0.05.
         let load_idle = biaser.load();
@@ -731,12 +733,22 @@ mod tests {
         // Hold a handle to simulate an in-flight request: the strong count of
         // the shared Arc increments per live handle like during a call.
         let h1 = biaser.handle();
+        // Load = RTT * (1 + 1) = 0.1.
         let load_one_pending = biaser.load();
         assert_eq!(load_one_pending, biaser.get_rtt() * 2.0);
+        assert!(
+            (load_one_pending - 0.1).abs() < 1e-9,
+            "load with one request pending should be 0.1: {load_one_pending}"
+        );
 
         let h2 = biaser.handle();
+        // Load = RTT * (2 + 1) = 0.15.
         let load_two_pending = biaser.load();
         assert_eq!(load_two_pending, biaser.get_rtt() * 3.0);
+        assert!(
+            (load_two_pending - 0.15).abs() < 1e-9,
+            "load with two requests pending should be 0.15: {load_two_pending}"
+        );
 
         assert!(
             load_one_pending > load_idle,

From fe559b5365e57942335472d997a1513bc0ecb71b Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Wed, 10 Jun 2026 11:37:40 +0200
Subject: [PATCH 44/45] refactor(load-biaser): record penalties when handles
 drop

Store an optional penalty on each handle. Failures drop it at the
response head and pass `None` to completion tracking. Successes pass
the handle and record elapsed RTT when it drops.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 129 ++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 57 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 34ee4d4372..0b0579bacd 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -261,15 +261,15 @@ struct SharedState {
 impl SharedState {
     /// Translates a classified failure into an effective RTT measurement,
     /// taking into account Retry-After/grpc-retry-pushback hints.
-    fn effective_rtt<R: ResponseFailureHint>(&self, hint: FailureHint, resp: &R) -> f64 {
+    fn effective_rtt<R: ResponseFailureHint>(&self, hint: FailureHint, resp: &R) -> Duration {
         let from_hint = match hint {
-            FailureHint::RateLimited | FailureHint::ServiceUnavailable => resp
-                .rate_limit_hint(self.max_duration)
-                .map(|d| d.as_secs_f64()),
+            FailureHint::RateLimited | FailureHint::ServiceUnavailable => {
+                resp.rate_limit_hint(self.max_duration)
+            }
             FailureHint::InternalError => None,
         };
 
-        let base = Duration::from_millis(u64::from(self.penalty_ms)).as_secs_f64();
+        let base = Duration::from_millis(u64::from(self.penalty_ms));
         // Honor a hint only above the base penalty. This keeps a failing
         // endpoint from looking healthier than a hintless one.
         from_hint.map(|h| h.max(base)).unwrap_or(base)
@@ -312,47 +312,30 @@ pub struct NewLoadBiaser<N, Req, C = PendingUntilFirstData> {
 /// It clones the shared `Arc`, so while it lives the endpoint's strong count
 /// is incremented by one. On drop it records the elapsed time as an RTT
 /// measurement, so a cancelled request still measures latency.
-/// A failure disables the handle first, because it needs to record its own
-/// penalty-injected measurement.
+/// A failure sets a penalty before dropping the handle. The penalty is recorded
+/// instead of the elapsed time.
 #[derive(Debug)]
 pub struct Handle {
     shared: Arc<SharedState>,
     sent_at: Instant,
-    enabled: bool,
-}
-
-impl Handle {
-    /// Records the measured elapsed time, then prevents the drop from recording
-    /// again.
-    fn record_elapsed(&mut self, now: Instant) {
-        if self.enabled {
-            let elapsed = now.saturating_duration_since(self.sent_at).as_secs_f64();
-            self.shared.rtt.write().add_peak(elapsed, now);
-            self.enabled = false;
-        }
-    }
-
-    /// Records a computed effective RTT and disables the handle so its drop does
-    /// not also record the measurement of the failure.
-    fn record_effective_rtt(&mut self, value: f64, now: Instant) {
-        self.shared.rtt.write().add_peak(value, now);
-        self.enabled = false;
-    }
+    penalty: Option<Duration>,
 }
 
 impl Drop for Handle {
     fn drop(&mut self) {
-        // Ensure we take a measurement if the request was cancelled (enabled
-        // should be true).
-        self.record_elapsed(Instant::now());
+        let now = Instant::now();
+        let value = self
+            .penalty
+            .unwrap_or_else(|| now.saturating_duration_since(self.sent_at));
+        self.shared.rtt.write().add_peak(value.as_secs_f64(), now);
     }
 }
 
 /// Response future that records a measurement and checks for failure responses.
 ///
-/// On resolution a failure injects a computed RTT and disables the handle.
-/// A success leaves the handle for the completion tracker, which decides when
-/// to record the measurement.
+/// A failure sets a computed penalty and drops the handle immediately. A
+/// success leaves the handle for the completion tracker, which decides when to
+/// record the measured RTT.
 #[pin_project]
 pub struct LoadBiaserFuture<F, Rsp, C> {
     #[pin]
@@ -394,7 +377,7 @@ impl<S, C> LoadBiaser<S, C> {
         Handle {
             shared: self.shared.clone(),
             sent_at: Instant::now(),
-            enabled: true,
+            penalty: None,
         }
     }
 }
@@ -447,7 +430,7 @@ impl<S, C, Req> Service<Req> for LoadBiaser<S, C>
 where
     S: Service<Req>,
     S::Response: ResponseFailureHint,
-    C: TrackCompletion<Handle, S::Response> + Clone,
+    C: TrackCompletion<Option<Handle>, S::Response> + Clone,
 {
     type Response = C::Output;
     type Error = S::Error;
@@ -517,29 +500,37 @@ impl<F, Rsp, E, C> Future for LoadBiaserFuture<F, Rsp, C>
 where
     F: Future<Output = Result<Rsp, E>>,
     Rsp: ResponseFailureHint,
-    C: TrackCompletion<Handle, Rsp>,
+    C: TrackCompletion<Option<Handle>, Rsp>,
 {
     type Output = Result<C::Output, E>;
 
     fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
         let this = self.project();
         let result = ready!(this.inner.poll(cx));
-        let now = Instant::now();
         let mut handle = this.handle.take().expect("polled after completion");
 
         match result {
             Ok(mut resp) => {
-                if let Some(hint) = resp.failure_hint() {
-                    // Cache the uncapped hint while we hold &mut, then record the
-                    // failure as a penalized effective RTT.
+                let handle = if let Some(hint) = resp.failure_hint() {
+                    // Cache the uncapped hint while the response is mutable. The
+                    // handle records the penalty when dropped.
                     resp.attach_parsed_rate_limit_hint();
-                    let value = handle.shared.effective_rtt(hint, &resp);
-                    tracing::debug!(rtt_secs = value, ?hint, "Failure recorded as effective RTT");
-                    handle.record_effective_rtt(value, now);
-                }
-
-                // The completion tracker will drop the handle now or at first body
-                // frame, and that drop will record the measured RTT.
+                    let penalty = handle.shared.effective_rtt(hint, &resp);
+                    tracing::debug!(
+                        rtt_secs = penalty.as_secs_f64(),
+                        ?hint,
+                        "Failure recorded as effective RTT"
+                    );
+                    handle.penalty = Some(penalty);
+                    drop(handle);
+                    None
+                } else {
+                    Some(handle)
+                };
+
+                // Successful responses keep the handle until completion.
+                // Failed responses pass `None` since their penalty was recorded
+                // above.
                 let output = this.completion.track_completion(handle, resp);
 
                 Poll::Ready(Ok(output))
@@ -600,14 +591,6 @@ mod tests {
         }
     }
 
-    impl Handle {
-        /// Disable a handle so dropping it records nothing. Lets a test hold a
-        /// handle just to raise the pending count.
-        fn disable(mut self) {
-            self.enabled = false;
-        }
-    }
-
     // Mock service for testing returning a specific HTTP status, optionally
     // after a delay and with extra response headers.
     #[derive(Clone)]
@@ -763,8 +746,8 @@ mod tests {
             load_one_pending
         );
 
-        h1.disable();
-        h2.disable();
+        drop(h1);
+        drop(h2);
     }
 
     #[tokio::test(flavor = "current_thread", start_paused = true)]
@@ -825,6 +808,38 @@ mod tests {
         );
     }
 
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_failure_records_once_at_response_head() {
+        let inner = MockService::new(http::StatusCode::SERVICE_UNAVAILABLE);
+        let mut biaser = LoadBiaser::new(inner, test_config());
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        // Keep the response without polling its body. A failure records and
+        // releases its handle when the response head is available.
+        let resp = biaser.call(()).await.unwrap();
+        assert_eq!(
+            biaser.get_pending(),
+            0,
+            "failure should stop counting as pending at the response head"
+        );
+        let recorded = biaser.get_rtt();
+        assert!(
+            (recorded - 5.0).abs() < 0.1,
+            "503 should record the base penalty at the response head: {recorded}"
+        );
+
+        // The completion tracker holds `None` for a failed response. Reaching
+        // the first body frame must not record a second measurement.
+        time::sleep(Duration::from_secs(3)).await;
+        drive_to_first_frame(resp).await;
+        assert_eq!(
+            biaser.get_rtt(),
+            recorded,
+            "failure body completion should not record another measurement"
+        );
+    }
+
     #[tokio::test(flavor = "current_thread", start_paused = true)]
     async fn test_retry_after_hint_load_grows_monotonically() {
         let hints = [0u64, 1, 5, 30, 120, 300];

From 5771848daf6a1c9ee7973b2c3c2db82fcb093eab Mon Sep 17 00:00:00 2001
From: Alejandro Martinez Ruiz <amr@buoyant.io>
Date: Wed, 10 Jun 2026 12:20:35 +0200
Subject: [PATCH 45/45] fix(load-biaser): floor penalty at measured RTT

When penalty_ms is below an endpoint's current decayed RTT recording
a failure could lower its EWMA. This could make the failing endpoint
appear cheaper and cause P2C to prefer it.

Floor the penalties at the endpoint's current decayed RTT so that
a failure cannot lower its RTT estimate.

Signed-off-by: Alejandro Martinez Ruiz <amr@buoyant.io>
---
 linkerd/load-biaser/src/lib.rs | 49 +++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/linkerd/load-biaser/src/lib.rs b/linkerd/load-biaser/src/lib.rs
index 0b0579bacd..f6ffb36402 100644
--- a/linkerd/load-biaser/src/lib.rs
+++ b/linkerd/load-biaser/src/lib.rs
@@ -324,10 +324,16 @@ pub struct Handle {
 impl Drop for Handle {
     fn drop(&mut self) {
         let now = Instant::now();
-        let value = self
-            .penalty
-            .unwrap_or_else(|| now.saturating_duration_since(self.sent_at));
-        self.shared.rtt.write().add_peak(value.as_secs_f64(), now);
+        let mut rtt = self.shared.rtt.write();
+        let value = self.penalty.map_or_else(
+            || now.saturating_duration_since(self.sent_at).as_secs_f64(),
+            |penalty| {
+                // Floor at the current decayed RTT. Lower penalties would
+                // otherwise steer P2C towards failing endpoints.
+                penalty.as_secs_f64().max(rtt.get_at(now))
+            },
+        );
+        rtt.add_peak(value, now);
     }
 }
 
@@ -1471,6 +1477,41 @@ mod tests {
         );
     }
 
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn test_sub_measured_penalty_failures_floor_at_measured_rtt() {
+        // A penalty below the measured latency must not drag the recorded RTT
+        // towards it. That would make the failing endpoint look cheaper.
+        const PENALTY: f64 = 0.1;
+        const MEASURED: f64 = 0.5;
+
+        let config = LoadBiaserConfig {
+            penalty_ms: (PENALTY * 1000.0) as u32,
+            ..test_config()
+        };
+        let inner = MockService::new(http::StatusCode::INTERNAL_SERVER_ERROR);
+        let mut biaser = LoadBiaser::new(inner, config);
+
+        time::sleep(Duration::from_millis(1)).await;
+
+        biaser.inject_rtt(MEASURED);
+        assert!((biaser.get_rtt() - MEASURED).abs() < 1e-9);
+
+        // Failures one second apart let the EWMA blend towards the penalty.
+        for _ in 0..8 {
+            time::sleep(Duration::from_secs(1)).await;
+            let _ = biaser.call(()).await;
+        }
+
+        // The floor keeps the RTT nearer the measured latency than the penalty.
+        let rtt = biaser.get_rtt();
+        let midpoint = (PENALTY + MEASURED) / 2.0;
+        assert!(
+            rtt > midpoint,
+            "sustained penalties below the measured RTT must not drag RTT downwards: \
+             rtt={rtt} should exceed midpoint {midpoint}"
+        );
+    }
+
     #[test]
     fn failure_hint_non_grpc_200_with_grpc_status_header() {
         let resp = http::Response::builder()