moreh-dev · hyeongyun0916 · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/benchmarks/benchmark_retention_eviction.py b/benchmarks/benchmark_retention_eviction.py
@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Simulate agentic multi-turn eviction with retention directives.
+
+This script validates that the two-structure evictor (LRU + priority queue)
+correctly protects prioritized blocks under memory pressure. It simulates
+a workload where:
+
+  1. Multiple "sessions" allocate blocks (simulating multi-turn agentic
+     requests with growing context).
+  2. Some blocks receive retention priority (simulating system-prompt and
+     tool-call-awaiting blocks that the orchestrator wants to protect).
+  3. Memory pressure forces eviction.
+  4. We measure whether prioritized blocks survive longer than unprioritized
+     ones.
+
+No GPU required — runs against the actual BlockPool implementation.
+
+Usage:
+    python benchmarks/benchmark_retention_eviction.py \
+        --num-gpu-blocks 1000 --block-size 16 --num-sessions 20
+"""
+
+import argparse
+import time
+
+from vllm.v1.core.block_pool import BlockPool
+from vllm.v1.core.kv_cache_utils import KVCacheBlock
+from vllm.v1.core.priority_eviction_queue import RetentionMeta
+
+
+def run_simulation(
+    num_gpu_blocks: int,
+    block_size: int,
+    num_sessions: int,
+    blocks_per_session: int,
+    priority_fraction: float,
+    priority_value: int,
+):
+    """Run the eviction simulation.
+
+    Args:
+        num_gpu_blocks: Total blocks in the pool.
+        block_size: Tokens per block (for hash_block_size).
+        num_sessions: Number of concurrent sessions to simulate.
+        blocks_per_session: Blocks allocated per session.
+        priority_fraction: Fraction of each session's blocks that get
+            retention priority (0.0 - 1.0).
+        priority_value: Priority value to assign (0-100).
+    """
+    pool = BlockPool(
+        num_gpu_blocks=num_gpu_blocks,
+        enable_caching=True,
+        hash_block_size=block_size,
+    )
+
+    # Track which blocks are prioritized vs not.
+    prioritized_block_ids: set[int] = set()
+    unprioritized_block_ids: set[int] = set()
+
+    sessions: list[list[KVCacheBlock]] = []
+    total_allocated = 0
+
+    print(f"Pool: {num_gpu_blocks} blocks, {num_gpu_blocks - 1} usable")
+    print(f"Sessions: {num_sessions}, {blocks_per_session} blocks each")
+    print(
+        f"Priority: {priority_fraction * 100:.0f}% of blocks at "
+        f"priority={priority_value}"
+    )
+    print(f"Total demand: {num_sessions * blocks_per_session} blocks")
+    print()
+
+    # Phase 1: Allocate sessions until we run out of blocks.
+    for i in range(num_sessions):
+        free = pool.get_num_free_blocks()
+        if free < blocks_per_session:
+            print(
+                f"  Session {i}: only {free} free blocks, "
+                f"need {blocks_per_session} — stopping allocation"
+            )
+            break
+
+        blocks = pool.get_new_blocks(blocks_per_session)
+        total_allocated += len(blocks)
+
+        # Mark some blocks as prioritized.
+        num_priority = int(len(blocks) * priority_fraction)
+        for j, block in enumerate(blocks):
+            if j < num_priority:
+                pool.priority_eviction_queue._meta[block.block_id] = RetentionMeta(
+                    priority=priority_value,
+                    expiry=None,
+                    scope=None,
+                    last_freed_time=0.0,
+                )
+                prioritized_block_ids.add(block.block_id)
+            else:
+                unprioritized_block_ids.add(block.block_id)
+
+        sessions.append(blocks)
+        print(
+            f"  Session {i}: allocated {len(blocks)} blocks "
+            f"({num_priority} prioritized), "
+            f"{pool.get_num_free_blocks()} free remaining"
+        )
+
+    print(f"\nTotal allocated: {total_allocated}")
+    print(f"  Prioritized: {len(prioritized_block_ids)}")
+    print(f"  Unprioritized: {len(unprioritized_block_ids)}")
+
+    # Phase 2: Free all sessions (simulating requests completing).
+    for session_blocks in sessions:
+        pool.free_blocks(session_blocks)
+
+    print("\nAfter freeing all sessions:")
+    print(f"  LRU free list: {pool.free_block_queue.num_free_blocks}")
+    print(f"  Priority queue: {pool.priority_eviction_queue.num_blocks}")
+    print(f"  Total free: {pool.get_num_free_blocks()}")
+
+    # Phase 3: Allocate under pressure and track which blocks get evicted.
+    eviction_target = total_allocated // 2
+    print(f"\nEvicting {eviction_target} blocks (allocating new ones)...")
+
+    evicted_prioritized = 0
+    evicted_unprioritized = 0
+    evicted_other = 0
+
+    t_start = time.monotonic()
+    new_blocks = pool.get_new_blocks(eviction_target)
+    t_elapsed = time.monotonic() - t_start
+
+    for block in new_blocks:
+        bid = block.block_id
+        if bid in unprioritized_block_ids:
+            evicted_unprioritized += 1
+        elif bid in prioritized_block_ids:
+            evicted_prioritized += 1
+        else:
+            evicted_other += 1
+
+    print(f"  Time: {t_elapsed * 1000:.2f} ms")
+    print(f"  Evicted from unprioritized: {evicted_unprioritized}")
+    print(f"  Evicted from prioritized: {evicted_prioritized}")
+    print(f"  Evicted from other (null/fresh): {evicted_other}")
+
+    # Verify correctness: unprioritized blocks should be evicted first.
+    total_unprioritized = len(unprioritized_block_ids)
+    if eviction_target <= total_unprioritized:
+        # Should not have touched any prioritized blocks.
+        if evicted_prioritized == 0:
+            print("\n  PASS: No prioritized blocks evicted (all evictions from LRU)")
+        else:
+            print(
+                f"\n  FAIL: {evicted_prioritized} prioritized blocks "
+                f"evicted when {total_unprioritized} unprioritized "
+                f"were available"
+            )
+    else:
+        # Some prioritized blocks must be evicted.
+        expected_from_priority = eviction_target - total_unprioritized
+        print(
+            f"\n  INFO: Needed {expected_from_priority} from priority "
+            f"queue (exhausted {total_unprioritized} unprioritized)"
+        )
+        if evicted_unprioritized == total_unprioritized:
+            print("  PASS: All unprioritized evicted before any prioritized")
+        else:
+            print(
+                f"  FAIL: Only {evicted_unprioritized}/{total_unprioritized}"
+                f" unprioritized evicted before touching prioritized"
+            )
+
+    # Phase 4: Verify remaining priority queue state.
+    remaining_in_pq = pool.priority_eviction_queue.num_blocks
+    surviving_prioritized = len(prioritized_block_ids) - evicted_prioritized
+    print(f"\n  Surviving prioritized blocks: {surviving_prioritized}")
+    print(f"  Priority queue size: {remaining_in_pq}")
+
+    return evicted_prioritized == 0 or evicted_unprioritized == len(
+        unprioritized_block_ids
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Benchmark retention-based eviction")
+    parser.add_argument(
+        "--num-gpu-blocks", type=int, default=1000, help="Total KV-cache blocks in pool"
+    )
+    parser.add_argument("--block-size", type=int, default=16, help="Tokens per block")
+    parser.add_argument(
+        "--num-sessions", type=int, default=20, help="Number of concurrent sessions"
+    )
+    parser.add_argument(
+        "--blocks-per-session", type=int, default=40, help="Blocks per session"
+    )
+    parser.add_argument(
+        "--priority-fraction",
+        type=float,
+        default=0.3,
+        help="Fraction of blocks with priority (0-1)",
+    )
+    parser.add_argument(
+        "--priority-value",
+        type=int,
+        default=80,
+        help="Priority value for retained blocks (0-100)",
+    )
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("Retention-Based Eviction Benchmark")
+    print("=" * 60)
+    print()
+
+    success = run_simulation(
+        num_gpu_blocks=args.num_gpu_blocks,
+        block_size=args.block_size,
+        num_sessions=args.num_sessions,
+        blocks_per_session=args.blocks_per_session,
+        priority_fraction=args.priority_fraction,
+        priority_value=args.priority_value,
+    )
+
+    print()
+    print("=" * 60)
+    print(f"Result: {'PASS' if success else 'FAIL'}")
+    print("=" * 60)
+
+    return 0 if success else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py
@@ -61,3 +61,88 @@ def test_chat_completion_request_with_tool_choice_but_no_tools(tool_choice):
                 "tools": None,
             }
         )
+
+
+def test_retention_directives_field_round_trips():
+    from vllm.entrypoints.openai.chat_completion.protocol import (
+        ChatCompletionRequest,
+    )
+
+    req = ChatCompletionRequest(
+        model="dummy",
+        messages=[{"role": "user", "content": "hi"}],
+        retention_directives=[{"start": 0, "end": 16, "priority": 80}],
+        retention_scope="alice",
+    )
+    sp = req.to_sampling_params(
+        max_tokens=16,
+        default_sampling_params={},
+    )
+    assert sp.extra_args["retention_directives"] == [
+        {"start": 0, "end": 16, "priority": 80}
+    ]
+    assert sp.extra_args["retention_scope"] == "alice"
+
+
+def _build_request_with_directives(directives):
+    return ChatCompletionRequest(
+        model="dummy",
+        messages=[{"role": "user", "content": "hi"}],
+        retention_directives=directives,
+    )
+
+
+def test_retention_directives_monotonic_priorities_valid():
+    # Strictly decreasing priorities across rising token positions: valid.
+    _build_request_with_directives(
+        [
+            {"start": 0, "end": 100, "priority": 90},
+            {"start": 100, "end": 200, "priority": 60},
+            {"start": 200, "end": 300, "priority": 30},
+        ]
+    )
+
+
+def test_retention_directives_monotonic_priorities_equal_is_valid():
+    # Non-increasing (equal) priorities: valid.
+    _build_request_with_directives(
+        [
+            {"start": 0, "end": 100, "priority": 50},
+            {"start": 100, "end": 200, "priority": 50},
+        ]
+    )
+
+
+def test_retention_directives_empty_or_none_is_valid():
+    _build_request_with_directives(None)
+    _build_request_with_directives([])
+
+
+def test_retention_directives_single_directive_is_valid():
+    _build_request_with_directives([{"start": 0, "end": 16, "priority": 80}])
+
+
+def test_retention_directives_increasing_priority_rejected():
+    with pytest.raises(ValueError, match="non-increasing"):
+        _build_request_with_directives(
+            [
+                {"start": 0, "end": 100, "priority": 30},
+                {"start": 100, "end": 200, "priority": 80},  # increases — invalid
+            ]
+        )
+
+
+def test_retention_directives_unsorted_input_still_validated():
+    # Input order is unsorted, but the validator should sort by start
+    # before checking monotonicity.
+    with pytest.raises(ValueError, match="non-increasing"):
+        _build_request_with_directives(
+            [
+                {"start": 100, "end": 200, "priority": 80},
+                {
+                    "start": 0,
+                    "end": 100,
+                    "priority": 30,
+                },  # sorted: 30 then 80 → invalid
+            ]
+        )
diff --git a/tests/v1/core/conftest.py b/tests/v1/core/conftest.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def _retention_priority_threshold_zero(monkeypatch):
+    """Lower `_PRIORITY_THRESHOLD` to 0 inside every test in this dir.
+
+    The retention priority threshold is a production safety knob (default
+    60) that routes sub-threshold sidecar entries to the LRU rather than
+    the priority queue. The unit tests in this directory use priority=50
+    throughout as a convention to mean "any prioritized entry"; we lower
+    the threshold inside tests so the convention keeps working without
+    rewriting every test fixture. Tests that explicitly exercise the
+    threshold behavior re-set it via monkeypatch in-body.
+    """
+    import vllm.v1.core.priority_eviction_queue as pq_mod
+
+    monkeypatch.setattr(pq_mod, "_PRIORITY_THRESHOLD", 0)