Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9a3efe8
Retention API: scaffold PriorityEvictionQueue with sidecar storage
hyeongyun0916 May 14, 2026
11b4836
Retention API: implement try_insert and pop_lowest with min-heap
hyeongyun0916 May 14, 2026
5e6247d
Retention API: test eviction order by priority and freed-time tiebreak
hyeongyun0916 May 14, 2026
1438b62
Retention API: remove() with lazy delete; pop_lowest skips stale entries
hyeongyun0916 May 14, 2026
c135b79
Retention API: honor expiry in pop_lowest
hyeongyun0916 May 14, 2026
645b89e
Retention API: apply_directives with overlap matching and duration
hyeongyun0916 May 14, 2026
f89917b
Retention API: scope-based ownership in apply_directives
hyeongyun0916 May 14, 2026
5b2edf1
Retention API: cleanup hooks clear_priority, clear, __contains__
hyeongyun0916 May 14, 2026
55a1a39
Retention API: wire PriorityEvictionQueue into BlockPool
hyeongyun0916 May 14, 2026
a0c6150
Retention API: wire cache_full_blocks to apply retention directives
hyeongyun0916 May 14, 2026
091b8dc
Retention API: priority-aware allocation in get_new_blocks
hyeongyun0916 May 14, 2026
3411297
Retention API: revert reset_prefix_cache change from priority-aware a…
hyeongyun0916 May 14, 2026
dbbeae2
Retention API: touch removes from priority queue when block was there
hyeongyun0916 May 14, 2026
b2b40ba
Retention API: route prioritized blocks via try_insert in free_blocks
hyeongyun0916 May 14, 2026
b10c395
Retention API: clear sidecar on evict_blocks
hyeongyun0916 May 14, 2026
4f93efb
Retention API: add retention_directives + retention_scope chat fields
hyeongyun0916 May 14, 2026
b5a5345
Retention API: validator enforces non-increasing priorities
hyeongyun0916 May 14, 2026
e582d5f
Retention API: structural-invariant tests for sidecar pattern
hyeongyun0916 May 14, 2026
85a85f1
Retention API: port throughput benchmark with sidecar access patterns
hyeongyun0916 May 14, 2026
18a77e9
[V1][Core] Fix priority-inversion in PriorityEvictionQueue via per-bl…
hyeongyun0916 Jun 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
233 changes: 233 additions & 0 deletions benchmarks/benchmark_retention_eviction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Simulate agentic multi-turn eviction with retention directives.

This script validates that the two-structure evictor (LRU + priority queue)
correctly protects prioritized blocks under memory pressure. It simulates
a workload where:

1. Multiple "sessions" allocate blocks (simulating multi-turn agentic
requests with growing context).
2. Some blocks receive retention priority (simulating system-prompt and
tool-call-awaiting blocks that the orchestrator wants to protect).
3. Memory pressure forces eviction.
4. We measure whether prioritized blocks survive longer than unprioritized
ones.

No GPU required — runs against the actual BlockPool implementation.

Usage:
python benchmarks/benchmark_retention_eviction.py \
--num-gpu-blocks 1000 --block-size 16 --num-sessions 20
"""

import argparse
import time

from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import KVCacheBlock
from vllm.v1.core.priority_eviction_queue import RetentionMeta


def run_simulation(
num_gpu_blocks: int,
block_size: int,
num_sessions: int,
blocks_per_session: int,
priority_fraction: float,
priority_value: int,
):
"""Run the eviction simulation.

Args:
num_gpu_blocks: Total blocks in the pool.
block_size: Tokens per block (for hash_block_size).
num_sessions: Number of concurrent sessions to simulate.
blocks_per_session: Blocks allocated per session.
priority_fraction: Fraction of each session's blocks that get
retention priority (0.0 - 1.0).
priority_value: Priority value to assign (0-100).
"""
pool = BlockPool(
num_gpu_blocks=num_gpu_blocks,
enable_caching=True,
hash_block_size=block_size,
)

# Track which blocks are prioritized vs not.
prioritized_block_ids: set[int] = set()
unprioritized_block_ids: set[int] = set()

sessions: list[list[KVCacheBlock]] = []
total_allocated = 0

print(f"Pool: {num_gpu_blocks} blocks, {num_gpu_blocks - 1} usable")
print(f"Sessions: {num_sessions}, {blocks_per_session} blocks each")
print(
f"Priority: {priority_fraction * 100:.0f}% of blocks at "
f"priority={priority_value}"
)
print(f"Total demand: {num_sessions * blocks_per_session} blocks")
print()

# Phase 1: Allocate sessions until we run out of blocks.
for i in range(num_sessions):
free = pool.get_num_free_blocks()
if free < blocks_per_session:
print(
f" Session {i}: only {free} free blocks, "
f"need {blocks_per_session} — stopping allocation"
)
break

blocks = pool.get_new_blocks(blocks_per_session)
total_allocated += len(blocks)

# Mark some blocks as prioritized.
num_priority = int(len(blocks) * priority_fraction)
for j, block in enumerate(blocks):
if j < num_priority:
pool.priority_eviction_queue._meta[block.block_id] = RetentionMeta(
priority=priority_value,
expiry=None,
scope=None,
last_freed_time=0.0,
)
prioritized_block_ids.add(block.block_id)
else:
unprioritized_block_ids.add(block.block_id)

sessions.append(blocks)
print(
f" Session {i}: allocated {len(blocks)} blocks "
f"({num_priority} prioritized), "
f"{pool.get_num_free_blocks()} free remaining"
)

print(f"\nTotal allocated: {total_allocated}")
print(f" Prioritized: {len(prioritized_block_ids)}")
print(f" Unprioritized: {len(unprioritized_block_ids)}")

# Phase 2: Free all sessions (simulating requests completing).
for session_blocks in sessions:
pool.free_blocks(session_blocks)

print("\nAfter freeing all sessions:")
print(f" LRU free list: {pool.free_block_queue.num_free_blocks}")
print(f" Priority queue: {pool.priority_eviction_queue.num_blocks}")
print(f" Total free: {pool.get_num_free_blocks()}")

# Phase 3: Allocate under pressure and track which blocks get evicted.
eviction_target = total_allocated // 2
print(f"\nEvicting {eviction_target} blocks (allocating new ones)...")

evicted_prioritized = 0
evicted_unprioritized = 0
evicted_other = 0

t_start = time.monotonic()
new_blocks = pool.get_new_blocks(eviction_target)
t_elapsed = time.monotonic() - t_start

for block in new_blocks:
bid = block.block_id
if bid in unprioritized_block_ids:
evicted_unprioritized += 1
elif bid in prioritized_block_ids:
evicted_prioritized += 1
else:
evicted_other += 1

print(f" Time: {t_elapsed * 1000:.2f} ms")
print(f" Evicted from unprioritized: {evicted_unprioritized}")
print(f" Evicted from prioritized: {evicted_prioritized}")
print(f" Evicted from other (null/fresh): {evicted_other}")

# Verify correctness: unprioritized blocks should be evicted first.
total_unprioritized = len(unprioritized_block_ids)
if eviction_target <= total_unprioritized:
# Should not have touched any prioritized blocks.
if evicted_prioritized == 0:
print("\n PASS: No prioritized blocks evicted (all evictions from LRU)")
else:
print(
f"\n FAIL: {evicted_prioritized} prioritized blocks "
f"evicted when {total_unprioritized} unprioritized "
f"were available"
)
else:
# Some prioritized blocks must be evicted.
expected_from_priority = eviction_target - total_unprioritized
print(
f"\n INFO: Needed {expected_from_priority} from priority "
f"queue (exhausted {total_unprioritized} unprioritized)"
)
if evicted_unprioritized == total_unprioritized:
print(" PASS: All unprioritized evicted before any prioritized")
else:
print(
f" FAIL: Only {evicted_unprioritized}/{total_unprioritized}"
f" unprioritized evicted before touching prioritized"
)

# Phase 4: Verify remaining priority queue state.
remaining_in_pq = pool.priority_eviction_queue.num_blocks
surviving_prioritized = len(prioritized_block_ids) - evicted_prioritized
print(f"\n Surviving prioritized blocks: {surviving_prioritized}")
print(f" Priority queue size: {remaining_in_pq}")

return evicted_prioritized == 0 or evicted_unprioritized == len(
unprioritized_block_ids
)


def main():
parser = argparse.ArgumentParser(description="Benchmark retention-based eviction")
parser.add_argument(
"--num-gpu-blocks", type=int, default=1000, help="Total KV-cache blocks in pool"
)
parser.add_argument("--block-size", type=int, default=16, help="Tokens per block")
parser.add_argument(
"--num-sessions", type=int, default=20, help="Number of concurrent sessions"
)
parser.add_argument(
"--blocks-per-session", type=int, default=40, help="Blocks per session"
)
parser.add_argument(
"--priority-fraction",
type=float,
default=0.3,
help="Fraction of blocks with priority (0-1)",
)
parser.add_argument(
"--priority-value",
type=int,
default=80,
help="Priority value for retained blocks (0-100)",
)
args = parser.parse_args()

print("=" * 60)
print("Retention-Based Eviction Benchmark")
print("=" * 60)
print()

success = run_simulation(
num_gpu_blocks=args.num_gpu_blocks,
block_size=args.block_size,
num_sessions=args.num_sessions,
blocks_per_session=args.blocks_per_session,
priority_fraction=args.priority_fraction,
priority_value=args.priority_value,
)

print()
print("=" * 60)
print(f"Result: {'PASS' if success else 'FAIL'}")
print("=" * 60)

return 0 if success else 1


if __name__ == "__main__":
raise SystemExit(main())
85 changes: 85 additions & 0 deletions tests/tool_use/test_chat_completion_request_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,88 @@ def test_chat_completion_request_with_tool_choice_but_no_tools(tool_choice):
"tools": None,
}
)


def test_retention_directives_field_round_trips():
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)

req = ChatCompletionRequest(
model="dummy",
messages=[{"role": "user", "content": "hi"}],
retention_directives=[{"start": 0, "end": 16, "priority": 80}],
retention_scope="alice",
)
sp = req.to_sampling_params(
max_tokens=16,
default_sampling_params={},
)
assert sp.extra_args["retention_directives"] == [
{"start": 0, "end": 16, "priority": 80}
]
assert sp.extra_args["retention_scope"] == "alice"


def _build_request_with_directives(directives):
return ChatCompletionRequest(
model="dummy",
messages=[{"role": "user", "content": "hi"}],
retention_directives=directives,
)


def test_retention_directives_monotonic_priorities_valid():
# Strictly decreasing priorities across rising token positions: valid.
_build_request_with_directives(
[
{"start": 0, "end": 100, "priority": 90},
{"start": 100, "end": 200, "priority": 60},
{"start": 200, "end": 300, "priority": 30},
]
)


def test_retention_directives_monotonic_priorities_equal_is_valid():
# Non-increasing (equal) priorities: valid.
_build_request_with_directives(
[
{"start": 0, "end": 100, "priority": 50},
{"start": 100, "end": 200, "priority": 50},
]
)


def test_retention_directives_empty_or_none_is_valid():
_build_request_with_directives(None)
_build_request_with_directives([])


def test_retention_directives_single_directive_is_valid():
_build_request_with_directives([{"start": 0, "end": 16, "priority": 80}])


def test_retention_directives_increasing_priority_rejected():
with pytest.raises(ValueError, match="non-increasing"):
_build_request_with_directives(
[
{"start": 0, "end": 100, "priority": 30},
{"start": 100, "end": 200, "priority": 80}, # increases — invalid
]
)


def test_retention_directives_unsorted_input_still_validated():
# Input order is unsorted, but the validator should sort by start
# before checking monotonicity.
with pytest.raises(ValueError, match="non-increasing"):
_build_request_with_directives(
[
{"start": 100, "end": 200, "priority": 80},
{
"start": 0,
"end": 100,
"priority": 30,
}, # sorted: 30 then 80 → invalid
]
)
21 changes: 21 additions & 0 deletions tests/v1/core/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest


@pytest.fixture(autouse=True)
def _retention_priority_threshold_zero(monkeypatch):
"""Lower `_PRIORITY_THRESHOLD` to 0 inside every test in this dir.

The retention priority threshold is a production safety knob (default
60) that routes sub-threshold sidecar entries to the LRU rather than
the priority queue. The unit tests in this directory use priority=50
throughout as a convention to mean "any prioritized entry"; we lower
the threshold inside tests so the convention keeps working without
rewriting every test fixture. Tests that explicitly exercise the
threshold behavior re-set it via monkeypatch in-body.
"""
import vllm.v1.core.priority_eviction_queue as pq_mod

monkeypatch.setattr(pq_mod, "_PRIORITY_THRESHOLD", 0)
Loading