1 change: 1 addition & 0 deletions vllm/engine/protocol.py
@@ -56,6 +56,7 @@ def generate(
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
toolcall_turn: Optional[int] = None,
Collaborator commented:
I don't think it's a good idea to expose "toolcall" as a concept to the engine; it's better to let the engine focus on single-turn scheduling while we influence its scheduling behavior through setting priority.

Is it possible to prevent this, e.g. by introducing some API-server-level stats?

Author replied:

Makes sense, thanks! Let me close this PR for now so we can have more discussion.

) -> AsyncGenerator[RequestOutput, None]:
"""Generate outputs for a request."""
...
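For context on the review discussion above, here is a minimal sketch of what the suggested API-server-level accounting could look like, assuming the serving layer already knows the turn index and the number of prefix-cached tokens for each request; the class and method names below are hypothetical and not an existing vLLM API.

```python
# Hypothetical sketch of the reviewer's alternative: keep the engine unaware of
# tool calls and accumulate turn-level stats at the API-server layer instead.
from dataclasses import dataclass


@dataclass
class ToolCallTurnStats:
    """Server-side counters for follow-up tool-call turns (hypothetical)."""
    non_first_turn_queries: int = 0
    non_first_turn_hits: int = 0

    def record(self, turn: int, num_prompt_tokens: int,
               num_cached_tokens: int) -> None:
        # Only follow-up turns (turn > 0) are relevant for measuring how much
        # of the previous turn's KV cache is reused.
        if turn > 0:
            self.non_first_turn_queries += num_prompt_tokens
            self.non_first_turn_hits += num_cached_tokens
```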
3 changes: 3 additions & 0 deletions vllm/entrypoints/openai/serving_engine.py
@@ -866,6 +866,7 @@ async def _generate_with_builtin_tools(
**kwargs,
):
orig_priority = priority
toolcall_turn = 0
while True:
self._log_inputs(
request_id,
Expand All @@ -879,6 +880,7 @@ async def _generate_with_builtin_tools(
request_id,
lora_request=lora_request,
priority=priority,
toolcall_turn=toolcall_turn,
**kwargs,
)
async for res in generator:
@@ -908,6 +910,7 @@ async def _generate_with_builtin_tools(
prompt_token_ids)
# OPTIMIZATION
priority = orig_priority - 1
toolcall_turn += 1

def _log_inputs(
self,
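As a reading aid, a standalone sketch (not vLLM code) of how the two values set in the loop above evolve across turns; whether a lower priority value means the request is scheduled earlier depends on the scheduler's priority policy and is not asserted here.

```python
# Illustration of the loop above: the first turn runs at the caller's priority,
# every follow-up turn uses orig_priority - 1 and increments the turn counter.
orig_priority = 0
priority = orig_priority
toolcall_turn = 0
for _ in range(3):  # pretend the model requests a builtin tool three times
    print(f"turn={toolcall_turn}, priority={priority}")
    priority = orig_priority - 1  # adjusted priority for follow-up turns
    toolcall_turn += 1
# Prints turn=0 at priority 0, then turns 1 and 2 at priority -1.
```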
15 changes: 10 additions & 5 deletions vllm/v1/core/kv_cache_manager.py
@@ -26,8 +26,8 @@
"""
blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens.
We don't use block of tokens as the outer dimension because it assumes all
kv_cache_groups have the same number of blocks, which is true for now but
will be broken if we want to give different block_size to different
kv_cache_groups have the same number of blocks, which is true for now but
will be broken if we want to give different block_size to different
kv_cache_groups in the future.
"""

@@ -187,6 +187,11 @@
self.prefix_cache_stats.queries += request.num_tokens
self.prefix_cache_stats.hits += num_new_computed_tokens

# log tool call cache stats for turn > 0
if request.toolcall_turn is not None and request.toolcall_turn > 0:
self.prefix_cache_stats.toolcall_non_1st_turn_hits += num_new_computed_tokens

[CI failure: GitHub Actions / pre-commit, Ruff (E501): vllm/v1/core/kv_cache_manager.py:192:81: Line too long (93 > 80)]
self.prefix_cache_stats.toolcall_non_1st_turn_queries += request.num_tokens

[CI failure: GitHub Actions / pre-commit, Ruff (E501): vllm/v1/core/kv_cache_manager.py:193:81: Line too long (91 > 80)]

return KVCacheBlocks(computed_blocks), num_new_computed_tokens

def allocate_slots(
@@ -208,10 +213,10 @@
already been computed locally (i.e. new_computed_blocks).
num_new_computed_tokens: The number of new computed tokens just
hitting the prefix caching, excluding external tokens.
new_computed_blocks: The cached blocks for the above new computed
new_computed_blocks: The cached blocks for the above new computed
tokens.
num_lookahead_tokens: The number of speculative tokens to allocate.
This is used by spec decode proposers with kv-cache such
This is used by spec decode proposers with kv-cache such
as eagle.
delay_cache_blocks: Whether to skip caching the blocks. This is
used by P/D when allocating blocks used in a KV transfer
@@ -364,7 +369,7 @@
requests in the current step.

Returns:
list[int]: The number of common prefix blocks for each kv cache
list[int]: The number of common prefix blocks for each kv cache
group.
"""
assert request.status == RequestStatus.RUNNING
1 change: 1 addition & 0 deletions vllm/v1/engine/__init__.py
@@ -56,6 +56,7 @@ class EngineCoreRequest(
lora_request: Optional[LoRARequest]
cache_salt: Optional[str]
data_parallel_rank: Optional[int]
toolcall_turn: Optional[int]

# Index of the client, used to ensure outputs are sent back to the same
# client for this request when scaling out the front-end.
5 changes: 4 additions & 1 deletion vllm/v1/engine/async_llm.py
@@ -271,6 +271,7 @@ async def add_request(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
toolcall_turn: Optional[int] = None,
) -> RequestOutputCollector:
"""Add new request to the AsyncLLM."""

@@ -285,7 +286,7 @@ async def add_request(
# Convert Input --> Request.
prompt_str, request = self.processor.process_inputs(
request_id, prompt, params, arrival_time, lora_request,
tokenization_kwargs, trace_headers, priority, data_parallel_rank)
tokenization_kwargs, trace_headers, priority, data_parallel_rank, toolcall_turn)

if is_pooling or params.n == 1:
await self._add_request(request, prompt_str, None, 0, queue)
@@ -331,6 +332,7 @@ async def generate(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
toolcall_turn: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]:
"""
Main function called by the API server to kick off a request
@@ -378,6 +380,7 @@ async def generate(
priority=priority,
tokenization_kwargs=tokenization_kwargs,
data_parallel_rank=data_parallel_rank,
toolcall_turn=toolcall_turn,
)

# The output_handler task pushes items into the queue.
4 changes: 3 additions & 1 deletion vllm/v1/engine/processor.py
@@ -162,7 +162,7 @@ def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
"""
Validate that user-provided multi_modal_uuids align with
multi_modal_data in the incoming request prompt(s).
Only checks lengths; `None` entries are allowed and will be
Only checks lengths; `None` entries are allowed and will be
auto-hashed downstream.
"""

@@ -322,6 +322,7 @@ def process_inputs(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
toolcall_turn: Optional[int] = None,
) -> tuple[Optional[str], EngineCoreRequest]:

# TODO(woosuk): Support pooling models.
@@ -434,6 +435,7 @@ def process_inputs(
priority=priority,
data_parallel_rank=data_parallel_rank,
trace_headers=trace_headers,
toolcall_turn=toolcall_turn,
)

def _validate_model_inputs(self,
4 changes: 4 additions & 0 deletions vllm/v1/metrics/stats.py
@@ -25,6 +25,10 @@ class PrefixCacheStats:
# The number of hits in these requests.
hits: int = 0

# tool call specific stats
toolcall_non_1st_turn_hits: int = 0
toolcall_non_1st_turn_queries: int = 0


@dataclass
class SchedulerStats:
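For reference, a small helper (illustrative only, not part of the PR) showing how the two new counters could be reduced to a single hit-rate metric for non-first tool-call turns.

```python
# Assumes the PR's new PrefixCacheStats fields exist on this class.
from vllm.v1.metrics.stats import PrefixCacheStats


def toolcall_followup_hit_rate(stats: PrefixCacheStats) -> float:
    """Prefix-cache hit rate restricted to follow-up tool-call turns."""
    if stats.toolcall_non_1st_turn_queries == 0:
        return 0.0
    return (stats.toolcall_non_1st_turn_hits /
            stats.toolcall_non_1st_turn_queries)
```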
3 changes: 3 additions & 0 deletions vllm/v1/request.py
@@ -39,6 +39,7 @@ def __init__(
trace_headers: Optional[Mapping[str, str]] = None,
block_hasher: Optional[Callable[["Request"],
list["BlockHash"]]] = None,
toolcall_turn: Optional[int] = None,
) -> None:
self.request_id = request_id
self.client_index = client_index
@@ -113,6 +114,7 @@ def __init__(
if block_hasher is not None:
self.get_hash_new_full_blocks = partial(block_hasher, self)
self.block_hashes = self.get_hash_new_full_blocks()
self.toolcall_turn = toolcall_turn

@classmethod
def from_engine_core_request(
@@ -136,6 +138,7 @@ def from_engine_core_request(
priority=request.priority,
trace_headers=request.trace_headers,
block_hasher=block_hasher,
toolcall_turn=request.toolcall_turn,
)

def append_output_token_ids(