diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 94eacfbdfb30..ea79b4999110 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -56,6 +56,7 @@ def generate(
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
+        toolcall_turn: Optional[int] = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request."""
         ...
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index d391cc50ad23..985d66441418 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -866,6 +866,7 @@ async def _generate_with_builtin_tools(
         **kwargs,
     ):
         orig_priority = priority
+        toolcall_turn = 0
         while True:
             self._log_inputs(
                 request_id,
@@ -879,6 +880,7 @@ async def _generate_with_builtin_tools(
                 request_id,
                 lora_request=lora_request,
                 priority=priority,
+                toolcall_turn=toolcall_turn,
                 **kwargs,
             )
             async for res in generator:
@@ -908,6 +910,7 @@ async def _generate_with_builtin_tools(
                 prompt_token_ids)
             # OPTIMIZATION
             priority = orig_priority - 1
+            toolcall_turn += 1
 
     def _log_inputs(
         self,
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 3a0fbb5e5c41..a8127e8226f9 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -26,8 +26,8 @@ class KVCacheBlocks:
     """
     blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens.
     We don't use block of tokens as the outer dimension because it assumes all
-    kv_cache_groups have the same number of blocks, which is true for now but 
-    will be broken if we want to give different block_size to different 
+    kv_cache_groups have the same number of blocks, which is true for now but
+    will be broken if we want to give different block_size to different
     kv_cache_groups in the future.
     """
 
@@ -187,6 +187,11 @@ def get_computed_blocks(self,
             self.prefix_cache_stats.queries += request.num_tokens
             self.prefix_cache_stats.hits += num_new_computed_tokens
 
+        # log tool call cache stats for turn > 0
+        if request.toolcall_turn is not None and request.toolcall_turn > 0:
+            self.prefix_cache_stats.toolcall_non_1st_turn_hits += num_new_computed_tokens
+            self.prefix_cache_stats.toolcall_non_1st_turn_queries += request.num_tokens
+
         return KVCacheBlocks(computed_blocks), num_new_computed_tokens
 
     def allocate_slots(
@@ -208,10 +213,10 @@ def allocate_slots(
                 already been computed locally (i.e. new_computed_blocks).
             num_new_computed_tokens: The number of new computed tokens just
                 hitting the prefix caching, excluding external tokens.
-            new_computed_blocks: The cached blocks for the above new computed 
+            new_computed_blocks: The cached blocks for the above new computed
                 tokens.
             num_lookahead_tokens: The number of speculative tokens to allocate.
-                This is used by spec decode proposers with kv-cache such 
+                This is used by spec decode proposers with kv-cache such
                 as eagle.
             delay_cache_blocks: Whether to skip caching the blocks. This is
                 used by P/D when allocating blocks used in a KV transfer
@@ -364,7 +369,7 @@ def get_num_common_prefix_blocks(
             requests in the current step.
 
         Returns:
-            list[int]: The number of common prefix blocks for each kv cache 
+            list[int]: The number of common prefix blocks for each kv cache
                 group.
         """
         assert request.status == RequestStatus.RUNNING
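Taken together, the serving-layer and KV-cache-manager changes above define the turn numbering the new counters rely on: _generate_with_builtin_tools starts every request at toolcall_turn = 0 and increments it each time a built-in tool result is fed back for another generation, while get_computed_blocks only attributes prefix-cache queries and hits to the new counters when the turn index is greater than zero. A minimal stand-alone sketch of that numbering (the helper below is illustrative only and not part of this change):

def generation_turns(num_builtin_tool_calls: int) -> list[int]:
    """Turn index carried by each engine generation of a single request."""
    turns: list[int] = []
    toolcall_turn = 0
    while True:
        turns.append(toolcall_turn)  # one call into the engine at this turn
        if len(turns) > num_builtin_tool_calls:
            break  # the model asked for no further built-in tool call
        toolcall_turn += 1  # the next generation is a follow-up turn
    return turns

# A request that triggers two built-in tool calls produces three generations;
# only the generations at turns 1 and 2 feed the toolcall_non_1st_turn_* stats.
assert generation_turns(2) == [0, 1, 2]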
""" assert request.status == RequestStatus.RUNNING diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index dec4abec519b..d07067f41be6 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -56,6 +56,7 @@ class EngineCoreRequest( lora_request: Optional[LoRARequest] cache_salt: Optional[str] data_parallel_rank: Optional[int] + toolcall_turn: Optional[int] # Index of the client, used to ensure outputs are sent back to the same # client for this request when scaling out the front-end. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a9ced402b974..94850a21aad0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -271,6 +271,7 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, + toolcall_turn: Optional[int] = None, ) -> RequestOutputCollector: """Add new request to the AsyncLLM.""" @@ -285,7 +286,7 @@ async def add_request( # Convert Input --> Request. prompt_str, request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, - tokenization_kwargs, trace_headers, priority, data_parallel_rank) + tokenization_kwargs, trace_headers, priority, data_parallel_rank, toolcall_turn) if is_pooling or params.n == 1: await self._add_request(request, prompt_str, None, 0, queue) @@ -331,6 +332,7 @@ async def generate( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, + toolcall_turn: Optional[int] = None, ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request @@ -378,6 +380,7 @@ async def generate( priority=priority, tokenization_kwargs=tokenization_kwargs, data_parallel_rank=data_parallel_rank, + toolcall_turn=toolcall_turn, ) # The output_handler task pushes items into the queue. diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index f3fad15b750a..c7f088f966aa 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -162,7 +162,7 @@ def _validate_multi_modal_uuids(self, prompt: PromptType) -> None: """ Validate that user-provided multi_modal_uuids align with multi_modal_data in the incoming request prompt(s). - Only checks lengths; `None` entries are allowed and will be + Only checks lengths; `None` entries are allowed and will be auto-hashed downstream. """ @@ -322,6 +322,7 @@ def process_inputs( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, + toolcall_turn: Optional[int] = None, ) -> tuple[Optional[str], EngineCoreRequest]: # TODO(woosuk): Support pooling models. @@ -434,6 +435,7 @@ def process_inputs( priority=priority, data_parallel_rank=data_parallel_rank, trace_headers=trace_headers, + toolcall_turn=toolcall_turn, ) def _validate_model_inputs(self, diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index e6c344d193df..990a70c6a8b7 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -25,6 +25,10 @@ class PrefixCacheStats: # The number of hits in these requests. 
     hits: int = 0
 
+    # tool call specific stats
+    toolcall_non_1st_turn_hits: int = 0
+    toolcall_non_1st_turn_queries: int = 0
+
 
 @dataclass
 class SchedulerStats:
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 4e3e581235cc..fdd3f97743de 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -39,6 +39,7 @@ def __init__(
         trace_headers: Optional[Mapping[str, str]] = None,
         block_hasher: Optional[Callable[["Request"],
                                         list["BlockHash"]]] = None,
+        toolcall_turn: Optional[int] = None,
     ) -> None:
         self.request_id = request_id
         self.client_index = client_index
@@ -113,6 +114,7 @@ def __init__(
         if block_hasher is not None:
             self.get_hash_new_full_blocks = partial(block_hasher, self)
             self.block_hashes = self.get_hash_new_full_blocks()
+        self.toolcall_turn = toolcall_turn
 
     @classmethod
     def from_engine_core_request(
@@ -136,6 +138,7 @@ def from_engine_core_request(
             priority=request.priority,
             trace_headers=request.trace_headers,
             block_hasher=block_hasher,
+            toolcall_turn=request.toolcall_turn,
         )
 
     def append_output_token_ids(
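With the two counters added to PrefixCacheStats, a prefix-cache hit rate restricted to follow-up tool-call turns can be derived in the same way as the existing overall hits / queries ratio. A small illustrative sketch (the helper function and the directly constructed stats object are not part of this change):

from vllm.v1.metrics.stats import PrefixCacheStats

def toolcall_followup_hit_rate(stats: PrefixCacheStats) -> float:
    """Prefix-cache hit rate over tool-call turns after the first."""
    if stats.toolcall_non_1st_turn_queries == 0:
        return 0.0
    return (stats.toolcall_non_1st_turn_hits /
            stats.toolcall_non_1st_turn_queries)

# e.g. 3000 of the 4000 tokens looked up on follow-up turns were already cached
stats = PrefixCacheStats(queries=5000, hits=3200,
                         toolcall_non_1st_turn_queries=4000,
                         toolcall_non_1st_turn_hits=3000)
assert toolcall_followup_hit_rate(stats) == 0.75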