1 change: 1 addition & 0 deletions vllm/engine/protocol.py
@@ -56,6 +56,7 @@ def generate(
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
toolcall_turn: Optional[int] = None,
Collaborator commented:
I don't think it's a good idea to expose "toolcall" as a concept to the engine; it's better to let the engine focus on single-turn scheduling while we influence its scheduling behavior through setting priority.

Is it possible to prevent this, e.g. by introducing some API-server-level stats?

Author replied:

Makes sense, thanks! Let me close this PR for now so we can have more discussion.

) -> AsyncGenerator[RequestOutput, None]:
"""Generate outputs for a request."""
...
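For context on the review discussion above, here is a minimal sketch of what the suggested API-server-level accounting could look like, assuming the serving layer already knows the turn index and the number of prefix-cached tokens for each request; the class and method names below are hypothetical and not an existing vLLM API.

```python
# Hypothetical sketch of the reviewer's alternative: keep the engine unaware of
# tool calls and accumulate turn-level stats at the API-server layer instead.
from dataclasses import dataclass


@dataclass
class ToolCallTurnStats:
    """Server-side counters for follow-up tool-call turns (hypothetical)."""
    non_first_turn_queries: int = 0
    non_first_turn_hits: int = 0

    def record(self, turn: int, num_prompt_tokens: int,
               num_cached_tokens: int) -> None:
        # Only follow-up turns (turn > 0) are relevant for measuring how much
        # of the previous turn's KV cache is reused.
        if turn > 0:
            self.non_first_turn_queries += num_prompt_tokens
            self.non_first_turn_hits += num_cached_tokens
```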
3 changes: 3 additions & 0 deletions vllm/entrypoints/openai/serving_engine.py
@@ -866,6 +866,7 @@ async def _generate_with_builtin_tools(
**kwargs,
):
orig_priority = priority
toolcall_turn = 0
while True:
self._log_inputs(
request_id,
Expand All @@ -879,6 +880,7 @@ async def _generate_with_builtin_tools(
request_id,
lora_request=lora_request,
priority=priority,
toolcall_turn=toolcall_turn,
**kwargs,
)
async for res in generator:
@@ -908,6 +910,7 @@ async def _generate_with_builtin_tools(
prompt_token_ids)
# OPTIMIZATION
priority = orig_priority - 1
toolcall_turn += 1

def _log_inputs(
self,
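As a reading aid, a standalone sketch (not vLLM code) of how the two values set in the loop above evolve across turns; whether a lower priority value means the request is scheduled earlier depends on the scheduler's priority policy and is not asserted here.

```python
# Illustration of the loop above: the first turn runs at the caller's priority,
# every follow-up turn uses orig_priority - 1 and increments the turn counter.
orig_priority = 0
priority = orig_priority
toolcall_turn = 0
for _ in range(3):  # pretend the model requests a builtin tool three times
    print(f"turn={toolcall_turn}, priority={priority}")
    priority = orig_priority - 1  # adjusted priority for follow-up turns
    toolcall_turn += 1
# Prints turn=0 at priority 0, then turns 1 and 2 at priority -1.
```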
15 changes: 10 additions & 5 deletions vllm/v1/core/kv_cache_manager.py
@@ -26,8 +26,8 @@
"""
blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens.
We don't use block of tokens as the outer dimension because it assumes all
kv_cache_groups have the same number of blocks, which is true for now but
will be broken if we want to give different block_size to different
kv_cache_groups have the same number of blocks, which is true for now but
will be broken if we want to give different block_size to different
kv_cache_groups in the future.
"""

@@ -187,6 +187,11 @@
self.prefix_cache_stats.queries += request.num_tokens
self.prefix_cache_stats.hits += num_new_computed_tokens

# log tool call cache stats for turn > 0
if request.toolcall_turn is not None and request.toolcall_turn > 0:
self.prefix_cache_stats.toolcall_non_1st_turn_hits += num_new_computed_tokens

[CI failure: GitHub Actions / pre-commit, Ruff (E501): vllm/v1/core/kv_cache_manager.py:192:81: Line too long (93 > 80)]
self.prefix_cache_stats.toolcall_non_1st_turn_queries += request.num_tokens

[CI failure: GitHub Actions / pre-commit, Ruff (E501): vllm/v1/core/kv_cache_manager.py:193:81: Line too long (91 > 80)]

return KVCacheBlocks(computed_blocks), num_new_computed_tokens

def allocate_slots(
@@ -208,10 +213,10 @@
already been computed locally (i.e. new_computed_blocks).
num_new_computed_tokens: The number of new computed tokens just
hitting the prefix caching, excluding external tokens.
new_computed_blocks: The cached blocks for the above new computed
new_computed_blocks: The cached blocks for the above new computed
tokens.
num_lookahead_tokens: The number of speculative tokens to allocate.
This is used by spec decode proposers with kv-cache such
This is used by spec decode proposers with kv-cache such
as eagle.
delay_cache_blocks: Whether to skip caching the blocks. This is
used by P/D when allocating blocks used in a KV transfer
@@ -364,7 +369,7 @@
requests in the current step.

Returns:
list[int]: The number of common prefix blocks for each kv cache
list[int]: The number of common prefix blocks for each kv cache
group.
"""
assert request.status == RequestStatus.RUNNING
1 change: 1 addition & 0 deletions vllm/v1/engine/__init__.py
@@ -56,6 +56,7 @@ class EngineCoreRequest(
lora_request: Optional[LoRARequest]
cache_salt: Optional[str]
data_parallel_rank: Optional[int]
toolcall_turn: Optional[int]

# Index of the client, used to ensure outputs are sent back to the same
# client for this request when scaling out the front-end.
5 changes: 4 additions & 1 deletion vllm/v1/engine/async_llm.py
@@ -271,6 +271,7 @@ async def add_request(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
toolcall_turn: Optional[int] = None,
) -> RequestOutputCollector:
"""Add new request to the AsyncLLM."""

@@ -285,7 +286,7 @@ async def add_request(
# Convert Input --> Request.
prompt_str, request = self.processor.process_inputs(
request_id, prompt, params, arrival_time, lora_request,
tokenization_kwargs, trace_headers, priority, data_parallel_rank)
tokenization_kwargs, trace_headers, priority, data_parallel_rank, toolcall_turn)

if is_pooling or params.n == 1:
await self._add_request(request, prompt_str, None, 0, queue)
@@ -331,6 +332,7 @@ async def generate(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
toolcall_turn: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]:
"""
Main function called by the API server to kick off a request
@@ -378,6 +380,7 @@ async def generate(
priority=priority,
tokenization_kwargs=tokenization_kwargs,
data_parallel_rank=data_parallel_rank,
toolcall_turn=toolcall_turn,
)

# The output_handler task pushes items into the queue.
4 changes: 3 additions & 1 deletion vllm/v1/engine/processor.py
@@ -162,7 +162,7 @@ def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
"""
Validate that user-provided multi_modal_uuids align with
multi_modal_data in the incoming request prompt(s).
Only checks lengths; `None` entries are allowed and will be
Only checks lengths; `None` entries are allowed and will be
auto-hashed downstream.
"""

@@ -322,6 +322,7 @@ def process_inputs(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
toolcall_turn: Optional[int] = None,
) -> tuple[Optional[str], EngineCoreRequest]:

# TODO(woosuk): Support pooling models.
@@ -434,6 +435,7 @@ def process_inputs(
priority=priority,
data_parallel_rank=data_parallel_rank,
trace_headers=trace_headers,
toolcall_turn=toolcall_turn,
)

def _validate_model_inputs(self,
4 changes: 4 additions & 0 deletions vllm/v1/metrics/stats.py
@@ -25,6 +25,10 @@ class PrefixCacheStats:
# The number of hits in these requests.
hits: int = 0

# tool call specific stats
toolcall_non_1st_turn_hits: int = 0
toolcall_non_1st_turn_queries: int = 0


@dataclass
class SchedulerStats:
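For reference, a small helper (illustrative only, not part of the PR) showing how the two new counters could be reduced to a single hit-rate metric for non-first tool-call turns.

```python
# Assumes the PR's new PrefixCacheStats fields exist on this class.
from vllm.v1.metrics.stats import PrefixCacheStats


def toolcall_followup_hit_rate(stats: PrefixCacheStats) -> float:
    """Prefix-cache hit rate restricted to follow-up tool-call turns."""
    if stats.toolcall_non_1st_turn_queries == 0:
        return 0.0
    return (stats.toolcall_non_1st_turn_hits /
            stats.toolcall_non_1st_turn_queries)
```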
3 changes: 3 additions & 0 deletions vllm/v1/request.py
@@ -39,6 +39,7 @@ def __init__(
trace_headers: Optional[Mapping[str, str]] = None,
block_hasher: Optional[Callable[["Request"],
list["BlockHash"]]] = None,
toolcall_turn: Optional[int] = None,
) -> None:
self.request_id = request_id
self.client_index = client_index
@@ -113,6 +114,7 @@ def __init__(
if block_hasher is not None:
self.get_hash_new_full_blocks = partial(block_hasher, self)
self.block_hashes = self.get_hash_new_full_blocks()
self.toolcall_turn = toolcall_turn

@classmethod
def from_engine_core_request(
@@ -136,6 +138,7 @@ def from_engine_core_request(
priority=request.priority,
trace_headers=request.trace_headers,
block_hasher=block_hasher,
toolcall_turn=request.toolcall_turn,
)

def append_output_token_ids(