
Commit 5888034

Merge branch 'main' into xpu_support_pr
2 parents: bd10f79 + ed72e92

40 files changed: +2481 additions, -1094 deletions

Cargo.lock

Lines changed: 107 additions & 141 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 5 additions & 1 deletion
@@ -9,11 +9,15 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "2.0.0"
+version = "2.0.1"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"

+[workspace.dependencies]
+tokenizers = { version = "0.19.1", features = ["http"] }
+hf-hub = { version = "0.3.1", features = ["tokio"] }
+
 [profile.release]
 debug = 1
 incremental = true

benchmark/Cargo.toml

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ serde_json = "1.0"
 tabled = "0.14.0"
 text-generation-client = { path = "../router/client" }
 thiserror = "1.0.48"
-tokenizers = { version = "0.14.0", features = ["http"] }
+tokenizers = { workspace = true }
 tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
 tui = {package = "ratatui", version = "0.23", default-features = false, features = ["crossterm"]}
 tracing = "0.1.37"
 tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
-hf-hub = "0.3.1"
+hf-hub = { workspace = true }

clients/python/tests/conftest.py

Lines changed: 10 additions & 0 deletions
@@ -9,6 +9,11 @@ def flan_t5_xxl():
     return "google/flan-t5-xxl"


+@pytest.fixture
+def llama_7b():
+    return "meta-llama/Llama-2-7b-chat-hf"
+
+
 @pytest.fixture
 def fake_model():
     return "fake/model"
@@ -34,6 +39,11 @@ def flan_t5_xxl_url(base_url, flan_t5_xxl):
     return f"{base_url}/{flan_t5_xxl}"


+@pytest.fixture
+def llama_7b_url(base_url, llama_7b):
+    return f"{base_url}/{llama_7b}"
+
+
 @pytest.fixture
 def fake_url(base_url, fake_model):
     return f"{base_url}/{fake_model}"

clients/python/tests/test_client.py

Lines changed: 35 additions & 32 deletions
@@ -5,24 +5,24 @@
 from text_generation.types import FinishReason, InputToken


-def test_generate(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     response = client.generate("test", max_new_tokens=1, decoder_input_details=True)

-    assert response.generated_text == ""
+    assert response.generated_text == "_"
     assert response.details.finish_reason == FinishReason.Length
     assert response.details.generated_tokens == 1
     assert response.details.seed is None
-    assert len(response.details.prefill) == 1
-    assert response.details.prefill[0] == InputToken(id=0, text="<pad>", logprob=None)
+    assert len(response.details.prefill) == 2
+    assert response.details.prefill[0] == InputToken(id=1, text="<s>", logprob=None)
     assert len(response.details.tokens) == 1
-    assert response.details.tokens[0].id == 3
-    assert response.details.tokens[0].text == " "
+    assert response.details.tokens[0].id == 29918
+    assert response.details.tokens[0].text == "_"
     assert not response.details.tokens[0].special


-def test_generate_best_of(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_best_of(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     response = client.generate(
         "test", max_new_tokens=1, best_of=2, do_sample=True, decoder_input_details=True
     )
@@ -39,22 +39,22 @@ def test_generate_not_found(fake_url, hf_headers):
         client.generate("test")


-def test_generate_validation_error(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_validation_error(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     with pytest.raises(ValidationError):
         client.generate("test", max_new_tokens=10_000)


-def test_generate_stream(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_stream(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     responses = [
         response for response in client.generate_stream("test", max_new_tokens=1)
     ]

     assert len(responses) == 1
     response = responses[0]

-    assert response.generated_text == ""
+    assert response.generated_text == "_"
     assert response.details.finish_reason == FinishReason.Length
     assert response.details.generated_tokens == 1
     assert response.details.seed is None
@@ -66,34 +66,37 @@ def test_generate_stream_not_found(fake_url, hf_headers):
         list(client.generate_stream("test"))


-def test_generate_stream_validation_error(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_stream_validation_error(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     with pytest.raises(ValidationError):
         list(client.generate_stream("test", max_new_tokens=10_000))


 @pytest.mark.asyncio
-async def test_generate_async(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     response = await client.generate(
         "test", max_new_tokens=1, decoder_input_details=True
     )

-    assert response.generated_text == ""
+    assert response.generated_text == "_"
     assert response.details.finish_reason == FinishReason.Length
     assert response.details.generated_tokens == 1
     assert response.details.seed is None
-    assert len(response.details.prefill) == 1
-    assert response.details.prefill[0] == InputToken(id=0, text="<pad>", logprob=None)
+    assert len(response.details.prefill) == 2
+    assert response.details.prefill[0] == InputToken(id=1, text="<s>", logprob=None)
+    assert response.details.prefill[1] == InputToken(
+        id=1243, text="test", logprob=-10.96875
+    )
     assert len(response.details.tokens) == 1
-    assert response.details.tokens[0].id == 3
-    assert response.details.tokens[0].text == " "
+    assert response.details.tokens[0].id == 29918
+    assert response.details.tokens[0].text == "_"
     assert not response.details.tokens[0].special


 @pytest.mark.asyncio
-async def test_generate_async_best_of(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async_best_of(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     response = await client.generate(
         "test", max_new_tokens=1, best_of=2, do_sample=True, decoder_input_details=True
     )
@@ -112,23 +115,23 @@ async def test_generate_async_not_found(fake_url, hf_headers):


 @pytest.mark.asyncio
-async def test_generate_async_validation_error(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async_validation_error(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     with pytest.raises(ValidationError):
         await client.generate("test", max_new_tokens=10_000)


 @pytest.mark.asyncio
-async def test_generate_stream_async(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_stream_async(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     responses = [
         response async for response in client.generate_stream("test", max_new_tokens=1)
     ]

     assert len(responses) == 1
     response = responses[0]

-    assert response.generated_text == ""
+    assert response.generated_text == "_"
     assert response.details.finish_reason == FinishReason.Length
     assert response.details.generated_tokens == 1
     assert response.details.seed is None
@@ -143,8 +146,8 @@ async def test_generate_stream_async_not_found(fake_url, hf_headers):


 @pytest.mark.asyncio
-async def test_generate_stream_async_validation_error(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_stream_async_validation_error(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     with pytest.raises(ValidationError):
         async for _ in client.generate_stream("test", max_new_tokens=10_000):
             pass
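
The new expected values (prefill ids 1 and 1243, generated token 29918 decoding to "_") come from the Llama-2 tokenizer that replaces Flan-T5 in these tests. A minimal sketch of a local sanity check using the transformers tokenizer; this is not part of the commit and assumes access to the gated meta-llama checkpoint:

```python
from transformers import AutoTokenizer

# Hypothetical check, not in the diff: reproduce the token ids the updated
# tests expect for the prompt "test".
tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

print(tok("test").input_ids)  # per the tests above: [1, 1243] -> "<s>", "test"
print(tok.decode([29918]))    # per the tests above: "_"
```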

clients/python/text_generation/types.py

Lines changed: 21 additions & 0 deletions
@@ -59,6 +59,17 @@ class ChatCompletionComplete(BaseModel):
     usage: Optional[Any] = None


+class CompletionComplete(BaseModel):
+    # Index of the chat completion
+    index: int
+    # Message associated with the chat completion
+    text: str
+    # Log probabilities for the chat completion
+    logprobs: Optional[Any]
+    # Reason for completion
+    finish_reason: str
+
+
 class Function(BaseModel):
     name: Optional[str]
     arguments: str
@@ -104,6 +115,16 @@ class ChatComplete(BaseModel):
     usage: Any


+class Completion(BaseModel):
+    # Completion details
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[CompletionComplete]
+
+
 class ChatRequest(BaseModel):
     # Model identifier
     model: str
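
The two new models mirror the OpenAI-style completions payload documented in docs/openapi.json below. A minimal sketch of how they might be used to parse a response body; the field values here are invented for illustration:

```python
from text_generation.types import Completion, CompletionComplete

# Hypothetical payload shaped like the Completion schema added above.
payload = {
    "id": "",
    "object": "text_completion",
    "created": 1712000000,
    "model": "meta-llama/Llama-2-7b-chat-hf",
    "system_fingerprint": "2.0.1-native",
    "choices": [
        {"index": 0, "text": "_", "logprobs": None, "finish_reason": "length"}
    ],
}

completion = Completion(**payload)
assert isinstance(completion.choices[0], CompletionComplete)
print(completion.choices[0].text)
```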

docs/openapi.json

Lines changed: 30 additions & 10 deletions
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "2.0.0"
+    "version": "2.0.1"
   },
   "paths": {
     "/": {
@@ -408,9 +408,14 @@
         },
         "responses": {
           "200": {
-            "description": "Generated Text",
+            "description": "Generated Chat Completion",
             "content": {
               "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ChatCompletion"
+                }
+              },
+              "text/event-stream": {
                 "schema": {
                   "$ref": "#/components/schemas/ChatCompletionChunk"
                 }
@@ -492,11 +497,16 @@
         },
         "responses": {
           "200": {
-            "description": "Generated Text",
+            "description": "Generated Chat Completion",
             "content": {
               "application/json": {
                 "schema": {
-                  "$ref": "#/components/schemas/ChatCompletionChunk"
+                  "$ref": "#/components/schemas/Completion"
+                }
+              },
+              "text/event-stream": {
+                "schema": {
+                  "$ref": "#/components/schemas/CompletionCompleteChunk"
                 }
               }
             }
@@ -930,7 +940,7 @@
         "tool_prompt": {
           "type": "string",
           "description": "A prompt to be appended before the tools",
-          "example": "\"Based on the conversation, please choose the most appropriate tool to use: \"",
+          "example": "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\"",
           "nullable": true
         },
         "tools": {
@@ -1071,7 +1081,10 @@
           "example": "mistralai/Mistral-7B-Instruct-v0.2"
         },
         "prompt": {
-          "type": "string",
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
           "description": "The prompt to generate completions for.",
           "example": "What is Deep Learning?"
         },
@@ -1234,17 +1247,17 @@
       "type": "object",
       "required": [
         "name",
-        "parameters"
+        "arguments"
       ],
       "properties": {
+        "arguments": {},
         "description": {
           "type": "string",
           "nullable": true
         },
         "name": {
           "type": "string"
-        },
-        "parameters": {}
+        }
       }
     },
     "GenerateParameters": {
@@ -1260,7 +1273,7 @@
         },
         "decoder_input_details": {
           "type": "boolean",
-          "default": "true"
+          "default": "false"
         },
         "details": {
           "type": "boolean",
@@ -1285,6 +1298,7 @@
             "$ref": "#/components/schemas/GrammarType"
           }
         ],
+        "default": "null",
         "nullable": true
       },
       "max_new_tokens": {
@@ -1478,6 +1492,7 @@
         "max_batch_total_tokens",
         "max_waiting_tokens",
         "validation_workers",
+        "max_client_batch_size",
         "version"
       ],
       "properties": {
@@ -1503,6 +1518,11 @@
           "example": "2",
           "minimum": 0
         },
+        "max_client_batch_size": {
+          "type": "integer",
+          "example": "32",
+          "minimum": 0
+        },
         "max_concurrent_requests": {
           "type": "integer",
           "description": "Router Parameters",

docs/source/basic_tutorials/launcher.md

Lines changed: 9 additions & 0 deletions
@@ -398,6 +398,15 @@ Options:
   -e, --env
           Display a lot of information about your runtime environment

+```
+## MAX_CLIENT_BATCH_SIZE
+```shell
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          Control the maximum number of inputs that a client can send in a single request
+
+          [env: MAX_CLIENT_BATCH_SIZE=]
+          [default: 4]
+
 ```
 ## HELP
 ```shell
