Commit bd1149c

Tokenizer: Add native async bindings, via pyo3-async-runtimes. (#1843)

* add async bindings
* update based on review!
* use hf-internal-testing for testing
* reduce the burden for the CI
* async is not necessarily fast
* remove comments

Co-authored-by: Arthur <[email protected]>

1 parent b43d8d7 commit bd1149c

File tree

7 files changed: +805 -6 lines changed

bindings/python/Cargo.toml

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
 pyo3 = { version = "0.25", features = ["abi3", "abi3-py39", "py-clone"] }
+pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"] }
+tokio = { version = "1.47.1", features = ["rt", "rt-multi-thread", "macros", "signal"] }
+once_cell = "1.19.0"
 numpy = "0.25"
 ndarray = "0.16"
 itertools = "0.14"

bindings/python/py_src/tokenizers/__init__.pyi

Lines changed: 124 additions & 0 deletions
@@ -725,6 +725,130 @@ class Tokenizer:
         """
         pass
 
+    def async_decode_batch(self, sequences, skip_special_tokens=True):
+        """
+        Decode a batch of ids back to their corresponding string
+
+        Args:
+            sequences (:obj:`List` of :obj:`List[int]`):
+                The batch of sequences we want to decode
+
+            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether the special tokens should be removed from the decoded strings
+
+        Returns:
+            :obj:`List[str]`: A list of decoded strings
+        """
+        pass
+
+    def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given input with character offsets.
+
+        This is an async version of encode that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode("A single sequence")
+
+        Args:
+            sequence (:obj:`~tokenizers.InputSequence`):
+                The main input sequence we want to encode. This sequence can be either raw
+                text or pre-tokenized, according to the ``is_pretokenized`` argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
+
+            pair (:obj:`~tokenizers.InputSequence`, `optional`):
+                An optional input sequence. The expected format is the same as for ``sequence``.
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The encoded result
+
+        """
+        pass
+
+    def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given batch of inputs with character offsets.
+
+        This is an async version of encode_batch that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode_batch([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
+    def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given batch of inputs without tracking character offsets.
+
+        This is an async version of encode_batch_fast that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode_batch_fast([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
     def decode(self, ids, skip_special_tokens=True):
         """
         Decode the given list of ids back to a string
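
These stub entries spell out the awaitable surface added to `Tokenizer`. As a rough sketch of how they are meant to be used from async Python code (the `bert-base-uncased` checkpoint name and the surrounding script are illustrative, not part of this commit):

import asyncio

from tokenizers import Tokenizer


async def main() -> None:
    # Any pretrained tokenizer works here; the checkpoint name is illustrative.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

    # Awaitable single and batch encode, with character offsets tracked.
    encoding = await tokenizer.async_encode("A single sequence")
    batch = await tokenizer.async_encode_batch(
        ["A single sequence", ("A tuple with a sequence", "And its pair")]
    )

    # Awaitable batch decode of the ids produced above.
    decoded = await tokenizer.async_decode_batch([e.ids for e in batch])
    print(encoding.tokens, decoded)


asyncio.run(main())

As the commit message itself notes, the async variants are not necessarily faster than their sync counterparts; the point is that they can be awaited without blocking the event loop.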

bindings/python/py_src/tokenizers/implementations/base_tokenizer.py

Lines changed: 41 additions & 0 deletions
@@ -259,6 +259,47 @@ def encode_batch(
 
         return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
 
+    async def async_encode_batch(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Asynchronously encode a batch (tracks character offsets).
+
+        Args:
+            inputs: A list of single or pair sequences to encode.
+            is_pretokenized: Whether inputs are already pre-tokenized.
+            add_special_tokens: Whether to add special tokens.
+
+        Returns:
+            A list of Encoding.
+        """
+        if inputs is None:
+            raise ValueError("async_encode_batch: `inputs` can't be `None`")
+        # Exposed by the Rust bindings via pyo3_async_runtimes::tokio::future_into_py
+        return await self._tokenizer.async_encode_batch(inputs, is_pretokenized, add_special_tokens)
+
+    async def async_encode_batch_fast(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Asynchronously encode a batch (no character offsets, faster).
+
+        Args:
+            inputs: A list of single or pair sequences to encode.
+            is_pretokenized: Whether inputs are already pre-tokenized.
+            add_special_tokens: Whether to add special tokens.
+
+        Returns:
+            A list of Encoding.
+        """
+        if inputs is None:
+            raise ValueError("async_encode_batch_fast: `inputs` can't be `None`")
+        return await self._tokenizer.async_encode_batch_fast(inputs, is_pretokenized, add_special_tokens)
+
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """Decode the given list of ids to a string sequence
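
These wrappers only validate `inputs` and then await the Rust-side coroutine, so any implementation that subclasses `BaseTokenizer` inherits the async batch methods. A hedged sketch of overlapping several batch encodes with `asyncio.gather` (assuming a build with these bindings; checkpoint name illustrative):

import asyncio

from tokenizers import Tokenizer
from tokenizers.implementations import BaseTokenizer


async def main() -> None:
    # Wrapping a plain Tokenizer; concrete implementations such as
    # BertWordPieceTokenizer inherit async_encode_batch(_fast) the same way.
    wrapped = BaseTokenizer(Tokenizer.from_pretrained("bert-base-uncased"))

    batches = [
        ["first batch, sentence one"],
        ["second batch, sentence one", "second batch, sentence two"],
    ]

    # Several batch encodes can be awaited together on one event loop.
    results = await asyncio.gather(
        *(wrapped.async_encode_batch_fast(batch) for batch in batches)
    )
    for encodings in results:
        print(len(encodings))


asyncio.run(main())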

bindings/python/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ Source = "https://github.com/huggingface/tokenizers"
 
 
 [project.optional-dependencies]
-testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
+testing = ["pytest", "pytest-asyncio", "requests", "numpy", "datasets", "black==22.3", "ruff"]
 docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
 dev = ["tokenizers[testing]"]
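
pytest-asyncio is added so the coroutine-based API can be exercised from the test suite. A hypothetical test in that style (a sketch, not the tests added by this commit; checkpoint name illustrative):

import pytest

from tokenizers import Tokenizer


@pytest.mark.asyncio
async def test_async_encode_matches_sync():
    # The commit message mentions moving tests to hf-internal-testing models;
    # the exact identifier is not shown here, so this one is only illustrative.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

    sync_encoding = tokenizer.encode("A single sequence")
    async_encoding = await tokenizer.async_encode("A single sequence")

    # The awaitable variant should produce the same encoding as the sync call.
    assert async_encoding.ids == sync_encoding.ids
    assert async_encoding.tokens == sync_encoding.tokens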

bindings/python/src/lib.rs

Lines changed: 13 additions & 0 deletions
@@ -5,6 +5,19 @@
 
 extern crate tokenizers as tk;
 
+use once_cell::sync::Lazy;
+use std::sync::Arc;
+use tokio::runtime::Runtime;
+
+// We create a global runtime that will be initialized once when first needed
+// This ensures we always have a runtime available for tokio::task::spawn_blocking
+static TOKIO_RUNTIME: Lazy<Arc<Runtime>> = Lazy::new(|| {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .expect("Failed to create global Tokio runtime");
+    Arc::new(rt)
+});
 mod decoders;
 mod encoding;
 mod error;
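
This lazily created global runtime is what backs the new `async_*` methods (the in-code comment points at `tokio::task::spawn_blocking`); nothing has to be configured from Python. A small sketch of the intended behaviour, assuming the blocking encode is handed off to that runtime rather than run on the event loop (checkpoint name illustrative):

import asyncio

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint


async def heartbeat() -> None:
    # Should keep ticking while the encode below is awaited, since the heavy
    # work is expected to run on the global Tokio runtime, not the event loop.
    for _ in range(3):
        print("tick")
        await asyncio.sleep(0.01)


async def main() -> None:
    big_batch = ["A fairly long sentence. " * 200] * 8
    await asyncio.gather(
        heartbeat(),
        tokenizer.async_encode_batch(big_batch),
    )


asyncio.run(main())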
