Commit bd1149c

Tokenizer: Add native async bindings, via pyo3-async-runtimes. (#1843)

* add async bindings
* update based on review!
* use hf-internal-testing for testing
* reduce the burden for the CI
* async is not necessarily fast
* remove comments

Co-authored-by: Arthur <[email protected]>

1 parent b43d8d7 commit bd1149c

File tree

7 files changed: +805 -6 lines changed

bindings/python/Cargo.toml

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
 pyo3 = { version = "0.25", features = ["abi3", "abi3-py39", "py-clone"] }
+pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"] }
+tokio = { version = "1.47.1", features = ["rt", "rt-multi-thread", "macros", "signal"] }
+once_cell = "1.19.0"
 numpy = "0.25"
 ndarray = "0.16"
 itertools = "0.14"

bindings/python/py_src/tokenizers/__init__.pyi

Lines changed: 124 additions & 0 deletions
@@ -725,6 +725,130 @@ class Tokenizer:
         """
         pass
 
+    def async_decode_batch(self, sequences, skip_special_tokens=True):
+        """
+        Decode a batch of ids back to their corresponding string
+
+        Args:
+            sequences (:obj:`List` of :obj:`List[int]`):
+                The batch of sequences we want to decode
+
+            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether the special tokens should be removed from the decoded strings
+
+        Returns:
+            :obj:`List[str]`: A list of decoded strings
+        """
+        pass
+
+    def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given input with character offsets.
+
+        This is an async version of encode that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode("A single sequence")
+
+        Args:
+            sequence (:obj:`~tokenizers.InputSequence`):
+                The main input sequence we want to encode. This sequence can be either raw
+                text or pre-tokenized, according to the ``is_pretokenized`` argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
+
+            pair (:obj:`~tokenizers.InputSequence`, `optional`):
+                An optional input sequence. The expected format is the same as for ``sequence``.
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The encoded result
+
+        """
+        pass
+
+    def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given batch of inputs with character offsets.
+
+        This is an async version of encode_batch that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode_batch([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
+    def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given batch of inputs without tracking character offsets.
+
+        This is an async version of encode_batch_fast that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode_batch_fast([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
     def decode(self, ids, skip_special_tokens=True):
         """
         Decode the given list of ids back to a string
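
These stub entries spell out the awaitable surface added to `Tokenizer`. As a rough sketch of how they are meant to be used from async Python code (the `bert-base-uncased` checkpoint name and the surrounding script are illustrative, not part of this commit):

import asyncio

from tokenizers import Tokenizer


async def main() -> None:
    # Any pretrained tokenizer works here; the checkpoint name is illustrative.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

    # Awaitable single and batch encode, with character offsets tracked.
    encoding = await tokenizer.async_encode("A single sequence")
    batch = await tokenizer.async_encode_batch(
        ["A single sequence", ("A tuple with a sequence", "And its pair")]
    )

    # Awaitable batch decode of the ids produced above.
    decoded = await tokenizer.async_decode_batch([e.ids for e in batch])
    print(encoding.tokens, decoded)


asyncio.run(main())

As the commit message itself notes, the async variants are not necessarily faster than their sync counterparts; the point is that they can be awaited without blocking the event loop.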

bindings/python/py_src/tokenizers/implementations/base_tokenizer.py

Lines changed: 41 additions & 0 deletions
@@ -259,6 +259,47 @@ def encode_batch(
 
         return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
 
+    async def async_encode_batch(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Asynchronously encode a batch (tracks character offsets).
+
+        Args:
+            inputs: A list of single or pair sequences to encode.
+            is_pretokenized: Whether inputs are already pre-tokenized.
+            add_special_tokens: Whether to add special tokens.
+
+        Returns:
+            A list of Encoding.
+        """
+        if inputs is None:
+            raise ValueError("async_encode_batch: `inputs` can't be `None`")
+        # Exposed by the Rust bindings via pyo3_async_runtimes::tokio::future_into_py
+        return await self._tokenizer.async_encode_batch(inputs, is_pretokenized, add_special_tokens)
+
+    async def async_encode_batch_fast(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Asynchronously encode a batch (no character offsets, faster).
+
+        Args:
+            inputs: A list of single or pair sequences to encode.
+            is_pretokenized: Whether inputs are already pre-tokenized.
+            add_special_tokens: Whether to add special tokens.
+
+        Returns:
+            A list of Encoding.
+        """
+        if inputs is None:
+            raise ValueError("async_encode_batch_fast: `inputs` can't be `None`")
+        return await self._tokenizer.async_encode_batch_fast(inputs, is_pretokenized, add_special_tokens)
+
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """Decode the given list of ids to a string sequence
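
These wrappers only validate `inputs` and then await the Rust-side coroutine, so any implementation that subclasses `BaseTokenizer` inherits the async batch methods. A hedged sketch of overlapping several batch encodes with `asyncio.gather` (assuming a build with these bindings; checkpoint name illustrative):

import asyncio

from tokenizers import Tokenizer
from tokenizers.implementations import BaseTokenizer


async def main() -> None:
    # Wrapping a plain Tokenizer; concrete implementations such as
    # BertWordPieceTokenizer inherit async_encode_batch(_fast) the same way.
    wrapped = BaseTokenizer(Tokenizer.from_pretrained("bert-base-uncased"))

    batches = [
        ["first batch, sentence one"],
        ["second batch, sentence one", "second batch, sentence two"],
    ]

    # Several batch encodes can be awaited together on one event loop.
    results = await asyncio.gather(
        *(wrapped.async_encode_batch_fast(batch) for batch in batches)
    )
    for encodings in results:
        print(len(encodings))


asyncio.run(main())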

bindings/python/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ Source = "https://github.com/huggingface/tokenizers"
 
 
 [project.optional-dependencies]
-testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
+testing = ["pytest", "pytest-asyncio", "requests", "numpy", "datasets", "black==22.3", "ruff"]
 docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
 dev = ["tokenizers[testing]"]
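
pytest-asyncio is added so the coroutine-based API can be exercised from the test suite. A hypothetical test in that style (a sketch, not the tests added by this commit; checkpoint name illustrative):

import pytest

from tokenizers import Tokenizer


@pytest.mark.asyncio
async def test_async_encode_matches_sync():
    # The commit message mentions moving tests to hf-internal-testing models;
    # the exact identifier is not shown here, so this one is only illustrative.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

    sync_encoding = tokenizer.encode("A single sequence")
    async_encoding = await tokenizer.async_encode("A single sequence")

    # The awaitable variant should produce the same encoding as the sync call.
    assert async_encoding.ids == sync_encoding.ids
    assert async_encoding.tokens == sync_encoding.tokens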

bindings/python/src/lib.rs

Lines changed: 13 additions & 0 deletions
@@ -5,6 +5,19 @@
 
 extern crate tokenizers as tk;
 
+use once_cell::sync::Lazy;
+use std::sync::Arc;
+use tokio::runtime::Runtime;
+
+// We create a global runtime that will be initialized once when first needed
+// This ensures we always have a runtime available for tokio::task::spawn_blocking
+static TOKIO_RUNTIME: Lazy<Arc<Runtime>> = Lazy::new(|| {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .expect("Failed to create global Tokio runtime");
+    Arc::new(rt)
+});
 mod decoders;
 mod encoding;
 mod error;
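
This lazily created global runtime is what backs the new `async_*` methods (the in-code comment points at `tokio::task::spawn_blocking`); nothing has to be configured from Python. A small sketch of the intended behaviour, assuming the blocking encode is handed off to that runtime rather than run on the event loop (checkpoint name illustrative):

import asyncio

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint


async def heartbeat() -> None:
    # Should keep ticking while the encode below is awaited, since the heavy
    # work is expected to run on the global Tokio runtime, not the event loop.
    for _ in range(3):
        print("tick")
        await asyncio.sleep(0.01)


async def main() -> None:
    big_batch = ["A fairly long sentence. " * 200] * 8
    await asyncio.gather(
        heartbeat(),
        tokenizer.async_encode_batch(big_batch),
    )


asyncio.run(main())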
