@@ -725,6 +725,130 @@ class Tokenizer:
725
725
"""
726
726
pass
727
727
728
def async_decode_batch(self, sequences, skip_special_tokens=True):
    """
    Asynchronously decode a batch of ids back to their corresponding strings.

    Like the other ``async_*`` methods of this class, this is meant to be awaited
    in async Python code.

    Example:
        Decoding a batch made of two sequences of ids::

            await async_decode_batch([[1, 2, 3], [4, 5]])

    Args:
        sequences (:obj:`List` of :obj:`List[int]`):
            The batch of sequences we want to decode

        skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
            Whether the special tokens should be removed from the decoded strings

    Returns:
        :obj:`List[str]`: A list of decoded strings
    """
    pass
743
+
744
def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
    """
    Encode a single input asynchronously, keeping track of character offsets.

    Awaitable counterpart of ``encode``, for use from async Python code.

    Example:
        A raw-text input::

            await async_encode("A single sequence")

    Args:
        sequence (:obj:`~tokenizers.InputSequence`):
            The main sequence to encode. Whether it is raw text or already
            pre-tokenized is decided by the ``is_pretokenized`` argument:

            - ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
            - ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`

        pair (:obj:`~tokenizers.InputSequence`, `optional`):
            An optional second sequence, in the same format as ``sequence``.

        is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
            Set to :obj:`True` when the input has already been pre-tokenized

        add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
            Whether the special tokens should be added

    Returns:
        :class:`~tokenizers.Encoding`: The encoded result

    """
    pass
777
+
778
def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
    """
    Encode a whole batch of inputs asynchronously, keeping track of character offsets.

    Awaitable counterpart of ``encode_batch``, for use from async Python code.

    Example:
        The kinds of entries a batch may contain::

            await async_encode_batch([
                "A single sequence",
                ("A tuple with a sequence", "And its pair"),
                [ "A", "pre", "tokenized", "sequence" ],
                ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
            ])

    Args:
        input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
            The single sequences or pairs of sequences to encode. Whether each
            sequence is raw text or already pre-tokenized is decided by the
            ``is_pretokenized`` argument:

            - ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
            - ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

        is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
            Set to :obj:`True` when the input has already been pre-tokenized

        add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
            Whether the special tokens should be added

    Returns:
        A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

    """
    pass
814
+
815
def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
    """
    Encode a whole batch of inputs asynchronously, without tracking character offsets.

    Awaitable counterpart of ``encode_batch_fast``, for use from async Python code.

    Example:
        The kinds of entries a batch may contain::

            await async_encode_batch_fast([
                "A single sequence",
                ("A tuple with a sequence", "And its pair"),
                [ "A", "pre", "tokenized", "sequence" ],
                ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
            ])

    Args:
        input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
            The single sequences or pairs of sequences to encode. Whether each
            sequence is raw text or already pre-tokenized is decided by the
            ``is_pretokenized`` argument:

            - ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
            - ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

        is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
            Set to :obj:`True` when the input has already been pre-tokenized

        add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
            Whether the special tokens should be added

    Returns:
        A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

    """
    pass
851
+
728
852
def decode (self , ids , skip_special_tokens = True ):
729
853
"""
730
854
Decode the given list of ids back to a string
0 commit comments