Fix word-level feature extraction for roberta-xlmr

Naman Goyal · facebook-github-bot · commit d48895bd53ff · 2019-12-03T13:00:50.000-08:00
Summary: Pull Request resolved: fairinternal/fairseq-py#933
Differential Revision: D18783780
fbshipit-source-id: fa0a27fab886a5fa5be8d5f49151d1d9dd9775f1
diff --git a/fairseq/models/roberta/alignment_utils.py b/fairseq/models/roberta/alignment_utils.py
@@ -22,6 +22,7 @@ def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List
         List[str]: mapping from *other_tokens* to corresponding *bpe_tokens*.
     """
     assert bpe_tokens.dim() == 1
+    assert bpe_tokens[0] == 0
 
     def clean(text):
         return text.strip()
@@ -32,7 +33,6 @@ def clean(text):
     other_tokens = [clean(str(o)) for o in other_tokens]
 
     # strip leading <s>
-    assert bpe_tokens[0] == '<s>'
     bpe_tokens = bpe_tokens[1:]
     assert ''.join(bpe_tokens) == ''.join(other_tokens)