Unverified Commit cd4584e3 authored by Shahad Mahmud, committed by GitHub
Browse files

precompiled_charsmap checking before adding to the normalizers' list for...

Check that precompiled_charsmap is non-empty before adding it to the normalizers' list for XLNetTokenizerFast conversion. (#24618)

* Check precompiled_charsmap before adding it to the normalizers' list.

* Check precompiled_charsmap for all SentencePiece tokenizer models.

* Check precompiled_charsmap for SPM tokenizer models - correct formatting.
parent f4e4b4d0
...@@ -551,7 +551,10 @@ class AlbertConverter(SpmConverter): ...@@ -551,7 +551,10 @@ class AlbertConverter(SpmConverter):
list_normalizers.append(normalizers.Lowercase()) list_normalizers.append(normalizers.Lowercase())
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
if precompiled_charsmap:
list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " ")) list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
return normalizers.Sequence(list_normalizers) return normalizers.Sequence(list_normalizers)
...@@ -802,7 +805,10 @@ class XLNetConverter(SpmConverter): ...@@ -802,7 +805,10 @@ class XLNetConverter(SpmConverter):
list_normalizers.append(normalizers.Lowercase()) list_normalizers.append(normalizers.Lowercase())
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
if precompiled_charsmap:
list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " ")) list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
return normalizers.Sequence(list_normalizers) return normalizers.Sequence(list_normalizers)
...@@ -836,7 +842,10 @@ class RemBertConverter(SpmConverter): ...@@ -836,7 +842,10 @@ class RemBertConverter(SpmConverter):
list_normalizers.append(normalizers.Lowercase()) list_normalizers.append(normalizers.Lowercase())
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
if precompiled_charsmap:
list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
return normalizers.Sequence(list_normalizers) return normalizers.Sequence(list_normalizers)
def post_processor(self): def post_processor(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment