Unverified commit 0cbddfb1, authored by Lysandre Debut, committed by GitHub
Browse files

Replace double occurrences as the last step (#11367)

parent 73fde1de
...@@ -453,7 +453,6 @@ class AlbertConverter(SpmConverter): ...@@ -453,7 +453,6 @@ class AlbertConverter(SpmConverter):
list_normalizers = [ list_normalizers = [
normalizers.Replace("``", '"'), normalizers.Replace("``", '"'),
normalizers.Replace("''", '"'), normalizers.Replace("''", '"'),
normalizers.Replace(Regex(" {2,}"), " "),
] ]
if not self.original_tokenizer.keep_accents: if not self.original_tokenizer.keep_accents:
list_normalizers.append(normalizers.NFKD()) list_normalizers.append(normalizers.NFKD())
...@@ -463,6 +462,7 @@ class AlbertConverter(SpmConverter): ...@@ -463,6 +462,7 @@ class AlbertConverter(SpmConverter):
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
list_normalizers.append(normalizers.Precompiled(precompiled_charsmap)) list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
return normalizers.Sequence(list_normalizers) return normalizers.Sequence(list_normalizers)
def post_processor(self): def post_processor(self):
...@@ -641,7 +641,6 @@ class XLNetConverter(SpmConverter): ...@@ -641,7 +641,6 @@ class XLNetConverter(SpmConverter):
list_normalizers = [ list_normalizers = [
normalizers.Replace("``", '"'), normalizers.Replace("``", '"'),
normalizers.Replace("''", '"'), normalizers.Replace("''", '"'),
normalizers.Replace(Regex(" {2,}"), " "),
] ]
if not self.original_tokenizer.keep_accents: if not self.original_tokenizer.keep_accents:
list_normalizers.append(normalizers.NFKD()) list_normalizers.append(normalizers.NFKD())
...@@ -651,6 +650,7 @@ class XLNetConverter(SpmConverter): ...@@ -651,6 +650,7 @@ class XLNetConverter(SpmConverter):
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
list_normalizers.append(normalizers.Precompiled(precompiled_charsmap)) list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
return normalizers.Sequence(list_normalizers) return normalizers.Sequence(list_normalizers)
def post_processor(self): def post_processor(self):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment