Unverified Commit 7915a259 authored by Pavel Iakubovskii, committed by GitHub
Browse files

Fix donut token2json multiline (#30300)

* Fix multiline processing

* Update test for token2json
parent b65df514
...@@ -149,7 +149,9 @@ class DonutProcessor(ProcessorMixin): ...@@ -149,7 +149,9 @@ class DonutProcessor(ProcessorMixin):
end_token = end_token.group() end_token = end_token.group()
start_token_escaped = re.escape(start_token) start_token_escaped = re.escape(start_token)
end_token_escaped = re.escape(end_token) end_token_escaped = re.escape(end_token)
content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE) content = re.search(
f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE | re.DOTALL
)
if content is not None: if content is not None:
content = content.group(1).strip() content = content.group(1).strip()
if r"<s_" in content and r"</s_" in content: # non-leaf node if r"<s_" in content and r"</s_" in content: # non-leaf node
......
...@@ -35,6 +35,8 @@ class DonutProcessorTest(unittest.TestCase): ...@@ -35,6 +35,8 @@ class DonutProcessorTest(unittest.TestCase):
"zip": "30301", "zip": "30301",
"phone": "123-4567", "phone": "123-4567",
"nicknames": [{"nickname": "Johnny"}, {"nickname": "JD"}], "nicknames": [{"nickname": "Johnny"}, {"nickname": "JD"}],
"multiline": "text\nwith\nnewlines",
"empty": "",
} }
sequence = ( sequence = (
...@@ -42,6 +44,8 @@ class DonutProcessorTest(unittest.TestCase): ...@@ -42,6 +44,8 @@ class DonutProcessorTest(unittest.TestCase):
"<s_state>GA</s_state><s_zip>30301</s_zip><s_phone>123-4567</s_phone>" "<s_state>GA</s_state><s_zip>30301</s_zip><s_phone>123-4567</s_phone>"
"<s_nicknames><s_nickname>Johnny</s_nickname>" "<s_nicknames><s_nickname>Johnny</s_nickname>"
"<sep/><s_nickname>JD</s_nickname></s_nicknames>" "<sep/><s_nickname>JD</s_nickname></s_nicknames>"
"<s_multiline>text\nwith\nnewlines</s_multiline>"
"<s_empty></s_empty>"
) )
actual_json = self.processor.token2json(sequence) actual_json = self.processor.token2json(sequence)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment