Fixed conll2tree.py; handling correctly a sentence that has multiple elements...

Fixed conll2tree.py; handling correctly a sentence that has multiple elements with identical representation (#292)

Fixed conll2tree.py; handling correctly a sentence that has multiple elements...
Fixed conll2tree.py; handling correctly a sentence that has multiple elements with identical representation (#292)
4fddb5e0 · sgn-andot · calberti · 3a952ec6 · 4fddb5e0
Commit 4fddb5e0 authored Aug 26, 2016 by sgn-andot Committed by calberti Aug 25, 2016
Show whitespace changes
Inline Side-by-side

Showing with 10 additions and 3 deletions

syntaxnet/syntaxnet/conll2tree.py syntaxnet/syntaxnet/conll2tree.py +10 -3

No files found.
--- a/syntaxnet/syntaxnet/conll2tree.py
+++ b/syntaxnet/syntaxnet/conll2tree.py
@@ -15,6 +15,7 @@
 """A program to generate ASCII trees from conll files."""
 import collections
+import re
 import asciitree
 import tensorflow as tf
@@ -39,18 +40,21 @@ flags.DEFINE_string('corpus_name', 'stdin-conll',
 def to_dict(sentence):
  """Builds a dictionary representing the parse tree of a sentence.
+     Note that the suffix "@id" (where 'id' is a number) is appended to each element
+     to handle sentences that have multiple elements with identical representations.
+     These suffixes need to be removed after the asciitree is rendered.
  Args:
    sentence: Sentence protocol buffer to represent.
  Returns:
    Dictionary mapping tokens to children.
  """
-  token_str = ['%s %s %s' % (token.word, token.tag, token.label)
+  token_str = list()
-               for token in sentence.token]
  children = [[] for token in sentence.token]
  root = -1
  for i in range(0, len(sentence.token)):
    token = sentence.token[i]
+    token_str.append('%s %s %s @%d' % (token.word, token.tag, token.label, (i+1)))
    if token.head == -1:
      root = i
    else:
@@ -83,7 +87,10 @@ def main(unused_argv):
        d = to_dict(sentence)
        print 'Input: %s' % sentence.text
        print 'Parse:'
-        print tr(d)
+        tr_str = tr(d)
+        pat = re.compile('\s*@\d+$')
+        for tr_ln in tr_str.splitlines():
+          print pat.sub('', tr_ln)
      if finished:
        break