Unverified commit 4f84a69a, authored by Hongkun Yu, committed by GitHub
Browse files

Delete models:syntaxnet (#8170)

* remove tensorrt as the example repo has been moved for a while

* delete syntax net to reduce repo size as syntax net is going to move to google-research/

* Update README.md

Delete syntaxnet from readme.

* Update CODEOWNERS

Delete syntaxnet from codeowners
parent d2e30aef
[submodule "tensorflow"]
path = research/syntaxnet/tensorflow
url = https://github.com/tensorflow/tensorflow.git
......@@ -49,7 +49,6 @@
/research/street/ @theraysmith
/research/struct2depth/ @aneliaangelova
/research/swivel/ @waterson
/research/syntaxnet/ @calberti @andorardo @bogatyy @markomernick
/research/tcn/ @coreylynch @sermanet
/research/tensorrt/ @karmel
/research/textsum/ @panyx0718 @peterjliu
......
......@@ -72,7 +72,6 @@ request.
using a Deep RNN.
- [struct2depth](struct2depth): unsupervised learning of depth and ego-motion.
- [swivel](swivel): the Swivel algorithm for generating word embeddings.
- [syntaxnet](syntaxnet): neural models of natural language syntax.
- [tcn](tcn): Self-supervised representation learning from multi-view video.
- [textsum](textsum): sequence-to-sequence with attention model for text
summarization.
......
# Keep the Docker build context small and the images reproducible: exclude VCS
# metadata, the Dockerfiles themselves, and all Bazel output trees.
.git
# Local Bazel installation directory (not needed inside the image).
bazel/
Dockerfile*
# The TensorFlow submodule's own git metadata is large and unneeded.
tensorflow/.git
# Bazel convenience symlinks / output directories at the repository root.
/bazel-bin
/bazel-genfiles
/bazel-out
/bazel-tensorflow
/bazel-testlogs
/bazel-tf
/bazel-syntaxnet
# Development image for SyntaxNet / DRAGNN: builds TensorFlow and the DRAGNN
# tools from source with Bazel on Ubuntu 16.04, then serves the example
# Jupyter notebooks on port 8888.
FROM ubuntu:16.04

ENV SYNTAXNETDIR=/opt/tensorflow PATH=$PATH:/root/bin

# Install system packages. This doesn't include everything the TensorFlow
# dockerfile specifies, so if anything goes awry, maybe install more packages
# from there. Also, running apt-get clean before further commands will make the
# Docker images smaller.
RUN mkdir -p $SYNTAXNETDIR \
    && cd $SYNTAXNETDIR \
    && apt-get update \
    && apt-get install -y \
        file \
        git \
        graphviz \
        libcurl3-dev \
        libfreetype6-dev \
        libgraphviz-dev \
        liblapack-dev \
        libopenblas-dev \
        libpng-dev \
        libxft-dev \
        openjdk-8-jdk \
        python-dev \
        python-mock \
        python-pip \
        python2.7 \
        swig \
        unzip \
        vim \
        wget \
        zlib1g-dev \
    && apt-get clean \
    && (rm -f /var/cache/apt/archives/*.deb \
        /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true)

# Install common Python dependencies. Similar to above, remove caches
# afterwards to help keep Docker images smaller.
RUN pip install --ignore-installed pip \
    && python -m pip install numpy \
    && rm -rf /root/.cache/pip /tmp/pip*

# pygraphviz needs the graphviz headers/libraries installed above, hence the
# explicit --install-option include/library paths.
RUN python -m pip install \
        asciitree \
        ipykernel \
        jupyter \
        matplotlib \
        pandas \
        protobuf \
        scipy \
        sklearn \
    && python -m ipykernel.kernelspec \
    && python -m pip install pygraphviz \
        --install-option="--include-path=/usr/include/graphviz" \
        --install-option="--library-path=/usr/lib/graphviz/" \
    && python -m jupyter_core.command nbextension enable \
        --py --sys-prefix widgetsnbextension \
    && rm -rf /root/.cache/pip /tmp/pip*

# Installs Bazel (0.11.1, the version this tree is known to build with).
RUN wget --quiet https://github.com/bazelbuild/bazel/releases/download/0.11.1/bazel-0.11.1-installer-linux-x86_64.sh \
    && chmod +x bazel-0.11.1-installer-linux-x86_64.sh \
    && ./bazel-0.11.1-installer-linux-x86_64.sh \
    && rm ./bazel-0.11.1-installer-linux-x86_64.sh

COPY WORKSPACE $SYNTAXNETDIR/syntaxnet/WORKSPACE
COPY tools/bazel.rc $SYNTAXNETDIR/syntaxnet/tools/bazel.rc
COPY tensorflow $SYNTAXNETDIR/syntaxnet/tensorflow

# Compile common TensorFlow targets, which don't depend on DRAGNN / SyntaxNet
# source. This makes it more convenient to re-compile DRAGNN / SyntaxNet for
# development (though not as convenient as the docker-devel scripts).
RUN cd $SYNTAXNETDIR/syntaxnet/tensorflow \
    && tensorflow/tools/ci_build/builds/configured CPU \
    && cd $SYNTAXNETDIR/syntaxnet \
    && bazel build -c opt @org_tensorflow//tensorflow:tensorflow_py

# Build the codez.
WORKDIR $SYNTAXNETDIR/syntaxnet
COPY dragnn $SYNTAXNETDIR/syntaxnet/dragnn
COPY syntaxnet $SYNTAXNETDIR/syntaxnet/syntaxnet
COPY third_party $SYNTAXNETDIR/syntaxnet/third_party
COPY util/utf8 $SYNTAXNETDIR/syntaxnet/util/utf8
RUN bazel build -c opt //dragnn/python:all //dragnn/tools:all

# This makes the IP exposed actually "*"; we'll do host restrictions by passing
# a hostname to the `docker run` command.
COPY tensorflow/tensorflow/tools/docker/jupyter_notebook_config.py /root/.jupyter/
EXPOSE 8888

# This does not need to be compiled, only copied.
COPY examples $SYNTAXNETDIR/syntaxnet/examples

# Todo: Move this earlier in the file (don't want to invalidate caches for now).
# Exec (JSON) form so bash itself is PID 1 — no extra /bin/sh wrapper — and
# receives signals from `docker stop` directly (hadolint DL3025).
CMD ["/bin/bash", "-c", "bazel-bin/dragnn/tools/oss_notebook_launcher notebook --debug --notebook-dir=/opt/tensorflow/syntaxnet/examples"]
# SyntaxNet: Neural Models of Syntax.
*A TensorFlow toolkit for deep learning powered natural language understanding
(NLU).*
**CoNLL**: See [here](g3doc/conll2017/README.md) for instructions for using the
SyntaxNet/DRAGNN baseline for the CoNLL2017 Shared Task.
At Google, we spend a lot of time thinking about how computer systems can read
and understand human language in order to process it in intelligent ways. We are
excited to share the fruits of our research with the broader community by
releasing SyntaxNet, an open-source neural network framework for
[TensorFlow](http://www.tensorflow.org) that provides a foundation for Natural
Language Understanding (NLU) systems. Our release includes all the code needed
to train new SyntaxNet models on your own data, as well as a suite of models
that we have trained for you, and that you can use to analyze text in over 40
languages.
This repository is largely divided into two sub-packages:
1. **DRAGNN:
[code](https://github.com/tensorflow/models/tree/master/research/syntaxnet/dragnn),
[documentation](g3doc/DRAGNN.md),
[paper](https://arxiv.org/pdf/1703.04474.pdf)** implements Dynamic Recurrent
Acyclic Graphical Neural Networks (DRAGNN), a framework for building
multi-task, fully dynamically constructed computation graphs. Practically,
we use DRAGNN to extend our prior work from [Andor et al.
(2016)](http://arxiv.org/abs/1603.06042) with end-to-end, deep recurrent
models and to provide a much easier to use interface to SyntaxNet. *DRAGNN
is designed first and foremost as a Python library, and therefore much
easier to use than the original SyntaxNet implementation.*
1. **SyntaxNet:
[code](https://github.com/tensorflow/models/tree/master/research/syntaxnet/syntaxnet),
[documentation](g3doc/syntaxnet-tutorial.md)** is a transition-based
framework for natural language processing, with core functionality for
feature extraction, representing annotated data, and evaluation. As of the
DRAGNN release, it is recommended to train and deploy SyntaxNet models using
the DRAGNN framework.
## How to use this library
There are three ways to use SyntaxNet:
* See [here](g3doc/conll2017/README.md) for instructions for using the
SyntaxNet/DRAGNN baseline for the CoNLL2017 Shared Task, and running the
ParseySaurus models.
* You can use DRAGNN to train your NLP models for other tasks and datasets. See
"Getting started with DRAGNN" below.
* You can continue to use the Parsey McParseface family of pre-trained
SyntaxNet models. See "Pre-trained NLP models" below.
## Installation
### Docker installation
_This process takes ~10 minutes._
The simplest way to get started with DRAGNN is by loading our Docker container.
[Here](g3doc/CLOUD.md) is a tutorial for running the DRAGNN container on
[GCP](https://cloud.google.com) (just as applicable to your own computer).
### Ubuntu 16.04+ binary installation
_This process takes ~5 minutes, but is only compatible with Linux using GNU libc
3.4.22 and above (e.g. Ubuntu 16.04)._
Binary wheel packages are provided for TensorFlow and SyntaxNet. If you do not
need to write new binary TensorFlow ops, these should suffice.
* `apt-get install -y graphviz libgraphviz-dev libopenblas-base libpng16-16
libxft2 python-pip python-mock`
* `pip install pygraphviz
--install-option="--include-path=/usr/include/graphviz"
--install-option="--library-path=/usr/lib/graphviz/"`
* `pip install 'ipython<6.0' protobuf numpy scipy jupyter
syntaxnet-with-tensorflow`
* `python -m jupyter_core.command nbextension enable --py --sys-prefix
widgetsnbextension`
You can test that binary modules can be successfully imported by running,
* `python -c 'import dragnn.python.load_dragnn_cc_impl,
syntaxnet.load_parser_ops'`
### Manual installation
_This process takes 1-2 hours._
Running and training SyntaxNet/DRAGNN models requires building this package from
source. You'll need to install:
* python 2.7:
* Python 3 support is not available yet
* bazel 0.11.1:
* Follow the instructions [here](http://bazel.build/docs/install.html)
* Alternatively, download bazel 0.11.1 <.deb> from
[https://github.com/bazelbuild/bazel/releases](https://github.com/bazelbuild/bazel/releases)
for your system configuration.
* Install it using the command: sudo dpkg -i <.deb file>
* Check for the bazel version by typing: bazel version
* swig:
* `apt-get install swig` on Ubuntu
* `brew install swig` on OSX
* protocol buffers, with a version supported by TensorFlow:
* check your protobuf version with `pip freeze | grep protobuf`
* upgrade to a supported version with `pip install -U protobuf==3.3.0`
* autograd, with a version supported by TensorFlow:
* `pip install -U autograd==1.1.13`
* mock, the testing package:
* `pip install mock`
* asciitree, to draw parse trees on the console for the demo:
* `pip install asciitree`
* numpy, package for scientific computing:
* `pip install numpy`
* pygraphviz to visualize traces and parse trees:
* `apt-get install -y graphviz libgraphviz-dev`
* `pip install pygraphviz
--install-option="--include-path=/usr/include/graphviz"
--install-option="--library-path=/usr/lib/graphviz/"`
Once you completed the above steps, you can build and test SyntaxNet with the
following commands:
```shell
git clone --recursive https://github.com/tensorflow/models.git
cd models/research/syntaxnet/tensorflow
./configure
cd ..
bazel test ...
# On Mac, run the following:
bazel test --linkopt=-headerpad_max_install_names \
dragnn/... syntaxnet/... util/utf8/...
```
Bazel should complete reporting all tests passed.
Now you can install the SyntaxNet and DRAGNN Python modules with the following
commands:
```shell
mkdir /tmp/syntaxnet_pkg
bazel-bin/dragnn/tools/build_pip_package --output-dir=/tmp/syntaxnet_pkg
# The filename of the .whl depends on your platform.
sudo pip install /tmp/syntaxnet_pkg/syntaxnet-x.xx-none-any.whl
```
To build SyntaxNet with GPU support please refer to the instructions in
[issues/248](https://github.com/tensorflow/models/issues/248).
**Note:** If you are running Docker on OSX, make sure that you have enough
memory allocated for your Docker VM.
## Getting Started
We have a few guides on this README, as well as more extensive
[documentation](g3doc/).
### Learning the DRAGNN framework
![DRAGNN](g3doc/unrolled-dragnn.png)
An easy and visual way to get started with DRAGNN is to run our Jupyter
notebooks for [interactive
debugging](examples/dragnn/interactive_text_analyzer.ipynb) and [training a new
model](examples/dragnn/trainer_tutorial.ipynb). Our tutorial
[here](g3doc/CLOUD.md) explains how to start it up from the Docker container.
Once you have DRAGNN installed and running, try out the
[ParseySaurus](g3doc/conll2017) models.
### Using the Pre-trained NLP models
We are happy to release *Parsey McParseface*, an English parser that we have
trained for you, and that you can use to analyze English text, along with
[trained models for 40 languages](g3doc/universal.md) and support for text
segmentation and morphological analysis.
Once you have successfully built SyntaxNet, you can start parsing text right
away with Parsey McParseface, located under `syntaxnet/models`. The easiest
thing is to use or modify the included script `syntaxnet/demo.sh`, which shows a
basic setup to parse English taking plain text as input.
You can also skip right away to the [detailed SyntaxNet
tutorial](g3doc/syntaxnet-tutorial.md).
How accurate is Parsey McParseface? For the initial release, we tried to balance
a model that runs fast enough to be useful on a single machine (e.g. ~600
words/second on a modern desktop) and that is also the most accurate parser
available. Here's how Parsey McParseface compares to the academic literature on
several different English domains: (all numbers are % correct head assignments
in the tree, or unlabelled attachment score)
Model | News | Web | Questions
--------------------------------------------------------------------------------------------------------------- | :---: | :---: | :-------:
[Martins et al. (2013)](http://www.cs.cmu.edu/~ark/TurboParser/) | 93.10 | 88.23 | 94.21
[Zhang and McDonald (2014)](http://research.google.com/pubs/archive/38148.pdf) | 93.32 | 88.65 | 93.37
[Weiss et al. (2015)](http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf) | 93.91 | 89.29 | 94.17
[Andor et al. (2016)](http://arxiv.org/abs/1603.06042)* | 94.44 | 90.17 | 95.40
Parsey McParseface | 94.15 | 89.08 | 94.77
We see that Parsey McParseface is state-of-the-art; more importantly, with
SyntaxNet you can train larger networks with more hidden units and bigger beam
sizes if you want to push the accuracy even further: [Andor et al.
(2016)](http://arxiv.org/abs/1603.06042)* is simply a SyntaxNet model with a
larger beam and network. For further information on the datasets, see that paper
under the section "Treebank Union".
Parsey McParseface is also state-of-the-art for part-of-speech (POS) tagging
(numbers below are per-token accuracy):
Model | News | Web | Questions
-------------------------------------------------------------------------- | :---: | :---: | :-------:
[Ling et al. (2015)](http://www.cs.cmu.edu/~lingwang/papers/emnlp2015.pdf) | 97.44 | 94.03 | 96.18
[Andor et al. (2016)](http://arxiv.org/abs/1603.06042)* | 97.77 | 94.80 | 96.86
Parsey McParseface | 97.52 | 94.24 | 96.45
#### Parsing from Standard Input
Simply pass one sentence per line of text into the script at
`syntaxnet/demo.sh`. The script will break the text into words, run the POS
tagger, run the parser, and then generate an ASCII version of the parse tree:
```shell
echo 'Bob brought the pizza to Alice.' | syntaxnet/demo.sh
Input: Bob brought the pizza to Alice .
Parse:
brought VBD ROOT
+-- Bob NNP nsubj
+-- pizza NN dobj
| +-- the DT det
+-- to IN prep
| +-- Alice NNP pobj
+-- . . punct
```
The ASCII tree shows the text organized as in the parse, not left-to-right as
visualized in our tutorial graphs. In this example, we see that the verb
"brought" is the root of the sentence, with the subject "Bob", the object
"pizza", and the prepositional phrase "to Alice".
If you want to feed in tokenized, CONLL-formatted text, you can run `demo.sh
--conll`.
#### Annotating a Corpus
To change the pipeline to read and write to specific files (as opposed to piping
through stdin and stdout), we have to modify the `demo.sh` to point to the files
we want. The SyntaxNet models are configured via a combination of run-time flags
(which are easy to change) and a text format `TaskSpec` protocol buffer. The
spec file used in the demo is in
`syntaxnet/models/parsey_mcparseface/context.pbtxt`.
To use corpora instead of stdin/stdout, we have to:
1. Create or modify an `input` field inside the `TaskSpec`, with the
`file_pattern` specifying the location we want. If the input corpus is in
CONLL format, make sure to put `record_format: 'conll-sentence'`.
1. Change the `--input` and/or `--output` flag to use the name of the resource
as the output, instead of `stdin` and `stdout`.
E.g., if we wanted to POS tag the CONLL corpus `./wsj.conll`, we would create
two entries, one for the input and one for the output:
```proto
input {
name: 'wsj-data'
record_format: 'conll-sentence'
Part {
file_pattern: './wsj.conll'
}
}
input {
name: 'wsj-data-tagged'
record_format: 'conll-sentence'
Part {
file_pattern: './wsj-tagged.conll'
}
}
```
Then we can use `--input=wsj-data --output=wsj-data-tagged` on the command line
to specify reading and writing to these files.
#### Configuring the Python Scripts
As mentioned above, the python scripts are configured in two ways:
1. **Run-time flags** are used to point to the `TaskSpec` file, switch between
inputs for reading and writing, and set various run-time model parameters.
At training time, these flags are used to set the learning rate, hidden
layer sizes, and other key parameters.
1. The **`TaskSpec` proto** stores configuration about the transition system,
the features, and a set of named static resources required by the parser. It
is specified via the `--task_context` flag. A few key notes to remember:
- The `Parameter` settings in the `TaskSpec` have a prefix: either
`brain_pos` (they apply to the tagger) or `brain_parser` (they apply to
the parser). The `--prefix` run-time flag switches between reading from
the two configurations.
- The resources will be created and/or modified during multiple stages of
training. As described above, the resources can also be used at
evaluation time to read or write to specific files. These resources are
also separate from the model parameters, which are saved separately via
calls to TensorFlow ops, and loaded via the `--model_path` flag.
- Because the `TaskSpec` contains file paths, remember that copying around
this file is not enough to relocate a trained model: you need to move
and update all the paths as well.
Note that some run-time flags need to be consistent between training and testing
(e.g. the number of hidden units).
### Next Steps
There are many ways to extend this framework, e.g. adding new features, changing
the model structure, training on other languages, etc. We suggest reading the
detailed tutorial below to get a handle on the rest of the framework.
## Contact
To ask questions or report issues please post on Stack Overflow with the tag
[syntaxnet](http://stackoverflow.com/questions/tagged/syntaxnet) or open an
issue on the tensorflow/models [issues
tracker](https://github.com/tensorflow/models/issues). Please assign SyntaxNet
issues to @calberti or @andorardo.
## Credits
Original authors of the code in this package include (in alphabetical order):
* Alessandro Presta
* Aliaksei Severyn
* Andy Golding
* Bernd Bohnet
* Chayut Thanapirom
* Chris Alberti
* Daniel Andor
* David Weiss
* Emily Pitler
* Greg Coppola
* Ivan Bogatyy
* Ji Ma
* Keith Hall
* Kuzman Ganchev
* Lingpeng Kong
* Livio Baldini Soares
* Mark Omernick
* Michael Collins
* Michael Ringgaard
* Ryan McDonald
* Slav Petrov
* Stefan Istrate
* Terry Koo
* Tim Credo
* Zora Tung
# Bazel WORKSPACE for SyntaxNet / DRAGNN.
# TensorFlow is vendored as a git submodule in ./tensorflow and exposed to
# Bazel as the external repository @org_tensorflow.
local_repository(
name = "org_tensorflow",
path = "tensorflow",
)
# We need to pull in @io_bazel_rules_closure for TensorFlow. Bazel design
# documentation states that this verbosity is intentional, to prevent
# TensorFlow/SyntaxNet from depending on different versions of
# @io_bazel_rules_closure.
# The archive is pinned to commit 08039ba8 and verified by sha256; the Google
# mirror URL is listed first, with GitHub as fallback.
http_archive(
name = "io_bazel_rules_closure",
sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
urls = [
"http://bazel-mirror.storage.googleapis.com/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
"https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
],
)
# Pull in TensorFlow's own transitive workspace dependencies.
load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace")
tf_workspace(
path_prefix = "",
tf_repo_name = "org_tensorflow",
)
# SLING, pinned to commit e3ae9d94 and verified by sha256 (same mirror-first
# URL scheme as above).
http_archive(
name = "sling",
sha256 = "f1ce597476cb024808ca0a371a01db9dda4e0c58fb34a4f9c4ea91796f437b10",
strip_prefix = "sling-e3ae9d94eb1d9ee037a851070d54ed2eefaa928a",
urls = [
"http://bazel-mirror.storage.googleapis.com/github.com/google/sling/archive/e3ae9d94eb1d9ee037a851070d54ed2eefaa928a.tar.gz",
"https://github.com/google/sling/archive/e3ae9d94eb1d9ee037a851070d54ed2eefaa928a.tar.gz",
],
)
# Used by SLING.
bind(
name = "zlib",
actual = "@zlib_archive//:zlib",
)
# Test image: layers a fresh snapshot of the DRAGNN / SyntaxNet sources on top
# of the pre-built test base image, which already contains a configured
# TensorFlow build (see the test-base Dockerfile).
FROM dragnn-oss-test-base:latest
# Delete the source copies baked into the base image so the COPY instructions
# below provide a clean, current snapshot.
RUN rm -rf \
$SYNTAXNETDIR/syntaxnet/dragnn \
$SYNTAXNETDIR/syntaxnet/syntaxnet \
$SYNTAXNETDIR/syntaxnet/third_party \
$SYNTAXNETDIR/syntaxnet/util/utf8
# Re-copy the current sources and WORKSPACE from the build context.
COPY dragnn $SYNTAXNETDIR/syntaxnet/dragnn
COPY syntaxnet $SYNTAXNETDIR/syntaxnet/syntaxnet
COPY third_party $SYNTAXNETDIR/syntaxnet/third_party
COPY util/utf8 $SYNTAXNETDIR/syntaxnet/util/utf8
COPY WORKSPACE $SYNTAXNETDIR/syntaxnet/WORKSPACE
# Test base image: Ubuntu 16.04 with all build dependencies, Bazel 0.11.1, and
# a configured TensorFlow checkout (branch r1.8), so test images built on top
# only need to re-copy the DRAGNN / SyntaxNet sources.
FROM ubuntu:16.04
ENV SYNTAXNETDIR=/opt/tensorflow PATH=$PATH:/root/bin
# Install system packages. This doesn't include everything the TensorFlow
# dockerfile specifies, so if anything goes awry, maybe install more packages
# from there. Also, running apt-get clean before further commands will make the
# Docker images smaller.
RUN mkdir -p $SYNTAXNETDIR \
&& cd $SYNTAXNETDIR \
&& apt-get update \
&& apt-get install -y \
file \
git \
graphviz \
libcurl3-dev \
libfreetype6-dev \
libgraphviz-dev \
liblapack-dev \
libopenblas-dev \
libpng-dev \
libxft-dev \
openjdk-8-jdk \
python-dev \
python-mock \
python-pip \
python2.7 \
swig \
unzip \
vim \
wget \
zlib1g-dev \
&& apt-get clean \
&& (rm -f /var/cache/apt/archives/*.deb \
/var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true)
# Install common Python dependencies. Similar to above, remove caches
# afterwards to help keep Docker images smaller.
RUN pip install --ignore-installed pip \
&& python -m pip install numpy \
&& rm -rf /root/.cache/pip /tmp/pip*
# pygraphviz needs the graphviz headers/libraries installed above, hence the
# explicit --install-option include/library paths.
RUN python -m pip install \
asciitree \
ipykernel \
jupyter \
matplotlib \
pandas \
protobuf \
scipy \
sklearn \
&& python -m ipykernel.kernelspec \
&& python -m pip install pygraphviz \
--install-option="--include-path=/usr/include/graphviz" \
--install-option="--library-path=/usr/lib/graphviz/" \
&& python -m jupyter_core.command nbextension enable \
--py --sys-prefix widgetsnbextension \
&& rm -rf /root/.cache/pip /tmp/pip*
# Installs Bazel. Unlike the main Dockerfile, JAVA_HOME is set explicitly for
# the installer here.
RUN wget --quiet https://github.com/bazelbuild/bazel/releases/download/0.11.1/bazel-0.11.1-installer-linux-x86_64.sh \
&& chmod +x bazel-0.11.1-installer-linux-x86_64.sh \
&& JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ ./bazel-0.11.1-installer-linux-x86_64.sh \
&& rm ./bazel-0.11.1-installer-linux-x86_64.sh
COPY WORKSPACE $SYNTAXNETDIR/syntaxnet/WORKSPACE
COPY tools/bazel.rc $SYNTAXNETDIR/syntaxnet/tools/bazel.rc
# Compile common TensorFlow targets, which don't depend on DRAGNN / SyntaxNet
# source. This makes it more convenient to re-compile DRAGNN / SyntaxNet for
# development (though not as convenient as the docker-devel scripts).
# Note: TensorFlow is cloned from GitHub (branch r1.8) here rather than copied
# from the build context as in the main Dockerfile.
RUN cd $SYNTAXNETDIR/syntaxnet \
&& git clone --branch r1.8 --recurse-submodules https://github.com/tensorflow/tensorflow \
&& cd tensorflow \
&& tensorflow/tools/ci_build/builds/configured CPU \
&& cd $SYNTAXNETDIR/syntaxnet \
&& bazel build -c opt @org_tensorflow//tensorflow:tensorflow_py
# Just copy the code and run tests. The build and test flags differ enough that
# doing a normal build of TensorFlow targets doesn't save much test time.
WORKDIR $SYNTAXNETDIR/syntaxnet
COPY dragnn $SYNTAXNETDIR/syntaxnet/dragnn
COPY syntaxnet $SYNTAXNETDIR/syntaxnet/syntaxnet
COPY third_party $SYNTAXNETDIR/syntaxnet/third_party
COPY util/utf8 $SYNTAXNETDIR/syntaxnet/util/utf8
# Doesn't matter if the tests pass or not, since we're going to re-copy over the
# code.
RUN bazel test -c opt ... || true
# You need to build wheels before building this image. Please consult
# docker-devel/README.txt.
#
# It might be more efficient to use a minimal distribution, like Alpine. But
# the upside of this being popular is that people might already have it.
FROM ubuntu:16.04

ENV SYNTAXNETDIR=/opt/tensorflow PATH=$PATH:/root/bin

# Runtime (non -dev) system libraries only: this image installs a pre-built
# wheel, so no compiler toolchain, Bazel, or development headers are needed
# (libgraphviz-dev is the exception — pygraphviz builds against it).
RUN apt-get update \
    && apt-get install -y \
        file \
        git \
        graphviz \
        libcurl3 \
        libfreetype6 \
        libgraphviz-dev \
        liblapack3 \
        libopenblas-base \
        libpng16-16 \
        libxft2 \
        python-dev \
        python-mock \
        python-pip \
        python2.7 \
        zlib1g-dev \
    && apt-get clean \
    && (rm -f /var/cache/apt/archives/*.deb \
        /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true)

# Install common Python dependencies. Similar to above, remove caches
# afterwards to help keep Docker images smaller.
RUN pip install --ignore-installed pip \
    && python -m pip install numpy \
    && rm -rf /root/.cache/pip /tmp/pip*

RUN python -m pip install \
        asciitree \
        ipykernel \
        jupyter \
        matplotlib \
        pandas \
        protobuf \
        scipy \
        sklearn \
    && python -m ipykernel.kernelspec \
    && python -m pip install pygraphviz \
        --install-option="--include-path=/usr/include/graphviz" \
        --install-option="--library-path=/usr/lib/graphviz/" \
    && python -m jupyter_core.command nbextension enable \
        --py --sys-prefix widgetsnbextension \
    && rm -rf /root/.cache/pip /tmp/pip*

# Install the pre-built SyntaxNet + TensorFlow wheel (produced by
# docker-devel/build_wheels.sh).
COPY syntaxnet_with_tensorflow-0.2-cp27-cp27mu-linux_x86_64.whl $SYNTAXNETDIR/
RUN python -m pip install \
    $SYNTAXNETDIR/syntaxnet_with_tensorflow-0.2-cp27-cp27mu-linux_x86_64.whl \
    && rm -rf /root/.cache/pip /tmp/pip*

# This makes the IP exposed actually "*"; we'll do host restrictions by passing
# a hostname to the `docker run` command.
COPY tensorflow/tensorflow/tools/docker/jupyter_notebook_config.py /root/.jupyter/
EXPOSE 8888

# This does not need to be compiled, only copied.
COPY examples $SYNTAXNETDIR/syntaxnet/examples

# For some reason, this works if we run it in a bash shell :/ :/ :/
# Exec (JSON) form so bash itself is PID 1 — no extra /bin/sh wrapper — and
# receives signals from `docker stop` directly (hadolint DL3025).
CMD ["/bin/bash", "-c", "python -m jupyter_core.command notebook --debug --notebook-dir=/opt/tensorflow/syntaxnet/examples --allow-root"]
Docker is used for packaging the SyntaxNet. There are three primary things we
build with Docker,
1. A development image, which contains all source built with Bazel.
2. Python/pip wheels, built by running a command in the development container.
3. A minified image, which only has the compiled version of TensorFlow and
SyntaxNet, by installing the wheel built by the above step.
Important info (please read)
------------------------------
One thing to be wary of is that YOU CAN LOSE DATA IF YOU DEVELOP IN A DOCKER
CONTAINER. Please be very careful to mount data you care about to Docker
volumes, or use a volume mount so that it's mapped to your host filesystem.
Another note, especially relevant to training models, is that Docker sends the
whole source tree to the Docker daemon every time you try to build an image.
This can take some time if you have large temporary model files lying around.
You can exclude your model files by editing .dockerignore, or just don't store
them in the base directory.
Step 1: Building the development image
------------------------------
Simply run `docker build -t dragnn-oss .` in the base directory. Make sure you
have all the source checked out correctly, including git submodules.
Step 2: Building wheels
------------------------------
Please run,
bash ./docker-devel/build_wheels.sh
This actually builds the image from Step 1 as well.
Step 3: Building the minified image
------------------------------
First, ensure you have the file
syntaxnet_with_tensorflow-0.2-cp27-cp27mu-linux_x86_64.whl
in your working directory, from step 2. Then run,
docker build -t dragnn-oss:latest-minimal -f docker-devel/Dockerfile.min .
If the filename changes (e.g. you are on a different architecture), just update
Dockerfile.min.
Developing in Docker
------------------------------
We recommend developing in Docker by using the `./docker-devel/build_devel.sh`
script; it will set up a few volume mounts, and port mappings automatically.
You may want to add more port mappings on your own. If you want to drop into a
shell instead of launching the notebook, simply run,
./docker-devel/build_devel.sh /bin/bash
#!/bin/bash
#
# Drops you into a Docker sub-shell where you can build SyntaxNet targets.
# Intended for development: the Dockerfile (build file) does not actually
# build any of SyntaxNet; the source trees are volume-mounted instead.

# Resolve the repository root: the parent of the directory holding this script.
this_script="$(readlink -f "$0")"
repo_root="$(dirname "$(dirname "${this_script}")")"

set -e

# Build the dragnn-oss image only when it does not exist yet.
if [[ -n "$(docker images -q dragnn-oss)" ]]; then
  echo "NOTE: dragnn-oss image already exists, not re-building." >&2
  echo "Please run \`docker build -t dragnn-oss .\` if you need." >&2
else
  docker build -t dragnn-oss .
fi

echo -e "\n\nRun bazel commands like \`bazel test syntaxnet/...\`"

# NOTE: Unfortunately, we need to mount /tensorflow over /syntaxnet/tensorflow
# (which happens via devel_entrypoint.sh). This requires privileged mode.
mount_base="/opt/tensorflow/syntaxnet"
docker run --rm -ti \
  -v "${repo_root}"/syntaxnet:"${mount_base}"/syntaxnet \
  -v "${repo_root}"/dragnn:"${mount_base}"/dragnn \
  -v "${repo_root}"/examples:"${mount_base}"/examples \
  -p 127.0.0.1:8888:8888 \
  dragnn-oss "$@"
#!/bin/bash
#
# Convenience script to build wheel files in Docker, and copy them out of the
# container.
#
# Usage: docker-devel/build_wheels.sh (takes no arguments; run it from the base
# directory).
set -e
# (Re)build the development image; the wheels are built inside it.
docker build -t dragnn-oss .
# Start building the wheels.
# Two invocations: one wheel without TensorFlow, one bundling it
# (--include-tensorflow).
script="bazel run //dragnn/tools:build_pip_package \
-- --output-dir=/opt/tensorflow/syntaxnet; \
bazel run //dragnn/tools:build_pip_package \
-- --output-dir=/opt/tensorflow/syntaxnet --include-tensorflow"
# Run detached so we can wait on completion and read the logs afterwards.
container_id="$(docker run -d dragnn-oss /bin/bash -c "${script}")"
echo "Waiting for container ${container_id} to finish building the wheel ..."
# `docker wait` blocks until the container exits and prints its exit status.
if [[ "$(docker wait "${container_id}")" != 0 ]]; then
echo "Container failed! Please run \`docker logs <id>\` to see errors." >&2
exit 1
fi
# The build_pip_package.py script prints lines like "Wrote x.whl". The wheel
# names are prefixed by architecture and such, so don't guess them.
# The unquoted command substitution is intentional: word splitting yields one
# array element per wheel path.
wheels=(
$(docker logs "${container_id}" 2>/dev/null | grep Wrote | awk '{print $2;}'))
# Copy each wheel out of the stopped container into the current directory.
for wheel in "${wheels[@]}"; do
output=./"$(basename "${wheel}")"
docker cp "${container_id}:${wheel}" "${output}"
echo "Wrote ${output} ($(du -h "${output}" | awk '{print $1;}'))"
done
echo "Removing ${container_id} ..."
docker rm "${container_id}" >/dev/null
# Visibility group for packages allowed to depend on DRAGNN internals.
# Currently empty; add package paths here to grant access.
package_group(
name = "dragnn_visibility",
packages = [
],
)
package(
default_visibility = ["//visibility:public"],
features = ["-layering_check"],
)
# Component that forwards the previous component's transition states instead
# of creating its own (see stateless_component.cc). alwayslink = 1 keeps the
# registry registration linked in even though no target references its symbols
# directly.
cc_library(
name = "stateless_component",
srcs = ["stateless_component.cc"],
deps = [
"//dragnn/core:component_registry",
"//dragnn/core/interfaces:component",
"//dragnn/core/interfaces:transition_state",
"//dragnn/core/util:label",
"//dragnn/protos:data_proto_cc",
"//syntaxnet:base",
],
alwayslink = 1,
)
# Unit test for the stateless component.
cc_test(
name = "stateless_component_test",
srcs = ["stateless_component_test.cc"],
deps = [
":stateless_component",
"//dragnn/core:component_registry",
"//dragnn/core:input_batch_cache",
"//dragnn/core/test:generic",
"//dragnn/core/test:mock_transition_state",
"//dragnn/io:sentence_input_batch",
"//syntaxnet:base",
"//syntaxnet:sentence_proto_cc",
"//syntaxnet:test_main",
],
)
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/core/component_registry.h"
#include "dragnn/core/interfaces/component.h"
#include "dragnn/core/interfaces/transition_state.h"
#include "dragnn/core/util/label.h"
#include "dragnn/protos/data.pb.h"
#include "syntaxnet/base.h"
namespace syntaxnet {
namespace dragnn {
namespace {
// A component that does not create its own transition states; instead, it
// simply forwards the states of the previous component. Requires that some
// previous component has converted the input batch. Does not support all
// methods. Intended for "compute-only" bulk components that only use linked
// features, which use only a small subset of DRAGNN functionality.
class StatelessComponent : public Component {
 public:
  // Captures the component name from |spec|; no other configuration is read.
  void InitializeComponent(const ComponentSpec &spec) override {
    name_ = spec.name();
  }
  // Stores the |parent_states| for forwarding to downstream components.
  // |input_data| is only used to determine the batch size.
  void InitializeData(
      const std::vector<std::vector<const TransitionState *>> &parent_states,
      int max_beam_size, InputBatchCache *input_data) override {
    batch_size_ = input_data->Size();
    beam_size_ = max_beam_size;
    parent_states_ = parent_states;
    // The beam should be wide enough for the previous component.
    for (const auto &beam : parent_states) {
      CHECK_LE(beam.size(), beam_size_);
    }
  }
  // Forwards the states of the previous component.
  std::vector<std::vector<const TransitionState *>> GetBeam() override {
    return parent_states_;
  }
  // Forwards the |current_index| to the previous component (identity mapping).
  int GetSourceBeamIndex(int current_index, int batch) const override {
    return current_index;
  }
  // Trivial accessors.  This component takes no transitions of its own, so it
  // is always ready and terminal, and takes zero steps per batch item.
  string Name() const override { return name_; }
  int BeamSize() const override { return beam_size_; }
  int BatchSize() const override { return batch_size_; }
  int StepsTaken(int batch_index) const override { return 0; }
  bool IsReady() const override { return true; }
  bool IsTerminal() const override { return true; }
  // Lifecycle and tracing hooks are no-ops: there is no local state to
  // finalize, reset, or trace.
  void FinalizeData() override {}
  void ResetComponent() override {}
  void InitializeTracing() override {}
  void DisableTracing() override {}
  std::vector<std::vector<ComponentTrace>> GetTraceProtos() const override {
    return {};
  }
  // Unsupported methods.  Each of these terminates the process via LOG(FATAL)
  // if called.
  int GetBeamIndexAtStep(int step, int current_index,
                         int batch) const override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
    return 0;
  }
  std::function<int(int, int, int)> GetStepLookupFunction(
      const string &method) override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
    return nullptr;
  }
  bool AdvanceFromPrediction(const float *transition_matrix, int num_items,
                             int num_actions) override {
    LOG(FATAL) << "[" << name_ << "] AdvanceFromPrediction not supported";
  }
  void AdvanceFromOracle() override {
    LOG(FATAL) << "[" << name_ << "] AdvanceFromOracle not supported";
  }
  std::vector<std::vector<std::vector<Label>>> GetOracleLabels()
      const override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
  }
  int GetFixedFeatures(std::function<int32 *(int)> allocate_indices,
                       std::function<int64 *(int)> allocate_ids,
                       std::function<float *(int)> allocate_weights,
                       int channel_id) const override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
  }
  int BulkGetFixedFeatures(const BulkFeatureExtractor &extractor) override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
  }
  void BulkEmbedFixedFeatures(
      int batch_size_padding, int num_steps_padding, int output_array_size,
      const vector<const float *> &per_channel_embeddings,
      float *embedding_output) override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
  }
  void BulkEmbedDenseFixedFeatures(
      const vector<const float *> &per_channel_embeddings,
      float *embedding_output, int embedding_output_size,
      int32 *offset_array_output, int offset_array_size) override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
  }
  int BulkDenseFeatureSize() const override {
    LOG(FATAL) << "Method not supported";
  }
  std::vector<LinkFeatures> GetRawLinkFeatures(int channel_id) const override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
  }
  void AddTranslatedLinkFeaturesToTrace(
      const std::vector<LinkFeatures> &features, int channel_id) override {
    LOG(FATAL) << "[" << name_ << "] Method not supported";
  }
 private:
  string name_;         // component name
  int batch_size_ = 1;  // number of sentences in current batch
  int beam_size_ = 1;   // maximum beam size
  // Parent states passed to InitializeData(), and passed along in GetBeam().
  std::vector<std::vector<const TransitionState *>> parent_states_;
};
// Makes the component available via Component::Create("StatelessComponent").
REGISTER_DRAGNN_COMPONENT(StatelessComponent);
} // namespace
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/core/component_registry.h"
#include "dragnn/core/input_batch_cache.h"
#include "dragnn/core/test/generic.h"
#include "dragnn/core/test/mock_transition_state.h"
#include "dragnn/io/sentence_input_batch.h"
#include "dragnn/protos/data.pb.h"
#include "syntaxnet/base.h"
#include "syntaxnet/sentence.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace {
// Three-token test sentence ("Sentence 0 .") in Sentence textproto format.
const char kSentence0[] = R"(
token {
  word: "Sentence" start: 0 end: 7 tag: "NN" category: "NOUN" label: "ROOT"
  break_level: NO_BREAK
}
token {
  word: "0" start: 9 end: 9 head: 0 tag: "CD" category: "NUM" label: "num"
  break_level: SPACE_BREAK
}
token {
  word: "." start: 10 end: 10 head: 0 tag: "." category: "." label: "punct"
  break_level: NO_BREAK
}
)";
// Three-token test sentence ("Sentence 1 .") in Sentence textproto format.
const char kSentence1[] = R"(
token {
  word: "Sentence" start: 0 end: 7 tag: "NN" category: "NOUN" label: "ROOT"
  break_level: NO_BREAK
}
token {
  word: "1" start: 9 end: 9 head: 0 tag: "CD" category: "NUM" label: "num"
  break_level: SPACE_BREAK
}
token {
  word: "." start: 10 end: 10 head: 0 tag: "." category: "." label: "punct"
  break_level: NO_BREAK
}
)";
// Five-token test sentence ("Sentence 1 2 3 .") in Sentence textproto format.
const char kLongSentence[] = R"(
token {
  word: "Sentence" start: 0 end: 7 tag: "NN" category: "NOUN" label: "ROOT"
  break_level: NO_BREAK
}
token {
  word: "1" start: 9 end: 9 head: 0 tag: "CD" category: "NUM" label: "num"
  break_level: SPACE_BREAK
}
token {
  word: "2" start: 10 end: 10 head: 0 tag: "CD" category: "NUM" label: "num"
  break_level: SPACE_BREAK
}
token {
  word: "3" start: 11 end: 11 head: 0 tag: "CD" category: "NUM" label: "num"
  break_level: SPACE_BREAK
}
token {
  word: "." start: 12 end: 12 head: 0 tag: "." category: "." label: "punct"
  break_level: NO_BREAK
}
)";
// Minimal master spec: a single "shift-only" component whose backend is the
// StatelessComponent under test.
const char kMasterSpec[] = R"(
component {
  name: "test"
  transition_system {
    registered_name: "shift-only"
  }
  linked_feature {
    name: "prev"
    fml: "input.focus"
    embedding_dim: 32
    size: 1
    source_component: "prev"
    source_translator: "identity"
    source_layer: "last_layer"
  }
  backend {
    registered_name: "StatelessComponent"
  }
}
)";
} // namespace
using testing::Return;
// Fixture that builds StatelessComponent instances over serialized Sentence
// batches.
class StatelessComponentTest : public ::testing::Test {
 public:
  // Creates and initializes a StatelessComponent with the given |beam_size|,
  // parent |states|, and serialized input |data|.  The input batch cache is
  // retained in |data_| so it outlives the returned component.
  std::unique_ptr<Component> CreateParser(
      int beam_size,
      const std::vector<std::vector<const TransitionState *>> &states,
      const std::vector<string> &data) {
    MasterSpec spec;
    CHECK(TextFormat::ParseFromString(kMasterSpec, &spec));
    data_.reset(new InputBatchCache(data));
    // Force conversion of the raw input into a SentenceInputBatch.  The
    // stateless component does not use any particular input batch type, and
    // relies on the preceding components to convert the input batch.
    data_->GetAs<SentenceInputBatch>();
    // Instantiate the component via the registry and initialize it from the
    // spec with the requested beam size.
    std::unique_ptr<Component> component(
        Component::Create("StatelessComponent"));
    component->InitializeComponent(spec.component(0));
    component->InitializeData(states, beam_size, data_.get());
    return component;
  }
  std::unique_ptr<InputBatchCache> data_;
};
// Verifies that the component passes its parent states through unchanged and
// reports trivial batch/beam/step statistics.
TEST_F(StatelessComponentTest, ForwardsTransitionStates) {
  MockTransitionState mock_state_1, mock_state_2, mock_state_3;
  const std::vector<std::vector<const TransitionState *>> parent_states = {
      {}, {&mock_state_1}, {&mock_state_2, &mock_state_3}};
  // Serialize one Sentence per beam in |parent_states|.
  std::vector<string> data;
  for (const string &textproto : {kSentence0, kSentence1, kLongSentence}) {
    Sentence sentence;
    CHECK(TextFormat::ParseFromString(textproto, &sentence));
    string serialized;
    CHECK(sentence.SerializeToString(&serialized));
    data.push_back(serialized);
  }
  CHECK_EQ(parent_states.size(), data.size());
  const int kBeamSize = 2;
  auto parser = CreateParser(kBeamSize, parent_states, data);
  EXPECT_TRUE(parser->IsReady());
  EXPECT_TRUE(parser->IsTerminal());
  EXPECT_EQ(kBeamSize, parser->BeamSize());
  EXPECT_EQ(data.size(), parser->BatchSize());
  EXPECT_TRUE(parser->GetTraceProtos().empty());
  for (int batch = 0; batch < parent_states.size(); ++batch) {
    // The component never advances, so no steps are taken.
    EXPECT_EQ(0, parser->StepsTaken(batch));
    const int beam_entries = parent_states[batch].size();
    for (int index = 0; index < beam_entries; ++index) {
      // Expect an identity mapping.
      EXPECT_EQ(index, parser->GetSourceBeamIndex(index, batch));
    }
  }
  // The forwarded beam must be exactly the parent states.
  EXPECT_EQ(parent_states, parser->GetBeam());
}
// Verifies that every Component method the stateless component does not
// implement dies with its "not supported" error when invoked.
TEST_F(StatelessComponentTest, UnimplementedMethodsDie) {
  // No parent states are needed: each unsupported method dies before touching
  // the forwarded beam.  (The unused MockTransitionState locals were removed.)
  const std::vector<std::vector<const TransitionState *>> parent_states;
  std::vector<string> data;
  for (const string &textproto : {kSentence0, kSentence1, kLongSentence}) {
    Sentence sentence;
    CHECK(TextFormat::ParseFromString(textproto, &sentence));
    data.emplace_back();
    CHECK(sentence.SerializeToString(&data.back()));
  }
  const int kBeamSize = 2;
  auto test_parser = CreateParser(kBeamSize, parent_states, data);
  EXPECT_TRUE(test_parser->IsReady());
  EXPECT_DEATH(test_parser->AdvanceFromPrediction({}, 0, 0),
               "AdvanceFromPrediction not supported");
  EXPECT_DEATH(test_parser->AdvanceFromOracle(),
               "AdvanceFromOracle not supported");
  EXPECT_DEATH(test_parser->GetOracleLabels(), "Method not supported");
  // Cover the remaining unsupported methods as well, so a future
  // implementation of any of them must update this test.
  EXPECT_DEATH(test_parser->GetBeamIndexAtStep(0, 0, 0),
               "Method not supported");
  EXPECT_DEATH(test_parser->GetStepLookupFunction("identity"),
               "Method not supported");
  EXPECT_DEATH(test_parser->GetFixedFeatures(nullptr, nullptr, nullptr, 0),
               "Method not supported");
  BulkFeatureExtractor extractor(nullptr, nullptr, nullptr);
  EXPECT_DEATH(test_parser->BulkEmbedFixedFeatures(0, 0, 0, {nullptr}, nullptr),
               "Method not supported");
  EXPECT_DEATH(test_parser->BulkGetFixedFeatures(extractor),
               "Method not supported");
  EXPECT_DEATH(test_parser->BulkEmbedDenseFixedFeatures({nullptr}, nullptr, 0,
                                                        nullptr, 0),
               "Method not supported");
  EXPECT_DEATH(test_parser->BulkDenseFeatureSize(), "Method not supported");
  EXPECT_DEATH(test_parser->GetRawLinkFeatures(0), "Method not supported");
  EXPECT_DEATH(test_parser->AddTranslatedLinkFeaturesToTrace({}, 0),
               "Method not supported");
}
} // namespace dragnn
} // namespace syntaxnet
# Build rules for the SyntaxNet-backed DRAGNN component and its support
# libraries.
package(
    default_visibility = ["//visibility:public"],
    features = ["-layering_check"],
)
# DRAGNN component backed by the SyntaxNet transition system.
cc_library(
    name = "syntaxnet_component",
    srcs = ["syntaxnet_component.cc"],
    hdrs = ["syntaxnet_component.h"],
    deps = [
        ":syntaxnet_link_feature_extractor",
        ":syntaxnet_transition_state",
        "//dragnn/components/util:bulk_feature_extractor",
        "//dragnn/core:beam",
        "//dragnn/core:component_registry",
        "//dragnn/core:input_batch_cache",
        "//dragnn/core/interfaces:component",
        "//dragnn/core/interfaces:transition_state",
        "//dragnn/core/util:label",
        "//dragnn/io:sentence_input_batch",
        "//dragnn/io:syntaxnet_sentence",
        "//dragnn/protos:data_proto_cc",
        "//dragnn/protos:spec_proto_cc",
        "//dragnn/protos:trace_proto_cc",
        "//syntaxnet:base",
        "//syntaxnet:parser_transitions",
        "//syntaxnet:registry",
        "//syntaxnet:sparse_proto_cc",
        "//syntaxnet:task_context",
        "//syntaxnet:task_spec_proto_cc",
        "//syntaxnet:utils",
        "//util/utf8:unicodetext",
    ],
    # Keep the component linked in so registry-based Component::Create() works.
    alwayslink = 1,
)
# Extractor for linked (cross-component) features of the SyntaxNet component.
cc_library(
    name = "syntaxnet_link_feature_extractor",
    srcs = ["syntaxnet_link_feature_extractor.cc"],
    hdrs = ["syntaxnet_link_feature_extractor.h"],
    deps = [
        "//dragnn/protos:spec_proto_cc",
        "//syntaxnet:base",
        "//syntaxnet:embedding_feature_extractor",
        "//syntaxnet:parser_transitions",
        "//syntaxnet:task_context",
    ],
)
# Transition state wrapping a SyntaxNet parser state for use in DRAGNN beams.
cc_library(
    name = "syntaxnet_transition_state",
    srcs = ["syntaxnet_transition_state.cc"],
    hdrs = ["syntaxnet_transition_state.h"],
    deps = [
        "//dragnn/core/interfaces:cloneable_transition_state",
        "//dragnn/core/interfaces:transition_state",
        "//dragnn/io:syntaxnet_sentence",
        "//dragnn/protos:trace_proto_cc",
        "//syntaxnet:base",
        "//syntaxnet:parser_transitions",
    ],
)
# Test data: all files under testdata/, consumed by the tests below.
filegroup(
    name = "testdata",
    data = glob(["testdata/**"]),
)
# Tests.
# Unit test for the SyntaxNet-backed DRAGNN component.
cc_test(
    name = "syntaxnet_component_test",
    srcs = ["syntaxnet_component_test.cc"],
    data = [":testdata"],
    deps = [
        ":syntaxnet_component",
        "//dragnn/core:input_batch_cache",
        "//dragnn/core/test:generic",
        "//dragnn/core/test:mock_transition_state",
        "//dragnn/io:sentence_input_batch",
        "//syntaxnet:base",
        "//syntaxnet:sentence_proto_cc",
        "//syntaxnet:test_main",
    ],
)
# Unit test for the linked feature extractor.
cc_test(
    name = "syntaxnet_link_feature_extractor_test",
    srcs = ["syntaxnet_link_feature_extractor_test.cc"],
    deps = [
        ":syntaxnet_link_feature_extractor",
        "//dragnn/core/test:generic",
        "//dragnn/protos:spec_proto_cc",
        "//syntaxnet:task_context",
        "//syntaxnet:test_main",
    ],
)
# Unit test for the SyntaxNet transition state wrapper.
cc_test(
    name = "syntaxnet_transition_state_test",
    srcs = ["syntaxnet_transition_state_test.cc"],
    data = [":testdata"],
    deps = [
        ":syntaxnet_component",
        ":syntaxnet_transition_state",
        "//dragnn/core:input_batch_cache",
        "//dragnn/core/test:generic",
        "//dragnn/core/test:mock_transition_state",
        "//dragnn/io:sentence_input_batch",
        "//dragnn/protos:spec_proto_cc",
        "//syntaxnet:base",
        "//syntaxnet:sentence_proto_cc",
        "//syntaxnet:test_main",
    ],
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment