Fixed links to evaluation data in makefile (#5402)

eb370577 · Matthias Winkelmann · Chris Waterson · aec1fec6 · eb370577 · eb370577
Commit eb370577 authored Oct 05, 2018 by Matthias Winkelmann Committed by Chris Waterson Oct 04, 2018
5 changed files
--- a/research/swivel/.gitignore
+++ b/research/swivel/.gitignore
@@ -6,7 +6,9 @@ Mtruk.csv
 SimLex-999.zip
 analogy
 fastprep
-myz_naacl13_test_set.tgz
+*.dSYM
 questions-words.txt
+word_relationship.*
+tensorflow/
 rw.zip
 ws353simrel.tar.gz
--- a/research/swivel/README.md
+++ b/research/swivel/README.md
@@ -155,10 +155,10 @@ You can do some simple exploration using `nearest.py`:
    ...

 To evaluate the embeddings using common word similarity and analogy datasets,
-use `eval.mk` to retrieve the data sets and build the tools:
+use `eval.mk` to retrieve the data sets and build the tools. Note that `wordsim.py` is currently not compatible with Python 3.x.

    make -f eval.mk
-    ./wordsim.py -v vocab.txt -e vecs.bin *.ws.tab
+    ./wordsim.py --vocab vocab.txt --embeddings vecs.bin *.ws.tab
    ./analogy --vocab vocab.txt --embeddings vecs.bin *.an.tab

 The word similarity evaluation compares the embeddings' estimate of "similarity"

--- a/research/swivel/eval.mk
+++ b/research/swivel/eval.mk
@@ -59,9 +59,9 @@ simlex999.ws.tab: SimLex-999.zip
 mikolov.an.tab: questions-words.txt
 	egrep -v -E '^:' $^ | tr '[A-Z] ' '[a-z]\t' > $@

-msr.an.tab: myz_naacl13_test_set.tgz
-	tar Oxfz $^ test_set/word_relationship.questions | tr ' ' '\t' > /tmp/q
-	tar Oxfz $^ test_set/word_relationship.answers | cut -f2 -d ' ' > /tmp/a
+msr.an.tab: word_relationship.questions word_relationship.answers
+	tr ' ' '\t' < word_relationship.questions > /tmp/q
+	cut -f2 -d ' ' word_relationship.answers > /tmp/a
 	paste /tmp/q /tmp/a > $@
 	rm -f /tmp/q /tmp/a

@@ -75,7 +75,7 @@ MEN.tar.gz:
 	wget http://clic.cimec.unitn.it/~elia.bruni/resources/MEN.tar.gz

 Mtruk.csv:
-	wget http://tx.technion.ac.il/~kirar/files/Mtruk.csv
+	wget http://www.kiraradinsky.com/files/Mtruk.csv

 rw.zip:
 	wget http://www-nlp.stanford.edu/~lmthang/morphoNLM/rw.zip
@@ -84,10 +84,13 @@ SimLex-999.zip:
 	wget http://www.cl.cam.ac.uk/~fh295/SimLex-999.zip

 questions-words.txt:
-	wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt
+	wget http://download.tensorflow.org/data/questions-words.txt

-myz_naacl13_test_set.tgz:
-	wget http://research.microsoft.com/en-us/um/people/gzweig/Pubs/myz_naacl13_test_set.tgz
+word_relationship.questions:
+	wget https://github.com/darshanhegde/SNLPProject/raw/master/word2vec/eval/word_relationship.questions
+
+word_relationship.answers:
+	wget https://github.com/darshanhegde/SNLPProject/raw/master/word2vec/eval/word_relationship.answers

 analogy: analogy.cc

@@ -95,4 +98,4 @@ clean:
 	rm -f *.ws.tab *.an.tab analogy *.pyc

 distclean: clean
-	rm -f *.tgz *.tar.gz *.zip Mtruk.csv questions-words.txt
+	rm -f *.tgz *.tar.gz *.zip Mtruk.csv questions-words.txt word_relationship.questions word_relationship.answers
--- a/research/swivel/swivel.py
+++ b/research/swivel/swivel.py
--- a/research/swivel/vecs.py
+++ b/research/swivel/vecs.py
@@ -38,7 +38,7 @@ class Vecs(object):
            'unexpected file size for binary vector file %s' % rows_filename)

      # Memory map the rows.
-      dim = size / (4 * n)
+      dim = round(size / (4 * n))
      rows_mm = mmap.mmap(rows_fh.fileno(), 0, prot=mmap.PROT_READ)
      rows = np.matrix(
          np.frombuffer(rows_mm, dtype=np.float32).reshape(n, dim))