Commit a449d312 authored by Mostofa Patwary

added readme

parent 59372322
@@ -44,3 +44,12 @@ python remove_group_duplicates.py <file containing similar documents> <cleaned deduped data file>
shuf <cleaned deduped data file> -o train_data.json
```
# Deduplicating ngrams
To deduplicate the downstream tasks from the training dataset, we run the following command.
```
python filter_ngrams.py <downstream task dataset> <training dataset to deduplicate> <output training dataset>
```
We use 13-grams for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters on each side of it. We also drop any resulting split shorter than 200 characters, and any document that gets split more than 10 times.
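For reference, here is a minimal sketch of this split-and-filter step. The function and constant names and the exact window arithmetic are assumptions for illustration, not necessarily what filter_ngrams.py does:
```
import re

NGRAM_SIZE = 13          # words per ngram (assumed constant name)
SPLIT_WINDOW = 200       # characters removed on each side of a match
MIN_SPLIT_LEN = 200      # splits shorter than this are dropped
MAX_SPLITS = 10          # documents split more often than this are dropped


def get_words(text):
    # lowercase words plus their character offsets in the original text
    words, positions = [], []
    for match in re.finditer(r'\w+', text.lower()):
        words.append(match.group(0))
        positions.append(match.start())
    return words, positions


def remove_ngrams(text, banned_ngrams):
    # Cut out every banned 13-gram plus SPLIT_WINDOW characters on each
    # side of it, splitting the document at each cut.
    words, positions = get_words(text)
    pieces, start, splits, i = [], 0, 0, 0
    while i + NGRAM_SIZE <= len(words):
        ngram = ' '.join(words[i:i + NGRAM_SIZE])
        if ngram in banned_ngrams:
            match_start = positions[i]
            match_end = (positions[i + NGRAM_SIZE - 1]
                         + len(words[i + NGRAM_SIZE - 1]))
            pieces.append(text[start:max(start, match_start - SPLIT_WINDOW)])
            start = match_end + SPLIT_WINDOW
            splits += 1
            if splits > MAX_SPLITS:
                return []      # document got split too many times: drop it
            i += NGRAM_SIZE    # resume scanning after the removed match
        else:
            i += 1
    pieces.append(text[start:])
    # keep only splits that are still long enough
    return [piece for piece in pieces if len(piece) >= MIN_SPLIT_LEN]
```
Here `banned_ngrams` would be the set of space-joined, lowercased 13-grams extracted from the downstream task datasets.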
@@ -13,6 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Deduplicate downstream tasks from the training dataset using 13-grams.
Split documents shorter than 200 characters are filtered out, as is any
document that has been split more than 10 times.
"""
from functools import partial
import json
import multiprocessing
@@ -23,6 +29,7 @@ import sys
import time
def get_words(text):
    # get all the lowercase words from text, with their start positions
    words, positions = [], []
    for match in re.finditer(r'\w+', text.lower()):
        words.append(match.group(0))
        positions.append(match.start())
    return words, positions
@@ -31,6 +38,8 @@ def get_words(text):
def free_ngram(line, ngrams, ngram_size, filter_text_len,
               splits_count, split_window_each_size):
    # remove all the matching ngrams from the document on this json line
    try:
        myjson = json.loads(line)
        text_buf = [myjson['text']]
......