"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "1b673fd22b32a8260e5c47b3c11957e33874bb2c"
Commit 27ecc17a authored by Neel Kant's avatar Neel Kant
Browse files

Modify preprocess_data.py to accommodate titles dataset

parent 7bd2a3c5
import argparse import argparse
import itertools
import json import json
import multiprocessing import multiprocessing
import nltk import nltk
...@@ -43,18 +44,28 @@ class Encoder(object): ...@@ -43,18 +44,28 @@ class Encoder(object):
def encode(self, json_line):
    """Tokenize one JSON-lines record into per-sentence lists of token ids.

    The record is parsed with ``json.loads`` and the value stored under
    ``self.args.json_key`` is split into sentences by ``Encoder.splitter``;
    each sentence is tokenized and converted to ids by ``Encoder.tokenizer``.

    Args:
        json_line: One line of the input file holding a JSON object.

    Returns:
        A tuple ``(doc_ids, num_chars)`` where ``doc_ids`` is a list with
        one list of token ids per sentence (collapsed into a single list
        when ``self.args.flatten`` is set and there is more than one
        sentence) and ``num_chars`` is ``len(json_line)``.

    Raises:
        KeyError: If the record has no ``self.args.json_key`` entry.
    """
    text = json.loads(json_line)[self.args.json_key]
    if not text:
        # A falsy value (e.g. an empty string) is replaced by a placeholder
        # before sentence splitting.
        text = "no text"

    tokenizer = Encoder.tokenizer
    placeholder_ids = None  # ids for "no text", computed at most once
    doc_ids = []
    for sentence in Encoder.splitter.tokenize(text):
        ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        if len(ids) == 0:
            # NOTE(review): indentation was lost in the source this was
            # recovered from; this fallback is treated as the `else` of the
            # per-sentence `if len(ids) > 0` check -- confirm.
            print("no ids!", flush=True)
            if placeholder_ids is None:
                placeholder_ids = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize("no text"))
            # Copy so repeated placeholders do not alias one list object.
            ids = list(placeholder_ids)
        doc_ids.append(ids)

    if self.args.flatten and len(doc_ids) > 1:
        # Merge all sentences into a single sequence for the document.
        doc_ids = [list(itertools.chain.from_iterable(doc_ids))]
    return doc_ids, len(json_line)
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, help='Path to input JSON') parser.add_argument('--input', type=str, help='Path to input JSON')
parser.add_argument('--vocab', type=str, help='Path to vocab.txt') parser.add_argument('--vocab', type=str, help='Path to vocab.txt')
parser.add_argument('--flatten', action='store_true', help='Path to input JSON')
parser.add_argument('--json-key', type=str, default='text', parser.add_argument('--json-key', type=str, default='text',
help='Key to extract from json') help='Key to extract from json')
parser.add_argument('--output-prefix', type=str, help='Path to binary output file without suffix') parser.add_argument('--output-prefix', type=str, help='Path to binary output file without suffix')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment