Commit df33344a authored by zihanl

process wow

parent 7d044e4e
@@ -3,4 +3,8 @@ __pycache__
# Distribution / packaging
build/
dist/
*.egg-info/
\ No newline at end of file
*.egg-info/
tensorboard/
commands
*.log
logs
\ No newline at end of file
#!/bin/bash
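# Request an interactive node with 16 GPUs for 2 hours, using the Megatron training
# container image and mounting the GPFS dataset and home directories.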
srun -p batch_short,batch -A gpu_adlr_nlp -t 2:00:00 --nodes=1 --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --job-name=interact --container-mounts=/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl --container-image=gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel --exclusive --pty bash
#!/bin/bash
#SBATCH -p interactive -A gpu_adlr_nlp -t 1:00:00 --nodes=1 --exclusive --mem=0 --overcommit --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --dependency=singleton --job-name=adlr-nlp-largelm:gpt3-357m
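# Pretrain the 357M-parameter GPT model on a single 16-GPU node via sbatch.
# Logs and TensorBoard output are written under the current directory.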
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
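# 24-layer, 1024-hidden, 16-head GPT model (~357M parameters per the job name), 2048-token sequences.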
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="python ${DIR}/pretrain_gpt.py ${options}"
srun -l \
--container-image "gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel" \
--container-mounts "/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
#!/bin/bash
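# Same 357M GPT pretraining configuration as the sbatch script above, but launched
# locally on one 16-GPU node with torch.distributed.launch instead of srun.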
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="${DIR}/pretrain_gpt.py ${options}"
GPUS_PER_NODE=16
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
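# one training process per GPU: 16 GPUs x 1 node = 16 ranks in total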
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS ${run_cmd}
set +x
from src.config import get_params
from transformers import AutoTokenizer
import torch
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import os
wn_lemma = WordNetLemmatizer()
stop_words = stopwords.words('english')
stop_words.append("n't")
stop_words.append("'s")
punctuations = list(string.punctuation)
punctuations.append("``")
punctuations.append("''")
stop_words_and_punctuations = stop_words + punctuations
stop_words_and_punctuations_table = {word: True for word in stop_words_and_punctuations}
label_set = ["O", "B", "I"]
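# BIO-style tag set used by the NER tagger: "O" = outside an entity,
# "B" = beginning of an entity span, "I" = inside an entity span.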
def read_data(input_datapath):
    data = []
    print("Reading data from %s" % input_datapath)
    with open(input_datapath, "r") as f:
        for i, line in enumerate(f):
            line = line.strip()
            splits = line.split("\t")
            length = len(splits)
            assert length == 2 or length == 4
            # length is 2: dialog context + response
            # length is 4: dialog context + topic + control sentence + response
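            # Illustrative 4-field line (hypothetical content; fields are tab-separated):
            #   "Do you like jazz? [SEP] Who started it?\tJazz\tJazz originated in New Orleans.\tI believe it began in New Orleans."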
            if length == 2:
                # dialog context + response
                data.append(line)
            else:
                # only need dialog context + control sentence + response
                data.append(splits[0] + "\t" + splits[2] + "\t" + splits[3])
    return data
def write_data(output_datapath, output_data):
    print("Writing data to %s" % output_datapath)
    with open(output_datapath, "w") as fw:
        for data_sample in output_data:
            fw.write(data_sample + "\n")
def detect_entities(tokenizer, ner_model, sentence):
    tokens = sentence.split()
    token_ids, first_tok_masks = [tokenizer.cls_token_id], [0]
    for token in tokens:
        subs_ = tokenizer.tokenize(token)
        assert len(subs_) > 0
        token_ids.extend(tokenizer.convert_tokens_to_ids(subs_))
        first_tok_masks.extend([1] + [0] * (len(subs_) - 1))
    token_ids.append(tokenizer.sep_token_id)
    first_tok_masks.append(0)
    token_ids = torch.LongTensor([token_ids]).cuda()
    predictions = ner_model(token_ids)
    predictions = predictions[0].data.cpu().numpy()  # (seq_len, 3)
    pred_ids = list(np.argmax(predictions, axis=1))
    assert len(pred_ids) == len(first_tok_masks)
    preds_for_each_word = []
    for pred_id, mask in zip(pred_ids, first_tok_masks):
        if mask == 1:
            preds_for_each_word.append(label_set[pred_id])
    assert len(preds_for_each_word) == len(tokens)
    # extract entities: consecutive B/I tokens form one entity span
    entity_list = []
    temp = []
    for token, pred in zip(tokens, preds_for_each_word):
        if pred == "O":
            if len(temp) > 0:
                entity_list.append(" ".join(temp))
                temp = []
        else:
            # pred == "B" or pred == "I"
            temp.append(token)
    if len(temp) > 0:
        # flush an entity that ends the sentence
        entity_list.append(" ".join(temp))
    return entity_list
def generate_entity_control_data(tokenizer, ner_model, input_data):
    # aim to generate:
    # dialog context + entity control code (optional) + relevant control sentence (contains the entity) + response
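    # Illustrative output line (hypothetical content; control codes are joined with " [CTRL] "):
    #   "Do you like jazz? [SEP] Who started it?\tNew Orleans\tJazz originated in New Orleans.\tI believe it began in New Orleans."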
    output_data = []
    # TODO
    n_skip, n_skip_no_overlap, n_skip_one_contain_another = 0, 0, 0
    n_control, n_entity_control, n_overlap_control = 0, 0, 0
    total_num_control_code = 0
    for sample_idx, data_item in enumerate(tqdm(input_data)):
        # # Debug only
        # if sample_idx > 1000:
        #     break
        # 1. detect entities in the last dialog turn, the control sentence, and the response
        splits = data_item.split("\t")
        if len(splits) == 2:
            output_data.append(data_item)
            continue
        assert len(splits) == 3
        last_turn = splits[0].split(" [SEP] ")[-1]
        control_sent = splits[1]
        response = splits[2]
        if control_sent in response or response in control_sent:
            # if the whole control_sent is part of the response, or vice versa, skip this data sample
            n_skip += 1
            n_skip_one_contain_another += 1
            continue
        last_turn_entities = detect_entities(tokenizer, ner_model, last_turn)
        control_sent_entities = detect_entities(tokenizer, ner_model, control_sent)
        response_entities = detect_entities(tokenizer, ner_model, response)
        # 2. generate the control code:
        # 2.1 If one or more entities are shared by the last turn, the control sentence, and the response, no entity control is needed.
        # 2.2 If an entity appears only in the control sentence and the response, use it as the control code.
        # 2.3 If there is no overlapping entity or word between the control sentence and the response, skip this data sample.
        # 2.4 If there is no overlapping entity but there are overlapping words, add entities from the control sentence (if any) as the control code, provided they are not in the dialog context.
        # TODO:
        # In general, the control sentence should be trimmed when it is too long.
        # Should matching be lowercased?
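        # Illustrative (hypothetical) case for 2.2:
        #   last turn:        "What do you listen to these days?"
        #   control sentence: "Miles Davis was an American jazz trumpeter."
        #   response:         "Mostly Miles Davis records."
        #   "Miles Davis" appears in the control sentence and the response but not in the
        #   last turn, so it becomes the control code.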
        # calculate common entities between the control sentence and the response
        common_entity_list = []
        for ctrl_entity in control_sent_entities:
            for resp_entity in response_entities:
                if resp_entity in ctrl_entity:
                    common_entity_list.append(ctrl_entity)
                    break
                elif ctrl_entity in resp_entity:
                    common_entity_list.append(resp_entity)
                    break
        if len(common_entity_list) == 0:
            # calculate word overlap between the control sentence and the response
            control_word_list = control_sent.split()
            response_word_list = response.split()
            response_word_table = {wn_lemma.lemmatize(word): True for word in response_word_list}
            overlap_phrases = []
            temp = []
            for word in control_word_list:
                if word.lower() in stop_words_and_punctuations_table:
                    continue
                if wn_lemma.lemmatize(word) in response_word_table:
                    temp.append(word)
                else:
                    if len(temp) > 0:
                        if len(temp) > 4:
                            temp = temp[:4]
                        overlap_phrases.append(" ".join(temp))
                        temp = []
            if len(temp) > 0:
                # flush an overlap phrase that ends the control sentence
                if len(temp) > 4:
                    temp = temp[:4]
                overlap_phrases.append(" ".join(temp))
            if len(overlap_phrases) == 0:
                # skip this data sample
                n_skip += 1
                n_skip_no_overlap += 1
                continue
            n_control += 1
            control_code_list = []
            if len(control_sent_entities) > 0:
                n_entity_control += 1
                # sort control_sent_entities by length (longest first)
                control_sent_entities = sorted(control_sent_entities, key=len, reverse=True)
                for entity in control_sent_entities:
                    if entity not in last_turn:
                        add_flag = True
                        for code in control_code_list:
                            if entity in code:
                                add_flag = False
                                break
                        if add_flag:
                            control_code_list.append(entity)
            else:
                n_overlap_control += 1
                # sort overlap_phrases by length (longest first) and keep at most three
                overlap_phrases = sorted(overlap_phrases, key=len, reverse=True)[:3]
                for phrase in overlap_phrases:
                    if phrase not in last_turn:
                        add_flag = True
                        for code in control_code_list:
                            if phrase in code:
                                # drop a phrase that repeats an existing code
                                add_flag = False
                                break
                        if add_flag:
                            control_code_list.append(phrase)
        else:
            n_entity_control += 1
            n_control += 1
            control_code_list = []
            # sort common_entity_list by length (longest first)
            common_entity_list = sorted(common_entity_list, key=len, reverse=True)
            for entity in common_entity_list:
                if entity not in last_turn:
                    add_flag = True
                    for code in control_code_list:
                        if entity in code:
                            add_flag = False
                            break
                    if add_flag:
                        control_code_list.append(entity)
        total_num_control_code += len(control_code_list)
        if len(control_code_list) > 0:
            output_data.append(splits[0] + "\t" + " [CTRL] ".join(control_code_list) + "\t" + control_sent + "\t" + response)
        else:
            output_data.append(splits[0] + "\t" + control_sent + "\t" + response)
    avg_num_control_code = total_num_control_code * 1.0 / n_control
    print("Number of skipped samples: %d (one contains the other: %d + no overlap: %d)" % (n_skip, n_skip_one_contain_another, n_skip_no_overlap))
    print("Total data size: %d. Number of control cases: %d (entity control: %d + overlap control: %d)" % (len(output_data), n_control, n_entity_control, n_overlap_control))
    print("Number of control codes: %d vs. number of control cases: %d (average control codes per case: %.4f)" % (total_num_control_code, n_control, avg_num_control_code))
    return output_data
def main(params):
    # load model and tokenizer
    model_saved_path = os.path.join(params.saved_folder, params.model_name + ".pt")
    ner_model = torch.load(model_saved_path)["model"]
    ner_model.cuda()
    ner_model.eval()
    tokenizer = AutoTokenizer.from_pretrained(params.model_name)
    # build data paths
    datafolder = os.path.join(params.default_folder, params.infer_datafolder)
    input_datapath = os.path.join(datafolder, params.infer_dataname)
    output_datapath = os.path.join(datafolder, params.output_dataname)
    # read input data
    input_data = read_data(input_datapath)
    # process data (generate entity control data)
    output_data = generate_entity_control_data(tokenizer, ner_model, input_data)
    # write output data
    write_data(output_datapath, output_data)
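# Usage sketch (the script name is hypothetical; the flags are defined in src/config.py):
#   python process_wow.py --infer_dataname train.txt --output_dataname train_entity_based_control.txt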
if __name__ == "__main__":
    params = get_params()
    main(params)
\ No newline at end of file
import torch
import numpy as np
from transformers import AutoTokenizer
from tabulate import tabulate
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
ner_model = torch.load("/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt")["model"]
ner_model.cuda()
ner_model.eval()
label_set = ["O", "B", "I"]
for step in range(100):
    print("===========================================================================")
    input_sent = input(">> Input:")
    tokens = input_sent.split()
    token_ids, first_tok_masks = [tokenizer.cls_token_id], [0]
    for token in tokens:
        subs_ = tokenizer.tokenize(token)
        assert len(subs_) > 0
        token_ids.extend(tokenizer.convert_tokens_to_ids(subs_))
        first_tok_masks.extend([1] + [0] * (len(subs_) - 1))
    token_ids.append(tokenizer.sep_token_id)
    first_tok_masks.append(0)
    token_ids = torch.LongTensor([token_ids]).cuda()
    predictions = ner_model(token_ids)  # (1, seq_len, 3)
    predictions = predictions[0].data.cpu().numpy()  # (seq_len, 3)
    pred_ids = list(np.argmax(predictions, axis=1))
    assert len(pred_ids) == len(first_tok_masks)
    preds_for_each_word = []
    for pred, mask in zip(pred_ids, first_tok_masks):
        if mask == 1:
            preds_for_each_word.append(label_set[pred])
    assert len(preds_for_each_word) == len(tokens)
    table = [tokens, preds_for_each_word]
    print(tabulate(table))
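# Example session (illustrative only; actual predictions depend on the trained checkpoint):
#   >> Input: Miles Davis was born in Alton , Illinois .
#   Miles  Davis  was  born  in  Alton  ,  Illinois  .
#   B      I      O    O     O   B      O  B         O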
@@ -23,6 +23,12 @@ def get_params():
parser.add_argument("--data_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003", help="NER data folder")
parser.add_argument("--saved_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model", help="NER data folder")
parser.add_argument("--default_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl")
parser.add_argument("--infer_datafolder", type=str, default="dialog_datasets/wizard_of_wikipedia/processed")
parser.add_argument("--infer_dataname", type=str, default="train.txt")
parser.add_argument("--output_dataname", type=str, default="train_entity_based_control.txt")
params = parser.parse_args()
return params
@@ -7,7 +7,6 @@ from src.trainer import NERTrainer
import torch
import numpy as np
from tqdm import tqdm
import random
def random_seed(seed):