Commit 7d044e4e authored by root

add commands and dialog_ctrl

parent 90e0a0dd
#!/bin/bash
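# Descriptive note (not part of the original command): this requests a 2-hour interactive
# allocation of one node with 16 GPUs from the batch_short/batch partitions, mounts the
# dataset and home directories, and drops into a bash shell inside the Megatron-LM
# container image.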
srun -p batch_short,batch -A gpu_adlr_nlp -t 2:00:00 --nodes=1 --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --job-name=interact --container-mounts=/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl --container-image=gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel --exclusive --pty bash
#!/bin/bash
#SBATCH -p interactive -A gpu_adlr_nlp -t 1:00:00 --nodes=1 --exclusive --mem=0 --overcommit --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --dependency=singleton --job-name=adlr-nlp-largelm:gpt3-357m
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
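# Back-of-the-envelope size check (an added note, not from the original script): with
# 24 layers and hidden size 1024, the transformer blocks hold roughly 12 * 24 * 1024^2
# ≈ 302M weights, and the ~50k-entry GPT-2 vocabulary adds ~51M embedding weights,
# i.e. roughly the 357M parameters implied by the job name.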
run_cmd="python ${DIR}/pretrain_gpt.py ${options}"
srun -l \
--container-image "gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel" \
--container-mounts "/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
#!/bin/bash
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="${DIR}/pretrain_gpt.py ${options}"
GPUS_PER_NODE=16
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
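# Descriptive note (not part of the original script): torch.distributed.launch spawns
# GPUS_PER_NODE=16 worker processes on this single node (one per GPU), which rendezvous
# at MASTER_ADDR:MASTER_PORT, giving WORLD_SIZE = 16 * 1 = 16.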
python -m torch.distributed.launch $DISTRIBUTED_ARGS ${run_cmd}
set +x
CUDA_VISIBLE_DEVICES=0 python train_ner.py --exp_name conll2003 --exp_id 1 --model_name roberta-large --lr 3e-5 --seed 111
import argparse
def get_params():
parser = argparse.ArgumentParser(description="NER Task")
parser.add_argument("--exp_name", type=str, default="conll2003", help="Experiment name")
parser.add_argument("--logger_filename", type=str, default="train.log")
parser.add_argument("--dump_path", type=str, default="logs", help="Experiment saved root path")
parser.add_argument("--exp_id", type=str, default="1", help="Experiment id")
parser.add_argument("--model_name", type=str, default="roberta-large", help="model name")
parser.add_argument("--seed", type=int, default=111, help="random seed")
# train parameters
parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
parser.add_argument("--epoch", type=int, default=300, help="Number of epoch")
parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
parser.add_argument("--early_stop", type=int, default=3, help="No improvement after several epoch, we stop training")
parser.add_argument("--num_tag", type=int, default=3, help="Number of entity in the dataset")
parser.add_argument("--dropout", type=float, default=0.1, help="dropout rate")
parser.add_argument("--hidden_dim", type=int, default=1024, help="Hidden layer dimension")
parser.add_argument("--data_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003", help="NER data folder")
parser.add_argument("--saved_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model", help="NER data folder")
params = parser.parse_args()
return params
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import os
from tqdm import tqdm
import logging
logger = logging.getLogger()
pad_token_label_id = nn.CrossEntropyLoss().ignore_index
label_set = ["O", "B-ENTITY", "I-ENTITY"]
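# Note: pad_token_label_id is -100, the default ignore_index of nn.CrossEntropyLoss, so
# positions carrying it (special tokens, non-initial sub-words, padding) are skipped by
# the loss and filtered out again during evaluation.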
def read_ner(tokenizer, datapath):
inputs, labels = [], []
with open(datapath, "r") as fr:
token_list, label_list = [], []
for i, line in enumerate(fr):
line = line.strip()
if line == "":
if len(token_list) > 0:
assert len(token_list) == len(label_list)
inputs.append([tokenizer.cls_token_id] + token_list + [tokenizer.sep_token_id])
labels.append([pad_token_label_id] + label_list + [pad_token_label_id])
token_list, label_list = [], []
continue
splits = line.split("\t")
token = splits[0]
label = splits[1]
if label.startswith("B-"):
label = "B-ENTITY"
elif label.startswith("I-"):
label = "I-ENTITY"
subs_ = tokenizer.tokenize(token)
if len(subs_) > 0:
label_list.extend([label_set.index(label)] + [pad_token_label_id] * (len(subs_) - 1))
token_list.extend(tokenizer.convert_tokens_to_ids(subs_))
else:
print("length of subwords for %s is zero; its label is %s" % (token, label))
return inputs, labels
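# Expected file format (inferred from the parsing above; the tags shown are illustrative):
# one tab-separated "token<TAB>label" pair per line, with a blank line between sentences,
# e.g. "EU<TAB>B-ORG" / "rejects<TAB>O". All B-*/I-* labels are collapsed to the single
# ENTITY type, and only the first sub-word of each token keeps a real label.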
class Dataset(data.Dataset):
def __init__(self, tokenizer, inputs, labels):
self.X = inputs
self.y = labels
self.tokenizer = tokenizer
def __getitem__(self, index):
return self.X[index], self.y[index]
def __len__(self):
return len(self.X)
def collate_fn(self, data):
X, y = zip(*data)
lengths = [len(bs_x) for bs_x in X]
max_lengths = max(lengths)
padded_seqs = torch.LongTensor(len(X), max_lengths).fill_(self.tokenizer.pad_token_id)
padded_y = torch.LongTensor(len(X), max_lengths).fill_(pad_token_label_id)
for i, (seq, y_) in enumerate(zip(X, y)):
length = lengths[i]
padded_seqs[i, :length] = torch.LongTensor(seq)
padded_y[i, :length] = torch.LongTensor(y_)
return padded_seqs, padded_y
def get_dataloader(model_name, batch_size, data_folder):
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs_train, labels_train = read_ner(tokenizer, os.path.join(data_folder, "train.txt"))
inputs_dev, labels_dev = read_ner(tokenizer, os.path.join(data_folder, "dev.txt"))
inputs_test, labels_test = read_ner(tokenizer, os.path.join(data_folder, "test.txt"))
logger.info("conll2003 dataset: train size: %d; dev size %d; test size: %d" % (len(inputs_train), len(inputs_dev), len(inputs_test)))
dataset_train = Dataset(tokenizer, inputs_train, labels_train)
dataset_dev = Dataset(tokenizer, inputs_dev, labels_dev)
dataset_test = Dataset(tokenizer, inputs_test, labels_test)
dataloader_train = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True, collate_fn=dataset_train.collate_fn)
dataloader_dev = DataLoader(dataset=dataset_dev, batch_size=batch_size, shuffle=False, collate_fn=dataset_dev.collate_fn)
dataloader_test = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False, collate_fn=dataset_test.collate_fn)
return dataloader_train, dataloader_dev, dataloader_test
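# Usage sketch (the path and batch size are assumptions, not from the original file):
#   train_dl, dev_dl, test_dl = get_dataloader("roberta-large", 32, "/path/to/conll2003")
#   X, y = next(iter(train_dl))  # X: (bsz, max_len) token ids; y: (bsz, max_len) tag ids or -100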
#!/usr/bin/env python
# Python version of the evaluation script from CoNLL'00-
# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported
import sys
import re
from collections import defaultdict, namedtuple
ANY_SPACE = '<SPACE>'
class FormatError(Exception):
pass
Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
class EvalCounts(object):
def __init__(self):
self.correct_chunk = 0 # number of correctly identified chunks
self.correct_tags = 0 # number of correct chunk tags
self.found_correct = 0 # number of chunks in corpus
self.found_guessed = 0 # number of identified chunks
self.token_counter = 0 # token counter (ignores sentence breaks)
# counts by type
self.t_correct_chunk = defaultdict(int)
self.t_found_correct = defaultdict(int)
self.t_found_guessed = defaultdict(int)
def parse_args(argv):
import argparse
parser = argparse.ArgumentParser(
description='evaluate tagging results using CoNLL criteria',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
arg = parser.add_argument
arg('-b', '--boundary', metavar='STR', default='-X-',
help='sentence boundary')
arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
help='character delimiting items in input')
arg('-o', '--otag', metavar='CHAR', default='O',
help='alternative outside tag')
arg('file', nargs='?', default=None)
return parser.parse_args(argv)
def parse_tag(t):
m = re.match(r'^([^-]*)-(.*)$', t)
return m.groups() if m else (t, '')
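# e.g. parse_tag("B-ENTITY") == ("B", "ENTITY") and parse_tag("O") == ("O", "")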
def evaluate(lines, options=None):
if options is None:
options = parse_args([]) # use defaults
counts = EvalCounts()
num_features = None # number of features per line
in_correct = False # whether the currently processed chunk is correct so far
last_correct = 'O' # previous chunk tag in corpus
last_correct_type = '' # type of previous chunk tag in corpus
last_guessed = 'O' # previously identified chunk tag
last_guessed_type = '' # type of previously identified chunk tag
for line in lines:
line = line.rstrip('\r\n')
if options.delimiter == ANY_SPACE:
features = line.split()
else:
features = line.split(options.delimiter)
if num_features is None:
num_features = len(features)
elif num_features != len(features) and len(features) != 0:
raise FormatError('unexpected number of features: %d (%d)' %
(len(features), num_features))
if len(features) == 0 or features[0] == options.boundary:
features = [options.boundary, 'O', 'O']
if len(features) < 3:
raise FormatError('unexpected number of features in line %s' % line)
guessed, guessed_type = parse_tag(features.pop())
correct, correct_type = parse_tag(features.pop())
first_item = features.pop(0)
if first_item == options.boundary:
guessed = 'O'
end_correct = end_of_chunk(last_correct, correct,
last_correct_type, correct_type)
end_guessed = end_of_chunk(last_guessed, guessed,
last_guessed_type, guessed_type)
start_correct = start_of_chunk(last_correct, correct,
last_correct_type, correct_type)
start_guessed = start_of_chunk(last_guessed, guessed,
last_guessed_type, guessed_type)
if in_correct:
if (end_correct and end_guessed and
last_guessed_type == last_correct_type):
in_correct = False
counts.correct_chunk += 1
counts.t_correct_chunk[last_correct_type] += 1
elif (end_correct != end_guessed or guessed_type != correct_type):
in_correct = False
if start_correct and start_guessed and guessed_type == correct_type:
in_correct = True
if start_correct:
counts.found_correct += 1
counts.t_found_correct[correct_type] += 1
if start_guessed:
counts.found_guessed += 1
counts.t_found_guessed[guessed_type] += 1
if first_item != options.boundary:
if correct == guessed and guessed_type == correct_type:
counts.correct_tags += 1
counts.token_counter += 1
last_guessed = guessed
last_correct = correct
last_guessed_type = guessed_type
last_correct_type = correct_type
if in_correct:
counts.correct_chunk += 1
counts.t_correct_chunk[last_correct_type] += 1
return counts
def uniq(iterable):
seen = set()
return [i for i in iterable if not (i in seen or seen.add(i))]
def calculate_metrics(correct, guessed, total):
tp, fp, fn = correct, guessed-correct, total-correct
p = 0 if tp + fp == 0 else 1.*tp / (tp + fp)
r = 0 if tp + fn == 0 else 1.*tp / (tp + fn)
f = 0 if p + r == 0 else 2 * p * r / (p + r)
return Metrics(tp, fp, fn, p, r, f)
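# Worked example (illustrative numbers): calculate_metrics(correct=8, guessed=10, total=12)
# gives tp=8, fp=2, fn=4, precision=0.80, recall≈0.667 and F1≈0.727.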
def metrics(counts):
c = counts
overall = calculate_metrics(
c.correct_chunk, c.found_guessed, c.found_correct
)
by_type = {}
for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)):
by_type[t] = calculate_metrics(
c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t]
)
return overall, by_type
def report(counts, out=None):
if out is None:
out = sys.stdout
overall, by_type = metrics(counts)
c = counts
# out.write('processed %d tokens with %d phrases; ' %
# (c.token_counter, c.found_correct))
# out.write('found: %d phrases; correct: %d.\n' %
# (c.found_guessed, c.correct_chunk))
results = {}
if c.token_counter > 0:
results["fb1"] = 100.*overall.fscore
# comment it to not print details
# for i, m in sorted(by_type.items()):
# print('%17s: ' % i)
# print('precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f %d\n' % (100.*m.prec, 100.*m.rec, 100.*m.fscore, c.t_found_guessed[i]))
return results
def end_of_chunk(prev_tag, tag, prev_type, type_):
# check if a chunk ended between the previous and current word
# arguments: previous and current chunk tags, previous and current types
chunk_end = False
if prev_tag == 'E': chunk_end = True
if prev_tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'B': chunk_end = True
if prev_tag == 'B' and tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'O': chunk_end = True
if prev_tag == 'I' and tag == 'B': chunk_end = True
if prev_tag == 'I' and tag == 'S': chunk_end = True
if prev_tag == 'I' and tag == 'O': chunk_end = True
if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
chunk_end = True
# these chunks are assumed to have length 1
if prev_tag == ']': chunk_end = True
if prev_tag == '[': chunk_end = True
return chunk_end
def start_of_chunk(prev_tag, tag, prev_type, type_):
# check if a chunk started between the previous and current word
# arguments: previous and current chunk tags, previous and current types
chunk_start = False
if tag == 'B': chunk_start = True
if tag == 'S': chunk_start = True
if prev_tag == 'E' and tag == 'E': chunk_start = True
if prev_tag == 'E' and tag == 'I': chunk_start = True
if prev_tag == 'S' and tag == 'E': chunk_start = True
if prev_tag == 'S' and tag == 'I': chunk_start = True
if prev_tag == 'O' and tag == 'E': chunk_start = True
if prev_tag == 'O' and tag == 'I': chunk_start = True
if tag != 'O' and tag != '.' and prev_type != type_:
chunk_start = True
# these chunks are assumed to have length 1
if tag == '[': chunk_start = True
if tag == ']': chunk_start = True
return chunk_start
def main(argv):
args = parse_args(argv[1:])
if args.file is None:
counts = evaluate(sys.stdin, args)
else:
with open(args.file) as f:
counts = evaluate(f, args)
report(counts)
def conll2002_measure(lines, verbose=False):
counts = evaluate(lines, None)
return report(counts)
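# Usage sketch (hypothetical input, not from the original code): each line is
# "word gold_tag pred_tag", so a fully correct prediction of one entity gives
#   conll2002_measure(["w B-ENTITY B-ENTITY", "w I-ENTITY I-ENTITY", "w O O"])  # {'fb1': 100.0}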
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import AutoModel
class EntityTagger(nn.Module):
def __init__(self, params):
super(EntityTagger, self).__init__()
self.num_tag = params.num_tag
self.hidden_dim = params.hidden_dim
self.model = AutoModel.from_pretrained(params.model_name)
self.dropout = nn.Dropout(params.dropout)
self.linear = nn.Linear(self.hidden_dim, self.num_tag)
def forward(self, X):
outputs = self.model(X) # a tuple ((bsz,seq_len,hidden_dim), (bsz, hidden_dim))
outputs = outputs[0] # (bsz, seq_len, hidden_dim)
outputs = self.dropout(outputs)
prediction = self.linear(outputs)
return prediction
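# The returned tensor has shape (bsz, seq_len, num_tag); no softmax is applied here
# because the trainer passes these logits directly to nn.CrossEntropyLoss.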
import torch
import torch.nn as nn
from src.metrics import *
from src.dataloader import label_set, pad_token_label_id
import os
import numpy as np
from tqdm import tqdm
import logging
logger = logging.getLogger()
class NERTrainer(object):
def __init__(self, params, model):
self.params = params
self.model = model
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=params.lr)
self.loss_fn = nn.CrossEntropyLoss()
self.early_stop = params.early_stop
self.no_improvement_num = 0
self.best_dev_f1 = 0
def train_step(self, X, y):
self.model.train()
preds = self.model(X)
y = y.view(y.size(0)*y.size(1))
preds = preds.view(preds.size(0)*preds.size(1), preds.size(2))
self.optimizer.zero_grad()
loss = self.loss_fn(preds, y)
loss.backward()
self.optimizer.step()
return loss.item()
def train(self, dataloader_train, dataloader_dev, dataloader_test):
logger.info("Start NER training ...")
for e in range(self.params.epoch):
logger.info("============== epoch %d ==============" % e)
loss_list = []
pbar = tqdm(enumerate(dataloader_train), total=len(dataloader_train))
for i, (X, y) in pbar:
X, y = X.cuda(), y.cuda()
loss = self.train_step(X, y)
loss_list.append(loss)
pbar.set_description("(Epoch {}) LOSS:{:.4f}".format(e, np.mean(loss_list)))
logger.info("Finish training epoch %d. loss: %.4f" % (e, np.mean(loss_list)))
logger.info("============== Evaluate epoch %d on Dev Set ==============" % e)
f1_dev = self.evaluate(dataloader_dev)
logger.info("Evaluate on Dev Set. F1: %.4f." % f1_dev)
if f1_dev > self.best_dev_f1:
logger.info("Found better model!!")
self.best_dev_f1 = f1_dev
self.no_improvement_num = 0
self.save_model()
else:
self.no_improvement_num += 1
logger.info("No better model found (%d/%d)" % (self.no_improvement_num, self.early_stop))
if self.no_improvement_num >= self.early_stop:
break
logger.info("============== Evaluate on Test Set ==============")
f1_test = self.evaluate(dataloader_test)
logger.info("Evaluate on Test Set. F1: %.4f." % f1_test)
def evaluate(self, dataloader):
self.model.eval()
pred_list = []
y_list = []
pbar = tqdm(enumerate(dataloader), total=len(dataloader))
for i, (X, y) in pbar:
y_list.extend(y.data.numpy()) # y: (bsz, seq_len) gold tag ids on CPU
X = X.cuda()
preds = self.model(X)
pred_list.extend(preds.data.cpu().numpy())
# concatenation
pred_list = np.concatenate(pred_list, axis=0) # (total_tokens, num_tag)
pred_list = np.argmax(pred_list, axis=1)
y_list = np.concatenate(y_list, axis=0)
# calculate f1 score
pred_list = list(pred_list)
y_list = list(y_list)
lines = []
for pred_index, gold_index in zip(pred_list, y_list):
gold_index = int(gold_index)
if gold_index != pad_token_label_id:
pred_token = label_set[pred_index]
gold_token = label_set[gold_index]
lines.append("w" + " " + pred_token + " " + gold_token)
results = conll2002_measure(lines)
f1 = results["fb1"]
return f1
def save_model(self):
"""
save the best model
"""
saved_path = os.path.join(self.params.saved_folder, self.params.model_name+".pt")
torch.save({
"model": self.model,
}, saved_path)
logger.info("Best model has been saved to %s" % saved_path)
import os
import subprocess
import pickle
import logging
import time
import random
from datetime import timedelta
import numpy as np
def init_experiment(params, logger_filename):
"""
Initialize the experiment:
- save parameters
- create a logger
"""
# save parameters
get_saved_path(params)
pickle.dump(params, open(os.path.join(params.dump_path, "params.pkl"), "wb"))
# create a logger
logger = create_logger(os.path.join(params.dump_path, logger_filename))
logger.info('============ Initialized logger ============')
logger.info('\n'.join('%s: %s' % (k, str(v))
for k, v in sorted(dict(vars(params)).items())))
logger.info('The experiment will be stored in %s\n' % params.dump_path)
return logger
class LogFormatter():
def __init__(self):
self.start_time = time.time()
def format(self, record):
elapsed_seconds = round(record.created - self.start_time)
prefix = "%s - %s - %s" % (
record.levelname,
time.strftime('%x %X'),
timedelta(seconds=elapsed_seconds)
)
message = record.getMessage()
message = message.replace('\n', '\n' + ' ' * (len(prefix) + 3))
return "%s - %s" % (prefix, message) if message else ''
def create_logger(filepath):
# create log formatter
log_formatter = LogFormatter()
# create file handler and set level to debug
if filepath is not None:
file_handler = logging.FileHandler(filepath, "a")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(log_formatter)
# create console handler and set level to info
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(log_formatter)
# create logger and set level to debug
logger = logging.getLogger()
logger.handlers = []
logger.setLevel(logging.DEBUG)
logger.propagate = False
if filepath is not None:
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# reset logger elapsed time
def reset_time():
log_formatter.start_time = time.time()
logger.reset_time = reset_time
return logger
def get_saved_path(params):
"""
create a directory to store the experiment
"""
dump_path = "./" if params.dump_path == "" else params.dump_path
if not os.path.isdir(dump_path):
subprocess.Popen("mkdir -p %s" % dump_path, shell=True).wait()
assert os.path.isdir(dump_path)
# create experiment path if it does not exist
exp_path = os.path.join(dump_path, params.exp_name)
if not os.path.exists(exp_path):
subprocess.Popen("mkdir -p %s" % exp_path, shell=True).wait()
# generate id for this experiment
if params.exp_id == "":
chars = "0123456789"
while True:
exp_id = "".join(random.choice(chars) for _ in range(0, 3))
if not os.path.isdir(os.path.join(exp_path, exp_id)):
break
else:
exp_id = params.exp_id
# update dump_path
params.dump_path = os.path.join(exp_path, exp_id)
if not os.path.isdir(params.dump_path):
subprocess.Popen("mkdir -p %s" % params.dump_path, shell=True).wait()
assert os.path.isdir(params.dump_path)
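# With the defaults in src/config.py this resolves to logs/conll2003/1/, which is where
# init_experiment writes params.pkl and the training log.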
from src.config import get_params
from src.utils import init_experiment
from src.dataloader import get_dataloader
from src.model import EntityTagger
from src.trainer import NERTrainer
import torch
import numpy as np
from tqdm import tqdm
import random
def random_seed(seed):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
def train_ner(params):
# initialize experiment
logger = init_experiment(params, logger_filename=params.logger_filename)
# dataloader
dataloader_train, dataloader_dev, dataloader_test = get_dataloader(params.model_name, params.batch_size, params.data_folder)
# BERT-based NER Tagger
model = EntityTagger(params)
model.cuda()
# trainer
trainer = NERTrainer(params, model)
trainer.train(dataloader_train, dataloader_dev, dataloader_test)
if __name__ == "__main__":
params = get_params()
random_seed(params.seed)
train_ner(params)