Commit 7d044e4e authored by root

add commands and dialog_ctrl

parent 90e0a0dd
#!/bin/bash
srun -p batch_short,batch -A gpu_adlr_nlp -t 2:00:00 --nodes=1 --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --job-name=interact --container-mounts=/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl --container-image=gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel --exclusive --pty bash
#!/bin/bash
#SBATCH -p interactive -A gpu_adlr_nlp -t 1:00:00 --nodes=1 --exclusive --mem=0 --overcommit --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --dependency=singleton --job-name=adlr-nlp-largelm:gpt3-357m
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="python ${DIR}/pretrain_gpt.py ${options}"
srun -l \
--container-image "gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel" \
--container-mounts "/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
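# A note on the batch-size ramp above (assuming the standard Megatron-LM
# semantics of "--rampup-batch-size <start> <increment> <ramp-up samples>"):
# the global batch grows from 32 to 256 in steps of 32, i.e.
# (256 - 32) / 32 = 7 increments spread over the first 1953125 samples.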
#!/bin/bash
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="${DIR}/pretrain_gpt.py ${options}"
GPUS_PER_NODE=16
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS ${run_cmd}
set +x
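# A minimal single-node alternative (assuming PyTorch >= 1.10, where torchrun
# supersedes torch.distributed.launch and accepts the same flags):
# torchrun --nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK \
#          --master_addr $MASTER_ADDR --master_port $MASTER_PORT ${run_cmd}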
INFO - 06/21/21 23:13:46 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:13:46 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:13:46 - 0:00:00 - The experiment will be stored in logs/conll2003/1
INFO - 06/21/21 23:25:29 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:25:29 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:25:29 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:25:29 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:29 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:29 - 0:00:01 - Attempting to acquire lock 22598820184656 on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
INFO - 06/21/21 23:25:29 - 0:00:01 - Lock 22598820184656 acquired on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
DEBUG - 06/21/21 23:25:29 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "GET /roberta-large/resolve/main/config.json HTTP/1.1" 200 482
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Attempting to release lock 22598820184656 on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
INFO - 06/21/21 23:25:30 - 0:00:01 - Lock 22598820184656 released on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Attempting to acquire lock 22598820184656 on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO - 06/21/21 23:25:30 - 0:00:01 - Lock 22598820184656 acquired on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "GET /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 898823
DEBUG - 06/21/21 23:25:30 - 0:00:02 - Attempting to release lock 22598820184656 on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO - 06/21/21 23:25:30 - 0:00:02 - Lock 22598820184656 released on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
DEBUG - 06/21/21 23:25:30 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:02 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Attempting to acquire lock 22597850387840 on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO - 06/21/21 23:25:31 - 0:00:02 - Lock 22597850387840 acquired on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:02 - https://huggingface.co:443 "GET /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 456318
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Attempting to release lock 22597850387840 on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO - 06/21/21 23:25:31 - 0:00:02 - Lock 22597850387840 released on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:03 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:31 - 0:00:03 - Attempting to acquire lock 22597850387840 on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:25:31 - 0:00:03 - Lock 22597850387840 acquired on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
DEBUG - 06/21/21 23:25:31 - 0:00:03 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:32 - 0:00:03 - https://huggingface.co:443 "GET /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 1355863
DEBUG - 06/21/21 23:25:32 - 0:00:03 - Attempting to release lock 22597850387840 on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:25:32 - 0:00:03 - Lock 22597850387840 released on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:26:26 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:26:26 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:26:26 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:26:26 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:26 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:26 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:27 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:27 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:26:39 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Attempting to acquire lock 23082502829920 on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:26:39 - 0:00:13 - Lock 23082502829920 acquired on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): cdn-lfs.huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://cdn-lfs.huggingface.co:443 "GET /roberta-large/36a10a8b694fadf9bf4f9049d14e257e88be45313ae02d882af9e60f39b8b2e8 HTTP/1.1" 200 1425941629
DEBUG - 06/21/21 23:27:01 - 0:00:34 - Attempting to release lock 23082502829920 on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:27:01 - 0:00:34 - Lock 23082502829920 released on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:27:57 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:27:57 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:27:57 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:27:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:57 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:28:09 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:28:09 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:28:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:28:10 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:28:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:28:17 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:28:17 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:29:45 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:29:45 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:29:45 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:29:45 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:45 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:45 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:45 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:45 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:46 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:46 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:46 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:29:57 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:29:57 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:57 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:57 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:57 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:30:04 - 0:00:19 - Start NER training ...
INFO - 06/21/21 23:30:04 - 0:00:19 - ============== epoch 0 ==============
INFO - 06/21/21 23:31:17 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:31:17 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:31:17 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:31:17 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:17 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:17 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:17 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:17 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:18 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:18 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:18 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:31:29 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:31:29 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:30 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:30 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:30 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:31:37 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:31:37 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:33:58 - 0:02:42 - Finish training epoch 0. loss: 0.0696
INFO - 06/21/21 23:33:58 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:34:08 - 0:02:51 - Evaluate on Dev Set. F1: 95.5005.
INFO - 06/21/21 23:34:08 - 0:02:51 - Found better model!!
INFO - 06/21/21 23:48:39 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:48:39 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:48:39 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:48:39 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:39 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:39 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:40 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:40 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:48:51 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:48:51 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:51 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:51 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:51 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:49:00 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:49:00 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/21/21 23:51:22 - 0:02:43 - Finish training epoch 0. loss: 0.0696
INFO - 06/21/21 23:51:22 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:51:31 - 0:02:52 - Evaluate on Dev Set. F1: 95.5005.
INFO - 06/21/21 23:51:31 - 0:02:52 - Found better model!!
INFO - 06/21/21 23:51:33 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:51:33 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/21/21 23:53:55 - 0:05:16 - Finish training epoch 1. loss: 0.0234
INFO - 06/21/21 23:53:55 - 0:05:16 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/21/21 23:54:03 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:54:03 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:54:03 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:54:03 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:05 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:54:05 - 0:05:25 - Evaluate on Dev Set. F1: 96.9048.
INFO - 06/21/21 23:54:05 - 0:05:25 - Found better model!!
INFO - 06/21/21 23:54:06 - 0:05:27 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:54:06 - 0:05:27 - ============== epoch 2 ==============
INFO - 06/21/21 23:54:16 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:54:16 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:16 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:16 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:16 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:54:24 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:54:24 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:55:40 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:55:40 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 123456
INFO - 06/21/21 23:55:40 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:55:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:40 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:55:53 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:55:53 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:53 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:53 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:53 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:56:01 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:56:01 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/21/21 23:56:29 - 0:07:50 - Finish training epoch 2. loss: 0.0162
INFO - 06/21/21 23:56:29 - 0:07:50 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/21/21 23:56:38 - 0:07:59 - Evaluate on Dev Set. F1: 97.3381.
INFO - 06/21/21 23:56:38 - 0:07:59 - Found better model!!
INFO - 06/21/21 23:56:40 - 0:08:01 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:56:40 - 0:08:01 - ============== epoch 3 ==============
INFO - 06/21/21 23:56:47 - 0:02:43 - Finish training epoch 0. loss: 0.0580
INFO - 06/21/21 23:56:47 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:56:56 - 0:02:53 - Evaluate on Dev Set. F1: 96.7327.
INFO - 06/21/21 23:56:56 - 0:02:53 - Found better model!!
INFO - 06/21/21 23:56:58 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:56:58 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/21/21 23:58:25 - 0:02:45 - Finish training epoch 0. loss: 0.0544
INFO - 06/21/21 23:58:25 - 0:02:45 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:58:34 - 0:02:54 - Evaluate on Dev Set. F1: 96.8227.
INFO - 06/21/21 23:58:34 - 0:02:54 - Found better model!!
INFO - 06/21/21 23:58:36 - 0:02:56 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:58:36 - 0:02:56 - ============== epoch 1 ==============
INFO - 06/21/21 23:58:40 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:58:40 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:58:40 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:58:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:40 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:58:57 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:58:57 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:58:57 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:58:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:57 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:59:02 - 0:10:23 - Finish training epoch 3. loss: 0.0136
INFO - 06/21/21 23:59:02 - 0:10:23 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/21/21 23:59:10 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:59:10 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:10 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:59:12 - 0:10:33 - Evaluate on Dev Set. F1: 96.0542.
INFO - 06/21/21 23:59:12 - 0:10:33 - No better model found (1/3)
INFO - 06/21/21 23:59:12 - 0:10:33 - ============== epoch 4 ==============
INFO - 06/21/21 23:59:18 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:59:18 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:59:21 - 0:05:18 - Finish training epoch 1. loss: 0.0190
INFO - 06/21/21 23:59:21 - 0:05:18 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/21/21 23:59:30 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:59:30 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 2e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:59:30 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:59:30 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:30 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:30 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
INFO - 06/21/21 23:59:31 - 0:05:27 - Evaluate on Dev Set. F1: 97.1510.
INFO - 06/21/21 23:59:31 - 0:05:27 - Found better model!!
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:31 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:31 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:59:32 - 0:05:29 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:59:32 - 0:05:29 - ============== epoch 2 ==============
INFO - 06/21/21 23:59:43 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:59:43 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:43 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:43 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:44 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:59:51 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:59:51 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/22/21 00:01:00 - 0:05:20 - Finish training epoch 1. loss: 0.0229
INFO - 06/22/21 00:01:00 - 0:05:20 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:01:10 - 0:05:30 - Evaluate on Dev Set. F1: 97.0174.
INFO - 06/22/21 00:01:10 - 0:05:30 - Found better model!!
INFO - 06/22/21 00:01:12 - 0:05:31 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:01:12 - 0:05:31 - ============== epoch 2 ==============
INFO - 06/22/21 00:01:35 - 0:12:56 - Finish training epoch 4. loss: 0.0170
INFO - 06/22/21 00:01:35 - 0:12:56 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:01:40 - 0:02:43 - Finish training epoch 0. loss: 0.0544
INFO - 06/22/21 00:01:40 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:01:45 - 0:13:05 - Evaluate on Dev Set. F1: 97.1884.
INFO - 06/22/21 00:01:45 - 0:13:05 - No better model found (2/3)
INFO - 06/22/21 00:01:45 - 0:13:05 - ============== epoch 5 ==============
INFO - 06/22/21 00:01:50 - 0:02:53 - Evaluate on Dev Set. F1: 96.2938.
INFO - 06/22/21 00:01:50 - 0:02:53 - Found better model!!
INFO - 06/22/21 00:01:52 - 0:02:55 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:01:52 - 0:02:55 - ============== epoch 1 ==============
INFO - 06/22/21 00:01:55 - 0:07:51 - Finish training epoch 2. loss: 0.0200
INFO - 06/22/21 00:01:55 - 0:07:51 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:02:04 - 0:08:01 - Evaluate on Dev Set. F1: 96.9804.
INFO - 06/22/21 00:02:04 - 0:08:01 - No better model found (1/3)
INFO - 06/22/21 00:02:04 - 0:08:01 - ============== epoch 3 ==============
INFO - 06/22/21 00:02:13 - 0:02:42 - Finish training epoch 0. loss: 0.0547
INFO - 06/22/21 00:02:13 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:02:22 - 0:02:52 - Evaluate on Dev Set. F1: 97.0400.
INFO - 06/22/21 00:02:22 - 0:02:52 - Found better model!!
INFO - 06/22/21 00:02:24 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:02:24 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/22/21 00:03:35 - 0:07:55 - Finish training epoch 2. loss: 0.0173
INFO - 06/22/21 00:03:35 - 0:07:55 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:03:45 - 0:08:04 - Evaluate on Dev Set. F1: 97.3191.
INFO - 06/22/21 00:03:45 - 0:08:04 - Found better model!!
INFO - 06/22/21 00:03:46 - 0:08:06 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:03:46 - 0:08:06 - ============== epoch 3 ==============
INFO - 06/22/21 00:04:07 - 0:15:28 - Finish training epoch 5. loss: 0.0083
INFO - 06/22/21 00:04:07 - 0:15:28 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:04:14 - 0:05:17 - Finish training epoch 1. loss: 0.0182
INFO - 06/22/21 00:04:14 - 0:05:17 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:04:17 - 0:15:37 - Evaluate on Dev Set. F1: 97.3169.
INFO - 06/22/21 00:04:17 - 0:15:37 - No better model found (3/3)
INFO - 06/22/21 00:04:17 - 0:15:37 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:04:24 - 0:05:27 - Evaluate on Dev Set. F1: 97.6314.
INFO - 06/22/21 00:04:24 - 0:05:27 - Found better model!!
INFO - 06/22/21 00:04:26 - 0:15:46 - Evaluate on Test Set. F1: 95.6012.
INFO - 06/22/21 00:04:26 - 0:05:29 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:26 - 0:05:29 - ============== epoch 2 ==============
INFO - 06/22/21 00:04:27 - 0:10:24 - Finish training epoch 3. loss: 0.0157
INFO - 06/22/21 00:04:27 - 0:10:24 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:04:37 - 0:10:33 - Evaluate on Dev Set. F1: 97.6654.
INFO - 06/22/21 00:04:37 - 0:10:33 - Found better model!!
INFO - 06/22/21 00:04:39 - 0:10:35 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:39 - 0:10:35 - ============== epoch 4 ==============
INFO - 06/22/21 00:04:45 - 0:05:15 - Finish training epoch 1. loss: 0.0177
INFO - 06/22/21 00:04:45 - 0:05:15 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:04:55 - 0:05:25 - Evaluate on Dev Set. F1: 97.6093.
INFO - 06/22/21 00:04:55 - 0:05:25 - Found better model!!
INFO - 06/22/21 00:04:56 - 0:05:26 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:56 - 0:05:26 - ============== epoch 2 ==============
INFO - 06/22/21 00:06:10 - 0:10:30 - Finish training epoch 3. loss: 0.0439
INFO - 06/22/21 00:06:10 - 0:10:30 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:06:20 - 0:10:40 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:06:20 - 0:10:40 - No better model found (1/3)
INFO - 06/22/21 00:06:20 - 0:10:40 - ============== epoch 4 ==============
INFO - 06/22/21 00:06:47 - 0:07:50 - Finish training epoch 2. loss: 0.0156
INFO - 06/22/21 00:06:47 - 0:07:50 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:06:57 - 0:07:59 - Evaluate on Dev Set. F1: 97.5384.
INFO - 06/22/21 00:06:57 - 0:07:59 - No better model found (1/3)
INFO - 06/22/21 00:06:57 - 0:07:59 - ============== epoch 3 ==============
INFO - 06/22/21 00:07:02 - 0:12:59 - Finish training epoch 4. loss: 0.0127
INFO - 06/22/21 00:07:02 - 0:12:59 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:07:12 - 0:13:08 - Evaluate on Dev Set. F1: 97.4583.
INFO - 06/22/21 00:07:12 - 0:13:08 - No better model found (1/3)
INFO - 06/22/21 00:07:12 - 0:13:08 - ============== epoch 5 ==============
INFO - 06/22/21 00:07:17 - 0:07:47 - Finish training epoch 2. loss: 0.0115
INFO - 06/22/21 00:07:17 - 0:07:47 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:07:26 - 0:07:56 - Evaluate on Dev Set. F1: 97.2615.
INFO - 06/22/21 00:07:26 - 0:07:56 - No better model found (1/3)
INFO - 06/22/21 00:07:26 - 0:07:56 - ============== epoch 3 ==============
INFO - 06/22/21 00:08:43 - 0:13:03 - Finish training epoch 4. loss: 0.5637
INFO - 06/22/21 00:08:43 - 0:13:03 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:08:53 - 0:13:12 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:08:53 - 0:13:12 - No better model found (2/3)
INFO - 06/22/21 00:08:53 - 0:13:12 - ============== epoch 5 ==============
INFO - 06/22/21 00:09:18 - 0:10:21 - Finish training epoch 3. loss: 0.0110
INFO - 06/22/21 00:09:18 - 0:10:21 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:09:28 - 0:10:31 - Evaluate on Dev Set. F1: 97.2738.
INFO - 06/22/21 00:09:28 - 0:10:31 - No better model found (2/3)
INFO - 06/22/21 00:09:28 - 0:10:31 - ============== epoch 4 ==============
INFO - 06/22/21 00:09:35 - 0:15:31 - Finish training epoch 5. loss: 0.0132
INFO - 06/22/21 00:09:35 - 0:15:31 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:09:45 - 0:15:41 - Evaluate on Dev Set. F1: 97.4630.
INFO - 06/22/21 00:09:45 - 0:15:41 - No better model found (2/3)
INFO - 06/22/21 00:09:45 - 0:15:41 - ============== epoch 6 ==============
INFO - 06/22/21 00:09:47 - 0:10:17 - Finish training epoch 3. loss: 0.0101
INFO - 06/22/21 00:09:47 - 0:10:17 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:09:57 - 0:10:27 - Evaluate on Dev Set. F1: 97.5034.
INFO - 06/22/21 00:09:57 - 0:10:27 - No better model found (2/3)
INFO - 06/22/21 00:09:57 - 0:10:27 - ============== epoch 4 ==============
INFO - 06/22/21 00:11:16 - 0:15:36 - Finish training epoch 5. loss: 0.5620
INFO - 06/22/21 00:11:16 - 0:15:36 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:11:26 - 0:15:45 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:11:26 - 0:15:45 - No better model found (3/3)
INFO - 06/22/21 00:11:26 - 0:15:45 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:11:35 - 0:15:54 - Evaluate on Test Set. F1: 0.0000.
INFO - 06/22/21 00:11:50 - 0:12:53 - Finish training epoch 4. loss: 0.0137
INFO - 06/22/21 00:11:50 - 0:12:53 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:12:00 - 0:13:02 - Evaluate on Dev Set. F1: 97.4501.
INFO - 06/22/21 00:12:00 - 0:13:02 - No better model found (3/3)
INFO - 06/22/21 00:12:00 - 0:13:02 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:08 - 0:18:04 - Finish training epoch 6. loss: 0.0129
INFO - 06/22/21 00:12:08 - 0:18:04 - ============== Evaluate epoch 6 on Dev Set ==============
INFO - 06/22/21 00:12:09 - 0:13:11 - Evaluate on Test Set. F1: 95.4761.
INFO - 06/22/21 00:12:17 - 0:18:14 - Evaluate on Dev Set. F1: 97.2311.
INFO - 06/22/21 00:12:17 - 0:18:14 - No better model found (3/3)
INFO - 06/22/21 00:12:17 - 0:18:14 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:19 - 0:12:48 - Finish training epoch 4. loss: 0.0074
INFO - 06/22/21 00:12:19 - 0:12:48 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:12:26 - 0:18:23 - Evaluate on Test Set. F1: 95.2934.
INFO - 06/22/21 00:12:28 - 0:12:58 - Evaluate on Dev Set. F1: 97.0406.
INFO - 06/22/21 00:12:28 - 0:12:58 - No better model found (3/3)
INFO - 06/22/21 00:12:28 - 0:12:58 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:37 - 0:13:07 - Evaluate on Test Set. F1: 95.3264.
INFO - 06/22/21 00:16:11 - 0:00:00 - ============ Initialized logger ============
INFO - 06/22/21 00:16:11 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/22/21 00:16:11 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/22/21 00:16:11 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:13 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/22/21 00:16:24 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/22/21 00:16:24 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:24 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:24 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:24 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/22/21 00:16:31 - 0:00:20 - Start NER training ...
INFO - 06/22/21 00:16:31 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/22/21 00:18:53 - 0:02:42 - Finish training epoch 0. loss: 0.0544
INFO - 06/22/21 00:18:53 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:19:03 - 0:02:51 - Evaluate on Dev Set. F1: 96.2938.
INFO - 06/22/21 00:19:03 - 0:02:51 - Found better model!!
INFO - 06/22/21 00:19:05 - 0:02:53 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:19:05 - 0:02:53 - ============== epoch 1 ==============
CUDA_VISIBLE_DEVICES=0 python train_ner.py --exp_name conll2003 --exp_id 1 --model_name roberta-large --lr 3e-5 --seed 111
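# A minimal sweep sketch (assumed, not part of the original commands): the logs
# above mix lr in {5e-5, 3e-5, 2e-5} with seed in {555, 111, 123456}, and their
# interleaved timestamps suggest parallel runs on separate GPUs; the loop below
# serializes the same grid on GPU 0.
for lr in 5e-5 3e-5 2e-5; do
    for seed in 555 111 123456; do
        CUDA_VISIBLE_DEVICES=0 python train_ner.py --exp_name conll2003 --exp_id 1 --model_name roberta-large --lr $lr --seed $seed
    done
done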
import argparse
def get_params():
    parser = argparse.ArgumentParser(description="NER Task")
    parser.add_argument("--exp_name", type=str, default="conll2003", help="Experiment name")
    parser.add_argument("--logger_filename", type=str, default="train.log")
    parser.add_argument("--dump_path", type=str, default="logs", help="Experiment saved root path")
    parser.add_argument("--exp_id", type=str, default="1", help="Experiment id")
    parser.add_argument("--model_name", type=str, default="roberta-large", help="model name")
    parser.add_argument("--seed", type=int, default=111, help="random seed")

    # train parameters
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
    parser.add_argument("--epoch", type=int, default=300, help="Number of epochs")
    parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    parser.add_argument("--early_stop", type=int, default=3, help="Stop training after this many epochs without improvement")
    parser.add_argument("--num_tag", type=int, default=3, help="Number of entity tag types in the dataset (O, B-ENTITY, I-ENTITY)")
    parser.add_argument("--dropout", type=float, default=0.1, help="dropout rate")
    parser.add_argument("--hidden_dim", type=int, default=1024, help="Hidden layer dimension")
    parser.add_argument("--data_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003", help="NER data folder")
    parser.add_argument("--saved_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model", help="Folder where the best NER model checkpoint is saved")

    params = parser.parse_args()
    return params
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import os
from tqdm import tqdm
import logging
logger = logging.getLogger()
pad_token_label_id = nn.CrossEntropyLoss().ignore_index
label_set = ["O", "B-ENTITY", "I-ENTITY"]
def read_ner(tokenizer, datapath):
    inputs, labels = [], []
    with open(datapath, "r") as fr:
        token_list, label_list = [], []
        for i, line in enumerate(fr):
            line = line.strip()
            if line == "":
                if len(token_list) > 0:
                    assert len(token_list) == len(label_list)
                    inputs.append([tokenizer.cls_token_id] + token_list + [tokenizer.sep_token_id])
                    labels.append([pad_token_label_id] + label_list + [pad_token_label_id])
                token_list, label_list = [], []
                continue

            splits = line.split("\t")
            token = splits[0]
            label = splits[1]
            if label.startswith("B-"):
                label = "B-ENTITY"
            elif label.startswith("I-"):
                label = "I-ENTITY"

            subs_ = tokenizer.tokenize(token)
            if len(subs_) > 0:
                # label the first subword only; continuation subwords get the ignore index
                label_list.extend([label_set.index(label)] + [pad_token_label_id] * (len(subs_) - 1))
                token_list.extend(tokenizer.convert_tokens_to_ids(subs_))
            else:
                print("length of subwords for %s is zero; its label is %s" % (token, label))

    return inputs, labels
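# A minimal usage sketch (assumed, not part of the original file): read_ner()
# expects one "token<TAB>label" pair per line with a blank line between
# sentences, and collapses every B-*/I-* type to B-ENTITY/I-ENTITY, e.g.
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("roberta-large")
#     inputs, labels = read_ner(tok, "train.txt")
#     # inputs[0]: [cls_token_id, subword ids..., sep_token_id]
#     # labels[0]: [pad_token_label_id, tag ids (continuations padded), pad_token_label_id]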
class Dataset(data.Dataset):
    def __init__(self, tokenizer, inputs, labels):
        self.X = inputs
        self.y = labels
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return len(self.X)

    def collate_fn(self, data):
        X, y = zip(*data)
        lengths = [len(bs_x) for bs_x in X]
        max_lengths = max(lengths)
        padded_seqs = torch.LongTensor(len(X), max_lengths).fill_(self.tokenizer.pad_token_id)
        padded_y = torch.LongTensor(len(X), max_lengths).fill_(pad_token_label_id)
        for i, (seq, y_) in enumerate(zip(X, y)):
            length = lengths[i]
            padded_seqs[i, :length] = torch.LongTensor(seq)
            padded_y[i, :length] = torch.LongTensor(y_)
        return padded_seqs, padded_y
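# A small sketch of collate_fn's padding (illustrative ids, assuming RoBERTa's
# pad_token_id of 1; pad_token_label_id is -100, CrossEntropyLoss's ignore_index):
#
#     batch = [([0, 11, 12, 2], [-100, 0, 1, -100]),
#              ([0, 13, 2], [-100, 2, -100])]
#     padded_seqs, padded_y = dataset.collate_fn(batch)
#     # padded_seqs: [[0, 11, 12, 2], [0, 13, 2, 1]]               (right-padded with 1)
#     # padded_y:    [[-100, 0, 1, -100], [-100, 2, -100, -100]]   (padded with -100)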
def get_dataloader(model_name, batch_size, data_folder):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs_train, labels_train = read_ner(tokenizer, os.path.join(data_folder, "train.txt"))
    inputs_dev, labels_dev = read_ner(tokenizer, os.path.join(data_folder, "dev.txt"))
    inputs_test, labels_test = read_ner(tokenizer, os.path.join(data_folder, "test.txt"))
    logger.info("conll2003 dataset: train size: %d; dev size %d; test size: %d" % (len(inputs_train), len(inputs_dev), len(inputs_test)))

    dataset_train = Dataset(tokenizer, inputs_train, labels_train)
    dataset_dev = Dataset(tokenizer, inputs_dev, labels_dev)
    dataset_test = Dataset(tokenizer, inputs_test, labels_test)

    dataloader_train = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True, collate_fn=dataset_train.collate_fn)
    dataloader_dev = DataLoader(dataset=dataset_dev, batch_size=batch_size, shuffle=False, collate_fn=dataset_dev.collate_fn)
    dataloader_test = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False, collate_fn=dataset_test.collate_fn)

    return dataloader_train, dataloader_dev, dataloader_test
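# A minimal wiring sketch (assumed; train_ner.py in this commit is the actual
# entry point):
#
#     params = get_params()
#     dl_train, dl_dev, dl_test = get_dataloader(params.model_name, params.batch_size, params.data_folder)
#     for padded_seqs, padded_y in dl_train:
#         pass  # feed padded_seqs to the encoder and score logits against padded_y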
#!/usr/bin/env python
# Python version of the evaluation script from CoNLL'00-
# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported
import sys
import re
from collections import defaultdict, namedtuple
ANY_SPACE = '<SPACE>'
class FormatError(Exception):
    pass

Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')

class EvalCounts(object):
    def __init__(self):
        self.correct_chunk = 0    # number of correctly identified chunks
        self.correct_tags = 0     # number of correct chunk tags
        self.found_correct = 0    # number of chunks in corpus
        self.found_guessed = 0    # number of identified chunks
        self.token_counter = 0    # token counter (ignores sentence breaks)

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_found_correct = defaultdict(int)
        self.t_found_guessed = defaultdict(int)
def parse_args(argv):
    import argparse
    parser = argparse.ArgumentParser(
        description='evaluate tagging results using CoNLL criteria',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    arg = parser.add_argument
    arg('-b', '--boundary', metavar='STR', default='-X-',
        help='sentence boundary')
    arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
        help='character delimiting items in input')
    arg('-o', '--otag', metavar='CHAR', default='O',
        help='alternative outside tag')
    arg('file', nargs='?', default=None)
    return parser.parse_args(argv)
def parse_tag(t):
    m = re.match(r'^([^-]*)-(.*)$', t)
    return m.groups() if m else (t, '')
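# For example, given the regex above:
#     parse_tag('B-ENTITY')  # ('B', 'ENTITY')
#     parse_tag('I-PER')     # ('I', 'PER')
#     parse_tag('O')         # ('O', ''): no dash, so no match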
def evaluate(lines, options=None):
    if options is None:
        options = parse_args([])  # use defaults

    counts = EvalCounts()
    num_features = None       # number of features per line
    in_correct = False        # currently processed chunks is correct until now
    last_correct = 'O'        # previous chunk tag in corpus
    last_correct_type = ''    # type of previously identified chunk tag
    last_guessed = 'O'        # previously identified chunk tag
    last_guessed_type = ''    # type of previous chunk tag in corpus

    for line in lines:
        line = line.rstrip('\r\n')

        if options.delimiter == ANY_SPACE:
            features = line.split()
        else:
            features = line.split(options.delimiter)

        if num_features is None:
            num_features = len(features)
        elif num_features != len(features) and len(features) != 0:
            raise FormatError('unexpected number of features: %d (%d)' %
                              (len(features), num_features))

        if len(features) == 0 or features[0] == options.boundary:
            features = [options.boundary, 'O', 'O']
        if len(features) < 3:
            raise FormatError('unexpected number of features in line %s' % line)

        guessed, guessed_type = parse_tag(features.pop())
        correct, correct_type = parse_tag(features.pop())
        first_item = features.pop(0)

        if first_item == options.boundary:
            guessed = 'O'

        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            if (end_correct and end_guessed and
                last_guessed_type == last_correct_type):
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                in_correct = False

        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.found_correct += 1
            counts.t_found_correct[correct_type] += 1
        if start_guessed:
            counts.found_guessed += 1
            counts.t_found_guessed[guessed_type] += 1
        if first_item != options.boundary:
            if correct == guessed and guessed_type == correct_type:
                counts.correct_tags += 1
            counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts
def uniq(iterable):
seen = set()
return [i for i in iterable if not (i in seen or seen.add(i))]
def calculate_metrics(correct, guessed, total):
tp, fp, fn = correct, guessed-correct, total-correct
p = 0 if tp + fp == 0 else 1.*tp / (tp + fp)
r = 0 if tp + fn == 0 else 1.*tp / (tp + fn)
f = 0 if p + r == 0 else 2 * p * r / (p + r)
return Metrics(tp, fp, fn, p, r, f)
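# Worked example: 8 correct chunks, 10 guessed, 12 in the gold standard gives
#   tp=8, fp=2, fn=4; precision = 8/10 = 0.8; recall = 8/12 ~= 0.667; F1 ~= 0.727
#   calculate_metrics(8, 10, 12) -> Metrics(tp=8, fp=2, fn=4, prec=0.8, rec=0.666..., fscore=0.727...)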
def metrics(counts):
c = counts
overall = calculate_metrics(
c.correct_chunk, c.found_guessed, c.found_correct
)
by_type = {}
for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)):
by_type[t] = calculate_metrics(
c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t]
)
return overall, by_type
def report(counts, out=None):
if out is None:
out = sys.stdout
overall, by_type = metrics(counts)
c = counts
# out.write('processed %d tokens with %d phrases; ' %
# (c.token_counter, c.found_correct))
# out.write('found: %d phrases; correct: %d.\n' %
# (c.found_guessed, c.correct_chunk))
results = {}
if c.token_counter > 0:
results["fb1"] = 100.*overall.fscore
    # per-type details (commented out to keep the report minimal):
# for i, m in sorted(by_type.items()):
# print('%17s: ' % i)
# print('precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f %d\n' % (100.*m.prec, 100.*m.rec, 100.*m.fscore, c.t_found_guessed[i]))
return results
def end_of_chunk(prev_tag, tag, prev_type, type_):
# check if a chunk ended between the previous and current word
# arguments: previous and current chunk tags, previous and current types
chunk_end = False
if prev_tag == 'E': chunk_end = True
if prev_tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'B': chunk_end = True
if prev_tag == 'B' and tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'O': chunk_end = True
if prev_tag == 'I' and tag == 'B': chunk_end = True
if prev_tag == 'I' and tag == 'S': chunk_end = True
if prev_tag == 'I' and tag == 'O': chunk_end = True
if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
chunk_end = True
# these chunks are assumed to have length 1
if prev_tag == ']': chunk_end = True
if prev_tag == '[': chunk_end = True
return chunk_end
def start_of_chunk(prev_tag, tag, prev_type, type_):
# check if a chunk started between the previous and current word
# arguments: previous and current chunk tags, previous and current types
chunk_start = False
if tag == 'B': chunk_start = True
if tag == 'S': chunk_start = True
if prev_tag == 'E' and tag == 'E': chunk_start = True
if prev_tag == 'E' and tag == 'I': chunk_start = True
if prev_tag == 'S' and tag == 'E': chunk_start = True
if prev_tag == 'S' and tag == 'I': chunk_start = True
if prev_tag == 'O' and tag == 'E': chunk_start = True
if prev_tag == 'O' and tag == 'I': chunk_start = True
if tag != 'O' and tag != '.' and prev_type != type_:
chunk_start = True
# these chunks are assumed to have length 1
if tag == '[': chunk_start = True
if tag == ']': chunk_start = True
return chunk_start
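# Boundary examples for IOB-style tags (same type 'PER' on both sides):
#   end_of_chunk('I', 'B', 'PER', 'PER')   -> True   (a new B ends the running chunk)
#   start_of_chunk('I', 'B', 'PER', 'PER') -> True   (and starts a new one)
#   start_of_chunk('B', 'I', 'PER', 'PER') -> False  (I continues the chunk)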
def main(argv):
args = parse_args(argv[1:])
if args.file is None:
counts = evaluate(sys.stdin, args)
else:
with open(args.file) as f:
counts = evaluate(f, args)
report(counts)
def conll2002_measure(lines, verbose=False):
counts = evaluate(lines, None)
return report(counts)
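# Usage sketch: each line is "token tag_a tag_b"; evaluate() pops the last column as
# the guess and the middle one as the gold tag. FB1 is symmetric under swapping the
# two, so the trainer's "token pred gold" order still yields the correct F1.
#   lines = ["w B-PER B-PER", "w I-PER I-PER", "w O O"]
#   conll2002_measure(lines)  # -> {"fb1": 100.0}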
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import AutoModel
class EntityTagger(nn.Module):
def __init__(self, params):
super(EntityTagger, self).__init__()
self.num_tag = params.num_tag
self.hidden_dim = params.hidden_dim
self.model = AutoModel.from_pretrained(params.model_name)
self.dropout = nn.Dropout(params.dropout)
self.linear = nn.Linear(self.hidden_dim, self.num_tag)
def forward(self, X):
        outputs = self.model(X)  # encoder output; outputs[0] is the last hidden state
outputs = outputs[0] # (bsz, seq_len, hidden_dim)
outputs = self.dropout(outputs)
prediction = self.linear(outputs)
return prediction
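# Shape sketch (illustrative sizes): with a base-size encoder (hidden_dim=768),
# a (bsz, seq_len) LongTensor of token ids maps to (bsz, seq_len, num_tag) logits:
#   tagger = EntityTagger(params)  # params.model_name e.g. "bert-base-cased" (assumed)
#   logits = tagger(X)             # X: (2, 16) -> logits: (2, 16, num_tag)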
import torch
import torch.nn as nn
from src.metrics import *
from src.dataloader import label_set, pad_token_label_id
import os
import numpy as np
from tqdm import tqdm
import logging
logger = logging.getLogger()
class NERTrainer(object):
def __init__(self, params, model):
self.params = params
self.model = model
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=params.lr)
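        # NOTE: padded positions carry pad_token_label_id; the plain CrossEntropyLoss
        # below only skips them if that value equals the default ignore_index (-100).
        # If the dataloader uses a different pad label, pass it explicitly:
        # nn.CrossEntropyLoss(ignore_index=pad_token_label_id).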
self.loss_fn = nn.CrossEntropyLoss()
self.early_stop = params.early_stop
self.no_improvement_num = 0
self.best_dev_f1 = 0
def train_step(self, X, y):
self.model.train()
preds = self.model(X)
        y = y.view(-1)  # flatten labels to (bsz * seq_len,)
        preds = preds.view(-1, preds.size(-1))  # flatten logits to (bsz * seq_len, num_tag)
self.optimizer.zero_grad()
loss = self.loss_fn(preds, y)
loss.backward()
self.optimizer.step()
return loss.item()
def train(self, dataloader_train, dataloader_dev, dataloader_test):
logger.info("Start NER training ...")
for e in range(self.params.epoch):
logger.info("============== epoch %d ==============" % e)
loss_list = []
pbar = tqdm(enumerate(dataloader_train), total=len(dataloader_train))
for i, (X, y) in pbar:
X, y = X.cuda(), y.cuda()
loss = self.train_step(X, y)
loss_list.append(loss)
pbar.set_description("(Epoch {}) LOSS:{:.4f}".format(e, np.mean(loss_list)))
logger.info("Finish training epoch %d. loss: %.4f" % (e, np.mean(loss_list)))
logger.info("============== Evaluate epoch %d on Dev Set ==============" % e)
f1_dev = self.evaluate(dataloader_dev)
logger.info("Evaluate on Dev Set. F1: %.4f." % f1_dev)
if f1_dev > self.best_dev_f1:
logger.info("Found better model!!")
self.best_dev_f1 = f1_dev
self.no_improvement_num = 0
self.save_model()
else:
self.no_improvement_num += 1
logger.info("No better model found (%d/%d)" % (self.no_improvement_num, self.early_stop))
if self.no_improvement_num >= self.early_stop:
break
logger.info("============== Evaluate on Test Set ==============")
f1_test = self.evaluate(dataloader_test)
logger.info("Evaluate on Test Set. F1: %.4f." % f1_test)
    def evaluate(self, dataloader):
        self.model.eval()
        pred_list = []
        y_list = []
        pbar = tqdm(enumerate(dataloader), total=len(dataloader))
        with torch.no_grad():  # inference only: skip gradient tracking
            for i, (X, y) in pbar:
                y_list.extend(y.data.numpy())  # one (seq_len,) gold-label array per example
                X = X.cuda()
                preds = self.model(X)
                pred_list.extend(preds.data.cpu().numpy())  # one (seq_len, num_tag) array per example
        # concatenate over examples
        pred_list = np.concatenate(pred_list, axis=0)  # (n_tokens, num_tag)
        pred_list = np.argmax(pred_list, axis=1)
        y_list = np.concatenate(y_list, axis=0)
        # calculate f1 score
        pred_list = list(pred_list)
        y_list = list(y_list)
lines = []
for pred_index, gold_index in zip(pred_list, y_list):
gold_index = int(gold_index)
if gold_index != pad_token_label_id:
pred_token = label_set[pred_index]
gold_token = label_set[gold_index]
lines.append("w" + " " + pred_token + " " + gold_token)
results = conll2002_measure(lines)
f1 = results["fb1"]
return f1
def save_model(self):
"""
save the best model
"""
saved_path = os.path.join(self.params.saved_folder, self.params.model_name+".pt")
torch.save({
"model": self.model,
}, saved_path)
logger.info("Best model has been saved to %s" % saved_path)
import os
import subprocess
import pickle
import logging
import time
import random
from datetime import timedelta
import numpy as np
def init_experiment(params, logger_filename):
"""
Initialize the experiment:
- save parameters
- create a logger
"""
# save parameters
get_saved_path(params)
pickle.dump(params, open(os.path.join(params.dump_path, "params.pkl"), "wb"))
# create a logger
logger = create_logger(os.path.join(params.dump_path, logger_filename))
logger.info('============ Initialized logger ============')
logger.info('\n'.join('%s: %s' % (k, str(v))
for k, v in sorted(dict(vars(params)).items())))
logger.info('The experiment will be stored in %s\n' % params.dump_path)
return logger
class LogFormatter(object):
def __init__(self):
self.start_time = time.time()
def format(self, record):
elapsed_seconds = round(record.created - self.start_time)
prefix = "%s - %s - %s" % (
record.levelname,
time.strftime('%x %X'),
timedelta(seconds=elapsed_seconds)
)
message = record.getMessage()
message = message.replace('\n', '\n' + ' ' * (len(prefix) + 3))
return "%s - %s" % (prefix, message) if message else ''
def create_logger(filepath):
# create log formatter
log_formatter = LogFormatter()
# create file handler and set level to debug
if filepath is not None:
file_handler = logging.FileHandler(filepath, "a")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(log_formatter)
# create console handler and set level to info
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(log_formatter)
# create logger and set level to debug
logger = logging.getLogger()
logger.handlers = []
logger.setLevel(logging.DEBUG)
logger.propagate = False
if filepath is not None:
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# reset logger elapsed time
def reset_time():
log_formatter.start_time = time.time()
logger.reset_time = reset_time
return logger
def get_saved_path(params):
"""
create a directory to store the experiment
"""
dump_path = "./" if params.dump_path == "" else params.dump_path
if not os.path.isdir(dump_path):
subprocess.Popen("mkdir -p %s" % dump_path, shell=True).wait()
assert os.path.isdir(dump_path)
# create experiment path if it does not exist
exp_path = os.path.join(dump_path, params.exp_name)
if not os.path.exists(exp_path):
subprocess.Popen("mkdir -p %s" % exp_path, shell=True).wait()
# generate id for this experiment
if params.exp_id == "":
chars = "0123456789"
while True:
exp_id = "".join(random.choice(chars) for _ in range(0, 3))
if not os.path.isdir(os.path.join(exp_path, exp_id)):
break
else:
exp_id = params.exp_id
# update dump_path
params.dump_path = os.path.join(exp_path, exp_id)
if not os.path.isdir(params.dump_path):
subprocess.Popen("mkdir -p %s" % params.dump_path, shell=True).wait()
assert os.path.isdir(params.dump_path)
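# Example layout produced above (with dump_path="./exp", exp_name="ner", exp_id=""):
#   ./exp/ner/042/            <- params.dump_path ("042" is a random 3-digit id)
#   ./exp/ner/042/params.pkl  <- written by init_experiment
#   ./exp/ner/042/train.log   <- the logger_filename passed to init_experiment (assumed name)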
from src.config import get_params
from src.utils import init_experiment
from src.dataloader import get_dataloader
from src.model import EntityTagger
from src.trainer import NERTrainer
import torch
import numpy as np
from tqdm import tqdm
import random
def random_seed(seed):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
def train_ner(params):
# initialize experiment
logger = init_experiment(params, logger_filename=params.logger_filename)
# dataloader
dataloader_train, dataloader_dev, dataloader_test = get_dataloader(params.model_name, params.batch_size, params.data_folder)
# BERT-based NER Tagger
model = EntityTagger(params)
model.cuda()
# trainer
trainer = NERTrainer(params, model)
trainer.train(dataloader_train, dataloader_dev, dataloader_test)
if __name__ == "__main__":
params = get_params()
random_seed(params.seed)
train_ner(params)
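# Typical invocation (flag names live in src/config.py, which is not shown here;
# the ones below are illustrative assumptions):
#   python main.py --exp_name ner --exp_id 1 --model_name bert-base-cased --batch_size 32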