Commit 7d044e4e authored by root

add commands and dialog_ctrl

parent 90e0a0dd
#!/bin/bash
srun -p batch_short,batch -A gpu_adlr_nlp -t 2:00:00 --nodes=1 --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --job-name=interact --container-mounts=/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl --container-image=gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel --exclusive --pty bash
#!/bin/bash
#SBATCH -p interactive -A gpu_adlr_nlp -t 1:00:00 --nodes=1 --exclusive --mem=0 --overcommit --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --dependency=singleton --job-name=adlr-nlp-largelm:gpt3-357m
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="python ${DIR}/pretrain_gpt.py ${options}"
srun -l \
--container-image "gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel" \
--container-mounts "/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
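# A note on the batch-size ramp above (assuming the standard Megatron-LM
# semantics of "--rampup-batch-size <start> <increment> <ramp-up samples>"):
# the global batch grows from 32 to 256 in steps of 32, i.e.
# (256 - 32) / 32 = 7 increments spread over the first 1953125 samples.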
#!/bin/bash
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="${DIR}/pretrain_gpt.py ${options}"
GPUS_PER_NODE=16
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS ${run_cmd}
set +x
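# A minimal single-node alternative (assuming PyTorch >= 1.10, where torchrun
# supersedes torch.distributed.launch and accepts the same flags):
# torchrun --nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK \
#          --master_addr $MASTER_ADDR --master_port $MASTER_PORT ${run_cmd}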
INFO - 06/21/21 23:13:46 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:13:46 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:13:46 - 0:00:00 - The experiment will be stored in logs/conll2003/1
INFO - 06/21/21 23:25:29 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:25:29 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:25:29 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:25:29 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:29 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:29 - 0:00:01 - Attempting to acquire lock 22598820184656 on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
INFO - 06/21/21 23:25:29 - 0:00:01 - Lock 22598820184656 acquired on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
DEBUG - 06/21/21 23:25:29 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "GET /roberta-large/resolve/main/config.json HTTP/1.1" 200 482
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Attempting to release lock 22598820184656 on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
INFO - 06/21/21 23:25:30 - 0:00:01 - Lock 22598820184656 released on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Attempting to acquire lock 22598820184656 on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO - 06/21/21 23:25:30 - 0:00:01 - Lock 22598820184656 acquired on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "GET /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 898823
DEBUG - 06/21/21 23:25:30 - 0:00:02 - Attempting to release lock 22598820184656 on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO - 06/21/21 23:25:30 - 0:00:02 - Lock 22598820184656 released on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
DEBUG - 06/21/21 23:25:30 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:02 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Attempting to acquire lock 22597850387840 on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO - 06/21/21 23:25:31 - 0:00:02 - Lock 22597850387840 acquired on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:02 - https://huggingface.co:443 "GET /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 456318
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Attempting to release lock 22597850387840 on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO - 06/21/21 23:25:31 - 0:00:02 - Lock 22597850387840 released on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:03 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:31 - 0:00:03 - Attempting to acquire lock 22597850387840 on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:25:31 - 0:00:03 - Lock 22597850387840 acquired on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
DEBUG - 06/21/21 23:25:31 - 0:00:03 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:32 - 0:00:03 - https://huggingface.co:443 "GET /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 1355863
DEBUG - 06/21/21 23:25:32 - 0:00:03 - Attempting to release lock 22597850387840 on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:25:32 - 0:00:03 - Lock 22597850387840 released on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:26:26 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:26:26 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:26:26 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:26:26 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:26 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:26 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:27 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:27 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:26:39 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Attempting to acquire lock 23082502829920 on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:26:39 - 0:00:13 - Lock 23082502829920 acquired on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): cdn-lfs.huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://cdn-lfs.huggingface.co:443 "GET /roberta-large/36a10a8b694fadf9bf4f9049d14e257e88be45313ae02d882af9e60f39b8b2e8 HTTP/1.1" 200 1425941629
DEBUG - 06/21/21 23:27:01 - 0:00:34 - Attempting to release lock 23082502829920 on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:27:01 - 0:00:34 - Lock 23082502829920 released on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:27:57 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:27:57 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:27:57 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:27:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:57 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:28:09 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:28:09 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:28:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:28:10 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:28:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:28:17 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:28:17 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:29:45 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:29:45 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:29:45 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:29:45 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:45 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:45 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:45 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:45 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:46 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:46 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:46 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:29:57 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:29:57 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:57 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:57 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:57 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:30:04 - 0:00:19 - Start NER training ...
INFO - 06/21/21 23:30:04 - 0:00:19 - ============== epoch 0 ==============
INFO - 06/21/21 23:31:17 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:31:17 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:31:17 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:31:17 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:17 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:17 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:17 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:17 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:18 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:18 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:18 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:31:29 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:31:29 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:30 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:30 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:30 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:31:37 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:31:37 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:33:58 - 0:02:42 - Finish training epoch 0. loss: 0.0696
INFO - 06/21/21 23:33:58 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:34:08 - 0:02:51 - Evaluate on Dev Set. F1: 95.5005.
INFO - 06/21/21 23:34:08 - 0:02:51 - Found better model!!
INFO - 06/21/21 23:48:39 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:48:39 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:48:39 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:48:39 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:39 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:39 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:40 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:40 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:48:51 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:48:51 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:51 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:51 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:51 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:49:00 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:49:00 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/21/21 23:51:22 - 0:02:43 - Finish training epoch 0. loss: 0.0696
INFO - 06/21/21 23:51:22 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:51:31 - 0:02:52 - Evaluate on Dev Set. F1: 95.5005.
INFO - 06/21/21 23:51:31 - 0:02:52 - Found better model!!
INFO - 06/21/21 23:51:33 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:51:33 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/21/21 23:53:55 - 0:05:16 - Finish training epoch 1. loss: 0.0234
INFO - 06/21/21 23:53:55 - 0:05:16 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/21/21 23:54:03 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:54:03 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:54:03 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:54:03 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:05 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:54:05 - 0:05:25 - Evaluate on Dev Set. F1: 96.9048.
INFO - 06/21/21 23:54:05 - 0:05:25 - Found better model!!
INFO - 06/21/21 23:54:06 - 0:05:27 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:54:06 - 0:05:27 - ============== epoch 2 ==============
INFO - 06/21/21 23:54:16 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:54:16 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:16 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:16 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:16 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:54:24 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:54:24 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:55:40 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:55:40 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 123456
INFO - 06/21/21 23:55:40 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:55:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:40 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:55:53 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:55:53 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:53 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:53 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:53 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:56:01 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:56:01 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/21/21 23:56:29 - 0:07:50 - Finish training epoch 2. loss: 0.0162
INFO - 06/21/21 23:56:29 - 0:07:50 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/21/21 23:56:38 - 0:07:59 - Evaluate on Dev Set. F1: 97.3381.
INFO - 06/21/21 23:56:38 - 0:07:59 - Found better model!!
INFO - 06/21/21 23:56:40 - 0:08:01 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:56:40 - 0:08:01 - ============== epoch 3 ==============
INFO - 06/21/21 23:56:47 - 0:02:43 - Finish training epoch 0. loss: 0.0580
INFO - 06/21/21 23:56:47 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:56:56 - 0:02:53 - Evaluate on Dev Set. F1: 96.7327.
INFO - 06/21/21 23:56:56 - 0:02:53 - Found better model!!
INFO - 06/21/21 23:56:58 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:56:58 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/21/21 23:58:25 - 0:02:45 - Finish training epoch 0. loss: 0.0544
INFO - 06/21/21 23:58:25 - 0:02:45 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:58:34 - 0:02:54 - Evaluate on Dev Set. F1: 96.8227.
INFO - 06/21/21 23:58:34 - 0:02:54 - Found better model!!
INFO - 06/21/21 23:58:36 - 0:02:56 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:58:36 - 0:02:56 - ============== epoch 1 ==============
INFO - 06/21/21 23:58:40 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:58:40 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:58:40 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:58:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:40 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:58:57 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:58:57 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:58:57 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:58:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:57 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:59:02 - 0:10:23 - Finish training epoch 3. loss: 0.0136
INFO - 06/21/21 23:59:02 - 0:10:23 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/21/21 23:59:10 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:59:10 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:10 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:59:12 - 0:10:33 - Evaluate on Dev Set. F1: 96.0542.
INFO - 06/21/21 23:59:12 - 0:10:33 - No better model found (1/3)
INFO - 06/21/21 23:59:12 - 0:10:33 - ============== epoch 4 ==============
INFO - 06/21/21 23:59:18 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:59:18 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:59:21 - 0:05:18 - Finish training epoch 1. loss: 0.0190
INFO - 06/21/21 23:59:21 - 0:05:18 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/21/21 23:59:30 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:59:30 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 2e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:59:30 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:59:30 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:30 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:30 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
INFO - 06/21/21 23:59:31 - 0:05:27 - Evaluate on Dev Set. F1: 97.1510.
INFO - 06/21/21 23:59:31 - 0:05:27 - Found better model!!
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:31 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:31 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:59:32 - 0:05:29 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:59:32 - 0:05:29 - ============== epoch 2 ==============
INFO - 06/21/21 23:59:43 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:59:43 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:43 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:43 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:44 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:59:51 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:59:51 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/22/21 00:01:00 - 0:05:20 - Finish training epoch 1. loss: 0.0229
INFO - 06/22/21 00:01:00 - 0:05:20 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:01:10 - 0:05:30 - Evaluate on Dev Set. F1: 97.0174.
INFO - 06/22/21 00:01:10 - 0:05:30 - Found better model!!
INFO - 06/22/21 00:01:12 - 0:05:31 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:01:12 - 0:05:31 - ============== epoch 2 ==============
INFO - 06/22/21 00:01:35 - 0:12:56 - Finish training epoch 4. loss: 0.0170
INFO - 06/22/21 00:01:35 - 0:12:56 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:01:40 - 0:02:43 - Finish training epoch 0. loss: 0.0544
INFO - 06/22/21 00:01:40 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:01:45 - 0:13:05 - Evaluate on Dev Set. F1: 97.1884.
INFO - 06/22/21 00:01:45 - 0:13:05 - No better model found (2/3)
INFO - 06/22/21 00:01:45 - 0:13:05 - ============== epoch 5 ==============
INFO - 06/22/21 00:01:50 - 0:02:53 - Evaluate on Dev Set. F1: 96.2938.
INFO - 06/22/21 00:01:50 - 0:02:53 - Found better model!!
INFO - 06/22/21 00:01:52 - 0:02:55 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:01:52 - 0:02:55 - ============== epoch 1 ==============
INFO - 06/22/21 00:01:55 - 0:07:51 - Finish training epoch 2. loss: 0.0200
INFO - 06/22/21 00:01:55 - 0:07:51 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:02:04 - 0:08:01 - Evaluate on Dev Set. F1: 96.9804.
INFO - 06/22/21 00:02:04 - 0:08:01 - No better model found (1/3)
INFO - 06/22/21 00:02:04 - 0:08:01 - ============== epoch 3 ==============
INFO - 06/22/21 00:02:13 - 0:02:42 - Finish training epoch 0. loss: 0.0547
INFO - 06/22/21 00:02:13 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:02:22 - 0:02:52 - Evaluate on Dev Set. F1: 97.0400.
INFO - 06/22/21 00:02:22 - 0:02:52 - Found better model!!
INFO - 06/22/21 00:02:24 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:02:24 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/22/21 00:03:35 - 0:07:55 - Finish training epoch 2. loss: 0.0173
INFO - 06/22/21 00:03:35 - 0:07:55 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:03:45 - 0:08:04 - Evaluate on Dev Set. F1: 97.3191.
INFO - 06/22/21 00:03:45 - 0:08:04 - Found better model!!
INFO - 06/22/21 00:03:46 - 0:08:06 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:03:46 - 0:08:06 - ============== epoch 3 ==============
INFO - 06/22/21 00:04:07 - 0:15:28 - Finish training epoch 5. loss: 0.0083
INFO - 06/22/21 00:04:07 - 0:15:28 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:04:14 - 0:05:17 - Finish training epoch 1. loss: 0.0182
INFO - 06/22/21 00:04:14 - 0:05:17 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:04:17 - 0:15:37 - Evaluate on Dev Set. F1: 97.3169.
INFO - 06/22/21 00:04:17 - 0:15:37 - No better model found (3/3)
INFO - 06/22/21 00:04:17 - 0:15:37 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:04:24 - 0:05:27 - Evaluate on Dev Set. F1: 97.6314.
INFO - 06/22/21 00:04:24 - 0:05:27 - Found better model!!
INFO - 06/22/21 00:04:26 - 0:15:46 - Evaluate on Test Set. F1: 95.6012.
INFO - 06/22/21 00:04:26 - 0:05:29 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:26 - 0:05:29 - ============== epoch 2 ==============
INFO - 06/22/21 00:04:27 - 0:10:24 - Finish training epoch 3. loss: 0.0157
INFO - 06/22/21 00:04:27 - 0:10:24 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:04:37 - 0:10:33 - Evaluate on Dev Set. F1: 97.6654.
INFO - 06/22/21 00:04:37 - 0:10:33 - Found better model!!
INFO - 06/22/21 00:04:39 - 0:10:35 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:39 - 0:10:35 - ============== epoch 4 ==============
INFO - 06/22/21 00:04:45 - 0:05:15 - Finish training epoch 1. loss: 0.0177
INFO - 06/22/21 00:04:45 - 0:05:15 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:04:55 - 0:05:25 - Evaluate on Dev Set. F1: 97.6093.
INFO - 06/22/21 00:04:55 - 0:05:25 - Found better model!!
INFO - 06/22/21 00:04:56 - 0:05:26 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:56 - 0:05:26 - ============== epoch 2 ==============
INFO - 06/22/21 00:06:10 - 0:10:30 - Finish training epoch 3. loss: 0.0439
INFO - 06/22/21 00:06:10 - 0:10:30 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:06:20 - 0:10:40 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:06:20 - 0:10:40 - No better model found (1/3)
INFO - 06/22/21 00:06:20 - 0:10:40 - ============== epoch 4 ==============
INFO - 06/22/21 00:06:47 - 0:07:50 - Finish training epoch 2. loss: 0.0156
INFO - 06/22/21 00:06:47 - 0:07:50 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:06:57 - 0:07:59 - Evaluate on Dev Set. F1: 97.5384.
INFO - 06/22/21 00:06:57 - 0:07:59 - No better model found (1/3)
INFO - 06/22/21 00:06:57 - 0:07:59 - ============== epoch 3 ==============
INFO - 06/22/21 00:07:02 - 0:12:59 - Finish training epoch 4. loss: 0.0127
INFO - 06/22/21 00:07:02 - 0:12:59 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:07:12 - 0:13:08 - Evaluate on Dev Set. F1: 97.4583.
INFO - 06/22/21 00:07:12 - 0:13:08 - No better model found (1/3)
INFO - 06/22/21 00:07:12 - 0:13:08 - ============== epoch 5 ==============
INFO - 06/22/21 00:07:17 - 0:07:47 - Finish training epoch 2. loss: 0.0115
INFO - 06/22/21 00:07:17 - 0:07:47 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:07:26 - 0:07:56 - Evaluate on Dev Set. F1: 97.2615.
INFO - 06/22/21 00:07:26 - 0:07:56 - No better model found (1/3)
INFO - 06/22/21 00:07:26 - 0:07:56 - ============== epoch 3 ==============
INFO - 06/22/21 00:08:43 - 0:13:03 - Finish training epoch 4. loss: 0.5637
INFO - 06/22/21 00:08:43 - 0:13:03 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:08:53 - 0:13:12 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:08:53 - 0:13:12 - No better model found (2/3)
INFO - 06/22/21 00:08:53 - 0:13:12 - ============== epoch 5 ==============
INFO - 06/22/21 00:09:18 - 0:10:21 - Finish training epoch 3. loss: 0.0110
INFO - 06/22/21 00:09:18 - 0:10:21 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:09:28 - 0:10:31 - Evaluate on Dev Set. F1: 97.2738.
INFO - 06/22/21 00:09:28 - 0:10:31 - No better model found (2/3)
INFO - 06/22/21 00:09:28 - 0:10:31 - ============== epoch 4 ==============
INFO - 06/22/21 00:09:35 - 0:15:31 - Finish training epoch 5. loss: 0.0132
INFO - 06/22/21 00:09:35 - 0:15:31 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:09:45 - 0:15:41 - Evaluate on Dev Set. F1: 97.4630.
INFO - 06/22/21 00:09:45 - 0:15:41 - No better model found (2/3)
INFO - 06/22/21 00:09:45 - 0:15:41 - ============== epoch 6 ==============
INFO - 06/22/21 00:09:47 - 0:10:17 - Finish training epoch 3. loss: 0.0101
INFO - 06/22/21 00:09:47 - 0:10:17 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:09:57 - 0:10:27 - Evaluate on Dev Set. F1: 97.5034.
INFO - 06/22/21 00:09:57 - 0:10:27 - No better model found (2/3)
INFO - 06/22/21 00:09:57 - 0:10:27 - ============== epoch 4 ==============
INFO - 06/22/21 00:11:16 - 0:15:36 - Finish training epoch 5. loss: 0.5620
INFO - 06/22/21 00:11:16 - 0:15:36 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:11:26 - 0:15:45 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:11:26 - 0:15:45 - No better model found (3/3)
INFO - 06/22/21 00:11:26 - 0:15:45 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:11:35 - 0:15:54 - Evaluate on Test Set. F1: 0.0000.
INFO - 06/22/21 00:11:50 - 0:12:53 - Finish training epoch 4. loss: 0.0137
INFO - 06/22/21 00:11:50 - 0:12:53 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:12:00 - 0:13:02 - Evaluate on Dev Set. F1: 97.4501.
INFO - 06/22/21 00:12:00 - 0:13:02 - No better model found (3/3)
INFO - 06/22/21 00:12:00 - 0:13:02 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:08 - 0:18:04 - Finish training epoch 6. loss: 0.0129
INFO - 06/22/21 00:12:08 - 0:18:04 - ============== Evaluate epoch 6 on Dev Set ==============
INFO - 06/22/21 00:12:09 - 0:13:11 - Evaluate on Test Set. F1: 95.4761.
INFO - 06/22/21 00:12:17 - 0:18:14 - Evaluate on Dev Set. F1: 97.2311.
INFO - 06/22/21 00:12:17 - 0:18:14 - No better model found (3/3)
INFO - 06/22/21 00:12:17 - 0:18:14 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:19 - 0:12:48 - Finish training epoch 4. loss: 0.0074
INFO - 06/22/21 00:12:19 - 0:12:48 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:12:26 - 0:18:23 - Evaluate on Test Set. F1: 95.2934.
INFO - 06/22/21 00:12:28 - 0:12:58 - Evaluate on Dev Set. F1: 97.0406.
INFO - 06/22/21 00:12:28 - 0:12:58 - No better model found (3/3)
INFO - 06/22/21 00:12:28 - 0:12:58 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:37 - 0:13:07 - Evaluate on Test Set. F1: 95.3264.
INFO - 06/22/21 00:16:11 - 0:00:00 - ============ Initialized logger ============
INFO - 06/22/21 00:16:11 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/22/21 00:16:11 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/22/21 00:16:11 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:13 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/22/21 00:16:24 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/22/21 00:16:24 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:24 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:24 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:24 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/22/21 00:16:31 - 0:00:20 - Start NER training ...
INFO - 06/22/21 00:16:31 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/22/21 00:18:53 - 0:02:42 - Finish training epoch 0. loss: 0.0544
INFO - 06/22/21 00:18:53 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:19:03 - 0:02:51 - Evaluate on Dev Set. F1: 96.2938.
INFO - 06/22/21 00:19:03 - 0:02:51 - Found better model!!
INFO - 06/22/21 00:19:05 - 0:02:53 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:19:05 - 0:02:53 - ============== epoch 1 ==============
CUDA_VISIBLE_DEVICES=0 python train_ner.py --exp_name conll2003 --exp_id 1 --model_name roberta-large --lr 3e-5 --seed 111
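# A minimal sweep sketch (assumed, not part of the original commands): the logs
# above mix lr in {5e-5, 3e-5, 2e-5} with seed in {555, 111, 123456}, and their
# interleaved timestamps suggest parallel runs on separate GPUs; the loop below
# serializes the same grid on GPU 0.
for lr in 5e-5 3e-5 2e-5; do
    for seed in 555 111 123456; do
        CUDA_VISIBLE_DEVICES=0 python train_ner.py --exp_name conll2003 --exp_id 1 --model_name roberta-large --lr $lr --seed $seed
    done
done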
import argparse
def get_params():
    parser = argparse.ArgumentParser(description="NER Task")
    parser.add_argument("--exp_name", type=str, default="conll2003", help="Experiment name")
    parser.add_argument("--logger_filename", type=str, default="train.log")
    parser.add_argument("--dump_path", type=str, default="logs", help="Experiment saved root path")
    parser.add_argument("--exp_id", type=str, default="1", help="Experiment id")
    parser.add_argument("--model_name", type=str, default="roberta-large", help="model name")
    parser.add_argument("--seed", type=int, default=111, help="random seed")

    # train parameters
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
    parser.add_argument("--epoch", type=int, default=300, help="Number of epochs")
    parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    parser.add_argument("--early_stop", type=int, default=3, help="Stop training after this many epochs without improvement")
    parser.add_argument("--num_tag", type=int, default=3, help="Number of entity tag types in the dataset (O, B-ENTITY, I-ENTITY)")
    parser.add_argument("--dropout", type=float, default=0.1, help="dropout rate")
    parser.add_argument("--hidden_dim", type=int, default=1024, help="Hidden layer dimension")
    parser.add_argument("--data_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003", help="NER data folder")
    parser.add_argument("--saved_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model", help="Folder where the best NER model checkpoint is saved")

    params = parser.parse_args()
    return params
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import os
from tqdm import tqdm
import logging
logger = logging.getLogger()
pad_token_label_id = nn.CrossEntropyLoss().ignore_index
label_set = ["O", "B-ENTITY", "I-ENTITY"]
def read_ner(tokenizer, datapath):
    inputs, labels = [], []
    with open(datapath, "r") as fr:
        token_list, label_list = [], []
        for i, line in enumerate(fr):
            line = line.strip()
            if line == "":
                if len(token_list) > 0:
                    assert len(token_list) == len(label_list)
                    inputs.append([tokenizer.cls_token_id] + token_list + [tokenizer.sep_token_id])
                    labels.append([pad_token_label_id] + label_list + [pad_token_label_id])
                token_list, label_list = [], []
                continue

            splits = line.split("\t")
            token = splits[0]
            label = splits[1]
            if label.startswith("B-"):
                label = "B-ENTITY"
            elif label.startswith("I-"):
                label = "I-ENTITY"

            subs_ = tokenizer.tokenize(token)
            if len(subs_) > 0:
                # label the first subword only; continuation subwords get the ignore index
                label_list.extend([label_set.index(label)] + [pad_token_label_id] * (len(subs_) - 1))
                token_list.extend(tokenizer.convert_tokens_to_ids(subs_))
            else:
                print("length of subwords for %s is zero; its label is %s" % (token, label))

    return inputs, labels
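# A minimal usage sketch (assumed, not part of the original file): read_ner()
# expects one "token<TAB>label" pair per line with a blank line between
# sentences, and collapses every B-*/I-* type to B-ENTITY/I-ENTITY, e.g.
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("roberta-large")
#     inputs, labels = read_ner(tok, "train.txt")
#     # inputs[0]: [cls_token_id, subword ids..., sep_token_id]
#     # labels[0]: [pad_token_label_id, tag ids (continuations padded), pad_token_label_id]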
class Dataset(data.Dataset):
    def __init__(self, tokenizer, inputs, labels):
        self.X = inputs
        self.y = labels
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return len(self.X)

    def collate_fn(self, data):
        X, y = zip(*data)
        lengths = [len(bs_x) for bs_x in X]
        max_lengths = max(lengths)
        padded_seqs = torch.LongTensor(len(X), max_lengths).fill_(self.tokenizer.pad_token_id)
        padded_y = torch.LongTensor(len(X), max_lengths).fill_(pad_token_label_id)
        for i, (seq, y_) in enumerate(zip(X, y)):
            length = lengths[i]
            padded_seqs[i, :length] = torch.LongTensor(seq)
            padded_y[i, :length] = torch.LongTensor(y_)
        return padded_seqs, padded_y
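# A small sketch of collate_fn's padding (illustrative ids, assuming RoBERTa's
# pad_token_id of 1; pad_token_label_id is -100, CrossEntropyLoss's ignore_index):
#
#     batch = [([0, 11, 12, 2], [-100, 0, 1, -100]),
#              ([0, 13, 2], [-100, 2, -100])]
#     padded_seqs, padded_y = dataset.collate_fn(batch)
#     # padded_seqs: [[0, 11, 12, 2], [0, 13, 2, 1]]               (right-padded with 1)
#     # padded_y:    [[-100, 0, 1, -100], [-100, 2, -100, -100]]   (padded with -100)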
def get_dataloader(model_name, batch_size, data_folder):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs_train, labels_train = read_ner(tokenizer, os.path.join(data_folder, "train.txt"))
    inputs_dev, labels_dev = read_ner(tokenizer, os.path.join(data_folder, "dev.txt"))
    inputs_test, labels_test = read_ner(tokenizer, os.path.join(data_folder, "test.txt"))
    logger.info("conll2003 dataset: train size: %d; dev size %d; test size: %d" % (len(inputs_train), len(inputs_dev), len(inputs_test)))

    dataset_train = Dataset(tokenizer, inputs_train, labels_train)
    dataset_dev = Dataset(tokenizer, inputs_dev, labels_dev)
    dataset_test = Dataset(tokenizer, inputs_test, labels_test)

    dataloader_train = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True, collate_fn=dataset_train.collate_fn)
    dataloader_dev = DataLoader(dataset=dataset_dev, batch_size=batch_size, shuffle=False, collate_fn=dataset_dev.collate_fn)
    dataloader_test = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False, collate_fn=dataset_test.collate_fn)

    return dataloader_train, dataloader_dev, dataloader_test
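# A minimal wiring sketch (assumed; train_ner.py in this commit is the actual
# entry point):
#
#     params = get_params()
#     dl_train, dl_dev, dl_test = get_dataloader(params.model_name, params.batch_size, params.data_folder)
#     for padded_seqs, padded_y in dl_train:
#         pass  # feed padded_seqs to the encoder and score logits against padded_y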
#!/usr/bin/env python
# Python version of the evaluation script from CoNLL'00-
# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported
import sys
import re
from collections import defaultdict, namedtuple
ANY_SPACE = '<SPACE>'
class FormatError(Exception):
    pass

Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')

class EvalCounts(object):
    def __init__(self):
        self.correct_chunk = 0    # number of correctly identified chunks
        self.correct_tags = 0     # number of correct chunk tags
        self.found_correct = 0    # number of chunks in corpus
        self.found_guessed = 0    # number of identified chunks
        self.token_counter = 0    # token counter (ignores sentence breaks)

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_found_correct = defaultdict(int)
        self.t_found_guessed = defaultdict(int)
def parse_args(argv):
    import argparse
    parser = argparse.ArgumentParser(
        description='evaluate tagging results using CoNLL criteria',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    arg = parser.add_argument
    arg('-b', '--boundary', metavar='STR', default='-X-',
        help='sentence boundary')
    arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
        help='character delimiting items in input')
    arg('-o', '--otag', metavar='CHAR', default='O',
        help='alternative outside tag')
    arg('file', nargs='?', default=None)
    return parser.parse_args(argv)
def parse_tag(t):
    m = re.match(r'^([^-]*)-(.*)$', t)
    return m.groups() if m else (t, '')
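# For example, given the regex above:
#     parse_tag('B-ENTITY')  # ('B', 'ENTITY')
#     parse_tag('I-PER')     # ('I', 'PER')
#     parse_tag('O')         # ('O', ''): no dash, so no match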
def evaluate(lines, options=None):
    if options is None:
        options = parse_args([])  # use defaults

    counts = EvalCounts()
    num_features = None       # number of features per line
    in_correct = False        # currently processed chunks is correct until now
    last_correct = 'O'        # previous chunk tag in corpus
    last_correct_type = ''    # type of previously identified chunk tag
    last_guessed = 'O'        # previously identified chunk tag
    last_guessed_type = ''    # type of previous chunk tag in corpus

    for line in lines:
        line = line.rstrip('\r\n')

        if options.delimiter == ANY_SPACE:
            features = line.split()
        else:
            features = line.split(options.delimiter)

        if num_features is None:
            num_features = len(features)
        elif num_features != len(features) and len(features) != 0:
            raise FormatError('unexpected number of features: %d (%d)' %
                              (len(features), num_features))

        if len(features) == 0 or features[0] == options.boundary:
            features = [options.boundary, 'O', 'O']
        if len(features) < 3:
            raise FormatError('unexpected number of features in line %s' % line)

        guessed, guessed_type = parse_tag(features.pop())
        correct, correct_type = parse_tag(features.pop())
        first_item = features.pop(0)

        if first_item == options.boundary:
            guessed = 'O'

        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            if (end_correct and end_guessed and
                last_guessed_type == last_correct_type):
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                in_correct = False

        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.found_correct += 1
            counts.t_found_correct[correct_type] += 1
        if start_guessed:
            counts.found_guessed += 1
            counts.t_found_guessed[guessed_type] += 1
        if first_item != options.boundary:
            if correct == guessed and guessed_type == correct_type:
                counts.correct_tags += 1
            counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts
def uniq(iterable):
seen = set()
return [i for i in iterable if not (i in seen or seen.add(i))]
def calculate_metrics(correct, guessed, total):
tp, fp, fn = correct, guessed-correct, total-correct
p = 0 if tp + fp == 0 else 1.*tp / (tp + fp)
r = 0 if tp + fn == 0 else 1.*tp / (tp + fn)
f = 0 if p + r == 0 else 2 * p * r / (p + r)
return Metrics(tp, fp, fn, p, r, f)
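# Worked example: 8 correct chunks, 10 guessed, 12 in the gold standard gives
#   tp=8, fp=2, fn=4; precision = 8/10 = 0.8; recall = 8/12 ~= 0.667; F1 ~= 0.727
#   calculate_metrics(8, 10, 12) -> Metrics(tp=8, fp=2, fn=4, prec=0.8, rec=0.666..., fscore=0.727...)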
def metrics(counts):
c = counts
overall = calculate_metrics(
c.correct_chunk, c.found_guessed, c.found_correct
)
by_type = {}
for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)):
by_type[t] = calculate_metrics(
c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t]
)
return overall, by_type
def report(counts, out=None):
if out is None:
out = sys.stdout
overall, by_type = metrics(counts)
c = counts
# out.write('processed %d tokens with %d phrases; ' %
# (c.token_counter, c.found_correct))
# out.write('found: %d phrases; correct: %d.\n' %
# (c.found_guessed, c.correct_chunk))
results = {}
if c.token_counter > 0:
results["fb1"] = 100.*overall.fscore
    # per-type details (commented out to keep the report minimal):
# for i, m in sorted(by_type.items()):
# print('%17s: ' % i)
# print('precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f %d\n' % (100.*m.prec, 100.*m.rec, 100.*m.fscore, c.t_found_guessed[i]))
return results
def end_of_chunk(prev_tag, tag, prev_type, type_):
# check if a chunk ended between the previous and current word
# arguments: previous and current chunk tags, previous and current types
chunk_end = False
if prev_tag == 'E': chunk_end = True
if prev_tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'B': chunk_end = True
if prev_tag == 'B' and tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'O': chunk_end = True
if prev_tag == 'I' and tag == 'B': chunk_end = True
if prev_tag == 'I' and tag == 'S': chunk_end = True
if prev_tag == 'I' and tag == 'O': chunk_end = True
if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
chunk_end = True
# these chunks are assumed to have length 1
if prev_tag == ']': chunk_end = True
if prev_tag == '[': chunk_end = True
return chunk_end
def start_of_chunk(prev_tag, tag, prev_type, type_):
# check if a chunk started between the previous and current word
# arguments: previous and current chunk tags, previous and current types
chunk_start = False
if tag == 'B': chunk_start = True
if tag == 'S': chunk_start = True
if prev_tag == 'E' and tag == 'E': chunk_start = True
if prev_tag == 'E' and tag == 'I': chunk_start = True
if prev_tag == 'S' and tag == 'E': chunk_start = True
if prev_tag == 'S' and tag == 'I': chunk_start = True
if prev_tag == 'O' and tag == 'E': chunk_start = True
if prev_tag == 'O' and tag == 'I': chunk_start = True
if tag != 'O' and tag != '.' and prev_type != type_:
chunk_start = True
# these chunks are assumed to have length 1
if tag == '[': chunk_start = True
if tag == ']': chunk_start = True
return chunk_start
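# Boundary examples for IOB-style tags (same type 'PER' on both sides):
#   end_of_chunk('I', 'B', 'PER', 'PER')   -> True   (a new B ends the running chunk)
#   start_of_chunk('I', 'B', 'PER', 'PER') -> True   (and starts a new one)
#   start_of_chunk('B', 'I', 'PER', 'PER') -> False  (I continues the chunk)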
def main(argv):
args = parse_args(argv[1:])
if args.file is None:
counts = evaluate(sys.stdin, args)
else:
with open(args.file) as f:
counts = evaluate(f, args)
report(counts)
def conll2002_measure(lines, verbose=False):
counts = evaluate(lines, None)
return report(counts)
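# Usage sketch: each line is "token tag_a tag_b"; evaluate() pops the last column as
# the guess and the middle one as the gold tag. FB1 is symmetric under swapping the
# two, so the trainer's "token pred gold" order still yields the correct F1.
#   lines = ["w B-PER B-PER", "w I-PER I-PER", "w O O"]
#   conll2002_measure(lines)  # -> {"fb1": 100.0}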
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import AutoModel
class EntityTagger(nn.Module):
def __init__(self, params):
super(EntityTagger, self).__init__()
self.num_tag = params.num_tag
self.hidden_dim = params.hidden_dim
self.model = AutoModel.from_pretrained(params.model_name)
self.dropout = nn.Dropout(params.dropout)
self.linear = nn.Linear(self.hidden_dim, self.num_tag)
def forward(self, X):
        outputs = self.model(X)  # encoder output; outputs[0] is the last hidden state
outputs = outputs[0] # (bsz, seq_len, hidden_dim)
outputs = self.dropout(outputs)
prediction = self.linear(outputs)
return prediction
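# Shape sketch (illustrative sizes): with a base-size encoder (hidden_dim=768),
# a (bsz, seq_len) LongTensor of token ids maps to (bsz, seq_len, num_tag) logits:
#   tagger = EntityTagger(params)  # params.model_name e.g. "bert-base-cased" (assumed)
#   logits = tagger(X)             # X: (2, 16) -> logits: (2, 16, num_tag)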
import torch
import torch.nn as nn
from src.metrics import *
from src.dataloader import label_set, pad_token_label_id
import os
import numpy as np
from tqdm import tqdm
import logging
logger = logging.getLogger()
class NERTrainer(object):
def __init__(self, params, model):
self.params = params
self.model = model
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=params.lr)
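        # NOTE: padded positions carry pad_token_label_id; the plain CrossEntropyLoss
        # below only skips them if that value equals the default ignore_index (-100).
        # If the dataloader uses a different pad label, pass it explicitly:
        # nn.CrossEntropyLoss(ignore_index=pad_token_label_id).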
self.loss_fn = nn.CrossEntropyLoss()
self.early_stop = params.early_stop
self.no_improvement_num = 0
self.best_dev_f1 = 0
def train_step(self, X, y):
self.model.train()
preds = self.model(X)
        y = y.view(-1)  # flatten labels to (bsz * seq_len,)
        preds = preds.view(-1, preds.size(-1))  # flatten logits to (bsz * seq_len, num_tag)
self.optimizer.zero_grad()
loss = self.loss_fn(preds, y)
loss.backward()
self.optimizer.step()
return loss.item()
def train(self, dataloader_train, dataloader_dev, dataloader_test):
logger.info("Start NER training ...")
for e in range(self.params.epoch):
logger.info("============== epoch %d ==============" % e)
loss_list = []
pbar = tqdm(enumerate(dataloader_train), total=len(dataloader_train))
for i, (X, y) in pbar:
X, y = X.cuda(), y.cuda()
loss = self.train_step(X, y)
loss_list.append(loss)
pbar.set_description("(Epoch {}) LOSS:{:.4f}".format(e, np.mean(loss_list)))
logger.info("Finish training epoch %d. loss: %.4f" % (e, np.mean(loss_list)))
logger.info("============== Evaluate epoch %d on Dev Set ==============" % e)
f1_dev = self.evaluate(dataloader_dev)
logger.info("Evaluate on Dev Set. F1: %.4f." % f1_dev)
if f1_dev > self.best_dev_f1:
logger.info("Found better model!!")
self.best_dev_f1 = f1_dev
self.no_improvement_num = 0
self.save_model()
else:
self.no_improvement_num += 1
logger.info("No better model found (%d/%d)" % (self.no_improvement_num, self.early_stop))
if self.no_improvement_num >= self.early_stop:
break
logger.info("============== Evaluate on Test Set ==============")
f1_test = self.evaluate(dataloader_test)
logger.info("Evaluate on Test Set. F1: %.4f." % f1_test)
    def evaluate(self, dataloader):
        self.model.eval()
        pred_list = []
        y_list = []
        pbar = tqdm(enumerate(dataloader), total=len(dataloader))
        with torch.no_grad():  # inference only: skip gradient tracking
            for i, (X, y) in pbar:
                y_list.extend(y.data.numpy())  # one (seq_len,) gold-label array per example
                X = X.cuda()
                preds = self.model(X)
                pred_list.extend(preds.data.cpu().numpy())  # one (seq_len, num_tag) array per example
        # concatenate over examples
        pred_list = np.concatenate(pred_list, axis=0)  # (n_tokens, num_tag)
        pred_list = np.argmax(pred_list, axis=1)
        y_list = np.concatenate(y_list, axis=0)
        # calculate f1 score
        pred_list = list(pred_list)
        y_list = list(y_list)
lines = []
for pred_index, gold_index in zip(pred_list, y_list):
gold_index = int(gold_index)
if gold_index != pad_token_label_id:
pred_token = label_set[pred_index]
gold_token = label_set[gold_index]
lines.append("w" + " " + pred_token + " " + gold_token)
results = conll2002_measure(lines)
f1 = results["fb1"]
return f1
def save_model(self):
"""
save the best model
"""
saved_path = os.path.join(self.params.saved_folder, self.params.model_name+".pt")
torch.save({
"model": self.model,
}, saved_path)
logger.info("Best model has been saved to %s" % saved_path)
import os
import subprocess
import pickle
import logging
import time
import random
from datetime import timedelta
import numpy as np
def init_experiment(params, logger_filename):
"""
Initialize the experiment:
- save parameters
- create a logger
"""
# save parameters
get_saved_path(params)
pickle.dump(params, open(os.path.join(params.dump_path, "params.pkl"), "wb"))
# create a logger
logger = create_logger(os.path.join(params.dump_path, logger_filename))
logger.info('============ Initialized logger ============')
logger.info('\n'.join('%s: %s' % (k, str(v))
for k, v in sorted(dict(vars(params)).items())))
logger.info('The experiment will be stored in %s\n' % params.dump_path)
return logger
class LogFormatter(object):
def __init__(self):
self.start_time = time.time()
def format(self, record):
elapsed_seconds = round(record.created - self.start_time)
prefix = "%s - %s - %s" % (
record.levelname,
time.strftime('%x %X'),
timedelta(seconds=elapsed_seconds)
)
message = record.getMessage()
message = message.replace('\n', '\n' + ' ' * (len(prefix) + 3))
return "%s - %s" % (prefix, message) if message else ''
def create_logger(filepath):
# create log formatter
log_formatter = LogFormatter()
# create file handler and set level to debug
if filepath is not None:
file_handler = logging.FileHandler(filepath, "a")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(log_formatter)
# create console handler and set level to info
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(log_formatter)
# create logger and set level to debug
logger = logging.getLogger()
logger.handlers = []
logger.setLevel(logging.DEBUG)
logger.propagate = False
if filepath is not None:
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# reset logger elapsed time
def reset_time():
log_formatter.start_time = time.time()
logger.reset_time = reset_time
return logger
def get_saved_path(params):
"""
create a directory to store the experiment
"""
dump_path = "./" if params.dump_path == "" else params.dump_path
if not os.path.isdir(dump_path):
subprocess.Popen("mkdir -p %s" % dump_path, shell=True).wait()
assert os.path.isdir(dump_path)
# create experiment path if it does not exist
exp_path = os.path.join(dump_path, params.exp_name)
if not os.path.exists(exp_path):
subprocess.Popen("mkdir -p %s" % exp_path, shell=True).wait()
# generate id for this experiment
if params.exp_id == "":
chars = "0123456789"
while True:
exp_id = "".join(random.choice(chars) for _ in range(0, 3))
if not os.path.isdir(os.path.join(exp_path, exp_id)):
break
else:
exp_id = params.exp_id
# update dump_path
params.dump_path = os.path.join(exp_path, exp_id)
if not os.path.isdir(params.dump_path):
subprocess.Popen("mkdir -p %s" % params.dump_path, shell=True).wait()
assert os.path.isdir(params.dump_path)
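# Example layout produced above (with dump_path="./exp", exp_name="ner", exp_id=""):
#   ./exp/ner/042/            <- params.dump_path ("042" is a random 3-digit id)
#   ./exp/ner/042/params.pkl  <- written by init_experiment
#   ./exp/ner/042/train.log   <- the logger_filename passed to init_experiment (assumed name)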
from src.config import get_params
from src.utils import init_experiment
from src.dataloader import get_dataloader
from src.model import EntityTagger
from src.trainer import NERTrainer
import torch
import numpy as np
from tqdm import tqdm
import random
def random_seed(seed):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
def train_ner(params):
# initialize experiment
logger = init_experiment(params, logger_filename=params.logger_filename)
# dataloader
dataloader_train, dataloader_dev, dataloader_test = get_dataloader(params.model_name, params.batch_size, params.data_folder)
# BERT-based NER Tagger
model = EntityTagger(params)
model.cuda()
# trainer
trainer = NERTrainer(params, model)
trainer.train(dataloader_train, dataloader_dev, dataloader_test)
if __name__ == "__main__":
params = get_params()
random_seed(params.seed)
train_ner(params)
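# Typical invocation (flag names live in src/config.py, which is not shown here;
# the ones below are illustrative assumptions):
#   python main.py --exp_name ner --exp_id 1 --model_name bert-base-cased --batch_size 32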