Commit df33344a authored by zihanl

process wow

parent 7d044e4e
@@ -3,4 +3,8 @@ __pycache__
 # Distribution / packaging
 build/
 dist/
-*.egg-info/
\ No newline at end of file
+*.egg-info/
+tensorboard/
+commands
+*.log
+logs
\ No newline at end of file
#!/bin/bash
srun -p batch_short,batch -A gpu_adlr_nlp -t 2:00:00 --nodes=1 --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --job-name=interact --container-mounts=/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl --container-image=gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel --exclusive --pty bash
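
# Requests an interactive, exclusive 16-GPU node (2-hour limit) and drops into a
# shell inside the Megatron container, with the datasets and home directories mounted.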
#!/bin/bash
#SBATCH -p interactive -A gpu_adlr_nlp -t 1:00:00 --nodes=1 --exclusive --mem=0 --overcommit --ntasks-per-node=16 --gres=gpu:16,gpfs:circe --dependency=singleton --job-name=adlr-nlp-largelm:gpt3-357m
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
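
# Flag notes (standard Megatron-LM semantics, stated here as an aid rather than
# as part of the original script):
# - 24 layers x 1024 hidden x 16 heads matches the 357M-parameter GPT
#   configuration named above.
# - --rampup-batch-size 32 32 1953125 ramps the global batch size from 32 in
#   increments of 32 over the first 1,953,125 samples before settling at
#   --global-batch-size 256.
# - --exit-interval 100 makes the job exit after 100 iterations, so this run is
#   effectively a short smoke test despite --train-samples 192000000.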
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="python ${DIR}/pretrain_gpt.py ${options}"
srun -l \
--container-image "gitlab-master.nvidia.com/adlr/megatron-lm/pytorch-nlp-retriever-faiss:20.12-py3-devel" \
--container-mounts "/gpfs/fs1/projects/gpu_adlr/datasets:/gpfs/fs1/projects/gpu_adlr/datasets,/home/zihanl:/home/zihanl" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
#!/bin/bash
NAME="gpt3-357m"
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
TENSORBOARD_DIR="${DIR}/tensorboard/${NAME}"
mkdir -p ${TENSORBOARD_DIR}
DATA_PATH=/gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/roberta_dataset/rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_text_document
options=" \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 2 \
--global-batch-size 256 \
--rampup-batch-size 32 32 1953125 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_PATH} \
--vocab-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-vocab.json \
--merge-file /gpfs/fs1/projects/gpu_adlr/datasets/nlp/gpt2_indexed_dataset/bpe/gpt2-merges.txt \
--save-interval 10000 \
--exit-interval 100 \
--save /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--load /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/gpt3/${NAME} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations "
run_cmd="${DIR}/pretrain_gpt.py ${options}"
GPUS_PER_NODE=16
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
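# WORLD_SIZE (GPUS_PER_NODE * NNODES = 16) is computed for reference only;
# torch.distributed.launch derives the world size and per-process ranks from the
# --nproc_per_node and --nnodes values passed in DISTRIBUTED_ARGS.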
python -m torch.distributed.launch $DISTRIBUTED_ARGS ${run_cmd}
set +x
from src.config import get_params
from transformers import AutoTokenizer
import torch
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import os
wn_lemma = WordNetLemmatizer()
stop_words = stopwords.words('english')
stop_words.append("n't")
stop_words.append("'s")
punctuations = list(string.punctuation)
punctuations.append("``")
punctuations.append("''")
stop_words_and_punctuations = stop_words + punctuations
stop_words_and_punctuations_table = {word: True for word in stop_words_and_punctuations}
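# The dict gives O(1) membership checks; it is consulted below to skip stop words
# and punctuation when scanning control-sentence words for overlap phrases.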
label_set = ["O", "B", "I"]
def read_data(input_datapath):
data = []
print("Reading data from %s" % input_datapath)
with open(input_datapath, "r") as f:
for i, line in enumerate(f):
line = line.strip()
splits = line.split("\t")
length = len(splits)
assert length == 2 or length == 4
# length is 2: dialog context + response
# length is 4: dialog context + topic + control sentence + response
if length == 2:
# dialog context + response
data.append(line)
else:
# only need dialog context + control sentence + response
data.append(splits[0] + "\t" + splits[2] + "\t" + splits[3])
return data
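
# Illustrative input lines (hypothetical examples; fields are tab-separated):
#   2 fields: "hi [SEP] any hobbies?\tI like hiking"
#   4 fields: "hi [SEP] any hobbies?\thiking\tHiking is a long walk.\tI like hiking"
# For 4-field lines, the topic (second field) is dropped and only the dialog
# context, control sentence, and response are kept.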
def write_data(output_datapath, output_data):
print("Writing data to %s" % output_datapath)
with open(output_datapath, "w") as fw:
for data_sample in output_data:
fw.write(data_sample + "\n")
def detect_entities(tokenizer, ner_model, sentence):
tokens = sentence.split()
token_ids, first_tok_masks = [tokenizer.cls_token_id], [0]
for token in tokens:
subs_ = tokenizer.tokenize(token)
assert len(subs_) > 0
token_ids.extend(tokenizer.convert_tokens_to_ids(subs_))
first_tok_masks.extend([1] + [0] * (len(subs_) - 1))
token_ids.append(tokenizer.sep_token_id)
first_tok_masks.append(0)
token_ids = torch.LongTensor([token_ids]).cuda()
predictions = ner_model(token_ids)
predictions = predictions[0].data.cpu().numpy() # (seq_len, 3)
pred_ids = list(np.argmax(predictions, axis=1))
assert len(pred_ids) == len(first_tok_masks)
preds_for_each_word = []
for pred_id, mask in zip(pred_ids, first_tok_masks):
if mask == 1:
preds_for_each_word.append(label_set[pred_id])
assert len(preds_for_each_word) == len(tokens)
# extract entities
entity_list = []
temp = []
for i, (token, pred) in enumerate(zip(tokens, preds_for_each_word)):
if pred == "O":
if len(temp) > 0:
entity_list.append(" ".join(temp))
temp = []
else:
# pred == "B" or pred == "I"
temp.append(token)
    # flush a trailing entity at the end of the sentence (otherwise it is lost)
    if len(temp) > 0:
        entity_list.append(" ".join(temp))
    return entity_list
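
# Minimal usage sketch (hypothetical sentence; assumes the fine-tuned NER model
# and tokenizer loaded in main() below):
#   detect_entities(tokenizer, ner_model, "Alan Turing lived in Cambridge")
#   # with word-level BIO tags B I O O B this returns
#   # ["Alan Turing", "Cambridge"]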
def generate_entity_control_data(tokenizer, ner_model, input_data):
# aim to generate:
    # dialog context + entity control code (optional) + relevant control sentence (containing the entity) + response
output_data = []
## TODO
n_skip, n_skip_no_overlap, n_skip_one_contain_another = 0, 0, 0
n_control, n_entity_control, n_overlap_control = 0, 0, 0
total_num_control_code = 0
for sample_idx, data_item in enumerate(tqdm(input_data)):
# # Debug only
# if sample_idx > 1000:
# break
# 1. detect entities for dialog context, control sentence and response
splits = data_item.split("\t")
if len(splits) == 2:
output_data.append(data_item)
continue
assert len(splits) == 3
last_turn = splits[0].split(" [SEP] ")[-1]
control_sent = splits[1]
response = splits[2]
if control_sent in response or response in control_sent:
            # if the whole control_sent is part of the response or vice versa, skip this data sample
n_skip += 1
n_skip_one_contain_another += 1
continue
last_turn_entities = detect_entities(tokenizer, ner_model, last_turn)
control_sent_entities = detect_entities(tokenizer, ner_model, control_sent)
response_entities = detect_entities(tokenizer, ner_model, response)
        # 2. generate control code:
        # 2.1 If one or more entities are common to last_turn, the control sentence, and the response, there is no need to use an entity as control.
        # 2.2 If an entity exists only in the control sentence and the response, use it as the control code.
        # 2.3 If there are no overlapping entities or words between the control sentence and the response, skip this data sample.
        # 2.4 If there are no overlapping entities but there are overlapping words, add each entity in the control sentence (if any) as a control code, provided it does not appear in the dialog context.
# TODO
# In general, need to trim the control sentence when it is too long.
# Need to lowercase to match?
# calculate common entity between control sentence and response
common_entity_list = []
for ctrl_entity in control_sent_entities:
for resp_entity in response_entities:
if resp_entity in ctrl_entity:
common_entity_list.append(ctrl_entity)
break
elif ctrl_entity in resp_entity:
common_entity_list.append(resp_entity)
break
if len(common_entity_list) == 0:
# calculate overlap between control sentence and response
control_word_list = control_sent.split()
response_word_list = response.split()
response_word_table = {wn_lemma.lemmatize(word): True for word in response_word_list}
overlap_phrases = []
temp = []
for word in control_word_list:
if word.lower() in stop_words_and_punctuations_table:
continue
if wn_lemma.lemmatize(word) in response_word_table:
temp.append(word)
else:
if len(temp) > 0:
if len(temp) > 4:
temp = temp[:4]
overlap_phrases.append(" ".join(temp))
                        temp = []
            # flush trailing overlapping words at the end of the control sentence
            if len(temp) > 0:
                overlap_phrases.append(" ".join(temp[:4]))
            if len(overlap_phrases) == 0:
# skip this data sample
n_skip += 1
n_skip_no_overlap += 1
continue
n_control += 1
control_code_list = []
if len(control_sent_entities) > 0:
n_entity_control += 1
                # sort control_sent_entities by length (longest first)
control_sent_entities = sorted(control_sent_entities, key=len, reverse=True)
for entity in control_sent_entities:
if entity not in last_turn:
add_flag = True
for code in control_code_list:
if entity in code:
add_flag = False
break
if add_flag:
control_code_list.append(entity)
else:
n_overlap_control += 1
                # sort overlap_phrases by length (longest first) and keep at most 3
overlap_phrases = sorted(overlap_phrases, key=len, reverse=True)[:3]
for phrase in overlap_phrases:
if phrase not in last_turn:
add_flag = True
for code in control_code_list:
if phrase in code:
                                # skip phrases already covered by an existing code
add_flag = False
break
if add_flag:
control_code_list.append(phrase)
else:
n_entity_control += 1
n_control += 1
control_code_list = []
            # sort common_entity_list by length (longest first)
common_entity_list = sorted(common_entity_list, key=len, reverse=True)
for entity in common_entity_list:
if entity not in last_turn:
add_flag = True
for code in control_code_list:
if entity in code:
add_flag = False
break
if add_flag:
control_code_list.append(entity)
total_num_control_code += len(control_code_list)
if len(control_code_list) > 0:
output_data.append(splits[0] + "\t" + " [CTRL] ".join(control_code_list) + "\t" + control_sent + "\t" + response)
else:
output_data.append(splits[0] + "\t" + control_sent + "\t" + response)
    # guard against division by zero when no control cases exist
    avg_num_control_code = total_num_control_code * 1.0 / max(n_control, 1)
    print("Number of skipped samples: %d (one contains the other: %d + no overlap: %d)" % (n_skip, n_skip_one_contain_another, n_skip_no_overlap))
    print("Total data size: %d. Number of control cases: %d (entity control: %d + overlap control: %d)" % (len(output_data), n_control, n_entity_control, n_overlap_control))
    print("Number of control codes: %d vs. number of control cases: %d (average control codes per case: %.4f)" % (total_num_control_code, n_control, avg_num_control_code))
return output_data
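
# A control case is written out as (hypothetical example):
#   dialog context \t code1 [CTRL] code2 \t control sentence \t response
# i.e. the extracted control codes, joined by " [CTRL] ", are inserted between
# the dialog context and the control sentence.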
def main(params):
# load model and tokenizer
model_saved_path = os.path.join(params.saved_folder, params.model_name+".pt")
ner_model = torch.load(model_saved_path)["model"]
ner_model.cuda()
ner_model.eval()
tokenizer = AutoTokenizer.from_pretrained(params.model_name)
# load data
datafolder = os.path.join(params.default_folder, params.infer_datafolder)
input_datapath = os.path.join(datafolder, params.infer_dataname)
output_datapath = os.path.join(datafolder, params.output_dataname)
# read input data
input_data = read_data(input_datapath)
# process data (generate entity control data)
output_data = generate_entity_control_data(tokenizer, ner_model, input_data)
# write output data
write_data(output_datapath, output_data)
if __name__ == "__main__":
params = get_params()
main(params)
\ No newline at end of file
INFO - 06/21/21 23:13:46 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:13:46 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:13:46 - 0:00:00 - The experiment will be stored in logs/conll2003/1
INFO - 06/21/21 23:25:29 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:25:29 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:25:29 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:25:29 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:29 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:29 - 0:00:01 - Attempting to acquire lock 22598820184656 on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
INFO - 06/21/21 23:25:29 - 0:00:01 - Lock 22598820184656 acquired on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
DEBUG - 06/21/21 23:25:29 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "GET /roberta-large/resolve/main/config.json HTTP/1.1" 200 482
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Attempting to release lock 22598820184656 on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
INFO - 06/21/21 23:25:30 - 0:00:01 - Lock 22598820184656 released on /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373.lock
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Attempting to acquire lock 22598820184656 on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO - 06/21/21 23:25:30 - 0:00:01 - Lock 22598820184656 acquired on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
DEBUG - 06/21/21 23:25:30 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:30 - 0:00:01 - https://huggingface.co:443 "GET /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 898823
DEBUG - 06/21/21 23:25:30 - 0:00:02 - Attempting to release lock 22598820184656 on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
INFO - 06/21/21 23:25:30 - 0:00:02 - Lock 22598820184656 released on /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
DEBUG - 06/21/21 23:25:30 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:02 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Attempting to acquire lock 22597850387840 on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO - 06/21/21 23:25:31 - 0:00:02 - Lock 22597850387840 acquired on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:02 - https://huggingface.co:443 "GET /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 456318
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Attempting to release lock 22597850387840 on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
INFO - 06/21/21 23:25:31 - 0:00:02 - Lock 22597850387840 released on /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
DEBUG - 06/21/21 23:25:31 - 0:00:02 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:31 - 0:00:03 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:25:31 - 0:00:03 - Attempting to acquire lock 22597850387840 on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:25:31 - 0:00:03 - Lock 22597850387840 acquired on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
DEBUG - 06/21/21 23:25:31 - 0:00:03 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:25:32 - 0:00:03 - https://huggingface.co:443 "GET /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 1355863
DEBUG - 06/21/21 23:25:32 - 0:00:03 - Attempting to release lock 22597850387840 on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:25:32 - 0:00:03 - Lock 22597850387840 released on /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock
INFO - 06/21/21 23:26:26 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:26:26 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:26:26 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:26:26 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:26 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:26 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:27 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:27 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:27 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:26:39 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Attempting to acquire lock 23082502829920 on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:26:39 - 0:00:13 - Lock 23082502829920 acquired on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
DEBUG - 06/21/21 23:26:39 - 0:00:13 - Starting new HTTPS connection (1): cdn-lfs.huggingface.co:443
DEBUG - 06/21/21 23:26:39 - 0:00:13 - https://cdn-lfs.huggingface.co:443 "GET /roberta-large/36a10a8b694fadf9bf4f9049d14e257e88be45313ae02d882af9e60f39b8b2e8 HTTP/1.1" 200 1425941629
DEBUG - 06/21/21 23:27:01 - 0:00:34 - Attempting to release lock 23082502829920 on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:27:01 - 0:00:34 - Lock 23082502829920 released on /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352.lock
INFO - 06/21/21 23:27:57 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:27:57 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:27:57 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:27:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:57 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:27:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:27:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:28:09 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:28:09 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:28:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:28:10 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:28:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:28:17 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:28:17 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:29:45 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:29:45 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:29:45 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:29:45 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:45 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:45 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:45 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:45 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:46 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:46 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:46 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:29:57 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:29:57 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:57 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:29:57 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:29:57 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:30:04 - 0:00:19 - Start NER training ...
INFO - 06/21/21 23:30:04 - 0:00:19 - ============== epoch 0 ==============
INFO - 06/21/21 23:31:17 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:31:17 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:31:17 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:31:17 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:17 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:17 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:17 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:17 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:18 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:18 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:18 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:31:29 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:31:29 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:30 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:31:30 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:31:30 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:31:37 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:31:37 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:33:58 - 0:02:42 - Finish training epoch 0. loss: 0.0696
INFO - 06/21/21 23:33:58 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:34:08 - 0:02:51 - Evaluate on Dev Set. F1: 95.5005.
INFO - 06/21/21 23:34:08 - 0:02:51 - Found better model!!
INFO - 06/21/21 23:48:39 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:48:39 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:48:39 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:48:39 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:39 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:39 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:40 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:40 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:40 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:48:51 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:48:51 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:51 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:48:51 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:48:51 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:49:00 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:49:00 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/21/21 23:51:22 - 0:02:43 - Finish training epoch 0. loss: 0.0696
INFO - 06/21/21 23:51:22 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:51:31 - 0:02:52 - Evaluate on Dev Set. F1: 95.5005.
INFO - 06/21/21 23:51:31 - 0:02:52 - Found better model!!
INFO - 06/21/21 23:51:33 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:51:33 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/21/21 23:53:55 - 0:05:16 - Finish training epoch 1. loss: 0.0234
INFO - 06/21/21 23:53:55 - 0:05:16 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/21/21 23:54:03 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:54:03 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:54:03 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:54:03 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:04 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:04 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:05 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:54:05 - 0:05:25 - Evaluate on Dev Set. F1: 96.9048.
INFO - 06/21/21 23:54:05 - 0:05:25 - Found better model!!
INFO - 06/21/21 23:54:06 - 0:05:27 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:54:06 - 0:05:27 - ============== epoch 2 ==============
INFO - 06/21/21 23:54:16 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:54:16 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:16 - 0:00:12 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:54:16 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:54:16 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:54:24 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:54:24 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:55:40 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:55:40 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 5e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 123456
INFO - 06/21/21 23:55:40 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:55:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:40 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:55:53 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:55:53 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:53 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:55:53 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:55:53 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:56:01 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:56:01 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/21/21 23:56:29 - 0:07:50 - Finish training epoch 2. loss: 0.0162
INFO - 06/21/21 23:56:29 - 0:07:50 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/21/21 23:56:38 - 0:07:59 - Evaluate on Dev Set. F1: 97.3381.
INFO - 06/21/21 23:56:38 - 0:07:59 - Found better model!!
INFO - 06/21/21 23:56:40 - 0:08:01 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:56:40 - 0:08:01 - ============== epoch 3 ==============
INFO - 06/21/21 23:56:47 - 0:02:43 - Finish training epoch 0. loss: 0.0580
INFO - 06/21/21 23:56:47 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:56:56 - 0:02:53 - Evaluate on Dev Set. F1: 96.7327.
INFO - 06/21/21 23:56:56 - 0:02:53 - Found better model!!
INFO - 06/21/21 23:56:58 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:56:58 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/21/21 23:58:25 - 0:02:45 - Finish training epoch 0. loss: 0.0544
INFO - 06/21/21 23:58:25 - 0:02:45 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/21/21 23:58:34 - 0:02:54 - Evaluate on Dev Set. F1: 96.8227.
INFO - 06/21/21 23:58:34 - 0:02:54 - Found better model!!
INFO - 06/21/21 23:58:36 - 0:02:56 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:58:36 - 0:02:56 - ============== epoch 1 ==============
INFO - 06/21/21 23:58:40 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:58:40 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 555
INFO - 06/21/21 23:58:40 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:58:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:40 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:40 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:41 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:41 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:58:57 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:58:57 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:58:57 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:58:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:57 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:57 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:58:58 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:58:58 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:59:02 - 0:10:23 - Finish training epoch 3. loss: 0.0136
INFO - 06/21/21 23:59:02 - 0:10:23 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/21/21 23:59:10 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:59:10 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:10 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:10 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:59:12 - 0:10:33 - Evaluate on Dev Set. F1: 96.0542.
INFO - 06/21/21 23:59:12 - 0:10:33 - No better model found (1/3)
INFO - 06/21/21 23:59:12 - 0:10:33 - ============== epoch 4 ==============
INFO - 06/21/21 23:59:18 - 0:00:20 - Start NER training ...
INFO - 06/21/21 23:59:18 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/21/21 23:59:21 - 0:05:18 - Finish training epoch 1. loss: 0.0190
INFO - 06/21/21 23:59:21 - 0:05:18 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/21/21 23:59:30 - 0:00:00 - ============ Initialized logger ============
INFO - 06/21/21 23:59:30 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 2e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/21/21 23:59:30 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/21/21 23:59:30 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:30 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:30 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
INFO - 06/21/21 23:59:31 - 0:05:27 - Evaluate on Dev Set. F1: 97.1510.
INFO - 06/21/21 23:59:31 - 0:05:27 - Found better model!!
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:31 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:31 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:31 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/21/21 23:59:32 - 0:05:29 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/21/21 23:59:32 - 0:05:29 - ============== epoch 2 ==============
INFO - 06/21/21 23:59:43 - 0:00:13 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/21/21 23:59:43 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:43 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/21/21 23:59:43 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/21/21 23:59:44 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/21/21 23:59:51 - 0:00:21 - Start NER training ...
INFO - 06/21/21 23:59:51 - 0:00:21 - ============== epoch 0 ==============
INFO - 06/22/21 00:01:00 - 0:05:20 - Finish training epoch 1. loss: 0.0229
INFO - 06/22/21 00:01:00 - 0:05:20 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:01:10 - 0:05:30 - Evaluate on Dev Set. F1: 97.0174.
INFO - 06/22/21 00:01:10 - 0:05:30 - Found better model!!
INFO - 06/22/21 00:01:12 - 0:05:31 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:01:12 - 0:05:31 - ============== epoch 2 ==============
INFO - 06/22/21 00:01:35 - 0:12:56 - Finish training epoch 4. loss: 0.0170
INFO - 06/22/21 00:01:35 - 0:12:56 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:01:40 - 0:02:43 - Finish training epoch 0. loss: 0.0544
INFO - 06/22/21 00:01:40 - 0:02:43 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:01:45 - 0:13:05 - Evaluate on Dev Set. F1: 97.1884.
INFO - 06/22/21 00:01:45 - 0:13:05 - No better model found (2/3)
INFO - 06/22/21 00:01:45 - 0:13:05 - ============== epoch 5 ==============
INFO - 06/22/21 00:01:50 - 0:02:53 - Evaluate on Dev Set. F1: 96.2938.
INFO - 06/22/21 00:01:50 - 0:02:53 - Found better model!!
INFO - 06/22/21 00:01:52 - 0:02:55 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:01:52 - 0:02:55 - ============== epoch 1 ==============
INFO - 06/22/21 00:01:55 - 0:07:51 - Finish training epoch 2. loss: 0.0200
INFO - 06/22/21 00:01:55 - 0:07:51 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:02:04 - 0:08:01 - Evaluate on Dev Set. F1: 96.9804.
INFO - 06/22/21 00:02:04 - 0:08:01 - No better model found (1/3)
INFO - 06/22/21 00:02:04 - 0:08:01 - ============== epoch 3 ==============
INFO - 06/22/21 00:02:13 - 0:02:42 - Finish training epoch 0. loss: 0.0547
INFO - 06/22/21 00:02:13 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:02:22 - 0:02:52 - Evaluate on Dev Set. F1: 97.0400.
INFO - 06/22/21 00:02:22 - 0:02:52 - Found better model!!
INFO - 06/22/21 00:02:24 - 0:02:54 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:02:24 - 0:02:54 - ============== epoch 1 ==============
INFO - 06/22/21 00:03:35 - 0:07:55 - Finish training epoch 2. loss: 0.0173
INFO - 06/22/21 00:03:35 - 0:07:55 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:03:45 - 0:08:04 - Evaluate on Dev Set. F1: 97.3191.
INFO - 06/22/21 00:03:45 - 0:08:04 - Found better model!!
INFO - 06/22/21 00:03:46 - 0:08:06 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:03:46 - 0:08:06 - ============== epoch 3 ==============
INFO - 06/22/21 00:04:07 - 0:15:28 - Finish training epoch 5. loss: 0.0083
INFO - 06/22/21 00:04:07 - 0:15:28 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:04:14 - 0:05:17 - Finish training epoch 1. loss: 0.0182
INFO - 06/22/21 00:04:14 - 0:05:17 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:04:17 - 0:15:37 - Evaluate on Dev Set. F1: 97.3169.
INFO - 06/22/21 00:04:17 - 0:15:37 - No better model found (3/3)
INFO - 06/22/21 00:04:17 - 0:15:37 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:04:24 - 0:05:27 - Evaluate on Dev Set. F1: 97.6314.
INFO - 06/22/21 00:04:24 - 0:05:27 - Found better model!!
INFO - 06/22/21 00:04:26 - 0:15:46 - Evaluate on Test Set. F1: 95.6012.
INFO - 06/22/21 00:04:26 - 0:05:29 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:26 - 0:05:29 - ============== epoch 2 ==============
INFO - 06/22/21 00:04:27 - 0:10:24 - Finish training epoch 3. loss: 0.0157
INFO - 06/22/21 00:04:27 - 0:10:24 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:04:37 - 0:10:33 - Evaluate on Dev Set. F1: 97.6654.
INFO - 06/22/21 00:04:37 - 0:10:33 - Found better model!!
INFO - 06/22/21 00:04:39 - 0:10:35 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:39 - 0:10:35 - ============== epoch 4 ==============
INFO - 06/22/21 00:04:45 - 0:05:15 - Finish training epoch 1. loss: 0.0177
INFO - 06/22/21 00:04:45 - 0:05:15 - ============== Evaluate epoch 1 on Dev Set ==============
INFO - 06/22/21 00:04:55 - 0:05:25 - Evaluate on Dev Set. F1: 97.6093.
INFO - 06/22/21 00:04:55 - 0:05:25 - Found better model!!
INFO - 06/22/21 00:04:56 - 0:05:26 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:04:56 - 0:05:26 - ============== epoch 2 ==============
INFO - 06/22/21 00:06:10 - 0:10:30 - Finish training epoch 3. loss: 0.0439
INFO - 06/22/21 00:06:10 - 0:10:30 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:06:20 - 0:10:40 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:06:20 - 0:10:40 - No better model found (1/3)
INFO - 06/22/21 00:06:20 - 0:10:40 - ============== epoch 4 ==============
INFO - 06/22/21 00:06:47 - 0:07:50 - Finish training epoch 2. loss: 0.0156
INFO - 06/22/21 00:06:47 - 0:07:50 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:06:57 - 0:07:59 - Evaluate on Dev Set. F1: 97.5384.
INFO - 06/22/21 00:06:57 - 0:07:59 - No better model found (1/3)
INFO - 06/22/21 00:06:57 - 0:07:59 - ============== epoch 3 ==============
INFO - 06/22/21 00:07:02 - 0:12:59 - Finish training epoch 4. loss: 0.0127
INFO - 06/22/21 00:07:02 - 0:12:59 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:07:12 - 0:13:08 - Evaluate on Dev Set. F1: 97.4583.
INFO - 06/22/21 00:07:12 - 0:13:08 - No better model found (1/3)
INFO - 06/22/21 00:07:12 - 0:13:08 - ============== epoch 5 ==============
INFO - 06/22/21 00:07:17 - 0:07:47 - Finish training epoch 2. loss: 0.0115
INFO - 06/22/21 00:07:17 - 0:07:47 - ============== Evaluate epoch 2 on Dev Set ==============
INFO - 06/22/21 00:07:26 - 0:07:56 - Evaluate on Dev Set. F1: 97.2615.
INFO - 06/22/21 00:07:26 - 0:07:56 - No better model found (1/3)
INFO - 06/22/21 00:07:26 - 0:07:56 - ============== epoch 3 ==============
INFO - 06/22/21 00:08:43 - 0:13:03 - Finish training epoch 4. loss: 0.5637
INFO - 06/22/21 00:08:43 - 0:13:03 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:08:53 - 0:13:12 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:08:53 - 0:13:12 - No better model found (2/3)
INFO - 06/22/21 00:08:53 - 0:13:12 - ============== epoch 5 ==============
INFO - 06/22/21 00:09:18 - 0:10:21 - Finish training epoch 3. loss: 0.0110
INFO - 06/22/21 00:09:18 - 0:10:21 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:09:28 - 0:10:31 - Evaluate on Dev Set. F1: 97.2738.
INFO - 06/22/21 00:09:28 - 0:10:31 - No better model found (2/3)
INFO - 06/22/21 00:09:28 - 0:10:31 - ============== epoch 4 ==============
INFO - 06/22/21 00:09:35 - 0:15:31 - Finish training epoch 5. loss: 0.0132
INFO - 06/22/21 00:09:35 - 0:15:31 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:09:45 - 0:15:41 - Evaluate on Dev Set. F1: 97.4630.
INFO - 06/22/21 00:09:45 - 0:15:41 - No better model found (2/3)
INFO - 06/22/21 00:09:45 - 0:15:41 - ============== epoch 6 ==============
INFO - 06/22/21 00:09:47 - 0:10:17 - Finish training epoch 3. loss: 0.0101
INFO - 06/22/21 00:09:47 - 0:10:17 - ============== Evaluate epoch 3 on Dev Set ==============
INFO - 06/22/21 00:09:57 - 0:10:27 - Evaluate on Dev Set. F1: 97.5034.
INFO - 06/22/21 00:09:57 - 0:10:27 - No better model found (2/3)
INFO - 06/22/21 00:09:57 - 0:10:27 - ============== epoch 4 ==============
INFO - 06/22/21 00:11:16 - 0:15:36 - Finish training epoch 5. loss: 0.5620
INFO - 06/22/21 00:11:16 - 0:15:36 - ============== Evaluate epoch 5 on Dev Set ==============
INFO - 06/22/21 00:11:26 - 0:15:45 - Evaluate on Dev Set. F1: 0.0000.
INFO - 06/22/21 00:11:26 - 0:15:45 - No better model found (3/3)
INFO - 06/22/21 00:11:26 - 0:15:45 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:11:35 - 0:15:54 - Evaluate on Test Set. F1: 0.0000.
INFO - 06/22/21 00:11:50 - 0:12:53 - Finish training epoch 4. loss: 0.0137
INFO - 06/22/21 00:11:50 - 0:12:53 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:12:00 - 0:13:02 - Evaluate on Dev Set. F1: 97.4501.
INFO - 06/22/21 00:12:00 - 0:13:02 - No better model found (3/3)
INFO - 06/22/21 00:12:00 - 0:13:02 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:08 - 0:18:04 - Finish training epoch 6. loss: 0.0129
INFO - 06/22/21 00:12:08 - 0:18:04 - ============== Evaluate epoch 6 on Dev Set ==============
INFO - 06/22/21 00:12:09 - 0:13:11 - Evaluate on Test Set. F1: 95.4761.
INFO - 06/22/21 00:12:17 - 0:18:14 - Evaluate on Dev Set. F1: 97.2311.
INFO - 06/22/21 00:12:17 - 0:18:14 - No better model found (3/3)
INFO - 06/22/21 00:12:17 - 0:18:14 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:19 - 0:12:48 - Finish training epoch 4. loss: 0.0074
INFO - 06/22/21 00:12:19 - 0:12:48 - ============== Evaluate epoch 4 on Dev Set ==============
INFO - 06/22/21 00:12:26 - 0:18:23 - Evaluate on Test Set. F1: 95.2934.
INFO - 06/22/21 00:12:28 - 0:12:58 - Evaluate on Dev Set. F1: 97.0406.
INFO - 06/22/21 00:12:28 - 0:12:58 - No better model found (3/3)
INFO - 06/22/21 00:12:28 - 0:12:58 - ============== Evaluate on Test Set ==============
INFO - 06/22/21 00:12:37 - 0:13:07 - Evaluate on Test Set. F1: 95.3264.
INFO - 06/22/21 00:16:11 - 0:00:00 - ============ Initialized logger ============
INFO - 06/22/21 00:16:11 - 0:00:00 - batch_size: 32
data_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003
dropout: 0.1
dump_path: logs/conll2003/1
early_stop: 3
epoch: 300
exp_id: 1
exp_name: conll2003
hidden_dim: 1024
logger_filename: train.log
lr: 3e-05
model_name: roberta-large
num_tag: 3
saved_folder: /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model
seed: 111
INFO - 06/22/21 00:16:11 - 0:00:00 - The experiment will be stored in logs/conll2003/1
DEBUG - 06/22/21 00:16:11 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:00 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:00 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:12 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/merges.txt HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:12 - 0:00:01 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:13 - 0:00:01 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/tokenizer.json HTTP/1.1" 200 0
INFO - 06/22/21 00:16:24 - 0:00:12 - conll2003 dataset: train size: 14040; dev size 3249; test size: 3452
DEBUG - 06/22/21 00:16:24 - 0:00:12 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:24 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/config.json HTTP/1.1" 200 0
DEBUG - 06/22/21 00:16:24 - 0:00:13 - Starting new HTTPS connection (1): huggingface.co:443
DEBUG - 06/22/21 00:16:24 - 0:00:13 - https://huggingface.co:443 "HEAD /roberta-large/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
INFO - 06/22/21 00:16:31 - 0:00:20 - Start NER training ...
INFO - 06/22/21 00:16:31 - 0:00:20 - ============== epoch 0 ==============
INFO - 06/22/21 00:18:53 - 0:02:42 - Finish training epoch 0. loss: 0.0544
INFO - 06/22/21 00:18:53 - 0:02:42 - ============== Evaluate epoch 0 on Dev Set ==============
INFO - 06/22/21 00:19:03 - 0:02:51 - Evaluate on Dev Set. F1: 96.2938.
INFO - 06/22/21 00:19:03 - 0:02:51 - Found better model!!
INFO - 06/22/21 00:19:05 - 0:02:53 - Best model has been saved to /gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt
INFO - 06/22/21 00:19:05 - 0:02:53 - ============== epoch 1 ==============
import torch
import numpy as np
from transformers import AutoTokenizer
from tabulate import tabulate
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
ner_model = torch.load("/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model/roberta-large.pt")["model"]
ner_model.cuda()
ner_model.eval()
label_set = ["O", "B", "I"]
for step in range(100):
print("===========================================================================")
input_sent = input(">> Input:")
tokens = input_sent.split()
token_ids, first_tok_masks = [tokenizer.cls_token_id], [0]
for token in tokens:
subs_ = tokenizer.tokenize(token)
assert len(subs_) > 0
token_ids.extend(tokenizer.convert_tokens_to_ids(subs_))
first_tok_masks.extend([1] + [0] * (len(subs_) - 1))
token_ids.append(tokenizer.sep_token_id)
first_tok_masks.append(0)
token_ids = torch.LongTensor([token_ids]).cuda()
predictions = ner_model(token_ids) # (1, seq_len, 3)
predictions = predictions[0].data.cpu().numpy() # (seq_len, 3)
pred_ids = list(np.argmax(predictions, axis=1))
assert len(pred_ids) == len(first_tok_masks)
preds_for_each_word = []
for pred, mask in zip(pred_ids, first_tok_masks):
if mask == 1:
preds_for_each_word.append(label_set[pred])
assert len(preds_for_each_word) == len(tokens)
table = [tokens, preds_for_each_word]
print(tabulate(table))
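
# Example session (hypothetical input and output; tabulate prints the tokens and
# their word-level BIO tags as two aligned rows, roughly like this):
#   >> Input:I met Alan Turing in Cambridge
#   -  ---  ----  ------  --  ---------
#   I  met  Alan  Turing  in  Cambridge
#   O  O    B     I       O   B
#   -  ---  ----  ------  --  ---------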
@@ -23,6 +23,12 @@ def get_params():
parser.add_argument("--data_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/conll2003", help="NER data folder")
parser.add_argument("--saved_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl/checkpoints/ner_model", help="NER data folder")
parser.add_argument("--default_folder", type=str, default="/gpfs/fs1/projects/gpu_adlr/datasets/zihanl")
parser.add_argument("--infer_datafolder", type=str, default="dialog_datasets/wizard_of_wikipedia/processed")
parser.add_argument("--infer_dataname", type=str, default="train.txt")
parser.add_argument("--output_dataname", type=str, default="train_entity_based_control.txt")
params = parser.parse_args()
return params
@@ -7,7 +7,6 @@ from src.trainer import NERTrainer
import torch
import numpy as np
from tqdm import tqdm
import random
def random_seed(seed):