Commit 688448db authored by silencealiang
Browse files

更新代码

parent a02a5490
Pipeline #2503 passed with stage
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 2000
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -52,4 +52,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: regular
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 2000
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.57679, "5": 12.5818, "10": 12.47354, "15": 11.80609, "20": 11.49702, "25": 10.98467}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 521041248.0, "5": 520997440.0, "10": 521179808.0, "15": 521592416.0, "20": 521133664.0, "25": 523544832.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 24510808064.0, "5": 24510808064.0, "10": 24510808064.0, "15": 24510808064.0, "20": 24510808064.0, "25": 24510808064.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 52700401664.0, "5": 60489064448.0, "10": 60489064448.0, "15": 60489064448.0, "20": 60489064448.0, "25": 60489064448.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 2.87864, "15": "nan", "20": 2.89414, "25": "nan"}}}
\ No newline at end of file
# Test configuration with three top-level sections: ENV_VARS, MODEL_ARGS and
# TEST_TYPE. Nested entries are indented two spaces so that they parse as
# children of their section instead of as unrelated top-level keys.
ENV_VARS:
  CUDA_DEVICE_MAX_CONNECTIONS: 1
  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
  NCCL_ALGO: Tree
  # The leading colon is part of the value.
  CUBLAS_WORKSPACE_CONFIG: :4096:8
  NVTE_FWD_LAYERNORM_SM_MARGIN: 16
  NVTE_BWD_LAYERNORM_SM_MARGIN: 16

# Keys are spelled as `--flag` names; presumably they are forwarded as
# command-line arguments to the training script (confirm against the runner).
MODEL_ARGS:
  # Model shape: 32 layers, hidden size 4096, 32 attention heads,
  # grouped-query attention with 8 query groups.
  --num-layers: 32
  --hidden-size: 4096
  --num-attention-heads: 32
  --group-query-attention: true
  --num-query-groups: 8
  --untie-embeddings-and-output-weights: true
  # Logging switches and TensorBoard output location.
  --log-throughput: true
  --log-params-norm: true
  --log-num-zeros-in-grad: true
  --log-validation-ppl-to-tensorboard: true
  --log-timers-to-tensorboard: true
  --tensorboard-dir: ${TENSORBOARD_PATH}
  # Batch and sequence sizes; 25 training iterations.
  --micro-batch-size: 1
  --global-batch-size: 8
  --seq-length: 8192
  --max-position-embeddings: 8192
  --train-iters: 25
  --timing-log-level: 0
  --lr-decay-iters: 320000
  # Checkpoints are saved to and loaded from separately configured paths.
  --save: ${CHECKPOINT_SAVE_PATH}
  --load: ${CHECKPOINT_LOAD_PATH}
  # Data: mock data with NullTokenizer; no dataset paths are set here.
  --tokenizer-type: NullTokenizer
  --vocab-size: 131072
  --mock-data: true
  --split: 949,50,1
  --distributed-backend: nccl
  # Optimizer and learning-rate schedule.
  --lr: 0.00015
  --lr-decay-style: cosine
  --min-lr: 1.0e-5
  --weight-decay: 1e-2
  --clip-grad: 1.0
  --lr-warmup-fraction: .01
  # Logging, checkpoint and evaluation cadence.
  --log-interval: 2
  --save-interval: 10000
  --eval-interval: 1000
  --eval-iters: 5
  --transformer-impl: transformer_engine
  # Parallelism: tensor-parallel size 1, pipeline-parallel size 4,
  # one layer per virtual pipeline stage.
  --tensor-model-parallel-size: 1
  --pipeline-model-parallel-size: 4
  --num-layers-per-virtual-pipeline-stage: 1
  --use-distributed-optimizer: true
  --overlap-grad-reduce: true
  --overlap-param-gather: true
  --deterministic-mode: true
  --no-gradient-accumulation-fusion: true
  --use-mcore-models: true
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --log-memory-to-tensorboard: true

TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 12.61262, "5": 12.60238, "10": 12.49879, "15": 11.82067, "20": 11.50566, "25": 10.99243}}, "num-zeros": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 523040896.0, "5": 523012096.0, "10": 523190944.0, "15": 523625088.0, "20": 523224032.0, "25": 525635776.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 20634214400.0, "5": 20634214400.0, "10": 20634214400.0, "15": 20634214400.0, "20": 20634214400.0, "25": 20634214400.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": 51333926912.0, "5": 58188226560.0, "10": 58188226560.0, "15": 58188226560.0, "20": 58188226560.0, "25": 58188226560.0}}, "iteration-time": {"start_step": 1, "end_step": 25, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": 2.72059, "15": "nan", "20": 2.72404, "25": "nan"}}}
\ No newline at end of file
# Test configuration with three top-level sections: ENV_VARS, MODEL_ARGS and
# TEST_TYPE. Nested entries are indented two spaces so that they parse as
# children of their section instead of as unrelated top-level keys.
ENV_VARS:
  CUDA_DEVICE_MAX_CONNECTIONS: 1
  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
  NCCL_ALGO: Tree
  # The leading colon is part of the value.
  CUBLAS_WORKSPACE_CONFIG: :4096:8
  NVTE_FWD_LAYERNORM_SM_MARGIN: 16
  NVTE_BWD_LAYERNORM_SM_MARGIN: 16

# Keys are spelled as `--flag` names; presumably they are forwarded as
# command-line arguments to the training script (confirm against the runner).
MODEL_ARGS:
  # Model shape: 32 layers, hidden size 4096, 32 attention heads,
  # grouped-query attention with 8 query groups.
  --num-layers: 32
  --hidden-size: 4096
  --num-attention-heads: 32
  --group-query-attention: true
  --num-query-groups: 8
  --untie-embeddings-and-output-weights: true
  # Logging switches and TensorBoard output location.
  --log-throughput: true
  --log-params-norm: true
  --log-num-zeros-in-grad: true
  --log-validation-ppl-to-tensorboard: true
  --log-timers-to-tensorboard: true
  --tensorboard-dir: ${TENSORBOARD_PATH}
  # Batch and sequence sizes; 25 training iterations.
  --micro-batch-size: 2
  --global-batch-size: 8
  --seq-length: 8192
  --max-position-embeddings: 8192
  --train-iters: 25
  --timing-log-level: 0
  --lr-decay-iters: 320000
  # Checkpoints are saved to and loaded from separately configured paths.
  --save: ${CHECKPOINT_SAVE_PATH}
  --load: ${CHECKPOINT_LOAD_PATH}
  # Data: mock data with NullTokenizer; no dataset paths are set here.
  --tokenizer-type: NullTokenizer
  --vocab-size: 131072
  --mock-data: true
  --split: 949,50,1
  --distributed-backend: nccl
  # Optimizer and learning-rate schedule.
  --lr: 0.00015
  --lr-decay-style: cosine
  --min-lr: 1.0e-5
  --weight-decay: 1e-2
  --clip-grad: 1.0
  --lr-warmup-fraction: .01
  # Logging, checkpoint and evaluation cadence.
  --log-interval: 2
  --save-interval: 10000
  --eval-interval: 1000
  --eval-iters: 5
  --transformer-impl: transformer_engine
  # Parallelism: tensor-parallel size 4, pipeline-parallel size 1.
  --tensor-model-parallel-size: 4
  --pipeline-model-parallel-size: 1
  --use-distributed-optimizer: true
  --overlap-grad-reduce: true
  --overlap-param-gather: true
  --deterministic-mode: true
  --no-gradient-accumulation-fusion: true
  --use-mcore-models: true
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --log-memory-to-tensorboard: true

TEST_TYPE: regular
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.8401,
10.87259,
10.85024,
10.79646,
10.68156,
10.60618,
10.12768,
10.22185,
10.13788,
9.82309
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1698.0,
1855.0,
1949.0,
1968.0,
1881.0,
1783.0,
1653.0,
2037.0,
2313.0,
2300.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
5.37706,
0.09618,
0.09432,
0.09666,
0.09442,
0.09619,
0.09453,
0.0975,
0.09517,
0.09727
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8401, "5": 10.84034, "10": 10.8134, "15": 10.80277, "20": 10.70494, "25": 10.53848, "30": 10.3552, "35": 10.27145, "40": 10.08048, "45": 9.82288, "50": 9.90119}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1698.0, "5": 1900.0, "10": 1421.0, "15": 1946.0, "20": 1765.0, "25": 1726.0, "30": 2022.0, "35": 1962.0, "40": 2274.0, "45": 2172.0, "50": 2369.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 552128000.0, "5": 552128000.0, "10": 552128000.0, "15": 552128000.0, "20": 552128000.0, "25": 552128000.0, "30": 552128000.0, "35": 552128000.0, "40": 552128000.0, "45": 552128000.0, "50": 552128000.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4576452608.0, "5": 4673069056.0, "10": 4673069056.0, "15": 4673069056.0, "20": 4673069056.0, "25": 4673069056.0, "30": 4673069056.0, "35": 4673069056.0, "40": 4673069056.0, "45": 4673069056.0, "50": 4673069056.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.82685, "5": 0.09636, "10": 0.09453, "15": 0.0951, "20": 0.09324, "25": 0.09311, "30": 0.09279, "35": 0.0934, "40": 0.09774, "45": 0.09122, "50": 0.08864}}}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8401, "5": 10.84032, "10": 10.8134, "15": 10.80276, "20": 10.70493, "25": 10.53847, "30": 10.35518, "35": 10.27143, "40": 10.08046, "45": 9.82288, "50": 9.90114}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1670.0, "5": 1970.0, "10": 1397.0, "15": 1886.0, "20": 1785.0, "25": 1695.0, "30": 2086.0, "35": 1976.0, "40": 2349.0, "45": 2240.0, "50": 2338.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 552238592.0, "5": 552238592.0, "10": 552238592.0, "15": 552238592.0, "20": 552238592.0, "25": 552238592.0, "30": 552238592.0, "35": 552238592.0, "40": 552238592.0, "45": 552238592.0, "50": 552238592.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4576563200.0, "5": 4673179648.0, "10": 4673179648.0, "15": 4673179648.0, "20": 4673179648.0, "25": 4673179648.0, "30": 4673179648.0, "35": 4673179648.0, "40": 4673179648.0, "45": 4673179648.0, "50": 4673179648.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.79296, "5": 0.08936, "10": 0.08747, "15": 0.09067, "20": 0.08679, "25": 0.08868, "30": 0.08685, "35": 0.08887, "40": 0.08682, "45": 0.08792, "50": 0.08604}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -49,4 +49,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--attention-backend: unfused
--log-memory-to-tensorboard: true
TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84023, "5": 10.84074, "10": 10.81392, "15": 10.80238, "20": 10.70474, "25": 10.53876, "30": 10.35537, "35": 10.2716, "40": 10.08036, "45": 9.8231, "50": 9.90117, "55": 9.86414, "60": 9.48062, "65": 8.93763, "70": 9.7102, "75": 9.40888, "80": 9.39066, "85": 9.59766, "90": 9.80366, "95": 9.50574, "100": 9.38807}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1659.0, "5": 1886.0, "10": 1388.0, "15": 1827.0, "20": 1686.0, "25": 1696.0, "30": 1877.0, "35": 1967.0, "40": 2300.0, "45": 2176.0, "50": 2249.0, "55": 2468.0, "60": 2471.0, "65": 2688.0, "70": 3271.0, "75": 2633.0, "80": 3351.0, "85": 3332.0, "90": 2984.0, "95": 3459.0, "100": 3555.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 552128512.0, "5": 552128512.0, "10": 552128512.0, "15": 552128512.0, "20": 552128512.0, "25": 552128512.0, "30": 552128512.0, "35": 552128512.0, "40": 552128512.0, "45": 552128512.0, "50": 552128512.0, "55": 552128512.0, "60": 552128512.0, "65": 552128512.0, "70": 552128512.0, "75": 552128512.0, "80": 552128512.0, "85": 552128512.0, "90": 552128512.0, "95": 552128512.0, "100": 552128512.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2615097856.0, "5": 2711714304.0, "10": 2711714304.0, "15": 2711714304.0, "20": 2711714304.0, "25": 2711714304.0, "30": 2711714304.0, "35": 2711714304.0, "40": 2711714304.0, "45": 2711714304.0, "50": 2711714304.0, "55": 2711714304.0, "60": 2711714304.0, "65": 2711714304.0, "70": 2711714304.0, "75": 2711714304.0, "80": 2711714304.0, "85": 2711714304.0, "90": 2711714304.0, "95": 2711714304.0, "100": 2711714304.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 5.51223, "5": 0.08691, "10": 0.085, "15": 0.0859, "20": 0.08404, "25": 0.08464, "30": 0.08355, "35": 0.08189, "40": 0.08107, "45": 
0.08112, "50": 0.08147, "55": 0.08204, "60": 0.08108, "65": 0.08132, "70": 0.0801, "75": 0.0805, "80": 0.08087, "85": 0.08073, "90": 0.08118, "95": 0.0798, "100": 0.0816}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.8401, "5": 10.84032, "10": 10.8134, "15": 10.80276, "20": 10.70493, "25": 10.53847, "30": 10.35518, "35": 10.27143, "40": 10.08046, "45": 9.82288, "50": 9.90114, "55": 9.86426, "60": 9.48028, "65": 8.93744, "70": 9.71023, "75": 9.40882, "80": 9.39078, "85": 9.59744, "90": 9.8039, "95": 9.50564, "100": 9.38814}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1670.0, "5": 1970.0, "10": 1397.0, "15": 1886.0, "20": 1785.0, "25": 1695.0, "30": 2086.0, "35": 1976.0, "40": 2349.0, "45": 2240.0, "50": 2338.0, "55": 2364.0, "60": 2474.0, "65": 2762.0, "70": 3207.0, "75": 2625.0, "80": 3502.0, "85": 3356.0, "90": 3142.0, "95": 3385.0, "100": 3449.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 552238592.0, "5": 552238592.0, "10": 552238592.0, "15": 552238592.0, "20": 552238592.0, "25": 552238592.0, "30": 552238592.0, "35": 552238592.0, "40": 552238592.0, "45": 552238592.0, "50": 552238592.0, "55": 552238592.0, "60": 552238592.0, "65": 552238592.0, "70": 552238592.0, "75": 552238592.0, "80": 552238592.0, "85": 552238592.0, "90": 552238592.0, "95": 552238592.0, "100": 552238592.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4576563200.0, "5": 4673179648.0, "10": 4673179648.0, "15": 4673179648.0, "20": 4673179648.0, "25": 4673179648.0, "30": 4673179648.0, "35": 4673179648.0, "40": 4673179648.0, "45": 4673179648.0, "50": 4673179648.0, "55": 4673179648.0, "60": 4673179648.0, "65": 4673179648.0, "70": 4673179648.0, "75": 4673179648.0, "80": 4673179648.0, "85": 4673179648.0, "90": 4673179648.0, "95": 4673179648.0, "100": 4673179648.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.8249, "5": 0.09011, "10": 0.09012, "15": 0.09032, "20": 0.08958, "25": 0.0911, "30": 0.0899, "35": 0.09078, "40": 0.08965, "45": 
0.09255, "50": 0.0906, "55": 0.08977, "60": 0.0869, "65": 0.08684, "70": 0.08704, "75": 0.08628, "80": 0.08639, "85": 0.08662, "90": 0.08701, "95": 0.08613, "100": 0.0859}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -50,4 +50,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84023, "5": 10.84074, "10": 10.81392, "15": 10.80238, "20": 10.70474, "25": 10.53876, "30": 10.35537, "35": 10.2716, "40": 10.08036, "45": 9.8231, "50": 9.90117, "55": 9.86414, "60": 9.48062, "65": 8.93763, "70": 9.7102, "75": 9.40888, "80": 9.39066, "85": 9.59766, "90": 9.80366, "95": 9.50574, "100": 9.38807}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1659.0, "5": 1886.0, "10": 1388.0, "15": 1827.0, "20": 1686.0, "25": 1696.0, "30": 1877.0, "35": 1967.0, "40": 2300.0, "45": 2176.0, "50": 2249.0, "55": 2468.0, "60": 2471.0, "65": 2688.0, "70": 3271.0, "75": 2633.0, "80": 3351.0, "85": 3332.0, "90": 2984.0, "95": 3459.0, "100": 3555.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 552128512.0, "5": 552128512.0, "10": 552128512.0, "15": 552128512.0, "20": 552128512.0, "25": 552128512.0, "30": 552128512.0, "35": 552128512.0, "40": 552128512.0, "45": 552128512.0, "50": 552128512.0, "55": 552128512.0, "60": 552128512.0, "65": 552128512.0, "70": 552128512.0, "75": 552128512.0, "80": 552128512.0, "85": 552128512.0, "90": 552128512.0, "95": 552128512.0, "100": 552128512.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2615097856.0, "5": 2711714304.0, "10": 2711714304.0, "15": 2711714304.0, "20": 2711714304.0, "25": 2711714304.0, "30": 2711714304.0, "35": 2711714304.0, "40": 2711714304.0, "45": 2711714304.0, "50": 2711714304.0, "55": 2711714304.0, "60": 2711714304.0, "65": 2711714304.0, "70": 2711714304.0, "75": 2711714304.0, "80": 2711714304.0, "85": 2711714304.0, "90": 2711714304.0, "95": 2711714304.0, "100": 2711714304.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 5.45269, "5": 0.08237, "10": 0.08305, "15": 0.08328, "20": 0.08344, "25": 0.08281, "30": 0.08195, "35": 0.08111, "40": 0.08016, "45": 
0.07836, "50": 0.07936, "55": 0.07906, "60": 0.08023, "65": 0.07916, "70": 0.08026, "75": 0.07938, "80": 0.07948, "85": 0.07874, "90": 0.07885, "95": 0.0779, "100": 0.08116}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.8401, "5": 10.84032, "10": 10.8134, "15": 10.80276, "20": 10.70493, "25": 10.53847, "30": 10.35518, "35": 10.27143, "40": 10.08046, "45": 9.82288, "50": 9.90114, "55": 9.86426, "60": 9.48028, "65": 8.93744, "70": 9.71023, "75": 9.40882, "80": 9.39078, "85": 9.59744, "90": 9.8039, "95": 9.50564, "100": 9.38814}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1670.0, "5": 1970.0, "10": 1397.0, "15": 1886.0, "20": 1785.0, "25": 1695.0, "30": 2086.0, "35": 1976.0, "40": 2349.0, "45": 2240.0, "50": 2338.0, "55": 2364.0, "60": 2474.0, "65": 2762.0, "70": 3207.0, "75": 2625.0, "80": 3502.0, "85": 3356.0, "90": 3142.0, "95": 3385.0, "100": 3449.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 552238592.0, "5": 552238592.0, "10": 552238592.0, "15": 552238592.0, "20": 552238592.0, "25": 552238592.0, "30": 552238592.0, "35": 552238592.0, "40": 552238592.0, "45": 552238592.0, "50": 552238592.0, "55": 552238592.0, "60": 552238592.0, "65": 552238592.0, "70": 552238592.0, "75": 552238592.0, "80": 552238592.0, "85": 552238592.0, "90": 552238592.0, "95": 552238592.0, "100": 552238592.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4576563200.0, "5": 4673179648.0, "10": 4673179648.0, "15": 4673179648.0, "20": 4673179648.0, "25": 4673179648.0, "30": 4673179648.0, "35": 4673179648.0, "40": 4673179648.0, "45": 4673179648.0, "50": 4673179648.0, "55": 4673179648.0, "60": 4673179648.0, "65": 4673179648.0, "70": 4673179648.0, "75": 4673179648.0, "80": 4673179648.0, "85": 4673179648.0, "90": 4673179648.0, "95": 4673179648.0, "100": 4673179648.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.41235, "5": 0.08775, "10": 0.08849, "15": 0.08737, "20": 0.08713, "25": 0.08696, "30": 0.08757, "35": 0.08803, "40": 0.08782, 
"45": 0.08739, "50": 0.08653, "55": 0.08734, "60": 0.08891, "65": 0.1011, "70": 0.08925, "75": 0.08826, "80": 0.08863, "85": 0.08797, "90": 0.08896, "95": 0.08827, "100": 0.08947}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84023, "5": 10.84074, "10": 10.81392, "15": 10.80242, "20": 10.70474, "25": 10.53872, "30": 10.35534, "35": 10.27156, "40": 10.08035, "45": 9.82307, "50": 9.90117, "55": 9.86415, "60": 9.48061, "65": 8.9376, "70": 9.71013, "75": 9.40885, "80": 9.39066, "85": 9.59761, "90": 9.80368, "95": 9.50575, "100": 9.38809}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1659.0, "5": 1886.0, "10": 1413.0, "15": 1912.0, "20": 1710.0, "25": 1666.0, "30": 2033.0, "35": 2032.0, "40": 2271.0, "45": 2171.0, "50": 2321.0, "55": 2330.0, "60": 2399.0, "65": 2573.0, "70": 3346.0, "75": 2588.0, "80": 3342.0, "85": 3296.0, "90": 3157.0, "95": 3269.0, "100": 3445.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1232487936.0, "5": 1232487936.0, "10": 1232487936.0, "15": 1232487936.0, "20": 1232487936.0, "25": 1232487936.0, "30": 1232487936.0, "35": 1232487936.0, "40": 1232487936.0, "45": 1232487936.0, "50": 1232487936.0, "55": 1232487936.0, "60": 1232487936.0, "65": 1232487936.0, "70": 1232487936.0, "75": 1232487936.0, "80": 1232487936.0, "85": 1232487936.0, "90": 1232487936.0, "95": 1232487936.0, "100": 1232487936.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1984492544.0, "5": 2534070272.0, "10": 2534070272.0, "15": 2534070272.0, "20": 2534070272.0, "25": 2534070272.0, "30": 2534070272.0, "35": 2534070272.0, "40": 2534070272.0, "45": 2534070272.0, "50": 2534070272.0, "55": 2534070272.0, "60": 2534070272.0, "65": 2534070272.0, "70": 2534070272.0, "75": 2534070272.0, "80": 2534070272.0, "85": 2534070272.0, "90": 2534070272.0, "95": 2534070272.0, "100": 2534070272.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.57779, "5": 0.12157, "10": 0.11891, "15": 0.1176, "20": 0.11702, "25": 0.11688, "30": 0.11766, "35": 
0.11769, "40": 0.11717, "45": 0.11722, "50": 0.11804, "55": 0.11618, "60": 0.11829, "65": 0.11649, "70": 0.11804, "75": 0.11577, "80": 0.11793, "85": 0.11663, "90": 0.1178, "95": 0.11648, "100": 0.11531}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.8401, "5": 10.84032, "10": 10.81341, "15": 10.80278, "20": 10.70496, "25": 10.53846, "30": 10.35517, "35": 10.27147, "40": 10.08045, "45": 9.82292, "50": 9.90114, "55": 9.86422, "60": 9.48029, "65": 8.93749, "70": 9.71025, "75": 9.40879, "80": 9.39077, "85": 9.59743, "90": 9.80386, "95": 9.50565, "100": 9.38812}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1670.0, "5": 1970.0, "10": 1436.0, "15": 1918.0, "20": 1786.0, "25": 1610.0, "30": 2039.0, "35": 2001.0, "40": 2321.0, "45": 2205.0, "50": 2365.0, "55": 2489.0, "60": 2508.0, "65": 2719.0, "70": 3241.0, "75": 2643.0, "80": 3368.0, "85": 3336.0, "90": 2961.0, "95": 3533.0, "100": 3432.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1230390272.0, "5": 1230390272.0, "10": 1230390272.0, "15": 1230390272.0, "20": 1230390272.0, "25": 1230390272.0, "30": 1230390272.0, "35": 1230390272.0, "40": 1230390272.0, "45": 1230390272.0, "50": 1230390272.0, "55": 1230390272.0, "60": 1230390272.0, "65": 1230390272.0, "70": 1230390272.0, "75": 1230390272.0, "80": 1230390272.0, "85": 1230390272.0, "90": 1230390272.0, "95": 1230390272.0, "100": 1230390272.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1984492032.0, "5": 2531972608.0, "10": 2531972608.0, "15": 2531972608.0, "20": 2531972608.0, "25": 2531972608.0, "30": 2531972608.0, "35": 2531972608.0, "40": 2531972608.0, "45": 2531972608.0, "50": 2531972608.0, "55": 2531972608.0, "60": 2531972608.0, "65": 2531972608.0, "70": 2531972608.0, "75": 2531972608.0, "80": 2531972608.0, "85": 2531972608.0, "90": 2531972608.0, "95": 2531972608.0, "100": 2531972608.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 8.60398, "5": 0.12229, "10": 0.12251, "15": 0.12206, "20": 0.1226, "25": 0.12185, "30": 0.12287, "35": 
0.12365, "40": 0.12186, "45": 0.12198, "50": 0.1223, "55": 0.12246, "60": 0.12181, "65": 0.12238, "70": 0.12276, "75": 0.12137, "80": 0.12307, "85": 0.1219, "90": 0.1217, "95": 0.12183, "100": 0.12252}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -50,4 +50,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.8401,
10.87259,
10.85023,
10.79646,
10.68153,
10.60619,
10.12767,
10.22185,
10.13787,
9.82307
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1698.0,
1855.0,
1896.0,
1866.0,
2032.0,
1814.0,
1664.0,
1961.0,
2306.0,
2403.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
8.00253,
0.13176,
0.13026,
0.13184,
0.13023,
0.13135,
0.13014,
0.13143,
0.1305,
0.13191
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8401, "5": 10.84034, "10": 10.81341, "15": 10.80277, "20": 10.70495, "25": 10.53848, "30": 10.35523, "35": 10.27145, "40": 10.08043, "45": 9.82293, "50": 9.90114}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1698.0, "5": 1900.0, "10": 1454.0, "15": 1969.0, "20": 1774.0, "25": 1736.0, "30": 1970.0, "35": 1941.0, "40": 2237.0, "45": 2180.0, "50": 2328.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1230390272.0, "5": 1230390272.0, "10": 1230390272.0, "15": 1230390272.0, "20": 1230390272.0, "25": 1230390272.0, "30": 1230390272.0, "35": 1230390272.0, "40": 1230390272.0, "45": 1230390272.0, "50": 1230390272.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1984492032.0, "5": 2531972608.0, "10": 2531972608.0, "15": 2531972608.0, "20": 2531972608.0, "25": 2531972608.0, "30": 2531972608.0, "35": 2531972608.0, "40": 2531972608.0, "45": 2531972608.0, "50": 2531972608.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.57733, "5": 0.12925, "10": 0.12965, "15": 0.12911, "20": 0.12836, "25": 0.12886, "30": 0.12957, "35": 0.12947, "40": 0.12911, "45": 0.12814, "50": 0.12753}}}
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment