"src/turbomind/kernels/beam_search_topk_kernels.h" did not exist on "720fc533da804ac3f46ee938864403e51fcd9fa7"
Commit 688448db authored by silencealiang's avatar silencealiang
Browse files

更新代码

parent a02a5490
Pipeline #2503 passed with stage
{
"lm loss": {
"start_step": 1,
"end_step": 100,
"step_interval": 5,
"values": {
"1": 10.81978,
"5": 10.85277,
"10": 10.79054,
"15": 10.81259,
"20": 10.71561,
"25": 10.52391,
"30": 10.33354,
"35": 10.22869,
"40": 10.04307,
"45": 9.77101,
"50": 9.86315,
"55": 9.82489,
"60": 9.45369,
"65": 8.89336,
"70": 9.69013,
"75": 9.38429,
"80": 9.37031,
"85": 9.58022,
"90": 9.78525,
"95": 9.49638,
"100": 9.36739
}
},
"num-zeros": {
"start_step": 1,
"end_step": 100,
"step_interval": 5,
"values": {
"1": 27138.0,
"5": 32036.0,
"10": 26255.0,
"15": 31309.0,
"20": 28869.0,
"25": 28605.0,
"30": 30817.0,
"35": 32882.0,
"40": 35373.0,
"45": 35484.0,
"50": 2136527.0,
"55": 2135084.0,
"60": 2137981.0,
"65": 2138995.0,
"70": 2142528.0,
"75": 2215276.0,
"80": 2144227.0,
"85": 2146040.0,
"90": 2146440.0,
"95": 2144187.0,
"100": 2144354.0
}
},
"mem-allocated-bytes": {
"start_step": 1,
"end_step": 100,
"step_interval": 5,
"values": {
"1": 668320768.0,
"5": 668306944.0,
"10": 668313600.0,
"15": 668326912.0,
"20": 668314112.0,
"25": 668332544.0,
"30": 668326912.0,
"35": 668337664.0,
"40": 668306432.0,
"45": 668297728.0,
"50": 668282880.0,
"55": 668265984.0,
"60": 668249088.0,
"65": 668242944.0,
"70": 668224512.0,
"75": 668213248.0,
"80": 668222464.0,
"85": 668234752.0,
"90": 668237312.0,
"95": 668223488.0,
"100": 668209664.0
}
},
"mem-max-allocated-bytes": {
"start_step": 1,
"end_step": 100,
"step_interval": 5,
"values": {
"1": 2355231744.0,
"5": 2605464064.0,
"10": 2605464064.0,
"15": 2605464064.0,
"20": 2605464064.0,
"25": 2615321600.0,
"30": 2615321600.0,
"35": 2618603520.0,
"40": 2618603520.0,
"45": 2618603520.0,
"50": 2618603520.0,
"55": 2618603520.0,
"60": 2618603520.0,
"65": 2618603520.0,
"70": 2618603520.0,
"75": 2618603520.0,
"80": 2618603520.0,
"85": 2618603520.0,
"90": 2618603520.0,
"95": 2618603520.0,
"100": 2618603520.0
}
},
"iteration-time": {
"start_step": 1,
"end_step": 100,
"step_interval": 5,
"values": {
"1": 6.84429,
"5": 0.49894,
"10": 0.4932,
"15": 0.48106,
"20": 0.48362,
"25": 0.48615,
"30": 0.49038,
"35": 0.49011,
"40": 0.50012,
"45": 0.49982,
"50": 0.49286,
"55": 0.92115,
"60": 0.49142,
"65": 0.49128,
"70": 0.49444,
"75": 0.49725,
"80": 0.4978,
"85": 0.49747,
"90": 0.497,
"95": 0.49687,
"100": 0.49788
}
}
}
\ No newline at end of file
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
--num-attention-heads: 8
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--log-memory-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
--split: 949,50,1
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 50
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 1
--expert-model-parallel-size: 2
--disable-bias-linear: true
--sequence-parallel: true
--num-experts: 8
--moe-router-load-balancing-type: aux_loss
--moe-router-topk: 2
--moe-aux-loss-coeff: 1e-2
--use-custom-fsdp: true
--calculate-per-token-loss: true
--data-parallel-sharding-strategy: optim_grads_params
--use-distributed-optimizer: true
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--moe-grouped-gemm: true
--attention-softmax-in-fp32: true
--use-checkpoint-opt_param-scheduler: true
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
TEST_TYPE: ckpt-resume
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.86126,
10.88645,
10.87768,
10.83106,
10.71636,
10.60597,
10.13124,
10.22753,
10.1591,
9.83464
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1755.0,
2147.0,
2147.0,
2042.0,
2108.0,
1931.0,
1762.0,
2184.0,
2529.0,
2615.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
6.25178,
0.35642,
0.31793,
0.31783,
0.31708,
0.31607,
0.31789,
0.31477,
0.31433,
0.31727
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88248, "10": 10.83507, "15": 10.82743, "20": 10.72743, "25": 10.5575, "30": 10.37893, "35": 10.28325, "40": 10.08786, "45": 9.82625, "50": 9.91321}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1755.0, "5": 2185.0, "10": 1522.0, "15": 2063.0, "20": 1801.0, "25": 1775.0, "30": 2044.0, "35": 2294.0, "40": 2587.0, "45": 2425.0, "50": 2628.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 269842432.0, "5": 269842432.0, "10": 269842432.0, "15": 269842432.0, "20": 269842432.0, "25": 269842432.0, "30": 269842432.0, "35": 269842432.0, "40": 269842432.0, "45": 269842432.0, "50": 269842432.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1449633280.0, "5": 1515627520.0, "10": 1515627520.0, "15": 1515627520.0, "20": 1515627520.0, "25": 1515627520.0, "30": 1515627520.0, "35": 1515627520.0, "40": 1515627520.0, "45": 1515627520.0, "50": 1515627520.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.40817, "5": 0.30338, "10": 0.30031, "15": 0.59063, "20": 0.30088, "25": 0.30345, "30": 0.30256, "35": 0.30195, "40": 0.3015, "45": 0.30212, "50": 0.30102}}}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88253, "10": 10.83509, "15": 10.82739, "20": 10.72744, "25": 10.55749, "30": 10.37894, "35": 10.28322, "40": 10.08784, "45": 9.82625, "50": 9.91326}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1752.0, "5": 2204.0, "10": 1567.0, "15": 2068.0, "20": 1783.0, "25": 1795.0, "30": 2124.0, "35": 2186.0, "40": 2575.0, "45": 2278.0, "50": 2683.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1450730496.0, "5": 1513579520.0, "10": 1513579520.0, "15": 1513579520.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.60475, "5": 0.30064, "10": 0.2968, "15": 0.29543, "20": 0.29652, "25": 0.29285, "30": 0.29834, "35": 0.2921, "40": 0.29827, "45": 0.29122, "50": 0.30005}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -49,4 +49,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--attention-backend: unfused
--log-memory-to-tensorboard: true
TEST_TYPE: regular
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.86126,
10.88645,
10.87768,
10.83106,
10.71636,
10.60597,
10.13124,
10.22753,
10.1591,
9.83464
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1755.0,
2147.0,
2147.0,
2042.0,
2108.0,
1931.0,
1762.0,
2184.0,
2529.0,
2615.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
7.0561,
0.32588,
0.32628,
0.32385,
0.32419,
0.32364,
0.32337,
0.32334,
0.32358,
0.32395
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88248, "10": 10.83507, "15": 10.82743, "20": 10.72743, "25": 10.5575, "30": 10.37893, "35": 10.28325, "40": 10.08786, "45": 9.82625, "50": 9.91321}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1755.0, "5": 2185.0, "10": 1522.0, "15": 2063.0, "20": 1801.0, "25": 1775.0, "30": 2044.0, "35": 2294.0, "40": 2587.0, "45": 2425.0, "50": 2628.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 269842432.0, "5": 269842432.0, "10": 269842432.0, "15": 269842432.0, "20": 269842432.0, "25": 269842432.0, "30": 269842432.0, "35": 269842432.0, "40": 269842432.0, "45": 269842432.0, "50": 269842432.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1450682368.0, "5": 1515627520.0, "10": 1515627520.0, "15": 1515627520.0, "20": 1515627520.0, "25": 1515627520.0, "30": 1515627520.0, "35": 1515627520.0, "40": 1515627520.0, "45": 1515627520.0, "50": 1515627520.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.08264, "5": 0.50294, "10": 0.31361, "15": 0.31749, "20": 0.30552, "25": 0.31296, "30": 0.31703, "35": 0.30458, "40": 0.30685, "45": 0.31528, "50": 0.30493}}}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88253, "10": 10.83509, "15": 10.82739, "20": 10.72744, "25": 10.55749, "30": 10.37894, "35": 10.28322, "40": 10.08784, "45": 9.82625, "50": 9.91326}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1752.0, "5": 2204.0, "10": 1567.0, "15": 2068.0, "20": 1783.0, "25": 1795.0, "30": 2124.0, "35": 2186.0, "40": 2575.0, "45": 2278.0, "50": 2683.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1449682432.0, "5": 1515676672.0, "10": 1515676672.0, "15": 1515676672.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.33148, "5": 0.3095, "10": 0.30881, "15": 0.30285, "20": 0.305, "25": 0.30028, "30": 0.30512, "35": 0.30125, "40": 0.30469, "45": 0.29938, "50": 0.30327}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -50,4 +50,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--attention-backend: unfused
--log-memory-to-tensorboard: true
TEST_TYPE: regular
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.86217,
10.88646,
10.87861,
10.83295,
10.7203,
10.61089,
10.14181,
10.23434,
10.16609,
9.84444
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1769.0,
2056.0,
2198.0,
2079.0,
2181.0,
1912.0,
1825.0,
2115.0,
2621.0,
2598.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
6.42448,
0.42854,
0.42836,
0.42582,
0.42274,
0.42187,
0.42561,
0.42178,
0.44234,
0.42304
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86217, "5": 10.88249, "10": 10.83646, "15": 10.82906, "20": 10.73236, "25": 10.56397, "30": 10.38482, "35": 10.28955, "40": 10.09137, "45": 9.83491, "50": 9.91602}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1722.0, "5": 2259.0, "10": 1596.0, "15": 2099.0, "20": 1919.0, "25": 1785.0, "30": 2048.0, "35": 2290.0, "40": 2558.0, "45": 2447.0, "50": 2676.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 368387584.0, "5": 368387584.0, "10": 368387584.0, "15": 368387584.0, "20": 368387584.0, "25": 368387584.0, "30": 368387584.0, "35": 368387584.0, "40": 368387584.0, "45": 368387584.0, "50": 368387584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1522507264.0, "5": 1653494272.0, "10": 1653494272.0, "15": 1653494272.0, "20": 1653494272.0, "25": 1653494272.0, "30": 1653494272.0, "35": 1653494272.0, "40": 1653494272.0, "45": 1653494272.0, "50": 1653494272.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.95291, "5": 0.41956, "10": 0.41644, "15": 0.41504, "20": 0.41541, "25": 0.41645, "30": 0.41452, "35": 0.41456, "40": 0.41402, "45": 0.41451, "50": 0.41368}}}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86217, 10.88641, 10.8786, 10.83291, 10.72031, 10.6109, 10.1418, 10.23434, 10.16605, 9.84445]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1769.0, 2019.0, 2145.0, 2058.0, 2166.0, 2060.0, 1776.0, 2174.0, 2524.0, 2645.0]}, "iteration_timing_avg": 0.2256223529411765}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86217, "5": 10.8825, "10": 10.83647, "15": 10.8291, "20": 10.73231, "25": 10.56391, "30": 10.38476, "35": 10.28957, "40": 10.09136, "45": 9.83492, "50": 9.91604}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1769.0, "5": 2216.0, "10": 1573.0, "15": 2132.0, "20": 1815.0, "25": 1849.0, "30": 2009.0, "35": 2182.0, "40": 2489.0, "45": 2381.0, "50": 2727.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 368387072.0, "5": 368387072.0, "10": 368387072.0, "15": 368387072.0, "20": 368387072.0, "25": 368387072.0, "30": 368387072.0, "35": 368387072.0, "40": 368387072.0, "45": 368387072.0, "50": 368387072.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1512020480.0, "5": 1647202304.0, "10": 1647202304.0, "15": 1647202304.0, "20": 1647202304.0, "25": 1647202304.0, "30": 1647202304.0, "35": 1647202304.0, "40": 1647202304.0, "45": 1647202304.0, "50": 1647202304.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.51772, "5": 0.40946, "10": 0.40614, "15": 0.40997, "20": 0.41426, "25": 0.41365, "30": 0.41686, "35": 0.40937, "40": 0.41695, "45": 0.4147, "50": 0.42032}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -49,4 +49,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--attention-backend: unfused
--log-memory-to-tensorboard: true
TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86065, "5": 10.88253, "10": 10.8352, "15": 10.82848, "20": 10.72795, "25": 10.55737, "30": 10.37935, "35": 10.28345, "40": 10.0878, "45": 9.82662, "50": 9.91321, "55": 9.87799, "60": 9.50877, "65": 8.95112, "70": 9.73131, "75": 9.43668, "80": 9.41164, "85": 9.61594, "90": 9.8216, "95": 9.51907, "100": 9.40583}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1716.0, "5": 2123.0, "10": 1539.0, "15": 2026.0, "20": 1842.0, "25": 1767.0, "30": 2099.0, "35": 2213.0, "40": 2387.0, "45": 2378.0, "50": 2771.0, "55": 2649.0, "60": 2734.0, "65": 2982.0, "70": 3716.0, "75": 2729.0, "80": 3758.0, "85": 3562.0, "90": 3313.0, "95": 3458.0, "100": 3432.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269842944.0, "5": 269842944.0, "10": 269842944.0, "15": 269842944.0, "20": 269842944.0, "25": 269842944.0, "30": 269842944.0, "35": 269842944.0, "40": 269842944.0, "45": 269842944.0, "50": 269842944.0, "55": 269842944.0, "60": 269842944.0, "65": 269842944.0, "70": 269842944.0, "75": 269842944.0, "80": 269842944.0, "85": 269842944.0, "90": 269842944.0, "95": 269842944.0, "100": 269842944.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 956153344.0, "5": 1035779584.0, "10": 1035779584.0, "15": 1035779584.0, "20": 1035779584.0, "25": 1035779584.0, "30": 1035779584.0, "35": 1035779584.0, "40": 1035779584.0, "45": 1035779584.0, "50": 1035779584.0, "55": 1035779584.0, "60": 1035779584.0, "65": 1035779584.0, "70": 1035779584.0, "75": 1035779584.0, "80": 1035779584.0, "85": 1035779584.0, "90": 1035779584.0, "95": 1035779584.0, "100": 1035779584.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.03589, "5": 0.28552, "10": 0.28504, "15": 0.29587, "20": 0.28309, "25": 0.27926, "30": 0.27852, "35": 0.27751, "40": 0.27651, "45": 0.27785, "50": 0.27743, "55": 0.27487, "60": 0.27351, "65": 0.27319, "70": 0.27565, "75": 0.50898, "80": 0.27289, "85": 0.27348, "90": 0.27316, "95": 0.27294, "100": 0.27277}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88253, "10": 10.83509, "15": 10.82739, "20": 10.72744, "25": 10.55749, "30": 10.37894, "35": 10.28322, "40": 10.08784, "45": 9.82625, "50": 9.91326, "55": 9.8779, "60": 9.50869, "65": 8.95102, "70": 9.73166, "75": 9.43677, "80": 9.41158, "85": 9.61615, "90": 9.82168, "95": 9.51915, "100": 9.40594}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1752.0, "5": 2204.0, "10": 1567.0, "15": 2068.0, "20": 1783.0, "25": 1795.0, "30": 2124.0, "35": 2186.0, "40": 2575.0, "45": 2278.0, "50": 2683.0, "55": 2676.0, "60": 2773.0, "65": 3019.0, "70": 3591.0, "75": 2870.0, "80": 3765.0, "85": 3549.0, "90": 3490.0, "95": 3544.0, "100": 3617.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0, "55": 269891584.0, "60": 269891584.0, "65": 269891584.0, "70": 269891584.0, "75": 269891584.0, "80": 269891584.0, "85": 269891584.0, "90": 269891584.0, "95": 269891584.0, "100": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1450730496.0, "5": 1515676672.0, "10": 1515676672.0, "15": 1515676672.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0, "55": 1515676672.0, "60": 1515676672.0, "65": 1515676672.0, "70": 1515676672.0, "75": 1515676672.0, "80": 1515676672.0, "85": 1515676672.0, "90": 1515676672.0, "95": 1515676672.0, "100": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 8.22865, "5": 0.31237, "10": 0.31889, "15": 0.30979, "20": 0.31592, "25": 0.31368, "30": 0.31292, "35": 0.31001, "40": 0.31087, "45": 0.30787, "50": 0.3067, "55": 0.30223, "60": 0.29974, "65": 0.29841, "70": 0.29787, "75": 0.30072, "80": 0.29729, "85": 0.29753, "90": 0.29692, "95": 0.29937, "100": 0.29618}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86065, "5": 10.88253, "10": 10.8352, "15": 10.82848, "20": 10.72795, "25": 10.55737, "30": 10.37935, "35": 10.28345, "40": 10.0878, "45": 9.82662, "50": 9.91321, "55": 9.87799, "60": 9.50877, "65": 8.95112, "70": 9.73131, "75": 9.43668, "80": 9.41164, "85": 9.61594, "90": 9.8216, "95": 9.51907, "100": 9.40583}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1716.0, "5": 2123.0, "10": 1539.0, "15": 2026.0, "20": 1842.0, "25": 1767.0, "30": 2099.0, "35": 2213.0, "40": 2387.0, "45": 2378.0, "50": 2771.0, "55": 2649.0, "60": 2734.0, "65": 2982.0, "70": 3716.0, "75": 2729.0, "80": 3758.0, "85": 3562.0, "90": 3313.0, "95": 3458.0, "100": 3432.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269842944.0, "5": 269842944.0, "10": 269842944.0, "15": 269842944.0, "20": 269842944.0, "25": 269842944.0, "30": 269842944.0, "35": 269842944.0, "40": 269842944.0, "45": 269842944.0, "50": 269842944.0, "55": 269842944.0, "60": 269842944.0, "65": 269842944.0, "70": 269842944.0, "75": 269842944.0, "80": 269842944.0, "85": 269842944.0, "90": 269842944.0, "95": 269842944.0, "100": 269842944.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 964541952.0, "5": 1035779584.0, "10": 1035779584.0, "15": 1035779584.0, "20": 1035779584.0, "25": 1035779584.0, "30": 1035779584.0, "35": 1035779584.0, "40": 1035779584.0, "45": 1035779584.0, "50": 1035779584.0, "55": 1035779584.0, "60": 1035779584.0, "65": 1035779584.0, "70": 1035779584.0, "75": 1035779584.0, "80": 1035779584.0, "85": 1035779584.0, "90": 1035779584.0, "95": 1035779584.0, "100": 1035779584.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.5187, "5": 0.28962, "10": 0.29031, "15": 0.28267, "20": 0.28213, "25": 0.28217, "30": 0.28177, "35": 0.28205, "40": 0.28416, "45": 0.28076, "50": 0.28157, "55": 0.28142, "60": 0.28019, "65": 0.28032, "70": 0.28088, "75": 0.27982, "80": 0.27983, "85": 0.27971, "90": 0.28117, "95": 0.28008, "100": 0.27953}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88253, "10": 10.83509, "15": 10.82739, "20": 10.72744, "25": 10.55749, "30": 10.37894, "35": 10.28322, "40": 10.08784, "45": 9.82625, "50": 9.91326, "55": 9.8779, "60": 9.50869, "65": 8.95102, "70": 9.73166, "75": 9.43677, "80": 9.41158, "85": 9.61615, "90": 9.82168, "95": 9.51915, "100": 9.40594}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1752.0, "5": 2204.0, "10": 1567.0, "15": 2068.0, "20": 1783.0, "25": 1795.0, "30": 2124.0, "35": 2186.0, "40": 2575.0, "45": 2278.0, "50": 2683.0, "55": 2676.0, "60": 2773.0, "65": 3019.0, "70": 3591.0, "75": 2870.0, "80": 3765.0, "85": 3549.0, "90": 3490.0, "95": 3544.0, "100": 3617.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0, "55": 269891584.0, "60": 269891584.0, "65": 269891584.0, "70": 269891584.0, "75": 269891584.0, "80": 269891584.0, "85": 269891584.0, "90": 269891584.0, "95": 269891584.0, "100": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1450731008.0, "5": 1515675648.0, "10": 1515676672.0, "15": 1515676672.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0, "55": 1515676672.0, "60": 1515676672.0, "65": 1515676672.0, "70": 1515676672.0, "75": 1515676672.0, "80": 1515676672.0, "85": 1515676672.0, "90": 1515676672.0, "95": 1515676672.0, "100": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.38789, "5": 0.322, "10": 0.31419, "15": 0.31533, "20": 0.30974, "25": 0.30867, "30": 0.30191, "35": 0.30301, "40": 0.30266, "45": 0.30177, "50": 0.30441, "55": 0.33472, "60": 0.31376, "65": 0.32009, "70": 0.31308, "75": 0.31965, "80": 0.31251, "85": 0.31098, "90": 0.30726, "95": 0.30595, "100": 0.30772}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -50,4 +50,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86073, "5": 10.8823, "10": 10.83564, "15": 10.83051, "20": 10.73302, "25": 10.56317, "30": 10.38508, "35": 10.28979, "40": 10.09131, "45": 9.83512, "50": 9.91593, "55": 9.88231, "60": 9.51403, "65": 8.95406, "70": 9.7307, "75": 9.43134, "80": 9.40601, "85": 9.61116, "90": 9.8175, "95": 9.51556, "100": 9.40417}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1793.0, "5": 2158.0, "10": 1547.0, "15": 2089.0, "20": 1858.0, "25": 1753.0, "30": 2091.0, "35": 2200.0, "40": 2602.0, "45": 2415.0, "50": 2741.0, "55": 2688.0, "60": 2698.0, "65": 2813.0, "70": 3731.0, "75": 2787.0, "80": 3822.0, "85": 3525.0, "90": 3430.0, "95": 3582.0, "100": 3723.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 368387584.0, "5": 368387584.0, "10": 368387584.0, "15": 368387584.0, "20": 368387584.0, "25": 368387584.0, "30": 368387584.0, "35": 368387584.0, "40": 368387584.0, "45": 368387584.0, "50": 368387584.0, "55": 368387584.0, "60": 368387584.0, "65": 368387584.0, "70": 368387584.0, "75": 368387584.0, "80": 368387584.0, "85": 368387584.0, "90": 368387584.0, "95": 368387584.0, "100": 368387584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1032173056.0, "5": 1163160064.0, "10": 1163160064.0, "15": 1163160064.0, "20": 1163160064.0, "25": 1163160064.0, "30": 1163160064.0, "35": 1163160064.0, "40": 1163160064.0, "45": 1163160064.0, "50": 1163160064.0, "55": 1163160064.0, "60": 1163160064.0, "65": 1163160064.0, "70": 1163160064.0, "75": 1163160064.0, "80": 1163160064.0, "85": 1163160064.0, "90": 1163160064.0, "95": 1163160064.0, "100": 1163160064.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 8.70935, "5": 0.38509, "10": 0.39398, "15": 0.39049, "20": 0.39065, "25": 0.38942, "30": 0.38888, "35": 0.39041, "40": 0.39256, "45": 0.39188, "50": 0.39096, "55": 0.38207, "60": 0.38257, "65": 0.38138, "70": 0.3975, "75": 0.38155, "80": 0.38011, "85": 0.38775, "90": 0.38412, "95": 0.3829, "100": 0.38287}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86217, "5": 10.8825, "10": 10.83647, "15": 10.8291, "20": 10.73231, "25": 10.56391, "30": 10.38476, "35": 10.28957, "40": 10.09136, "45": 9.83492, "50": 9.91604, "55": 9.88229, "60": 9.51379, "65": 8.95396, "70": 9.731, "75": 9.43126, "80": 9.40596, "85": 9.61136, "90": 9.81744, "95": 9.51567, "100": 9.4043}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1769.0, "5": 2216.0, "10": 1573.0, "15": 2132.0, "20": 1815.0, "25": 1849.0, "30": 2009.0, "35": 2182.0, "40": 2489.0, "45": 2381.0, "50": 2727.0, "55": 2667.0, "60": 2723.0, "65": 2907.0, "70": 3734.0, "75": 2746.0, "80": 3726.0, "85": 3599.0, "90": 3323.0, "95": 3615.0, "100": 3524.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 368387072.0, "5": 368387072.0, "10": 368387072.0, "15": 368387072.0, "20": 368387072.0, "25": 368387072.0, "30": 368387072.0, "35": 368387072.0, "40": 368387072.0, "45": 368387072.0, "50": 368387072.0, "55": 368387072.0, "60": 368387072.0, "65": 368387072.0, "70": 368387072.0, "75": 368387072.0, "80": 368387072.0, "85": 368387072.0, "90": 368387072.0, "95": 368387072.0, "100": 368387072.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1513069568.0, "5": 1647199744.0, "10": 1647199744.0, "15": 1647201792.0, "20": 1647201792.0, "25": 1647201792.0, "30": 1647201792.0, "35": 1647201792.0, "40": 1647201792.0, "45": 1647201792.0, "50": 1647201792.0, "55": 1647201792.0, "60": 1649298944.0, "65": 1649298944.0, "70": 1649298944.0, "75": 1649298944.0, "80": 1649298944.0, "85": 1649298944.0, "90": 1649298944.0, "95": 1649298944.0, "100": 1649298944.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.88726, "5": 0.40682, "10": 0.43529, "15": 0.47149, "20": 0.41093, "25": 0.40566, "30": 0.42086, "35": 0.40692, "40": 0.4028, "45": 0.40374, "50": 0.404, "55": 0.41679, "60": 0.42436, "65": 0.427, "70": 0.42395, "75": 0.4485, "80": 0.45249, "85": 0.41989, "90": 0.41911, "95": 0.42649, "100": 0.42528}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment