Commit 688448db authored by silencealiang's avatar silencealiang
Browse files

更新代码

parent a02a5490
Pipeline #2503 passed with stage
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -47,4 +47,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--fp16: true
--apply-query-key-layer-scaling: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.79206,
10.86691,
10.89065,
10.78186,
10.65978,
10.58022,
10.08207,
10.19156,
10.13495,
9.81167
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1626.0,
1866.0,
1959.0,
1816.0,
1890.0,
1654.0,
1537.0,
1965.0,
2436.0,
2405.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
21.9348,
0.1633,
0.16334,
0.16269,
0.16133,
0.16064,
0.16007,
0.15926,
0.1592,
0.15982
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79206, "5": 10.84592, "10": 10.76954, "15": 10.78975, "20": 10.67887, "25": 10.50432, "30": 10.33089, "35": 10.25276, "40": 10.05236, "45": 9.80262, "50": 9.88808}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1626.0, "5": 1909.0, "10": 1312.0, "15": 1911.0, "20": 1601.0, "25": 1600.0, "30": 1886.0, "35": 2056.0, "40": 2241.0, "45": 2090.0, "50": 2433.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 777900032.0, "5": 777900032.0, "10": 777900032.0, "15": 777900032.0, "20": 777900032.0, "25": 777900032.0, "30": 777900032.0, "35": 777900032.0, "40": 777900032.0, "45": 777900032.0, "50": 777900032.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2463815680.0, "5": 2744478720.0, "10": 2744478720.0, "15": 2744478720.0, "20": 2744478720.0, "25": 2744478720.0, "30": 2744478720.0, "35": 2744478720.0, "40": 2744478720.0, "45": 2744478720.0, "50": 2744478720.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.17793, "5": 0.16541, "10": 0.16804, "15": 0.1616, "20": 0.16211, "25": 0.16186, "30": 0.1614, "35": 0.16111, "40": 0.16014, "45": 0.15818, "50": 0.15678}}}
\ No newline at end of file
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.79206,
10.86691,
10.89065,
10.78186,
10.65978,
10.58022,
10.08207,
10.19156,
10.13495,
9.81167
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1626.0,
1866.0,
1959.0,
1816.0,
1890.0,
1654.0,
1537.0,
1965.0,
2436.0,
2405.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
21.9348,
0.1633,
0.16334,
0.16269,
0.16133,
0.16064,
0.16007,
0.15926,
0.1592,
0.15982
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79206, "5": 10.84592, "10": 10.76954, "15": 10.78975, "20": 10.67887, "25": 10.50432, "30": 10.33089, "35": 10.25276, "40": 10.05236, "45": 9.80262, "50": 9.88808}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1626.0, "5": 1909.0, "10": 1312.0, "15": 1911.0, "20": 1601.0, "25": 1600.0, "30": 1886.0, "35": 2056.0, "40": 2241.0, "45": 2090.0, "50": 2433.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 782094336.0, "5": 782094336.0, "10": 782094336.0, "15": 782094336.0, "20": 782094336.0, "25": 782094336.0, "30": 782094336.0, "35": 782094336.0, "40": 782094336.0, "45": 782094336.0, "50": 782094336.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2462767104.0, "5": 2748673024.0, "10": 2748673024.0, "15": 2748673024.0, "20": 2748673024.0, "25": 2748673024.0, "30": 2748673024.0, "35": 2748673024.0, "40": 2748673024.0, "45": 2748673024.0, "50": 2748673024.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 17.13285, "5": 0.16867, "10": 0.16853, "15": 0.16651, "20": 0.16645, "25": 0.16655, "30": 0.37162, "35": 0.16698, "40": 0.1662, "45": 0.16431, "50": 0.1635}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -46,4 +46,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--apply-query-key-layer-scaling: true
--log-memory-to-tensorboard: true
TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79219, "5": 10.84727, "10": 10.77729, "15": 10.84106, "20": 10.82889, "25": 10.7666, "30": 10.69961, "35": 10.61845, "40": 10.44051, "45": 10.2086, "50": 10.21168, "55": 10.15676, "60": 9.77265, "65": 9.22128, "70": 9.89371, "75": 9.56098, "80": 9.5311, "85": 9.71911, "90": 9.89982, "95": 9.59785, "100": 9.49008}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 569591808.0, "5": 569591808.0, "10": 569591808.0, "15": 569591808.0, "20": 852351488.0, "25": 852351488.0, "30": 852351488.0, "35": 852351488.0, "40": 852351488.0, "45": 852351488.0, "50": 852351488.0, "55": 852351488.0, "60": 852351488.0, "65": 852351488.0, "70": 852351488.0, "75": 852351488.0, "80": 852351488.0, "85": 852351488.0, "90": 852351488.0, "95": 852351488.0, "100": 852351488.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2394266112.0, "5": 2394266624.0, "10": 2394266624.0, "15": 2394266624.0, "20": 2677288448.0, "25": 2677288448.0, "30": 2677288448.0, "35": 2677288448.0, "40": 2677288448.0, "45": 2677288448.0, "50": 2677288448.0, "55": 2677288448.0, "60": 2677288448.0, "65": 2677288448.0, "70": 2677288448.0, "75": 2677288448.0, "80": 2677288448.0, "85": 2677288448.0, "90": 2677288448.0, "95": 2677288448.0, "100": 2677288448.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.20637, "5": 0.15525, "10": 0.15614, "15": 0.15359, "20": 0.16136, "25": 0.16083, "30": 0.16019, "35": 0.16169, "40": 0.16106, "45": 0.16081, "50": 0.16073, "55": 0.15707, "60": 0.1561, "65": 0.16078, "70": 0.15943, "75": 0.15999, "80": 0.15947, "85": 0.15903, "90": 0.15903, "95": 0.15832, "100": 0.15962}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1881.0, "25": 2342.0, "30": 2479.0, "35": 2015.0, "40": 2210.0, "45": 2480.0, "50": 2916.0, "55": 2451.0, "60": 2926.0, "65": 2270.0, "70": 3615.0, "75": 2951.0, "80": 3569.0, "85": 3977.0, "90": 3808.0, "95": 4246.0, "100": 3731.0}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79219, "5": 10.84727, "10": 10.77729, "15": 10.84106, "20": 10.82889, "25": 10.7666, "30": 10.69961, "35": 10.61845, "40": 10.44051, "45": 10.2086, "50": 10.21168, "55": 10.15676, "60": 9.77265, "65": 9.22128, "70": 9.89371, "75": 9.56099, "80": 9.5311, "85": 9.71912, "90": 9.89983, "95": 9.59785, "100": 9.49009}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 570640384.0, "5": 570640384.0, "10": 570640384.0, "15": 570640384.0, "20": 852351488.0, "25": 852351488.0, "30": 852351488.0, "35": 852351488.0, "40": 852351488.0, "45": 852351488.0, "50": 852351488.0, "55": 852351488.0, "60": 852351488.0, "65": 852351488.0, "70": 852351488.0, "75": 852351488.0, "80": 852351488.0, "85": 852351488.0, "90": 852351488.0, "95": 852351488.0, "100": 852351488.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2393217536.0, "5": 2393218048.0, "10": 2393218048.0, "15": 2393218048.0, "20": 2675191296.0, "25": 2675191296.0, "30": 2675191296.0, "35": 2675191296.0, "40": 2675191296.0, "45": 2675191296.0, "50": 2675191296.0, "55": 2675191296.0, "60": 2675191296.0, "65": 2675191296.0, "70": 2675191296.0, "75": 2675191296.0, "80": 2675191296.0, "85": 2675191296.0, "90": 2675191296.0, "95": 2675191296.0, "100": 2675191296.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 16.58626, "5": 0.16569, "10": 0.16316, "15": 0.16279, "20": 0.17703, "25": 0.17217, "30": 0.17358, "35": 0.17246, "40": 0.17158, "45": 0.17224, "50": 0.1705, "55": 0.16674, "60": 0.16732, "65": 0.16787, "70": 0.16785, "75": 0.16687, "80": 0.16672, "85": 0.16784, "90": 0.16602, "95": 0.17069, "100": 0.16936}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1881.0, "25": 2342.0, "30": 2479.0, "35": 2015.0, "40": 2210.0, "45": 2480.0, "50": 2916.0, "55": 2451.0, "60": 2990.0, "65": 2327.0, "70": 3731.0, "75": 3015.0, "80": 3623.0, "85": 4045.0, "90": 3909.0, "95": 4212.0, "100": 3673.0}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -47,4 +47,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--fp16: true
--apply-query-key-layer-scaling: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82027, "5": 10.84932, "10": 10.78695, "15": 10.82723, "20": 10.728, "25": 10.57817, "30": 10.40703, "35": 10.31124, "40": 10.13951, "45": 9.91072, "50": 9.9683}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5111.0, "5": 5732.0, "10": 4774.0, "15": 5398.0, "20": 5259.0, "25": 5163.0, "30": 5567.0, "35": 5831.0, "40": 6144.0, "45": 5834.0, "50": 6815.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 596698624.0, "5": 596698112.0, "10": 596696576.0, "15": 596697600.0, "20": 596698112.0, "25": 596698624.0, "30": 596699136.0, "35": 596696576.0, "40": 596698624.0, "45": 596698624.0, "50": 596699136.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 840132608.0, "5": 1070596096.0, "10": 1071991296.0, "15": 1071991296.0, "20": 1071991296.0, "25": 1071991296.0, "30": 1071991296.0, "35": 1071991296.0, "40": 1071991296.0, "45": 1071991296.0, "50": 1071991296.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.08967, "5": 0.79969, "10": 1.06452, "15": 0.78727, "20": 0.79043, "25": 0.99776, "30": 0.78847, "35": 0.79196, "40": 0.78657, "45": 0.79613, "50": 0.78348}}}
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
--num-attention-heads: 8
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
--split: 949,50,1
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 2
--context-parallel-size: 2
--expert-model-parallel-size: 2
--sequence-parallel: true
--num-experts: 4
--moe-router-load-balancing-type: sinkhorn
--moe-router-topk: 1
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--attention-backend: flash
--log-memory-to-tensorboard: true
TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82027, "5": 10.84932, "10": 10.78695, "15": 10.82723, "20": 10.728, "25": 10.57817, "30": 10.40703, "35": 10.31124, "40": 10.13951, "45": 9.91072, "50": 9.9683}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5111.0, "5": 5732.0, "10": 4774.0, "15": 5398.0, "20": 5259.0, "25": 5163.0, "30": 5567.0, "35": 5831.0, "40": 6144.0, "45": 5834.0, "50": 6815.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 596698624.0, "5": 596698112.0, "10": 596696576.0, "15": 596697600.0, "20": 596698112.0, "25": 596698624.0, "30": 596699136.0, "35": 596696576.0, "40": 596698624.0, "45": 596698624.0, "50": 596699136.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 840132608.0, "5": 1070596096.0, "10": 1071991296.0, "15": 1071991296.0, "20": 1071991296.0, "25": 1071991296.0, "30": 1071991296.0, "35": 1071991296.0, "40": 1071991296.0, "45": 1071991296.0, "50": 1071991296.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.84481, "5": 1.02931, "10": 0.99191, "15": 0.97148, "20": 0.77381, "25": 1.02379, "30": 0.77383, "35": 0.77556, "40": 0.77762, "45": 0.77638, "50": 0.77403}}}
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
--num-attention-heads: 8
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
--split: 949,50,1
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: transformer_engine
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 2
--context-parallel-size: 2
--expert-model-parallel-size: 2
--sequence-parallel: true
--num-experts: 4
--moe-router-load-balancing-type: sinkhorn
--moe-router-topk: 1
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--attention-backend: flash
--log-memory-to-tensorboard: true
--use-tp-pp-dp-mapping: true
TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.7934, "5": 10.82731, "10": 10.76672, "15": 10.78912, "20": 10.70399, "25": 10.53774, "30": 10.39119, "35": 10.30163, "40": 10.12628, "45": 9.89627, "50": 9.97376}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5576.0, "5": 6583.0, "10": 5280.0, "15": 6361.0, "20": 5720.0, "25": 5806.0, "30": 6223.0, "35": 6684.0, "40": 6987.0, "45": 6837.0, "50": 7602.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 458715136.0, "5": 458715136.0, "10": 458718208.0, "15": 458717184.0, "20": 458715136.0, "25": 458715648.0, "30": 458714624.0, "35": 458716160.0, "40": 458716672.0, "45": 458715136.0, "50": 458715648.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1017808384.0, "5": 1183341568.0, "10": 1184804864.0, "15": 1184804864.0, "20": 1184804864.0, "25": 1184804864.0, "30": 1184804864.0, "35": 1184804864.0, "40": 1184804864.0, "45": 1184804864.0, "50": 1184804864.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.4596, "5": 0.66406, "10": 0.66034, "15": 0.66098, "20": 0.65918, "25": 0.65845, "30": 0.65966, "35": 0.6573, "40": 0.65347, "45": 0.65812, "50": 0.65616}}}
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.7934, "5": 10.82731, "10": 10.76672, "15": 10.78933, "20": 10.70416, "25": 10.53748, "30": 10.39181, "35": 10.30095, "40": 10.12594, "45": 9.89677, "50": 9.97407}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5576.0, "5": 6583.0, "10": 5262.0, "15": 6198.0, "20": 5805.0, "25": 5815.0, "30": 6199.0, "35": 6566.0, "40": 7076.0, "45": 6876.0, "50": 7591.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 458715136.0, "5": 458715136.0, "10": 458718208.0, "15": 458717184.0, "20": 458715136.0, "25": 458715648.0, "30": 458715136.0, "35": 458716672.0, "40": 458716672.0, "45": 458714624.0, "50": 458716160.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1017808384.0, "5": 1183341568.0, "10": 1184804864.0, "15": 1184804864.0, "20": 1184804864.0, "25": 1184804864.0, "30": 1184804864.0, "35": 1184804864.0, "40": 1184804864.0, "45": 1185401344.0, "50": 1185401344.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22.12737, "5": 0.70643, "10": 0.72506, "15": 0.70455, "20": 0.70271, "25": 0.70404, "30": 0.70446, "35": 0.70004, "40": 0.89832, "45": 0.70671, "50": 0.70503}}}
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
--num-attention-heads: 8
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
--split: 949,50,1
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: local
--tensor-model-parallel-size: 4
--pipeline-model-parallel-size: 2
--expert-model-parallel-size: 2
--expert-tensor-parallel-size: 2
--sequence-parallel: true
--num-experts: 4
--moe-router-load-balancing-type: sinkhorn
--moe-router-topk: 1
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--attention-backend: unfused
--log-memory-to-tensorboard: true
TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.7934, "5": 10.82731, "10": 10.76672, "15": 10.78912, "20": 10.70399, "25": 10.53774, "30": 10.39119, "35": 10.30163, "40": 10.12628, "45": 9.89627, "50": 9.97376}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5576.0, "5": 6583.0, "10": 5280.0, "15": 6361.0, "20": 5720.0, "25": 5806.0, "30": 6223.0, "35": 6684.0, "40": 6987.0, "45": 6837.0, "50": 7602.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 458715136.0, "5": 458715136.0, "10": 458718208.0, "15": 458717184.0, "20": 458715136.0, "25": 458715648.0, "30": 458714624.0, "35": 458716160.0, "40": 458716672.0, "45": 458715136.0, "50": 458715648.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1017808384.0, "5": 1183341568.0, "10": 1184804864.0, "15": 1184804864.0, "20": 1184804864.0, "25": 1184804864.0, "30": 1184804864.0, "35": 1184804864.0, "40": 1184804864.0, "45": 1184804864.0, "50": 1184804864.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.96135, "5": 0.66582, "10": 0.66672, "15": 0.66703, "20": 0.67005, "25": 0.66892, "30": 0.66766, "35": 0.66539, "40": 0.66319, "45": 0.66894, "50": 0.66499}}}
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.7934, "5": 10.82731, "10": 10.76672, "15": 10.78933, "20": 10.70416, "25": 10.53748, "30": 10.39181, "35": 10.30095, "40": 10.12594, "45": 9.89677, "50": 9.97407}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5576.0, "5": 6583.0, "10": 5262.0, "15": 6198.0, "20": 5805.0, "25": 5815.0, "30": 6199.0, "35": 6566.0, "40": 7076.0, "45": 6876.0, "50": 7591.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 458715136.0, "5": 458715136.0, "10": 458718208.0, "15": 458717184.0, "20": 458715136.0, "25": 458715648.0, "30": 458715136.0, "35": 458716672.0, "40": 458716672.0, "45": 458714624.0, "50": 458716160.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1017808384.0, "5": 1183341568.0, "10": 1184804864.0, "15": 1184804864.0, "20": 1184804864.0, "25": 1184804864.0, "30": 1184804864.0, "35": 1184804864.0, "40": 1184804864.0, "45": 1185401344.0, "50": 1185401344.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 26.12525, "5": 0.6932, "10": 0.69116, "15": 0.69227, "20": 0.69207, "25": 0.69063, "30": 0.69144, "35": 0.68372, "40": 0.89596, "45": 0.68537, "50": 0.69374}}}
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
--num-attention-heads: 8
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 32
--seq-length: 1024
--max-position-embeddings: 1024
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
--split: 949,50,1
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
--min-lr: 1.0e-5
--weight-decay: 1e-2
--clip-grad: 1.0
--lr-warmup-fraction: .01
--log-interval: 1
--save-interval: 10000
--eval-interval: 1000
--eval-iters: 10
--transformer-impl: local
--tensor-model-parallel-size: 4
--pipeline-model-parallel-size: 2
--expert-model-parallel-size: 2
--expert-tensor-parallel-size: 2
--sequence-parallel: true
--num-experts: 4
--moe-router-load-balancing-type: sinkhorn
--moe-router-topk: 1
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--attention-backend: unfused
--log-memory-to-tensorboard: true
--use-tp-pp-dp-mapping: true
TEST_TYPE: regular
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.82445, "5": 10.83944, "10": 10.7889, "15": 10.82831, "20": 10.72949, "25": 10.57667, "30": 10.40631, "35": 10.3135, "40": 10.13964, "45": 9.90704, "50": 9.96951, "55": 9.92826, "60": 9.56897, "65": 8.99498, "70": 9.76136, "75": 9.4768, "80": 9.44907, "85": 9.65155, "90": 9.84117, "95": 9.54761, "100": 9.43675}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12856.0, "5": 15993.0, "10": 12573.0, "15": 14651.0, "20": 13663.0, "25": 13137.0, "30": 14643.0, "35": 15376.0, "40": 16684.0, "45": 16099.0, "50": 18966.0, "55": 16976.0, "60": 18925.0, "65": 19522.0, "70": 22271.0, "75": 18752.0, "80": 23211.0, "85": 24769.0, "90": 24231.0, "95": 23303.0, "100": 21066.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 626182656.0, "5": 626185728.0, "10": 626182656.0, "15": 626185216.0, "20": 626186240.0, "25": 626183168.0, "30": 626183680.0, "35": 626184704.0, "40": 626185728.0, "45": 626475008.0, "50": 626184704.0, "55": 626181632.0, "60": 626180096.0, "65": 626168832.0, "70": 626182656.0, "75": 626186752.0, "80": 626180608.0, "85": 626176000.0, "90": 626186752.0, "95": 627019776.0, "100": 626182656.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1819317248.0, "5": 2050089472.0, "10": 2050089472.0, "15": 2050322944.0, "20": 2050322944.0, "25": 2050322944.0, "30": 2050322944.0, "35": 2050341376.0, "40": 2050341376.0, "45": 2050341376.0, "50": 2050341376.0, "55": 2050341376.0, "60": 2050341376.0, "65": 2050715136.0, "70": 2052688896.0, "75": 2052688896.0, "80": 2052688896.0, "85": 2054681600.0, "90": 2054681600.0, "95": 2057086464.0, "100": 2057086464.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 24.60883, "5": 0.44856, "10": 0.45126, "15": 0.44816, "20": 0.44504, "25": 0.44156, "30": 0.44184, "35": 0.45033, "40": 0.45005, "45": 0.44616, "50": 0.44366, "55": 0.43822, "60": 0.43979, "65": 0.4557, "70": 0.4497, "75": 0.44309, "80": 0.44931, "85": 0.44544, "90": 0.45177, "95": 0.44859, "100": 0.44472}}}
\ No newline at end of file
......@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 320000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/bpe/vocab.json
--merge-file: ${DATA_PATH}/bpe/merges.txt
......@@ -51,4 +51,5 @@ MODEL_ARGS:
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--log-memory-to-tensorboard: true
TEST_TYPE: ckpt-resume
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.82445,
10.86393,
10.85733,
10.80809,
10.70951,
10.63738,
10.16425,
10.28201,
10.19003,
9.88697
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
12678.0,
16220.0,
16626.0,
16055.0,
13829.0,
14904.0,
12931.0,
15765.0,
16771.0,
17621.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
16.34149,
0.66962,
0.66905,
0.66791,
0.67695,
0.66977,
0.67438,
0.67368,
0.6714,
0.67874
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82445, "5": 10.83944, "10": 10.7889, "15": 10.82895, "20": 10.72911, "25": 10.57606, "30": 10.40656, "35": 10.31389, "40": 10.13997, "45": 9.90738, "50": 9.96993}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12856.0, "5": 15993.0, "10": 12573.0, "15": 14881.0, "20": 13775.0, "25": 13046.0, "30": 14831.0, "35": 15239.0, "40": 16747.0, "45": 16125.0, "50": 19024.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 626182656.0, "5": 626185728.0, "10": 626182656.0, "15": 626184192.0, "20": 626186240.0, "25": 626183168.0, "30": 626183680.0, "35": 626183680.0, "40": 626186240.0, "45": 626184192.0, "50": 626185216.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1819317248.0, "5": 2050089472.0, "10": 2050089472.0, "15": 2050438656.0, "20": 2050438656.0, "25": 2050438656.0, "30": 2050438656.0, "35": 2050946560.0, "40": 2050946560.0, "45": 2050946560.0, "50": 2050946560.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.97445, "5": 0.43504, "10": 0.43232, "15": 0.43326, "20": 0.43474, "25": 0.43463, "30": 0.43979, "35": 0.44199, "40": 0.44565, "45": 0.44017, "50": 0.43092}}}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment