Commit 688448db authored by silencealiang's avatar silencealiang
Browse files

更新代码

parent a02a5490
Pipeline #2503 passed with stage
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.46796,
10.45723,
10.44911,
10.44107,
10.41739,
10.34626,
10.11387,
10.0439,
9.86702,
9.679
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2404.0,
2610.0,
2173.0,
2312.0,
2371.0,
2652.0,
3089.0,
3200.0,
3497.0,
3075.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
15.80389,
0.94155,
0.88518,
1.22442,
0.86955,
0.85166,
1.02329,
1.07525,
0.90283,
0.88308
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.4681, "5": 10.45367, "10": 10.45093, "15": 10.45825, "20": 10.42046, "25": 10.34044, "30": 10.18377, "35": 10.0388, "40": 9.89825, "45": 9.7511, "50": 9.67015}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2373.0, "5": 2811.0, "10": 2502.0, "15": 2735.0, "20": 2341.0, "25": 2828.0, "30": 2945.0, "35": 3125.0, "40": 2406.0, "45": 3739.0, "50": 3475.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2032164352.0, "5": 2032164352.0, "10": 2032164352.0, "15": 2032164352.0, "20": 2032164352.0, "25": 2032164352.0, "30": 2032164352.0, "35": 2032164352.0, "40": 2032164352.0, "45": 2032164352.0, "50": 2032164352.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4360259072.0, "5": 5220507136.0, "10": 5220507136.0, "15": 5220507136.0, "20": 5220507136.0, "25": 5220507136.0, "30": 5220507136.0, "35": 5220507136.0, "40": 5220507136.0, "45": 5220507136.0, "50": 5220507136.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.15888, "5": 1.41456, "10": 0.87396, "15": 0.86252, "20": 0.86858, "25": 1.09113, "30": 0.82733, "35": 0.83789, "40": 0.86729, "45": 1.13695, "50": 1.09113}}}
\ No newline at end of file
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.4681,
10.45734,
10.4491,
10.44121,
10.41764,
10.34626,
10.11384,
10.04383,
9.86686,
9.67906
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2373.0,
2593.0,
2187.0,
2325.0,
2407.0,
2627.0,
3036.0,
3109.0,
3568.0,
3019.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
22.86543,
0.84168,
0.92727,
0.84734,
0.93196,
0.86308,
0.86633,
0.86112,
0.87598,
1.02461
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.4681, "5": 10.45367, "10": 10.45093, "15": 10.45815, "20": 10.42047, "25": 10.34052, "30": 10.18387, "35": 10.03878, "40": 9.89837, "45": 9.75113, "50": 9.67035}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2373.0, "5": 2811.0, "10": 2502.0, "15": 2700.0, "20": 2461.0, "25": 2883.0, "30": 2859.0, "35": 3009.0, "40": 2378.0, "45": 3799.0, "50": 3628.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2032164352.0, "5": 2032164352.0, "10": 2032164352.0, "15": 2032164352.0, "20": 2032164352.0, "25": 2032164352.0, "30": 2032164352.0, "35": 2032164352.0, "40": 2032164352.0, "45": 2032164352.0, "50": 2032164352.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4341384704.0, "5": 5201632768.0, "10": 5201632768.0, "15": 5201632768.0, "20": 5201632768.0, "25": 5201632768.0, "30": 5201632768.0, "35": 5201632768.0, "40": 5201632768.0, "45": 5201632768.0, "50": 5201632768.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 19.82133, "5": 0.83907, "10": 0.86137, "15": 0.79121, "20": 0.88909, "25": 0.97007, "30": 0.76254, "35": 0.78908, "40": 1.03257, "45": 0.78678, "50": 0.76108}}}
\ No newline at end of file
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.42085,
10.42901,
10.43576,
10.40804,
10.38463,
10.32426,
10.13148,
10.04317,
9.86257,
9.65771
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
3252.0,
2595.0,
3240.0,
3429.0,
3463.0,
3509.0,
4065.0,
4114.0,
4651.0,
4253.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.83012,
2.26196,
2.22779,
2.22677,
2.23847,
2.24307,
2.23859,
2.23544,
2.2414,
2.25107
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.4209, "5": 10.44499, "10": 10.4421, "15": 10.43146, "20": 10.40923, "25": 10.32639, "30": 10.18342, "35": 10.03454, "40": 9.91262, "45": 9.74932, "50": 9.66164}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2230.0, "5": 2902.0, "10": 3454.0, "15": 2607.0, "20": 3332.0, "25": 3721.0, "30": 3878.0, "35": 4165.0, "40": 3354.0, "45": 4875.0, "50": 4729.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1632405504.0, "5": 1632405504.0, "10": 1632405504.0, "15": 1632405504.0, "20": 1632405504.0, "25": 1632405504.0, "30": 1632405504.0, "35": 1632405504.0, "40": 1632405504.0, "45": 1632405504.0, "50": 1632405504.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2475474432.0, "5": 3175492096.0, "10": 3175492096.0, "15": 3175492096.0, "20": 3175493120.0, "25": 3175493120.0, "30": 3176545280.0, "35": 3176545280.0, "40": 3176545280.0, "45": 3176545280.0, "50": 3176545280.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.36768, "5": 2.77129, "10": 2.2485, "15": 2.24504, "20": 2.2713, "25": 2.27356, "30": 2.26503, "35": 2.2618, "40": 2.25789, "45": 2.58105, "50": 2.26297}}}
\ No newline at end of file
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.4209,
10.42905,
10.43557,
10.40806,
10.38457,
10.32414,
10.13167,
10.04335,
9.86262,
9.65771
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2249.0,
3640.0,
3249.0,
2318.0,
3512.0,
3601.0,
4111.0,
3175.0,
4713.0,
3320.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
12.51144,
2.1285,
2.28886,
2.24273,
2.20818,
2.20231,
2.18786,
2.17554,
2.213,
2.18811
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.4209, "5": 10.44497, "10": 10.4422, "15": 10.43154, "20": 10.40919, "25": 10.32623, "30": 10.18344, "35": 10.03437, "40": 9.91272, "45": 9.74952, "50": 9.66165}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2249.0, "5": 3813.0, "10": 2393.0, "15": 3636.0, "20": 2343.0, "25": 3815.0, "30": 3843.0, "35": 4191.0, "40": 3318.0, "45": 4876.0, "50": 4696.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1632405504.0, "5": 1632405504.0, "10": 1632405504.0, "15": 1632405504.0, "20": 1632405504.0, "25": 1632405504.0, "30": 1632405504.0, "35": 1632405504.0, "40": 1632405504.0, "45": 1632405504.0, "50": 1632405504.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2458703360.0, "5": 3155576320.0, "10": 3155576320.0, "15": 3155576320.0, "20": 3155576320.0, "25": 3155576320.0, "30": 3155576320.0, "35": 3155576320.0, "40": 3155576320.0, "45": 3155576320.0, "50": 3155576320.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.80095, "5": 2.34418, "10": 2.03688, "15": 2.03547, "20": 2.0237, "25": 2.06209, "30": 2.04226, "35": 2.19438, "40": 2.04294, "45": 2.0364, "50": 2.03778}}}
\ No newline at end of file
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.49101,
10.49526,
10.48682,
10.48817,
10.49415,
10.4724,
10.42265,
10.29901,
10.1572,
9.97594
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
12.56945,
0.58599,
0.58451,
0.68178,
0.6056,
0.609,
0.59965,
0.60618,
0.60152,
0.59945
]
},
"num-zeros": {
"start_step": 0,
"end_step": 34,
"step_interval": 5,
"values": [
17032.0,
16918.0,
19957.0,
18761.0,
25689.0,
19897.0,
22224.0
]
}
}
\ No newline at end of file
{}
\ No newline at end of file
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.50096,
10.48594,
10.4936,
10.48501,
10.50417,
10.4773,
10.42154,
10.29716,
10.15831,
9.96751
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
12.85743,
0.58922,
0.54928,
0.54147,
0.56305,
0.56895,
0.56282,
0.56247,
0.56751,
0.69574
]
},
"num-zeros": {
"start_step": 0,
"end_step": 34,
"step_interval": 5,
"values": [
16595.0,
18537.0,
19509.0,
18532.0,
26712.0,
20164.0,
20981.0
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.50096, "5": 10.49354, "10": 10.49659, "15": 10.46666, "20": 10.49707, "25": 10.47716, "30": 10.43665, "35": 10.30674, "40": 10.15647, "45": 10.03905, "50": 9.9192}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2278173696.0, "5": 2278173696.0, "10": 2278173696.0, "15": 2278173696.0, "20": 3743563776.0, "25": 3743563776.0, "30": 3743563776.0, "35": 3743563776.0, "40": 3743563776.0, "45": 3743563776.0, "50": 3743563776.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3788467712.0, "5": 3788468736.0, "10": 3788468736.0, "15": 3788468736.0, "20": 5254907392.0, "25": 5254907392.0, "30": 5254907392.0, "35": 5254907392.0, "40": 5254907392.0, "45": 5254907392.0, "50": 5254907392.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.64757, "5": 0.57904, "10": 0.55485, "15": 0.54428, "20": 0.56278, "25": 0.56384, "30": 0.5642, "35": 0.58037, "40": 0.59811, "45": 0.57054, "50": 0.56519}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 14368.0, "25": 19189.0, "30": 21709.0, "35": 18201.0, "40": 19483.0, "45": 24956.0, "50": 21241.0}}}
\ No newline at end of file
......@@ -21,8 +21,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.49734,
10.49243,
10.49325,
10.50311,
10.48985,
10.4721,
10.41217,
10.2805,
10.14052,
9.94191
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
8.58282,
2.06311,
2.05789,
2.24493,
2.05273,
2.05118,
2.05666,
2.04533,
2.05152,
2.04761
]
},
"num-zeros": {
"start_step": 0,
"end_step": 34,
"step_interval": 5,
"values": [
26081.0,
18799.0,
24479.0,
23782.0,
21056.0,
19877.0,
19774.0
]
}
}
\ No newline at end of file
{}
\ No newline at end of file
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.48685,
10.49276,
10.48837,
10.51348,
10.49396,
10.4755,
10.41921,
10.28044,
10.14256,
9.94738
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.8221,
1.96114,
1.9401,
2.22227,
1.94508,
1.94212,
1.93958,
1.94562,
1.9442,
1.94606
]
},
"num-zeros": {
"start_step": 0,
"end_step": 34,
"step_interval": 5,
"values": [
26876.0,
19339.0,
24146.0,
23625.0,
21440.0,
17865.0,
19282.0
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.48685, "5": 10.48748, "10": 10.49154, "15": 10.49856, "20": 10.49971, "25": 10.47921, "30": 10.44762, "35": 10.29221, "40": 10.1426, "45": 10.01072, "50": 9.88753}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1110956544.0, "5": 1110956544.0, "10": 1110956544.0, "15": 1110956544.0, "20": 1809925632.0, "25": 1809925632.0, "30": 1809925632.0, "35": 1809925632.0, "40": 1809925632.0, "45": 1809925632.0, "50": 1809925632.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2286739456.0, "5": 2286740480.0, "10": 2286740480.0, "15": 2286740480.0, "20": 2983612416.0, "25": 2983612416.0, "30": 2983612416.0, "35": 2983612416.0, "40": 2983612416.0, "45": 2983612416.0, "50": 2983612416.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.15965, "5": 1.93164, "10": 2.35467, "15": 1.92211, "20": 1.96054, "25": 1.91619, "30": 1.92166, "35": 1.91436, "40": 1.91896, "45": 1.92099, "50": 1.92773}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 21908.0, "25": "nan", "30": 21225.0, "35": 23321.0, "40": 20665.0, "45": 34638.0, "50": 29484.0}}}
\ No newline at end of file
......@@ -21,8 +21,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -34,8 +34,8 @@ MODEL_ARGS:
--log-interval: 100
--save-interval: 2000
--eval-interval: 1000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--eval-iters: 10
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-timers-to-tensorboard: true
......
......@@ -67,7 +67,7 @@ class ModelMeta:
"""Basic information about a model.
Args:
format (str): 'mcore', 'megatron', 'meta', or 'hf'.
format (str): 'core', 'legacy', 'meta', or 'hf'.
mp (ModelParallelState): Defines TP, PP, EP.
transformer_impl (str): 'transformer_engine' or 'local'.
"""
......@@ -77,9 +77,9 @@ class ModelMeta:
if isinstance(mp, tuple):
mp = ModelParallelState(*mp)
if transformer_impl is None:
transformer_impl = "transformer_engine" if format == "mcore" else "local"
transformer_impl = "transformer_engine" if format == "core" else "local"
assert format in ("mcore", "megatron", "meta", "hf")
assert format in ("core", "legacy", "meta", "hf")
assert isinstance(mp, ModelParallelState)
assert transformer_impl in ("transformer_engine", "local")
......@@ -176,7 +176,7 @@ class Pipeline:
sys.argv.append("--exit-on-missing-checkpoint")
# Use legacy.
if meta.format == "megatron":
if meta.format == "legacy":
sys.argv.append("--use-legacy-models")
# Parse args.
......@@ -576,26 +576,26 @@ class GPTPipeline(Pipeline):
def get_gpt_pipelines():
"""Get GPT (non-MoE) pipelines."""
return [
GPTPipeline(("mcore", (8, 1)), ("mcore", (1, 8))),
GPTPipeline(("mcore", (4, 2)), ("mcore", (2, 4))),
GPTPipeline(("mcore", (2, 4)), ("mcore", (4, 2))),
GPTPipeline(("mcore", (1, 8)), ("mcore", (8, 1))),
GPTPipeline(("mcore", (4, 2)), ("mcore", (2, 4), "local")),
GPTPipeline(("megatron", (4, 2)), ("mcore", (2, 4))),
GPTPipeline(("mcore", (4, 2), "local"), ("mcore", (2, 4), "local")),
GPTPipeline(("mcore", (4, 2), "local"), ("mcore", (2, 4))),
# [todo] GPTPipeline(("megatron", (4, 2)), ("megatron", (2, 4))),
# [todo] GPTPipeline(("megatron", (4, 2), "te"), ("megatron", (2, 4), "te")),
# [todo] GPTPipeline("meta", "mcore", None, (8, 1)),
# [todo] GPTPipeline("hf", "mcore", None, (8, 1)),
GPTPipeline(("core", (8, 1)), ("core", (1, 8))),
GPTPipeline(("core", (4, 2)), ("core", (2, 4))),
GPTPipeline(("core", (2, 4)), ("core", (4, 2))),
GPTPipeline(("core", (1, 8)), ("core", (8, 1))),
GPTPipeline(("core", (4, 2)), ("core", (2, 4), "local")),
GPTPipeline(("legacy", (4, 2)), ("core", (2, 4))),
GPTPipeline(("core", (4, 2), "local"), ("core", (2, 4), "local")),
GPTPipeline(("core", (4, 2), "local"), ("core", (2, 4))),
# [todo] GPTPipeline(("legacy", (4, 2)), ("legacy", (2, 4))),
# [todo] GPTPipeline(("legacy", (4, 2), "te"), ("legacy", (2, 4), "te")),
# [todo] GPTPipeline("meta", "core", None, (8, 1)),
# [todo] GPTPipeline("hf", "core", None, (8, 1)),
]
def get_moe_pipelines():
"""Get MoE pipelines."""
return [
GPTPipeline(("mcore", (2, 1, 2)), ("mcore", (1, 4, 1)), num_moe_experts=8),
GPTPipeline(("mcore", (1, 4, 1)), ("mcore", (2, 1, 2)), num_moe_experts=4),
GPTPipeline(("core", (2, 1, 2)), ("core", (1, 4, 1)), num_moe_experts=8),
GPTPipeline(("core", (1, 4, 1)), ("core", (2, 1, 2)), num_moe_experts=4),
]
......@@ -605,7 +605,7 @@ def test_all_pipelines():
# Collect pipelines.
pipelines = [
*get_gpt_pipelines(),
# [todo] *get_moe_pipelines(), # todo: MoE support in loader_mcore.py.
# [todo] *get_moe_pipelines(), # todo: MoE support in loader_core.py.
# [todo] *get_bert_pipelines(),
# [todo] *get_t5_pipelines(),
]
......
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
SKIP_PYTEST: 1
MODEL_ARGS:
trainer.num_nodes: 1
trainer.devices: 8
trainer.max_steps: 50
trainer.val_check_interval: 50
trainer.limit_val_batches: 50
trainer.max_epochs: 'null'
trainer.precision: bf16
model.num_layers: 12
model.hidden_size: 768
model.num_attention_heads: 12
model.micro_batch_size: 1
model.global_batch_size: 8
model.tensor_model_parallel_size: 2
model.pipeline_model_parallel_size: 1
model.expert_model_parallel_size: 2
model.virtual_pipeline_model_parallel_size: 'null'
model.encoder_seq_length: 2048
model.max_position_embeddings: 2048
model.ffn_hidden_size: 3072
model.mcore_gpt: 'True'
model.apply_query_key_layer_scaling: 'True'
model.megatron_amp_O2: 'True'
model.data.data_prefix: '[]'
model.data.data_impl: mock
model.data.splits_string: '[99990,8,2]'
model.optim.name: mcore_distributed_optim
model.optim.weight_decay: 0.1
exp_manager.create_checkpoint_callback: 'False'
model.sequence_parallel: 'True'
model.overlap_p2p_comm: 'True'
model.batch_p2p_comm: 'False'
model.bias: 'False'
model.bias_activation_fusion: 'False'
++model.num_moe_experts: 8
++model.moe_grouped_gemm: 'True'
++model.moe_router_load_balancing_type: aux_loss
++model.moe_router_topk: 2
++model.moe_aux_loss_coeff: 1e-2
TEST_TYPE: regular
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment