Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
@@ -63,7 +63,8 @@ language_model:
   # MoE related
   moe_router_load_balancing_type: "aux_loss"
   moe_router_topk: 2
-  moe_router_topk_limited_devices: null
+  moe_router_group_topk: null
+  moe_router_num_groups: null
   moe_grouped_gemm: False
   moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
   moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
...
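Note on the renamed keys above: moe_router_topk_limited_devices is superseded by group-limited routing, where the experts are split into moe_router_num_groups groups and each token may only be routed to experts inside its top moe_router_group_topk groups; leaving both null keeps plain top-k routing. A minimal sketch of enabling it from the training scripts below, assuming the CLI flags mirror these YAML key names (the values 8 and 4 are illustrative, not from this commit):

# hypothetical group-limited routing flags, mirroring the YAML keys above
MOE_ARGS+=(
    --moe-router-num-groups 8    # split the experts into 8 groups
    --moe-router-group-topk 4    # route each token only within its top-4 groups
)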
@@ -8,7 +8,7 @@ do
 done
 mpirun -np 8 --allow-run-as-root \
-    train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
 wait
...
@@ -7,11 +7,11 @@ do
     fi
 done
-mpirun -np 16 --hostfile mixtralnodes \
+mpirun -np 512 --hostfile hostfile_gpt_567B \
     --allow-run-as-root \
     --bind-to none \
     --mca plm_rsh_no_tree_spawn 1 \
-    train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_multinodes.sh node002 --profiling=$profiling > output.log 2>&1
 wait
...
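For reference, the launch above now uses 512 MPI ranks via --hostfile hostfile_gpt_567B, i.e. 64 nodes with 8 GPUs each. A minimal sketch of such an Open MPI hostfile, with placeholder hostnames that are not taken from this commit:

# hostfile_gpt_567B: one line per node, 8 slots (GPUs) per node; 64 nodes x 8 = 512 ranks
node001 slots=8
node002 slots=8
# ... nodes 003 through 063 ...
node064 slots=8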
@@ -4,18 +4,23 @@ for para in $*
 do
     if [[ $para == --profiling* ]];then
         profiling=${para#*=}
+        export GPU_FLUSH_ON_EXECUTION=1
+        export HIP_DIRECT_DISPATCH=0
     fi
 done
+# Runs GPT 567B model
 source /opt/dtk/env.sh
-# Runs Mixtral 8x7B model
+# defauat env
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
+export GLOG_minloglevel=3
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
 export GPU_MAX_HW_QUEUES=10
+# nccl env
 export NCCL_ALGO=Ring
 export NCCL_MIN_NCHANNELS=32
 export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+# enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-export GLOG_minloglevel=3
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
     --bf16
     --overlap-param-gather
     --overlap-grad-reduce
-    #--tp-comm-overlap
 )

 TORCH_PROFIE_ARGS=(
@@ -104,18 +109,10 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 4 5 6 7
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_gpt_1nodes
+    --profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1
     --use-pytorch-profiler
 )

-HIP_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 4
-    --profile-step-end 5
-    --use-hip-profiler
-)

 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 1
@@ -157,10 +154,6 @@ APP="python3 -u pretrain_gpt.py \
 if [[ $profiling == "torch" ]]; then
     APP+=" ${TORCH_PROFIE_ARGS[@]}"
-elif [[ $profiling == "hip" ]]; then
-    mkdir -p hip_prof_data
-    APP+=" ${HIP_PROFIE_ARGS[@]}"
-    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi

 #for hygon cpu
...
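As a rough sanity check of the single-node layout implied by the new profile directory name (tp2-pp1-ep8-ep_tp1) and the parallel sizes above, assuming no context parallelism on one node:

WORLD_SIZE=8                      # mpirun -np 8 in the single-node launcher
TP=2; PP=1                        # --tensor/--pipeline-model-parallel-size above
DP=$(( WORLD_SIZE / (TP * PP) ))  # 8 / 2 = 4 data-parallel replicas
echo "DP=${DP}"                   # prints DP=4; EP=8 (per the profile-dir name) spans all 8 GPUs with expert TP = 1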
@@ -4,18 +4,23 @@ for para in $*
 do
     if [[ $para == --profiling* ]];then
         profiling=${para#*=}
+        export GPU_FLUSH_ON_EXECUTION=1
+        export HIP_DIRECT_DISPATCH=0
     fi
 done
+# Runs GPT 567B model
 source /opt/dtk/env.sh
-# Runs Mixtral 8x7B model
+# defauat env
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
+export GLOG_minloglevel=3
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
 export GPU_MAX_HW_QUEUES=10
+# nccl env
 export NCCL_ALGO=Ring
 export NCCL_MIN_NCHANNELS=32
 export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
+# enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-export GLOG_minloglevel=3
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -49,7 +55,7 @@ MODEL_ARGS=(
     --disable-bias-linear
     --seq-length 8192
     --max-position-embeddings 32768
-    --num-layers 64
+    --num-layers 32 #64
     --hidden-size 8192
     --ffn-hidden-size 32768
     --num-attention-heads 64
@@ -72,7 +78,7 @@ MOE_ARGS=(
     --moe-token-dispatcher-type alltoall
     --moe-expert-capacity-factor 0.5
     --moe-pad-expert-input-to-capacity
-    --moe-grouped-gemm
+    #--moe-grouped-gemm
 )

 DATA_ARGS=(
@@ -84,7 +90,7 @@ DATA_ARGS=(
 TRAINING_ARGS=(
     --micro-batch-size 1
-    --global-batch-size 4096
+    --global-batch-size 1024
     --lr 1e-4
     --train-iters 10
     --lr-decay-iters 320000
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
     --bf16
     --overlap-param-gather
     --overlap-grad-reduce
-    #--tp-comm-overlap
 )

 TORCH_PROFIE_ARGS=(
@@ -104,23 +109,16 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 4 5 6 7
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_gpt
+    --profile-dir torch_prof_gpt_64nodes_tp2-pp16-ep16-ep_tp1-cp2
     --use-pytorch-profiler
 )

-HIP_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 4
-    --profile-step-end 5
-    --use-hip-profiler
-)

 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 16
     --expert-model-parallel-size 16
     --expert-tensor-parallel-size 1
+    --context-parallel-size 2
     --use-distributed-optimizer
     --sequence-parallel
 )
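As a sanity check, the 512 ranks requested by the multi-node launcher are consistent with the parallel sizes above (Megatron derives the data-parallel size from the remaining factors):

WORLD_SIZE=512                         # mpirun -np 512, i.e. 64 nodes x 8 GPUs
TP=2; PP=16; CP=2                      # tensor / pipeline / context parallel sizes above
DP=$(( WORLD_SIZE / (TP * PP * CP) ))  # 512 / 64 = 8 data-parallel replicas
echo "DP=${DP}"                        # prints DP=8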
@@ -157,10 +155,6 @@ APP="python3 -u pretrain_gpt.py \
 if [[ $profiling == "torch" ]]; then
     APP+=" ${TORCH_PROFIE_ARGS[@]}"
-elif [[ $profiling == "hip" ]]; then
-    mkdir -p hip_prof_data
-    APP+=" ${HIP_PROFIE_ARGS[@]}"
-    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi

 #for hygon cpu
...
 import os
-from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
+from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
+    InferenceWrapperConfig,
+)
 from pretrain_gpt import model_provider
 import torch
 import sys
+import time
+import tqdm
+import warnings
 from argparse import Namespace
 from megatron.core.inference.engines.abstract_engine import AbstractEngine
 from megatron.core.inference.engines.mcore_engine import MCoreEngine
 from megatron.core.inference.sampling_params import SamplingParams
-from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
+from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
+    GPTInferenceWrapper,
+)
 from megatron.core.inference.inference_request import InferenceRequest
-from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
+from megatron.core.inference.text_generation_controllers.text_generation_controller import (
+    TextGenerationController,
+)
 from megatron.core.transformer.module import MegatronModule

-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                os.path.pardir, os.path.pardir)))
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)

 from megatron.training import get_args
 from megatron.training import get_tokenizer
@@ -20,26 +31,42 @@ from megatron.training.checkpointing import load_checkpoint
 from megatron.core import mpu
 from megatron.training.initialize import initialize_megatron
 from megatron.training import get_model
-from typing import List
+import asyncio
+from typing import AsyncIterator, List


 def add_text_generate_args(parser):
     """Text generation arguments."""
     group = parser.add_argument_group(title='text generation')
-    group.add_argument("--temperature", type=float, default=1.0,
-                       help='Sampling temperature.')
-    group.add_argument("--top_k", type=int, default=1,
-                       help='Top k sampling.')
-    group.add_argument("--top_p", type=float, default=0.0,
-                       help='Top p sampling.')
-    group.add_argument("--return-log-probs", action='store_true', default=False,
-                       help='Return the log probabilities of the final output tokens')
-    group.add_argument("--num-tokens-to-generate", type=int, default=30,
-                       help='Number of tokens to generate for each prompt')
-    group.add_argument("--prompts", metavar='N', type=str, nargs='+',
-                       help='Input prompts with each prompt within quotes and seperated by space')
-    group.add_argument("--max-batch-size", type=int, default=1,
-                       help='Max number of prompts to process at once')
+    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
+    group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
+    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
+    group.add_argument(
+        "--return-log-probs",
+        action='store_true',
+        default=False,
+        help='Return the log probabilities of the final output tokens',
+    )
+    group.add_argument(
+        "--num-tokens-to-generate",
+        type=int,
+        default=30,
+        help='Number of tokens to generate for each prompt',
+    )
+    group.add_argument(
+        "--prompts",
+        metavar='N',
+        type=str,
+        nargs='+',
+        help='Input prompts with each prompt within quotes and seperated by space',
+    )
+    group.add_argument(
+        "--max-batch-size", type=int, default=8, dest="inference_max_requests",
+        help='Max number of prompts to process at once'
+    )
+    group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")

     return parser
@@ -62,23 +89,65 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi
         inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
         fp32_residual_connection=args.fp32_residual_connection,
         params_dtype=args.params_dtype,
-        padded_vocab_size=args.padded_vocab_size
+        padded_vocab_size=args.padded_vocab_size,
+        inference_max_requests=args.inference_max_requests,
+        inference_max_seq_length=args.inference_max_seq_length,
     )

     inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
     text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
-    return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size)
+    return MCoreEngine(text_generation_controller=text_generation_controller)
+
+
+async def generate(
+    inference_engine: MCoreEngine,
+    sampling_params: SamplingParams,
+    prompts: List[str],
+) -> List[InferenceRequest]:
+    async def collect_stream(prompt, request_id, stream_generator):
+        print(f"Request {request_id}: {prompt}", end="", flush=True)
+        prev_idx = 0
+        async for output in stream_generator:
+            print(output.generated_text[prev_idx:], end="", flush=True)
+            prev_idx = len(output.generated_text)
+        print()
+
+    request_ids: List[str] = [
+        inference_engine.add_request(
+            prompt=prompt, inference_parameters=sampling_params, streaming=True
+        )
+        for prompt in prompts
+    ]
+    stream_generators = [inference_engine.get_stream_generator(request_id) for request_id in request_ids]
+    tasks = [
+        asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
+        for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
+    ]
+    await inference_engine.run_engine_async()
+    await asyncio.gather(*tasks)
+    results: List[InferenceRequest] = [
+        inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
+    ]
+    return results
 def main():
     """Main program."""

     # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
     # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
-    initialize_megatron(extra_args_provider=add_text_generate_args,
-                        args_defaults={'no_load_rng': True,
-                                       'no_load_optim': True,
-                                       'micro_batch_size': 1,
-                                       'exit_on_missing_checkpoint': True})
+    initialize_megatron(
+        extra_args_provider=add_text_generate_args,
+        args_defaults={
+            'no_load_rng': True,
+            'no_load_optim': True,
+            'micro_batch_size': 1,
+            'exit_on_missing_checkpoint': True,
+        },
+    )

     # Set up model and load checkpoint
     model = get_model(model_provider, wrap_with_ddp=False)
@@ -94,12 +163,25 @@ def main():
         top_k=args.top_k,
         top_p=args.top_p,
         return_log_probs=args.return_log_probs,
-        num_tokens_to_generate=args.num_tokens_to_generate)
-
-    results: List[InferenceRequest] = inference_engine.generate(
-        prompts=args.prompts, sampling_params=sampling_params
-    )
+        num_tokens_to_generate=args.num_tokens_to_generate,
+    )
+
+    if args.enable_cuda_graph:
+        print(f"Running warmup for CUDA graphs...")
+        inference_engine.generate(
+            prompts=args.prompts, sampling_params=sampling_params
+        )
+
+    start_time = time.perf_counter()
+    if args.stream:
+        results: List[InferenceRequest] = asyncio.run(generate(inference_engine, sampling_params, args.prompts))
+    else:
+        results: List[InferenceRequest] = inference_engine.generate(
+            prompts=args.prompts, sampling_params=sampling_params,
+        )
+    end_time = time.perf_counter()
+    latency = end_time - start_time

     if torch.distributed.get_rank() == 0:
         for idx, result in enumerate(results):
             print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
@@ -107,9 +189,12 @@ def main():
                 'id': result.request_id,
                 'input_prompt': result.prompt,
                 'generated_text': result.generated_text,
-                'generated_tokens' : result.generated_tokens
+                'generated_tokens': result.generated_tokens,
+                'latency': latency,
             }
             print(result)

+    torch.distributed.destroy_process_group()

 if __name__ == "__main__":
     main()
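A hypothetical invocation of the updated inference script, exercising the new --stream path and the repurposed --max-batch-size (which now feeds inference_max_requests). The script name and checkpoint path are placeholders, and the usual model/tokenizer/parallelism flags are omitted; only the generation flags come from the diff above:

# placeholder script name and checkpoint path; add the usual model/tokenizer/parallelism flags
torchrun --nproc-per-node 8 gpt_batch_inference.py \
    --load /path/to/checkpoint \
    --max-batch-size 8 \
    --num-tokens-to-generate 64 \
    --temperature 1.0 --top_k 1 \
    --prompts "Hello, my name is" "The capital of France is" \
    --stream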