Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755 (6 files)
......@@ -63,7 +63,8 @@ language_model:
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_router_topk_limited_devices: null
moe_router_group_topk: null
moe_router_num_groups: null
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
......
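The hunk above swaps the old `moe_router_topk_limited_devices` knob for `moe_router_group_topk` / `moe_router_num_groups`, i.e. group-limited routing: experts are split into `moe_router_num_groups` groups, each token is first restricted to its best `moe_router_group_topk` groups, and only then are the final `moe_router_topk` experts chosen. A minimal PyTorch sketch of that selection step (illustrative only, not Megatron-Core's router code; the shapes and default values here are assumptions):

```python
# Minimal sketch of group-limited top-k routing (illustrative only; not
# Megatron-Core's implementation). Experts are split into `num_groups` groups;
# each token first keeps its best `group_topk` groups, then the final top-k
# experts are picked only inside those groups.
import torch

def group_limited_topk(logits, topk=2, num_groups=8, group_topk=4):
    num_tokens, num_experts = logits.shape
    group_size = num_experts // num_groups
    # Score each group by its best expert logit for every token.
    group_scores = logits.view(num_tokens, num_groups, group_size).max(dim=-1).values
    top_groups = group_scores.topk(group_topk, dim=-1).indices
    # Mask out experts whose group was not selected.
    group_mask = torch.zeros_like(group_scores).scatter_(1, top_groups, 1.0)
    expert_mask = group_mask.unsqueeze(-1).expand(-1, -1, group_size).reshape(num_tokens, num_experts)
    masked_logits = logits.masked_fill(expert_mask == 0, float("-inf"))
    scores, indices = masked_logits.topk(topk, dim=-1)
    return torch.softmax(scores, dim=-1), indices

# 32 experts in 8 groups of 4; each token routes to 2 experts from its 4 best groups.
probs, experts = group_limited_topk(torch.randn(4, 32))
```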
......@@ -8,7 +8,7 @@ do
done
mpirun -np 8 --allow-run-as-root \
train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
......
......@@ -7,11 +7,11 @@ do
fi
done
mpirun -np 16 --hostfile mixtralnodes \
mpirun -np 512 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
train_gpt_567B_multinodes.sh node002 --profiling=$profiling > output.log 2>&1
wait
......
......@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
......@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
......@@ -96,7 +102,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
......@@ -104,18 +109,10 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
......@@ -157,10 +154,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
......
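For reference, the `--profiling=torch` path above simply enables Megatron's built-in PyTorch profiler for ranks 0-7 between step 3 and step 4 and writes the trace into the `--profile-dir` directory; the separate hipprof wrapper is removed by this commit. Roughly the same effect written as a standalone loop (a sketch around a generic `step_fn`, not Megatron's actual training loop):

```python
# Rough standalone equivalent of --use-pytorch-profiler with
# --profile-step-start 3 / --profile-step-end 4 / --profile-dir ...
# (a sketch around a generic `step_fn`, not Megatron's trainer code).
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

def train_with_torch_profiler(step_fn, num_iters=10, start=3, end=4, out_dir="torch_prof_gpt_1nodes"):
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # CUDA activity covers HIP on ROCm builds
        schedule=schedule(wait=start, warmup=0, active=end - start, repeat=1),
        on_trace_ready=tensorboard_trace_handler(out_dir),
    ) as prof:
        for it in range(num_iters):
            step_fn(it)      # one training iteration
            prof.step()      # advance the profiler schedule
```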
......@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
......@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
......@@ -49,7 +55,7 @@ MODEL_ARGS=(
--disable-bias-linear
--seq-length 8192
--max-position-embeddings 32768
--num-layers 64
--num-layers 32 #64
--hidden-size 8192
--ffn-hidden-size 32768
--num-attention-heads 64
......@@ -72,7 +78,7 @@ MOE_ARGS=(
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
#--moe-grouped-gemm
)
DATA_ARGS=(
......@@ -84,7 +90,7 @@ DATA_ARGS=(
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 4096
--global-batch-size 1024
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
......@@ -96,7 +102,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
......@@ -104,23 +109,16 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt
--profile-dir torch_prof_gpt_64nodes_tp2-pp16-ep16-ep_tp1-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 16
--expert-model-parallel-size 16
--expert-tensor-parallel-size 1
--context-parallel-size 2
--use-distributed-optimizer
--sequence-parallel
)
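Quick sanity check on the new multi-node layout: with `mpirun -np 512` and the parallel sizes above (tp=2, pp=16, cp=2), the remaining data-parallel dimension is 8, so the reduced `--global-batch-size 1024` with `--micro-batch-size 1` still implies 128 microbatches of gradient accumulation per iteration. The arithmetic, assuming the usual world = tp × cp × pp × dp decomposition (not output of the script itself):

```python
# Sanity-check arithmetic only, assuming world = tp * cp * pp * dp.
world_size = 512                     # mpirun -np 512 --hostfile hostfile_gpt_567B
tp, pp, cp = 2, 16, 2                # tensor / pipeline / context parallel sizes above
micro_batch, global_batch = 1, 1024  # from TRAINING_ARGS earlier in this script

dp = world_size // (tp * pp * cp)                       # -> 8 data-parallel replicas
num_microbatches = global_batch // (micro_batch * dp)   # -> 128 microbatches per iteration

print(f"dp={dp}, microbatches per step={num_microbatches}")
```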
......@@ -157,10 +155,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
......
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from pretrain_gpt import model_provider
import torch
import sys
import time
import tqdm
import warnings
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
from megatron.core.transformer.module import MegatronModule
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir, os.path.pardir)))
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from megatron.training import get_args
from megatron.training import get_tokenizer
......@@ -20,26 +31,42 @@ from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
from typing import List
import asyncio
from typing import AsyncIterator, List
def add_text_generate_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='text generation')
group.add_argument("--temperature", type=float, default=1.0,
help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1,
help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0,
help='Top p sampling.')
group.add_argument("--return-log-probs", action='store_true', default=False,
help='Return the log probabilities of the final output tokens')
group.add_argument("--num-tokens-to-generate", type=int, default=30,
help='Number of tokens to generate for each prompt')
group.add_argument("--prompts", metavar='N', type=str, nargs='+',
help='Input prompts with each prompt within quotes and seperated by space')
group.add_argument("--max-batch-size", type=int, default=1,
help='Max number of prompts to process at once')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument(
"--return-log-probs",
action='store_true',
default=False,
help='Return the log probabilities of the final output tokens',
)
group.add_argument(
"--num-tokens-to-generate",
type=int,
default=30,
help='Number of tokens to generate for each prompt',
)
group.add_argument(
"--prompts",
metavar='N',
type=str,
nargs='+',
help='Input prompts with each prompt within quotes and separated by space',
)
group.add_argument(
"--max-batch-size", type=int, default=8, dest="inference_max_requests",
help='Max number of prompts to process at once'
)
group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
return parser
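For readers unfamiliar with the flags registered above: `--temperature`, `--top_k` and `--top_p` shape a single sampling step roughly as in the sketch below (illustrative only, not the `TextGenerationController` implementation; the function name and values are made up to mirror the argument defaults):

```python
# Illustrative single-step sampler showing what the sampling flags mean
# (not Megatron's TextGenerationController implementation).
import torch

def sample_next_token(logits, temperature=1.0, top_k=1, top_p=0.0):
    logits = logits / max(temperature, 1e-6)
    if top_k > 0:
        # Keep only the k largest logits.
        kth = torch.topk(logits, top_k).values[..., -1, None]
        logits = logits.masked_fill(logits < kth, float("-inf"))
    if top_p > 0.0:
        # Nucleus sampling: keep the smallest prefix of tokens whose
        # cumulative probability exceeds top_p.
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()  # always keep the best token
        remove[..., 0] = False
        logits = logits.masked_fill(remove.scatter(-1, sorted_idx, remove), float("-inf"))
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)

next_token = sample_next_token(torch.randn(1, 32000), temperature=1.0, top_k=1)
```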
......@@ -62,23 +89,65 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size
padded_vocab_size=args.padded_vocab_size,
inference_max_requests=args.inference_max_requests,
inference_max_seq_length=args.inference_max_seq_length,
)
inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size)
return MCoreEngine(text_generation_controller=text_generation_controller)
async def generate(
inference_engine: MCoreEngine,
sampling_params: SamplingParams,
prompts: List[str],
) -> List[InferenceRequest]:
async def collect_stream(prompt, request_id, stream_generator):
print(f"Request {request_id}: {prompt}", end="", flush=True)
prev_idx = 0
async for output in stream_generator:
print(output.generated_text[prev_idx:], end="", flush=True)
prev_idx = len(output.generated_text)
print()
request_ids: List[str] = [
inference_engine.add_request(
prompt=prompt, inference_parameters=sampling_params, streaming=True
)
for prompt in prompts
]
stream_generators = [inference_engine.get_stream_generator(request_id) for request_id in request_ids]
tasks = [
asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
]
await inference_engine.run_engine_async()
await asyncio.gather(*tasks)
results: List[InferenceRequest] = [
inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
]
return results
def main():
"""Main program."""
# Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
# Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
initialize_megatron(
extra_args_provider=add_text_generate_args,
args_defaults={
'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True})
'exit_on_missing_checkpoint': True,
},
)
# Set up model and load checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
......@@ -94,12 +163,25 @@ def main():
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate)
num_tokens_to_generate=args.num_tokens_to_generate,
)
results: List[InferenceRequest] = inference_engine.generate(
if args.enable_cuda_graph:
print(f"Running warmup for CUDA graphs...")
inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params
)
start_time = time.perf_counter()
if args.stream:
results: List[InferenceRequest] = asyncio.run(generate(inference_engine, sampling_params, args.prompts))
else:
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params,
)
end_time = time.perf_counter()
latency = end_time - start_time
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
......@@ -107,9 +189,12 @@ def main():
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens' : result.generated_tokens
'generated_tokens': result.generated_tokens,
'latency': latency,
}
print(result)
torch.distributed.destroy_process_group()
if __name__ == "__main__":
main()
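The new `--stream` path in `generate()` follows a fan-out pattern: register every prompt, attach one consumer per request's stream generator, then drive the engine and `asyncio.gather` the consumers. A self-contained toy version of that pattern (queue-based stand-ins only, not the Megatron inference API):

```python
# Toy version of the fan-out pattern used by generate() above: one consumer
# per request drains its own async stream while a single driver coroutine
# produces the tokens. Queue-based stand-ins only; not the Megatron API.
import asyncio

async def fake_stream(queue):
    # Stand-in for the per-request stream generator returned by the engine.
    while (tok := await queue.get()) is not None:
        yield tok

async def collect(prompt, stream):
    async for tok in stream:
        print(f"{prompt} -> {tok}")

async def main():
    queues = {prompt: asyncio.Queue() for prompt in ("hello", "world")}

    async def run_engine():
        # Stand-in for run_engine_async(): emit a few tokens, then close each stream.
        for step in range(3):
            for q in queues.values():
                await q.put(f"tok{step}")
        for q in queues.values():
            await q.put(None)

    consumers = [asyncio.create_task(collect(p, fake_stream(q))) for p, q in queues.items()]
    await run_engine()
    await asyncio.gather(*consumers)

asyncio.run(main())
```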