Unverified commit e76bb324 authored by thatPepe, committed by GitHub

Merge pull request #238 from InfiniTensor/issue/237

issue/237 - support hygon in bench and inf server
parents 39b594ff 8f71a5ec
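In outline, the change follows the pattern already used for the other backends: each accelerator gets a boolean CLI switch, and the Hygon flag resolves to the "cuda" device string, as the hunks below show for the bench script and the inference server. A minimal sketch of that pattern (flag names taken from the diff; everything else simplified for illustration):

```python
import argparse

# Simplified illustration of the device-flag handling this PR extends;
# the real scripts define many more flags and pass device_str to the runtime.
parser = argparse.ArgumentParser()
parser.add_argument("--nvidia", action="store_true")
parser.add_argument("--hygon", action="store_true", help="Run hygon test")
args = parser.parse_args(["--hygon"])

if args.nvidia:
    device_str = "cuda"
elif args.hygon:
    device_str = "cuda"  # Hygon DCUs go through the CUDA-compatible code path
else:
    device_str = "cpu"

print(device_str)  # -> cuda
```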
@@ -15,7 +15,7 @@ xmake && xmake install
 - Run the model inference test
 ```bash
-python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
+python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device]
 ```
 - Deploy the model inference service
@@ -77,11 +77,11 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 - Single inference test
 - llama example
 ```bash
-python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path=<path/to/model_dir>
+python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=<path/to/model_dir>
 ```
 - For example:
 ```bash
-python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
+python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
 ```
 - Distributed inference test
 - 9g example
@@ -119,7 +119,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 - Run the inference benchmarks (C-Eval/MMLU)
 ```bash
-python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
+python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
 ```
 - Parameter description:
@@ -167,6 +167,11 @@ def get_args():
         action="store_true",
         help="Run alippu test",
     )
+    parser.add_argument(
+        "--hygon",
+        action="store_true",
+        help="Run hygon test",
+    )
     parser.add_argument(
         "--model",
         type=str,
@@ -387,6 +392,8 @@ if __name__ == "__main__":
         device_str = "mlu"
     elif args.ali:
         device_str = "cuda"
+    elif args.hygon:
+        device_str = "cuda"
     else:
         print(
             "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
@@ -594,6 +594,7 @@ def parse_args():
     parser.add_argument("--iluvatar", action="store_true", help="Use Iluvatar device")
     parser.add_argument("--cambricon", action="store_true", help="Use Cambricon device")
     parser.add_argument("--ali", action="store_true", help="Use Ali PPU device")
+    parser.add_argument("--hygon", action="store_true", help="Use Hygon DCU device")
     parser.add_argument(
         "--enable-graph",
         action="store_true",
@@ -631,9 +632,11 @@ def main():
         device = "mlu"
     elif args.ali:
         device = "cuda"
+    elif args.hygon:
+        device = "cuda"
     else:
         print(
-            "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali] "
+            "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] "
            "--model_path=<path/to/model_dir> --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH_SIZE"
            "\n"
            "Example: python infinilm.server.inference_server --nvidia --model_path=/data/shared/models/9G7B_MHA/ "
@@ -73,6 +73,7 @@ class InfiniLMBenchmark(BaseBenchmark):
             "iluvatar": "cuda",
             "kunlun": "cuda",
             "hygon": "cuda",
+            "ali": "cuda",
         }
         device_name = device_map.get(device_type_str.lower(), "cpu")
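The device_map that gains the "ali" entry is a plain lookup table from the user-facing device type to the runtime device name, with a "cpu" fallback for unknown types. A standalone sketch of that behavior (only the entries visible in this diff are shown; the full map in the repository has more, and the lookup is done inline rather than via a helper):

```python
# Fragment of the mapping as shown in the diff; additional backends exist upstream.
device_map = {
    "iluvatar": "cuda",
    "kunlun": "cuda",
    "hygon": "cuda",
    "ali": "cuda",
}

def resolve_device(device_type_str: str) -> str:
    # Unknown or unlisted device types fall back to the CPU backend.
    return device_map.get(device_type_str.lower(), "cpu")

print(resolve_device("Hygon"))    # -> cuda
print(resolve_device("ali"))      # -> cuda
print(resolve_device("unknown"))  # -> cpu
```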
@@ -184,11 +185,17 @@ class InfiniLMBenchmark(BaseBenchmark):
         start_time = time.perf_counter()
         # For cpp backend, reset cache before generation if use_cache is enabled
-        if self.model.use_cache and hasattr(self.model, "_model") and hasattr(self.model._model, "reset_cache"):
+        if (
+            self.model.use_cache
+            and hasattr(self.model, "_model")
+            and hasattr(self.model._model, "reset_cache")
+        ):
             batch_size = input_ids.shape[0]
             seq_len = input_ids.shape[1]
             max_cache_len = max_steps + seq_len
-            self.model.reset_cache(batch_size=batch_size, initial_capacity=max_cache_len)
+            self.model.reset_cache(
+                batch_size=batch_size, initial_capacity=max_cache_len
+            )
         # Use model's built-in generate() method which properly handles KV cache
         # Pass sampling parameters (temperature, topk, topp) via kwargs
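The cache sizing in the block above simply reserves capacity for the prompt plus every token that may be generated. For instance (numbers illustrative):

```python
seq_len = 128     # prompt length, input_ids.shape[1]
max_steps = 256   # maximum new tokens to generate
max_cache_len = max_steps + seq_len
print(max_cache_len)  # -> 384, requested as the initial KV-cache capacity
```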
@@ -656,7 +663,7 @@ def test():
     # Parse arguments manually to handle device flags properly
     if len(sys.argv) < 4:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
@@ -739,9 +746,11 @@ def test():
         device_type_str = "kunlun"
     elif device_flag == "--hygon":
         device_type_str = "hygon"
+    elif device_flag == "--ali":
+        device_type_str = "ali"
     else:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
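Combining the new device flags with the usage string above, an illustrative benchmark invocation (model path and options are placeholders):

```bash
python test/bench/test_benchmark.py --hygon /path/to/model_dir --bench ceval --backend cpp --ndev 1
```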
@@ -935,9 +944,7 @@ def test():
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"]
-            if split == "val"
-            else ["validation", "test"]
+            else ["validation"] if split == "val" else ["validation", "test"]
         )
         # Load each subject individually from hardcoded list, excluding "all"
         for subject_name in mmlu_subjects:
@@ -959,9 +966,7 @@ def test():
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"]
-            if split == "val"
-            else ["validation", "test"]
+            else ["validation"] if split == "val" else ["validation", "test"]
         )
         records = []
         for sp in splits_to_load:
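The reflowed conditionals above are purely a formatting change; the split selection they encode is: "test" loads only the test split, "val" only the validation split, and anything else loads both. A tiny standalone check of that logic (written as a helper function only for illustration):

```python
def splits_for(split: str) -> list[str]:
    # Same chained conditional expression as in the diff.
    return (
        ["test"]
        if split == "test"
        else ["validation"] if split == "val" else ["validation", "test"]
    )

assert splits_for("test") == ["test"]
assert splits_for("val") == ["validation"]
assert splits_for("all") == ["validation", "test"]
```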