Unverified commit e76bb324 authored by thatPepe, committed by GitHub

Merge pull request #238 from InfiniTensor/issue/237

issue/237 - support hygon in bench and inf server
parents 39b594ff 8f71a5ec
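In outline, the change follows the pattern already used for the other backends: each accelerator gets a boolean CLI switch, and the Hygon flag resolves to the "cuda" device string, as the hunks below show for the bench script and the inference server. A minimal sketch of that pattern (flag names taken from the diff; everything else simplified for illustration):

```python
import argparse

# Simplified illustration of the device-flag handling this PR extends;
# the real scripts define many more flags and pass device_str to the runtime.
parser = argparse.ArgumentParser()
parser.add_argument("--nvidia", action="store_true")
parser.add_argument("--hygon", action="store_true", help="Run hygon test")
args = parser.parse_args(["--hygon"])

if args.nvidia:
    device_str = "cuda"
elif args.hygon:
    device_str = "cuda"  # Hygon DCUs go through the CUDA-compatible code path
else:
    device_str = "cpu"

print(device_str)  # -> cuda
```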
@@ -15,7 +15,7 @@ xmake && xmake install
 - Run the model inference test
 ```bash
-python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
+python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device]
 ```
 - Deploy the model inference service
@@ -77,11 +77,11 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 - Single inference test
 - llama example
 ```bash
-python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path=<path/to/model_dir>
+python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=<path/to/model_dir>
 ```
 - For example:
 ```bash
-python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
+python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
 ```
 - Distributed inference test
 - 9g example
@@ -119,7 +119,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 - Run the inference benchmarks (C-Eval/MMLU)
 ```bash
-python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
+python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
 ```
 - Parameter description:
@@ -167,6 +167,11 @@ def get_args():
         action="store_true",
         help="Run alippu test",
     )
+    parser.add_argument(
+        "--hygon",
+        action="store_true",
+        help="Run hygon test",
+    )
     parser.add_argument(
         "--model",
         type=str,
@@ -387,6 +392,8 @@ if __name__ == "__main__":
         device_str = "mlu"
     elif args.ali:
         device_str = "cuda"
+    elif args.hygon:
+        device_str = "cuda"
     else:
         print(
             "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
@@ -594,6 +594,7 @@ def parse_args():
     parser.add_argument("--iluvatar", action="store_true", help="Use Iluvatar device")
     parser.add_argument("--cambricon", action="store_true", help="Use Cambricon device")
     parser.add_argument("--ali", action="store_true", help="Use Ali PPU device")
+    parser.add_argument("--hygon", action="store_true", help="Use Hygon DCU device")
     parser.add_argument(
         "--enable-graph",
         action="store_true",
@@ -631,9 +632,11 @@ def main():
         device = "mlu"
     elif args.ali:
         device = "cuda"
+    elif args.hygon:
+        device = "cuda"
     else:
         print(
-            "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali] "
+            "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] "
            "--model_path=<path/to/model_dir> --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH_SIZE"
            "\n"
            "Example: python infinilm.server.inference_server --nvidia --model_path=/data/shared/models/9G7B_MHA/ "
@@ -73,6 +73,7 @@ class InfiniLMBenchmark(BaseBenchmark):
             "iluvatar": "cuda",
             "kunlun": "cuda",
             "hygon": "cuda",
+            "ali": "cuda",
         }
         device_name = device_map.get(device_type_str.lower(), "cpu")
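The device_map that gains the "ali" entry is a plain lookup table from the user-facing device type to the runtime device name, with a "cpu" fallback for unknown types. A standalone sketch of that behavior (only the entries visible in this diff are shown; the full map in the repository has more, and the lookup is done inline rather than via a helper):

```python
# Fragment of the mapping as shown in the diff; additional backends exist upstream.
device_map = {
    "iluvatar": "cuda",
    "kunlun": "cuda",
    "hygon": "cuda",
    "ali": "cuda",
}

def resolve_device(device_type_str: str) -> str:
    # Unknown or unlisted device types fall back to the CPU backend.
    return device_map.get(device_type_str.lower(), "cpu")

print(resolve_device("Hygon"))    # -> cuda
print(resolve_device("ali"))      # -> cuda
print(resolve_device("unknown"))  # -> cpu
```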
@@ -184,11 +185,17 @@ class InfiniLMBenchmark(BaseBenchmark):
         start_time = time.perf_counter()
         # For cpp backend, reset cache before generation if use_cache is enabled
-        if self.model.use_cache and hasattr(self.model, "_model") and hasattr(self.model._model, "reset_cache"):
+        if (
+            self.model.use_cache
+            and hasattr(self.model, "_model")
+            and hasattr(self.model._model, "reset_cache")
+        ):
             batch_size = input_ids.shape[0]
             seq_len = input_ids.shape[1]
             max_cache_len = max_steps + seq_len
-            self.model.reset_cache(batch_size=batch_size, initial_capacity=max_cache_len)
+            self.model.reset_cache(
+                batch_size=batch_size, initial_capacity=max_cache_len
+            )
         # Use model's built-in generate() method which properly handles KV cache
         # Pass sampling parameters (temperature, topk, topp) via kwargs
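The cache sizing in the block above simply reserves capacity for the prompt plus every token that may be generated. For instance (numbers illustrative):

```python
seq_len = 128     # prompt length, input_ids.shape[1]
max_steps = 256   # maximum new tokens to generate
max_cache_len = max_steps + seq_len
print(max_cache_len)  # -> 384, requested as the initial KV-cache capacity
```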
@@ -656,7 +663,7 @@ def test():
     # Parse arguments manually to handle device flags properly
     if len(sys.argv) < 4:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
@@ -739,9 +746,11 @@ def test():
         device_type_str = "kunlun"
     elif device_flag == "--hygon":
         device_type_str = "hygon"
+    elif device_flag == "--ali":
+        device_type_str = "ali"
     else:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
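Combining the new device flags with the usage string above, an illustrative benchmark invocation (model path and options are placeholders):

```bash
python test/bench/test_benchmark.py --hygon /path/to/model_dir --bench ceval --backend cpp --ndev 1
```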
@@ -935,9 +944,7 @@ def test():
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"]
-            if split == "val"
-            else ["validation", "test"]
+            else ["validation"] if split == "val" else ["validation", "test"]
         )
         # Load each subject individually from hardcoded list, excluding "all"
         for subject_name in mmlu_subjects:
@@ -959,9 +966,7 @@ def test():
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"]
-            if split == "val"
-            else ["validation", "test"]
+            else ["validation"] if split == "val" else ["validation", "test"]
         )
         records = []
         for sp in splits_to_load:
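The reflowed conditionals above are purely a formatting change; the split selection they encode is: "test" loads only the test split, "val" only the validation split, and anything else loads both. A tiny standalone check of that logic (written as a helper function only for illustration):

```python
def splits_for(split: str) -> list[str]:
    # Same chained conditional expression as in the diff.
    return (
        ["test"]
        if split == "test"
        else ["validation"] if split == "val" else ["validation", "test"]
    )

assert splits_for("test") == ["test"]
assert splits_for("val") == ["validation"]
assert splits_for("all") == ["validation", "test"]
```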