"...git@developer.sourcefind.cn:OpenDAS/openpcdet.git" did not exist on "d5655a41616a0a1bcd0aff5b9a3a5e6975a4bb1d"
Commit c71ac7cc authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #2658 canceled with stages
# Use the official SourceFind base image
FROM 2.4.1-ubuntu22.04-dtk25.04-py3.10:vllm0.6.2
# Create the target directory
RUN mkdir -p /workspace/test/results
# Set the working directory to /workspace/test/results so results can be mounted from the host
WORKDIR /workspace/test/results
# Copy run.sh and the benchmark files from the host into /workspace/test in the container
COPY ./run.sh /workspace/test
COPY ./benchmark_throughput_0.6.2.py /workspace/test
COPY ./topo.xml /workspace/test
COPY ./models-to-test.cfg /workspace/test
# Make sure run.sh is executable
RUN chmod +x /workspace/test/run.sh
# Command to run when the container starts
# A single CMD collects system info and then runs the benchmark
CMD bash -c "\
rocm-bandwidth-test > rocm-bandwidth-test.txt && \
hy-smi > hy-smi.txt && \
hy-smi -c > hy-smi-c.txt && \
pip list > pip-list.txt && \
lscpu > lscpu.txt && \
bash /workspace/test/run.sh > test.log 2>&1"
"""Benchmark offline inference throughput."""
import argparse
import json
import random
import time
from typing import Dict, List, Optional, Tuple
import numpy as np
import torch
import uvloop
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from vllm.inputs import PromptInputs
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
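"""Sample up to ``num_requests`` (prompt, prompt_len, output_len) tuples from a
ShareGPT-style JSON dataset, pruning prompts/completions that are too short or too long."""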
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
return filtered_dataset
def run_vllm(
warmup_requests: List[Tuple[str, int, int]],
requests_json: Dict[str, List[Tuple[str, int, int]]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
use_new_beam_search_impl: bool = False,
) -> Dict[str, dict]:
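"""Benchmark the synchronous vLLM LLM class: warm up, then sweep every
(num_prompts, input_len, output_len) combination and return per-combination
metrics keyed as "{num_prompts}_{input_len}_{output_len}".

Note: also reads args.num_iters_warmup / args.num_prompts / args.input_len /
args.output_len from the global argparse namespace.
"""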
from vllm import LLM, SamplingParams
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
)
# warmup
warmup_prompts = []
warmup_sampling_params = []
for prompt, _, output_len in warmup_requests:
warmup_prompts.append(prompt)
warmup_sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
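# Sweep every (num_prompts, input_len, output_len) combination; results for each
# combination are collected under the key "{num_prompts}_{input_len}_{output_len}".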
info_json={}
for ELEprompt in args.num_prompts:
for ELEinput,ELEoutput in zip(args.input_len,args.output_len):
info={}
requests=requests_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)]
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
if not use_new_beam_search_impl:
start = time.perf_counter()
real_output = llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
else:
assert use_beam_search
prompts = [prompt for prompt, _, _ in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
for prompt, input_len, _output_len in requests:
assert _output_len == output_len
start = time.perf_counter()
llm.beam_search(prompts,
beam_width=n,
max_tokens=output_len,
ignore_eos=True)
end = time.perf_counter()
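# NOTE: the per-request metrics below read output.metrics from llm.generate(),
# so they are only meaningful on the default (non-beam-search) path.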
total_ttfts = []
total_tpops = []
total_output_token_throughput = []
total_inout_token_throughput = []
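# TTFT = first_token_time - arrival_time; TPOP = remaining decode time averaged
# over the other (output_len - 1) tokens; throughputs are per request over its
# total wall-clock time.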
for output in real_output:
ttft_ = output.metrics.first_token_time - output.metrics.arrival_time
tpop_ = (output.metrics.finished_time - output.metrics.arrival_time - ttft_) / (ELEoutput-1)
output_token_throughput = (ELEoutput) / (output.metrics.finished_time - output.metrics.arrival_time)
inout_token_throughput = (ELEoutput + ELEinput) / (output.metrics.finished_time - output.metrics.arrival_time)
total_ttfts.append(ttft_)
total_tpops.append(tpop_)
total_output_token_throughput.append(output_token_throughput)
total_inout_token_throughput.append(inout_token_throughput)
# total_num_tokens = sum(request.prompt_len + request.expected_output_len
# for request in requests)
# total_output_tokens = sum(request.expected_output_len
# for request in requests)
total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len in requests)
total_output_tokens = sum(output_len
for _, prompt_len, output_len in requests)
info["elapsed_time"] = np.around(end - start,2)
info["Throughput"] = np.around(len(requests) / info['elapsed_time'],2)
info["total_tokens"] = np.around(total_num_tokens / info['elapsed_time'],2)
info["output_tokens"] = np.around(total_output_tokens / info['elapsed_time'],2)
info["ttft_mean"] = np.around(np.mean(total_ttfts),5)
info["ttft_median"] = np.around(np.median(total_ttfts or 0),5)
info["ttft_p99"] = np.around(np.percentile(total_ttfts or 0, 99),5)
info["tpop_mean"] = np.around(np.mean(total_tpops),4)
info["tpop_median"] = np.around(np.median(total_tpops or 0),5)
info["tpop_p99"] = np.around(np.percentile(total_tpops or 0, 99),5)
info["output_token_throughput_mean"] = np.around(np.mean(total_output_token_throughput),2)
info["output_token_throughput_median"] = np.around(np.median(total_output_token_throughput or 0),2)
info["output_token_throughput_p99"] = np.around(np.percentile(total_output_token_throughput or 0, 99),2)
info["inout_token_throughput_mean"] = np.around(np.mean(total_inout_token_throughput),2)
info["inout_token_throughput_median"] = np.around(np.median(total_inout_token_throughput or 0),2)
info["inout_token_throughput_p99"] = np.around(np.percentile(total_inout_token_throughput or 0, 99),2)
info_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)] = info
print("promt:{},input:{},output:{}".format(ELEprompt,ELEinput,ELEoutput))
print(f"Latency: {info['elapsed_time']:.2f} s")
print(f"Throughput: {len(requests) / info['elapsed_time']:.2f} requests/s, "
f"{total_num_tokens / info['elapsed_time']:.2f} total tokens/s, "
f"{total_output_tokens / info['elapsed_time']:.2f} output tokens/s")
print("==============================================")
print(f"total_out_tokens: {total_output_tokens: .2f} tokens")
print(f"elapsed_time: {info['elapsed_time']: .2f} s") # 总耗时
print(f"TTFT_mean: {info['ttft_mean']: .5f} s") # 首字延时
print(f"ttft_p99: {info['ttft_p99']: .5f} s")
print(f"ttft_median: {info['ttft_median']: .5f} s")
print(f"TPOP_mean: {info['tpop_mean']: .5f} s") # 单字decode时间
print(f"tpop_median: {info['tpop_median']: .5f} s")
print(f"tpop_p99: {info['tpop_p99']: .5f} s")
print(f"output_token_throughput_mean: {info['output_token_throughput_mean']:.2f} tokens/s") # 单路生成吞吐
print(f"output_token_throughput_median: {info['output_token_throughput_median']:.2f} tokens/s")
print(f"output_token_throughput_p99: {info['output_token_throughput_p99']:.2f} tokens/s")
print(f"inout_token_throughput_mean: {info['inout_token_throughput_mean']:.2f} tokens/s") # 单路总吞吐
print(f"tinout_token_throughput_median: {info['inout_token_throughput_median']:.2f} tokens/s")
print(f"inout_token_throughput_p99: {info['inout_token_throughput_p99']:.2f} tokens/s")
print("==============================================")
print("\n")
return info_json
async def run_vllm_async(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
disable_frontend_multiprocessing: bool = False,
) -> float:
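"""Run all requests through the vLLM async engine and return the elapsed wall-clock time."""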
from vllm import SamplingParams
engine_args = AsyncEngineArgs(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
worker_use_ray=False,
disable_log_requests=True,
)
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
generators = []
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
end = time.perf_counter()
return end - start
def run_hf(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
use_beam_search: bool,
max_batch_size: int,
trust_remote_code: bool,
) -> float:
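"""Run the requests with the HuggingFace transformers backend, batching up to
max_batch_size prompts at a time, and return the elapsed wall-clock time."""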
assert not use_beam_search
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.cuda()
pbar = tqdm(total=len(requests))
start = time.perf_counter()
batch: List[str] = []
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
prompt, prompt_len, output_len = requests[i]
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
_, next_prompt_len, next_output_len = requests[i + 1]
if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch.
continue
# Generate the sequences.
input_ids = tokenizer(batch, return_tensors="pt",
padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=not use_beam_search,
num_return_sequences=n,
temperature=1.0,
top_p=1.0,
use_cache=True,
max_new_tokens=max_output_len,
)
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))
# Clear the batch.
batch = []
max_prompt_len = 0
max_output_len = 0
end = time.perf_counter()
return end - start
def run_mii(
requests: List[Tuple[str, int, int]],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
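"""Run the requests against a DeepSpeed-MII server and return the elapsed wall-clock time."""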
from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests]
start = time.perf_counter()
llm.generate(prompts, max_new_tokens=output_len)
end = time.perf_counter()
client = client(model)
client.terminate_server()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
warmup_prompt = "hi" * 10
warmup_requests = [(warmup_prompt, 10, 10)
for _ in range(1)]
if args.dataset is None:
requests_json={}
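# requests_json maps "{num_prompts}_{input_len}_{output_len}" to the synthetic
# request list for that combination; run_vllm looks requests up under the same keys.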
for ELEprompt in args.num_prompts:
for ELEinput,ELEoutput in zip(args.input_len,args.output_len):
# Synthesize a prompt with the given input length.
prompt = "hi" * (ELEinput - 1)
requests = [(prompt, ELEinput, ELEoutput)
for _ in range(ELEprompt)]
print("type(requests):",type(requests))
requests_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)]=requests
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm":
if args.async_engine:
run_args = [
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
]
else:
run_args = [
warmup_requests, requests_json, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
]
if args.async_engine:
run_args.append(args.disable_frontend_multiprocessing)
elapsed_time = uvloop.run(run_vllm_async(*run_args))
else:
info_json = run_vllm(*run_args, args.use_new_beam_search_impl)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.use_beam_search, args.hf_max_batch_size,
args.trust_remote_code)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
# total_num_tokens = sum(prompt_len + output_len
# for _, prompt_len, output_len in requests)
# if args.dataset is None:
# total_out_tokens = args.output_len * args.num_prompts
# else:
# total_out_tokens = sum(output_len for _, _, output_len in requests)
# print(f"Latency: {elapsed_time:.2f} s")
# print(f"All Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
# f"{total_num_tokens / elapsed_time:.2f} tokens/s")
# print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
with open(args.output_json,"w") as f:
title="bs_in_out"
data_keys=info_json[list(info_json.keys())[0]].keys()
keys_string = ','.join(data_keys)
title=title+","+keys_string
f.write(title)
f.write("\n")
for key, value in info_json.items():
values_as_strings = [str(v) for v in value.values()]
values_string = ','.join(values_as_strings)
f.write(key + "," + values_string)
f.write("\n")
# Output JSON results if specified
# if args.output_json:
# results = {
# "elapsed_time": elapsed_time,
# "num_requests": len(requests),
# "total_num_tokens": total_num_tokens,
# "requests_per_second": len(requests) / elapsed_time,
# "tokens_per_second": total_num_tokens / elapsed_time,
# }
# with open(args.output_json, "w") as f:
# json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
default="vllm")
parser.add_argument("--dataset",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument("--input-len",
type=int,
nargs="*",
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
nargs="*",
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument('--num-iters-warmup',
type=int,
default=1,
help='Number of iterations to run for warmup.')
parser.add_argument("--use-new-beam-search-impl", action="store_true")
parser.add_argument("--num-prompts",
nargs="*",
type=int,
default=1000,
help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager",
action="store_true",
help="enforce eager execution")
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (hcu) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (hcu), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument("--device",
type=str,
default="auto",
choices=DEVICE_OPTIONS,
help='device type for vLLM execution')
parser.add_argument(
"--num-scheduler-steps",
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--use-v2-block-manager",
action='store_true',
help="Enable block manager v2.")
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill",
action='store_true',
help="enable chunked prefill for vLLM backend.")
parser.add_argument('--max-num-batched-tokens',
type=int,
default=None,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument(
"--disable-async-output-proc",
action='store_true',
default=False,
help="Disable async output processor for vLLM backend.")
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.use_beam_search:
raise ValueError("Beam search is not supported for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
main(args)
# Format:
# model_name;model_path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
# Separate multiple values with commas
DeepSeek-R1-Distill-Qwen-1.5B;/workspace/llms/DeepSeek-R1-Distill-Qwen-1.5B;1;1,2,4;128,512,1024;1024,1024,1024;float16;32768;0.95
DeepSeek-R1-Distill-Qwen-7B;/workspace/llms/DeepSeek-R1-Distill-Qwen-7B;4;1,2;128,512;512,1024;bfloat16;4096;0.95
DeepSeek-R1-Distill-Llama-8B;/workspace/llms/DeepSeek-R1-Distill-Llama-8B;1;1,2,4,8;128,256,512,1024;256,512,1024,2048;float16;8192;0.95
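# Illustrative example: run.sh expands the first entry above into roughly
#   python benchmark_throughput_0.6.2.py --model /workspace/llms/DeepSeek-R1-Distill-Qwen-1.5B \
#     --tensor-parallel-size 1 --num-prompts 1 2 4 --input-len 128 512 1024 \
#     --output-len 1024 1024 1024 --dtype float16 --max-model-len 32768 \
#     --gpu-memory-utilization 0.95
# (plus --trust-remote-code and --output-json; see run.sh)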
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_ALGO=Ring
export NCCL_LAUNCH_MODE=GROUP
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export SENDRECV_STREAM_WITH_COMPUTE=1
export NCCL_TOPO_FILE="/workspace/test/topo.xml"
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=3
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=1
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=7
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=5
export VLLM_RANK7_NUMA=4
export VLLM_RPC_TIMEOUT=100000
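# Assumption: the VLLM_NUMA_BIND / VLLM_RANK*_NUMA variables are DTK-specific knobs that
# pin each tensor-parallel rank to the listed NUMA node, matching the GPU/NUMA layout
# described in topo.xml.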
#!/bin/bash
# Path to the model configuration file
MODELS_CONFIG="/workspace/test/models-to-test.cfg"
# Results directory
RESULTS_DIR="/workspace/test/results"
# Read the config file, skipping comments and blank lines
while IFS= read -r line || [[ -n "$line" ]]; do
# Skip comment lines and blank lines
if [[ "$line" =~ ^# ]] || [[ -z "$line" ]]; then
continue
fi
# Parse the config line
IFS=';' read -ra CONFIG <<< "$line"
model_name="${CONFIG[0]}"
model_path="${CONFIG[1]}"
tp="${CONFIG[2]}"
batch="${CONFIG[3]//,/ }" # 将逗号替换为空格
prompt_tokens="${CONFIG[4]//,/ }"
completion_tokens="${CONFIG[5]//,/ }"
dtype="${CONFIG[6]}"
max_model_len="${CONFIG[7]}"
gpu_memory_utilization="${CONFIG[8]}"
echo "开始测试模型: $model_name"
echo "模型路径: $model_path"
echo "参数配置:"
echo " tensor_parallel_size: $tp"
echo " batch_sizes: $batch"
echo " prompt_tokens: $prompt_tokens"
echo " completion_tokens: $completion_tokens"
echo " dtype: $dtype"
echo " max_model_len: $max_model_len"
echo " gpu_memory_utilization: $gpu_memory_utilization"
# Create a per-model results directory
model_result_dir="${RESULTS_DIR}/${model_name}"
mkdir -p "$model_result_dir"
# Run the benchmark
python /workspace/test/benchmark_throughput_0.6.2.py \
--model "$model_path" \
--tensor-parallel-size "$tp" \
--num-prompts $batch \
--input-len $prompt_tokens \
--output-len $completion_tokens \
--dtype "$dtype" \
--trust-remote-code \
--max-model-len "$max_model_len" \
--gpu-memory-utilization "$gpu_memory_utilization" \
--output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
2>&1 | tee "${model_result_dir}/${model_name}_tp${tp}.log"
echo "完成测试模型: $model_name"
echo "结果保存在: $model_result_dir"
echo "----------------------------------------"
done < "$MODELS_CONFIG"
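# Build the benchmark image and run it, mounting the local model directory and
# writing results to ./results on the host: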
docker build -t vllm-test . && \
docker run \
  -v /public/opendas/DL_DATA/llm-models:/workspace/llms \
  -v /usr/local/hyhal:/usr/local/hyhal:ro \
  -v /opt/hyhal:/opt/hyhal:ro \
  -v $PWD/results:/workspace/test/results \
  --ipc=host \
  --device=/dev/kfd \
  --device=/dev/mkfd \
  --device=/dev/dri \
  --shm-size=500G \
  --security-opt seccomp=unconfined \
  vllm-test
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>