Commit 3e1ef98f authored by zhuwenwen

update dtk24.04

parent b5d6a878
Pipeline #1022 canceled
[submodule "vllm"]
	path = vllm
	url = http://developer.hpccube.com/codes/OpenDAS/vllm.git
	branch = vllm-v0.3.3-dtk24.04
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
ENV LANG C.UTF-8
RUN pip install ray==2.9.1 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
@@ -2,7 +2,7 @@
 * @Author: zhuww
 * @email: zhuww@sugon.com
 * @Date: 2024-04-25 10:38:07
 * @LastEditTime: 2024-05-24 15:47:01
-->
# LLAMA
@@ -24,30 +24,58 @@ LLaMA is a collection of foundation language models ranging from 7B to 65B parameters, trained on trillions of tokens.
## Environment Setup
### Docker (Method 1)
Pull the inference Docker image from [光源](https://www.sourcefind.cn/#/image/dcu/custom):
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
# Replace <Image ID> with the ID of the image pulled above
# <Host Path>: path on the host
# <Container Path>: mount path inside the container
docker run -it --name qwen1.5_vllm --privileged --shm-size=64G --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal -v <Host Path>:<Container Path> <Image ID> /bin/bash
# Update the ray version in the image
pip install ray==2.9.1
```
### Dockerfile (Method 2)
```
# <Host Path>: path on the host
# <Container Path>: mount path inside the container
docker build -t llama:latest .
docker run -it --name llama_vllm --privileged --shm-size=64G --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal -v <Host Path>:<Container Path> llama:latest /bin/bash
```
### Anaconda (Method 3)
```
conda create -n llama_vllm python=3.10
```
The DCU-specific deep learning libraries required by this project can be downloaded and installed from the [光合](https://developer.hpccube.com/tool/) developer community.
* DTK driver: dtk24.04
* Pytorch: 2.1.0
* triton: 2.1.0
* vllm: 0.3.3
* xformers: 0.0.25
* flash_attn: 2.0.4
* python: 3.10
`Tip: the DTK driver, python, torch, and other DCU-related tool versions above must correspond exactly, one to one.`
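To confirm that the installed packages actually line up with the list above, a quick version check can help. The snippet below is a minimal sketch and assumes the packages are importable under their usual names (DCU builds may differ):
```python
# Minimal sketch: print the installed versions of the packages listed above so
# they can be compared against the dependency list (import names are assumptions).
for name in ("torch", "triton", "vllm", "xformers"):
    try:
        module = __import__(name)
        print(name, getattr(module, "__version__", "unknown"))
    except ImportError:
        print(name, "not installed")
```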
## Dataset
## Inference
### Build and Install from Source
```
# If you are using the 光源 image, you can skip the source build; vllm is already installed in the image.
git clone http://developer.hpccube.com/codes/modelzoo/qwen1.5_vllm.git
cd qwen1.5_vllm
git submodule init && git submodule update
cd vllm
pip install wheel
python setup.py bdist_wheel
cd dist && pip install vllm*
```
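After the wheel is installed, a one-line import check confirms that the build is usable. This is a minimal sketch; the expected version string comes from the dependency list above:
```python
# Minimal sketch: confirm the locally built vllm wheel can be imported.
import vllm

print(vllm.__version__)  # the dependency list above expects 0.3.3
```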
### Model Download
@@ -56,11 +84,13 @@
| Base model | Chat model | GPTQ model |
| --- | --- | --- |
| [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | [Llama-2-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/tree/gptq-4bit-128g-actorder_True) |
| [Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | [Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | [Llama-2-13B-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-GPTQ/tree/gptq-4bit-128g-actorder_True) |
| [Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | [Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | [Llama-2-70B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-70B-Chat-GPTQ/tree/gptq-4bit-128g-actorder_True) |
| [Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | |
| [Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | [Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | |
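One way to fetch a checkpoint from the table above is with `huggingface_hub`; the sketch below is an illustrative assumption (it is not part of the original instructions) and uses a placeholder target directory. Note that the meta-llama repositories are gated and require an access token.
```python
# Minimal sketch: download one of the checkpoints listed above with huggingface_hub.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="meta-llama/Llama-2-7b-chat-hf",
    local_dir="Llama-2-7b-chat-hf",  # placeholder local directory
)
```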
### Offline Batched Inference
```bash
python vllm/examples/offline_inference.py
```
Here, `prompts` is the list of prompt strings; `temperature` controls sampling randomness (lower values make generation more deterministic, higher values more random; 0 means greedy sampling; default 1); `max_tokens=16` is the generation length (default 16);
`model` is the model path; `tensor_parallel_size=1` is the number of cards to use (default 1); `dtype="float16"` is the inference data type (if the model weights are bfloat16, switch to float16 for inference); `quantization="gptq"` runs GPTQ-quantized inference and requires one of the GPTQ models listed above.
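As a reference for the parameters described above, the following is a minimal sketch of an offline GPTQ inference call; the model path is a placeholder for a locally downloaded GPTQ checkpoint from the table above.
```python
# Minimal sketch of the parameters described above; the model path is a placeholder.
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0, max_tokens=16)
llm = LLM(model="TheBloke/Llama-2-7B-Chat-GPTQ",  # model path (placeholder)
          tensor_parallel_size=1,                 # number of cards
          dtype="float16",                        # inference data type
          quantization="gptq",                    # GPTQ-quantized inference
          enforce_eager=True)
outputs = llm.generate(["I believe the meaning of life is"], sampling_params)
print(outputs[0].outputs[0].text)
```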
@@ -69,7 +99,7 @@
### Offline Batched Inference Benchmark
1. Specify the input and output lengths
```bash
python vllm/benchmarks/benchmark_throughput.py --num-prompts 1 --input-len 32 --output-len 128 --model meta-llama/Llama-2-7b-chat-hf -tp 1 --trust-remote-code --enforce-eager --dtype float16
```
Here `--num-prompts` is the batch size, `--input-len` is the input sequence length, `--output-len` is the number of generated tokens, `--model` is the model path, `-tp` is the number of cards, and `dtype="float16"` is the inference data type (if the model weights are bfloat16, switch to float16 for inference). Setting `--output-len 1` measures first-token latency.
@@ -80,7 +110,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
```
```bash
python vllm/benchmarks/benchmark_throughput.py --num-prompts 1 --model meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json -tp 1 --trust-remote-code --enforce-eager --dtype float16
```
Here `--num-prompts` is the batch size, `--model` is the model path, `--dataset` is the dataset to use, `-tp` is the number of cards, and `dtype="float16"` is the inference data type (if the model weights are bfloat16, switch to float16 for inference).
@@ -115,13 +145,13 @@ curl http://localhost:8000/v1/models
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "meta-llama/Llama-2-7b-hf",
        "prompt": "I believe the meaning of life is",
        "max_tokens": 7,
        "temperature": 0
    }'
```
Or use [vllm/examples/openai_completion_client.py](https://developer.hpccube.com/codes/OpenDAS/vllm/-/blob/675c0abe47eb9d29c126fbecda86fd5801162eba/examples/openai_completion_client.py)
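The same request can also be sent from Python. The sketch below is a minimal example, assuming the API server from the step above is running on localhost:8000, and uses `requests` rather than the bundled client script:
```python
# Minimal sketch: call the vLLM OpenAI-compatible completions endpoint.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "meta-llama/Llama-2-7b-hf",
        "prompt": "I believe the meaning of life is",
        "max_tokens": 7,
        "temperature": 0,
    },
)
print(resp.json()["choices"][0]["text"])
```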
### Using the OpenAI Chat API with vllm
@@ -136,11 +166,11 @@ curl http://localhost:8000/v1/chat/completions \
        ]
    }'
```
Or use [vllm/examples/openai_chatcompletion_client.py](https://developer.hpccube.com/codes/OpenDAS/vllm/-/blob/675c0abe47eb9d29c126fbecda86fd5801162eba/examples/openai_chatcompletion_client.py)
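A Python equivalent of the chat request, again a minimal sketch with `requests`; the served model name and the message contents here are illustrative and must match whatever the server was started with:
```python
# Minimal sketch: call the vLLM OpenAI-compatible chat completions endpoint.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "I believe the meaning of life is"},
        ],
        "max_tokens": 64,
        "temperature": 0,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```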
## result
Accelerator used: 1x DCU-K100_AI-64G
```
Prompt: 'I believe the meaning of life is', Generated text: ' to find purpose, happiness, and fulfillment. Here are some reasons why:\n\n1. Purpose: Having a sense of purpose gives life meaning and direction. It helps individuals set goals and work towards achieving them, which can lead to a sense of accomplishment and fulfillment.\n2. Happiness: Happiness is a fundamental aspect of life that brings joy and satisfaction.
```
"""Benchmark offline inference throughput."""
import argparse
import json
import random
import time
from typing import List, Optional, Tuple
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from tqdm import tqdm
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids
completions = [completion for _, completion in dataset]
completion_token_ids = tokenizer(completions).input_ids
tokenized_dataset = []
for i in range(len(dataset)):
output_len = len(completion_token_ids[i])
if fixed_output_len is not None:
output_len = fixed_output_len
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
# Filter out too long sequences.
filtered_dataset: List[Tuple[str, int, int]] = []
for prompt, prompt_token_ids, output_len in tokenized_dataset:
prompt_len = len(prompt_token_ids)
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
# Sample the requests.
sampled_requests = random.sample(filtered_dataset, num_requests)
return sampled_requests
def run_vllm(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
device: str,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
)
# Add the requests to the engine.
for prompt, _, output_len in requests:
sampling_params = SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
)
# FIXME(woosuk): Do not use internal method.
llm._add_request(
prompt=prompt,
prompt_token_ids=None,
sampling_params=sampling_params,
)
start = time.perf_counter()
# FIXME(woosuk): Do not use internal method.
llm._run_engine(use_tqdm=True)
end = time.perf_counter()
return end - start
def run_hf(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
use_beam_search: bool,
max_batch_size: int,
trust_remote_code: bool,
) -> float:
assert not use_beam_search
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.cuda()
pbar = tqdm(total=len(requests))
start = time.perf_counter()
batch: List[str] = []
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
prompt, prompt_len, output_len = requests[i]
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
_, next_prompt_len, next_output_len = requests[i + 1]
if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch.
continue
# Generate the sequences.
input_ids = tokenizer(batch, return_tensors="pt",
padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=not use_beam_search,
num_return_sequences=n,
temperature=1.0,
top_p=1.0,
use_cache=True,
max_new_tokens=max_output_len,
)
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))
# Clear the batch.
batch = []
max_prompt_len = 0
max_output_len = 0
end = time.perf_counter()
return end - start
def run_mii(
requests: List[Tuple[str, int, int]],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import pipeline
llm = pipeline(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests]
start = time.perf_counter()
llm(prompts, max_new_tokens=output_len)
end = time.perf_counter()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm":
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
args.quantization, args.tensor_parallel_size,
args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype,
args.max_model_len, args.enforce_eager,
args.kv_cache_dtype, args.device)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.use_beam_search, args.hf_max_batch_size,
args.trust_remote_code)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len in requests)
if args.dataset is None:
total_out_tokens = args.output_len * args.num_prompts
else:
total_out_tokens = sum(output_len for _, _, output_len in requests)
print(f"Latency: {elapsed_time:.2f} s")
print(f"All Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
default="vllm")
parser.add_argument("--dataset",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts",
type=int,
default=1000,
help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument("--enforce-eager",
action="store_true",
help="enforce eager execution")
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8_e5m2"],
default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.use_beam_search:
raise ValueError("Beam search is not supported for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
main(args)
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"I believe the meaning of life is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=16)
# Create an LLM.
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", tensor_parallel_size=1, trust_remote_code=True, dtype="float16", enforce_eager=True)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")