Commit 76238c74 authored by libo11

Initial commit
<#meta#>
- Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
- Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
<#system#>
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
<#chat#>
{% for message in messages %}
{% if message['role'] == 'user' %}
<#user#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
<#bot#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'user_context' %}
<#user_context#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
<#bot#>
{% endif %}
{% if messages[0]['role'] == 'system' %}
{% set system_message = '<<SYS>>\n' + messages[0]['content'] | trim + '\n<</SYS>>\n\n' %}
{% set messages = messages[1:] %}
{% else %}
{% set system_message = '' %}
{% endif %}
{% for message in messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}
{% if loop.index0 == 0 %}
{% set content = system_message + message['content'] %}
{% else %}
{% set content = message['content'] %}
{% endif %}
{% if message['role'] == 'user' %}
{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}
{% elif message['role'] == 'assistant' %}
{{ ' ' + content | trim + ' ' + eos_token }}
{% endif %}
{% endfor %}
{%- if messages[0]['role'] == 'system' -%}
{%- set system_message = messages[0]['content'] -%}
{%- set messages = messages[1:] -%}
{%- else -%}
{%- set system_message = '' -%}
{%- endif -%}
{{ bos_token + system_message }}
{%- for message in messages -%}
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{%- endif -%}
{%- if message['role'] == 'user' -%}
{{ 'USER: ' + message['content'] + '\n' }}
{%- elif message['role'] == 'assistant' -%}
{{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{ 'ASSISTANT:' }}
{% endif %}
import argparse
import dataclasses
import json
import os
import uuid
from functools import partial
from tensorizer import stream_io
from vllm import LLM
from vllm.distributed import (init_distributed_environment,
                              initialize_model_parallel)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
                                                         TensorizerConfig,
                                                         serialize_vllm_model)
# yapf conflicts with isort for this docstring
# yapf: disable
"""
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer
to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
or locally. Tensor encryption and decryption is also supported, although
libsodium must be installed to use it. Install vllm with tensorizer support
using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:
python -m examples.tensorize_vllm_model \
--model facebook/opt-125m \
serialize \
--serialized-directory s3://my-bucket \
--suffix v1
Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
and saves it to your S3 bucket. A local directory can also be used. This
assumes your S3 credentials are specified as environment variables
in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
as CLI args to this script.
You can also encrypt the model weights with a randomly-generated key by
providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root
level of this repository:
python -m examples.tensorize_vllm_model \
--model EleutherAI/gpt-j-6B \
--dtype float16 \
deserialize \
--path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors
Which downloads the model tensors from your S3 bucket and deserializes them.
You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.
For more information on the available arguments for serializing, run
`python -m examples.tensorize_vllm_model serialize --help`.
Or for deserializing:
`python -m examples.tensorize_vllm_model deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:
llm = LLM(model="facebook/opt-125m",
          load_format="tensorizer",
          model_loader_extra_config=TensorizerConfig(
              tensorizer_uri=path_to_tensors,
              num_readers=3,
          ))
A serialized model can be used during model loading for the vLLM OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
TensorizerConfig arguments desired.
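For example, a minimal sketch of launching the OpenAI-compatible server with a
serialized model (the S3 path below is a placeholder; `--load-format` and
`--model-loader-extra-config` mirror the corresponding engine arguments):
python -m vllm.entrypoints.openai.api_server \
    --model facebook/opt-125m \
    --load-format tensorizer \
    --model-loader-extra-config \
    '{"tensorizer_uri": "s3://my-bucket/vllm/facebook/opt-125m/v1/model.tensors"}'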
In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:
`python -m examples.tensorize_vllm_model deserialize --help`
under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
"""
def parse_args():
    parser = argparse.ArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize vLLM models. These models "
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption is "
        "also supported, although libsodium must be installed to "
        "use it.")
    parser = EngineArgs.add_cli_args(parser)
    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))

    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")

    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize.")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args()
def deserialize():
    llm = LLM(model=args.model,
              load_format="tensorizer",
              model_loader_extra_config=tensorizer_config)
    return llm
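
# Parse the CLI arguments once at module level and resolve S3 credentials,
# preferring explicit CLI flags over the corresponding environment variables.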
args = parse_args()

s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                    or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                        or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
               or os.environ.get("S3_ENDPOINT_URL", None))

credentials = {
    "s3_access_key_id": s3_access_key_id,
    "s3_secret_access_key": s3_secret_access_key,
    "s3_endpoint": s3_endpoint
}

_read_stream, _write_stream = (partial(
    stream_io.open_stream,
    mode=mode,
    s3_access_key_id=s3_access_key_id,
    s3_secret_access_key=s3_secret_access_key,
    s3_endpoint=s3_endpoint,
) for mode in ("rb", "wb+"))
model_ref = args.model
model_name = model_ref.split("/")[1]
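
# vLLM's loader utilities expect a distributed environment to be initialized,
# so set up a minimal single-process (world_size=1) group on localhost.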
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8080"
init_distributed_environment(world_size=1, rank=0, local_rank=0)
initialize_model_parallel()
keyfile = args.keyfile if args.keyfile else None

if args.model_loader_extra_config:
    config = json.loads(args.model_loader_extra_config)
    tensorizer_args = TensorizerConfig(**config)._construct_tensorizer_args()
    tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
    tensorizer_args = None
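
# Dispatch on the chosen subcommand: build an engine and serialize its weights,
# or deserialize the tensors back into an LLM to verify they can be loaded.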
if args.command == "serialize":
    eng_args_dict = {f.name: getattr(args, f.name) for f in
                     dataclasses.fields(EngineArgs)}

    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
    engine = LLMEngine.from_engine_args(engine_args)

    input_dir = args.serialized_directory.rstrip('/')
    suffix = args.suffix if args.suffix else uuid.uuid4().hex
    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
    model_path = f"{base_path}/model.tensors"

    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        **credentials)

    serialize_vllm_model(engine, tensorizer_config, keyfile)
elif args.command == "deserialize":
    if not tensorizer_args:
        tensorizer_config = TensorizerConfig(
            tensorizer_uri=args.path_to_tensors,
            encryption_keyfile=keyfile,
            **credentials
        )
    deserialize()
else:
    raise ValueError("Either serialize or deserialize must be specified.")
icon.png (53.8 KB)
# Unique model identifier
modelCode=653
# Model name
modelName=qwen1.5_vllm
# Model description
modelDescription=Qwen1.5 is Alibaba Cloud's open-source large language model series, released as the beta version of Qwen2.0.
# Application scenarios
appScenario=inference, dialogue/Q&A, research, education, government, finance
# Framework type
frameType=vllm
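# Throughput benchmark sweep for llama2-13b with vLLM (tensor-parallel-size 2).
# Per-run logs and a Markdown summary table are written under ${logdir}
# (the log-k100 path suggests a K100 accelerator target).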
modelpath="/public/home/sugon_libo/tests/llms/llama2-13b"
logdir="./log-k100/llama2-13b"
if [ ! -d "${logdir}" ]; then
    mkdir -p "${logdir}"
fi
all_log="${logdir}/all-log.log"
echo -e "| num prompts | Input length | Output length | All Throughput (tokens/s) | Gen Throughput (tokens/s) | Latency (s) |" > $all_log
echo -e "|:----------:|:------------:|:-------------:|:-------------:|:-------------:|:-------------:|" >> $all_log
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export FA_PAD=0
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_MAX_NCHANNELS=20
export NCCL_MIN_NCHANNELS=20
export NCCL_P2P_LEVEL=SYS
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
for num_prompts in 1 2 4 8 16 32 64 128 256; do
for prompt_tuple in "1000 1" "1000 1000" "2000 1" "2000 2000" "4096 1" ; do
IFS=' ' read -r input_len output_len <<< "$prompt_tuple"
tmp_log=${logdir}/numprompts-${num_prompts}-input_len-${input_len}-output_len-${output_len}.log
python3 ./benchmarks/benchmark_throughput.py --enforce-eager --input-len $input_len --output-len $output_len --num-prompts $num_prompts --tensor-parallel-size 2 --model $modelpath --dtype float16 --trust-remote-code 2>&1 | tee ${tmp_log}
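# Extract average latency and throughput from the tail of the benchmark log;
# the tail/head offsets assume the summary layout printed by this version of
# benchmark_throughput.py and may need adjusting for other vLLM releases.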
avg_latency=`tail -n 7 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $2}'`
all_tht=`tail -n 6 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $5}'`
gen_tht=`tail -n 5 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $3}'`
echo "" | awk -v all_tht=$all_tht \
-v gen_tht=$gen_tht \
-v avg_latency=$avg_latency \
-v num_prompts=$num_prompts \
-v input_len=${input_len} -v output_len="$output_len" \
'{printf "| %6d | %6d | %6d | %7.2f | %7.2f | %7.2f |\n", num_prompts, input_len, output_len,
all_tht, gen_tht, avg_latency}' >> $all_log
done # input_len output_len
done # num_prompts
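# Same llama2-13b sweep as above (tensor-parallel-size 2), but logging to
# ./log/llama2-13b instead of the K100 log directory.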
modelpath="/public/home/sugon_libo/tests/llms/llama2-13b"
logdir="./log/llama2-13b"
if [ ! -d "${logdir}" ]; then
    mkdir -p "${logdir}"
fi
all_log="${logdir}/all-log.log"
echo -e "| num prompts | Input length | Output length | All Throughput (tokens/s) | Gen Throughput (tokens/s) | Latency (s) |" > $all_log
echo -e "|:----------:|:------------:|:-------------:|:-------------:|:-------------:|:-------------:|" >> $all_log
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export FA_PAD=0
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_MAX_NCHANNELS=20
export NCCL_MIN_NCHANNELS=20
export NCCL_P2P_LEVEL=SYS
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
for num_prompts in 1 2 4 8 16 32 64 128 256; do
for prompt_tuple in "1000 1" "1000 1000" "2000 1" "2000 2000" "4096 1" ; do
IFS=' ' read -r input_len output_len <<< "$prompt_tuple"
tmp_log=${logdir}/numprompts-${num_prompts}-input_len-${input_len}-output_len-${output_len}.log
python3 ./benchmarks/benchmark_throughput.py --enforce-eager --input-len $input_len --output-len $output_len --num-prompts $num_prompts --tensor-parallel-size 2 --model $modelpath --dtype float16 --trust-remote-code 2>&1 | tee ${tmp_log}
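# Metric extraction as above: offsets into the log tail assume this
# benchmark_throughput.py output format.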
avg_latency=`tail -n 7 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $2}'`
all_tht=`tail -n 6 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $5}'`
gen_tht=`tail -n 5 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $3}'`
echo "" | awk -v all_tht=$all_tht \
-v gen_tht=$gen_tht \
-v avg_latency=$avg_latency \
-v num_prompts=$num_prompts \
-v input_len=${input_len} -v output_len="$output_len" \
'{printf "| %6d | %6d | %6d | %7.2f | %7.2f | %7.2f |\n", num_prompts, input_len, output_len,
all_tht, gen_tht, avg_latency}' >> $all_log
done # input_len output_len
done # num_prompts
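# Throughput benchmark sweep for llama2-7b with vLLM (tensor-parallel-size 1),
# logging under ./log-k100/llama2-7b.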
modelpath="/public/home/sugon_libo/tests/llms/llama2-7b"
logdir="./log-k100/llama2-7b"
if [ ! -d "${logdir}" ]; then
    mkdir -p "${logdir}"
fi
all_log="${logdir}/all-log.log"
echo -e "| num prompts | Input length | Output length | All Throughput (tokens/s) | Gen Throughput (tokens/s) | Latency (s) |" > $all_log
echo -e "|:----------:|:------------:|:-------------:|:-------------:|:-------------:|:-------------:|" >> $all_log
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export FA_PAD=0
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_MAX_NCHANNELS=20
export NCCL_MIN_NCHANNELS=20
export NCCL_P2P_LEVEL=SYS
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
for num_prompts in 1 2 4 8 16 32 64 128 256; do
for prompt_tuple in "1000 1" "1000 1000" "2000 1" "2000 2000" "4096 1" ; do
IFS=' ' read -r input_len output_len <<< "$prompt_tuple"
tmp_log=${logdir}/numprompts-${num_prompts}-input_len-${input_len}-output_len-${output_len}.log
python3 ./benchmarks/benchmark_throughput.py --enforce-eager --input-len $input_len --output-len $output_len --num-prompts $num_prompts --tensor-parallel-size 1 --model $modelpath --dtype float16 --trust-remote-code 2>&1 | tee ${tmp_log}
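# Metric extraction from the log tail; the offsets (tail -n 4/3/2) differ from
# the 13b scripts, presumably because the single-GPU run prints fewer trailing lines.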
avg_latency=`tail -n 4 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $2}'`
all_tht=`tail -n 3 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $5}'`
gen_tht=`tail -n 2 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $3}'`
echo "" | awk -v all_tht=$all_tht \
-v gen_tht=$gen_tht \
-v avg_latency=$avg_latency \
-v num_prompts=$num_prompts \
-v input_len=${input_len} -v output_len="$output_len" \
'{printf "| %6d | %6d | %6d | %7.2f | %7.2f | %7.2f |\n", num_prompts, input_len, output_len,
all_tht, gen_tht, avg_latency}' >> $all_log
done # input_len output_len
done # num_prompts
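# Same llama2-7b sweep as above (tensor-parallel-size 1), but logging to
# ./log/llama2-7b.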
modelpath="/public/home/sugon_libo/tests/llms/llama2-7b"
logdir="./log/llama2-7b"
if [ ! -d "${logdir}" ]; then
    mkdir -p "${logdir}"
fi
all_log="${logdir}/all-log.log"
echo -e "| num prompts | Input length | Output length | All Throughput (tokens/s) | Gen Throughput (tokens/s) | Latency (s) |" > $all_log
echo -e "|:----------:|:------------:|:-------------:|:-------------:|:-------------:|:-------------:|" >> $all_log
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export FA_PAD=0
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_MAX_NCHANNELS=20
export NCCL_MIN_NCHANNELS=20
export NCCL_P2P_LEVEL=SYS
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
for num_prompts in 1 2 4 8 16 32 64 128 256; do
for prompt_tuple in "1000 1" "1000 1000" "2000 1" "2000 2000" "4096 1" ; do
IFS=' ' read -r input_len output_len <<< "$prompt_tuple"
tmp_log=${logdir}/numprompts-${num_prompts}-input_len-${input_len}-output_len-${output_len}.log
python3 ./benchmarks/benchmark_throughput.py --enforce-eager --input-len $input_len --output-len $output_len --num-prompts $num_prompts --tensor-parallel-size 1 --model $modelpath --dtype float16 --trust-remote-code 2>&1 | tee ${tmp_log}
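# Metric extraction as in the previous llama2-7b script: offsets into the log
# tail assume this benchmark_throughput.py output format.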
avg_latency=`tail -n 4 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $2}'`
all_tht=`tail -n 3 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $5}'`
gen_tht=`tail -n 2 ${tmp_log} | head -n 1 | awk '{printf "%.2f", $3}'`
echo "" | awk -v all_tht=$all_tht \
-v gen_tht=$gen_tht \
-v avg_latency=$avg_latency \
-v num_prompts=$num_prompts \
-v input_len=${input_len} -v output_len="$output_len" \
'{printf "| %6d | %6d | %6d | %7.2f | %7.2f | %7.2f |\n", num_prompts, input_len, output_len,
all_tht, gen_tht, avg_latency}' >> $all_log
done # input_len output_len
done # num_prompts