"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "b474782ad776d4517675a6eae6f424a6d3ddf7ca"
Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
...@@ -20,6 +20,7 @@ Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1 ...@@ -20,6 +20,7 @@ Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
To learn more about LMCache environment setup, please refer to: To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html https://docs.lmcache.ai/getting_started/installation.html
""" """
import argparse import argparse
import contextlib import contextlib
import os import os
...@@ -49,8 +50,7 @@ def setup_environment_variables(vllm_version: str): ...@@ -49,8 +50,7 @@ def setup_environment_variables(vllm_version: str):
@contextlib.contextmanager @contextlib.contextmanager
def build_llm_with_lmcache(lmcache_connector: str, model: str, def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
vllm_version: str):
ktc = KVTransferConfig( ktc = KVTransferConfig(
kv_connector=lmcache_connector, kv_connector=lmcache_connector,
kv_role="kv_both", kv_role="kv_both",
...@@ -97,18 +97,19 @@ def print_output( ...@@ -97,18 +97,19 @@ def print_output(
for output in outputs: for output in outputs:
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Generated text: {generated_text!r}") print(f"Generated text: {generated_text!r}")
print(f"Generation took {time.time() - start:.2f} seconds, " print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.")
f"{req_str} request done.")
print("-" * 50) print("-" * 50)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("-v", parser.add_argument(
"--version", "-v",
choices=["v0", "v1"], "--version",
default="v1", choices=["v0", "v1"],
help="Specify vLLM version (default: v1)") default="v1",
help="Specify vLLM version (default: v1)",
)
return parser.parse_args() return parser.parse_args()
...@@ -125,7 +126,6 @@ def main(): ...@@ -125,7 +126,6 @@ def main():
setup_environment_variables(args.version) setup_environment_variables(args.version)
with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm: with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
# This example script runs two requests with a shared prefix. # This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts # Define the shared prompt and specific prompts
shared_prompt = "Hello, how are you?" * 1000 shared_prompt = "Hello, how are you?" * 1000
...@@ -136,9 +136,7 @@ def main(): ...@@ -136,9 +136,7 @@ def main():
shared_prompt + "Tell me a very long story", shared_prompt + "Tell me a very long story",
] ]
sampling_params = SamplingParams(temperature=0, sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
top_p=0.95,
max_tokens=10)
# Print the first output # Print the first output
print_output(llm, first_prompt, sampling_params, "first") print_output(llm, first_prompt, sampling_params, "first")
......
...@@ -4,12 +4,13 @@ This file demonstrates the example usage of disaggregated prefilling ...@@ -4,12 +4,13 @@ This file demonstrates the example usage of disaggregated prefilling
with LMCache. with LMCache.
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and launch an additional LMCache server. and launch an additional LMCache server.
KV cache is transferred in the following manner: KV cache is transferred in the following manner:
vLLM prefill node -> LMCache server -> vLLM decode node. vLLM prefill node -> LMCache server -> vLLM decode node.
Note that `pip install lmcache` is needed to run this example. Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache. Learn more about LMCache in https://github.com/LMCache/LMCache.
""" """
import os import os
import subprocess import subprocess
import time import time
...@@ -49,19 +50,23 @@ def run_prefill(prefill_done, prompts): ...@@ -49,19 +50,23 @@ def run_prefill(prefill_done, prompts):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
ktc = KVTransferConfig(kv_connector="LMCacheConnector", ktc = KVTransferConfig(
kv_role="kv_producer", kv_connector="LMCacheConnector",
kv_rank=0, kv_role="kv_producer",
kv_parallel_size=2) kv_rank=0,
kv_parallel_size=2,
)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory. # memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", llm = LLM(
kv_transfer_config=ktc, model="mistralai/Mistral-7B-Instruct-v0.2",
max_model_len=8000, kv_transfer_config=ktc,
gpu_memory_utilization=0.8, max_model_len=8000,
enforce_eager=True) gpu_memory_utilization=0.8,
enforce_eager=True,
#llm.generate(prompts, sampling_params) )
# llm.generate(prompts, sampling_params)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
...@@ -79,17 +84,21 @@ def run_decode(prefill_done, prompts, timeout=1): ...@@ -79,17 +84,21 @@ def run_decode(prefill_done, prompts, timeout=1):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
ktc = KVTransferConfig(kv_connector="LMCacheConnector", ktc = KVTransferConfig(
kv_role="kv_consumer", kv_connector="LMCacheConnector",
kv_rank=1, kv_role="kv_consumer",
kv_parallel_size=2) kv_rank=1,
kv_parallel_size=2,
)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# of memory. Reduce the value if your GPU has less memory. # of memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", llm = LLM(
kv_transfer_config=ktc, model="mistralai/Mistral-7B-Instruct-v0.2",
max_model_len=8000, kv_transfer_config=ktc,
gpu_memory_utilization=0.8, max_model_len=8000,
enforce_eager=True) gpu_memory_utilization=0.8,
enforce_eager=True,
)
print("Waiting for prefill node to finish...") print("Waiting for prefill node to finish...")
prefill_done.wait() prefill_done.wait()
...@@ -105,10 +114,9 @@ def run_decode(prefill_done, prompts, timeout=1): ...@@ -105,10 +114,9 @@ def run_decode(prefill_done, prompts, timeout=1):
def run_lmcache_server(port): def run_lmcache_server(port):
server_proc = subprocess.Popen([ server_proc = subprocess.Popen(
"python", "-m", "lmcache.experimental.server", "localhost", ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
str(port) )
])
return server_proc return server_proc
......
...@@ -17,13 +17,17 @@ async def lifespan(app: FastAPI): ...@@ -17,13 +17,17 @@ async def lifespan(app: FastAPI):
Lifespan context manager to handle startup and shutdown events. Lifespan context manager to handle startup and shutdown events.
""" """
# Startup: Initialize clients # Startup: Initialize clients
prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1' prefiller_base_url = (
decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1' f"http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1"
)
app.state.prefill_client = httpx.AsyncClient(timeout=None, decoder_base_url = (
base_url=prefiller_base_url) f"http://{global_args.decoder_host}:{global_args.decoder_port}/v1"
app.state.decode_client = httpx.AsyncClient(timeout=None, )
base_url=decoder_base_url)
app.state.prefill_client = httpx.AsyncClient(
timeout=None, base_url=prefiller_base_url
)
app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url)
yield yield
...@@ -37,7 +41,6 @@ app = FastAPI(lifespan=lifespan) ...@@ -37,7 +41,6 @@ app = FastAPI(lifespan=lifespan)
class StatsCalculator: class StatsCalculator:
def __init__(self): def __init__(self):
self._stats = [] self._stats = []
self._last_log_time = time.time() self._last_log_time = time.time()
...@@ -51,13 +54,18 @@ class StatsCalculator: ...@@ -51,13 +54,18 @@ class StatsCalculator:
def _log_stats(self): def _log_stats(self):
# Print average, median, and 99th percentile # Print average, median, and 99th percentile
np_arr = np.array(self._stats) np_arr = np.array(self._stats)
output_str = f"\nNum requests: {len(self._stats)}" + \ output_str = (
"\nPrefill node TTFT stats:" + \ f"\nNum requests: {len(self._stats)}"
f"\n - Average (ms): {np.mean(np_arr)}" + \ + "\nPrefill node TTFT stats:"
f"\n - Median (ms): {np.median(np_arr)}" + \ + f"\n - Average (ms): {np.mean(np_arr)}"
f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n" + f"\n - Median (ms): {np.median(np_arr)}"
print("===============================", output_str, + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
"===============================") )
print(
"===============================",
output_str,
"===============================",
)
stats_calculator = StatsCalculator() stats_calculator = StatsCalculator()
...@@ -82,15 +90,16 @@ app.state.prefill_client = None ...@@ -82,15 +90,16 @@ app.state.prefill_client = None
app.state.decode_client = None app.state.decode_client = None
async def send_request_to_service(client: httpx.AsyncClient, endpoint: str, async def send_request_to_service(
req_data: dict): client: httpx.AsyncClient, endpoint: str, req_data: dict
):
""" """
Send a request to a service using a persistent client. Send a request to a service using a persistent client.
""" """
req_data = req_data.copy() req_data = req_data.copy()
req_data['max_tokens'] = 1 req_data["max_tokens"] = 1
if 'max_completion_tokens' in req_data: if "max_completion_tokens" in req_data:
req_data['max_completion_tokens'] = 1 req_data["max_completion_tokens"] = 1
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
response = await client.post(endpoint, json=req_data, headers=headers) response = await client.post(endpoint, json=req_data, headers=headers)
...@@ -98,14 +107,16 @@ async def send_request_to_service(client: httpx.AsyncClient, endpoint: str, ...@@ -98,14 +107,16 @@ async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
return response return response
async def stream_service_response(client: httpx.AsyncClient, endpoint: str, async def stream_service_response(
req_data: dict): client: httpx.AsyncClient, endpoint: str, req_data: dict
):
""" """
Asynchronously stream the response from a service using a persistent client. Asynchronously stream the response from a service using a persistent client.
""" """
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
async with client.stream("POST", endpoint, json=req_data, async with client.stream(
headers=headers) as response: "POST", endpoint, json=req_data, headers=headers
) as response:
response.raise_for_status() response.raise_for_status()
async for chunk in response.aiter_bytes(): async for chunk in response.aiter_bytes():
yield chunk yield chunk
...@@ -121,28 +132,28 @@ async def handle_completions(request: Request): ...@@ -121,28 +132,28 @@ async def handle_completions(request: Request):
req_data = await request.json() req_data = await request.json()
# Send request to prefill service, ignore the response # Send request to prefill service, ignore the response
await send_request_to_service(app.state.prefill_client, "/completions", await send_request_to_service(
req_data) app.state.prefill_client, "/completions", req_data
)
et = time.time() et = time.time()
stats_calculator.add(et - st) stats_calculator.add(et - st)
# Stream response from decode service # Stream response from decode service
async def generate_stream(): async def generate_stream():
async for chunk in stream_service_response(app.state.decode_client, async for chunk in stream_service_response(
"/completions", app.state.decode_client, "/completions", req_data
req_data): ):
yield chunk yield chunk
return StreamingResponse(generate_stream(), return StreamingResponse(generate_stream(), media_type="text/event-stream")
media_type="application/json")
except Exception as e: except Exception as e:
import sys import sys
import traceback import traceback
exc_info = sys.exc_info() exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server" print("Error occurred in disagg prefill proxy server - completions endpoint")
" - completions endpoint")
print(e) print(e)
print("".join(traceback.format_exception(*exc_info))) print("".join(traceback.format_exception(*exc_info)))
raise raise
...@@ -158,36 +169,39 @@ async def handle_chat_completions(request: Request): ...@@ -158,36 +169,39 @@ async def handle_chat_completions(request: Request):
req_data = await request.json() req_data = await request.json()
# Send request to prefill service, ignore the response # Send request to prefill service, ignore the response
await send_request_to_service(app.state.prefill_client, await send_request_to_service(
"/chat/completions", req_data) app.state.prefill_client, "/chat/completions", req_data
)
et = time.time() et = time.time()
stats_calculator.add(et - st) stats_calculator.add(et - st)
# Stream response from decode service # Stream response from decode service
async def generate_stream(): async def generate_stream():
async for chunk in stream_service_response(app.state.decode_client, async for chunk in stream_service_response(
"/chat/completions", app.state.decode_client, "/chat/completions", req_data
req_data): ):
yield chunk yield chunk
return StreamingResponse(generate_stream(), return StreamingResponse(generate_stream(), media_type="text/event-stream")
media_type="application/json")
except Exception as e: except Exception as e:
import sys import sys
import traceback import traceback
exc_info = sys.exc_info() exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server " print(
" - chat completions endpoint") "Error occurred in disagg prefill proxy server - chat completions endpoint"
)
print(e) print(e)
print("".join(traceback.format_exception(*exc_info))) print("".join(traceback.format_exception(*exc_info)))
raise raise
if __name__ == '__main__': if __name__ == "__main__":
global global_args global global_args
global_args = parse_args() global_args = parse_args()
import uvicorn import uvicorn
uvicorn.run(app, host=global_args.host, port=global_args.port) uvicorn.run(app, host=global_args.host, port=global_args.port)
...@@ -3,13 +3,14 @@ ...@@ -3,13 +3,14 @@
This file demonstrates the example usage of remote KV cache sharing This file demonstrates the example usage of remote KV cache sharing
with LMCache. with LMCache.
We will launch 2 vllm instances, and launch an additional LMCache server. We will launch 2 vllm instances, and launch an additional LMCache server.
KV cache is transferred in the following manner: KV cache is transferred in the following manner:
(1) vLLM instance 1 -> LMCache server (KV cache store). (1) vLLM instance 1 -> LMCache server (KV cache store).
(2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve). (2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve).
Note that lmcache needs to be installed to run this example. Note that lmcache needs to be installed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache. Learn more about LMCache in https://github.com/LMCache/LMCache.
""" """
import os import os
import subprocess import subprocess
import time import time
...@@ -49,15 +50,16 @@ def run_store(store_done, prompts): ...@@ -49,15 +50,16 @@ def run_store(store_done, prompts):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
kv_role="kv_both")
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory. # memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", llm = LLM(
kv_transfer_config=ktc, model="mistralai/Mistral-7B-Instruct-v0.2",
max_model_len=8000, kv_transfer_config=ktc,
gpu_memory_utilization=0.8, max_model_len=8000,
enforce_eager=True) gpu_memory_utilization=0.8,
enforce_eager=True,
)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
...@@ -76,15 +78,16 @@ def run_retrieve(store_done, prompts, timeout=1): ...@@ -76,15 +78,16 @@ def run_retrieve(store_done, prompts, timeout=1):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
kv_role="kv_both")
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# of memory. Reduce the value if your GPU has less memory. # of memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", llm = LLM(
kv_transfer_config=ktc, model="mistralai/Mistral-7B-Instruct-v0.2",
max_model_len=8000, kv_transfer_config=ktc,
gpu_memory_utilization=0.8, max_model_len=8000,
enforce_eager=True) gpu_memory_utilization=0.8,
enforce_eager=True,
)
print("Waiting for KV cache store to finish...") print("Waiting for KV cache store to finish...")
store_done.wait() store_done.wait()
...@@ -100,10 +103,9 @@ def run_retrieve(store_done, prompts, timeout=1): ...@@ -100,10 +103,9 @@ def run_retrieve(store_done, prompts, timeout=1):
def run_lmcache_server(port): def run_lmcache_server(port):
server_proc = subprocess.Popen([ server_proc = subprocess.Popen(
"python", "-m", "lmcache.experimental.server", "localhost", ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
str(port) )
])
return server_proc return server_proc
......
...@@ -6,11 +6,15 @@ import json ...@@ -6,11 +6,15 @@ import json
import os import os
import uuid import uuid
from vllm import LLM from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, from vllm.lora.request import LoRARequest
TensorizerConfig, from vllm.model_executor.model_loader.tensorizer import (
tensorize_vllm_model) TensorizerArgs,
TensorizerConfig,
tensorize_lora_adapter,
tensorize_vllm_model,
)
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
# yapf conflicts with isort for this docstring # yapf conflicts with isort for this docstring
...@@ -27,7 +31,7 @@ https://github.com/coreweave/tensorizer ...@@ -27,7 +31,7 @@ https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something To serialize a model, install vLLM from source, then run something
like this from the root level of this repository: like this from the root level of this repository:
python -m examples.other.tensorize_vllm_model \ python examples/others/tensorize_vllm_model.py \
--model facebook/opt-125m \ --model facebook/opt-125m \
serialize \ serialize \
--serialized-directory s3://my-bucket \ --serialized-directory s3://my-bucket \
...@@ -47,7 +51,7 @@ providing a `--keyfile` argument. ...@@ -47,7 +51,7 @@ providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root To deserialize a model, you can run something like this from the root
level of this repository: level of this repository:
python -m examples.other.tensorize_vllm_model \ python examples/others/tensorize_vllm_model.py \
--model EleutherAI/gpt-j-6B \ --model EleutherAI/gpt-j-6B \
--dtype float16 \ --dtype float16 \
deserialize \ deserialize \
...@@ -65,11 +69,11 @@ shard's rank. Sharded models serialized with this script will be named as ...@@ -65,11 +69,11 @@ shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors model-rank-%03d.tensors
For more information on the available arguments for serializing, run For more information on the available arguments for serializing, run
`python -m examples.other.tensorize_vllm_model serialize --help`. `python examples/others/tensorize_vllm_model.py serialize --help`.
Or for deserializing: Or for deserializing:
`python -m examples.other.tensorize_vllm_model deserialize --help`. `python examples/others/tensorize_vllm_model.py deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models: directly to load models:
...@@ -90,11 +94,27 @@ TensorizerConfig arguments desired. ...@@ -90,11 +94,27 @@ TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run: loading with tensorizer that are given to `TensorizerConfig`, run:
`python -m examples.other.tensorize_vllm_model deserialize --help` `python examples/others/tensorize_vllm_model.py deserialize --help`
under the `tensorizer options` section. These can also be used for under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case. `--path-to-tensors` are functionally the same in this case.
Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter
can be serialized directly with the path to the LoRA adapter on HF Hub and
a TensorizerConfig object. In this script, passing a HF id to a LoRA adapter
will serialize the LoRA adapter artifacts to `--serialized-directory`.
You can then use the LoRA adapter with `vllm serve`, for instance, by ensuring
the LoRA artifacts are in your model artifacts directory and specifying
`--enable-lora`. For instance:
```
vllm serve <model_path> \
--load-format tensorizer \
--model-loader-extra-config '{"tensorizer_uri": "<model_path>.tensors"}' \
--enable-lora
```
""" """
...@@ -107,6 +127,19 @@ def parse_args(): ...@@ -107,6 +127,19 @@ def parse_args():
"also supported, although libsodium must be installed to " "also supported, although libsodium must be installed to "
"use it.") "use it.")
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
parser.add_argument(
"--lora-path",
type=str,
required=False,
help="Path to a LoRA adapter to "
"serialize along with model tensors. This can then be deserialized "
"along with the model by passing a tensorizer_config kwarg to "
"LoRARequest with type TensorizerConfig. See the docstring for this "
"for a usage example."
)
subparsers = parser.add_subparsers(dest='command') subparsers = parser.add_subparsers(dest='command')
serialize_parser = subparsers.add_parser( serialize_parser = subparsers.add_parser(
...@@ -169,11 +202,42 @@ def parse_args(): ...@@ -169,11 +202,42 @@ def parse_args():
def deserialize(): def deserialize():
llm = LLM(model=args.model, if args.lora_path:
load_format="tensorizer", tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
tensor_parallel_size=args.tensor_parallel_size, llm = LLM(model=args.model,
model_loader_extra_config=tensorizer_config load_format="tensorizer",
) tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config,
enable_lora=True,
)
sampling_params = SamplingParams(
temperature=0,
max_tokens=256,
stop=["[/assistant]"]
)
# Truncating this as the extra text isn't necessary
prompts = [
"[user] Write a SQL query to answer the question based on ..."
]
# Test LoRA load
print(
llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest("sql-lora",
1,
args.lora_path,
tensorizer_config = tensorizer_config)
)
)
else:
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config
)
return llm return llm
...@@ -197,7 +261,10 @@ if __name__ == '__main__': ...@@ -197,7 +261,10 @@ if __name__ == '__main__':
model_name = model_ref.split("/")[1] model_name = model_ref.split("/")[1]
keyfile = args.keyfile if args.keyfile else None if args.command == "serialize" or args.command == "deserialize":
keyfile = args.keyfile
else:
keyfile = None
if args.model_loader_extra_config: if args.model_loader_extra_config:
config = json.loads(args.model_loader_extra_config) config = json.loads(args.model_loader_extra_config)
...@@ -228,6 +295,10 @@ if __name__ == '__main__': ...@@ -228,6 +295,10 @@ if __name__ == '__main__':
encryption_keyfile=keyfile, encryption_keyfile=keyfile,
**credentials) **credentials)
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
tensorize_lora_adapter(args.lora_path, tensorizer_config)
tensorize_vllm_model(engine_args, tensorizer_config) tensorize_vllm_model(engine_args, tensorizer_config)
elif args.command == "deserialize": elif args.command == "deserialize":
......
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.lint.isort]
known-first-party = ["vllm"]
[tool.ruff.format]
docstring-code-format = true
\ No newline at end of file
{{- bos_token }} {{- bos_token }}
{%- if custom_tools is defined %} {%- if custom_tools is defined and custom_tools%}
{%- set tools = custom_tools %} {%- set tools = custom_tools %}
{%- endif %} {%- endif %}
{%- if not tools_in_user_message is defined %} {%- if tools is defined and tools %}
{%- set tools_in_user_message = false %} {%- set tool_definition = tool_definition ~ (tools | tojson(indent=4)) %}
{%- endif %} {%- else %}
{%- if not tools is defined %}
{%- set tools = none %} {%- set tools = none %}
{%- endif %} {%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #} {#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %} {%- if messages[0]['role'] == 'system' %}
{%- set user_provided_system_message = true %}
{%- if messages[0]['content'] is string %} {%- if messages[0]['content'] is string %}
{%- set system_message = messages[0]['content']|trim %} {%- set system_message = messages[0]['content']|trim %}
{%- else %} {%- else %}
...@@ -18,68 +19,33 @@ ...@@ -18,68 +19,33 @@
{%- endif %} {%- endif %}
{%- set messages = messages[1:] %} {%- set messages = messages[1:] %}
{%- else %} {%- else %}
{%- if tools is not none %} {%- if tools is not none %}
{#- Add default tool system message when tools are provided #} {#- Since no system_message was provided by the user, if tools are provided, system_message is now the default tool system message #}
{%- set system_message = "You are a helpful assistant with tool calling " {#- This system message is from the Llama website: https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/ #}
"capabilities. Only reply with a tool call if the function exists in the " {%- set system_message = "You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:\n\n1. FUNCTION CALLS:\n- ONLY use functions that are EXPLICITLY listed in the function list below\n- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If a function is not in the list, respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)\n- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\nExamples:\nCORRECT: [get_weather(location=\"Vancouver\"), calculate_route(start=\"Boston\", end=\"New York\")] <- Only if get_weather and calculate_route are in function list\nINCORRECT: get_weather(location=\"New York\")\nINCORRECT: Let me check the weather: [get_weather(location=\"New York\")]\nINCORRECT: [get_events(location=\"Singapore\")] <- If function not in list\n\n2. RESPONSE RULES:\n- For pure function requests matching a listed function: ONLY output the function call(s)\n- For knowledge questions: ONLY output text\n- For missing parameters: ONLY request the specific missing parameters\n- For unavailable services (not in function list): output ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\". 
Do NOT execute a function call.\n- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations\n- NEVER combine text and function calls in the same response\n- NEVER suggest alternative functions when the requested service is unavailable\n- NEVER create or invent new functions not listed below\n\n3. STRICT BOUNDARIES:\n- ONLY use functions from the list below - no exceptions\n- NEVER use a function as an alternative to unavailable information\n- NEVER call functions not present in the function list\n- NEVER add explanatory text to function calls\n- NEVER respond with empty brackets\n- Use proper Python/JSON syntax for function calls\n- Check the function list carefully before responding\n\n4. TOOL RESPONSE HANDLING:\n- When receiving tool responses: provide concise, natural language responses\n- Don't repeat tool response verbatim\n- Don't add supplementary information\n\nHere is a list of functions in JSON format that you can invoke:\n" %}
"library provided by the user. If it doesn't exist, just reply directly in "
"natural language. When you receive a tool call response, use the output to "
"format an answer to the original user question." %}
{%- else %} {%- else %}
{%- set system_message = "" %} {%- set system_message = "" %}
{%- endif %} {%- endif %}
{%- endif %} {%- endif %}
{#- Now writing the system message: use the user-provided system message if user_provided_system_message is set, else the default tool system message if tools are present #}
{#- System message if the user supplied one, or if tools are used (default tool system message) #}
{%- if system_message %} {%- if system_message %}
{#- always use user provided system message to override default tool system message #} {#- always use user provided system message to override default tool system message #}
{{- "<|header_start|>system<|header_end|>\n\n" }} {{- "<|header_start|>system<|header_end|>\n\n" }}
{{- system_message }} {{- system_message }}
{%- if tools is not none and not tools_in_user_message %} {%- if user_provided_system_message and tools %}
{{- "Tools: You have access to the following tools. You might need to use one " {{- "\nHere is a list of functions in JSON format that you can invoke. Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\n" }}
"or more function/tool calls to fulfill the task. \n" {{- tool_definition -}}
"If none are needed, then proceed to the response.\n\n" {%- elif tool_definition %}
"Tool Call Syntax: You can call tools using the following syntax:\n" {{- tool_definition -}}
"[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n"
"Do not include anything else when calling the tools with the syntax above.\n\n"
"Here is a list of functions in JSON format that you can invoke.\n " }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %} {%- endif %}
{{- "<|eot|>" }} {{- "<|eot|>" }}
{%- endif %} {%- endif %}
{#- Custom tools are passed in a user message with some extra guidance #} {#- Now deal with all other messages #}
{%- if tools_in_user_message and tools is not none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- if messages[0]['content'] is string %}
{%- set first_user_message = messages[0]['content']|trim %}
{%- else %}
{%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %}
{%- endif %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|header_start|>user<|header_end|>\n\n' -}}
{{- first_user_message}}
{{- "\nHere is a list of functions in JSON format that you can invoke:"}}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- "Should you decide to return the function call(s), put them in the format "
"of [func_name1(params_name1=params_value1, params_name2=params_value2, "
"...), ...]\nDo not include anything else when calling the tools with the "
"syntax above." }}
{%- endif %}
{%- for message in messages %} {%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} {#- Base case: messages that are not from tool role and has empty tool_call list #}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} {%- if not (message.role == 'ipython' or message.role == 'tool' or ('tool_calls' in message and message.tool_calls|length != 0 )) %}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
{%- if message['content'] is string %} {%- if message['content'] is string %}
{{- message['content'] }} {{- message['content'] }}
{%- else %} {%- else %}
...@@ -91,10 +57,12 @@ ...@@ -91,10 +57,12 @@
{%- endif %} {%- endif %}
{%- endfor %} {%- endfor %}
{%- endif %} {%- endif %}
{{- "<|eot|>" }} {{- "<|eot|>" }}
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} {#- Tool case: messages has non-empty tool_call list, must from assistant #}
{%- set tool_call = message.tool_calls[0].function %} {%- elif 'tool_calls' in message %}
{{- '<|header_start|>assistant<|header_end|>\n\n' -}} {#- assume tool_calls are always coming from assistant #}
{%- if message.role == 'assistant' %}
{{- '<|header_start|>assistant<|header_end|>\n\n' -}}
{%- if message['content'] is string %} {%- if message['content'] is string %}
{{- message['content'] }} {{- message['content'] }}
{%- else %} {%- else %}
...@@ -106,32 +74,36 @@ ...@@ -106,32 +74,36 @@
{%- endif %} {%- endif %}
{%- endfor %} {%- endfor %}
{%- endif %} {%- endif %}
{{- "[" }}
{%- for tool_call in message.tool_calls %} {%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %} {%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %} {%- set tool_call = tool_call.function %}
{%- endif %} {%- endif %}
{{- tool_call.name + '(' -}} {{- tool_call.name + '(' -}}
{%- for param in tool_call.arguments %} {%- for param in tool_call.arguments %}
{{- param + '=' -}} {{- param + '="' -}}
{{- "%s" | format(tool_call.arguments[param]) -}} {{- "%s" | format(tool_call.arguments[param]) -}}
{{- '"' -}}
{% if not loop.last %}, {% endif %} {% if not loop.last %}, {% endif %}
{%- endfor %} {%- endfor %}
{{- ')' -}} {{- ')' -}}
{% if not loop.last %}, {% endif %} {% if not loop.last %}, {% endif %}
{%- endfor %} {%- endfor %}
{{- "<|eom|>" }} {{- "]<|eot|>" }}
{%- endif %}
{#- Tool_response case: messages are from tool_response #}
{%- elif message.role == "tool" or message.role == "ipython" %} {%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|header_start|>ipython<|header_end|>\n\n" }} {{- "<|header_start|>ipython<|header_end|>\n\n" }}
{%- if message.content is string %} {%- if message.content is string %}
{{- message.content | tojson }} {{- message.content | tojson }}
{%- else %} {%- else %}
{%- for content in message['content'] %} {%- for content in message['content'] %}
{%- if content['type'] == 'text' %} {%- if content['type'] == 'text' %}
{{- content['text'] | tojson }} {{- content['text'] | tojson }}
{%- endif %} {%- endif %}
{%- endfor %} {%- endfor %}
{%- endif %} {%- endif %}
{{- "<|eom|>" }} {{- "<|eot|>" }}
{%- endif %} {%- endif %}
{%- endfor %} {%- endfor %}
{%- if add_generation_prompt %} {%- if add_generation_prompt %}
......
# MkDocs configuration for the vLLM documentation site.
# Nesting restored: child keys must be indented under their parent for the
# file to parse as intended.
site_name: vLLM
site_url: https://docs.vllm.ai
repo_url: https://github.com/vllm-project/vllm
exclude_docs: |
  *.inc.md
  *.template.md

theme:
  name: material
  logo: assets/logos/vllm-logo-only-light.ico
  favicon: assets/logos/vllm-logo-only-light.ico
  palette:
    # Palette toggle for automatic mode
    - media: "(prefers-color-scheme)"
      toggle:
        icon: material/brightness-auto
        name: Switch to light mode
    # Palette toggle for light mode
    - media: "(prefers-color-scheme: light)"
      scheme: default
      primary: white
      toggle:
        icon: material/brightness-7
        name: Switch to dark mode
    # Palette toggle for dark mode
    - media: "(prefers-color-scheme: dark)"
      scheme: slate
      primary: black
      toggle:
        icon: material/brightness-2
        name: Switch to system preference
  features:
    - content.code.copy
    - content.tabs.link
    - navigation.tracking
    - navigation.tabs
    - navigation.sections
    - navigation.prune
    - navigation.top
    - search.highlight
    - search.share
    - toc.follow
  custom_dir: docs/mkdocs/overrides

hooks:
  - docs/mkdocs/hooks/remove_announcement.py
  - docs/mkdocs/hooks/generate_examples.py
  - docs/mkdocs/hooks/url_schemes.py

# Required to stop api-autonav from raising an error
# https://github.com/tlambert03/mkdocs-api-autonav/issues/16
nav:
  - api

plugins:
  - meta
  - search
  - autorefs
  - awesome-nav
  # For API reference generation
  - api-autonav:
      modules: ["vllm"]
      api_root_uri: "api"
      exclude:
        - "re:vllm\\._.*" # Internal modules
        - "vllm.third_party"
        - "vllm.vllm_flash_attn"
  - mkdocstrings:
      handlers:
        python:
          options:
            show_symbol_type_heading: true
            show_symbol_type_toc: true
            filters: []
            summary:
              modules: true
            show_if_no_docstring: true
            show_signature_annotations: true
            separate_signature: true
            show_overloads: true
            signature_crossrefs: true
          inventories:
            - https://docs.python.org/3/objects.inv
            - https://typing-extensions.readthedocs.io/en/latest/objects.inv
            - https://docs.aiohttp.org/en/stable/objects.inv
            - https://pillow.readthedocs.io/en/stable/objects.inv
            - https://numpy.org/doc/stable/objects.inv
            - https://pytorch.org/docs/stable/objects.inv
            - https://psutil.readthedocs.io/en/stable/objects.inv

markdown_extensions:
  - attr_list
  - md_in_html
  - admonition
  - pymdownx.details
  # For content tabs
  - pymdownx.superfences
  - pymdownx.tabbed:
      slugify: !!python/object/apply:pymdownx.slugs.slugify
        kwds:
          case: lower
      alternate_style: true
  # For code highlighting
  - pymdownx.highlight:
      anchor_linenums: true
      line_spans: __span
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.snippets
  # For emoji and icons
  - pymdownx.emoji:
      emoji_index: !!python/name:material.extensions.emoji.twemoji
      emoji_generator: !!python/name:material.extensions.emoji.to_svg
  # For in page [TOC] (not sidebar)
  - toc:
      permalink: true
  # For math rendering
  - mdx_math:
      enable_dollar_delimiter: true

extra_css:
  - mkdocs/stylesheets/extra.css

extra_javascript:
  - mkdocs/javascript/run_llm_widget.js
  # NOTE(review): cdn.mathjax.org was retired in 2017; confirm this URL still
  # resolves or pin a versioned CDN URL instead.
  - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML

# Makes the url format end in .html rather than act as a dir
# So index.md generates as index.html and is available under URL /index.html
# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
use_directory_urls: false
...@@ -8,6 +8,7 @@ requires = [ ...@@ -8,6 +8,7 @@ requires = [
"setuptools-scm>=8.0", "setuptools-scm>=8.0",
"torch == 2.7.0", "torch == 2.7.0",
"wheel", "wheel",
"regex",
"jinja2", "jinja2",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
...@@ -35,8 +36,8 @@ dynamic = [ "version", "dependencies", "optional-dependencies"] ...@@ -35,8 +36,8 @@ dynamic = [ "version", "dependencies", "optional-dependencies"]
[project.urls] [project.urls]
Homepage="https://github.com/vllm-project/vllm" Homepage="https://github.com/vllm-project/vllm"
Documentation="https://vllm.readthedocs.io/en/latest/" Documentation="https://docs.vllm.ai/en/latest/"
Slack="http://slack.vllm.ai/" Slack="https://slack.vllm.ai/"
[project.scripts] [project.scripts]
vllm = "vllm.entrypoints.cli.main:main" vllm = "vllm.entrypoints.cli.main:main"
...@@ -56,16 +57,12 @@ ignore_patterns = [ ...@@ -56,16 +57,12 @@ ignore_patterns = [
".buildkite/**", ".buildkite/**",
"benchmarks/**", "benchmarks/**",
"build/**", "build/**",
"examples/**",
] ]
[tool.ruff] [tool.ruff]
# Allow lines to be as long as 80. # Allow lines to be as long as 80.
line-length = 80 line-length = 80
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]
...@@ -148,6 +145,7 @@ skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora ...@@ -148,6 +145,7 @@ skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora
skip_glob = [ skip_glob = [
".buildkite/*", ".buildkite/*",
"benchmarks/*", "benchmarks/*",
"examples/*",
] ]
use_parentheses = true use_parentheses = true
skip_gitignore = true skip_gitignore = true
...@@ -165,9 +163,12 @@ markers = [ ...@@ -165,9 +163,12 @@ markers = [
[tool.pymarkdown] [tool.pymarkdown]
plugins.md004.style = "sublist" # ul-style plugins.md004.style = "sublist" # ul-style
plugins.md007.indent = 4 # ul-indent
plugins.md007.start_indented = true # ul-indent
plugins.md013.enabled = false # line-length plugins.md013.enabled = false # line-length
plugins.md041.enabled = false # first-line-h1 plugins.md041.enabled = false # first-line-h1
plugins.md033.enabled = false # inline-html plugins.md033.enabled = false # inline-html
plugins.md046.enabled = false # code-block-style
plugins.md024.allow_different_nesting = true # no-duplicate-headers plugins.md024.allow_different_nesting = true # no-duplicate-headers
[tool.ty] [tool.ty]
......
...@@ -7,3 +7,4 @@ setuptools-scm>=8 ...@@ -7,3 +7,4 @@ setuptools-scm>=8
torch==2.7.0 torch==2.7.0
wheel wheel
jinja2>=3.1.6 jinja2>=3.1.6
regex
regex # Replace re for higher-performance regex matching
cachetools cachetools
psutil psutil
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
...@@ -7,7 +8,7 @@ tqdm ...@@ -7,7 +8,7 @@ tqdm
blake3 blake3
py-cpuinfo py-cpuinfo
transformers >= 4.51.1 transformers >= 4.51.1
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads. huggingface-hub[hf_xet] >= 0.32.0 # Required for Xet downloads.
tokenizers >= 0.21.1 # Required for fast incremental detokenization. tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
...@@ -40,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors ...@@ -40,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0 # vllm.tracing opentelemetry-sdk>=1.26.0 # vllm.tracing
......
...@@ -2,11 +2,12 @@ ...@@ -2,11 +2,12 @@
-r common.txt -r common.txt
# Dependencies for CPUs # Dependencies for CPUs
packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://download.pytorch.org/whl/cpu
torch==2.7.0+cpu; platform_machine == "x86_64" torch==2.7.0+cpu; platform_machine == "x86_64"
torch==2.7.0; platform_system == "Darwin" torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0.dev20250304; platform_machine == "s390x"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
...@@ -19,3 +20,7 @@ datasets # for benchmark scripts ...@@ -19,3 +20,7 @@ datasets # for benchmark scripts
# cpu cannot use triton 3.3.0 # cpu cannot use triton 3.3.0
triton==3.2.0; platform_machine == "x86_64" triton==3.2.0; platform_machine == "x86_64"
# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
sphinx==7.4.7 mkdocs
sphinx-argparse==0.5.2 mkdocs-api-autonav
sphinx-book-theme==1.1.4 mkdocs-material
sphinx-copybutton==0.5.2 mkdocstrings-python
sphinx-design==0.6.1 mkdocs-gen-files
sphinx-togglebutton==0.3.2 mkdocs-awesome-nav
myst-parser==3.0.1 # `myst-parser==4.0.1` breaks inline code in titles python-markdown-math
msgspec regex
snowballstemmer<3 # https://github.com/snowballstem/snowball/issues/229 ruff
commonmark # Required by sphinx-argparse when using :markdownhelp:
# Custom autodoc2 is necessary for faster docstring processing
# see: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33#issuecomment-2856386035
git+https://github.com/hmellor/sphinx-autodoc2.git # sphinx-autodoc2==0.5.0
# packages to install to build the documentation
cachetools
-f https://download.pytorch.org/whl/cpu
torch
\ No newline at end of file
...@@ -38,4 +38,4 @@ matplotlib # required for qwen-vl test ...@@ -38,4 +38,4 @@ matplotlib # required for qwen-vl test
# required for Multi-Modal Models Test (Standard) # required for Multi-Modal Models Test (Standard)
num2words # required for smolvlm test num2words # required for smolvlm test
pqdm pqdm
timm # required for internvl test timm # required for internvl test
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment