Commit 1591c68f authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.2

parents 09bcf00b c7f2cf2b
...@@ -2,5 +2,5 @@ ...@@ -2,5 +2,5 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for x86_64 CPUs # Dependencies for x86_64 CPUs
torch == 2.2.1+cpu torch == 2.3.0+cpu
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
...@@ -5,5 +5,5 @@ ...@@ -5,5 +5,5 @@
ray >= 2.9 ray >= 2.9
nvidia-ml-py # for pynvml package nvidia-ml-py # for pynvml package
vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
torch == 2.2.1 torch == 2.3.0
xformers == 0.0.25 # Requires PyTorch 2.2.1 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
...@@ -14,19 +14,17 @@ types-setuptools ...@@ -14,19 +14,17 @@ types-setuptools
# testing # testing
pytest pytest
tensorizer==2.9.0a0 tensorizer==2.9.0
pytest-forked pytest-forked
pytest-asyncio pytest-asyncio
pytest-rerunfailures pytest-rerunfailures
pytest-shard pytest-shard
httpx httpx
einops # required for MPT einops # required for MPT
openai
requests requests
ray ray
peft peft
awscli awscli
ai2-olmo # required for OLMo
# Benchmarking # Benchmarking
aiohttp aiohttp
......
This diff is collapsed.
...@@ -91,4 +91,6 @@ async def test_new_requests_event(): ...@@ -91,4 +91,6 @@ async def test_new_requests_event():
assert engine.engine.step_calls == old_step_calls + 1 assert engine.engine.step_calls == old_step_calls + 1
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -55,7 +55,6 @@ def test_models( ...@@ -55,7 +55,6 @@ def test_models(
) )
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model del vllm_model
print(vllm_outputs[0])
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
This diff is collapsed.
...@@ -296,6 +296,7 @@ class VllmRunner: ...@@ -296,6 +296,7 @@ class VllmRunner:
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
block_size: int = 16, block_size: int = 16,
enable_chunked_prefill: bool = False, enable_chunked_prefill: bool = False,
swap_space=4,
**kwargs, **kwargs,
) -> None: ) -> None:
self.model = LLM( self.model = LLM(
...@@ -303,7 +304,7 @@ class VllmRunner: ...@@ -303,7 +304,7 @@ class VllmRunner:
tokenizer=tokenizer_name, tokenizer=tokenizer_name,
trust_remote_code=True, trust_remote_code=True,
dtype=dtype, dtype=dtype,
swap_space=0, swap_space=swap_space,
disable_log_stats=disable_log_stats, disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len, max_model_len=max_model_len,
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment