Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a48e3ec0
Unverified
Commit
a48e3ec0
authored
Oct 22, 2024
by
Jee Jee Li
Committed by
GitHub
Oct 22, 2024
Browse files
[CI/Build][LoRA] Temporarily fix long context failure issue (#9579)
parent
6c5af09b
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
11 deletions
+20
-11
tests/lora/test_long_context.py
tests/lora/test_long_context.py
+20
-11
No files found.
tests/lora/test_long_context.py
View file @
a48e3ec0
...
@@ -28,9 +28,15 @@ sampling_params = SamplingParams(
...
@@ -28,9 +28,15 @@ sampling_params = SamplingParams(
def
_create_lora_request
(
lora_id
,
long_context_infos
):
def
_create_lora_request
(
lora_id
,
long_context_infos
):
context_len
=
long_context_infos
[
lora_id
][
"context_length"
]
context_len
=
long_context_infos
[
lora_id
][
"context_length"
]
scaling_factor
=
context_len_to_scaling_factor
[
context_len
]
scaling_factor
=
context_len_to_scaling_factor
[
context_len
]
return
LoRARequest
(
context_len
,
lora_id
,
return
LoRARequest
(
long_context_infos
[
lora_id
][
"lora"
],
None
,
# There are 2 LoRAs for 16K, we need to add lora_id to indicate
4096
*
scaling_factor
)
# they are different LoRAs.
context_len
+
str
(
lora_id
),
lora_id
,
long_context_infos
[
lora_id
][
"lora"
],
None
,
4096
*
scaling_factor
,
)
def
evaluate_json_response
(
model_response
,
golden_response
):
def
evaluate_json_response
(
model_response
,
golden_response
):
...
@@ -108,13 +114,16 @@ def lora_llm(long_context_infos):
...
@@ -108,13 +114,16 @@ def lora_llm(long_context_infos):
for
info
in
long_context_infos
.
values
()
for
info
in
long_context_infos
.
values
()
]
]
llm
=
vllm
.
LLM
(
"meta-llama/Llama-2-13b-chat-hf"
,
llm
=
vllm
.
LLM
(
"meta-llama/Llama-2-13b-chat-hf"
,
enable_lora
=
True
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
2
,
max_loras
=
2
,
long_lora_scaling_factors
=
tuple
(
scaling_factors
),
long_lora_scaling_factors
=
tuple
(
scaling_factors
),
max_num_batched_tokens
=
4096
*
8
,
max_num_batched_tokens
=
4096
*
8
,
tensor_parallel_size
=
4
,
tensor_parallel_size
=
4
,
# FIXME enable async output processor
disable_async_output_proc
=
True
,
distributed_executor_backend
=
"mp"
)
distributed_executor_backend
=
"mp"
)
yield
llm
yield
llm
del
llm
del
llm
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment