"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "d2643128f7741b937435b00fecde7d6b2e351d0c"
Commit 18c811ba authored by zhuwenwen
Browse files

Remove the flashinfer test cases and change the test dtype from float to half

parent 520d727f
...@@ -12,7 +12,7 @@ MODELS = ["facebook/opt-125m"] ...@@ -12,7 +12,7 @@ MODELS = ["facebook/opt-125m"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", @pytest.mark.parametrize("dtype",
["float"]) # needed for comparing logprobs with HF ["half"]) # needed for comparing logprobs with HF
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
@pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size @pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size
@pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("detokenize", [True, False])
......
...@@ -43,7 +43,8 @@ def mock_causal_accepted_tensor( ...@@ -43,7 +43,8 @@ def mock_causal_accepted_tensor(
"which_tokens_accepted", "which_tokens_accepted",
["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False]) # @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_correct_output_format(which_tokens_accepted: str, seed: int, def test_correct_output_format(which_tokens_accepted: str, seed: int,
device: str, use_flashinfer: bool): device: str, use_flashinfer: bool):
...@@ -127,7 +128,8 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int, ...@@ -127,7 +128,8 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", list(range(1, 32))) @pytest.mark.parametrize("batch_size", list(range(1, 32)))
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False]) # @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
device: str, use_flashinfer: bool): device: str, use_flashinfer: bool):
...@@ -159,7 +161,8 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, ...@@ -159,7 +161,8 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
@pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) @pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
@pytest.mark.parametrize("n_rep", [100]) @pytest.mark.parametrize("n_rep", [100])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False]) # @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
frac_seeded: float, n_rep: int, device: str, frac_seeded: float, n_rep: int, device: str,
...@@ -258,7 +261,8 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int, ...@@ -258,7 +261,8 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
@pytest.mark.parametrize("which_token_ids", @pytest.mark.parametrize("which_token_ids",
["bonus_token_ids", "draft_token_ids"]) ["bonus_token_ids", "draft_token_ids"])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("use_flashinfer", [True, False]) # @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_raises_when_vocab_oob(above_or_below_vocab_range: str, def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
which_token_ids: str, device: str, which_token_ids: str, device: str,
...@@ -310,7 +314,8 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, ...@@ -310,7 +314,8 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
@pytest.mark.parametrize("seed", list(range(5))) @pytest.mark.parametrize("seed", list(range(5)))
@pytest.mark.parametrize("use_flashinfer", [True, False]) # @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_rejection_sampling_approximates_target_distribution( def test_rejection_sampling_approximates_target_distribution(
seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool): seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment