Unverified Commit d39899e8 authored by Yineng Zhang, committed by GitHub

upgrade flashinfer v0.2.0.post2 (#3288)


Co-authored-by: pankajroark <pankajroark@users.noreply.github.com>
parent 70817a7e
@@ -37,7 +37,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -60,7 +60,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -84,7 +84,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -121,7 +121,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -165,7 +165,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -196,7 +196,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -234,7 +234,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
@@ -258,7 +258,7 @@ jobs:
       - name: Install dependencies
         env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
+          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer' }}
         run: |
           bash scripts/ci_install_dependency.sh
...
@@ -26,7 +26,7 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]", "cuda-python",
     "sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
-    "flashinfer==0.1.6", "outlines>=0.0.44,<0.1.0"
+    "flashinfer_python>=0.2.0.post2", "outlines>=0.0.44,<0.1.0"
 ]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
...
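With flashinfer 0.2 the distribution is published as flashinfer_python rather than flashinfer, so the srt extra above pins the new name. A minimal sketch (not part of the commit) for confirming the pin resolves after installing the extra:

    # Hypothetical local check; assumes the srt extra has been installed.
    from importlib.metadata import PackageNotFoundError, version

    try:
        # Expect a version >= 0.2.0.post2 to satisfy the new requirement.
        print("flashinfer_python", version("flashinfer_python"))
    except PackageNotFoundError:
        print("flashinfer_python is not installed")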
@@ -316,8 +316,8 @@ def _set_envs_and_config(server_args: ServerArgs):
     # Check flashinfer version
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
-            "flashinfer",
-            "0.1.6",
+            "flashinfer_python",
+            "0.2.0.post2",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
...
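The runtime gate above now checks the renamed package against 0.2.0.post2. assert_pkg_version is sglang's own helper; a rough equivalent (sketch only, assuming the packaging library is available) looks like:

    # Sketch approximating sglang's assert_pkg_version for this check.
    from importlib.metadata import version
    from packaging.version import Version

    if Version(version("flashinfer_python")) < Version("0.2.0.post2"):
        raise RuntimeError(
            "Please uninstall the old version and reinstall the latest version "
            "by following the instructions at https://docs.flashinfer.ai/installation.html."
        )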
@@ -149,6 +149,7 @@ class FlashInferAttnBackend(AttentionBackend):
                 BatchPrefillWithPagedKVCacheWrapper(
                     self.workspace_buffer,
                     "NHD",
+                    backend="fa2",
                 )
             )
             self.prefill_wrappers_verify.append(
@@ -313,7 +314,7 @@ class FlashInferAttnBackend(AttentionBackend):
                     paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
                     paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
                     custom_mask_buf=self.cuda_graph_custom_mask,
-                    qk_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
+                    mask_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
                 )
             )
         seq_lens_sum = seq_lens.sum().item()
@@ -1155,41 +1156,24 @@ def fast_decode_plan(
         self.last_page_len = torch.ones(32768, dtype=torch.int32)
     empty_q_data = self.empty_q_data
     empty_kv_cache = self.empty_kv_cache
-    if self.use_tensor_cores:
-        if not self.is_cuda_graph_enabled:
-            # when not using cudagraph, we need to create the indptr buffer, otherwise
-            # the buffer is already created during initialization
-            self._qo_indptr_buf = torch.arange(
-                batch_size + 1, dtype=torch.int32, device=indptr.device
-            )
-        self._wrapper.plan(
-            self._float_workspace_buffer,
-            self._int_workspace_buffer,
-            self._qo_indptr_buf,
-            indptr,
-            batch_size,
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            page_size,
-            empty_q_data,
-        )
-    else:
-        self._wrapper.plan(
-            self._float_workspace_buffer,
-            self._int_workspace_buffer,
-            indptr,
-            self.last_page_len,
-            batch_size,
-            num_qo_heads,
-            num_kv_heads,
-            head_dim,
-            page_size,
-            PosEncodingMode[pos_encoding_mode].value,
-            logits_soft_cap,
-            empty_q_data,
-            empty_kv_cache,
-        )
+    stream = torch.cuda.current_stream()
+    self._cached_module.plan(
+        self._float_workspace_buffer,
+        self._int_workspace_buffer,
+        self._pin_memory_int_workspace_buffer,
+        indptr.to("cpu"),
+        batch_size,
+        num_qo_heads,
+        num_kv_heads,
+        page_size,
+        self.is_cuda_graph_enabled,
+        window_left,
+        logits_soft_cap,
+        head_dim,
+        empty_q_data,
+        empty_kv_cache,
+        stream.cuda_stream,
+    )
     self._pos_encoding_mode = pos_encoding_mode
     self._window_left = window_left
     self._logits_soft_cap = logits_soft_cap
...
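Two flashinfer 0.2 API shifts show up in this file: the prefill wrapper gained a backend argument (here "fa2" keeps the FlashAttention-2 code path), and the low-level plan entry point is handed the raw handle of the current CUDA stream. A standalone sketch, assuming a CUDA build with flashinfer 0.2 installed:

    # Sketch only; the workspace size and device are illustrative.
    import torch
    from flashinfer import BatchPrefillWithPagedKVCacheWrapper

    workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
    wrapper = BatchPrefillWithPagedKVCacheWrapper(workspace, "NHD", backend="fa2")

    # `cuda_stream` exposes the underlying cudaStream_t as an integer, which
    # is what the plan() call in the diff above passes through.
    stream_handle = torch.cuda.current_stream().cuda_stream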
@@ -69,6 +69,7 @@ class EagleDraftInput:
         accept_length_cpu = batch.spec_info.accept_length_cpu
         batch.extend_lens = [x + 1 for x in accept_length_cpu]
         batch.seq_lens = batch.spec_info.seq_lens_for_draft_extend
+        batch.req_pool_indices = batch.spec_info.req_pool_indices_for_draft_extend
         seq_lens_cpu = batch.seq_lens.tolist()
         pt = 0
@@ -353,8 +354,12 @@ class EagleVerifyInput:
             ]
             if has_finished:
                 draft_input.seq_lens_for_draft_extend = batch.seq_lens[unfinished_index]
+                draft_input.req_pool_indices_for_draft_extend = batch.req_pool_indices[
+                    unfinished_index
+                ]
             else:
                 draft_input.seq_lens_for_draft_extend = batch.seq_lens
+                draft_input.req_pool_indices_for_draft_extend = batch.req_pool_indices
         logits_output.next_token_logits = logits_output.next_token_logits[accept_index]
         return (
...
@@ -269,6 +269,7 @@ class EAGLEWorker(TpModelWorker):
     def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
         seq_lens_backup = batch.seq_lens
+        req_pool_indices_backup = batch.req_pool_indices
         self._set_mem_pool(batch, self.model_runner)
         batch.forward_mode = ForwardMode.DRAFT_EXTEND
@@ -284,6 +285,7 @@ class EAGLEWorker(TpModelWorker):
         # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
         batch.forward_mode = ForwardMode.DECODE
         batch.seq_lens = seq_lens_backup
+        batch.req_pool_indices = req_pool_indices_backup

     def capture_for_decode(
         self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
...
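The worker change extends the existing backup/mutate/restore pattern to req_pool_indices, since the draft-extend pass may overwrite both fields. Sketched generically (names hypothetical, not part of the commit):

    # `body` stands in for the draft-extend forward pass.
    def with_fields_restored(batch, body):
        seq_lens_backup = batch.seq_lens
        req_pool_indices_backup = batch.req_pool_indices
        try:
            body(batch)  # may overwrite seq_lens / req_pool_indices
        finally:
            batch.seq_lens = seq_lens_backup
            batch.req_pool_indices = req_pool_indices_backup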
@@ -4,16 +4,17 @@ set -euxo pipefail
 # Install the dependency in CI.

 # Use repo from environment variable, passed from GitHub Actions
-FLASHINFER_REPO="${FLASHINFER_REPO:-https://flashinfer.ai/whl/cu124/torch2.4/flashinfer}"
+FLASHINFER_REPO="${FLASHINFER_REPO:-https://flashinfer.ai/whl/cu124/torch2.5/flashinfer}"

 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 bash "${SCRIPT_DIR}/killall_sglang.sh"

 pip install --upgrade pip
-pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
+pip uninstall flashinfer -y
+pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/

 # Force reinstall flashinfer and torch_memory_saver
-pip install flashinfer==0.1.6 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
+pip install flashinfer_python==0.2.0.post2 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
 pip install torch_memory_saver --force-reinstall

 pip install transformers==4.45.2 sentence_transformers accelerate peft
...
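The script explicitly uninstalls the old flashinfer distribution before force-reinstalling the renamed one, because pip treats the two names as unrelated packages and would otherwise leave both on disk. A sanity check (sketch, not part of the script) to run afterwards:

    # Confirms the old distribution is gone and the renamed one is present.
    from importlib.metadata import distributions

    names = {dist.metadata["Name"] for dist in distributions()}
    assert "flashinfer" not in names, "old flashinfer wheel still installed"
    assert "flashinfer_python" in names, "flashinfer_python missing"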
@@ -52,7 +52,6 @@ suites = {
         "test_vision_llm.py",
         "test_vision_openai_server.py",
         "test_w8a8_quantization.py",
-        "test_fp8_kvcache.py",
         "test_fp8_kernel.py",
     ],
     "nightly": [
...