Unverified commit 711390a9, authored by Hubert Lu, committed by GitHub

[AMD] Support Hierarchical Caching on AMD GPUs (#8236)

parent 53430588
@@ -223,7 +223,7 @@ jobs:
       fail-fast: false
       matrix:
         runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
-        part: [0, 1, 2, 3, 4, 5, 6]
+        part: [0, 1, 2, 3, 4, 5, 6, 7]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -240,7 +240,7 @@ jobs:
       - name: Run test
         timeout-minutes: 50
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 8

   unit-test-backend-2-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -336,13 +336,14 @@ jobs:
           bash scripts/ci/amd_ci_install_dependency.sh

       - name: Run test
-        timeout-minutes: 10
+        timeout-minutes: 14
         run: |
           docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
           docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
           docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
           docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
           docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py

   pr-test-amd-finish:
     if: always()
...
@@ -121,6 +121,48 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
    */
   m.def("apply_token_bitmask_inplace_cuda(Tensor logits, Tensor bitmask, Tensor? indices=None) -> ()");
   m.impl("apply_token_bitmask_inplace_cuda", &ApplyTokenBitmaskInplace);
+
+  /*
+   * From csrc/kvcacheio
+   */
+  m.def(
+      "transfer_kv_per_layer(Tensor src_k, Tensor dst_k, Tensor src_v, Tensor dst_v, Tensor src_indices, Tensor "
+      "dst_indices, int item_size, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer", torch::kCUDA, &transfer_kv_per_layer);
+  m.def(
+      "transfer_kv_per_layer_pf_lf(Tensor src_k, Tensor dst_k, Tensor src_v, Tensor dst_v, Tensor src_indices, Tensor "
+      "dst_indices, int layer_id, int item_size, int src_layout_dim, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_pf_lf", torch::kCUDA, &transfer_kv_per_layer_pf_lf);
+  m.def(
+      "transfer_kv_all_layer(Tensor src_k_layers, Tensor dst_k_layers, Tensor src_v_layers, Tensor dst_v_layers, "
+      "Tensor src_indices, Tensor dst_indices, int item_size, int num_layers, int block_quota, int "
+      "num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer", torch::kCUDA, &transfer_kv_all_layer);
+  m.def(
+      "transfer_kv_all_layer_lf_pf(Tensor src_k_layers, Tensor dst_k, Tensor src_v_layers, Tensor dst_v, "
+      "Tensor src_indices, Tensor dst_indices, int item_size, int dst_layout_dim, int num_layers, int block_quota, int "
+      "num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_lf_pf", torch::kCUDA, &transfer_kv_all_layer_lf_pf);
+  m.def(
+      "transfer_kv_per_layer_mla(Tensor src, Tensor dst, Tensor src_indices, Tensor dst_indices, int item_size, int "
+      "block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_mla", torch::kCUDA, &transfer_kv_per_layer_mla);
+  m.def(
+      "transfer_kv_per_layer_mla_pf_lf(Tensor src, Tensor dst, Tensor src_indices, Tensor dst_indices, int layer_id, "
+      "int item_size, int src_layout_dim, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_mla_pf_lf", torch::kCUDA, &transfer_kv_per_layer_mla_pf_lf);
+  m.def(
+      "transfer_kv_all_layer_mla(Tensor src_layers, Tensor dst_layers, Tensor src_indices, Tensor dst_indices, int "
+      "item_size, int num_layers, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_mla", torch::kCUDA, &transfer_kv_all_layer_mla);
+  m.def(
+      "transfer_kv_all_layer_mla_lf_pf(Tensor src_layers, Tensor dst, Tensor src_indices, Tensor dst_indices, "
+      "int item_size, int dst_layout_dim, int num_layers, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_mla_lf_pf", torch::kCUDA, &transfer_kv_all_layer_mla_lf_pf);
+  m.def(
+      "transfer_kv_direct(Tensor[] src_layers, Tensor[] dst_layers, Tensor src_indices, Tensor dst_indices, int "
+      "page_size) -> ()");
+  m.impl("transfer_kv_direct", torch::kCUDA, &transfer_kv_direct);
 }

 REGISTER_EXTENSION(common_ops)
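
Note: each m.def() above registers a TorchScript-style schema in the sgl_kernel namespace and m.impl(..., torch::kCUDA, ...) binds the device implementation (the same path is compiled through HIP on ROCm builds), so every transfer kernel becomes callable from Python as torch.ops.sgl_kernel.<name>. A minimal sketch of inspecting them, assuming importing sgl_kernel loads the compiled common_ops extension:

# Minimal sketch: the import path is an assumption; any import that loads
# the compiled common_ops extension makes the ops below resolvable.
import torch
import sgl_kernel  # noqa: F401

for name in ("transfer_kv_per_layer", "transfer_kv_all_layer", "transfer_kv_direct"):
    print(getattr(torch.ops.sgl_kernel, name))  # one OpOverloadPacket per schema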
@@ -4,21 +4,31 @@
 #include <cstdint>

+#ifndef USE_ROCM
+#define WARP_SIZE 32
 #include "pytorch_extension_utils.h"
+#else
+#include "pytorch_extension_utils_rocm.h"
+#include "utils.h"  // WARP_SIZE
+#endif

 __device__ __forceinline__ void
 transfer_item_warp(int32_t lane_id, const void* src_addr, void* dst_addr, int64_t item_size_bytes) {
-  // todo, different chunk size
-  int total_chunks = item_size_bytes / 8;
-  const int64_t* src_8 = reinterpret_cast<const int64_t*>(src_addr);
-  int64_t* dst_8 = reinterpret_cast<int64_t*>(dst_addr);
+  const uint64_t* __restrict__ src = static_cast<const uint64_t*>(src_addr);
+  uint64_t* __restrict__ dst = static_cast<uint64_t*>(dst_addr);
+  const int total_chunks = item_size_bytes / sizeof(uint64_t);
 #pragma unroll
-  for (int j = lane_id; j < total_chunks; j += 32) {
-    const int64_t* src_addr_lane = &src_8[j];
-    int64_t* dst_addr_lane = &dst_8[j];
-    int64_t temp_val;
-    asm volatile("ld.global.nc.b64 %0, [%1];" : "=l"(temp_val) : "l"(src_addr_lane) : "memory");
-    asm volatile("st.global.cg.b64 [%0], %1;" ::"l"(dst_addr_lane), "l"(temp_val) : "memory");
+  for (int j = lane_id; j < total_chunks; j += WARP_SIZE) {
+#ifndef USE_ROCM
+    uint64_t tmp;
+    asm volatile("ld.global.nc.b64 %0,[%1];" : "=l"(tmp) : "l"(src + j) : "memory");
+    asm volatile("st.global.cg.b64 [%0],%1;" ::"l"(dst + j), "l"(tmp) : "memory");
+#else
+    uint64_t tmp = __builtin_nontemporal_load(src + j);
+    __builtin_nontemporal_store(tmp, dst + j);
+#endif
   }
 }
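
The rewritten transfer_item_warp keeps the same access pattern on both vendors: each lane copies 8-byte chunks at a stride of the warp width (32 on NVIDIA, a 64-lane wavefront on AMD CDNA, hence the WARP_SIZE macro), and on ROCm the PTX non-temporal load/store pair is replaced by clang's __builtin_nontemporal_load/__builtin_nontemporal_store. A small Python model of the per-lane chunk assignment, with illustrative sizes:

# Model of the lane-strided copy: lane j of a warp copies 8-byte chunks
# j, j + WARP_SIZE, j + 2*WARP_SIZE, ... of one KV item.
WARP_SIZE = 64  # AMD wavefront width; 32 on NVIDIA


def chunks_for_lane(lane_id: int, item_size_bytes: int, warp_size: int = WARP_SIZE):
    total_chunks = item_size_bytes // 8  # the kernel moves uint64 chunks
    return list(range(lane_id, total_chunks, warp_size))


# A 4 KiB item is 512 eight-byte chunks; lane 0 of a 64-wide wavefront
# copies chunks 0, 64, 128, ..., 448.
assert chunks_for_lane(0, 4096) == list(range(0, 512, 64))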
@@ -78,8 +88,8 @@ __global__ void transfer_kernel_impl(
     const uintptr_t* __restrict__ src_v_layer_tbl,
     const uintptr_t* __restrict__ dst_v_layer_tbl) {
   int32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int32_t lane_id = tid % 32;
-  int32_t warp_id = tid / 32;
+  int32_t lane_id = tid % WARP_SIZE;
+  int32_t warp_id = tid / WARP_SIZE;

   for (int i = 0; i < items_per_warp; ++i) {
     int64_t item_id = warp_id * items_per_warp + i;
@@ -139,7 +149,7 @@ void transfer_kv_launcher(
   const int64_t items_per_warp = div_up(num_items, block_quota * num_warps_per_block);
   const int32_t num_blocks = div_up(num_items, items_per_warp * num_warps_per_block);
   dim3 grid_dim(num_blocks, 1, 1);
-  const int32_t threads_per_block = num_warps_per_block * 32;
+  const int32_t threads_per_block = num_warps_per_block * WARP_SIZE;
   const void* src_k_ptr = src_k.defined() ? src_k.data_ptr() : nullptr;
   void* dst_k_ptr = dst_k.defined() ? dst_k.data_ptr() : nullptr;
...
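
Only WARP_SIZE changes in transfer_kv_launcher, so the grid math is untouched; what changes is threads per block. With the new ROCm default of 16 warps per block (see the Python wrappers below), 16 x 64 lanes gives 1024 threads per block, the same per-block thread count as 32 x 32 on CUDA. A sketch of the launch-shape arithmetic mirroring the div_up calls above; the input values are illustrative:

# Sketch of the launch-shape arithmetic in transfer_kv_launcher.
def div_up(a: int, b: int) -> int:
    return (a + b - 1) // b


def launch_shape(num_items: int, block_quota: int, num_warps_per_block: int, warp_size: int):
    items_per_warp = div_up(num_items, block_quota * num_warps_per_block)
    num_blocks = div_up(num_items, items_per_warp * num_warps_per_block)
    threads_per_block = num_warps_per_block * warp_size
    return items_per_warp, num_blocks, threads_per_block


# ROCm defaults: 16 warps/block x 64-lane wavefront = 1024 threads/block.
print(launch_shape(num_items=10_000, block_quota=2, num_warps_per_block=16, warp_size=64))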
@@ -3,6 +3,13 @@ from typing import List

 import torch

+
+def is_hip() -> bool:
+    return torch.version.hip is not None
+
+
+_is_hip = is_hip()
+

 def transfer_kv_per_layer(
     src_k: torch.Tensor,
     dst_k: torch.Tensor,
@@ -12,7 +19,7 @@ def transfer_kv_per_layer(
     dst_indices: torch.Tensor,
     item_size: int,
     block_quota: int = 2,
-    num_warps_per_block: int = 32,
+    num_warps_per_block: int = 16 if _is_hip else 32,
 ):
     torch.ops.sgl_kernel.transfer_kv_per_layer(
         src_k,
@@ -38,7 +45,7 @@ def transfer_kv_per_layer_pf_lf(
     item_size: int,
     src_layout_dim: int,
     block_quota: int = 2,
-    num_warps_per_block: int = 32,
+    num_warps_per_block: int = 16 if _is_hip else 32,
 ):
     torch.ops.sgl_kernel.transfer_kv_per_layer_pf_lf(
         src_k,
@@ -65,7 +72,7 @@ def transfer_kv_all_layer(
     item_size: int,
     num_layers: int,
     block_quota: int = 2,
-    num_warps_per_block: int = 32,
+    num_warps_per_block: int = 16 if _is_hip else 32,
 ):
     torch.ops.sgl_kernel.transfer_kv_all_layer(
         src_k_layers,
@@ -92,7 +99,7 @@ def transfer_kv_all_layer_lf_pf(
     dst_layout_dim: int,
     num_layers: int,
     block_quota: int = 2,
-    num_warps_per_block: int = 32,
+    num_warps_per_block: int = 16 if _is_hip else 32,
 ):
     torch.ops.sgl_kernel.transfer_kv_all_layer_lf_pf(
         src_k_layers,
@@ -128,7 +135,7 @@ def transfer_kv_per_layer_mla(
     dst_indices: torch.Tensor,
     item_size: int,
     block_quota: int = 2,
-    num_warps_per_block: int = 32,
+    num_warps_per_block: int = 16 if _is_hip else 32,
 ):
     torch.ops.sgl_kernel.transfer_kv_per_layer_mla(
         src,
@@ -150,7 +157,7 @@ def transfer_kv_per_layer_mla_pf_lf(
     item_size: int,
     src_layout_dim: int,
     block_quota: int = 2,
-    num_warps_per_block: int = 32,
+    num_warps_per_block: int = 16 if _is_hip else 32,
 ):
     torch.ops.sgl_kernel.transfer_kv_per_layer_mla_pf_lf(
         src,
@@ -173,7 +180,7 @@ def transfer_kv_all_layer_mla(
     item_size: int,
     num_layers: int,
     block_quota: int = 2,
-    num_warps_per_block: int = 32,
+    num_warps_per_block: int = 16 if _is_hip else 32,
 ):
     torch.ops.sgl_kernel.transfer_kv_all_layer_mla(
         src_layers,
@@ -196,7 +203,7 @@ def transfer_kv_all_layer_mla_lf_pf(
     dst_layout_dim: int,
     num_layers: int,
     block_quota: int = 2,
-    num_warps_per_block: int = 32,
+    num_warps_per_block: int = 16 if _is_hip else 32,
 ):
     torch.ops.sgl_kernel.transfer_kv_all_layer_mla_lf_pf(
         src_layers,
...
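
A hedged usage sketch of the per-layer wrapper above. The import path, the tensor shapes and dtypes, and the interpretation of item_size (treated here as bytes per item, since the C++ helper consumes item_size_bytes) are illustrative assumptions, not sglang's documented cache layout:

# Hedged usage sketch; shapes, dtypes, and index dtype are assumptions.
import torch
from sgl_kernel import transfer_kv_per_layer  # assumed import path

num_items, item_size = 1024, 4096  # 4 KiB per K (and per V) item
src_k = torch.zeros(num_items, item_size, dtype=torch.uint8, device="cuda")
src_v = torch.zeros_like(src_k)
dst_k = torch.empty_like(src_k)
dst_v = torch.empty_like(src_v)

src_indices = torch.arange(8, device="cuda", dtype=torch.int64)
dst_indices = src_indices + 512

# block_quota and num_warps_per_block fall back to the defaults above,
# i.e. 16 warps per block on ROCm builds and 32 elsewhere.
transfer_kv_per_layer(src_k, dst_k, src_v, dst_v, src_indices, dst_indices, item_size=item_size)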
@@ -49,6 +49,7 @@ sources = [
     "csrc/moe/moe_align_kernel.cu",
     "csrc/moe/moe_topk_softmax_kernels.cu",
     "csrc/speculative/eagle_utils.cu",
+    "csrc/kvcacheio/transfer.cu",
 ]

 cxx_flags = ["-O3"]
...
 import unittest
 from types import SimpleNamespace

-from sglang.srt.utils import kill_process_tree
+from sglang.srt.utils import is_hip, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
@@ -11,6 +11,8 @@ from sglang.test.test_utils import (
     popen_launch_server,
 )

+_is_hip = is_hip()
+

 class TestHiCache(CustomTestCase):
     @classmethod
@@ -26,7 +28,7 @@ class TestHiCache(CustomTestCase):
                 "--mem-fraction-static",
                 0.7,
                 "--hicache-size",
-                100,
+                100 if not _is_hip else 200,
             ],
         )
...
 import unittest
 from types import SimpleNamespace

-from sglang.srt.utils import kill_process_tree
+from sglang.srt.utils import is_hip, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MLA_MODEL_NAME_FOR_TEST,
@@ -11,6 +11,12 @@ from sglang.test.test_utils import (
     popen_launch_server,
 )

+_is_hip = is_hip()
+
+if _is_hip:
+    hicache_args = ["--hicache-size", 200]
+else:
+    hicache_args = ["--hicache-ratio", 2]

 class TestHierarchicalMLA(CustomTestCase):
     @classmethod
@@ -24,9 +30,8 @@ class TestHierarchicalMLA(CustomTestCase):
             other_args=[
                 "--trust-remote-code",
                 "--enable-hierarchical-cache",
-                "--hicache-ratio",
-                2,
-            ],
+            ]
+            + hicache_args,
         )

     @classmethod
...
 import unittest
 from types import SimpleNamespace

-from sglang.srt.utils import kill_process_tree
+from sglang.srt.utils import is_hip, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
@@ -11,6 +11,8 @@ from sglang.test.test_utils import (
     popen_launch_server,
 )

+_is_hip = is_hip()
+

 class TestHiCache(CustomTestCase):
     @classmethod
@@ -26,7 +28,7 @@ class TestHiCache(CustomTestCase):
                 "--mem-fraction-static",
                 0.7,
                 "--hicache-size",
-                100,
+                100 if not _is_hip else 200,
                 "--page-size",
                 "64",
                 "--hicache-storage-backend",
...
@@ -162,6 +162,9 @@ suites = {
 # Add AMD tests
 suite_amd = {
     "per-commit-amd": [
+        TestFile("hicache/test_hicache.py", 116),
+        TestFile("hicache/test_hicache_mla.py", 127),
+        TestFile("hicache/test_hicache_storage.py", 127),
         TestFile("lora/test_lora.py", 200),
         TestFile("lora/test_lora_eviction.py", 200),
         TestFile("lora/test_lora_backend.py", 99),
...
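
Each TestFile entry carries an estimated runtime in seconds, which is what the --auto-partition-size 8 bump in the workflow change above balances across shards. The actual algorithm in run_suite.py is not shown in this diff, so the greedy longest-processing-time split below is only an illustrative stand-in:

# Hedged sketch of time-balanced test sharding; run_suite.py's real
# --auto-partition logic may differ. Greedy LPT over (file, seconds) pairs.
import heapq
from typing import List, Tuple


def partition(files: List[Tuple[str, int]], num_shards: int) -> List[List[str]]:
    shards = [(0, i, []) for i in range(num_shards)]  # (total_secs, shard_id, files)
    heapq.heapify(shards)
    for name, secs in sorted(files, key=lambda f: -f[1]):
        total, i, bucket = heapq.heappop(shards)  # least-loaded shard first
        bucket.append(name)
        heapq.heappush(shards, (total + secs, i, bucket))
    return [bucket for _, _, bucket in sorted(shards, key=lambda s: s[1])]


files = [
    ("hicache/test_hicache.py", 116),
    ("hicache/test_hicache_mla.py", 127),
    ("hicache/test_hicache_storage.py", 127),
    ("lora/test_lora.py", 200),
]
print(partition(files, num_shards=2))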