Commit a810671a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori

parents 86b5aefe 6a09612b
......@@ -142,7 +142,7 @@ typedef struct {
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
int prob_n, int prob_k, int num_bits, int group_size,
bool has_act_order, bool is_k_full) {
bool has_act_order, bool is_k_full, int stages) {
bool cache_scales_chunk = has_act_order && !is_k_full;
int tb_n = th_config.thread_n;
......@@ -160,13 +160,13 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
if (cache_scales_chunk) {
int load_groups =
tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K
tb_groups * stages * 2; // Chunk size is 2x pipeline over dim K
load_groups = max(load_groups, 32); // We load at least 32 scale groups
return load_groups * tb_n * 2;
} else {
int tb_scales = tb_groups * tb_n * 2;
return tb_scales * pipe_stages;
return tb_scales * stages;
}
}
......@@ -174,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
int thread_m_blocks, int prob_m, int prob_n,
int prob_k, int num_bits, int group_size,
bool has_act_order, bool is_k_full, int has_zp,
int is_zp_float, bool is_a_8bit) {
int is_zp_float, bool is_a_8bit, int stages) {
int pack_factor = 32 / num_bits;
// Get B size
......@@ -185,8 +185,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
// shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
int sh_block_meta_size = tb_m * 16;
int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4;
int sh_red_size = tb_m * (tb_n + 8) * 2;
int sh_bias_size = tb_n * 2;
int tmp_size =
......@@ -195,8 +195,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
int sh_s_size =
get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
group_size, has_act_order, is_k_full);
int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
group_size, has_act_order, is_k_full, stages);
int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0;
int sh_zp_size = 0;
if (has_zp) {
if (is_zp_float)
......@@ -217,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
int thread_m_blocks, int prob_m, int prob_n, int prob_k,
int num_bits, int group_size, bool has_act_order,
bool is_k_full, int has_zp, int is_zp_float,
int max_shared_mem, bool is_a_8bit) {
bool is_a_8bit, int stages, int max_shared_mem) {
// Sanity
if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
th_config.num_threads == -1) {
......@@ -243,7 +243,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
int cache_size =
get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
prob_n, prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, is_a_8bit);
is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
return cache_size <= max_shared_mem;
}
......@@ -252,7 +252,7 @@ MarlinFuncPtr get_marlin_kernel(
const vllm::ScalarType c_type, const vllm::ScalarType s_type,
int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
int threads, bool is_zp_float) {
int threads, bool is_zp_float, int stages) {
int num_bits = b_type.size_bits();
auto kernel = MarlinDefault;
......@@ -266,8 +266,8 @@ exec_config_t determine_exec_config(
const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
bool is_a_8bit) {
bool is_k_full, bool has_zp, bool is_zp_float, bool is_a_8bit, int stages,
int max_shared_mem, int sms) {
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
thread_config_t* thread_configs = thread_m_blocks > 1
? large_batch_thread_configs
......@@ -284,15 +284,15 @@ exec_config_t determine_exec_config(
if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
prob_n, prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
is_a_8bit)) {
is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
max_shared_mem - 512)) {
continue;
}
int cache_size = get_kernel_cache_size(
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
is_a_8bit);
is_a_8bit, stages);
int group_blocks = 0;
if (!has_act_order) {
......@@ -303,7 +303,7 @@ exec_config_t determine_exec_config(
get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
th_config.thread_n / 16, th_config.thread_k / 16,
m_block_size_8, has_act_order, has_zp, group_blocks,
th_config.num_threads, is_zp_float);
th_config.num_threads, is_zp_float, stages);
if (kernel == MarlinDefault) continue;
......@@ -433,8 +433,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
dev);
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
dev);
TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
"marlin kernel only support Ampere or newer GPUs.");
TORCH_CHECK(major_capability * 10 + minor_capability >= 75,
"marlin kernel only support Turing or newer GPUs.");
int stages = 4;
if (major_capability == 7 && minor_capability == 5) {
stages = 2;
TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8,
"Turing only support FP16 or INT8 activation.");
}
if (a_type == vllm::kFE4M3fn) {
TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
"FP8 only support Ada Lovelace or newer GPUs.");
......@@ -461,8 +467,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
exec_cfg = determine_exec_config(
a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
is_a_8bit);
has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
max_shared_mem, sms);
thread_tfg = exec_cfg.tb_cfg;
}
......@@ -479,7 +485,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
prob_m, prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float,
max_shared_mem, is_a_8bit),
is_a_8bit, stages, max_shared_mem),
"Invalid thread config: thread_m_blocks = ", thread_m_blocks,
", thread_k = ", thread_tfg.thread_k,
", thread_n = ", thread_tfg.thread_n,
......@@ -493,12 +499,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
int sh_cache_size =
get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
prob_n, prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, is_a_8bit);
is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
auto kernel = get_marlin_kernel(
a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
num_threads, is_zp_float);
num_threads, is_zp_float, stages);
if (kernel == MarlinDefault) {
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
......
sm*_kernel_*.cu
kernel_selector.h
kernel_*.cu
......@@ -67,7 +67,7 @@ where `scale_factor * multiplier` can be computed at weight loading.
namespace MARLIN_NAMESPACE_NAME {
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 750
// Lookup-table based 3-input logical operation; explicitly used for
// dequantization as the compiler does not seem to automatically recognize it in
// all cases.
......
......@@ -10,6 +10,8 @@ import jinja2
ARCHS = []
SUPPORT_FP8 = False
SUPPORT_SM75 = False
SUPPORT_SM80 = False
for arch in sys.argv[1].split(","):
arch = arch[: arch.index(".") + 2].replace(".", "")
arch = int(arch)
......@@ -19,6 +21,10 @@ for arch in sys.argv[1].split(","):
# with FP16 MMA, so it cannot achieve any acceleration.
if arch in [89, 120]:
SUPPORT_FP8 = True
if arch >= 80:
SUPPORT_SM80 = True
if arch == 75:
SUPPORT_SM75 = True
FILE_HEAD_COMMENT = """
// auto generated by generate_kernels.py
......@@ -166,6 +172,7 @@ def remove_old_kernels():
def generate_new_kernels():
result_dict = {}
sm_75_result_dict = {}
for quant_config in QUANT_CONFIGS:
c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
......@@ -184,6 +191,8 @@ def generate_new_kernels():
s_type = quant_config.get("s_type", c_type)
if (a_type, b_type, c_type) not in result_dict:
result_dict[(a_type, b_type, c_type)] = []
if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16":
sm_75_result_dict[(a_type, b_type, c_type)] = []
for group_blocks, m_blocks, thread_configs in itertools.product(
all_group_blocks, all_m_blocks, all_thread_configs
......@@ -207,78 +216,89 @@ def generate_new_kernels():
"thread_k_blocks": thread_k // 16,
"thread_n_blocks": thread_n // 16,
"m_block_size_8": "true" if m_blocks == 0.5 else "false",
"stages": "pipe_stages",
"stages": 4,
"group_blocks": group_blocks,
"is_zp_float": "true" if is_zp_float else "false",
}
result_dict[(a_type, b_type, c_type)].append(config)
if SUPPORT_SM80:
result_dict[(a_type, b_type, c_type)].append(config)
if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75:
config_sm75 = config.copy()
config_sm75["stages"] = 2
sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75)
kernel_selector_str = FILE_HEAD_COMMENT
for (a_type, b_type, c_type), config_list in result_dict.items():
all_template_str_list = []
for config in config_list:
s_type = config["s_type"]
template_str = jinja2.Template(TEMPLATE).render(
a_type_id=f"vllm::{a_type}.id()",
b_type_id=f"vllm::{b_type}.id()",
c_type_id=f"vllm::{c_type}.id()",
s_type_id=f"vllm::{s_type}.id()",
**config,
)
all_template_str_list.append(template_str)
conditions = [
f"a_type == vllm::{a_type}",
f"b_type == vllm::{b_type}",
f"c_type == vllm::{c_type}",
f"s_type == vllm::{s_type}",
f"threads == {config['threads']}",
f"thread_m_blocks == {config['thread_m_blocks']}",
f"thread_n_blocks == {config['thread_n_blocks']}",
f"thread_k_blocks == {config['thread_k_blocks']}",
f"m_block_size_8 == {config['m_block_size_8']}",
f"group_blocks == {config['group_blocks']}",
f"is_zp_float == {config['is_zp_float']}",
]
conditions = " && ".join(conditions)
if kernel_selector_str == FILE_HEAD_COMMENT:
kernel_selector_str += f"if ({conditions})\n kernel = "
else:
kernel_selector_str += f"else if ({conditions})\n kernel = "
kernel_template2 = (
"Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
"{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
"{{is_zp_float}}>;"
)
kernel_selector_str += (
jinja2.Template(kernel_template2).render(
for result_dict_tmp in [result_dict, sm_75_result_dict]:
for (a_type, b_type, c_type), config_list in result_dict_tmp.items():
all_template_str_list = []
if not config_list:
continue
for config in config_list:
s_type = config["s_type"]
template_str = jinja2.Template(TEMPLATE).render(
a_type_id=f"vllm::{a_type}.id()",
b_type_id=f"vllm::{b_type}.id()",
c_type_id=f"vllm::{c_type}.id()",
s_type_id=f"vllm::{s_type}.id()",
**config,
)
+ "\n"
)
all_template_str_list.append(template_str)
conditions = [
f"a_type == vllm::{a_type}",
f"b_type == vllm::{b_type}",
f"c_type == vllm::{c_type}",
f"s_type == vllm::{s_type}",
f"threads == {config['threads']}",
f"thread_m_blocks == {config['thread_m_blocks']}",
f"thread_n_blocks == {config['thread_n_blocks']}",
f"thread_k_blocks == {config['thread_k_blocks']}",
f"m_block_size_8 == {config['m_block_size_8']}",
f"stages == {config['stages']}",
f"group_blocks == {config['group_blocks']}",
f"is_zp_float == {config['is_zp_float']}",
]
conditions = " && ".join(conditions)
if kernel_selector_str == FILE_HEAD_COMMENT:
kernel_selector_str += f"if ({conditions})\n kernel = "
else:
kernel_selector_str += f"else if ({conditions})\n kernel = "
kernel_template2 = (
"Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
"{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
"{{is_zp_float}}>;"
)
file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
if a_type == "kFE4M3fn":
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
else:
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
kernel_selector_str += (
jinja2.Template(kernel_template2).render(
a_type_id=f"vllm::{a_type}.id()",
b_type_id=f"vllm::{b_type}.id()",
c_type_id=f"vllm::{c_type}.id()",
s_type_id=f"vllm::{s_type}.id()",
**config,
)
+ "\n"
)
file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
if a_type == "kFE4M3fn":
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
elif result_dict_tmp is sm_75_result_dict:
filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
else:
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
filename = filename.lower()
filename = filename.lower()
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content)
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content)
if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
kernel_selector_str += (
......
......@@ -37,7 +37,7 @@ __global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
int const* __restrict__ perm_int_ptr,
......@@ -148,7 +148,7 @@ typedef struct {
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
int prob_n, int prob_k, int num_bits, int group_size,
bool has_act_order, bool is_k_full) {
bool has_act_order, bool is_k_full, int stages) {
bool cache_scales_chunk = has_act_order && !is_k_full;
int tb_n = th_config.thread_n;
......@@ -166,28 +166,29 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
if (cache_scales_chunk) {
int load_groups =
tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K
tb_groups * stages * 2; // Chunk size is 2x pipeline over dim K
load_groups = max(load_groups, 32); // We load at least 32 scale groups
return load_groups * tb_n * 2;
} else {
int tb_scales = tb_groups * tb_n * 2;
return tb_scales * pipe_stages;
return tb_scales * stages;
}
}
int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
int prob_m, int prob_n, int prob_k, int num_bits,
int group_size, bool has_act_order, bool is_k_full,
int has_zp, int is_zp_float) {
int has_zp, bool is_zp_float, bool is_a_8bit,
int stages) {
int pack_factor = 32 / num_bits;
// Get B size
int tb_k = th_config.thread_k;
int tb_n = th_config.thread_n;
int tb_m = thread_m_blocks * 16;
int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4;
int sh_red_size = tb_m * (tb_n + 8) * 2;
int sh_bias_size = tb_n * 2;
int tmp_size =
......@@ -196,8 +197,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
int sh_s_size =
get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
group_size, has_act_order, is_k_full);
int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
group_size, has_act_order, is_k_full, stages);
int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0;
int sh_zp_size = 0;
if (has_zp) {
if (is_zp_float)
......@@ -217,7 +218,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
int prob_m, int prob_n, int prob_k, int num_bits,
int group_size, bool has_act_order, bool is_k_full,
int has_zp, int is_zp_float, int max_shared_mem) {
int has_zp, bool is_zp_float, bool is_a_8bit, int stages,
int max_shared_mem) {
// Sanity
if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
th_config.num_threads == -1) {
......@@ -242,7 +244,7 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
// Check that pipeline fits into cache
int cache_size = get_kernel_cache_size(
th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float);
has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
return cache_size <= max_shared_mem;
}
......@@ -251,7 +253,7 @@ MarlinFuncPtr get_marlin_kernel(
const vllm::ScalarType c_type, const vllm::ScalarType s_type,
int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
int threads, bool is_zp_float) {
int threads, bool is_zp_float, int stages) {
int num_bits = b_type.size_bits();
auto kernel = MarlinDefault;
......@@ -265,7 +267,8 @@ exec_config_t determine_exec_config(
const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
int prob_n, int prob_k, int thread_m_blocks, bool m_block_size_8,
int num_bits, int group_size, bool has_act_order, bool is_k_full,
bool has_zp, bool is_zp_float, int max_shared_mem, int sms) {
bool has_zp, bool is_zp_float, int is_a_8bit, int stages,
int max_shared_mem, int sms) {
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
thread_config_t* thread_configs = thread_m_blocks > 1
? large_batch_thread_configs
......@@ -280,13 +283,15 @@ exec_config_t determine_exec_config(
if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp,
is_zp_float, max_shared_mem - 512)) {
is_zp_float, is_a_8bit, stages,
max_shared_mem - 512)) {
continue;
}
int cache_size = get_kernel_cache_size(
th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits,
group_size, has_act_order, is_k_full, has_zp, is_zp_float);
int cache_size = get_kernel_cache_size(th_config, thread_m_blocks, prob_m,
prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp,
is_zp_float, is_a_8bit, stages);
int group_blocks = 0;
if (!has_act_order) {
......@@ -297,14 +302,10 @@ exec_config_t determine_exec_config(
get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
th_config.thread_n / 16, th_config.thread_k / 16,
m_block_size_8, has_act_order, has_zp, group_blocks,
th_config.num_threads, is_zp_float);
th_config.num_threads, is_zp_float, stages);
if (kernel == MarlinDefault) continue;
// int m_tiles = div_ceil(prob_m, thread_m_blocks * 16);
// int n_tiles = prob_n / th_config.thread_n;
// int k_tiles = prob_k / th_config.thread_k;
return {1, th_config};
}
......@@ -321,6 +322,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
int group_size, int dev, cudaStream_t stream, int thread_k_init,
int thread_n_init, int sms, bool use_atomic_add,
bool use_fp32_reduce, bool is_zp_float) {
bool is_a_8bit = a_type.size_bits() == 8;
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
", ", prob_n, ", ", prob_k, "]");
......@@ -389,8 +391,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
dev);
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
dev);
TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
"marlin kernel only support Ampere or newer GPUs.");
TORCH_CHECK(major_capability * 10 + minor_capability >= 75,
"marlin kernel only support Turing or newer GPUs.");
int stages = 4;
if (major_capability == 7 && minor_capability == 5) {
stages = 2;
TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8,
"Turing only support FP16 or INT8 activation.");
}
if (a_type == vllm::kFE4M3fn) {
TORCH_CHECK(
major_capability * 10 + minor_capability == 89 ||
......@@ -431,7 +439,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
exec_cfg = determine_exec_config(
a_type, b_type, c_type, s_type, prob_m_split, prob_n, prob_k,
thread_m_blocks, m_block_size_8, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, max_shared_mem, sms);
is_k_full, has_zp, is_zp_float, is_a_8bit, stages, max_shared_mem,
sms);
thread_tfg = exec_cfg.tb_cfg;
if (thread_tfg.thread_n != -1) {
if (prob_n / thread_tfg.thread_n *
......@@ -440,7 +449,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
if (is_valid_config({128, 64, 128}, thread_m_blocks, prob_m_split,
prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float,
max_shared_mem_new)) {
is_a_8bit, stages, max_shared_mem_new)) {
thread_tfg = {128, 64, 128};
exec_cfg = {1, thread_tfg};
}
......@@ -466,7 +475,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
TORCH_CHECK(
is_valid_config(thread_tfg, thread_m_blocks, prob_m_split, prob_n,
prob_k, num_bits, group_size, has_act_order, is_k_full,
has_zp, is_zp_float, max_shared_mem_new),
has_zp, is_zp_float, is_a_8bit, stages,
max_shared_mem_new),
"Invalid thread config: thread_m_blocks = ", thread_m_blocks,
", thread_k = ", thread_tfg.thread_k,
", thread_n = ", thread_tfg.thread_n,
......@@ -475,12 +485,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
", prob_m_split = ", prob_m_split, ", group_size = ", group_size,
", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
", max_shared_mem_new = ", max_shared_mem_new);
", stages = ", stages, ", max_shared_mem_new = ", max_shared_mem_new);
auto kernel = get_marlin_kernel(
a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
num_threads, is_zp_float);
num_threads, is_zp_float, stages);
if (kernel == MarlinDefault) {
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
......
#pragma once
#include <torch/all.h>
#ifndef _marlin_cuh
#define _marlin_cuh
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
#ifndef MARLIN_NAMESPACE_NAME
#define MARLIN_NAMESPACE_NAME marlin
#endif
#ifndef MARLIN_NAMESPACE_NAME
#define MARLIN_NAMESPACE_NAME marlin
#endif
namespace MARLIN_NAMESPACE_NAME {
......@@ -51,9 +53,51 @@ using I4 = Vec<int, 4>;
constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
__device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
if (pred) {
reinterpret_cast<int32_t*>(smem_ptr)[0] =
reinterpret_cast<const int32_t*>(glob_ptr)[0];
}
}
__device__ inline void cp_async2_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
if (pred) {
reinterpret_cast<int64_t*>(smem_ptr)[0] =
reinterpret_cast<const int64_t*>(glob_ptr)[0];
}
}
__device__ inline void cp_async4_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
if (pred) {
reinterpret_cast<int4*>(smem_ptr)[0] =
reinterpret_cast<const int4*>(glob_ptr)[0];
}
}
__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
if (pred) {
reinterpret_cast<int4*>(smem_ptr)[0] =
reinterpret_cast<const int4*>(glob_ptr)[0];
}
}
__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
reinterpret_cast<int4*>(smem_ptr)[0] =
reinterpret_cast<const int4*>(glob_ptr)[0];
}
__device__ inline void cp_async_fence() {}
template <int n>
__device__ inline void cp_async_wait() {}
#else
__device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
......@@ -126,6 +170,8 @@ __device__ inline void cp_async_wait() {
asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}
#endif
#endif
} // namespace MARLIN_NAMESPACE_NAME
#endif
\ No newline at end of file
#include "marlin_dtypes.cuh"
namespace MARLIN_NAMESPACE_NAME {
// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
// output/accumulation.
template <vllm::ScalarTypeId type_id, bool use_fp16_accum, int k_size = 16>
__device__ inline void mma(
const typename MarlinScalarType<type_id>::FragA& a_frag,
const typename MarlinScalarType<type_id>::FragB& frag_b,
typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
if constexpr (!std::is_same<scalar_t, half>::value || k_size != 16) {
static_assert(!use_fp16_accum);
}
if constexpr (k_size == 16) {
if constexpr (std::is_same<scalar_t, half>::value && !use_fp16_accum) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(b[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
"f"(c[3]));
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[2]), "r"(a[3]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
"f"(c[3]));
#else
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
#endif
} else if constexpr (std::is_same<scalar_t, half>::value &&
use_fp16_accum) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
"{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(a[0]), "r"(a[1]), "r"(b[0]), "r"(c[0]), "r"(c[1]));
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
"{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(a[2]), "r"(a[3]), "r"(b[1]), "r"(c[0]), "r"(c[1]));
#else
uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
"{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"r"(c[0]), "r"(c[1]));
#endif
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
"f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
: "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
"r"(c[1]), "r"(c[2]), "r"(c[3]));
}
} else if (k_size == 32) {
if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
asm volatile(
"mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(a[0]), "r"(b[0]), "r"(c[0]), "r"(c[1]));
asm volatile(
"mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(c[2]), "=r"(c[3])
: "r"(a[1]), "r"(b[0]), "r"(c[2]), "r"(c[3]));
asm volatile(
"mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(a[2]), "r"(b[1]), "r"(c[0]), "r"(c[1]));
asm volatile(
"mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(c[2]), "=r"(c[3])
: "r"(a[3]), "r"(b[1]), "r"(c[2]), "r"(c[3]));
#else
asm volatile(
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
#endif
}
}
}
template <vllm::ScalarTypeId type_id, bool use_fp16_accum, int k_size = 16>
__device__ inline void mma_trans(
const typename MarlinScalarType<type_id>::FragA& a_frag,
const typename MarlinScalarType<type_id>::FragB& frag_b,
const typename MarlinScalarType<type_id>::FragB& frag_b2,
typename MarlinScalarType<type_id>::FragC& frag_c) {
const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
float* c = reinterpret_cast<float*>(&frag_c);
using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
if constexpr (!std::is_same<scalar_t, half>::value || k_size != 16) {
static_assert(!use_fp16_accum);
}
if constexpr (k_size == 16) {
if constexpr (std::is_same<scalar_t, half>::value && !use_fp16_accum) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
"f"(c[3]));
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[1]), "r"(b2[1]), "r"(a[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
"f"(c[3]));
#else
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
#endif
} else if constexpr (std::is_same<scalar_t, half>::value &&
use_fp16_accum) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
"{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]));
asm volatile(
"mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
"{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(b[1]), "r"(b2[1]), "r"(a[1]), "r"(c[0]), "r"(c[1]));
#else
uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
"{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"r"(c[0]), "r"(c[1]));
#endif
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
"f"(c[3]));
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
"r"(c[3]));
}
} else {
if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
asm volatile(
"mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(b[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]));
asm volatile(
"mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(c[2]), "=r"(c[3])
: "r"(b2[1]), "r"(a[0]), "r"(c[2]), "r"(c[3]));
asm volatile(
"mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(c[0]), "=r"(c[1])
: "r"(b[0]), "r"(a[1]), "r"(c[0]), "r"(c[1]));
asm volatile(
"mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1}, {%2}, {%3}, {%4,%5};\n"
: "=r"(c[2]), "=r"(c[3])
: "r"(b2[1]), "r"(a[1]), "r"(c[2]), "r"(c[3]));
#else
asm volatile(
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
#endif
}
}
}
} // namespace MARLIN_NAMESPACE_NAME
\ No newline at end of file
......@@ -26,6 +26,7 @@
#include "marlin.cuh"
#include "marlin_dtypes.cuh"
#include "dequant.h"
#include "marlin_mma.h"
#include "core/scalar_type.hpp"
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
......@@ -35,7 +36,7 @@
namespace MARLIN_NAMESPACE_NAME {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
template <typename scalar_t, // compute dtype, half or nv_float16
const vllm::ScalarTypeId b_type_id, // weight MarlinScalarType id
......@@ -75,137 +76,6 @@ __global__ void Marlin(
#else
// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
// output/accumulation.
template <vllm::ScalarTypeId type_id, int k_size = 16>
__device__ inline void mma(
const typename MarlinScalarType<type_id>::FragA& a_frag,
const typename MarlinScalarType<type_id>::FragB& frag_b,
typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
if constexpr (k_size == 16) {
if constexpr (std::is_same<scalar_t, half>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
"f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
: "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
"r"(c[1]), "r"(c[2]), "r"(c[3]));
}
} else if (k_size == 32) {
if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
"r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
}
}
}
template <vllm::ScalarTypeId type_id, int k_size = 16>
__device__ inline void mma_trans(
const typename MarlinScalarType<type_id>::FragA& a_frag,
const typename MarlinScalarType<type_id>::FragB& frag_b,
const typename MarlinScalarType<type_id>::FragB& frag_b2,
typename MarlinScalarType<type_id>::FragC& frag_c) {
const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
float* c = reinterpret_cast<float*>(&frag_c);
using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
if constexpr (k_size == 16) {
if constexpr (std::is_same<scalar_t, half>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
"f"(c[3]));
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
"r"(c[3]));
}
} else {
if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
float* c = reinterpret_cast<float*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
} else if constexpr (std::is_same<scalar_t, int8_t>::value) {
int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
asm volatile(
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
: "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
: "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
"r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
}
}
}
// Instruction for loading a full 16x16 matrix fragment of operand A from shared
// memory, directly in tensor core layout.
template <int count, vllm::ScalarTypeId type_id>
......@@ -415,6 +285,17 @@ __global__ void Marlin(
if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
#endif
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
// Turing TensorCore only supports fp16 and int8
if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id())
return;
#endif
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id();
#else
constexpr bool use_fp16_accum = false;
#endif
using Adtype = MarlinScalarType<a_type_id>;
using Cdtype = MarlinScalarType<c_type_id>;
const int4* A = A0;
......@@ -873,10 +754,6 @@ __global__ void Marlin(
constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
: (stages * s_sh_stage);
int4* sh_s = sh_zp + (stages * zp_sh_stage);
// shared memory reused by reduction should be smaller than
// shared memory used by weight.
static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
stages * b_sh_stage);
int4* sh_a = sh_s + sh_s_size;
// Register storage for double buffer of shared memory reads.
......@@ -1395,11 +1272,13 @@ __global__ void Marlin(
#pragma unroll
for (int i = 0; i < thread_m_blocks; i++) {
if constexpr (m_block_size_8) {
mma_trans<a_type_id>(frag_a[k2][i], frag_b0, frag_b1,
frag_c[i][j][0]);
mma_trans<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0, frag_b1,
frag_c[i][j][0]);
} else {
mma<a_type_id>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
mma<a_type_id>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0,
frag_c[i][j][0]);
mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b1,
frag_c[i][j][1]);
}
}
}
......@@ -1433,10 +1312,12 @@ __global__ void Marlin(
#pragma unroll
for (int i = 0; i < thread_m_blocks; i++) {
mma<a_type_id, 32>(frag_a[k2][i], frag_b[0],
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
mma<a_type_id, 32>(frag_a[k2][i], frag_b[1],
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
mma<a_type_id, false, 32>(
frag_a[k2][i], frag_b[0],
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
mma<a_type_id, false, 32>(
frag_a[k2][i], frag_b[1],
(group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
}
if constexpr (group_blocks != -1) {
......@@ -1956,6 +1837,21 @@ __global__ void Marlin(
// While this pattern may not be the most readable, other ways of writing
// the loop seemed to noticeably worse performance after compilation.
if (slice_iters == 0) {
// convert fp16 accum to fp32 for reduction
if constexpr (use_fp16_accum) {
#pragma unroll
for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 2 : 4) * 2); i++) {
float* frag_c_part_float = reinterpret_cast<float*>(frag_c) + i * 4;
scalar_t* frag_c_part_half =
reinterpret_cast<scalar_t*>(frag_c_part_float);
#pragma unroll
for (int i = 3; i >= 0; i--) {
frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]);
}
}
}
if constexpr (is_a_8bit) {
float frag_a_s[2 * thread_m_blocks];
......
......@@ -565,8 +565,8 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill(
int rowEnd = rowEnds[rowIdx];
// Local pointers to this block
outIndices += rowIdx * topK;
logits += rowIdx * stride0;
outIndices += static_cast<int64_t>(rowIdx) * topK;
logits += static_cast<int64_t>(rowIdx) * stride0;
topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort>(
nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK);
......@@ -591,19 +591,21 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
// Local pointers to this block
if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
outIndices += rowIdx * topK;
outIndices += static_cast<int64_t>(rowIdx) * topK;
} else if constexpr (multipleBlocksPerRow) {
const auto blockSize = rowEnd / gridDim.y; // 16384 / 2 = 8192
rowStart = blockSize * blockIdx.y; // 8192 * 1 = 8192
rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize;
outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK;
outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK;
outIndices +=
static_cast<int64_t>(rowIdx) * gridDim.y * topK + blockIdx.y * topK;
outLogits +=
static_cast<int64_t>(rowIdx) * gridDim.y * topK + blockIdx.y * topK;
} else if constexpr (mergeBlocks) {
rowEnd = numBlocksToMerge * topK;
indices += rowIdx * numBlocksToMerge * topK;
outIndices += rowIdx * topK;
indices += static_cast<int64_t>(rowIdx) * numBlocksToMerge * topK;
outIndices += static_cast<int64_t>(rowIdx) * topK;
}
logits += rowIdx * stride0;
logits += static_cast<int64_t>(rowIdx) * stride0;
topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort,
multipleBlocksPerRow, mergeBlocks>(
......
......@@ -32,7 +32,7 @@ ARG DEADSNAKES_GPGKEY_URL
# The PyPA get-pip.py script is a self contained script+zip file, that provides
# both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environment where a dsitribution package does not exist.
# bootstrapping pip in environment where a distribution package does not exist.
#
# By parameterizing the URL for get-pip.py installation script, we allow
# third-party to use their own copy of the script stored in a private mirror.
......@@ -73,15 +73,13 @@ ARG INSTALL_KV_CONNECTORS=false
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive
ARG GET_PIP_URL
ENV DEBIAN_FRONTEND=noninteractive
# Install system dependencies and uv, then create Python virtual environment
# Install system dependencies including build tools
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
......@@ -107,32 +105,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
&& python3 --version && python3 -m pip --version
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Activate virtual environment and add uv to PATH
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
# Environment for uv
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN <<EOF
gcc --version
EOF
# Verify GCC version
RUN gcc --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
# Workaround for triton/pytorch issues
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# ============================================================
# SLOW-CHANGING DEPENDENCIES BELOW
# These are the expensive layers that we want to cache
# ============================================================
# Install PyTorch and core CUDA dependencies
# This is ~2GB and rarely changes
ARG PYTORCH_CUDA_INDEX_BASE_URL
WORKDIR /workspace
# install build and runtime dependencies
......@@ -142,13 +138,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
# CUDA arch list used by torch
# Explicitly set the list to avoid issues with torch 2.2
# See https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
#################### BUILD BASE IMAGE ####################
#################### CSRC BUILD IMAGE ####################
FROM base AS csrc-build
......@@ -241,6 +236,48 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
fi
#################### CSRC BUILD IMAGE ####################
#################### EXTENSIONS BUILD IMAGE ####################
# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
# This stage is independent and doesn't affect csrc cache
FROM base AS extensions-build
ARG CUDA_VERSION
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy
WORKDIR /workspace
# Build DeepGEMM wheel
ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
mkdir -p /tmp/deepgemm/dist && \
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh \
--cuda-version "${CUDA_VERSION}" \
${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
--wheel-dir /tmp/deepgemm/dist || \
echo "DeepGEMM build skipped (CUDA version requirement not met)"
# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
# Build pplx-kernels and DeepEP wheels
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
ARG PPLX_COMMIT_HASH
ARG DEEPEP_COMMIT_HASH
RUN --mount=type=cache,target=/root/.cache/uv \
mkdir -p /tmp/ep_kernels_workspace/dist && \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
#################### EXTENSIONS BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
......@@ -265,6 +302,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
WORKDIR /workspace
# Copy pre-built csrc wheel directly
COPY --from=csrc-build /workspace/dist /precompiled-wheels
COPY . .
......@@ -286,27 +324,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
fi && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Install EP kernels(pplx-kernels and DeepEP)
ARG PPLX_COMMIT_HASH
ARG DEEPEP_COMMIT_HASH
RUN --mount=type=cache,target=/root/.cache/uv \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
# Copy extension wheels from extensions-build stage for later use
COPY --from=extensions-build /tmp/deepgemm/dist /tmp/deepgemm/dist
COPY --from=extensions-build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels_workspace/dist
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
......@@ -344,32 +364,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG INSTALL_KV_CONNECTORS=false
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
SHELL ["/bin/bash", "-c"]
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
ENV DEBIAN_FRONTEND=noninteractive
WORKDIR /vllm-workspace
# Python version string for paths (e.g., "312" for 3.12)
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
# Install Python and system dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
......@@ -408,62 +421,103 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install CUDA development tools and build essentials for runtime JIT compilation
# Install CUDA development tools for runtime JIT compilation
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
apt-get update -y && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_VERSION_DASH} \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
# https://github.com/vllm-project/vllm/issues/29590
libcurand-dev-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} \
# Fixes nccl_allocator requiring nccl.h at runtime
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
libnccl-dev && \
cuda-nvcc-${CUDA_VERSION_DASH} \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
libcurand-dev-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} \
# Fixes nccl_allocator requiring nccl.h at runtime
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
libnccl-dev && \
rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
RUN python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
# Environment for uv
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
# Workaround for triton/pytorch issues
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# ============================================================
# SLOW-CHANGING DEPENDENCIES BELOW
# These are the expensive layers that we want to cache
# ============================================================
# Install PyTorch and core CUDA dependencies
# This is ~2GB and rarely changes
ARG PYTORCH_CUDA_INDEX_BASE_URL
COPY requirements/common.txt /tmp/common.txt
COPY requirements/cuda.txt /tmp/requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r /tmp/requirements-cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
rm /tmp/requirements-cuda.txt /tmp/common.txt
# Install FlashInfer pre-compiled kernel cache and binaries
# This is ~1.1GB and only changes when FlashInfer version bumps
# https://docs.flashinfer.ai/installation.html
ARG FLASHINFER_VERSION=0.5.3
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.5.3 \
&& uv pip install --system flashinfer-jit-cache==0.5.3 \
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
# ============================================================
# OPENAI API SERVER DEPENDENCIES
# Pre-install these to avoid reinstalling on every vLLM wheel rebuild
# ============================================================
# Install gdrcopy (saves ~6s per build)
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
ARG TARGETPLATFORM
COPY tools/install_gdrcopy.sh /tmp/install_gdrcopy.sh
RUN set -eux; \
case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \
/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}" && \
rm /tmp/install_gdrcopy.sh
# Install vllm-openai dependencies (saves ~2.6s per build)
# These are stable packages that don't depend on vLLM itself
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
BITSANDBYTES_VERSION="0.42.0"; \
else \
BITSANDBYTES_VERSION="0.46.1"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope \
"bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
# ============================================================
# VLLM INSTALLATION (depends on build stage)
# ============================================================
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
......@@ -478,7 +532,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
echo "No DeepGEMM wheels to install; skipping."; \
fi'
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
......@@ -487,23 +541,17 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm
uv pip install --system ep_kernels/dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
set -eux; \
case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \
/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
# Copy examples and benchmarks at the end to minimize cache invalidation
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
......@@ -569,18 +617,12 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
# install additional dependencies for openai api server
# install kv_connectors if requested
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
uv pip install --system -r /tmp/kv_connectors.txt; \
fi; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
BITSANDBYTES_VERSION="0.42.0"; \
else \
BITSANDBYTES_VERSION="0.46.1"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
uv pip install --system -r /tmp/kv_connectors.txt || true; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image
......
......@@ -17,7 +17,7 @@
# VLLM_CPU_DISABLE_AVX512=false (default)|true
# VLLM_CPU_AVX512BF16=false (default)|true
# VLLM_CPU_AVX512VNNI=false (default)|true
# VLLM_CPU_AMXBF16=false (default)|true
# VLLM_CPU_AMXBF16=false |true (default)
#
######################### COMMON BASE IMAGE #########################
......@@ -95,7 +95,7 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
ARG VLLM_CPU_AVX512VNNI=0
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
ARG VLLM_CPU_AMXBF16=0
ARG VLLM_CPU_AMXBF16=1
ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
WORKDIR /workspace/vllm
......@@ -147,7 +147,9 @@ WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl xz-utils
apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14
RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
......
......@@ -16,7 +16,7 @@ Async backends support the use of DBO (Dual Batch Overlap) and shared expert ove
Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag.
Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.
Unless otherwise specified, backends are controlled via the `--all2all-backend` command-line argument (or the `all2all_backend` parameter in `ParallelConfig`). All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP.
<style>
td {
......
......@@ -109,7 +109,7 @@ Every plugin has three parts:
- `init_device`: This function is called to set up the device for the worker.
- `initialize_cache`: This function is called to set cache config for the worker.
- `load_model`: This function is called to load the model weights to device.
- `get_kv_cache_spaces`: This function is called to generate the kv cache spaces for the model.
- `get_kv_cache_spec`: This function is called to generate the kv cache spec for the model.
- `determine_available_memory`: This function is called to profiles the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs.
- `initialize_from_config`: This function is called to allocate device KV cache with the specified kv_cache_config
- `execute_model`: This function is called every step to inference the model.
......
......@@ -27,3 +27,4 @@ The backends below live **outside** the main `vllm` repository and follow the
| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
| Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> |
| Sophgo TPU | N/A, install from source | <https://github.com/sophgo/vllm-tpu> |
......@@ -19,7 +19,7 @@ Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels c
```bash
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index
```
??? console "pip"
......@@ -27,6 +27,20 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
```
!!! warning "set `LD_PRELOAD`"
Before use vLLM CPU installed via wheels, make sure TCMalloc is installed and added to `LD_PRELOAD`:
```bash
# install TCMalloc
sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
# manually find the path
sudo find / -iname *libtcmalloc_minimal.so.4
TC_PATH=...
# add them to LD_PRELOAD
export LD_PRELOAD="$TC_PATH:$LD_PRELOAD"
```
The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
**Install the latest code**
......@@ -37,7 +51,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
To install from nightly index, run:
```bash
uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu
uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index
```
??? console "pip (there's a caveat)"
......@@ -56,7 +70,7 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
```bash
export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu --index-strategy first-index
```
# --8<-- [end:pre-built-wheels]
......@@ -105,6 +119,20 @@ VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
Testing has been conducted on AWS Graviton3 instances for compatibility.
!!! warning "set `LD_PRELOAD`"
Before use vLLM CPU installed via wheels, make sure TCMalloc is installed and added to `LD_PRELOAD`:
```bash
# install TCMalloc
sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
# manually find the path
sudo find / -iname *libtcmalloc_minimal.so.4
TC_PATH=...
# add them to LD_PRELOAD
export LD_PRELOAD="$TC_PATH:$LD_PRELOAD"
```
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
......
......@@ -18,6 +18,12 @@ vLLM is a Python library that supports the following CPU variants. Select your C
--8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation"
## Technical Discussions
The main discussions happen in the `#sig-cpu` channel of [vLLM Slack](https://slack.vllm.ai/).
When open a Github issue about the CPU backend, please add `[CPU Backend]` in the title and it will be labeled with `cpu` for better awareness.
## Requirements
- Python: 3.10 -- 3.13
......@@ -258,11 +264,6 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel
- GPTQ (x86 only)
- compressed-tensor INT8 W8A8 (x86, s390x)
### (x86 only) What is the purpose of `VLLM_CPU_SGL_KERNEL`?
- Both of them require `amx` CPU flag.
- `VLLM_CPU_SGL_KERNEL` can provide better performance for MoE models and small-batch scenarios.
### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?
In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal.
......
......@@ -17,7 +17,51 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
Currently, there are no pre-built x86 CPU wheels.
Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install release wheels:
```bash
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
# use uv
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index --torch-backend cpu
```
??? console "pip"
```bash
# use pip
pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --extra-index-url https://download.pytorch.org/whl/cpu
```
!!! warning "set `LD_PRELOAD`"
Before use vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`:
```bash
# install TCMalloc, Intel OpenMP is installed with vLLM CPU
sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
# manually find the path
sudo find / -iname *libtcmalloc_minimal.so.4
sudo find / -iname *libiomp5.so
TC_PATH=...
IOMP_PATH=...
# add them to LD_PRELOAD
export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD"
```
**Install the latest code**
To install the wheel built from the latest main branch:
```bash
uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cpu --index-strategy first-index --torch-backend cpu
```
**Install specific revisions**
If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
```bash
export VLLM_COMMIT=730bd35378bf2a5b56b6d3a45be28b3092d26519 # use full commit hash from the main branch
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}/cpu --index-strategy first-index --torch-backend cpu
```
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
......@@ -26,10 +70,12 @@ Install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the def
```bash
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
sudo apt-get install -y gcc-12 g++-12 libnuma-dev
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
Clone the vLLM project:
```bash
......@@ -82,6 +128,22 @@ uv pip install dist/*.whl
pip install dist/*.whl
```
!!! warning "set `LD_PRELOAD`"
Before use vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`:
```bash
# install TCMalloc, Intel OpenMP is installed with vLLM CPU
sudo apt-get install -y --no-install-recommends libtcmalloc-minimal4
# manually find the path
sudo find / -iname *libtcmalloc_minimal.so.4
sudo find / -iname *libiomp5.so
TC_PATH=...
IOMP_PATH=...
# add them to LD_PRELOAD
export LD_PRELOAD="$TC_PATH:$IOMP_PATH:$LD_PRELOAD"
```
!!! example "Troubleshooting"
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
......@@ -95,7 +157,6 @@ uv pip install dist/*.whl
"torch==X.Y.Z+cpu" # <-------
]
```
- If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
......@@ -112,6 +173,7 @@ uv pip install dist/*.whl
docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_AVX512BF16=false (default)|true \
--build-arg VLLM_CPU_AVX512VNNI=false (default)|true \
--build-arg VLLM_CPU_AMXBF16=false|true (default) \
--build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \
--tag vllm-cpu-env \
--target vllm-openai .
......@@ -123,9 +185,8 @@ docker run --rm \
--shm-size=4g \
-p 8000:8000 \
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
vllm-cpu-env \
--model=meta-llama/Llama-3.2-1B-Instruct \
meta-llama/Llama-3.2-1B-Instruct \
--dtype=bfloat16 \
other vLLM OpenAI server arguments
```
......
On NVIDIA CUDA only, it's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
```bash
uv venv --python 3.12 --seed
......
......@@ -181,3 +181,4 @@ If you have PRs touching the area, please feel free to ping the area owner for r
- Ascend NPU: [@wangxiyuan](https://github.com/wangxiyuan) and [see more details](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html#maintainers)
- Intel Gaudi HPU [@xuechendi](https://github.com/xuechendi) and [@kzawora-intel](https://github.com/kzawora-intel)
- Semantic Router: [@xunzhuo](https://github.com/xunzhuo), [@rootfs](https://github.com/rootfs) and [see more details](https://vllm-semantic-router.com/community/team)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment