"vscode:/vscode.git/clone" did not exist on "70fbdb26e99d7d0b0299acc108f83bd63626e589"
Unverified commit f50dcb7c, authored by Lu Fang and committed by GitHub
Browse files

[Easy] Eliminate c10::optional usage in vllm/csrc (#17819)

parent a1e19b63
...@@ -9,7 +9,7 @@ at::Tensor as_g_workspace; ...@@ -9,7 +9,7 @@ at::Tensor as_g_workspace;
torch::Tensor allspark_w8a16_gemm( torch::Tensor allspark_w8a16_gemm(
torch::Tensor const& a, torch::Tensor const& b_qweight, torch::Tensor const& a, torch::Tensor const& b_qweight,
torch::Tensor const& b_scales, c10::optional<torch::Tensor> const& b_qzeros, torch::Tensor const& b_scales, std::optional<torch::Tensor> const& b_qzeros,
int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version, int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version,
int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) { int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) {
TORCH_CHECK_NOT_IMPLEMENTED( TORCH_CHECK_NOT_IMPLEMENTED(
...@@ -918,7 +918,7 @@ void allspark_qgemm_w8a16_perc_ampere( ...@@ -918,7 +918,7 @@ void allspark_qgemm_w8a16_perc_ampere(
torch::Tensor allspark_w8a16_gemm( torch::Tensor allspark_w8a16_gemm(
torch::Tensor const& a, torch::Tensor const& b_qweight, torch::Tensor const& a, torch::Tensor const& b_qweight,
torch::Tensor const& b_scales, c10::optional<torch::Tensor> const& b_qzeros, torch::Tensor const& b_scales, std::optional<torch::Tensor> const& b_qzeros,
int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version, int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version,
int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) { int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) {
// Verify device and strides // Verify device and strides
......
...@@ -100,9 +100,9 @@ void rearrange_kn_weight_as_n32k16_order_ldg16( ...@@ -100,9 +100,9 @@ void rearrange_kn_weight_as_n32k16_order_ldg16(
void rearrange_kn_weight_as_n32k16_order( void rearrange_kn_weight_as_n32k16_order(
torch::Tensor const& b_qweight, torch::Tensor const& b_scales, torch::Tensor const& b_qweight, torch::Tensor const& b_scales,
c10::optional<torch::Tensor> const& b_zeros, bool has_zp, std::optional<torch::Tensor> const& b_zeros, bool has_zp,
torch::Tensor& b_qweight_reorder, torch::Tensor& b_scales_reorder, torch::Tensor& b_qweight_reorder, torch::Tensor& b_scales_reorder,
c10::optional<torch::Tensor> const& b_zeros_reorder, const int64_t K, std::optional<torch::Tensor> const& b_zeros_reorder, const int64_t K,
const int64_t N, const int64_t N_32align) { const int64_t N, const int64_t N_32align) {
// Verify device and strides // Verify device and strides
TORCH_CHECK(b_qweight.device().is_cuda(), "b_qweight is not on GPU"); TORCH_CHECK(b_qweight.device().is_cuda(), "b_qweight is not on GPU");
......
...@@ -1597,7 +1597,7 @@ void paged_attention_custom_launcher( ...@@ -1597,7 +1597,7 @@ void paged_attention_custom_launcher(
torch::Tensor& block_tables, torch::Tensor& context_lens, torch::Tensor& block_tables, torch::Tensor& context_lens,
const std::optional<torch::Tensor>& query_start_loc, int max_context_len, const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale, const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
torch::Tensor& v_scale, const c10::optional<torch::Tensor>& fp8_out_scale) { torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale) {
int num_seqs = block_tables.size(0); int num_seqs = block_tables.size(0);
int num_heads = query.size(1); int num_heads = query.size(1);
int head_size = query.size(2); int head_size = query.size(2);
...@@ -1825,7 +1825,7 @@ void paged_attention( ...@@ -1825,7 +1825,7 @@ void paged_attention(
const std::optional<torch::Tensor>& alibi_slopes, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale, const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale, torch::Tensor& v_scale,
const c10::optional<torch::Tensor>& fp8_out_scale) { const std::optional<torch::Tensor>& fp8_out_scale) {
// clang-format on // clang-format on
const int head_size = query.size(2); const int head_size = query.size(2);
if (kv_cache_dtype == "auto") { if (kv_cache_dtype == "auto") {
......
...@@ -19,4 +19,4 @@ void paged_attention( ...@@ -19,4 +19,4 @@ void paged_attention(
const std::optional<torch::Tensor>& query_start_loc, int64_t block_size, const std::optional<torch::Tensor>& query_start_loc, int64_t block_size,
int64_t max_context_len, const std::optional<torch::Tensor>& alibi_slopes, int64_t max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale, const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale, const c10::optional<torch::Tensor>& fp8_out_scale); torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment