优化nmz tp1性能

4d897ed1 · zhanghj2 · 3722ec71 · 4d897ed1 · 4d897ed1
Commit 4d897ed1 authored Feb 28, 2026 by zhanghj2
Expand all Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 87 additions and 345 deletions

csrc/extension/flash_api.h +2 -2

csrc/extension/flash_fwd_mla_kernel_fp8.h +85 -343

No files found.
--- a/csrc/extension/flash_api.h
+++ b/csrc/extension/flash_api.h
@@ -683,9 +683,9 @@ mha_fwd_kvcache_mla_fp8(
    // auto dprops = at::cuda::getCurrentDeviceProperties();
    // bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
    // TORCH_CHECK(is_sm90);
-    static std::string FLASH_MLA_ROOT_DIR = execCommand("python -c 'import site; print(site.getsitepackages()[0])'");
+    // static std::string FLASH_MLA_ROOT_DIR = execCommand("python -c 'import site; print(site.getsitepackages()[0])'");
-    setenv("FLASH_MLA_ROOT_DIR", (FLASH_MLA_ROOT_DIR + "/flash_mla/asm/").c_str(), 1);
+    // setenv("FLASH_MLA_ROOT_DIR", (FLASH_MLA_ROOT_DIR + "/flash_mla/asm/").c_str(), 1);
    // std::cout << FLASH_MLA_ROOT_DIR << "\n";
    // exit(-1);
    at::Tensor vcache = vcache_.has_value() ? vcache_.value() : kcache;

--- a/csrc/extension/flash_fwd_mla_kernel_fp8.h
+++ b/csrc/extension/flash_fwd_mla_kernel_fp8.h