<h2 id="Updates">🔥 Updates</h2>
<h2 id="Updates">🔥 Updates</h2>
***Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
***Mar 15, 2025**: Support ROCm on AMD GPU ([Tutorial](./doc/en/ROCm.md)).
***Mar 5, 2025**: Support unsloth 1.58/2.51 bits weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022--v023-longer-context--fp8-kernel) for DeepSeek-V3 and R1 in 24GB VRAM.
***Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
***Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
***Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
***Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
***Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
***Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
<h2 id="ack">Acknowledgment and Contributors</h2>
<h2 id="ack">Acknowledgment and Contributors</h2>
The development of KTransformer is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang and flashinfer. We are planning to contribute back to the community by upstreaming our modifications.
The development of KTransformers is based on the flexible and versatile framework provided by Transformers. We also benefit from advanced kernels such as GGUF/GGML, Llamafile, Marlin, sglang and flashinfer. We are planning to contribute back to the community by upstreaming our modifications.
KTransformer is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformer faster and easier to use.
KTransformers is actively maintained and developed by contributors from the <a href="https://madsys.cs.tsinghua.edu.cn/">MADSys group</a> at Tsinghua University and members from <a href="http://approaching.ai/">Approaching.AI</a>. We welcome new contributors to join us in making KTransformers faster and easier to use.
## Docker

There is a Docker image available for our project; you can pull it as follows:

```
docker pull approachingai/ktransformers:0.2.1
```
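If you just want to explore the image before wiring up a model, a minimal way to start an interactive container is sketched below (the volume path is a placeholder, and `--gpus all` assumes the NVIDIA Container Toolkit is installed):

```bash
# Start an interactive shell in the image, mounting a host directory for model files.
docker run --gpus all -it \
  -v /path/to/models:/models \
  approachingai/ktransformers:0.2.1 \
  /bin/bash
```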
**Notice**: In this image, ktransformers is compiled for CPUs with AVX512 instructions. If your CPU does not support AVX512, it is suggested to recompile and install ktransformers in the `/workspace/ktransformers` directory within the container.
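A rough sketch of what that recompile could look like; the container name is a placeholder and the `install.sh` entry point is an assumption, so follow the build steps documented for your ktransformers version:

```bash
# Open a shell in the running container (replace <container_name> with yours).
docker exec -it <container_name> /bin/bash

# Inside the container: rebuild ktransformers against your CPU's instruction set.
cd /workspace/ktransformers
bash install.sh   # assumed entry point; use the build command your release documents
```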
## Building docker image locally
- Download the Dockerfile from [here](../../Dockerfile), then build it as sketched below.
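A minimal sketch of the build itself, assuming the Dockerfile sits at the repository root and using an illustrative tag:

```bash
# Run from the directory containing the Dockerfile; the tag name is an example.
docker build -t approachingai/ktransformers:local .
```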
1. First, download the latest source code using git.
2. Then, modify the DeepSeek-V3-Chat-multi-gpu-4.yaml in the source code and all related yaml files, replacing all instances of KLinearMarlin with KLinearTorch (a bulk-replace sketch follows this list).
3. Next, compile ktransformers from source until it builds successfully on your local machine.
4. Then, install flash-attn. It won't be used, but not installing it will cause an error.
5. Then, modify local_chat.py, replacing all instances of flash_attention_2 with eager.
6. Then, run local_chat.py. Be sure to follow the official tutorial's commands and adjust according to your local machine's parameters.
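For steps 2 and 5, the replacements can be done in bulk with `sed`; a rough sketch, assuming you run it from the repository root (back up the files first if you want an easy way to revert):

```bash
# Swap the Marlin linear operator for the pure-torch one in the optimize rule file(s).
sed -i 's/KLinearMarlin/KLinearTorch/g' \
  ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml

# Switch the attention implementation from flash_attention_2 to eager.
sed -i 's/flash_attention_2/eager/g' ktransformers/local_chat.py
```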
## ROCm Support (Beta)

In our effort to expand GPU architecture support beyond NVIDIA, we are excited to introduce **AMD GPU support through ROCm** in ktransformers (Beta release). This implementation has been tested and developed using EPYC 9274F processors and AMD Radeon 7900 XTX GPUs.
## Installation Guide
### 1. Install ROCm Driver
Begin by installing the ROCm drivers for your AMD GPU:
- [Official ROCm Installation Guide for Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/install-radeon.html)
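After the driver install (and a reboot, if the installer asks for one), it is worth sanity-checking that the GPU is visible; a quick sketch (both tools ship with ROCm, and the exact output varies by version):

```bash
rocm-smi                                # should list the Radeon GPU(s) and their VRAM
/opt/rocm/bin/rocminfo | grep -i gfx    # e.g. gfx1100 corresponds to the Radeon 7900 XTX
```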
### 2. Set Up Conda Environment
We recommend using Miniconda3/Anaconda3 for environment management:
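A minimal sketch of creating and activating an environment (the environment name and Python version are illustrative, not requirements):

```bash
conda create -n ktransformers python=3.11
conda activate ktransformers
```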
> **Beta Note:** Current Q8 linear implementation (Marlin alternative) shows suboptimal performance. Expect optimizations in future releases.
### Configuration for 40GB+ VRAM GPUs
For better performance on high-VRAM GPUs:
1. Modify `DeepSeek-V3-Chat.yaml`:
```yaml
# Replace all instances of:
#   KLinearMarlin → KLinearTorch
```
2. Execute with:
```bash
python ktransformers/local_chat.py \
--model_path deepseek-ai/DeepSeek-R1 \
--gguf_path <path_to_gguf_files> \
--optimize_config_path <modified_yaml_path> \
--cpu_infer <cpu_cores + 1>
```
> **Tip:** If you have 2 × 24GB AMD GPUs, you can apply the same modification and use `ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml` as the optimize config instead.
## Known Limitations
- Marlin operations not supported on ROCm platform
- Current Q8 linear implementation shows reduced performance (Beta limitation)
## Installation Notes

- Run `conda install -c conda-forge libstdcxx-ng`: Anaconda provides a package called `libstdcxx-ng` that includes a newer version of `libstdc++`, which can be installed via `conda-forge`.
- Make sure that PyTorch, packaging, and ninja are installed. You can also [install previous versions of PyTorch](https://pytorch.org/get-started/previous-versions/) if needed. A short verification sketch follows below.
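A short sketch covering both points above (`$CONDA_PREFIX` assumes an activated conda environment; pick the PyTorch build matching your CUDA/ROCm setup):

```bash
# Confirm the environment now exposes a recent libstdc++ (look for new GLIBCXX versions).
strings "$CONDA_PREFIX/lib/libstdc++.so.6" | grep GLIBCXX | tail -n 5

# Install the build prerequisites named above.
pip install torch packaging ninja
```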