OpenDAS / ktransformers / Commits / 11544ef2

Unverified commit 11544ef2, authored Aug 08, 2024 by UnicornChan, committed by GitHub on Aug 08, 2024.

Merge pull request #25 from kvcache-ai/windows

Windows Support

Parents: 442e13bc 0e613b60

Changes (33)

Showing 20 changed files with 258 additions and 96 deletions (+258 -96):
ktransformers/ktransformers_ext/CMakeLists.txt                    +19  -4
ktransformers/ktransformers_ext/cpu_backend/task_queue.h          +41  -3
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu   +54  -19
ktransformers/ktransformers_ext/cuda/setup.py                     +1   -1
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp       +5   -5
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp       +18  -18
ktransformers/ktransformers_ext/operators/llamafile/moe.h         +1   -1
ktransformers/local_chat.py                                       +1   -1
ktransformers/models/custom_cache.py                              +1   -0
ktransformers/operators/experts.py                                +10  -3
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml       +1   -1
ktransformers/util/custom_gguf.py                                 +62  -17
ktransformers/util/utils.py                                       +8   -7
pyproject.toml                                                    +2   -1
setup.py                                                          +15  -2
third_party/llamafile/iqk_mul_mat.inc                             +4   -3
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp                    +1   -1
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp                    +1   -1
third_party/llamafile/sgemm.cpp                                   +12  -7
third_party/llamafile/tinyblas_cpu.h                              +1   -1
ktransformers/ktransformers_ext/CMakeLists.txt  (View file @ 11544ef2)

-cmake_minimum_required(VERSION 3.16)
+cmake_minimum_required(VERSION 3.17)
 project(cpuinfer_ext VERSION 0.1.0)
 set(CMAKE_CXX_STANDARD 17)
...
@@ -190,7 +190,13 @@ else()
     message(STATUS "Unknown architecture")
 endif()

-find_package(CUDA REQUIRED)
+# message(STATUS "CUDAToolkit_ROOT:${CUDAToolkit_ROOT}")
+# find_package(FindCUDAToolkit REQUIRED)
+# if(CUDAToolkit_FOUND)
+#     message(STATUS "Found CUDA cudart lib at:${CUDAToolkit_LIBRARY_DIR}")
+# else()
+#     message(STATUS "Can't found CUDA lib")
+# endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
...
@@ -199,7 +205,12 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
-include_directories("${CUDA_INCLUDE_DIRS}")
+if (WIN32)
+    include_directories("$ENV{CUDA_PATH}/include")
+elseif (UNIX)
+    find_package(CUDA REQUIRED)
+    include_directories("${CUDA_INCLUDE_DIRS}")
+endif()
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
...
@@ -210,4 +221,8 @@ message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
 pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
 target_link_libraries(${PROJECT_NAME} PRIVATE llama)
-target_link_libraries(${PROJECT_NAME} PRIVATE "/usr/local/cuda/lib64/libcudart.so")
\ No newline at end of file
+if (WIN32)
+    target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib") #CUDA::cudart
+elseif (UNIX)
+    target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_HOME}/lib64/libcudart.so")
+endif()
\ No newline at end of file
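The hunks above drop the hard-coded /usr/local/cuda/lib64/libcudart.so and instead resolve CUDA headers and the runtime library from $ENV{CUDA_PATH} on Windows or $ENV{CUDA_HOME} on Linux. A quick way to confirm that the resulting include and library paths actually reach cudart on either platform is a one-file check like the sketch below; the file name, the build commands in the comment, and the check itself are illustrative and not part of this commit.

// check_cudart.cpp -- illustrative only, not part of this commit.
// Build against the same paths the CMakeLists.txt above uses, e.g.
//   cl /I"%CUDA_PATH%\include" check_cudart.cpp "%CUDA_PATH%\lib\x64\cudart.lib"   (Windows)
//   g++ -I"$CUDA_HOME/include" check_cudart.cpp "$CUDA_HOME/lib64/libcudart.so"    (Linux)
#include <cstdio>
#include <cuda_runtime_api.h>

int main() {
    int runtime_version = 0;
    // Resolves a symbol from cudart, so it fails to link if the library path is wrong.
    cudaError_t err = cudaRuntimeGetVersion(&runtime_version);
    if (err != cudaSuccess) {
        std::printf("cudart call failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("linked against CUDA runtime %d\n", runtime_version);
    return 0;
}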
ktransformers/ktransformers_ext/cpu_backend/task_queue.h  (View file @ 11544ef2)

@@ -3,8 +3,8 @@
  * @Author : chenht2022
  * @Date : 2024-07-16 10:43:18
  * @Version : 1.0.0
- * @LastEditors : chenht2022
- * @LastEditTime : 2024-07-25 10:33:47
+ * @LastEditors : chenxl
+ * @LastEditTime : 2024-08-08 04:23:51
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
 #ifndef CPUINFER_TASKQUEUE_H
...
@@ -17,6 +17,44 @@
 #include <queue>
 #include <thread>
 #include <vector>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+class custom_mutex {
+   private:
+#ifdef _WIN32
+    HANDLE global_mutex;
+#else
+    std::mutex global_mutex;
+#endif
+
+   public:
+    custom_mutex() {
+#ifdef _WIN32
+        HANDLE global_mutex;
+#endif
+    }
+
+    void lock() {
+#ifdef _WIN32
+        WaitForSingleObject(global_mutex, INFINITE);
+#else
+        global_mutex.lock();
+#endif
+    }
+
+    void unlock() {
+#ifdef _WIN32
+        ReleaseMutex(global_mutex);
+#else
+        global_mutex.lock();
+#endif
+    }
+};
+
 class TaskQueue {
    public:
...
@@ -32,7 +70,7 @@ class TaskQueue {
     std::queue<std::function<void()>> tasks;
     std::thread worker;
-    std::mutex mutex;
+    custom_mutex mutex;
     std::atomic<bool> sync_flag;
     std::atomic<bool> exit_flag;
 };
...
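The new custom_mutex wraps a Win32 mutex handle under _WIN32 and std::mutex elsewhere (std::mutex also exists on MSVC, so the split presumably works around a project-specific build issue rather than a missing feature). Note that, as committed, the constructor only declares a local HANDLE instead of creating one, and the non-Windows branch of unlock() calls global_mutex.lock(). For reference, a conventional version of the same wrapper would look roughly like the sketch below; this is illustrative only and is not what the commit contains.

// Sketch of a portable mutex wrapper in the spirit of custom_mutex above (illustrative only).
#ifdef _WIN32
#include <windows.h>
#else
#include <mutex>
#endif

class portable_mutex {
   private:
#ifdef _WIN32
    HANDLE handle_;
#else
    std::mutex mutex_;
#endif

   public:
    portable_mutex() {
#ifdef _WIN32
        handle_ = CreateMutex(NULL, FALSE, NULL);  // unowned, unnamed mutex
#endif
    }
    ~portable_mutex() {
#ifdef _WIN32
        CloseHandle(handle_);
#endif
    }
    void lock() {
#ifdef _WIN32
        WaitForSingleObject(handle_, INFINITE);
#else
        mutex_.lock();
#endif
    }
    void unlock() {
#ifdef _WIN32
        ReleaseMutex(handle_);
#else
        mutex_.unlock();
#endif
    }
};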
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu  (View file @ 11544ef2)

@@ -1703,28 +1703,63 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
     thread_m_blocks = exec_cfg.max_m_blocks;
   }

   // Define kernel configurations
-  if (false) {
-  }
-  CALL_IF(4, 32, 2, 256)
-  CALL_IF(4, 16, 4, 256)
-  CALL_IF(4, 8, 8, 256)
-  CALL_IF(4, 8, 4, 128)
-  CALL_IF(4, 4, 8, 128)
-  CALL_IF(8, 32, 2, 256)
-  CALL_IF(8, 16, 4, 256)
-  CALL_IF(8, 8, 8, 256)
-  CALL_IF(8, 8, 4, 128)
-  CALL_IF(8, 4, 8, 128)
-  else {
-    TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
-                           str(prob_n) + ", " + str(prob_k) + "]" +
-                           ", has_act_order = " + str(has_act_order) +
-                           ", num_groups = " + str(num_groups) +
-                           ", group_size = " + str(group_size) +
-                           ", thread_m_blocks = " + str(thread_m_blocks) +
-                           ", thread_n_blocks = " + str(thread_n_blocks) +
-                           ", thread_k_blocks = " + str(thread_k_blocks));
-  }
+  #define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
+            str(prob_n) + ", " + str(prob_k) + "]" + \
+            ", has_act_order = " + str(has_act_order) + \
+            ", num_groups = " + str(num_groups) + \
+            ", group_size = " + str(group_size) + \
+            ", thread_m_blocks = " + str(thread_m_blocks) + \
+            ", thread_n_blocks = " + str(thread_n_blocks) + \
+            ", thread_k_blocks = " + str(thread_k_blocks));
+
+  if (num_bits == 4 && num_threads == 256) {
+    if (false) {
+    }
+    CALL_IF(4, 32, 2, 256)
+    CALL_IF(4, 16, 4, 256)
+    CALL_IF(4, 8, 8, 256)
+    else {
+      undefined_error
+    }
+  }
+  else if (num_bits == 4 && num_threads == 128) {
+    if (false) {
+    }
+    CALL_IF(4, 8, 4, 128)
+    CALL_IF(4, 4, 8, 128)
+    else {
+      undefined_error
+    }
+  }
+  else if (num_bits == 8 && num_threads == 256) {
+    if (false) {
+    }
+    CALL_IF(8, 32, 2, 256)
+    CALL_IF(8, 16, 4, 256)
+    CALL_IF(8, 8, 8, 256)
+    else {
+      undefined_error
+    }
+  }
+  else if (num_bits == 8 && num_threads == 128) {
+    if (false) {
+    }
+    CALL_IF(8, 8, 4, 128)
+    CALL_IF(8, 4, 8, 128)
+    else {
+      undefined_error
+    }
+  }
+  else {
+    undefined_error
+  }

   A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
...
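The restructuring above splits one long CALL_IF dispatch into groups keyed on num_bits and num_threads and routes every unmatched case through the new undefined_error macro; keeping each macro-expanded else-if chain short is plausibly what the Windows (MSVC) build needed, though the commit does not state the reason. The pattern evidently relies on each CALL_IF(...) expanding to an else if (...) { ... } clause, which is why every group is seeded with an empty if (false) {}. A stripped-down illustration of that macro-chaining idiom follows; DISPATCH_IF, run_kernel, and the parameter names are hypothetical, not the actual definitions from this file.

// Hypothetical illustration of the CALL_IF-style dispatch idiom used above.
#include <cstdio>

template <int M, int N, int K, int T>
void run_kernel() { std::printf("kernel<%d,%d,%d,%d>\n", M, N, K, T); }

// Each use appends an `else if` clause to the chain started by an empty `if (false) {}`,
// so shapes that match no clause fall through to the final else.
#define DISPATCH_IF(M, N, K, T)                                        \
    else if (m_blocks == (M) && n_blocks == (N) && k_blocks == (K) &&  \
             threads == (T)) {                                         \
        run_kernel<M, N, K, T>();                                      \
    }

void dispatch(int m_blocks, int n_blocks, int k_blocks, int threads) {
    if (false) {
    }
    DISPATCH_IF(32, 2, 8, 256)
    DISPATCH_IF(16, 4, 8, 256)
    else {
        std::printf("unsupported configuration\n");  // plays the role of undefined_error
    }
}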
ktransformers/ktransformers_ext/cuda/setup.py  (View file @ 11544ef2)

@@ -10,7 +10,7 @@ setup(name='KTransformersOps',
              'custom_gguf/dequant.cu',
              'binding.cpp',
              'gptq_marlin/gptq_marlin.cu',
              # 'gptq_marlin_repack.cu',
          ])
     ],
     cmdclass={'build_ext': BuildExtension
...
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp  (View file @ 11544ef2)

@@ -68,10 +68,10 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
     int nth = config_.intermediate_size / config_.stride;
     backend->do_work_stealing_job(nth, [&](int task_id) {
         int ith = task_id;
-        void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = gate_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-        void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = up_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
...
@@ -79,7 +79,7 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
         }
         if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
             float* intermediate_fp32_ptr = intermediate_fp32_.data() + ith * config_.stride;
-            void* down_input_ptr = down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
+            void* down_input_ptr = (uint8_t*)down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
             from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
         }
     });
...
@@ -89,11 +89,11 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
     nth = config_.hidden_size / config_.stride;
     backend->do_work_stealing_job(nth, [&](int task_id) {
         int ith = task_id;
-        void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = down_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_.data(), config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
-            void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             from_float(down_output_ptr, output_ptr, config_.stride, config_.hidden_type);
         }
     });
...
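All five changes in this file are the same fix: the byte-offset arithmetic is now performed on a uint8_t* instead of a void*. Arithmetic on void* is not valid standard C++ (GCC accepts it as an extension and treats the pointee size as one byte, while MSVC rejects it with error C2036), so these casts are what let the file compile on Windows. A minimal before/after sketch, using a made-up base/byte_offset rather than the real config fields:

#include <cstdint>
#include <cstdlib>

void* slice_ptr(void* base, std::size_t byte_offset) {
    // void* p = base + byte_offset;          // GCC extension only; MSVC rejects arithmetic on void*
    void* p = (uint8_t*)base + byte_offset;   // portable: pointer arithmetic on a 1-byte element type
    return p;
}

int main() {
    void* buffer = std::malloc(1024);
    void* third_row = slice_ptr(buffer, 2 * 256);  // e.g. skip two rows of 256 bytes each
    (void)third_row;
    std::free(buffer);
    return 0;
}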
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp  (View file @ 11544ef2)

@@ -9,9 +9,9 @@
 **/
 #include "moe.h"
 #include <iostream>
-#include "unistd.h"
+#include <cstdint>

-void* MOE::buffer_ = nullptr;
+uint8_t* MOE::buffer_ = nullptr;

 MOE::MOE(MOEConfig config) {
     config_ = config;
...
@@ -32,7 +32,7 @@ MOE::MOE(MOEConfig config) {
         buffer_size += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
         buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
         buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
-        buffer_ = malloc(buffer_size);
+        buffer_ = (uint8_t*)malloc(buffer_size);
     }

     uint64_t offset = 0;
...
@@ -95,7 +95,7 @@ MOE::MOE(MOEConfig config) {
     m_local_pos_.resize(config_.group_max_len);
     for (int i = 0; i < config_.group_max_len; i++) {
-        m_local_pos_[i].reserve(config_.expert_num);
+        m_local_pos_[i].resize(config_.routed_expert_num);
     }
     m_local_num_.resize(config_.expert_num);
     m_local_gate_input_ptr_.resize(config_.expert_num);
...
@@ -156,10 +156,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
         int expert_idx = task_id / nth;
         uint64_t expert_id = expert_ids[expert_idx];
         int ith = task_id % nth;
-        void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-        void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
...
@@ -184,7 +184,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
         }
         for (int expert_idx = 0; expert_idx < k; expert_idx++) {
             uint64_t expert_id = expert_ids[expert_idx];
-            void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+            void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
             float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
             llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
             for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
...
@@ -193,7 +193,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
         }
         if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
             float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
-            void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
         }
     });
...
@@ -226,9 +226,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         const void* gate_input_ptr;
         const void* up_input_ptr;
         if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
-            gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
         } else {
-            to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
+            to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
             if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                 from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                 gate_input_ptr = up_input_ptr = m_gate_input_[i];
...
@@ -237,13 +237,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                 from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                 gate_input_ptr = m_gate_input_[i];
             } else {
-                gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             }
             if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                 from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                 up_input_ptr = m_up_input_[i];
             } else {
-                up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             }
         }
     }
...
@@ -258,11 +258,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         int expert_idx = task_id / nth;
         int ith = task_id % nth;
         void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
-        void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         void* up_input_ptr = m_local_up_input_ptr_[expert_idx];
-        void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = 0; i < m_local_num_[expert_idx]; i++) {
...
@@ -280,7 +280,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         int expert_idx = task_id / nth;
         int ith = task_id % nth;
         void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
-        void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
     });
...
@@ -293,18 +293,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                 m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
             }
         }
-        from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
+        from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
     });
 }

 void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
     if (qlen < config_.group_min_len) {
         for (int i = 0; i < qlen; i++) {
-            forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+            forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
         }
         return;
     }
     int forward_len = std::min(config_.group_max_len, qlen);
     forward_many(forward_len, k, expert_ids, weights, input, output, backend);
-    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
 }
\ No newline at end of file
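Besides the same (uint8_t*) casts as in mlp.cpp (and the switch from "unistd.h", which MSVC does not provide, to <cstdint> for uint8_t), one change here is easy to miss: m_local_pos_[i].reserve(config_.expert_num) becomes m_local_pos_[i].resize(config_.routed_expert_num). reserve only pre-allocates capacity and leaves size() at zero, so indexing the vector afterwards with operator[] is undefined behavior; resize actually creates the elements. A small self-contained illustration with a generic std::vector<int>, not the project's own container:

#include <cassert>
#include <vector>

int main() {
    std::vector<int> reserved;
    reserved.reserve(8);          // capacity >= 8, but size() == 0
    // reserved[3] = 42;          // undefined behavior: element 3 does not exist

    std::vector<int> resized;
    resized.resize(8);            // size() == 8, elements value-initialized to 0
    resized[3] = 42;              // well-defined
    assert(resized.size() == 8 && reserved.empty());
    return 0;
}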
ktransformers/ktransformers_ext/operators/llamafile/moe.h  (View file @ 11544ef2)

@@ -54,7 +54,7 @@ class MOE {
     void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);

    private:
-    static void* buffer_;
+    static uint8_t* buffer_;
     MOEConfig config_;
     void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
     void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
...
ktransformers/local_chat.py  (View file @ 11544ef2)

@@ -112,4 +112,4 @@ def local_chat(
     generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens)


 if __name__ == "__main__":
     fire.Fire(local_chat)
\ No newline at end of file
ktransformers/models/custom_cache.py  (View file @ 11544ef2)

@@ -46,6 +46,7 @@ class StaticCache(transformers.StaticCache):
         self.value_cache: List[torch.Tensor] = []
         cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
         if config.architectures[0] == "DeepseekV2ForCausalLM":
+            # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
             # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
             # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
             key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
...
ktransformers/operators/experts.py  (View file @ 11544ef2)

@@ -19,7 +19,8 @@ import torch
 import sys, os
 from ktransformers.operators.base_operator import BaseInjectedModule
-sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build")
+#sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build/")
+sys.path.append(os.path.dirname(__file__) + "\\..\\ktransformers_ext\\build\\Release")
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
 import ctypes
...
@@ -179,6 +180,7 @@ class MLPCPUExperts(MLPExpertsBase):
     def forward(self, input_tensor, expert_ids, weights):
         # generate, capture and run cuda graph
         if input_tensor.size(0)==1:
+            # TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible
             #print("capturing experts")
             MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
             MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
...
@@ -359,7 +361,12 @@ class MLPExpertsTorch(MLPExpertsBase):
         self.down = None

     def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
+        # TODO: forward should transfer data to gpu, and make the data transfering capturable using pin memory,
+        # just like CPUInfer MLPCPUExperts. There may be a base class of experts on cpu
+        hidden_states_cpu = hidden_states_cpu.to("cpu")
+        selected_experts_cpu = selected_experts_cpu.to("cpu")
+        routing_weights_cpu = routing_weights_cpu.to("cpu")

         batch_sequence_length, hidden_dim = hidden_states_cpu.size()
         final_hidden_states = torch.zeros(
...
@@ -587,7 +594,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
         topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

-        if sequence_length == 1:
+        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
             self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
             if self.config.n_shared_experts is not None:
                 y_ = self.shared_experts(identity).squeeze(0)
...
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml  (View file @ 11544ef2)

@@ -26,7 +26,7 @@
       prefill_device: "cuda"
       prefill_mlp_type: "MLPExpertsTorch"
       generate_device: "cpu"
       generate_mlp_type: "MLPCPUExperts"
       out_device: "cuda"
     recursive: False # don't recursively inject submodules of this module
 - match:
...
ktransformers/util/custom_gguf.py
View file @
11544ef2
...
@@ -7,6 +7,9 @@ Date : 2024-07-26 08:48:54
...
@@ -7,6 +7,9 @@ Date : 2024-07-26 08:48:54
Version : 1.0.0
Version : 1.0.0
LastEditors : Azure
LastEditors : Azure
LastEditTime : 2024-07-26 09:28:25
LastEditTime : 2024-07-26 09:28:25
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
'''
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
...
@@ -96,6 +99,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization
...
@@ -96,6 +99,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization
GGML_TYPES
=
{
GGML_TYPES
=
{
"F32"
:
0
,
"F32"
:
0
,
"F16"
:
1
,
"F16"
:
1
,
"Q4_0"
:
2
,
"Q5_0"
:
6
,
"Q8_0"
:
8
,
"Q8_0"
:
8
,
"Q2_K"
:
10
,
"Q2_K"
:
10
,
"Q3_K"
:
11
,
"Q3_K"
:
11
,
...
@@ -109,6 +114,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
...
@@ -109,6 +114,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
GGML_BLOCK_SIZES
=
{
GGML_BLOCK_SIZES
=
{
"F32"
:
4
,
"F32"
:
4
,
"F16"
:
2
,
"F16"
:
2
,
"Q4_0"
:
2
+
16
,
"Q5_0"
:
2
+
4
+
16
,
"Q8_0"
:
2
+
32
,
"Q8_0"
:
2
+
32
,
"Q2_K"
:
256
//
16
+
256
//
4
+
2
+
2
,
"Q2_K"
:
256
//
16
+
256
//
4
+
2
+
2
,
"Q3_K"
:
256
//
8
+
256
//
4
+
12
+
2
,
"Q3_K"
:
256
//
8
+
256
//
4
+
12
+
2
,
...
@@ -120,6 +127,8 @@ GGML_BLOCK_SIZES = {
...
@@ -120,6 +127,8 @@ GGML_BLOCK_SIZES = {
GGML_ELEMENTS_PER_BLOCK
=
{
GGML_ELEMENTS_PER_BLOCK
=
{
"F32"
:
1
,
"F32"
:
1
,
"F16"
:
1
,
"F16"
:
1
,
"Q4_0"
:
32
,
"Q5_0"
:
32
,
"Q8_0"
:
32
,
"Q8_0"
:
32
,
"Q2_K"
:
256
,
"Q2_K"
:
256
,
"Q3_K"
:
256
,
"Q3_K"
:
256
,
...
@@ -128,14 +137,6 @@ GGML_ELEMENTS_PER_BLOCK = {
...
@@ -128,14 +137,6 @@ GGML_ELEMENTS_PER_BLOCK = {
"Q6_K"
:
256
,
"Q6_K"
:
256
,
}
}
# DATA_TYPES = {
# "uint32": 4,
# "int32": 5,
# "float32": 6,
# "string": 8,
# "array": 9,
# "uint64": 10,
# }
DATA_TYPES
=
{
DATA_TYPES
=
{
"uint8"
:
0
,
"uint8"
:
0
,
"int8"
:
1
,
"int8"
:
1
,
...
@@ -272,7 +273,7 @@ class GGUFLoader:
...
@@ -272,7 +273,7 @@ class GGUFLoader:
def
load_gguf_tensor
(
self
,
name
:
str
,
device
:
str
=
"cpu"
)
->
torch
.
Tensor
:
def
load_gguf_tensor
(
self
,
name
:
str
,
device
:
str
=
"cpu"
)
->
torch
.
Tensor
:
t
=
self
.
tensor_info
[
name
]
t
=
self
.
tensor_info
[
name
]
shape
=
t
[
"shape"
]
shape
=
t
[
"shape"
]
ggml_type
=
t
[
"ggml_type"
]
ggml_type
=
t
[
"ggml_type"
]
...
@@ -282,10 +283,12 @@ class GGUFLoader:
...
@@ -282,10 +283,12 @@ class GGUFLoader:
ggml_name
=
GGML_NAMES
[
ggml_type
]
ggml_name
=
GGML_NAMES
[
ggml_type
]
data
=
self
.
get_mmap_tensor
(
name
)
data
=
self
.
get_mmap_tensor
(
name
)
if
"cuda"
in
device
.
lower
():
if
"cuda"
in
device
.
lower
():
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
#values = GGML_DEQUANTIZE[ggml_name](data)
#print("load_gguf_tensor")
#values = torch.from_numpy(values).to(device = device)
else
:
else
:
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
torch
.
from_numpy
(
values
)
values
=
torch
.
from_numpy
(
values
)
...
@@ -375,7 +378,7 @@ def dequantize_q2_k(data):
...
@@ -375,7 +378,7 @@ def dequantize_q2_k(data):
return
d
*
(
scales
&
15
)
*
(
tmp
&
3
)
-
dmin
*
(
scales
>>
4
)
return
d
*
(
scales
&
15
)
*
(
tmp
&
3
)
-
dmin
*
(
scales
>>
4
)
def
dequantize_q2_k_gpu
(
data
):
def
dequantize_q2_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q3_k
(
data
):
def
dequantize_q3_k
(
data
):
# C implementation
# C implementation
...
@@ -420,7 +423,7 @@ def dequantize_q3_k(data):
...
@@ -420,7 +423,7 @@ def dequantize_q3_k(data):
],
axis
=
1
)
],
axis
=
1
)
def
dequantize_q3_k_gpu
(
data
):
def
dequantize_q3_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q4_k
(
data
):
def
dequantize_q4_k
(
data
):
# C implementation
# C implementation
...
@@ -429,20 +432,16 @@ def dequantize_q4_k(data):
...
@@ -429,20 +432,16 @@ def dequantize_q4_k(data):
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
block_size
=
GGML_BLOCK_SIZES
[
"Q4_K"
]
block_size
=
GGML_BLOCK_SIZES
[
"Q4_K"
]
num_blocks
=
len
(
data
)
//
block_size
num_blocks
=
len
(
data
)
//
block_size
data_f16
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
block_size
//
2
)
data_f16
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
block_size
//
2
)
data_u8
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
block_size
)
data_u8
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
block_size
)
# Casting to float32 because float16 is very slow on CPU
# Casting to float32 because float16 is very slow on CPU
scale_factors
=
data_f16
[:,
0
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_factors
=
data_f16
[:,
0
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_offsets
=
data_f16
[:,
1
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_offsets
=
data_f16
[:,
1
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
qs1
=
data_u8
[:,
4
:
16
].
reshape
(
num_blocks
,
12
,
1
)
qs1
=
data_u8
[:,
4
:
16
].
reshape
(
num_blocks
,
12
,
1
)
qs2
=
data_u8
[:,
16
:].
reshape
(
num_blocks
,
4
,
32
)
qs2
=
data_u8
[:,
16
:].
reshape
(
num_blocks
,
4
,
32
)
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
factors
=
scale_factors
*
np
.
concatenate
([
qs1
[:,
0
:
4
]
&
0b111111
,
(
qs1
[:,
8
:]
&
15
)
|
((
qs1
[:,
0
:
4
]
>>
6
)
<<
4
)],
axis
=
1
)
factors
=
scale_factors
*
np
.
concatenate
([
qs1
[:,
0
:
4
]
&
0b111111
,
(
qs1
[:,
8
:]
&
15
)
|
((
qs1
[:,
0
:
4
]
>>
6
)
<<
4
)],
axis
=
1
)
offsets
=
scale_offsets
*
np
.
concatenate
([
qs1
[:,
4
:
8
]
&
0b111111
,
(
qs1
[:,
8
:]
>>
4
)
|
((
qs1
[:,
4
:
8
]
>>
6
)
<<
4
)],
axis
=
1
)
offsets
=
scale_offsets
*
np
.
concatenate
([
qs1
[:,
4
:
8
]
&
0b111111
,
(
qs1
[:,
8
:]
>>
4
)
|
((
qs1
[:,
4
:
8
]
>>
6
)
<<
4
)],
axis
=
1
)
# Interleave low and high quantized bits
# Interleave low and high quantized bits
qs2
=
np
.
stack
([
qs2
&
0xf
,
qs2
>>
4
],
axis
=
2
).
reshape
(
num_blocks
,
8
,
32
)
qs2
=
np
.
stack
([
qs2
&
0xf
,
qs2
>>
4
],
axis
=
2
).
reshape
(
num_blocks
,
8
,
32
)
# Dequantize final weights using scales and offsets
# Dequantize final weights using scales and offsets
...
@@ -513,7 +512,7 @@ def dequantize_q5_k(data):
...
@@ -513,7 +512,7 @@ def dequantize_q5_k(data):
],
axis
=
1
)
],
axis
=
1
)
def
dequantize_q5_k_gpu
(
data
):
def
dequantize_q5_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q6_k
(
data
):
def
dequantize_q6_k
(
data
):
...
@@ -573,6 +572,48 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
...
@@ -573,6 +572,48 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
data
=
torch
.
from_numpy
(
data
)
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
210
,
device
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
210
,
device
)
def
dequantize_q4_0
(
data
):
# C implementation
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515
# C struct definition
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141
num_blocks
=
len
(
data
)
//
GGML_BLOCK_SIZES
[
"Q4_0"
]
scales
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
1
+
8
)[:,
:
1
].
astype
(
np
.
float32
)
qs
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
16
)[:,
2
:]
return
np
.
concatenate
([
scales
*
((
qs
&
0xf
).
astype
(
np
.
int8
)
-
8
),
scales
*
((
qs
>>
4
).
astype
(
np
.
int8
)
-
8
),
],
axis
=
1
)
def
dequantize_q4_0_gpu
(
data
):
raise
NotImplementedError
()
def
dequantize_q5_0
(
data
):
# C implementation
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556
# C struct definition
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161
num_blocks
=
len
(
data
)
//
GGML_BLOCK_SIZES
[
"Q5_0"
]
scales
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
1
+
2
+
8
)[:,
:
1
].
astype
(
np
.
float32
)
qh
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
4
+
16
)[:,
2
:
2
+
4
]
qs
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
4
+
16
)[:,
2
+
4
:]
bits
=
np
.
unpackbits
(
qh
,
axis
=-
1
,
bitorder
=
"little"
)
x0
=
((
qs
&
0xf
).
astype
(
np
.
int8
)
|
(
bits
[:,
:
16
]
<<
4
))
-
16
x1
=
((
qs
>>
4
).
astype
(
np
.
int8
)
|
(
bits
[:,
16
:]
<<
4
))
-
16
return
np
.
concatenate
([
scales
*
x0
,
scales
*
x1
,
],
axis
=
1
)
def
dequantize_q5_0_gpu
(
data
):
raise
NotImplementedError
()
 def dequantize_q8_0(data):
     # C struct definition
     # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
...
@@ -615,6 +656,8 @@ def dequantize_f16_gpu(data, device):
 GGML_DEQUANTIZE = {
     "F32": dequantize_f32,
     "F16": dequantize_f16,
+    "Q4_0": dequantize_q4_0,
+    "Q5_0": dequantize_q5_0,
     "Q8_0": dequantize_q8_0,
     "Q2_K": dequantize_q2_k,
     "Q3_K": dequantize_q3_k,
...
@@ -626,6 +669,8 @@ GGML_DEQUANTIZE = {
 GGML_DEQUANTIZE_GPU = {
     "F32": dequantize_f32_gpu,
     "F16": dequantize_f16_gpu,
+    "Q4_0": dequantize_q4_0_gpu,
+    "Q5_0": dequantize_q5_0_gpu,
     "Q8_0": dequantize_q8_0_gpu,
     "Q2_K": dequantize_q2_k_gpu,
     "Q3_K": dequantize_q3_k_gpu,
...
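These two tables map GGUF tensor type names to dequantize implementations. A minimal sketch of how a caller might dispatch on the type name (the dequantize_tensor helper is illustrative, not the project's loader API; it assumes the tables above from ktransformers/util/custom_gguf.py are in scope):

import numpy as np

def dequantize_tensor(raw: bytes, ggml_type: str, device: str = "cpu"):
    # Illustrative dispatcher: choose the table by target device.
    data = np.frombuffer(raw, dtype=np.uint8)
    if device == "cpu":
        return GGML_DEQUANTIZE[ggml_type](data)
    return GGML_DEQUANTIZE_GPU[ggml_type](data, device)

# weights = dequantize_tensor(tensor_bytes, "Q4_0")          # NumPy path added in this commit
# weights = dequantize_tensor(tensor_bytes, "Q6_K", "cuda")  # CUDA path via KTransformersOps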
ktransformers/util/utils.py
View file @
11544ef2
...
@@ -79,13 +79,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
             logits = cuda_graph_runner(cur_token, position_ids, cache_position)
             past_key_values.change_seq_length(1)
             """
+            inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
+            custom_stream = torch.cuda.Stream()
             with torch.cuda.stream(custom_stream):
-                logits=model(cur_token,
+                logits=model(inputs_embeds = inputs_embeds,
                             position_ids=position_ids,
                             cache_position=cache_position,
                             past_key_values=past_key_values,
                             return_dict=False, use_cache=True)[0]
-            """
+            #"""
             torch.cuda.synchronize()
             #print(logits)
             next_token_scores = logits_warper(inputs, logits[:, -1, :])
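The (still commented-out) eager path above now embeds the current token on CPU, where embed_tokens is kept, and feeds the resulting embedding to the CUDA part of the model via inputs_embeds instead of passing token ids. A standalone sketch of that pattern (the decode_step name is illustrative, not part of the project):

import torch

def decode_step(model, cur_token, position_ids, cache_position, past_key_values):
    # Look up the embedding where embed_tokens lives (CPU), then hand it to the GPU layers.
    inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
    logits = model(inputs_embeds=inputs_embeds,
                   position_ids=position_ids,
                   cache_position=cache_position,
                   past_key_values=past_key_values,
                   return_dict=False, use_cache=True)[0]
    return logits[:, -1, :]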
...
@@ -108,7 +110,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
     generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
     past_key_values.cur_idx = cache_position
     start_time = time.time()
-    #custom_stream = torch.cuda.Stream()
     inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to("cuda")
     logits = model(
...
pyproject.toml
View file @
11544ef2
...
@@ -3,7 +3,8 @@ requires = [
     "setuptools",
     "torch >= 2.3.0",
     "ninja",
-    "packaging"
+    "packaging",
+    "cpufeature"
 ]
 build-backend = "setuptools.build_meta"
...
setup.py
View file @
11544ef2
...
@@ -6,7 +6,7 @@ Author : chenxl
 Date : 2024-07-27 16:15:27
 Version : 1.0.0
 LastEditors : chenxl
-LastEditTime : 2024-07-31 09:44:46
+LastEditTime : 2024-08-08 02:45:15
 Adapted from:
 https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
 Copyright (c) 2023, Tri Dao.
...
@@ -19,6 +19,7 @@ import re
 import ast
 import subprocess
 import platform
+import shutil
 import http.client
 import urllib.request
 import urllib.error
...
@@ -27,6 +28,7 @@ from packaging.version import parse
 import torch.version
 from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 from setuptools import setup, Extension
+from cpufeature.extension import CPUFeature
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

 class CpuInstructInfo:
...
@@ -67,6 +69,8 @@ class VersionInfo:
         """
         if sys.platform.startswith("linux"):
             return f'linux_{platform.uname().machine}'
+        elif sys.platform == "win32":
+            return "win_amd64"
         else:
             raise ValueError("Unsupported platform: {}".format(sys.platform))
...
@@ -97,6 +101,15 @@ class VersionInfo:
                 return 'avx2'
             raise ValueError(
                 "Unsupported cpu Instructions: {}".format(flags_line))
+        elif sys.platform == "win32":
+            if CPUFeature.get("AVX512bw", False):
+                return 'fancy'
+            if CPUFeature.get("AVX512f", False):
+                return 'avx512'
+            if CPUFeature.get("AVX2", False):
+                return 'avx2'
+            raise ValueError(
+                "Unsupported cpu Instructions: {}".format(str(CPUFeature)))
         else:
             raise ValueError("Unsupported platform: {}".format(sys.platform))
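On Windows there is no /proc/cpuinfo to parse, so the instruction-set probe now goes through the cpufeature package. A minimal sketch of the same selection order outside setup.py (the pick_instruction_set name is illustrative; the feature keys are the ones used above):

from cpufeature.extension import CPUFeature

def pick_instruction_set() -> str:
    # Prefer the richest profile first, mirroring the order in setup.py.
    if CPUFeature.get("AVX512bw", False):
        return "fancy"      # AVX-512 BW and friends (llamafile's HAVE_FANCY_SIMD build)
    if CPUFeature.get("AVX512f", False):
        return "avx512"
    if CPUFeature.get("AVX2", False):
        return "avx2"
    raise ValueError(f"Unsupported CPU: {CPUFeature}")

print(pick_instruction_set())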
...
@@ -154,7 +167,7 @@ class BuildWheelsCommand(_bdist_wheel):
             wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
             print("Raw wheel path", wheel_path)
-            os.rename(wheel_filename, wheel_path)
+            shutil.move(wheel_filename, wheel_path)
         except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected):
             print("Precompiled wheel not found. Building from source...")
             # If the wheel could not be downloaded, build from source
...
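The switch from os.rename to shutil.move matters on Windows: os.rename raises OSError when source and destination sit on different drives, while shutil.move falls back to copy-and-delete. A small sketch of the difference (the paths are made up):

import os
import shutil

src = r"C:\Temp\ktransformers-0.1.0-win_amd64.whl"   # illustrative paths on two drives
dst = r"D:\build\dist\ktransformers-0.1.0-win_amd64.whl"

try:
    os.rename(src, dst)        # fails with OSError across filesystems/drives
except OSError:
    shutil.move(src, dst)      # copies then removes the source instead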
third_party/llamafile/iqk_mul_mat.inc
View file @
11544ef2
...
@@ -22,7 +22,7 @@
 #include <cstring>
 #include <type_traits>
-#if defined __x86_64__ || defined __aarch64__
+#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
...
@@ -225,7 +225,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
     return true;
 }
-#if defined __x86_64__
+#if defined __x86_64__ || defined(_M_X64)
 #if defined HAVE_FANCY_SIMD
 #undef HAVE_FANCY_SIMD
...
@@ -1412,7 +1412,8 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
 bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) {
-    row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
+    if (ne00 % ggml_blck_size(GGML_TYPE_Q8_K) == 0)
+        row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
     switch (typeA) {
         case GGML_TYPE_Q2_K:
...
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
View file @
11544ef2
...
@@ -3,6 +3,6 @@
 // Copyrigth 2024 Iwan Kawrakow.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #include "iqk_mul_mat.inc"
 #endif // __x86_64__
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
View file @
11544ef2
...
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Iwan Kawrakow.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define iqk_mul_mat iqk_mul_mat_zen4
 #define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
 #include "iqk_mul_mat.inc"
...
third_party/llamafile/sgemm.cpp
View file @
11544ef2
...
@@ -22,19 +22,22 @@
 #include "sgemm.h"
 // #include <cosmo.h>
-#include <cpuid.h>
+// #include <cpuid.h>
 // #include <libc/sysv/consts/hwcap.h>
 #include <stdio.h>
-#include <sys/auxv.h>
+// #include <sys/auxv.h>
 #include <cassert>
 // #include "llamafile.h"

 static const struct GemmFuncs {
-    typeof(llamafile_sgemm)* sgemm;
-    typeof(llamafile_mixmul)* mixmul;
-    typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
+    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
+    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
+    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
+    // typeof(llamafile_sgemm)* sgemm;
+    // typeof(llamafile_mixmul)* mixmul;
+    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
     GemmFuncs() {
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
         // if (X86_HAVE(AVX)) {
         // if (X86_HAVE(FMA)) {
         // if (X86_HAVE(AVX2)) {
...
@@ -86,10 +89,12 @@ static const struct GemmFuncs {
         // sgemm = llamafile_sgemm_unsupported;
         // mixmul = llamafile_mixmul_unsupported;
         // }
 #if defined(__AVX__)
-#if defined(__FMA__)
+#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
 #if defined(__AVX2__)
 #if defined(__AVX512F__)
+        printf("__AVX512F__\n");
 #if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
         // AMD Zen4+ (2023-)
         sgemm = llamafile_sgemm_amd_zen4;
...
third_party/llamafile/tinyblas_cpu.h
View file @
11544ef2
...
@@ -223,7 +223,7 @@ inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e)
 }
 #endif
-#if defined(__FMA__)
+#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 template <>
 inline __m256 madd(__m256 a, __m256 b, __m256 c) {
...