OpenDAS / ollama

Commit 23a7a73f, authored Feb 28, 2025 by xuxzh1
parent f38c5ca1

    fix bug

Changes: 10 changed files, with 10 additions and 10 deletions (+10 -10)
llama/ggml-cuda/binbcast.cu   +1 -1
llama/ggml-cuda/convert.cu    +1 -1
llama/ggml-cuda/cpy.cu        +1 -1
llama/ggml-cuda/ggml-cuda.cu  +1 -1
llama/ggml-cuda/mmvq.cu       +1 -1
llama/ggml-cuda/quantize.cu   +1 -1
llama/ggml-cuda/rope.cu       +1 -1
llama/ggml-cuda/softmax.cu    +1 -1
llama/ggml-cuda/unary.cu      +1 -1
make/Makefile.rocm            +1 -1
llama/ggml-cuda/binbcast.cu

@@ -49,7 +49,7 @@ static __device__ __forceinline__ float op_div(const float a, const float b) {
 }
 
 template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+static __global__ __launch_bounds__(1024) void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
         int ne0, int ne1, int ne2, int ne3,
         int ne10, int ne11, int ne12, int ne13,
         /*int s0, */ int s1, int s2, int s3,
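Each of the CUDA files below receives the same one-line change: a __launch_bounds__(1024) annotation inserted into a kernel declaration. As a minimal self-contained sketch of what that annotation means (not taken from this repository; scale_f32 is a hypothetical kernel), it promises the compiler that the kernel is never launched with more than 1024 threads per block, so the register allocator can budget against that cap, and a launch exceeding it fails:

// Minimal standalone sketch (not from this commit): a kernel annotated the
// same way as the kernels in this change. scale_f32 is a hypothetical name.
#include <cstdio>

// __launch_bounds__(1024) declares that launches use at most 1024 threads
// per block; launching with a larger block would fail at launch time.
static __global__ __launch_bounds__(1024) void scale_f32(const float * x, float * dst, const float s, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] * s;
}

int main() {
    const int k = 1 << 20;
    float * x; float * dst;
    cudaMalloc(&x,   k*sizeof(float));
    cudaMalloc(&dst, k*sizeof(float));
    const int block = 1024;                    // must not exceed the declared bound
    const int grid  = (k + block - 1) / block;
    scale_f32<<<grid, block>>>(x, dst, 2.0f, k);
    cudaDeviceSynchronize();
    cudaFree(x); cudaFree(dst);
    printf("ok\n");
    return 0;
}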
llama/ggml-cuda/convert.cu

@@ -596,7 +596,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }
 
 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+static __global__ __launch_bounds__(1024) void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
     const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
llama/ggml-cuda/cpy.cu

@@ -57,7 +57,7 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
 }
 
 template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+static __global__ __launch_bounds__(1024) void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                    const int nb12, const int nb13) {
llama/ggml-cuda/ggml-cuda.cu

@@ -1581,7 +1581,7 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-static __global__ void k_compute_batched_ptrs(
+static __global__ __launch_bounds__(1024) void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
         int64_t ne12, int64_t ne13,
llama/ggml-cuda/mmvq.cu

@@ -76,7 +76,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
 template <ggml_type type, int ncols_y>
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 // tell the compiler to use as many registers as it wants, see nwarps definition below
-__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
+__launch_bounds__((ncols_y <= 4 ? 4 : 2)*16, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
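This is the one change in the commit that does not add a 1024-thread bound: it replaces WARP_SIZE in the existing launch-bounds expression with the literal 16. The commit message does not say why; presumably the target hardware's warp-size constant differs from what the expression assumed. A hypothetical host-side illustration (not from the repository) of how the declared thread budget shifts, assuming the common WARP_SIZE of 32:

// Hypothetical illustration (not from the repository): thread budgets
// declared by the old and new __launch_bounds__ expressions in mmvq.cu.
#include <cstdio>

constexpr int WARP_SIZE = 32;  // assumption: the usual CUDA warp size

constexpr int bound_old(int ncols_y) { return (ncols_y <= 4 ? 4 : 2) * WARP_SIZE; }
constexpr int bound_new(int ncols_y) { return (ncols_y <= 4 ? 4 : 2) * 16; }

int main() {
    const int cases[] = {1, 4, 8};
    for (int ncols_y : cases) {
        // With WARP_SIZE == 32 the new literal halves the declared maximum.
        printf("ncols_y=%d  old=%3d  new=%3d\n", ncols_y, bound_old(ncols_y), bound_new(ncols_y));
    }
    return 0;
}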
llama/ggml-cuda/quantize.cu

@@ -27,7 +27,7 @@
 #include "quantize.cuh"
 #include <cstdint>
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
+static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
     const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (ix0 >= kx0_padded) {
llama/ggml-cuda/rope.cu

@@ -60,7 +60,7 @@ static __device__ void rope_yarn(
 }
 
 template<typename T, bool has_ff>
-static __global__ void rope_norm(
+static __global__ __launch_bounds__(1024) void rope_norm(
         const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
         float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
llama/ggml-cuda/softmax.cu

@@ -38,7 +38,7 @@ __device__ float __forceinline__ t2f32<half>(half val) {
 }
 
 template <bool vals_smem, int ncols_template, int block_size_template, typename T>
-static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
+static __global__ __launch_bounds__(1024) void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
                                     const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
     const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
     const int tid = threadIdx.x;
llama/ggml-cuda/unary.cu

@@ -68,7 +68,7 @@ static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
     dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
 }
 
-static __global__ void silu_f32(const float * x, float * dst, const int k) {
+static __global__ __launch_bounds__(1024) void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
make/Makefile.rocm

@@ -76,7 +76,7 @@ GPU_COMPILER_CUFLAGS = \
 	-DGGML_CUDA_MMV_Y=1 \
 	-DGGML_SCHED_MAX_COPIES=4 \
 	-DGGML_USE_HIP \
-	--gpu-max-threads-per-block=1024 \
+	# --gpu-max-threads-per-block=1024 \
 	-DCDNA \
 	-DGGML_USE_LLAMAFILE \
 	-DHIP_FAST_MATH \
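Read together with the source changes above, the Makefile edit moves the 1024-thread cap from a global HIP compiler flag (--gpu-max-threads-per-block=1024, now commented out) to explicit per-kernel __launch_bounds__(1024) annotations. A hedged sketch (not from the repository) of host code that checks the device limit those annotations must stay within:

// Sketch (not from this commit): query the device's per-block thread limit,
// which the per-kernel __launch_bounds__(1024) declarations assume exists.
#include <cstdio>

int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        fprintf(stderr, "no device found\n");
        return 1;
    }
    printf("maxThreadsPerBlock = %d\n", prop.maxThreadsPerBlock);
    return 0;
}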