Commit b3befef4 authored by xuxz's avatar xuxz
Browse files

[Rocm] Decouple the driver dependencies && [Bugfix] accuracy problem

parent 9de895f0
...@@ -13,7 +13,7 @@ Ollama可快速部署主流模型。 ...@@ -13,7 +13,7 @@ Ollama可快速部署主流模型。
##### Docker ##### Docker
```bash ```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10 docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
docker run -i -t -d --device=/dev/kfd --privileged --network=host --device=/dev/dri --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v 项目地址(绝对路径):/home -v /opt/hyhal:/opt/hyhal:ro -v --group-add video --shm-size 16G --name {容器名} {镜像ID} docker run -i -t -d --device=/dev/kfd --privileged --network=host --device=/dev/dri --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v 项目地址(绝对路径):/home -v /opt/hyhal:/opt/hyhal:ro -v --group-add video --shm-size 16G --name {容器名} {镜像ID}
``` ```
......
...@@ -55,11 +55,11 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -55,11 +55,11 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
} }
// Opportunistic logging of driver version to aid in troubleshooting // Opportunistic logging of driver version to aid in troubleshooting
driverMajor, driverMinor, err := AMDDriverVersion() // driverMajor, driverMinor, err := AMDDriverVersion()
if err != nil { // if err != nil {
// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU // // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err) // slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
} // }
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others // Determine if the user has already pre-selected which GPUs to look at, then ignore the others
var visibleDevices []string var visibleDevices []string
...@@ -284,8 +284,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -284,8 +284,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
Name: name, Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch), Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory, MinimumMemory: rocmMinimumMemory,
DriverMajor: driverMajor, // DriverMajor: driverMajor,
DriverMinor: driverMinor, // DriverMinor: driverMinor,
}, },
usedFilepath: usedFile, usedFilepath: usedFile,
index: gpuID, index: gpuID,
...@@ -415,15 +415,15 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { ...@@ -415,15 +415,15 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
// Quick check for AMD driver so we can skip amdgpu discovery if not present // Quick check for AMD driver so we can skip amdgpu discovery if not present
func AMDDetected() bool { func AMDDetected() bool {
// Some driver versions (older?) don't have a version file, so just lookup the parent dir // Some driver versions (older?) don't have a version file, so just lookup the parent dir
sysfsDir := filepath.Dir(DriverVersionFile) // sysfsDir := filepath.Dir(DriverVersionFile)
_, err := os.Stat(sysfsDir) // _, err := os.Stat(sysfsDir)
if errors.Is(err, os.ErrNotExist) { // if errors.Is(err, os.ErrNotExist) {
slog.Debug("amdgpu driver not detected " + sysfsDir) // slog.Debug("amdgpu driver not detected " + sysfsDir)
return false // return false
} else if err != nil { // } else if err != nil {
slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err) // slog.Debug("error looking up amd driver", "path", sysfsDir, "error", err)
return false // return false
} // }
return true return true
} }
......
...@@ -49,7 +49,7 @@ static __device__ __forceinline__ float op_div(const float a, const float b) { ...@@ -49,7 +49,7 @@ static __device__ __forceinline__ float op_div(const float a, const float b) {
} }
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t> template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __global__ __launch_bounds__(1024) void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3, int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13, int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3, /*int s0, */ int s1, int s2, int s3,
......
...@@ -596,7 +596,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t ...@@ -596,7 +596,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
} }
template <typename src_t, typename dst_t> template <typename src_t, typename dst_t>
static __global__ __launch_bounds__(1024) void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) { if (i >= k) {
......
...@@ -57,7 +57,7 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { ...@@ -57,7 +57,7 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
} }
template <cpy_kernel_t cpy_1> template <cpy_kernel_t cpy_1>
static __global__ __launch_bounds__(1024) void cpy_f32_f16(const char * cx, char * cdst, const int ne, static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13) { const int nb12, const int nb13) {
......
...@@ -1581,7 +1581,7 @@ static void ggml_cuda_op_mul_mat( ...@@ -1581,7 +1581,7 @@ static void ggml_cuda_op_mul_mat(
} }
} }
static __global__ __launch_bounds__(1024) void k_compute_batched_ptrs( static __global__ void k_compute_batched_ptrs(
const half * src0_as_f16, const half * src1_as_f16, char * dst, const half * src0_as_f16, const half * src1_as_f16, char * dst,
const void ** ptrs_src, void ** ptrs_dst, const void ** ptrs_src, void ** ptrs_dst,
int64_t ne12, int64_t ne13, int64_t ne12, int64_t ne13,
......
...@@ -75,8 +75,8 @@ ...@@ -75,8 +75,8 @@
template <ggml_type type, int ncols_y> template <ggml_type type, int ncols_y>
// #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) // #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
// tell the compiler to use as many registers as it wants, see nwarps definition below // // tell the compiler to use as many registers as it wants, see nwarps definition below
//__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) // __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
// #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) // #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static __global__ void mul_mat_vec_q( static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
...@@ -92,7 +92,7 @@ ...@@ -92,7 +92,7 @@
constexpr int rows_per_cuda_block = 1; constexpr int rows_per_cuda_block = 1;
#else #else
constexpr int nwarps = ncols_y <= 4 ? 4 : 2; constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 1; constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
...@@ -176,7 +176,7 @@ ...@@ -176,7 +176,7 @@
case 3: case 3:
case 4: case 4:
nwarps = 4; nwarps = 4;
rows_per_cuda_block = 1; rows_per_cuda_block = 2;
break; break;
case 5: case 5:
case 6: case 6:
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
#include "quantize.cuh" #include "quantize.cuh"
#include <cstdint> #include <cstdint>
static __global__ __launch_bounds__(1024) void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) { static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (ix0 >= kx0_padded) { if (ix0 >= kx0_padded) {
......
...@@ -60,7 +60,7 @@ static __device__ void rope_yarn( ...@@ -60,7 +60,7 @@ static __device__ void rope_yarn(
} }
template<typename T, bool has_ff> template<typename T, bool has_ff>
static __global__ __launch_bounds__(1024) void rope_norm( static __global__ void rope_norm(
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) { float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
......
...@@ -68,7 +68,7 @@ static __global__ void gelu_quick_f32(const float * x, float * dst, int k) { ...@@ -68,7 +68,7 @@ static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i]))); dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
} }
static __global__ __launch_bounds__(1024) void silu_f32(const float * x, float * dst, const int k) { static __global__ void silu_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x; const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) { if (i >= k) {
......
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
include make/common-defs.make include make/common-defs.make
include make/rocm-defs.make include make/rocm-defs.make
HIP_ARCHS_COMMON := gfx928 gfx906 HIP_ARCHS_COMMON := gfx928 gfx906 gfx936
HIP_ARCHS_LINUX := gfx928 gfx906 HIP_ARCHS_LINUX := gfx928 gfx906 gfx936
ifeq ($(OS),windows) ifeq ($(OS),windows)
GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin") GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment