修改 AWQ 相关的 printf 提示，在 CMakeLists.txt 中添加 CK so 库路径

617e86ea · gaoqiong · d26f4c73 · 617e86ea · 617e86ea · 617e86ea
Commit 617e86ea authored May 28, 2024 by gaoqiong
4 changed files
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -381,7 +381,7 @@ class TurboMind:
        self.config = cfg
        self.model_name = cfg.model_name
        self.data_type = cfg.weight_type
-        print("from_workspace_cfg:",cfg)
+        #print("from_workspace_cfg:",cfg)
        # create model
        logger.warning(f'model_config:\n\n{cfg.toini()}')

--- a/src/turbomind/models/llama/CMakeLists.txt
+++ b/src/turbomind/models/llama/CMakeLists.txt
@@ -25,6 +25,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
 #set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE  ON)
 #set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
+target_link_directories(Llama PUBLIC ../../../../3rdparty/)
 target_link_libraries(Llama PUBLIC cudart
        gemm_s4_f16
        cublasMMWrapper

--- a/src/turbomind/models/llama/LlamaLinear.h
+++ b/src/turbomind/models/llama/LlamaLinear.h
@@ -165,11 +165,6 @@ private:
               }
            }
-            if(cublas_wrapper_->m_dump_switch==2)
-            {
-                std::cout<<" m: "<<batch_size<<" n: "<<weight.output_dims<<" k: "<<weight.input_dims<<std::endl;
-                PrintScale<T>(stream_,output_data,36,0,0,0);
-            }
            sync_check_cuda_error();
        }
        else {
@@ -249,10 +244,6 @@ private:
            else if(weight.w4_weight_layout==2) //TN 模式padding ck
            {
                //检查ck workspace 的空间是否足够
-                if(batch_size*weight.output_dims>M_max*N_max)
-                {
-                    FT_CHECK_WITH_INFO(0, "error! ck workspace is not enough");
-                }
                if(weight.input_dims%4096==0)
                {
@@ -265,11 +256,6 @@ private:
               }
            }
            addFusedSiluActivation(stream_,output_data,output_tmp,batch_size,weight.output_dims,1);
-            if(cublas_wrapper_->m_dump_switch==2)
-            {
-                std::cout<<" m: "<<batch_size<<" n: "<<weight.output_dims<<" k: "<<weight.input_dims<<std::endl;
-                PrintScale<T>(stream_,output_data,36,0,0,0);
-            }
            sync_check_cuda_error();
        }
        else {

--- a/src/turbomind/models/llama/LlamaWeight.cc
+++ b/src/turbomind/models/llama/LlamaWeight.cc
@@ -69,14 +69,14 @@ LlamaWeight<T>::LlamaWeight(size_t     head_num,
        std::string str_w4_weight_layout = std::to_string(w4_weight_layout);
        const char* env_value = str_w4_weight_layout.c_str();
        setenv(env_name,env_value , 1);
-        printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n",w4_weight_layout);
+        //printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n",w4_weight_layout);
    }
    else
    {
        std::string str_w4_weight_layout = std::to_string(-1);
        const char* env_value = str_w4_weight_layout.c_str();
        setenv(env_name,env_value , 1);
-        printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n",w4_weight_layout);
+        //printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n",-1);
    }
    mallocWeights();
 }