Merge branch 'develop' into reopen_masking_att_instance

4fbb8598 · Chao Liu · GitHub · 45996360 · e9d4e893 · 4fbb8598
Unverified Commit 4fbb8598 authored Sep 22, 2022 by Chao Liu Committed by GitHub Sep 22, 2022
13 changed files
--- a/Config.cmake.in
+++ b/Config.cmake.in
 @PACKAGE_INIT@
-set(_composable_kernel_supported_components device_operations host_tensor)
+set(_composable_kernel_supported_components device_operations utility)
 foreach(_comp ${composable_kernel_FIND_COMPONENTS})
 	if(NOT _comp IN_LIST _composable_kernel_supported_components)

--- a/Dockerfile
+++ b/Dockerfile
@@ -12,7 +12,8 @@ RUN apt-get install -y wget gnupg
 RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
 RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
 RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
-RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
+#RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@@ -68,7 +69,6 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
-ADD dev-requirements.txt dev-requirements.txt
 RUN groupadd -f render
 # Install the new rocm-cmake version

--- a/Jenkinsfile
+++ b/Jenkinsfile
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -9,7 +9,7 @@ message(STATUS "Build with HIP ${hip_VERSION}")
 # add all example subdir
 file(GLOB dir_list LIST_DIRECTORIES true *)
 FOREACH(subdir ${dir_list})
-    IF(IS_DIRECTORY "${subdir}")
+    IF(IS_DIRECTORY "${subdir}" AND (NOT "${subdir}" MATCHES "build"))
        add_subdirectory(${subdir})
    ENDIF()
 ENDFOREACH()
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
 ROCmSoftwarePlatform/rocm-recipes
 # 1.90+
-danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f
\ No newline at end of file
--- a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -506,12 +506,12 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
        __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
        {
-            return g_idx * static_cast<long_index_t>(batch_stride_A_);
+            return static_cast<long_index_t>(g_idx) * batch_stride_A_;
        }
        __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
        {
-            return g_idx * static_cast<long_index_t>(batch_stride_B_);
+            return static_cast<long_index_t>(g_idx) * batch_stride_B_;
        }
        __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
@@ -519,8 +519,8 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
            std::array<long_index_t, NumDTensor> ds_offset;
            static_for<0, NumDTensor, 1>{}([&](auto i) {
-                ds_offset[i] =
+                ds_offset[i] = static_cast<long_index_t>(g_idx) *
-                    ds_grid_desc_g_m_n_[i].CalculateOffset(make_multi_index(g_idx, 0, 0));
+                               ds_grid_desc_g_m_n_[i].CalculateOffset(make_multi_index(1, 0, 0));
            });
            return ds_offset;
@@ -528,7 +528,8 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
        __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
        {
-            return e_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0));
+            return static_cast<long_index_t>(g_idx) *
+                   e_grid_desc_g_m_n_.CalculateOffset(make_multi_index(1, 0, 0));
        }
        private:

--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -332,7 +332,10 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
+              cde_element_op_{cde_element_op},
+              MRaw_{MRaw},
+              NRaw_{NRaw},
+              KRaw_{KRaw}
        {
            // populate pointer, desc for Ds
            static_for<0, NumDTensor, 1>{}([&](auto i) {
@@ -400,6 +403,11 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        AElementwiseOperation a_element_op_;
        BElementwiseOperation b_element_op_;
        CDEElementwiseOperation cde_element_op_;
+        // for checking vector load/store
+        index_t MRaw_;
+        index_t NRaw_;
+        index_t KRaw_;
    };
    // Invoker
@@ -486,6 +494,86 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
            return false;
        }
+        // check vector load/store
+        {
+            using Row = ck::tensor_layout::gemm::RowMajor;
+            using Col = ck::tensor_layout::gemm::ColumnMajor;
+            // check vector load of A
+            if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+            // check vector load of B
+            if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+            // check vector load of Ds
+            // only support RowMajor for now
+            bool all_valid = true;
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                if constexpr(!is_same_v<DLayout, Row>)
+                {
+                    all_valid = false;
+                }
+            });
+            if(!all_valid)
+            {
+                return false;
+            }
+            // check vector store of E
+            // only support RowMajor for now
+            if constexpr(is_same_v<ELayout, Row>)
+            {
+                if(arg.NRaw_ % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                           arg.b_grid_desc_n_k_,
                                           arg.ds_grid_desc_m_n_,

--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -3,27 +3,27 @@
 #include <cstring>
-// int profile_gemm(int, char*[]);
+int profile_gemm(int, char*[]);
-// int profile_gemm_splitk(int, char*[]);
+int profile_gemm_splitk(int, char*[]);
-// int profile_gemm_bilinear(int, char*[]);
+int profile_gemm_bilinear(int, char*[]);
-// int profile_gemm_add_add_fastgelu(int, char*[]);
+int profile_gemm_add_add_fastgelu(int, char*[]);
-// int profile_gemm_reduce(int, char*[]);
+int profile_gemm_reduce(int, char*[]);
-// int profile_gemm_bias_add_reduce(int, char*[]);
+int profile_gemm_bias_add_reduce(int, char*[]);
-// int profile_batched_gemm(int, char*[]);
+int profile_batched_gemm(int, char*[]);
-// int profile_batched_gemm_gemm(int, char*[]);
+int profile_batched_gemm_gemm(int, char*[]);
-// int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
+int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
-// int profile_batched_gemm_reduce(int, char*[]);
+int profile_batched_gemm_reduce(int, char*[]);
-// int profile_grouped_gemm(int, char*[]);
+int profile_grouped_gemm(int, char*[]);
-// int profile_conv_fwd(int, char*[]);
+int profile_conv_fwd(int, char*[]);
-// int profile_conv_fwd_bias_relu(int, char*[]);
+int profile_conv_fwd_bias_relu(int, char*[]);
-// int profile_conv_fwd_bias_relu_add(int, char*[]);
+int profile_conv_fwd_bias_relu_add(int, char*[]);
-// int profile_conv_bwd_data(int, char*[]);
+int profile_conv_bwd_data(int, char*[]);
-// int profile_conv_bwd_weight(int, char*[]);
+int profile_conv_bwd_weight(int, char*[]);
-// int profile_grouped_conv_fwd(int, char*[]);
+int profile_grouped_conv_fwd(int, char*[]);
-// int profile_normalization(int, char*[]);
+int profile_normalization(int, char*[]);
 int profile_layernorm(int, char*[]);
 int profile_groupnorm(int, char*[]);
-// int profile_reduce(int, char*[]);
+int profile_reduce(int, char*[]);
 static void print_helper_message()
 {
@@ -57,7 +57,6 @@ int main(int argc, char* argv[])
        return 0;
    }
-#if 0
    else if(strcmp(argv[1], "gemm") == 0)
    {
        return profile_gemm(argc, argv);
@@ -134,7 +133,6 @@ int main(int argc, char* argv[])
    {
        return profile_normalization(argc, argv);
    }
-#endif
    else if(strcmp(argv[1], "layernorm") == 0)
    {
        return profile_layernorm(argc, argv);

--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -2,15 +2,14 @@
 #
 # in order to run this script you'd need the following python packages:
-pip3 install --upgrade pip
+#pip3 install --upgrade pip
-pip3 install sqlalchemy pymysql pandas sshtunnel
+#pip3 install sqlalchemy pymysql pandas sshtunnel
 # you would also need to set up some environment variables in order to 
 # post your new test results to the database and compare them to the baseline
 # please contact Illia.Silin@amd.com for more details
 #process results
-gpu_arch=$1
+python3 process_perf_data.py perf_gemm.log
-python3 process_perf_data.py perf_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_resnet50_N256.log
-python3 process_perf_data.py perf_resnet50_N256_"$gpu_arch".log
+python3 process_perf_data.py perf_resnet50_N4.log
-python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -10,15 +10,14 @@
 # please contact Illia.Silin@amd.com for more details
 #process results
-gpu_arch=$1
+python3 process_perf_data.py perf_gemm.log
-python3 process_perf_data.py perf_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_resnet50_N256.log
-python3 process_perf_data.py perf_resnet50_N256_"$gpu_arch".log
+python3 process_perf_data.py perf_resnet50_N4.log
-python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
+python3 process_perf_data.py perf_batched_gemm.log
-python3 process_perf_data.py perf_batched_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_grouped_gemm.log
-python3 process_perf_data.py perf_grouped_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_conv_fwd.log
-python3 process_perf_data.py perf_conv_fwd_"$gpu_arch".log
+python3 process_perf_data.py perf_conv_bwd_data.log
-python3 process_perf_data.py perf_conv_bwd_data_"$gpu_arch".log
+python3 process_perf_data.py perf_gemm_bilinear.log
-python3 process_perf_data.py perf_gemm_bilinear_"$gpu_arch".log
+python3 process_perf_data.py perf_reduction.log
-python3 process_perf_data.py perf_reduction_"$gpu_arch".log
+python3 process_perf_data.py perf_splitK_gemm.log
-python3 process_perf_data.py perf_splitK_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_onnx_gemm.log
-python3 process_perf_data.py perf_onnx_gemm_"$gpu_arch".log
--- a/script/run_full_performance_tests.sh
+++ b/script/run_full_performance_tests.sh
@@ -5,12 +5,11 @@
 # post your new test results to the database and compare them to the baseline
 # please contact Illia.Silin@amd.com for more details
 #
-# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> < node name>
+# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>"
 # input arguments: 
 # verification = 0 : do not verify result correctness on CPU
 #              = 1 : verify correctness on CPU (may take a long time)
 # environment tag  : a string describing the specifics of your test environment
-# gpu_arch         : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
 # branch name      : name of the branch in git repo (git status | grep -e 'On branch')
 # node name        : $hostname
@@ -19,11 +18,9 @@ export verify=$1
 echo 'Verification: ' $verify
 export env_type=$2
 echo 'Environment type: ' $env_type
-export gpu_arch=$3
+export branch=$3
-echo 'GPU architecture: ' $gpu_arch
-export branch=$4
 echo 'Branch name: ' $branch
-export host_name=$5
+export host_name=$4
 echo 'Host name: ' $host_name
 function print_log_header(){
 	rm -f $1;
@@ -38,7 +35,7 @@ function print_log_header(){
 }
 #run gemm tests
-export gemm_log="perf_gemm_${gpu_arch}.log"
+export gemm_log="perf_gemm.log"
 print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_log
 ./profile_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_log
@@ -58,7 +55,7 @@ print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 3 3 $verify 1 0 1 2>&1 | tee -a $gemm_log
 #run batched_gemm tests
-export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log"
+export batched_gemm_log="perf_batched_gemm.log"
 print_log_header $batched_gemm_log $env_type $branch $host_name
 ./profile_batched_gemm.sh batched_gemm 0 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
 ./profile_batched_gemm.sh batched_gemm 0 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
@@ -78,7 +75,7 @@ print_log_header $batched_gemm_log $env_type $branch $host_name
 ./profile_batched_gemm.sh batched_gemm 3 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
 #run grouped_gemm tests
-export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log"
+export grouped_gemm_log="perf_grouped_gemm.log"
 print_log_header $grouped_gemm_log $env_type $branch $host_name
 ./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log
 ./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log
@@ -86,7 +83,7 @@ print_log_header $grouped_gemm_log $env_type $branch $host_name
 ./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log
 #run GEMM+Bilinear tests
-export gemm_bilinear_log="perf_gemm_bilinear_${gpu_arch}.log"
+export gemm_bilinear_log="perf_gemm_bilinear.log"
 print_log_header $gemm_bilinear_log $env_type $branch $host_name
 ./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
 ./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
@@ -94,7 +91,7 @@ print_log_header $gemm_bilinear_log $env_type $branch $host_name
 ./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
 #run conv_fwd tests
-export conv_fwd_log="perf_conv_fwd_${gpu_arch}.log"
+export conv_fwd_log="perf_conv_fwd.log"
 print_log_header $conv_fwd_log $env_type $branch $host_name
 ./profile_conv_fwd.sh conv_fwd 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
 ./profile_conv_fwd.sh conv_fwd 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
@@ -102,7 +99,7 @@ print_log_header $conv_fwd_log $env_type $branch $host_name
 ./profile_conv_fwd.sh conv_fwd 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
 #run conv_bwd_data tests
-export conv_bwd_data_log="perf_conv_bwd_data_${gpu_arch}.log"
+export conv_bwd_data_log="perf_conv_bwd_data.log"
 print_log_header $conv_bwd_data_log $env_type $branch $host_name
 ./profile_conv_bwd_data.sh conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
 ./profile_conv_bwd_data.sh conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
@@ -110,33 +107,43 @@ print_log_header $conv_bwd_data_log $env_type $branch $host_name
 ./profile_conv_bwd_data.sh conv_bwd_data 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
 #run resnet50 tests
-export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
+export resnet256_log="perf_resnet50_N256.log"
 print_log_header $resnet256_log $env_type $branch $host_name
 ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 2>&1 | tee -a $resnet256_log
-export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
+export resnet4_log="perf_resnet50_N4.log"
 print_log_header $resnet4_log $env_type $branch $host_name
 ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 2>&1 | tee -a $resnet4_log
 #run reduction tests
-export reduction_log="perf_reduction_${gpu_arch}.log"
+export reduction_log="perf_reduction.log"
 print_log_header $reduction_log $env_type $branch $host_name
 ./profile_reduce_with_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log
 ./profile_reduce_no_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log
-#run splitK_gemm tests
+#run splitK_gemm tests, first correctness verification, then performance
-export splitK_gemm_log="perf_splitK_gemm_${gpu_arch}.log"
+export splitK_gemm_ver_log="perf_splitK_gemm_verify.log"
+print_log_header $splitK_gemm_ver_log $env_type $branch $host_name
+./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+export splitK_gemm_log="perf_splitK_gemm.log"
 print_log_header $splitK_gemm_log $env_type $branch $host_name
-./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 0 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 1 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 2 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 3 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 0 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 1 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 2 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 3 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
 #run ONNX gemm tests
-export onnx_log="perf_onnx_gemm_${gpu_arch}.log"
+export onnx_log="perf_onnx_gemm.log"
 print_log_header $onnx_log $env_type $branch $host_name
 ./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
 ./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
--- a/script/run_performance_tests.sh
+++ b/script/run_performance_tests.sh
 #!/bin/bash 
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
-# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> < node name>
+# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>"
 # input arguments: 
 # verification = 0 : do not verify result correctness on CPU
 #              = 1 : verify correctness on CPU (may take a long time)
 # environment tag  : a string describing the specifics of your test environment
-# gpu_arch         : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
 # branch name      : name of the branch in git repo (git status | grep -e 'On branch')
 # node name        : $hostname
@@ -15,11 +14,9 @@ export verify=$1
 echo 'Verification: ' $verify
 export env_type=$2
 echo 'Environment type: ' $env_type
-export gpu_arch=$3
+export branch=$3
-echo 'GPU architecture: ' $gpu_arch
-export branch=$4
 echo 'Branch name: ' $branch
-export host_name=$5
+export host_name=$4
 echo 'Host name: ' $host_name
 function print_log_header(){
@@ -35,7 +32,7 @@ function print_log_header(){
 }
 #run gemm tests
-export gemm_log="perf_gemm_${gpu_arch}.log"
+export gemm_log="perf_gemm.log"
 print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 0 0 $verify 1 0 1 | tee -a $gemm_log
 ./profile_gemm.sh gemm 1 0 $verify 1 0 1 | tee -a $gemm_log
@@ -55,9 +52,9 @@ print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log
 #run resnet50 tests
-export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
+export resnet256_log="perf_resnet50_N256.log"
 print_log_header $resnet256_log $env_type $branch $host_name
 ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 | tee -a $resnet256_log
-export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
+export resnet4_log="perf_resnet50_N4.log"
 print_log_header $resnet4_log $env_type $branch $host_name
 ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 | tee -a $resnet4_log