/*
 * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <stdlib.h>

namespace turbomind {

// Number of floats in each fixed-size part of the scale list (see ScaleList below).
#define ACTIVATION_AMAX_NUM 72
#define INT8O_GEMM_NUM 8
#define TRT_AMAX_NUM 3
#define SCALE_RESERVE_NUM 21

// Hidden dimension assumed by the default layout, and the resulting number of
// per-output-channel kernel amax values (Part 2 == 9 * hidden_dim).
#define KERNEL_AMAX_HIDDEN_DIM 768
#define KERNEL_AMAX_NUM (9 * KERNEL_AMAX_HIDDEN_DIM)

/// Describes the layout of a flat float array holding int8 quantization scales.
///
/// The array is split into consecutive parts:
///
/// Part 1 -- 72 (ACTIVATION_AMAX_NUM):
///   Activation amaxs. Each activation amax occupies 4 values:
///   amax, amax/127.0f, amax/127.0f/127.0f, 127.0f/amax.
///     input_amax            0-3     Q_aftergemm_amax     4-7
///     Qbias_amax            8-11    K_aftergemm_amax    12-15
///     Kbias_amax           16-19    V_aftergemm_amax    20-23
///     Vbias_amax           24-27    bmm1_amax           28-31
///     Softmax_amax         32-35    bmm2_amax           36-39
///     Proj_aftergemm_scale 40-43    ProjBiasNorm_amax   44-47
///     FC1_aftergemm_amax   48-51    F1Bias_amax         52-55
///     FC2_aftergemm_amax   56-59    F2BiasNorm_amax     60-63
///     reserved             64-71
///
/// Part 2 -- 9 * hidden_dim (KERNEL_AMAX_NUM):
///   Kernel amaxs. Each kernel amax list holds one value per output channel:
///   query_weight_amax_list, key_weight_amax_list, value_weight_amax_list,
///   proj_weight_amax_list, FC1_weight_amax_list, FC2_weight_amax_list.
///   NOTE(review): six lists adding up to 9 * hidden_dim presumably means the
///   FC1 list has 4 * hidden_dim entries -- confirm against the producer.
///
/// Part 3 -- 8 (INT8O_GEMM_NUM):
///   Int8 gemm deQFactor list: Q_deQ_scale, K_deQ_scale, V_deQ_scale,
///   bmm1_deQ_scale, bmm2_deQ_scale, FC0_deQ_scale, FC1_deQ_scale, FC2_deQ_scale.
///
/// Part 4 -- 3 (TRT_AMAX_NUM):
///   Amaxs used in the trt fused mha kernel: QKVbias_amax, Softmax_amax, bmm2_amax.
///
/// Part 5 -- 21 (SCALE_RESERVE_NUM): reserved.
///
/// The defaults below assume hidden_dim == KERNEL_AMAX_HIDDEN_DIM (768); the
/// members are plain mutable fields, so callers can overwrite them.
struct ScaleList {
    // Pointers to the scale array; both start out null and are not owned here.
    // Presumably the device-side and host-side copies (d_ / h_ prefix) -- confirm.
    const float* d_scale_list_ = nullptr;
    const float* h_scale_list_ = nullptr;

    // Total element count of parts 1-4.
    // NOTE(review): the Part 5 reserve (SCALE_RESERVE_NUM) is not included in
    // this total -- confirm that is intended before relying on it.
    size_t size_ = ACTIVATION_AMAX_NUM + KERNEL_AMAX_NUM + INT8O_GEMM_NUM + TRT_AMAX_NUM;

    // Element offsets at which parts 2, 3 and 4 begin (part 1 begins at 0).
    size_t p2_offset_ = ACTIVATION_AMAX_NUM;
    size_t p3_offset_ = ACTIVATION_AMAX_NUM + KERNEL_AMAX_NUM;
    size_t p4_offset_ = ACTIVATION_AMAX_NUM + KERNEL_AMAX_NUM + INT8O_GEMM_NUM;
};

}  // namespace turbomind