Unverified Commit 4c9959f6, authored by Chen Xin and committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
name: linux-x64-gpu
on:
  push:
    paths:
      - '.github/workflows/linux-x64-gpu.yml'
      - 'src/**'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'examples/**'
      - '3rdparty/**'
      - 'tests/csrc/**'
  pull_request:
    paths:
      - '.github/workflows/linux-x64-gpu.yml'
      - 'src/**'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'examples/**'
      - '3rdparty/**'
      - 'tests/csrc/**'
concurrency:
  group: linux-x64-gpu-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read
jobs:
  cuda-118:
    runs-on: ubuntu-latest
    container: openmmlab/lmdeploy-builder:cuda11.8
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Build
        run: |
          source /opt/conda/bin/activate
          conda activate py38
          mkdir build && cd build
          bash ../generate.sh
          make -j$(nproc) && make install
@@ -10,7 +10,7 @@ on:
 jobs:
-  build:
+  linux-build:
     strategy:
       matrix:
         pyver: [py38, py39, py310, py311]
@@ -39,11 +39,49 @@ jobs:
           path: builder/manywheel/${{ env.OUTPUT_FOLDER }}/*
           retention-days: 1
+  windows-build:
+    strategy:
+      matrix:
+        pyver: ['3.8', '3.9', '3.10', '3.11']
+    runs-on: windows-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.pyver }}
+      - name: Install python packages
+        run: |
+          pip install pybind11 wheel
+      - uses: Jimver/cuda-toolkit@v0.2.11
+        id: cuda-toolkit
+        with:
+          cuda: '11.8.0'
+          use-github-cache: false
+      - name: Build wheel
+        run: |
+          mkdir build
+          cd build
+          ..\builder\windows\generate.ps1
+          cmake --build . --config Release -- /m > build.log.txt
+          cmake --install . --config Release
+          cd ..
+          rm build -Force -Recurse
+          python setup.py bdist_wheel -d build/wheel
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          if-no-files-found: error
+          path: build/wheel/*
+          retention-days: 1
   publish:
     runs-on: ubuntu-latest
     environment: 'prod'
     needs:
-      - build
+      - linux-build
+      - windows-build
     steps:
       - name: Download artifacts
         uses: actions/download-artifact@v3
......
name: windows-x64-gpu
on:
  push:
    paths:
      - '.github/workflows/windows-x64-gpu.yml'
      - 'src/**'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'examples/**'
      - '3rdparty/**'
      - 'tests/csrc/**'
  pull_request:
    paths:
      - '.github/workflows/windows-x64-gpu.yml'
      - 'src/**'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'examples/**'
      - '3rdparty/**'
      - 'tests/csrc/**'
concurrency:
  group: windows-x64-gpu-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read
jobs:
  cuda-118:
    runs-on: windows-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Set up python
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: Install python packages
        run: |
          pip install pybind11 wheel
      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
        with:
          cuda: '11.8.0'
          use-github-cache: false
      - name: Build wheel
        run: |
          ((Get-Content -path CMakeLists.txt -Raw) -replace '-Wall','/W0') | Set-Content CMakeLists.txt
          $env:BUILD_TEST="ON"
          mkdir build
          cd build
          ..\builder\windows\generate.ps1
          cmake --build . --config Release -- /m /v:q
          if (-Not $?) {
            echo "build failed"
            exit 1
          }
          cmake --install . --config Release
          cd ..
          rm build -Force -Recurse
          python setup.py bdist_wheel -d build/wheel
@@ -168,6 +168,7 @@ if(BUILD_PYT)
   set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
 endif()
+set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0")
 # set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage")
@@ -288,6 +289,15 @@ if(BUILD_TEST)
 add_subdirectory(tests/csrc)
 endif()
+
+# install python api
+if (BUILD_PY_FFI)
+  install(TARGETS _turbomind DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
+endif ()
+
+if (MSVC)
+  return()
+endif ()
 # # Mesaure the compile time
 option(MEASURE_BUILD_TIME "Measure the build time of each module" OFF)
 if (MEASURE_BUILD_TIME)
@@ -387,13 +397,6 @@ install(
   ${INSTALL_CONFIGDIR}
 )
-# install python api
-if (BUILD_PY_FFI)
-  install(TARGETS _turbomind DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
-endif ()
-install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
 export(
   EXPORT
   transformer-shared-targets
......
@@ -13,6 +13,7 @@ ______________________________________________________________________
 ## News 🎉
+- \[2023/08\] TurboMind supports Windows (tp=1)
 - \[2023/08\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation🚀. Check [this](./docs/en/w4a16.md) guide for detailed info
 - \[2023/08\] LMDeploy has launched on the [HuggingFace Hub](https://huggingface.co/lmdeploy), providing ready-to-use 4-bit models.
 - \[2023/08\] LMDeploy supports 4-bit quantization using the [AWQ](https://arxiv.org/abs/2306.00978) algorithm.
......
@@ -13,6 +13,7 @@ ______________________________________________________________________
 ## 更新 🎉
+- \[2023/08\] TurboMind 支持 Windows (tp=1)
 - \[2023/08\] TurboMind 支持 4-bit 推理,速度是 FP16 的 2.4 倍,是目前最快的开源实现🚀。部署方式请看[这里](./docs/zh_cn/w4a16.md)
 - \[2023/08\] LMDeploy 开通了 [HuggingFace Hub](https://huggingface.co/lmdeploy) ,提供开箱即用的 4-bit 模型
 - \[2023/08\] LMDeploy 支持使用 [AWQ](https://arxiv.org/abs/2306.00978) 算法进行 4-bit 量化
......
@@ -6,6 +6,8 @@ PLAT_NAME="$2"
 DOCKER_TAG="$3"
 OUTPUT_DIR="$4"
+GIT_REMOTE=${GIT_REMOTE:-https://github.com/InternLM/lmdeploy}
+GIT_BRANCH=${GIT_BRANCH:-main}
 DOCKER_IMAGE="openmmlab/lmdeploy-builder:${DOCKER_TAG}"
 export USERID=$(id -u)
 export GROUPID=$(id -g)
@@ -18,6 +20,8 @@ docker run --rm -it \
     --env PLAT_NAME="${PLAT_NAME}" \
     --env USERID="${USERID}" \
     --env GROUPID="${GROUPID}" \
+    --env GIT_BRANCH="${GIT_BRANCH}" \
+    --env GIT_REMOTE="${GIT_REMOTE}" \
     --volume "$(pwd)/${OUTPUT_DIR}:/lmdeploy_build" \
     --volume "$(pwd)/entrypoint_build.sh:/entrypoint_build.sh" \
     --entrypoint /entrypoint_build.sh \
......
@@ -7,14 +7,20 @@ export USERID=${USERID}
 export GROUPID=${GROUPID}
 export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p')
+export GIT_REMOTE=${GIT_REMOTE:-https://github.com/InternLM/lmdeploy}
+export GIT_BRANCH=${GIT_BRANCH:-main}
 source /opt/conda/bin/activate
 conda activate $PYTHON_VERSION
-git clone https://github.com/InternLM/lmdeploy
+git clone -b ${GIT_BRANCH} ${GIT_REMOTE}
 cd lmdeploy
 mkdir build && cd build
 bash ../generate.sh
 make -j$(nproc) && make install
+if [ $? != 0 ]; then
+    echo "build failed"
+    exit 1
+fi
 cd ..
 rm -rf build
 python setup.py bdist_wheel --cuda=${CUDAVER} --plat-name $PLAT_NAME -d /tmpbuild/
......
# Build lmdeploy on Windows
## Requirements
- [CMake 3.17+](https://github.com/Kitware/CMake/releases)
- [Visual Studio 2019+](https://visualstudio.microsoft.com/downloads/)
- [CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive)
## Build lmdeploy wheel
```powershell
mkdir build
cd build
..\builder\windows\generate.ps1
cmake --build . --config Release -- /m
cmake --install . --config Release
cd ..
rm build -Force -Recurse
python setup.py bdist_wheel -d build\wheel
```
cmake .. -A x64 -T v142,cuda="$env:CUDA_PATH" `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_INSTALL_PREFIX=install `
-DBUILD_PY_FFI=ON `
-DBUILD_MULTI_GPU=OFF `
-DCMAKE_CUDA_FLAGS="-lineinfo" `
-DUSE_NVTX=ON `
-DBUILD_TEST="$env:BUILD_TEST"
@@ -3,6 +3,6 @@
 add_executable(llama_triton_example llama_triton_example.cc)
 target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart
                       LlamaTritonBackend TransformerTritonBackend mpi_utils nccl_utils
-                      nvtx_utils word_list)
+                      nvtx_utils word_list -lpthread)
 install(TARGETS llama_triton_example DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
@@ -15,12 +15,15 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/examples/cpp/multi_gpu_gpt/multi_gpu_gpt_triton_example.cc
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/examples/cpp/multi_gpu_gpt/multi_gpu_gpt_triton_example.cc
 #include "3rdparty/INIReader.h"
+#include <chrono>
 #include <memory>
 #include <thread>
+#include "src/turbomind/macro.h"
 #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
 #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
 #include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
@@ -427,6 +430,7 @@ int main(int argc, char* argv[])
     const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0];
     const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1];
     const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2];
+
     std::vector<int> seq_lens(batch_size);
     // step 6: check results
     if (node_id == 0) {
@@ -473,8 +477,7 @@
     if (1) {
         // test time
-        struct timeval start, end;
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         const int ite = 1;
         for (int i = 0; i < ite; i++) {
@@ -497,14 +500,15 @@
             ft::mpi::barrier();
         }
-        gettimeofday(&end, NULL);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
         printf("[INFO] batch_size %d beam_width %d seq_len %d"
                " FT-CPP-GPT-Triton-time %.2f ms\n",
                batch_size,
                beam_width,
                seq_lens[0],
-               ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite);
+               dur.count() / ite);
     }
     if (kUSE_MPI) {
@@ -551,7 +555,6 @@ int read_start_ids(size_t batch_size,
         return 0;
     }
-    // Add padding
     for (int i = 0; i < (int)tmp_start_ids.size(); i++) {
         for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) {
......
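The timing change above (in `llama_triton_example.cc`) replaces `gettimeofday`, which comes from POSIX `<sys/time.h>` and is not available with MSVC, with `std::chrono`, which works on every supported platform. A stripped-down sketch of the resulting pattern, with the actual model invocation elided:

```cpp
#include <chrono>
#include <cstdio>

int main()
{
    const int ite = 1;

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < ite; i++) {
        // forward pass elided
    }
    auto end = std::chrono::high_resolution_clock::now();

    // duration<float, std::milli> converts the elapsed ticks to milliseconds.
    auto dur = std::chrono::duration<float, std::milli>(end - start);
    std::printf("elapsed %.2f ms per iteration\n", dur.count() / ite);
    return 0;
}
```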
@@ -15,6 +15,7 @@
  */
 #include "src/turbomind/kernels/activation_kernels.h"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/cuda_type_utils.cuh"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
......
@@ -101,7 +101,7 @@ void invokeBanBadWords(T* logits,
                        cudaStream_t stream)
 {
     dim3 block, grid;
-    block.x = min(((bad_words_len + 32 - 1) / 32) * 32, 256UL);
+    block.x = min((unsigned long)((bad_words_len + 32 - 1) / 32) * 32, 256UL);
     grid.x = (bad_words_len + block.x - 1) / block.x;
     grid.y = local_batch_size * beam_width;
......
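The added cast above is an LLP64 portability fix: assuming `bad_words_len` is a `size_t`, the original `min(..., 256UL)` compiles on Linux, where `size_t` and `unsigned long` are the same 64-bit type, but fails with MSVC, where `size_t` is 64-bit while `unsigned long` is only 32-bit, so `min` no longer sees two operands of a single type. A minimal sketch of the failure and the fix (illustrative function, not the kernel itself):

```cpp
#include <algorithm>
#include <cstddef>

// Round the block dimension up to a multiple of 32, capped at 256 threads.
unsigned int roundBlockDim(std::size_t bad_words_len)
{
    // Fails to compile with MSVC: std::min cannot deduce one type from
    // (unsigned long long, unsigned long):
    //     std::min(((bad_words_len + 32 - 1) / 32) * 32, 256UL);

    // Portable spelling used by the patch: cast so both operands are unsigned long.
    return (unsigned int)std::min((unsigned long)((bad_words_len + 32 - 1) / 32) * 32, 256UL);
}
```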
@@ -63,7 +63,11 @@ void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
     h_pinned_token_num[0] = 0;
     getPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>(
         h_pinned_token_num, tmp_mask_offset, cu_seqlens, sequence_lengths, batch_size, max_seq_len);
+#ifdef _MSC_VER
+    cudaStreamSynchronize(stream);
+#else
     while (((volatile size_t*)h_pinned_token_num)[0] == 0) {};
+#endif
     h_token_num[0] = h_pinned_token_num[0];
     sync_check_cuda_error();
 }
......
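The `#ifdef _MSC_VER` branch above matches the "use cudaStreamSynchronize instead of volatile check" item in the commit message: on Windows the host no longer spins on the pinned allocation but simply waits for the stream. A self-contained sketch of the two strategies (toy kernel and names, not the repository code):

```cpp
#include <cuda_runtime.h>

// Toy stand-in for getPaddingOffsetAndCuSeqLensKernel: writes a nonzero
// result into page-locked (pinned) host memory.
__global__ void writeTokenNum(size_t* h_pinned_token_num)
{
    *h_pinned_token_num = 1;
}

void waitForTokenNum(size_t* h_pinned_token_num, cudaStream_t stream)
{
    h_pinned_token_num[0] = 0;
    writeTokenNum<<<1, 1, 0, stream>>>(h_pinned_token_num);
#ifdef _MSC_VER
    // Block the host until all work queued on the stream has completed.
    cudaStreamSynchronize(stream);
#else
    // Busy-wait until the kernel's write to pinned memory becomes visible,
    // without synchronizing the whole stream.
    while (((volatile size_t*)h_pinned_token_num)[0] == 0) {}
#endif
    // Either way, h_pinned_token_num[0] is now safe to read on the host.
}
```

The commit does not spell out why the busy-wait misbehaves under MSVC; the synchronize call is simply the more conservative choice there.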
@@ -17,6 +17,7 @@
 #include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
 #include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
+#include "src/turbomind/macro.h"
 // #include "src/turbomind/utils/cuda_bf16_wrapper.h"
 // #include "src/turbomind/utils/cuda_fp8_utils.h"
 #include "src/turbomind/utils/cuda_type_utils.cuh"
@@ -1469,7 +1470,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
         int offset = bhi * params.memory_max_len * Dh + co * params.memory_max_len * QK_ELTS_IN_16B
                      + tlength_circ * QK_ELTS_IN_16B + ci;
-        if (not QUANT_POLICY) {
+        if (!QUANT_POLICY) {
             *reinterpret_cast<Qk_vec_m*>(&params.k_cache[offset]) = vec_conversion<Qk_vec_m, Qk_vec_k>(k);
         }
         else if (QUANT_POLICY == 4) {
@@ -1491,7 +1492,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
                      + tlength_circ * Dh + co * QK_ELTS_IN_16B + ci;
         }
-        if (not QUANT_POLICY) {
+        if (!QUANT_POLICY) {
             *reinterpret_cast<Qk_vec_m*>(&params.k_cache_per_sample[bi][offset]) =
                 vec_conversion<Qk_vec_m, Qk_vec_k>(k);
         }
@@ -1582,7 +1583,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
     T* k_cache_batch = nullptr;
     int8_t* k_cache_batch_int8 = nullptr;
-    if (not QUANT_POLICY) {
+    if (!QUANT_POLICY) {
         k_cache_batch = params.k_cache_per_sample ? (params.k_cache_per_sample[bi] + params.kv_cache_per_sample_offset
                                                      + kvhi * params.memory_max_len * Dh + ki) :
                                                     &params.k_cache[bhi * params.memory_max_len * Dh + ki];
@@ -1635,7 +1636,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
                 beam_offset = beam_indices[ti_circ] * params.num_heads * params.memory_max_len * Dh;
             }
-            if (not QUANT_POLICY) {
+            if (!QUANT_POLICY) {
                 k[ii] = vec_conversion<K_vec_k, K_vec_m>(
                     (*reinterpret_cast<const K_vec_m*>(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B])));
             }
@@ -1769,7 +1770,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
     int8_t* v_cache_int8 = nullptr;
     int8_t* v_cache_batch_int8 = nullptr;
-    if (not QUANT_POLICY) {
+    if (!QUANT_POLICY) {
         v_cache = params.v_cache_per_sample ? (params.v_cache_per_sample[bi] + params.kv_cache_per_sample_offset
                                                + kvhi * params.memory_max_len * Dh + vi) :
@@ -1824,7 +1825,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
             // Load the values from the cache.
             V_vec_k v;
-            if (not QUANT_POLICY) {
+            if (!QUANT_POLICY) {
                 v = vec_conversion<V_vec_k, V_vec_m>(
                     *reinterpret_cast<const V_vec_m*>(&v_cache_batch[beam_offset + ti * Dh]));
             }
@@ -1871,7 +1872,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
         const int beam_offset = HAS_BEAMS ? beam_src * params.num_heads * params.memory_max_len * Dh : 0;
         // Load the values from the cache.
         V_vec_k v;
-        if (not QUANT_POLICY) {
+        if (!QUANT_POLICY) {
             v = vec_conversion<V_vec_k, V_vec_m>(
                 *reinterpret_cast<const V_vec_m*>(&v_cache_batch[beam_offset + ti_circ * Dh]));
         }
@@ -1931,7 +1932,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
         // Store the values with bias back to global memory in the cache for V.
         //*reinterpret_cast<V_vec_k*>(&v_cache[params.timestep*Dh]) = v;
-        if (not QUANT_POLICY) {
+        if (!QUANT_POLICY) {
             *reinterpret_cast<V_vec_m*>(&v_cache[tlength_circ * Dh]) = vec_conversion<V_vec_m, V_vec_k>(v);
         }
         else if (QUANT_POLICY == 4) {
......
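Every `not QUANT_POLICY` → `!QUANT_POLICY` edit above addresses the same portability issue: MSVC only accepts the alternative operator tokens (`not`, `and`, `or`) in conformance mode (`/permissive-`) or via `<iso646.h>`, so the kernels switch to the plain punctuators that every compiler accepts. A small illustration (not repository code):

```cpp
#include <cstdio>

int main()
{
    int quant_policy = 0;

    // Rejected by MSVC under its default, non-conformant settings:
    //     if (not quant_policy) { ... }

    // Accepted everywhere:
    if (!quant_policy) {
        std::printf("no kv-cache quantization\n");
    }
    return 0;
}
```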
@@ -2,6 +2,7 @@
 #pragma once
+#include "src/turbomind/macro.h"
 #include <cassert>
 #include <cstdint>
 #include <cuda_fp16.h>
......
@@ -3,6 +3,7 @@
 #pragma once
 #include "metric.h"
+#include "src/turbomind/macro.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
......
@@ -28,6 +28,7 @@
 #include "src/turbomind/kernels/logprob_kernels.h"
 #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/macro.h"
 #include "src/turbomind/utils/logger.h"
 namespace turbomind {
......
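The diff adds `#include "src/turbomind/macro.h"` to many translation units but never shows the header itself. Given the `__PRETTY_FUNCTION__` bullet in the commit message, it is presumably a small portability shim; a purely hypothetical sketch of what such a header might contain (the real contents are not part of this diff):

```cpp
// macro.h -- hypothetical portability shim, for illustration only.
#pragma once

#ifdef _MSC_VER
// MSVC does not provide __PRETTY_FUNCTION__; __FUNCSIG__ carries the
// equivalent decorated signature.
#define __PRETTY_FUNCTION__ __FUNCSIG__
#endif
```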
@@ -63,7 +63,7 @@ __device__ inline void copy(const void* local, void* data)
     *out = *in;
 }
-static const float HALF_FLT_MAX = 65504.F;
+#define HALF_FLT_MAX 65504.F
 #define FINAL_MASK 0xffffffff
 template<typename T>
......
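The `HALF_FLT_MAX` change swaps a namespace-scope `static const float` for a preprocessor macro. This matters for device code built with cl.exe as the host compiler: nvcc's exception that lets device code read const host variables covers floating-point types only for non-MSVC hosts, while a macro is expanded textually before any such check applies. A sketch of the pattern (illustrative kernel, assuming the constant is used inside `__global__`/`__device__` functions as in the reduction utilities):

```cpp
#include <cuda_runtime.h>

// static const float HALF_FLT_MAX = 65504.F;  // not usable in device code on MSVC hosts
#define HALF_FLT_MAX 65504.F                    // substituted by the preprocessor, works everywhere

__global__ void clampToHalfMax(float* x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && x[i] > HALF_FLT_MAX) {
        x[i] = HALF_FLT_MAX;
    }
}
```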