Unverified Commit 3ee62235 authored by Yineng Zhang's avatar Yineng Zhang Committed by GitHub
Browse files

revert the MoE dependence (#3230)

parent 9829e77e
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/tllmException.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <cstdlib>
#if !defined(_MSC_VER)
#include <cxxabi.h>
#include <dlfcn.h>
#include <execinfo.h>
#endif
#include <sstream>
namespace tensorrt_llm::common
{
namespace
{
// printf field width for a pointer rendered with %*p: 2 chars for the "0x"
// prefix plus 2 hex digits per byte of the pointer (used by getTrace below).
int constexpr VOID_PTR_SZ = 2 + sizeof(void*) * 2;
} // namespace
#if !defined(_MSC_VER)
// Records the current call stack, then formats what() as
// "<msg> (<file>:<line>)\n<backtrace>".
TllmException::TllmException(char const* file, std::size_t line, std::string const& msg)
    : std::runtime_error{""}
{
    // Capture the raw frame addresses first; getTrace() below reads them.
    mNbFrames = backtrace(mCallstack.data(), MAX_FRAMES);
    auto const trace = getTrace();
    // std::runtime_error offers no way to set the message after construction,
    // so build the final string and copy-assign it into the base subobject.
    std::runtime_error::operator=(
        std::runtime_error{fmtstr("%s (%s:%zu)\n%s", msg.c_str(), file, line, trace.c_str())});
}
#else
// MSVC build: no backtrace support, so what() is just "<msg> (<file>:<line>)".
// Note: the base-class initializer must come before member initializers —
// bases are always constructed first, and listing mNbFrames ahead of the base
// (as before) triggers initialization-order warnings (MSVC C5038,
// -Wreorder-ctor) without changing the actual order.
TllmException::TllmException(char const* file, std::size_t line, std::string const& msg)
    : std::runtime_error{fmtstr("%s (%s:%zu)", msg.c_str(), file, line)}
    , mNbFrames{}
{
}
#endif
TllmException::~TllmException() noexcept = default;
// Returns the backtrace captured in the constructor as one string, one frame
// per line: "<idx> <addr> <symbol> + <offset>" when the address resolves via
// dladdr, otherwise the raw backtrace_symbols() text. Empty string on MSVC.
std::string TllmException::getTrace() const
{
#if defined(_MSC_VER)
    return "";
#else
    auto const trace = backtrace_symbols(mCallstack.data(), mNbFrames);
    if (trace == nullptr)
    {
        // backtrace_symbols mallocs its result and returns NULL on failure;
        // without this guard the fallback branch below would dereference it.
        return "";
    }
    std::ostringstream buf;
    // Frame 0 is this exception's own constructor — skip it.
    for (auto i = 1; i < mNbFrames; ++i)
    {
        Dl_info info;
        if (dladdr(mCallstack[i], &info) && info.dli_sname)
        {
            // Symbol resolved: print its demangled name plus the byte offset
            // of the return address from the symbol's start.
            auto const clearName = demangle(info.dli_sname);
            buf << fmtstr("%-3d %*p %s + %zd", i, VOID_PTR_SZ, mCallstack[i], clearName.c_str(),
                static_cast<char*>(mCallstack[i]) - static_cast<char*>(info.dli_saddr));
        }
        else
        {
            // No symbol info: fall back to the raw backtrace_symbols entry.
            buf << fmtstr("%-3d %*p %s", i, VOID_PTR_SZ, mCallstack[i], trace[i]);
        }
        if (i < mNbFrames - 1)
            buf << std::endl;
    }
    if (mNbFrames == MAX_FRAMES)
        buf << std::endl << "[truncated]";
    std::free(trace);
    return buf.str();
#endif
}
// Best-effort demangling of a mangled C++ symbol. On success the readable
// name is returned; on any failure (or on MSVC, which has no __cxa_demangle)
// the input is returned unchanged.
std::string TllmException::demangle(char const* name)
{
#if defined(_MSC_VER)
    return name;
#else
    int status = -1;
    char* const readable = abi::__cxa_demangle(name, nullptr, nullptr, &status);
    if (status != 0)
    {
        // Demangling failed; __cxa_demangle returns NULL in this case, so
        // there is nothing to free.
        return std::string{name};
    }
    std::string result{readable};
    // __cxa_demangle mallocs the buffer; release it after copying.
    std::free(readable);
    return result;
#endif
}
} // namespace tensorrt_llm::common
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <array>
#include <cstddef>
#include <stdexcept>
#include <string>
#define NEW_TLLM_EXCEPTION(...) \
tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__))
namespace tensorrt_llm::common
{
// Exception type thrown via NEW_TLLM_EXCEPTION. On non-MSVC builds the
// constructor captures a backtrace and appends it to what(); on MSVC the
// message is just "<msg> (<file>:<line>)".
class TllmException : public std::runtime_error
{
public:
// Maximum number of stack frames recorded at construction.
static auto constexpr MAX_FRAMES = 128;
// `file`/`line` identify the throw site; `msg` is the human-readable error.
explicit TllmException(char const* file, std::size_t line, std::string const& msg);
~TllmException() noexcept override;
// Formatted backtrace captured at construction ("" on MSVC).
[[nodiscard]] std::string getTrace() const;
// Best-effort demangling; returns `name` unchanged if demangling fails
// (or on MSVC).
static std::string demangle(char const* name);
private:
// Raw return addresses filled by backtrace() in the constructor.
std::array<void*, MAX_FRAMES> mCallstack{};
// Number of valid entries in mCallstack (zero-initialized on MSVC).
int mNbFrames;
};
} // namespace tensorrt_llm::common
/*
* Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstddef>
#include <cstdint>
namespace tensorrt_llm::common
{
// Default alignment (bytes) for carving per-op workspaces out of one
// CUDA allocation.
std::uintptr_t constexpr kCudaMemAlign = 128;

// Rounds `ptr` up to the next multiple of `to`; already-aligned pointers are
// returned unchanged. `to` must be non-zero — callers pass power-of-two
// alignments such as kCudaMemAlign, though the arithmetic is valid for any
// non-zero `to`. Uses named casts instead of the original C-style casts.
inline int8_t* alignPtr(int8_t* ptr, uintptr_t to)
{
    auto addr = reinterpret_cast<uintptr_t>(ptr);
    if (addr % to != 0)
    {
        addr += to - addr % to;
    }
    return reinterpret_cast<int8_t*>(addr);
}
// Rounds `size` up to the next multiple of `to`; sizes that are already
// multiples of `to` (including zero) come back unchanged.
constexpr size_t alignSize(size_t size, size_t to)
{
    size_t const remainder = size % to;
    return remainder == 0U ? size : size + (to - remainder);
}
// Advances `ptr` past a workspace of `previousWorkspaceSize` bytes and rounds
// the result up to `alignment`.
inline int8_t* nextWorkspacePtrCommon(int8_t* ptr, uintptr_t previousWorkspaceSize, uintptr_t const alignment)
{
    auto const unaligned = reinterpret_cast<uintptr_t>(ptr) + previousWorkspaceSize;
    return alignPtr(reinterpret_cast<int8_t*>(unaligned), alignment);
}
// Convenience overload of nextWorkspacePtrCommon using the default
// kCudaMemAlign alignment.
inline int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize)
{
    return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, kCudaMemAlign);
}
// Offset-based carving: returns a pointer to `size` bytes at `base + offset`
// and bumps `offset` to the next `alignment`-rounded position. A zero-size
// request leaves `offset` unchanged and yields nullptr.
inline int8_t* nextWorkspacePtr(
    int8_t* const base, uintptr_t& offset, uintptr_t const size, uintptr_t const alignment = kCudaMemAlign)
{
    uintptr_t const start = offset;
    // Round the reserved span up to a whole number of alignment units.
    offset = start + ((size + alignment - 1) / alignment) * alignment;
    return size == 0 ? nullptr : base + start;
}
// Same as the two-argument nextWorkspacePtr, but with a caller-chosen
// alignment (defaulting to kCudaMemAlign).
inline int8_t* nextWorkspacePtrWithAlignment(
    int8_t* ptr, uintptr_t previousWorkspaceSize, uintptr_t const alignment = kCudaMemAlign)
{
    return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, alignment);
}
// Total bytes required to pack `count` workspaces back to back, with each
// individual workspace padded up to a multiple of `alignment` so the pointer
// handed out for the next one stays aligned.
inline size_t calculateTotalWorkspaceSize(
    size_t const* workspaces, int count, uintptr_t const alignment = kCudaMemAlign)
{
    size_t total = 0;
    for (int i = 0; i < count; ++i)
    {
        size_t const ws = workspaces[i];
        size_t const remainder = ws % alignment;
        total += ws;
        if (remainder != 0)
        {
            // Pad up to the next alignment boundary.
            total += alignment - remainder;
        }
    }
    return total;
}
}; // namespace tensorrt_llm::common
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_runtime_api.h>
#include "cutlass/device_kernel.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm
{
namespace cutlass_extensions
{
// Computes the maximum number of active thread blocks per SM for GemmKernel
// via cudaOccupancyMaxActiveBlocksPerMultiprocessor, or returns 0 when the
// kernel's dynamic shared-memory requirement cannot fit on the current
// device (signalling the tuning heuristic to skip this configuration).
//
// enable_cutlass_3x selects the CUTLASS 3.x device entry point
// (cutlass::device_kernel) with a block size of 128 threads per warp group;
// otherwise the 2.x entry point (cutlass::Kernel) with
// GemmKernel::kThreadCount is used.
template <typename GemmKernel, bool enable_cutlass_3x = false>
inline int compute_occupancy_for_kernel()
{
    // Dynamic shared memory the kernel will request at launch.
    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
    if (smem_size > (48 << 10))
    {
        // Above the 48 KiB default carveout the kernel must opt in via
        // cudaFuncSetAttribute, so first check the device's opt-in limit.
        cudaFuncAttributes attr;
        int device = 0;
        int max_smem_per_block = 0;
        tensorrt_llm::common::check_cuda_error(cudaGetDevice(&device));
        tensorrt_llm::common::check_cuda_error(
            cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
        if constexpr (enable_cutlass_3x)
        {
            tensorrt_llm::common::check_cuda_error(cudaFuncGetAttributes(&attr, cutlass::device_kernel<GemmKernel>));
        }
        else
        {
            tensorrt_llm::common::check_cuda_error(cudaFuncGetAttributes(&attr, cutlass::Kernel<GemmKernel>));
        }
        // attr.sharedSizeBytes is the kernel's static shared-memory use;
        // static + dynamic must fit under the opt-in limit.
        if (smem_size + attr.sharedSizeBytes >= static_cast<size_t>(max_smem_per_block))
        {
            // This should mean that
            // cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)
            // wouldn't work. In that case, we return an occupancy of 0. This will cause the heuristic to ignore this
            // configuration.
            return 0;
        }
        // Opt the kernel in to the larger dynamic shared-memory carveout.
        if constexpr (enable_cutlass_3x)
        {
            tensorrt_llm::common::check_cuda_error(cudaFuncSetAttribute(
                cutlass::device_kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
        }
        else
        {
            tensorrt_llm::common::check_cuda_error(cudaFuncSetAttribute(
                cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
        }
    }
    int max_active_blocks = -1;
    if constexpr (enable_cutlass_3x)
    {
        // CUTLASS 3.x block size: 128 threads per warp group across the
        // load and MMA warp groups.
        tensorrt_llm::common::check_cuda_error(
            cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, cutlass::device_kernel<GemmKernel>,
                128 * (GemmKernel::NumLoadWarpGroups + GemmKernel::NumMmaWarpGroups), smem_size));
    }
    else
    {
        tensorrt_llm::common::check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &max_active_blocks, cutlass::Kernel<GemmKernel>, GemmKernel::kThreadCount, smem_size));
    }
    return max_active_blocks;
}
} // namespace cutlass_extensions
} // namespace tensorrt_llm
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment