Unverified Commit 3ee62235 authored by Yineng Zhang's avatar Yineng Zhang Committed by GitHub
Browse files

revert the MoE dependence (#3230)

parent 9829e77e
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/tllmException.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <cstdlib>
#if !defined(_MSC_VER)
#include <cxxabi.h>
#include <dlfcn.h>
#include <execinfo.h>
#endif
#include <sstream>
namespace tensorrt_llm::common
{
namespace
{
// printf field width for a pointer rendered with %*p: 2 chars for the "0x"
// prefix plus 2 hex digits per byte of the pointer (used by getTrace below).
int constexpr VOID_PTR_SZ = 2 + sizeof(void*) * 2;
} // namespace
#if !defined(_MSC_VER)
// Records the current call stack, then formats what() as
// "<msg> (<file>:<line>)\n<backtrace>".
TllmException::TllmException(char const* file, std::size_t line, std::string const& msg)
    : std::runtime_error{""}
{
    // Capture the raw frame addresses first; getTrace() below reads them.
    mNbFrames = backtrace(mCallstack.data(), MAX_FRAMES);
    auto const trace = getTrace();
    // std::runtime_error offers no way to set the message after construction,
    // so build the final string and copy-assign it into the base subobject.
    std::runtime_error::operator=(
        std::runtime_error{fmtstr("%s (%s:%zu)\n%s", msg.c_str(), file, line, trace.c_str())});
}
#else
// MSVC build: no backtrace support, so what() is just "<msg> (<file>:<line>)".
// Note: the base-class initializer must come before member initializers —
// bases are always constructed first, and listing mNbFrames ahead of the base
// (as before) triggers initialization-order warnings (MSVC C5038,
// -Wreorder-ctor) without changing the actual order.
TllmException::TllmException(char const* file, std::size_t line, std::string const& msg)
    : std::runtime_error{fmtstr("%s (%s:%zu)", msg.c_str(), file, line)}
    , mNbFrames{}
{
}
#endif
TllmException::~TllmException() noexcept = default;
// Returns the backtrace captured in the constructor as one string, one frame
// per line: "<idx> <addr> <symbol> + <offset>" when the address resolves via
// dladdr, otherwise the raw backtrace_symbols() text. Empty string on MSVC.
std::string TllmException::getTrace() const
{
#if defined(_MSC_VER)
    return "";
#else
    auto const trace = backtrace_symbols(mCallstack.data(), mNbFrames);
    if (trace == nullptr)
    {
        // backtrace_symbols mallocs its result and returns NULL on failure;
        // without this guard the fallback branch below would dereference it.
        return "";
    }
    std::ostringstream buf;
    // Frame 0 is this exception's own constructor — skip it.
    for (auto i = 1; i < mNbFrames; ++i)
    {
        Dl_info info;
        if (dladdr(mCallstack[i], &info) && info.dli_sname)
        {
            // Symbol resolved: print its demangled name plus the byte offset
            // of the return address from the symbol's start.
            auto const clearName = demangle(info.dli_sname);
            buf << fmtstr("%-3d %*p %s + %zd", i, VOID_PTR_SZ, mCallstack[i], clearName.c_str(),
                static_cast<char*>(mCallstack[i]) - static_cast<char*>(info.dli_saddr));
        }
        else
        {
            // No symbol info: fall back to the raw backtrace_symbols entry.
            buf << fmtstr("%-3d %*p %s", i, VOID_PTR_SZ, mCallstack[i], trace[i]);
        }
        if (i < mNbFrames - 1)
            buf << std::endl;
    }
    if (mNbFrames == MAX_FRAMES)
        buf << std::endl << "[truncated]";
    std::free(trace);
    return buf.str();
#endif
}
// Best-effort demangling of a mangled C++ symbol. On success the readable
// name is returned; on any failure (or on MSVC, which has no __cxa_demangle)
// the input is returned unchanged.
std::string TllmException::demangle(char const* name)
{
#if defined(_MSC_VER)
    return name;
#else
    int status = -1;
    char* const readable = abi::__cxa_demangle(name, nullptr, nullptr, &status);
    if (status != 0)
    {
        // Demangling failed; __cxa_demangle returns NULL in this case, so
        // there is nothing to free.
        return std::string{name};
    }
    std::string result{readable};
    // __cxa_demangle mallocs the buffer; release it after copying.
    std::free(readable);
    return result;
#endif
}
} // namespace tensorrt_llm::common
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <array>
#include <cstddef>
#include <stdexcept>
#include <string>
#define NEW_TLLM_EXCEPTION(...) \
tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__))
namespace tensorrt_llm::common
{
// Exception type thrown via NEW_TLLM_EXCEPTION. On non-MSVC builds the
// constructor captures a backtrace and appends it to what(); on MSVC the
// message is just "<msg> (<file>:<line>)".
class TllmException : public std::runtime_error
{
public:
// Maximum number of stack frames recorded at construction.
static auto constexpr MAX_FRAMES = 128;
// `file`/`line` identify the throw site; `msg` is the human-readable error.
explicit TllmException(char const* file, std::size_t line, std::string const& msg);
~TllmException() noexcept override;
// Formatted backtrace captured at construction ("" on MSVC).
[[nodiscard]] std::string getTrace() const;
// Best-effort demangling; returns `name` unchanged if demangling fails
// (or on MSVC).
static std::string demangle(char const* name);
private:
// Raw return addresses filled by backtrace() in the constructor.
std::array<void*, MAX_FRAMES> mCallstack{};
// Number of valid entries in mCallstack (zero-initialized on MSVC).
int mNbFrames;
};
} // namespace tensorrt_llm::common
/*
* Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstddef>
#include <cstdint>
namespace tensorrt_llm::common
{
// Default alignment (bytes) for carving per-op workspaces out of one
// CUDA allocation.
std::uintptr_t constexpr kCudaMemAlign = 128;

// Rounds `ptr` up to the next multiple of `to`; already-aligned pointers are
// returned unchanged. `to` must be non-zero — callers pass power-of-two
// alignments such as kCudaMemAlign, though the arithmetic is valid for any
// non-zero `to`. Uses named casts instead of the original C-style casts.
inline int8_t* alignPtr(int8_t* ptr, uintptr_t to)
{
    auto addr = reinterpret_cast<uintptr_t>(ptr);
    if (addr % to != 0)
    {
        addr += to - addr % to;
    }
    return reinterpret_cast<int8_t*>(addr);
}
// Rounds `size` up to the next multiple of `to`; sizes that are already
// multiples of `to` (including zero) come back unchanged.
constexpr size_t alignSize(size_t size, size_t to)
{
    size_t const remainder = size % to;
    return remainder == 0U ? size : size + (to - remainder);
}
// Advances `ptr` past a workspace of `previousWorkspaceSize` bytes and rounds
// the result up to `alignment`.
inline int8_t* nextWorkspacePtrCommon(int8_t* ptr, uintptr_t previousWorkspaceSize, uintptr_t const alignment)
{
    auto const unaligned = reinterpret_cast<uintptr_t>(ptr) + previousWorkspaceSize;
    return alignPtr(reinterpret_cast<int8_t*>(unaligned), alignment);
}
// Convenience overload of nextWorkspacePtrCommon using the default
// kCudaMemAlign alignment.
inline int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize)
{
    return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, kCudaMemAlign);
}
// Offset-based carving: returns a pointer to `size` bytes at `base + offset`
// and bumps `offset` to the next `alignment`-rounded position. A zero-size
// request leaves `offset` unchanged and yields nullptr.
inline int8_t* nextWorkspacePtr(
    int8_t* const base, uintptr_t& offset, uintptr_t const size, uintptr_t const alignment = kCudaMemAlign)
{
    uintptr_t const start = offset;
    // Round the reserved span up to a whole number of alignment units.
    offset = start + ((size + alignment - 1) / alignment) * alignment;
    return size == 0 ? nullptr : base + start;
}
// Same as the two-argument nextWorkspacePtr, but with a caller-chosen
// alignment (defaulting to kCudaMemAlign).
inline int8_t* nextWorkspacePtrWithAlignment(
    int8_t* ptr, uintptr_t previousWorkspaceSize, uintptr_t const alignment = kCudaMemAlign)
{
    return nextWorkspacePtrCommon(ptr, previousWorkspaceSize, alignment);
}
// Total bytes required to pack `count` workspaces back to back, with each
// individual workspace padded up to a multiple of `alignment` so the pointer
// handed out for the next one stays aligned.
inline size_t calculateTotalWorkspaceSize(
    size_t const* workspaces, int count, uintptr_t const alignment = kCudaMemAlign)
{
    size_t total = 0;
    for (int i = 0; i < count; ++i)
    {
        size_t const ws = workspaces[i];
        size_t const remainder = ws % alignment;
        total += ws;
        if (remainder != 0)
        {
            // Pad up to the next alignment boundary.
            total += alignment - remainder;
        }
    }
    return total;
}
}; // namespace tensorrt_llm::common
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_runtime_api.h>
#include "cutlass/device_kernel.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm
{
namespace cutlass_extensions
{
// Computes the maximum number of active thread blocks per SM for GemmKernel
// via cudaOccupancyMaxActiveBlocksPerMultiprocessor, or returns 0 when the
// kernel's dynamic shared-memory requirement cannot fit on the current
// device (signalling the tuning heuristic to skip this configuration).
//
// enable_cutlass_3x selects the CUTLASS 3.x device entry point
// (cutlass::device_kernel) with a block size of 128 threads per warp group;
// otherwise the 2.x entry point (cutlass::Kernel) with
// GemmKernel::kThreadCount is used.
template <typename GemmKernel, bool enable_cutlass_3x = false>
inline int compute_occupancy_for_kernel()
{
    // Dynamic shared memory the kernel will request at launch.
    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
    if (smem_size > (48 << 10))
    {
        // Above the 48 KiB default carveout the kernel must opt in via
        // cudaFuncSetAttribute, so first check the device's opt-in limit.
        cudaFuncAttributes attr;
        int device = 0;
        int max_smem_per_block = 0;
        tensorrt_llm::common::check_cuda_error(cudaGetDevice(&device));
        tensorrt_llm::common::check_cuda_error(
            cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
        if constexpr (enable_cutlass_3x)
        {
            tensorrt_llm::common::check_cuda_error(cudaFuncGetAttributes(&attr, cutlass::device_kernel<GemmKernel>));
        }
        else
        {
            tensorrt_llm::common::check_cuda_error(cudaFuncGetAttributes(&attr, cutlass::Kernel<GemmKernel>));
        }
        // attr.sharedSizeBytes is the kernel's static shared-memory use;
        // static + dynamic must fit under the opt-in limit.
        if (smem_size + attr.sharedSizeBytes >= static_cast<size_t>(max_smem_per_block))
        {
            // This should mean that
            // cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)
            // wouldn't work. In that case, we return an occupancy of 0. This will cause the heuristic to ignore this
            // configuration.
            return 0;
        }
        // Opt the kernel in to the larger dynamic shared-memory carveout.
        if constexpr (enable_cutlass_3x)
        {
            tensorrt_llm::common::check_cuda_error(cudaFuncSetAttribute(
                cutlass::device_kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
        }
        else
        {
            tensorrt_llm::common::check_cuda_error(cudaFuncSetAttribute(
                cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
        }
    }
    int max_active_blocks = -1;
    if constexpr (enable_cutlass_3x)
    {
        // CUTLASS 3.x block size: 128 threads per warp group across the
        // load and MMA warp groups.
        tensorrt_llm::common::check_cuda_error(
            cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, cutlass::device_kernel<GemmKernel>,
                128 * (GemmKernel::NumLoadWarpGroups + GemmKernel::NumMmaWarpGroups), smem_size));
    }
    else
    {
        tensorrt_llm::common::check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &max_active_blocks, cutlass::Kernel<GemmKernel>, GemmKernel::kThreadCount, smem_size));
    }
    return max_active_blocks;
}
} // namespace cutlass_extensions
} // namespace tensorrt_llm
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment