Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def linear_w8a8i8(
    input: Tensor,
    weight_packed: Tensor,
    weight_scale: Tensor,
    bias=None,
    out=None,
) -> Tensor:
    r"""Linear layer with weight quantized to int8 and input quantized to int8, using a per-tensor scale."""
    if out is None:
        return Tensor(
            _infinicore.linear_w8a8i8(
                input._underlying,
                weight_packed._underlying,
                weight_scale._underlying,
                None if bias is None else bias._underlying,
            )
        )
    _infinicore.linear_w8a8i8_(
        out._underlying,
        input._underlying,
        weight_packed._underlying,
        weight_scale._underlying,
        None if bias is None else bias._underlying,
    )
    return out
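For context, a usage sketch of the out-of-place and in-place conventions shared by these wrappers; `x`, `w_packed`, `w_scale`, and `bias` are hypothetical tensors created elsewhere with infinicore's own constructors:

```python
# Hypothetical tensors; only the calling convention is taken from the wrapper.
y = linear_w8a8i8(x, w_packed, w_scale, bias=bias)      # allocates a new output
linear_w8a8i8(x, w_packed, w_scale, bias=bias, out=y)   # reuses a pre-allocated buffer
```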
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def silu_and_mul(input: Tensor, out=None) -> Tensor:
    r"""Apply the SiLU-and-Mul (SwiGLU) function.

    Formula: output = SiLU(input_gate) * input_up
    Input shape: [..., 2*d], output shape: [..., d]
    """
    if out is None:
        return Tensor(_infinicore.silu_and_mul(input._underlying))
    _infinicore.silu_and_mul_(out._underlying, input._underlying)
    return out
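Since the op halves the last dimension, a hidden state of shape [..., 2*d] yields [..., d]. A sketch assuming a hypothetical tensor `h` holding a fused gate/up projection output:

```python
# `h` is assumed to have shape [batch, 2*d] (gate and up halves packed together).
act = silu_and_mul(h)        # shape [batch, d]
silu_and_mul(h, out=act)     # in-place variant, writing into `act`
```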
import infinicore.tensor as tensor
from infinicore.lib import _infinicore


def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None, residual=None):
    """Fused Add and RMS Normalization.

    Args:
        a: First input tensor.
        b: Second input tensor.
        weight: Scale weights.
        epsilon: Small constant for numerical stability; default is 1e-5.
        out: Optional pre-allocated tensor for the normalized result.
        residual: Optional pre-allocated tensor for the sum a + b.

    Returns:
        Tuple of (normalized_result, add_result): (RMSNorm(a + b) * weight, a + b).
        The add_result can be used as the residual for subsequent layers.
    """
    if out is None:
        out = tensor.empty(a.shape, dtype=a.dtype, device=a.device)
    if residual is None:
        residual = tensor.empty(b.shape, dtype=b.dtype, device=b.device)
    _infinicore.add_rms_norm_(
        out._underlying,
        residual._underlying,
        a._underlying,
        b._underlying,
        weight._underlying,
        epsilon,
    )
    return out, residual
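A sketch of how the two return values typically thread through a transformer block; `hidden`, `attn_out`, and `norm_weight` are hypothetical tensors:

```python
# Computes RMSNorm(hidden + attn_out) * norm_weight, plus the raw sum for reuse.
normed, resid = add_rms_norm(hidden, attn_out, norm_weight, epsilon=1e-6)
# `normed` feeds the next sublayer; `resid` (= hidden + attn_out) becomes the
# residual input of the following add_rms_norm call.
```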
from infinicore.lib import _infinicore


def kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    """Append new key/value tensors to the KV caches, in place, at the positions given by past_kv_lengths."""
    _infinicore.kv_caching_(
        k_cache._underlying,
        v_cache._underlying,
        k._underlying,
        v._underlying,
        past_kv_lengths._underlying,
    )
    return k_cache, v_cache
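A decode-step sketch, assuming `past_kv_lengths` holds one write offset per sequence in the batch (all tensors hypothetical):

```python
# Appends this step's keys/values at each sequence's current length, in place.
k_cache, v_cache = kv_caching(k_cache, v_cache, k, v, past_kv_lengths)
```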
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def paged_attention(
    q: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    block_tables: Tensor,
    cache_lens: Tensor,
    alibi_slopes: Tensor | None = None,
    scale: float = 1.0,
    *,
    out: Tensor | None = None,
):
    if out is None:
        return Tensor(
            _infinicore.paged_attention(
                q._underlying,
                k_cache._underlying,
                v_cache._underlying,
                block_tables._underlying,
                cache_lens._underlying,
                alibi_slopes._underlying if alibi_slopes is not None else None,
                scale,
            )
        )
    _infinicore.paged_attention_(
        out._underlying,
        q._underlying,
        k_cache._underlying,
        v_cache._underlying,
        block_tables._underlying,
        cache_lens._underlying,
        alibi_slopes._underlying if alibi_slopes is not None else None,
        scale,
    )
    return out
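A decode-phase sketch with hypothetical tensors and a hypothetical `head_dim`. Note the wrapper defaults `scale` to 1.0, while 1/sqrt(head_dim) is the conventional softmax scale, so callers normally pass it explicitly:

```python
import math

# block_tables maps each sequence to its cache blocks; cache_lens gives the
# current KV length per sequence. All tensors here are hypothetical.
attn_out = paged_attention(
    q, k_cache, v_cache, block_tables, cache_lens,
    scale=1.0 / math.sqrt(head_dim),
)
```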
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def paged_attention_prefill(
    q: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    block_tables: Tensor,
    history_lens: Tensor,
    cu_seqlens_q: Tensor,
    alibi_slopes: Tensor | None = None,
    scale: float = 1.0,
    *,
    out: Tensor | None = None,
):
    alibi_ptr = alibi_slopes._underlying if alibi_slopes is not None else None
    if out is None:
        return Tensor(
            _infinicore.paged_attention_prefill(
                q._underlying,
                k_cache._underlying,
                v_cache._underlying,
                block_tables._underlying,
                history_lens._underlying,
                cu_seqlens_q._underlying,
                alibi_ptr,
                scale,
            )
        )
    _infinicore.paged_attention_prefill_(
        out._underlying,
        q._underlying,
        k_cache._underlying,
        v_cache._underlying,
        block_tables._underlying,
        history_lens._underlying,
        cu_seqlens_q._underlying,
        alibi_ptr,
        scale,
    )
    return out
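The prefill variant works on a packed batch, so it additionally takes `cu_seqlens_q` (cumulative query lengths, as in varlen attention kernels) and `history_lens` (tokens already cached per sequence). A sketch with hypothetical tensors:

```python
import math

out = paged_attention_prefill(
    q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q,
    scale=1.0 / math.sqrt(head_dim),  # head_dim is a hypothetical int
)
```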
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def paged_caching(
    k_cache: Tensor,
    v_cache: Tensor,
    k: Tensor,
    v: Tensor,
    slot_mapping: Tensor,
):
    """Write new key/value tensors into the paged KV caches, in place, at the slots given by slot_mapping."""
    _infinicore.paged_caching_(
        k_cache._underlying,
        v_cache._underlying,
        k._underlying,
        v._underlying,
        slot_mapping._underlying,
    )
    return k_cache, v_cache
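Unlike kv_caching above, which appends at per-sequence lengths, this op scatters each new token to a flat slot index. A sketch with hypothetical tensors:

```python
# slot_mapping[i] is the flat slot in the paged caches for new token i.
k_cache, v_cache = paged_caching(k_cache, v_cache, k, v, slot_mapping)
```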
@@ -42,6 +42,11 @@ class Tensor:
                    getattr(self._underlying, name)
                ),
            )
        else:
            raise AttributeError(
                "{!r} object has no attribute {!r}".format(type(self).__name__, name)
            )
        return getattr(self, name)
@property
......
import concurrent.futures
import importlib
import pathlib
@@ -11,16 +12,32 @@ SRC_DIR_PATH = CURRENT_FILE_PATH.parent.parent / "src"


def _find_and_build_ops():
    ops_path = SRC_DIR_PATH / "infiniop" / "ops"

    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = []

        for op_dir in ops_path.iterdir():
            ninetoothed_path = op_dir / "ninetoothed"

            if not ninetoothed_path.is_dir():
                continue

            build_file = ninetoothed_path / "build.py"

            if not build_file.exists():
                continue

            futures.append(executor.submit(_build, ninetoothed_path))

        for future in concurrent.futures.as_completed(futures):
            future.result()  # re-raise any exception from a failed build


def _build(ninetoothed_path):
    module_path = ninetoothed_path / "build"
    relative_path = module_path.relative_to(SRC_DIR_PATH)
    import_name = ".".join(relative_path.parts)
    module = importlib.import_module(import_name)
    module.build()

if __name__ == "__main__":
......
@@ -12,7 +12,7 @@ void printUsage() {
    std::cout << "infiniccl-test --<device>" << std::endl
              << std::endl;
    std::cout << " --<device>" << std::endl;
    std::cout << " Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon|ali)." << std::endl
              << std::endl;
    std::cout << "The program will run tests on all visible devices of the specified device type."
              << " Use environment variables such as CUDA_VISIBLE_DEVICES to limit visible device IDs.";
@@ -46,6 +46,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
    else PARSE_DEVICE("--qy", INFINI_DEVICE_QY)
    else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
    else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
    else PARSE_DEVICE("--ali", INFINI_DEVICE_ALI)
    else {
        printUsage();
    }
......
@@ -62,7 +62,7 @@ infiniStatus_t commInitAll(
    for (int i = 0; i < ndevice; i++) {
        rank_list[i] = i;
        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), cnrtSuccess);
    }
    CHECK_CNCL(cnclInitComms(cncl_comms.data(), ndevice,
......
@@ -4,7 +4,7 @@
#include "../infiniccl_impl.h"

// Windows does not support CUDA
#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
INFINICCL_DEVICE_API_IMPL(cuda)
#else
INFINICCL_DEVICE_API_NOOP(cuda)
......
@@ -27,6 +27,7 @@ __C infiniStatus_t infinicclCommInitAll(
        COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
        COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore);
        COMM_INIT_ALL(INFINI_DEVICE_KUNLUN, kunlun);
        COMM_INIT_ALL(INFINI_DEVICE_ALI, cuda);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
@@ -53,6 +54,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
        COMM_DESTROY(INFINI_DEVICE_METAX, metax);
        COMM_DESTROY(INFINI_DEVICE_MOORE, moore);
        COMM_DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
        COMM_DESTROY(INFINI_DEVICE_ALI, cuda);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
@@ -86,6 +88,7 @@ __C infiniStatus_t infinicclAllReduce(
        ALL_REDUCE(INFINI_DEVICE_METAX, metax);
        ALL_REDUCE(INFINI_DEVICE_MOORE, moore);
        ALL_REDUCE(INFINI_DEVICE_KUNLUN, kunlun);
        ALL_REDUCE(INFINI_DEVICE_ALI, cuda);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
@@ -66,6 +66,7 @@ xmake build infinicore-test
./infinicore-test --qy
./infinicore-test --kunlun
./infinicore-test --hygon
./infinicore-test --ali
```
### Customize Test Parameters
......
@@ -42,6 +42,7 @@ void printUsage() {
              << " qy" << std::endl
              << " kunlun" << std::endl
              << " hygon" << std::endl
              << " ali" << std::endl
              << std::endl
              << "Available tests:" << std::endl
              << " basic - Basic memory allocation and deallocation tests" << std::endl
@@ -84,6 +85,8 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
            args.device_type = INFINI_DEVICE_KUNLUN;
        } else if (arg == "--hygon") {
            args.device_type = INFINI_DEVICE_HYGON;
        } else if (arg == "--ali") {
            args.device_type = INFINI_DEVICE_ALI;
        } else if (arg == "--test") {
            if (i + 1 >= argc) {
                std::cerr << "Error: --test requires a test name" << std::endl;
......
@@ -12,12 +12,18 @@ DevicePinnedHostAllocator::~DevicePinnedHostAllocator() {
}

std::byte *DevicePinnedHostAllocator::allocate(size_t size) {
    if (size == 0) {
        return nullptr;
    }
    void *ptr;
    INFINICORE_CHECK_ERROR(infinirtMallocHost(&ptr, size));
    return (std::byte *)ptr;
}

void DevicePinnedHostAllocator::deallocate(std::byte *ptr) {
    if (ptr == nullptr) {
        return;
    }
    if (owner_ == context::getDevice()) {
        INFINICORE_CHECK_ERROR(infinirtFreeHost(ptr));
        gc();
......
@@ -4,10 +4,16 @@

namespace infinicore {

std::byte *HostAllocator::allocate(size_t size) {
    if (size == 0) {
        return nullptr;
    }
    return (std::byte *)std::malloc(size);
}

void HostAllocator::deallocate(std::byte *ptr) {
    if (ptr == nullptr) {
        return;
    }
    std::free(ptr);
}
......
#include "pinnable_block_allocator.hpp"
#include "../context_impl.hpp"
#include "../../utils.hpp"
#include <algorithm>
#include <infinirt.h>
#include <stdexcept>
namespace infinicore {
// ------------------- Helper functions -------------------
// Round up size to nearest multiple of alignment
inline size_t align_up(size_t size, size_t alignment) {
return (size + alignment - 1) / alignment * alignment;
}
// ------------------- Constructor -------------------
PinnableBlockAllocator::PinnableBlockAllocator(Device device)
: device_(device) {
size_classes_ = {
{32 * 1024, {}}, // 32 KB
{256 * 1024, {}}, // 256 KB
{1 * 1024 * 1024, {}}, // 1 MB
{2 * 1024 * 1024, {}}, // 2 MB
{4 * 1024 * 1024, {}}, // 4 MB
{8 * 1024 * 1024, {}}, // 8 MB
{16 * 1024 * 1024, {}}, // 16 MB
{32 * 1024 * 1024, {}}, // 32 MB
{64 * 1024 * 1024, {}}, // 64 MB
{128 * 1024 * 1024, {}}, // 128 MB
{256 * 1024 * 1024, {}}, // 256 MB
};
}
// ------------------- allocate -------------------

std::byte *PinnableBlockAllocator::allocate(size_t size) {
    if (size == 0) {
        return nullptr;
    }
    std::lock_guard<std::mutex> lock(mutex_);

    // Align size to 256 bytes for the GPU
    size = align_up(size, 256);

    std::shared_ptr<Block> block;

    // 1. Try size-class allocation for small/medium requests
    for (auto &cls : size_classes_) {
        if (size <= cls.block_size) {
            if (!cls.free_blocks.empty()) {
                // Pop entries until one is actually free; mark_in_use_() may
                // have flagged blocks that are still sitting on the free list.
                block = cls.free_blocks.back();
                while (block != nullptr && block->in_use) {
                    cls.free_blocks.pop_back();
                    if (cls.free_blocks.empty()) {
                        block = nullptr;
                        break;
                    }
                    block = cls.free_blocks.back();
                }
                if (block != nullptr) {
                    cls.free_blocks.pop_back();
                    block->in_use = true;
                    return reinterpret_cast<std::byte *>(block->ptr);
                }
            }
            // Allocate a new block for this class
            block = std::make_shared<Block>();
            block->size = cls.block_size;
            block->frozen = pinned_mode_;
            block->in_use = true;
            INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
            all_blocks_[block->ptr] = block;
            return reinterpret_cast<std::byte *>(block->ptr);
        }
    }

    // 2. Large block allocation
    // Try to reuse a frozen or free large block
    auto it = std::find_if(large_blocks_.begin(), large_blocks_.end(),
                           [size](const std::shared_ptr<Block> &b) { return b->size >= size && !b->in_use; });
    if (it != large_blocks_.end()) {
        block = *it;
        block->in_use = true;
        block->frozen = block->frozen || pinned_mode_;
        return reinterpret_cast<std::byte *>(block->ptr);
    }

    // Allocate a new large block
    block = std::make_shared<Block>();
    block->size = size;
    block->frozen = pinned_mode_;
    block->in_use = true;
    INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
    large_blocks_.push_back(block);
    all_blocks_[block->ptr] = block;
    return reinterpret_cast<std::byte *>(block->ptr);
}
// ------------------- deallocate -------------------

void PinnableBlockAllocator::deallocate(std::byte *ptr) {
    if (ptr == nullptr) {
        return;
    }
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = all_blocks_.find(reinterpret_cast<void *>(ptr));
    if (it == all_blocks_.end()) {
        throw std::runtime_error("Pointer not allocated by this allocator");
    }
    auto block = it->second;
    if (!block->in_use) {
        throw std::runtime_error("Double free detected in PinnableBlockAllocator");
    }
    block->in_use = false;
    // Return size-class blocks to their free list; large blocks simply stay
    // in large_blocks_ and are rediscovered by the search in allocate().
    for (auto &cls : size_classes_) {
        if (block->size == cls.block_size) {
            cls.free_blocks.push_back(block);
            break;
        }
    }
}
size_t PinnableBlockAllocator::mark_in_use_(void *ptr, bool in_use) {
    // Take the lock before touching all_blocks_, consistent with
    // allocate() and deallocate().
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = all_blocks_.find(ptr);
    if (it == all_blocks_.end()) {
        throw std::runtime_error("Pointer not allocated by this allocator");
    }
    it->second->in_use = in_use;
    return it->second->size;
}
// ------------------- trim -------------------

void PinnableBlockAllocator::trim() {
    std::lock_guard<std::mutex> lock(mutex_);

    // Free non-frozen size-class blocks
    for (auto &cls : size_classes_) {
        for (auto it = cls.free_blocks.begin(); it != cls.free_blocks.end();) {
            if (!(*it)->frozen) {
                INFINICORE_CHECK_ERROR(infinirtFree((*it)->ptr));
                all_blocks_.erase((*it)->ptr);
                it = cls.free_blocks.erase(it);
            } else {
                ++it;
            }
        }
    }

    // Free non-frozen large blocks
    for (auto it = large_blocks_.begin(); it != large_blocks_.end();) {
        if (!(*it)->frozen && !(*it)->in_use) {
            INFINICORE_CHECK_ERROR(infinirtFree((*it)->ptr));
            all_blocks_.erase((*it)->ptr);
            it = large_blocks_.erase(it);
        } else {
            ++it;
        }
    }
}
// ------------------- Destructor -------------------

PinnableBlockAllocator::~PinnableBlockAllocator() {
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto &p : all_blocks_) {
        if (p.second->ptr) {
            infinirtFree(p.second->ptr);
        }
    }
    all_blocks_.clear();
    large_blocks_.clear();
    for (auto &cls : size_classes_) {
        cls.free_blocks.clear();
    }
}

} // namespace infinicore
#pragma once

#include "memory_allocator.hpp"

#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>

namespace infinicore {

class PinnableBlockAllocator : public MemoryAllocator {
    // Represents a single memory block
    struct Block {
        void *ptr = nullptr;  // Device pointer
        size_t size = 0;      // Block size in bytes
        bool frozen = false;  // True if used in pinned/graph mode
        bool in_use = false;  // Whether the block is currently in use
    };

    // A simple size-class allocator for small/medium blocks
    struct SizeClass {
        size_t block_size; // Fixed size for this class
        std::vector<std::shared_ptr<Block>> free_blocks;
    };

public:
    PinnableBlockAllocator(Device device);
    ~PinnableBlockAllocator();

    std::byte *allocate(size_t size) override;
    void deallocate(std::byte *ptr) override;

    // Switch pinned/graph mode
    void set_pin_mode(bool pinned) { pinned_mode_ = pinned; }

    // Internal use only: force-set the in_use flag for a memory block.
    // Returns the size of the block.
    size_t mark_in_use_(void *ptr, bool in_use);

    // Release cached, non-frozen blocks back to the device.
    void trim();

private:
    Device device_;
    bool pinned_mode_ = false;
    std::vector<SizeClass> size_classes_;
    std::vector<std::shared_ptr<Block>> large_blocks_;
    std::unordered_map<void *, std::shared_ptr<Block>> all_blocks_;
    std::mutex mutex_; // Thread safety
};

} // namespace infinicore
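To make the reuse policy above concrete, here is a minimal Python model of the same strategy: round the request up, serve small/medium requests from fixed size classes, and fall back to first-fit reuse among large blocks. It is illustrative only; the real device allocations go through infinirtMalloc/infinirtFree as in the C++ above.

```python
SIZE_CLASSES = [32 << 10, 256 << 10, 1 << 20, 2 << 20, 4 << 20]  # abridged list

def align_up(size, alignment=256):
    # Same arithmetic as the C++ align_up helper.
    return (size + alignment - 1) // alignment * alignment

def pick_block(size, free_lists, large_blocks):
    """Model of the block-selection policy in PinnableBlockAllocator::allocate."""
    size = align_up(size)
    for cls in SIZE_CLASSES:          # 1. smallest size class that fits
        if size <= cls:
            if free_lists[cls]:
                return free_lists[cls].pop()  # reuse a cached block
            return ("new", cls)               # fresh block of the class size
    for blk in large_blocks:          # 2. first free large block that fits
        if blk["size"] >= size and not blk["in_use"]:
            return blk
    return ("new", size)              # 3. fresh large block, exact aligned size
```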
#include "device_caching_allocator.hpp"
#include "stream_ordered_allocator.hpp"
#include <infinirt.h>
#include "../../utils.hpp"
namespace infinicore {
DeviceCachingAllocator::DeviceCachingAllocator(Device device) : MemoryAllocator(), device_(device) {}
StreamOrderedAllocator::StreamOrderedAllocator(Device device) : MemoryAllocator(), device_(device) {}
std::byte *DeviceCachingAllocator::allocate(size_t size) {
std::byte *StreamOrderedAllocator::allocate(size_t size) {
if (size == 0) {
return nullptr;
}
void *ptr = nullptr;
INFINICORE_CHECK_ERROR(infinirtMallocAsync(&ptr, size, context::getStream()));
return (std::byte *)ptr;
}
void DeviceCachingAllocator::deallocate(std::byte *ptr) {
void StreamOrderedAllocator::deallocate(std::byte *ptr) {
if (ptr == nullptr) {
return;
}
INFINICORE_CHECK_ERROR(infinirtFreeAsync(ptr, context::getStream()));
}
} // namespace infinicore