"tests/entrypoints/openai/responses/test_harmony.py" did not exist on "95a935fc48563ec63de02a65d41fd2d7cb1d9ea5"
mod.rs 6.46 KB
Newer Older
Ryan Olson's avatar
Ryan Olson committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//! Decoupled layout system for block management.
//!
//! This module provides a simplified layout abstraction that:
//! - Maps block IDs to physical memory regions (address + size)
//! - Decouples memory regions from storage type information
//! - Specifies allocation requirements without performing allocation
//! - Uses trait objects for memory ownership

pub(crate) mod builder;

mod config;
mod fully_contiguous;
mod kv_block_layout;
mod layer_separate;
mod physical;
mod serialize;
mod validation;

#[cfg(all(test, feature = "testing-kvbm"))]
pub(super) mod tests;

// #[cfg(test)]
// mod integration_tests;

pub use builder::PhysicalLayoutBuilder;
pub use config::{BlockDimension, LayoutConfig};
pub(crate) use fully_contiguous::FullyContiguousLayout;
pub use kv_block_layout::{BlockDim, KvBlockLayout};
pub(crate) use layer_separate::LayerSeparateLayout;
pub use physical::NixlMetadata;
pub use physical::PhysicalLayout;
pub(crate) use serialize::LayoutDescriptor;
pub use serialize::{BlockFormat, FullyContiguousDetails, LayerSeparateDetails, LayoutTypeDetails};

// mod registration;
// pub use registration::{RegisteredLayout, RegisteredStorageMetadata, RegistrationManager};

use anyhow::Result;
use serde::{Deserialize, Serialize};

pub(crate) use dynamo_memory::MemoryDescriptor;
pub use dynamo_memory::{Buffer, MemoryRegion};

/// Core layout trait for mapping block IDs to memory regions.
///
/// Layouts specify how KV cache blocks are organized in memory without
/// performing allocation themselves. They provide:
/// - Memory region lookup for specific blocks
/// - Allocation requirements for external allocators
/// - Metadata about block organization
pub trait Layout: Send + Sync + std::fmt::Debug {
    /// Get the configuration for this layout.
    fn config(&self) -> &LayoutConfig;

    /// Get the root memory regions backing this layout.
    ///
    /// These regions correspond to the concrete allocations that store the layout's data.
    /// Implementations that derive memory procedurally can return an empty slice.
    fn memory_regions(&self) -> &[Buffer];

    /// Get memory regions for a specific block_id, layer_id, outer_id.
    ///
    /// Returns a [MemoryRegion] for the continuous region specified by the given block_id,
    /// layer_id, outer_id.
    ///
    /// # Arguments
    /// * `block_id` - The ID of the block to query (0..num_blocks)
    /// * `layer_id` - The ID of the layer to query (0..num_layers)
    /// * `outer_id` - The ID of the outer dimension to query (0..outer_dim)
    fn memory_region(
        &self,
        block_id: usize,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<MemoryRegion>;

    /// Get the allocation requirements for this layout.
    ///
    /// Returns a vector of allocation sizes needed to back this layout.
    /// For fully contiguous layouts, this will be a single size.
    /// For layer-separate layouts, this will contain one size per layer.
    ///
    /// # Returns
    /// Vector of allocation sizes in bytes.
    fn required_allocations(&self) -> Vec<usize>;

    /// Check if this layout uses fully contiguous memory.
    ///
    /// Fully contiguous layouts have all blocks in a single allocation,
    /// which enables certain optimizations.
    fn is_fully_contiguous(&self) -> bool;

    /// Get the total number of blocks in this layout.
    fn num_blocks(&self) -> usize;

    /// Get the number of layers per block.
    fn num_layers(&self) -> usize;

    /// Get the outer dimension size.
    ///
    /// In typical KV cache layouts, this is often 2 (for K and V),
    /// but can be 1 for architectures like MLA.
    fn outer_dim(&self) -> usize;

    /// Get the page size (often corresponds to block size in tokens).
    fn page_size(&self) -> usize;

    /// Get the inner dimension size.
    ///
    /// This is typically the hidden size divided by tensor parallel size.
    fn inner_dim(&self) -> usize;

    /// Get the data type width in bytes.
    fn dtype_width_bytes(&self) -> usize;

    /// Get serialization details for this layout type.
    ///
    /// This provides the layout-type-specific information needed to serialize
    /// and reconstruct the layout on a remote node.
    fn serialization_details(&self) -> serialize::LayoutTypeDetails;

    /// Get the KV block layout describing how dimensions are permuted within blocks.
    ///
    /// Returns the internal tensor ordering for blocks in this layout.
    /// For layer-separate layouts, this describes the inner tensor format.
    /// For fully contiguous layouts, this describes the full block format.
    fn block_layout(&self) -> KvBlockLayout;
}

/// Inner shape format for tensor layout
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) enum InnerShape {
    /// Unknown shape - fallback when we can't determine the format
    Unknown,
    /// NHD format: [block_size, num_heads, head_dim]
    /// Common for attention layers where N=tokens, H=heads, D=dimension
    NHD,
    /// HND format: [num_heads, block_size, head_dim]
    /// Alternative layout with heads first
    HND,
}

/// Trait for layouts that provide contiguous per-block memory regions.
///
/// This trait enables direct access to entire blocks as contiguous memory,
/// without requiring layer/outer indexing. It is implemented by
/// [`FullyContiguousLayout`] but NOT by [`LayerSeparateLayout`] (which
/// stores each layer separately).
///
/// Use this trait when you need to:
/// - Access raw block memory for transformation kernels
/// - Reinterpret block memory under different [`KvBlockLayout`] formats
/// - Perform whole-block operations without layer decomposition
pub trait ContiguousBlockLayout: Send + Sync + std::fmt::Debug {
    /// Get the total number of blocks in this layout.
    fn num_blocks(&self) -> usize;

    /// Get the size of each block in bytes.
    fn bytes_per_block(&self) -> usize;

    /// Get the contiguous memory region for a specific block.
    ///
    /// # Arguments
    /// * `block_id` - The ID of the block to query (0..num_blocks)
    ///
    /// # Returns
    /// A [`MemoryRegion`] covering the entire block's memory.
    ///
    /// # Errors
    /// Returns an error if `block_id` is out of range.
    fn raw_block(&self, block_id: usize) -> Result<MemoryRegion>;

    /// Get the KV block layout for this contiguous layout.
    fn block_layout(&self) -> KvBlockLayout;
}