Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
13a5d61b
Unverified
Commit
13a5d61b
authored
Sep 23, 2025
by
Olga Andreeva
Committed by
GitHub
Sep 23, 2025
Browse files
fix: FullyContiguous Memory Region Size Bug (#3175)
Signed-off-by:
Olga Andreeva
<
oandreeva@nvidia.com
>
parent
8bb9a555
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
231 additions
and
4 deletions
+231
-4
lib/llm/src/block_manager/block/data/local.rs
lib/llm/src/block_manager/block/data/local.rs
+18
-2
lib/llm/src/block_manager/layout.rs
lib/llm/src/block_manager/layout.rs
+1
-1
lib/llm/src/block_manager/layout/utils.rs
lib/llm/src/block_manager/layout/utils.rs
+212
-1
No files found.
lib/llm/src/block_manager/block/data/local.rs
View file @
13a5d61b
...
...
@@ -124,7 +124,15 @@ impl<S: Storage> BlockDataViews<S> for LocalBlockData<S> {
if
self
.is_fully_contiguous
()
{
let
mr
=
self
.layout
.memory_region
(
self
.block_idx
,
0
,
0
)
?
;
let
offset
=
mr
.addr
();
let
size
=
mr
.size
()
*
self
.num_layers
();
let
size
=
mr
.size
()
.checked_mul
(
self
.num_layers
())
.and_then
(|
intermediate
|
intermediate
.checked_mul
(
self
.num_outer_dims
()))
.ok_or_else
(||
{
BlockError
::
InvalidState
(
format!
(
"Block size calculation overflow: region_size={} * layers={} * outer_dims={}"
,
mr
.size
(),
self
.num_layers
(),
self
.num_outer_dims
()
))
})
?
;
let
storage_type
=
mr
.storage_type
();
unsafe
{
view
::
BlockView
::
new
(
self
,
offset
,
size
,
storage_type
)
}
}
else
{
...
...
@@ -138,7 +146,15 @@ impl<S: Storage> BlockDataViews<S> for LocalBlockData<S> {
if
self
.is_fully_contiguous
()
{
let
mr
=
self
.layout
.memory_region
(
self
.block_idx
,
0
,
0
)
?
;
let
offset
=
mr
.addr
();
let
size
=
mr
.size
()
*
self
.num_layers
();
let
size
=
mr
.size
()
.checked_mul
(
self
.num_layers
())
.and_then
(|
intermediate
|
intermediate
.checked_mul
(
self
.num_outer_dims
()))
.ok_or_else
(||
{
BlockError
::
InvalidState
(
format!
(
"Block size calculation overflow: region_size={} * layers={} * outer_dims={}"
,
mr
.size
(),
self
.num_layers
(),
self
.num_outer_dims
()
))
})
?
;
let
storage_type
=
mr
.storage_type
();
unsafe
{
view
::
BlockViewMut
::
new
(
self
,
offset
,
size
,
storage_type
)
}
}
else
{
...
...
lib/llm/src/block_manager/layout.rs
View file @
13a5d61b
...
...
@@ -325,7 +325,7 @@ impl FullyContiguousConfig {
let
outer_dim_stride_in_bytes
=
config
.page_size
*
config
.inner_dim
*
config
.dtype_width_bytes
;
let
layer_stride_in_bytes
=
outer_dim_stride_in_bytes
*
config
.outer_dim
;
let
memory_region_size
=
layer
_stride_in_bytes
;
let
memory_region_size
=
outer_dim
_stride_in_bytes
;
let
natural_block_stride
=
config
.num_layers
*
layer_stride_in_bytes
;
let
block_stride_in_bytes
=
if
alignment
>
1
{
...
...
lib/llm/src/block_manager/layout/utils.rs
View file @
13a5d61b
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use
crate
::
block_manager
::
layout
::{
BlockLayoutConfig
,
LayoutError
};
use
crate
::
block_manager
::
layout
::{
BlockLayoutConfig
,
GenericBlockLayout
,
LayoutError
};
use
crate
::
block_manager
::
storage
::
Storage
;
use
validator
::
ValidationError
;
/// Verification result for a memory region
#[derive(Debug,
Clone)]
#[allow(dead_code)]
pub
struct
RegionVerificationResult
{
/// Block index that was verified
pub
block_idx
:
usize
,
/// Layer index that was verified
pub
layer_idx
:
usize
,
/// Outer dimension index that was verified
pub
outer_idx
:
usize
,
/// Expected memory address for this region
pub
expected_addr
:
usize
,
/// Actual memory address for this region
pub
actual_addr
:
usize
,
/// Expected size in bytes for this region
pub
expected_size
:
usize
,
/// Actual size in bytes for this region
pub
actual_size
:
usize
,
/// Whether the addresses match
pub
addr_matches
:
bool
,
/// Whether the sizes match
pub
size_matches
:
bool
,
}
/// Layout verification statistics
#[derive(Debug,
Clone,
Default)]
#[allow(dead_code)]
pub
struct
LayoutVerificationStats
{
/// Total number of memory regions verified
pub
total_regions
:
usize
,
/// Number of regions with address mismatches
pub
addr_mismatches
:
usize
,
/// Number of regions with size mismatches
pub
size_mismatches
:
usize
,
/// Number of regions that passed all verifications
pub
successful_verifications
:
usize
,
}
#[derive(Debug)]
#[allow(dead_code)]
pub
struct
WorkerLayoutVerifier
{
stats
:
LayoutVerificationStats
,
}
#[allow(dead_code)]
impl
WorkerLayoutVerifier
{
// Constructor: Start with clean slate
pub
fn
new
()
->
Self
{
Self
{
stats
:
LayoutVerificationStats
::
default
(),
}
}
pub
fn
verify_layout_consistency
<
L
:
GenericBlockLayout
>
(
&
mut
self
,
layout
:
&
L
,
)
->
Result
<
Vec
<
RegionVerificationResult
>
,
LayoutError
>
{
// This is the main orchestrator method.
// It systematically checks every memory region in
// the layout to ensure consistency.
self
.stats
=
LayoutVerificationStats
::
default
();
let
mut
results
=
Vec
::
new
();
// Iterate over all blocks, layers, and outer dimensions
for
block_idx
in
0
..
layout
.num_blocks
()
{
for
layer_idx
in
0
..
layout
.num_layers
()
{
for
outer_idx
in
0
..
layout
.outer_dim
()
{
let
result
=
self
.verify_memory_region
(
layout
,
block_idx
,
layer_idx
,
outer_idx
)
?
;
self
.update_stats
(
&
result
);
results
.push
(
result
);
}
}
}
Ok
(
results
)
}
pub
fn
verify_memory_region
<
L
:
GenericBlockLayout
>
(
&
mut
self
,
layout
:
&
L
,
block_idx
:
usize
,
layer_idx
:
usize
,
outer_idx
:
usize
,
)
->
Result
<
RegionVerificationResult
,
LayoutError
>
{
let
memory_region
=
layout
.memory_region
(
block_idx
,
layer_idx
,
outer_idx
)
?
;
let
config
=
layout
.config
();
let
expected_size
=
config
.page_size
*
config
.inner_dim
*
config
.dtype_width_bytes
;
let
expected_addr
=
memory_region
.addr
();
Ok
(
RegionVerificationResult
{
block_idx
,
layer_idx
,
outer_idx
,
expected_addr
,
actual_addr
:
memory_region
.addr
(),
expected_size
,
actual_size
:
memory_region
.size
(),
addr_matches
:
expected_addr
==
memory_region
.addr
(),
size_matches
:
expected_size
==
memory_region
.size
(),
})
}
fn
update_stats
(
&
mut
self
,
result
:
&
RegionVerificationResult
)
{
self
.stats.total_regions
+=
1
;
if
!
result
.addr_matches
{
self
.stats.addr_mismatches
+=
1
;
}
if
!
result
.size_matches
{
self
.stats.size_mismatches
+=
1
;
}
if
result
.addr_matches
&&
result
.size_matches
{
self
.stats.successful_verifications
+=
1
;
}
}
pub
fn
has_critical_mismatches
(
&
self
)
->
bool
{
self
.stats.size_mismatches
>
0
}
}
/// Validation function for Option<usize> to check if it's Some(power_of_2).
pub
fn
validate_power_of_2
(
alignment
:
usize
)
->
Result
<
(),
ValidationError
>
{
if
!
alignment
.is_power_of_two
()
{
...
...
@@ -87,3 +211,90 @@ pub fn validate_indices<C: BlockLayoutConfig>(
Ok
(())
}
#[cfg(test)]
mod
worker_verification_tests
{
use
super
::
*
;
use
crate
::
block_manager
::
LayoutConfig
;
use
crate
::
block_manager
::
layout
::{
FullyContiguous
,
LayerSeparate
};
use
crate
::
block_manager
::
storage
::
tests
::{
NullDeviceAllocator
,
NullDeviceStorage
};
// Test constants (same as layout.rs tests)
const
NUM_BLOCKS
:
usize
=
7
;
const
NUM_LAYERS
:
usize
=
5
;
const
OUTER_DIM
:
usize
=
2
;
const
PAGE_SIZE
:
usize
=
4
;
const
INNER_DIM
:
usize
=
13
;
const
DTYPE_WIDTH_BYTES
:
usize
=
4
;
fn
create_test_config
()
->
LayoutConfig
{
LayoutConfig
{
num_blocks
:
NUM_BLOCKS
,
num_layers
:
NUM_LAYERS
,
outer_dim
:
OUTER_DIM
,
page_size
:
PAGE_SIZE
,
inner_dim
:
INNER_DIM
,
alignment
:
1
,
dtype_width_bytes
:
DTYPE_WIDTH_BYTES
,
}
}
fn
create_fully_contiguous_layout
()
->
Result
<
FullyContiguous
<
NullDeviceStorage
>
,
LayoutError
>
{
let
config
=
create_test_config
();
FullyContiguous
::
allocate
(
config
,
&
NullDeviceAllocator
)
}
fn
create_layer_separate_layout
()
->
Result
<
LayerSeparate
<
NullDeviceStorage
>
,
LayoutError
>
{
let
config
=
create_test_config
();
LayerSeparate
::
allocate
(
config
,
&
NullDeviceAllocator
,
true
)
// outer_contiguous = true
}
#[test]
fn
test_verify_initialisation
()
{
let
verifier
=
WorkerLayoutVerifier
::
new
();
assert_eq!
(
verifier
.stats.total_regions
,
0
);
assert_eq!
(
verifier
.stats.addr_mismatches
,
0
);
assert_eq!
(
verifier
.stats.size_mismatches
,
0
);
assert_eq!
(
verifier
.stats.successful_verifications
,
0
);
assert
!
(
!
verifier
.has_critical_mismatches
());
}
#[test]
fn
test_layer_separate_verification
()
{
let
layout
=
create_layer_separate_layout
()
.expect
(
"Failed to create LayerSeparate layout"
);
let
mut
verifier
=
WorkerLayoutVerifier
::
new
();
let
results
=
verifier
.verify_layout_consistency
(
&
layout
)
.expect
(
"Failed to verify layout"
);
assert_eq!
(
results
.len
(),
NUM_BLOCKS
*
NUM_LAYERS
*
OUTER_DIM
);
assert
!
(
!
verifier
.has_critical_mismatches
(),
"Expected no critical mismatches but got: total={}, size_mismatches={}, successful={}"
,
verifier
.stats.total_regions
,
verifier
.stats.size_mismatches
,
verifier
.stats.successful_verifications
);
}
#[test]
fn
test_fully_contiguous_verification
()
{
let
layout
=
create_fully_contiguous_layout
()
.expect
(
"Failed to create FullyContiguous layout"
);
let
mut
verifier
=
WorkerLayoutVerifier
::
new
();
let
results
=
verifier
.verify_layout_consistency
(
&
layout
)
.expect
(
"Failed to verify layout"
);
assert_eq!
(
results
.len
(),
NUM_BLOCKS
*
NUM_LAYERS
*
OUTER_DIM
);
assert
!
(
!
verifier
.has_critical_mismatches
(),
"Expected no critical mismatches but got: total={}, size_mismatches={}, successful={}"
,
verifier
.stats.total_regions
,
verifier
.stats.size_mismatches
,
verifier
.stats.successful_verifications
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment