mod.rs 19.4 KB
Newer Older
Ryan Olson's avatar
Ryan Olson committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//! Comprehensive transfer tests for verifying data integrity across storage types and layout configurations.

mod local_transfers;

/// Skip test if stub kernels are in use (no real CUDA available).
///
/// Call this at the start of any test that requires real CUDA operations.
/// When stubs are in use, the test will print a message and return early.
///
/// # Example
/// ```ignore
/// #[test]
/// fn my_cuda_test() {
///     skip_if_stubs!();
///     // ... test code that requires CUDA ...
/// }
/// ```
#[allow(unused_macros)]
macro_rules! skip_if_stubs {
    () => {
        if kvbm_kernels::is_using_stubs() {
            eprintln!(
                "Skipping test '{}': stub kernels in use (no real CUDA)",
                module_path!()
            );
            return;
        }
    };
}

/// Check if any of the storage kinds require CUDA, and skip if stubs are in use.
///
/// Call this at the start of parameterized tests that may or may not use Device storage.
#[allow(unused_macros)]
macro_rules! skip_if_stubs_and_device {
    ($($kind:expr),+ $(,)?) => {
        if kvbm_kernels::is_using_stubs() {
            let needs_cuda = false $(|| matches!($kind, StorageKind::Device(_)))+;
            if needs_cuda {
                eprintln!(
                    "Skipping test '{}': stub kernels in use and test requires Device storage",
                    module_path!()
                );
                return Ok(());
            }
        }
    };
}

// Make the macros available to submodules
#[allow(unused_imports)]
pub(crate) use skip_if_stubs;
#[allow(unused_imports)]
pub(crate) use skip_if_stubs_and_device;

use super::{
    BlockChecksum, FillPattern, NixlAgent, PhysicalLayout, StorageKind, TransferCapabilities,
    compute_block_checksums, compute_layer_checksums, fill_blocks, fill_layers,
};
use crate::{
    BlockId,
    layout::{
        BlockDimension, LayoutConfig,
        builder::{HasConfig, NoLayout, NoMemory, PhysicalLayoutBuilder},
    },
};
use anyhow::Result;
use cudarc::driver::sys::CUdevice_attribute_enum;
use cudarc::driver::{CudaContext, CudaStream, LaunchConfig, PushKernelArg};
use cudarc::nvrtc::{CompileOptions, compile_ptx_with_opts};
use std::collections::HashMap;
use std::ops::Range;
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant};

/// Layout kind for parameterized testing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LayoutKind {
    /// Fully contiguous layout
    FC,
    /// Layer-wise (layer-separate) layout
    LW,
}

/// Storage and layout specification for creating test layouts.
#[derive(Debug, Clone, Copy)]
pub struct LayoutSpec {
    pub kind: LayoutKind,
    pub storage: StorageKind,
}

impl LayoutSpec {
    pub fn new(kind: LayoutKind, storage: StorageKind) -> Self {
        Self { kind, storage }
    }
}

/// Transfer mode for parameterized testing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TransferMode {
    /// Transfer entire blocks (all layers)
    FullBlocks,
    /// Transfer only the first layer
    FirstLayerOnly,
    /// Transfer only the second layer
    SecondLayerOnly,
}

impl TransferMode {
    /// Convert to optional layer range for execute_transfer.
    pub fn layer_range(&self) -> Option<Range<usize>> {
        match self {
            TransferMode::FullBlocks => None,
            TransferMode::FirstLayerOnly => Some(0..1),
            TransferMode::SecondLayerOnly => Some(1..2),
        }
    }

    /// Get a descriptive suffix for test names.
    pub fn suffix(&self) -> &'static str {
        match self {
            TransferMode::FullBlocks => "full",
            TransferMode::FirstLayerOnly => "layer0",
            TransferMode::SecondLayerOnly => "layer1",
        }
    }
}

/// Standard layout configuration for all tests.
pub fn standard_config(num_blocks: usize) -> LayoutConfig {
    LayoutConfig::builder()
        .num_blocks(num_blocks)
        .num_layers(2)
        .outer_dim(2)
        .page_size(16)
        .inner_dim(128)
        .dtype_width_bytes(2)
        .build()
        .unwrap()
}

/// Helper function for creating a PhysicalLayout builder with standard config.
///
/// This is used by other test modules (fill, checksum, validation) for backwards compatibility.
pub fn builder(num_blocks: usize) -> PhysicalLayoutBuilder<HasConfig, NoLayout, NoMemory> {
    let agent = create_test_agent("test_agent");
    let config = standard_config(num_blocks);
    PhysicalLayout::builder(agent).with_config(config)
}

/// Create a test agent with no backends.
///
/// Use this for tests that don't require specific NIXL backends.
pub fn create_test_agent(name: &str) -> NixlAgent {
    NixlAgent::new(name).expect("Failed to create agent")
}

/// Create a test agent with specific backends (strict - all must succeed).
#[expect(dead_code)]
pub fn create_test_agent_with_backends(name: &str, backends: &[&str]) -> Result<NixlAgent> {
    NixlAgent::with_backends(name, backends)
}

/// Create a fully contiguous physical layout with the specified storage type.
pub fn create_fc_layout(
    agent: NixlAgent,
    storage_kind: StorageKind,
    num_blocks: usize,
) -> PhysicalLayout {
    let config = standard_config(num_blocks);
    let builder = PhysicalLayout::builder(agent)
        .with_config(config)
        .fully_contiguous();

    match storage_kind {
        StorageKind::System => builder.allocate_system().build().unwrap(),
        StorageKind::Pinned => builder.allocate_pinned(None).build().unwrap(),
        StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(),
        StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(),
    }
}

/// Create a layer-separate physical layout with the specified storage type.
pub fn create_lw_layout(
    agent: NixlAgent,
    storage_kind: StorageKind,
    num_blocks: usize,
) -> PhysicalLayout {
    let config = standard_config(num_blocks);
    let builder = PhysicalLayout::builder(agent)
        .with_config(config)
        .layer_separate(BlockDimension::BlockIsFirstDim);

    match storage_kind {
        StorageKind::System => builder.allocate_system().build().unwrap(),
        StorageKind::Pinned => builder.allocate_pinned(None).build().unwrap(),
        StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(),
        StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(),
    }
}

/// Create a physical layout based on the specification.
///
/// This is a DRY helper that dispatches to create_fc_layout or create_lw_layout
/// based on the layout kind in the spec.
pub fn create_layout(agent: NixlAgent, spec: LayoutSpec, num_blocks: usize) -> PhysicalLayout {
    match spec.kind {
        LayoutKind::FC => create_fc_layout(agent, spec.storage, num_blocks),
        LayoutKind::LW => create_lw_layout(agent, spec.storage, num_blocks),
    }
}

/// Create a transport manager for testing with the specified agent.
///
/// Note: The agent should already have backends configured. Use `create_test_agent`
/// or `build_agent_with_backends` to create properly configured agents.
pub fn create_transfer_context(
    agent: NixlAgent,
    capabilities: Option<TransferCapabilities>,
) -> Result<crate::manager::TransferManager> {
    crate::manager::TransferManager::builder()
        .capabilities(capabilities.unwrap_or_default())
        .nixl_agent(agent)
        .cuda_device_id(0)
        .build()
}

/// Fill blocks and compute checksums.
///
/// This can only be called on System or Pinned layouts.
pub fn fill_and_checksum(
    layout: &PhysicalLayout,
    block_ids: &[BlockId],
    pattern: FillPattern,
) -> Result<HashMap<BlockId, BlockChecksum>> {
    fill_blocks(layout, block_ids, pattern)?;
    compute_block_checksums(layout, block_ids)
}

/// Fill blocks or layers based on transfer mode and compute checksums.
///
/// This is a mode-aware version of fill_and_checksum that handles both
/// full block transfers and layer-wise transfers.
pub fn fill_and_checksum_with_mode(
    layout: &PhysicalLayout,
    block_ids: &[BlockId],
    pattern: FillPattern,
    mode: TransferMode,
) -> Result<HashMap<BlockId, BlockChecksum>> {
    match mode {
        TransferMode::FullBlocks => {
            fill_blocks(layout, block_ids, pattern)?;
            compute_block_checksums(layout, block_ids)
        }
        TransferMode::FirstLayerOnly => {
            fill_layers(layout, block_ids, 0..1, pattern)?;
            compute_layer_checksums(layout, block_ids, 0..1)
        }
        TransferMode::SecondLayerOnly => {
            fill_layers(layout, block_ids, 1..2, pattern)?;
            compute_layer_checksums(layout, block_ids, 1..2)
        }
    }
}

/// Verify that destination block checksums match the expected source checksums.
///
/// This function compares checksums in order, assuming the source and destination
/// block arrays have a 1:1 correspondence (src[i] was transferred to dst[i]).
pub fn verify_checksums_by_position(
    src_checksums: &HashMap<BlockId, BlockChecksum>,
    src_block_ids: &[BlockId],
    dst_layout: &PhysicalLayout,
    dst_block_ids: &[BlockId],
) -> Result<()> {
    assert_eq!(
        src_block_ids.len(),
        dst_block_ids.len(),
        "Source and destination block arrays must have same length"
    );

    let dst_checksums = compute_block_checksums(dst_layout, dst_block_ids)?;

    for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
        let src_checksum = src_checksums
            .get(src_id)
            .unwrap_or_else(|| panic!("Missing source checksum for block {}", src_id));
        let dst_checksum = dst_checksums
            .get(dst_id)
            .unwrap_or_else(|| panic!("Missing destination checksum for block {}", dst_id));

        assert_eq!(
            src_checksum, dst_checksum,
            "Checksum mismatch: src[{}] != dst[{}]: {} != {}",
            src_id, dst_id, src_checksum, dst_checksum
        );
    }

    Ok(())
}

/// Verify checksums with transfer mode awareness.
///
/// This is a mode-aware version that handles both full block and layer-wise verification.
pub fn verify_checksums_by_position_with_mode(
    src_checksums: &HashMap<BlockId, BlockChecksum>,
    src_block_ids: &[BlockId],
    dst_layout: &PhysicalLayout,
    dst_block_ids: &[BlockId],
    mode: TransferMode,
) -> Result<()> {
    assert_eq!(
        src_block_ids.len(),
        dst_block_ids.len(),
        "Source and destination block arrays must have same length"
    );

    let dst_checksums = match mode {
        TransferMode::FullBlocks => compute_block_checksums(dst_layout, dst_block_ids)?,
        TransferMode::FirstLayerOnly => compute_layer_checksums(dst_layout, dst_block_ids, 0..1)?,
        TransferMode::SecondLayerOnly => compute_layer_checksums(dst_layout, dst_block_ids, 1..2)?,
    };

    for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
        let src_checksum = src_checksums
            .get(src_id)
            .unwrap_or_else(|| panic!("Missing source checksum for block {}", src_id));
        let dst_checksum = dst_checksums
            .get(dst_id)
            .unwrap_or_else(|| panic!("Missing destination checksum for block {}", dst_id));

        assert_eq!(
            src_checksum, dst_checksum,
            "Checksum mismatch (mode={:?}): src[{}] != dst[{}]: {} != {}",
            mode, src_id, dst_id, src_checksum, dst_checksum
        );
    }

    Ok(())
}

/// Fill guard blocks and return their checksums for later verification.
///
/// Guard blocks are blocks adjacent to transfer destinations that should
/// remain unchanged during transfers. This function fills them with a
/// distinctive pattern and returns their checksums for later validation.
///
/// # Arguments
/// * `layout` - The physical layout containing the guard blocks
/// * `guard_block_ids` - Block IDs to use as guards
/// * `pattern` - Fill pattern for guard blocks (typically a constant like 0xFF)
///
/// # Returns
/// A map of block ID to checksum for all guard blocks
pub fn create_guard_blocks(
    layout: &PhysicalLayout,
    guard_block_ids: &[usize],
    pattern: FillPattern,
) -> Result<HashMap<usize, BlockChecksum>> {
    fill_blocks(layout, guard_block_ids, pattern)?;
    compute_block_checksums(layout, guard_block_ids)
}

/// Verify that guard blocks remain unchanged after transfers.
///
/// This function compares the current checksums of guard blocks against
/// their expected values. Any mismatch indicates memory corruption or
/// unintended overwrites during transfer operations.
///
/// # Arguments
/// * `layout` - The physical layout containing the guard blocks
/// * `guard_block_ids` - Block IDs to verify
/// * `expected_checksums` - Expected checksums from create_guard_blocks
///
/// # Errors
/// Returns an error if any guard block checksum has changed
pub fn verify_guard_blocks_unchanged(
    layout: &PhysicalLayout,
    guard_block_ids: &[usize],
    expected_checksums: &HashMap<usize, BlockChecksum>,
) -> Result<()> {
    let current_checksums = compute_block_checksums(layout, guard_block_ids)?;

    for &block_id in guard_block_ids {
        let expected = expected_checksums
            .get(&block_id)
            .unwrap_or_else(|| panic!("Missing expected checksum for guard block {}", block_id));
        let current = current_checksums
            .get(&block_id)
            .unwrap_or_else(|| panic!("Missing current checksum for guard block {}", block_id));

        if expected != current {
            return Err(anyhow::anyhow!(
                "Guard block {} was modified during transfer! Expected: {}, Got: {}",
                block_id,
                expected,
                current
            ));
        }
    }

    Ok(())
}

/// CUDA sleep kernel source code.
const SLEEP_KERNEL_SRC: &str = r#"
extern "C" __global__ void sleep_kernel(unsigned long long min_cycles) {
    const unsigned long long start = clock64();
    while ((clock64() - start) < min_cycles) {
        asm volatile("");
    }
}
"#;

/// A reusable CUDA sleep utility for tests.
///
/// This struct provides a simple interface to execute GPU sleep operations
/// with calibrated timing. It compiles the sleep kernel once per CUDA context
/// and caches the calibration for reuse.
///
/// The calibration is conservative (prefers longer sleep durations over shorter)
/// to ensure minimum sleep times are met.
pub struct CudaSleep {
    function: cudarc::driver::CudaFunction,
    cycles_per_ms: f64,
}

impl CudaSleep {
    /// Get or create a CudaSleep instance for the given CUDA context.
    ///
    /// This function uses lazy initialization and caches instances per device ID.
    /// The first call for each device will compile the kernel and run calibration.
    ///
    /// # Arguments
    /// * `cuda_ctx` - The CUDA context to use
    ///
    /// # Returns
    /// A shared reference to the CudaSleep instance for this context's device.
    pub fn for_context(cuda_ctx: &Arc<CudaContext>) -> Result<Arc<Self>> {
        static INSTANCES: OnceLock<parking_lot::Mutex<HashMap<usize, Arc<CudaSleep>>>> =
            OnceLock::new();

        let instances = INSTANCES.get_or_init(|| parking_lot::Mutex::new(HashMap::new()));
        let device_ordinal = cuda_ctx.ordinal();

        // Fast path: check if instance already exists
        {
            let instances_guard = instances.lock();
            if let Some(instance) = instances_guard.get(&device_ordinal) {
                return Ok(Arc::clone(instance));
            }
        }

        // Slow path: create new instance with calibration
        let instance = Arc::new(Self::new(cuda_ctx)?);

        // Store in cache
        let mut instances_guard = instances.lock();
        instances_guard
            .entry(device_ordinal)
            .or_insert_with(|| Arc::clone(&instance));

        Ok(instance)
    }

    /// Create a new CudaSleep instance with calibration.
    ///
    /// This compiles the sleep kernel and runs a calibration loop to determine
    /// the relationship between clock cycles and wall-clock time.
    fn new(cuda_ctx: &Arc<CudaContext>) -> Result<Self> {
        // Get device compute capability
        let major = cuda_ctx
            .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
        let minor = cuda_ctx
            .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;

        // Compile PTX for this device
        let mut compile_opts = CompileOptions {
            name: Some("sleep_kernel.cu".into()),
            ..Default::default()
        };
        compile_opts
            .options
            .push(format!("--gpu-architecture=compute_{}{}", major, minor));
        let ptx = compile_ptx_with_opts(SLEEP_KERNEL_SRC, compile_opts)?;
        let module = cuda_ctx.load_module(ptx)?;
        let function = module.load_function("sleep_kernel")?;

        // Get device clock rate
        let clock_rate_khz =
            cuda_ctx.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_CLOCK_RATE)? as u64;

        // Create a temporary stream for calibration
        let stream = cuda_ctx.new_stream()?;

        // Warm up to absorb JIT overhead
        let warm_cycles = clock_rate_khz.saturating_mul(10).max(1);
        Self::launch_kernel(&function, &stream, warm_cycles)?;
        stream.synchronize()?;

        // Run calibration loop
        let desired_delay = Duration::from_millis(600);
        let mut target_cycles = clock_rate_khz.saturating_mul(50).max(1); // ~50ms starting point
        let mut actual_duration = Duration::ZERO;

        for _ in 0..8 {
            let start = Instant::now();
            Self::launch_kernel(&function, &stream, target_cycles)?;
            stream.synchronize()?;
            actual_duration = start.elapsed();

            if actual_duration >= desired_delay {
                break;
            }

            target_cycles = target_cycles.saturating_mul(2);
        }

        // Calculate cycles per millisecond with conservative 20% margin
        // (prefer longer sleeps over shorter)
        let cycles_per_ms = if actual_duration.as_millis() > 0 {
            (target_cycles as f64 / actual_duration.as_millis() as f64) * 1.2
        } else {
            clock_rate_khz as f64 // Fallback to clock rate
        };

        Ok(Self {
            function,
            cycles_per_ms,
        })
    }

    /// Launch the sleep kernel with the specified number of cycles.
    fn launch_kernel(
        function: &cudarc::driver::CudaFunction,
        stream: &Arc<CudaStream>,
        cycles: u64,
    ) -> Result<()> {
        let launch_cfg = LaunchConfig {
            grid_dim: (1, 1, 1),
            block_dim: (1, 1, 1),
            shared_mem_bytes: 0,
        };

        let mut launch = stream.launch_builder(function);
        unsafe {
            launch.arg(&cycles);
            launch.launch(launch_cfg)?;
        }

        Ok(())
    }

    /// Launch a sleep operation on the given stream.
    ///
    /// This queues a GPU kernel that will sleep for approximately the specified
    /// duration. The sleep is conservative and may take longer than requested.
    ///
    /// # Arguments
    /// * `duration` - The minimum duration to sleep
    /// * `stream` - The CUDA stream to launch the kernel on
    ///
    /// # Returns
    /// Ok(()) if the kernel was successfully queued
    pub fn launch(&self, duration: Duration, stream: &Arc<CudaStream>) -> Result<()> {
        let target_cycles = (duration.as_millis() as f64 * self.cycles_per_ms) as u64;
        Self::launch_kernel(&self.function, stream, target_cycles)
    }
}