utils.py 23.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
"""
Utility functions for InfiniLM Llama model tests.

This module provides shared utility functions for tensor conversion,
parameter name normalization, and tensor comparison.
"""

from typing import Tuple, Dict, Callable, Optional, Any, List
import torch

try:
    import infinicore
except ImportError:
    infinicore = None


def normalize_param_name(name: str) -> str:
    """Return ``name`` with a single leading ``"model."`` prefix stripped.

    Names that do not start with ``"model."`` are returned unchanged.
    """
    prefix = "model."
    has_prefix = name.startswith(prefix)
    # Slice by the prefix length so only the first occurrence is dropped.
    return name[len(prefix):] if has_prefix else name


def to_infinicore_dtype(torch_dtype):
    """Translate a PyTorch dtype into the equally named ``infinicore`` dtype.

    Args:
        torch_dtype: The PyTorch dtype to translate.

    Returns:
        The attribute of the ``infinicore`` module that corresponds to
        ``torch_dtype`` (for example ``infinicore.float32``).

    Raises:
        ImportError: If the ``infinicore`` package could not be imported.
        ValueError: If ``torch_dtype`` is not one of the supported dtypes.
    """
    if infinicore is None:
        raise ImportError("InfiniCore package not found")

    # Pairs of (torch dtype, infinicore attribute name). The infinicore
    # attribute is resolved only for the entry that matches.
    dtype_table = (
        (torch.float32, "float32"),
        (torch.float16, "float16"),
        (torch.bfloat16, "bfloat16"),
        (torch.int8, "int8"),
        (torch.int16, "int16"),
        (torch.int32, "int32"),
        (torch.int64, "int64"),
        (torch.uint8, "uint8"),
        (torch.bool, "bool"),
    )
    for candidate, attr_name in dtype_table:
        if torch_dtype == candidate:
            return getattr(infinicore, attr_name)

    raise ValueError(f"Unsupported torch dtype: {torch_dtype}")


def torch_to_infinicore_tensor(torch_tensor, infini_device):
    """
    Wrap a PyTorch tensor's memory in an InfiniCore tensor.

    The InfiniCore tensor is created with ``infinicore.from_blob`` from the
    torch tensor's ``data_ptr()``, its shape and its converted dtype. The
    input is first passed through ``torch.Tensor.contiguous()``, which returns
    the tensor itself when it is already contiguous and a contiguous copy
    otherwise, so the blob handed to InfiniCore is always contiguous.

    Args:
        torch_tensor: PyTorch tensor
        infini_device: InfiniCore device object, forwarded to ``from_blob``

    Returns:
        InfiniCore tensor built over the (contiguous) torch tensor's memory

    Raises:
        ImportError: If the ``infinicore`` package could not be imported.
        ValueError: If the tensor's dtype has no InfiniCore equivalent
            (raised by ``to_infinicore_dtype``).
    """
    if infinicore is None:
        raise ImportError("InfiniCore package not found")

    # Ensure tensor is contiguous (but keep it on its current device).
    # After this call the tensor is always contiguous, so only the plain
    # from_blob path is needed.
    contiguous_tensor = torch_tensor.contiguous()

    # Convert dtype
    infini_dtype = to_infinicore_dtype(contiguous_tensor.dtype)

    # Create InfiniCore tensor from torch tensor's data pointer
    infini_tensor = infinicore.from_blob(
        contiguous_tensor.data_ptr(),
        list(contiguous_tensor.shape),
        dtype=infini_dtype,
        device=infini_device,
    )

    # from_blob only receives a raw address, and infinicore_to_torch_tensor in
    # this module relies on the result aliasing the torch tensor's memory.
    # When .contiguous() had to make a copy, that copy is referenced only by
    # the local variable above, so pin it on the returned object to keep the
    # memory valid for as long as the InfiniCore tensor is used.
    # NOTE(review): assumes the returned object accepts new attributes; if it
    # does not, behavior falls back to what it was before.
    try:
        infini_tensor._torch_backing_tensor = contiguous_tensor
    except (AttributeError, TypeError):
        pass

    return infini_tensor


def to_torch_dtype(infini_dtype):
    """Translate an InfiniCore dtype into the corresponding PyTorch dtype.

    Args:
        infini_dtype: Either an object exposing the enum value as
            ``_underlying``, or a value that is compared against the
            ``_infinicore.DataType`` members directly.

    Returns:
        The matching ``torch`` dtype.

    Raises:
        ImportError: If the ``infinicore`` package could not be imported.
        ValueError: If the dtype matches none of the supported enum members.
    """
    if infinicore is None:
        raise ImportError("InfiniCore package not found")

    # The enum lives in the compiled extension module, imported lazily here.
    from infinicore.lib import _infinicore

    # Use the wrapped enum value when there is one, else the argument itself.
    underlying = getattr(infini_dtype, "_underlying", infini_dtype)

    # Pairs of (DataType member name, torch dtype), checked in order; each
    # enum member is looked up only when its comparison is reached.
    enum_table = (
        ("F32", torch.float32),
        ("F16", torch.float16),
        ("BF16", torch.bfloat16),
        ("I8", torch.int8),
        ("I16", torch.int16),
        ("I32", torch.int32),
        ("I64", torch.int64),
        ("U8", torch.uint8),
        ("BOOL", torch.bool),
    )
    for member_name, torch_dtype in enum_table:
        if underlying == getattr(_infinicore.DataType, member_name):
            return torch_dtype

    raise ValueError(
        f"Unsupported infinicore dtype: {infini_dtype} (underlying enum: {underlying})"
    )


def infinicore_to_torch_tensor(infini_tensor, torch_reference):
    """
    Convert InfiniCore tensor to PyTorch tensor for comparison.

    A new torch tensor is allocated with the InfiniCore tensor's shape and
    dtype on ``torch_reference.device``. It is then wrapped with
    ``torch_to_infinicore_tensor`` and filled by calling ``copy_`` on that
    wrapper, so the data lands directly in the torch tensor's memory.

    Args:
        infini_tensor: InfiniCore tensor (can be raw C++ tensor or Python wrapper)
        torch_reference: PyTorch tensor reference; only its ``device`` is read

    Returns:
        PyTorch tensor with InfiniCore data on the same device as torch_reference

    Raises:
        ImportError: If the ``infinicore`` package could not be imported.
        ValueError: If the InfiniCore dtype has no PyTorch equivalent
            (raised by ``to_torch_dtype``).
        RuntimeError: On the CPU path, if the ``.to()`` transfer failed and
            the fallback copy into an intermediate buffer failed as well.
    """
    if infinicore is None:
        raise ImportError("InfiniCore package not found")

    # Wrap raw C++ tensor in Python Tensor wrapper if needed
    # get_parameter returns a raw _infinicore.Tensor, but we need infinicore.Tensor
    if not hasattr(infini_tensor, "_underlying"):
        infini_tensor = infinicore.Tensor(infini_tensor)

    # Get device from reference tensor
    ref_device = torch_reference.device

    # Determine target InfiniCore device: the reference's CUDA device, or CPU
    # for every other device type.
    if ref_device.type == "cuda":
        target_infini_device = infinicore.device("cuda", ref_device.index)
    else:
        target_infini_device = infinicore.device("cpu", 0)

    # Ensure source tensor is on the target device and contiguous.
    # This is important when GPU support is compiled - we need to explicitly
    # move tensors to the correct device and make them contiguous.
    # Devices are compared through their string form.
    try:
        if hasattr(infini_tensor, "device") and str(infini_tensor.device) != str(
            target_infini_device
        ):
            infini_tensor = infini_tensor.to(target_infini_device)
        if not infini_tensor.is_contiguous():
            infini_tensor = infini_tensor.contiguous()
    except Exception:
        # Deliberate best effort: if the device move fails, still try to make
        # the tensor contiguous; the copy paths below do their own transfer.
        if (
            hasattr(infini_tensor, "is_contiguous")
            and not infini_tensor.is_contiguous()
        ):
            infini_tensor = infini_tensor.contiguous()

    # Create a PyTorch tensor with the source's shape and dtype on the
    # reference's device; it receives the copied data.
    torch_result = torch.zeros(
        list(infini_tensor.shape),
        dtype=to_torch_dtype(infini_tensor.dtype),
        device=ref_device,
    )

    if ref_device.type == "cpu":
        # The source can still be on CUDA here if the move above failed; in
        # that case the fallback buffer is requested with pinned memory.
        source_is_cuda = hasattr(infini_tensor, "device") and str(
            infini_tensor.device
        ).startswith("cuda")

        # Try using .to() method first, which handles device transfers internally
        try:
            cpu_tensor = infini_tensor.to(target_infini_device)
            if not cpu_tensor.is_contiguous():
                cpu_tensor = cpu_tensor.contiguous()

            # Create temp tensor from PyTorch and copy from the CPU tensor
            temp_tensor = torch_to_infinicore_tensor(torch_result, target_infini_device)
            temp_tensor.copy_(cpu_tensor)
        except Exception:
            # Fallback: create an intermediate contiguous tensor on CPU and
            # copy through it.
            intermediate = infinicore.empty(
                list(infini_tensor.shape),
                dtype=infini_tensor.dtype,
                device=target_infini_device,
                pin_memory=source_is_cuda,  # Pin memory if copying from CUDA
            )

            # Copy source to intermediate first
            try:
                intermediate.copy_(infini_tensor)
            except Exception as e2:
                raise RuntimeError(
                    f"Failed to copy tensor to intermediate: {e2}"
                ) from e2

            # Now create temp tensor from PyTorch and copy from intermediate
            temp_tensor = torch_to_infinicore_tensor(torch_result, target_infini_device)
            temp_tensor.copy_(intermediate)
    else:
        # For GPU, use direct copy
        temp_tensor = torch_to_infinicore_tensor(torch_result, target_infini_device)
        temp_tensor.copy_(infini_tensor)

    return torch_result


def _max_relative_diff(abs_diff, reference, eps=1e-8):
    """Return ``max(abs_diff / max(|reference|, eps))`` as a Python float.

    The division is done in at least float32: in float16 the ``eps`` floor of
    1e-8 rounds to zero, which would turn zeros in ``reference`` into
    inf/NaN ratios. float32 and float64 inputs keep their own precision.
    """
    reference_magnitude = reference.abs()
    work_dtype = torch.promote_types(
        torch.promote_types(abs_diff.dtype, reference_magnitude.dtype),
        torch.float32,
    )
    denominator = reference_magnitude.to(work_dtype).clamp(min=eps)
    return (abs_diff.to(work_dtype) / denominator).max().item()


def tensor_all_close(
    tensor1: torch.Tensor, tensor2: torch.Tensor, rtol: float = 1e-5, atol: float = 1e-5
) -> Tuple[bool, Dict]:
    """
    Compare two tensors for approximate equality.

    Args:
        tensor1: First tensor to compare
        tensor2: Second tensor to compare (denominator of the relative diff)
        rtol: Relative tolerance (default: 1e-5)
        atol: Absolute tolerance (default: 1e-5)

    Returns:
        Tuple of (is_close, stats_dict) where stats_dict always contains:
        - max_abs_diff: Maximum absolute difference
        - mean_abs_diff: Mean absolute difference
        - max_rel_diff: Maximum relative difference
        - is_close: Boolean indicating if tensors are close
        - has_nan: Boolean indicating if either tensor has NaN
        - has_inf: Boolean indicating if either tensor has Inf
        - tensor1_has_nan / tensor1_has_inf / tensor2_has_nan /
          tensor2_has_inf: the same flags per tensor

        Special cases:
        - Shape mismatch: is_close is False, the three diff values are NaN
          and the dict also carries "error", "shape1" and "shape2".
        - Empty tensors of equal shape: is_close is True and the diffs are 0.0.
        - Any NaN/Inf present: is_close is False and the diffs are computed
          over positions where both tensors are finite (NaN if there are none).
    """
    # Check for NaN/Inf values
    tensor1_has_nan = torch.isnan(tensor1).any().item()
    tensor1_has_inf = torch.isinf(tensor1).any().item()
    tensor2_has_nan = torch.isnan(tensor2).any().item()
    tensor2_has_inf = torch.isinf(tensor2).any().item()

    has_nan = tensor1_has_nan or tensor2_has_nan
    has_inf = tensor1_has_inf or tensor2_has_inf

    # Defaults used whenever a statistic cannot be computed.
    nan = float("nan")
    max_diff = mean_diff = relative_max_diff = nan
    is_close = False
    extra = {}

    if tensor1.shape != tensor2.shape:
        extra = {
            "error": "Shape mismatch",
            "shape1": tensor1.shape,
            "shape2": tensor2.shape,
        }
    elif tensor1.numel() == 0:
        # Reductions such as max() fail on empty tensors; nothing differs.
        max_diff = mean_diff = relative_max_diff = 0.0
        is_close = True
    elif has_nan or has_inf:
        # Compute stats only on positions that are finite in both tensors;
        # is_close stays False because NaN/Inf is present.
        finite_mask = torch.isfinite(tensor1) & torch.isfinite(tensor2)
        if finite_mask.any():
            finite_diff = (tensor1 - tensor2).abs()[finite_mask]
            max_diff = finite_diff.max().item()
            mean_diff = finite_diff.mean().item()
            relative_max_diff = _max_relative_diff(finite_diff, tensor2[finite_mask])
    else:
        # Normal comparison when no NaN/Inf
        diff = (tensor1 - tensor2).abs()
        max_diff = diff.max().item()
        mean_diff = diff.mean().item()
        relative_max_diff = _max_relative_diff(diff, tensor2)
        is_close = torch.allclose(tensor1, tensor2, rtol=rtol, atol=atol)

    stats = {
        "max_abs_diff": max_diff,
        "mean_abs_diff": mean_diff,
        "max_rel_diff": relative_max_diff,
        "is_close": is_close,
        "has_nan": has_nan,
        "has_inf": has_inf,
        "tensor1_has_nan": tensor1_has_nan,
        "tensor1_has_inf": tensor1_has_inf,
        "tensor2_has_nan": tensor2_has_nan,
        "tensor2_has_inf": tensor2_has_inf,
    }
    stats.update(extra)

    return is_close, stats


def validate_infinicore_component(
    op_name: str,
    infinicore_op: Callable,
    transformers_input: torch.Tensor,
    transformers_output: torch.Tensor,
    infinicore_input: torch.Tensor,
    infinicore_output: torch.Tensor,
    infini_device: Any,
    op_kwargs: Optional[Dict[str, Any]] = None,
    tolerance: float = 1e-5,
    debug_callback: Optional[Callable] = None,
    verbose: bool = True,
) -> Dict[str, Any]:
    """
    Validate an InfiniCore component by comparing it with Transformers implementation.

    This function implements the pattern from section 9d2b:
    1. Test 1: Run InfiniCore ops with InfiniCore input (current behavior)
    2. Test 2: Run InfiniCore ops with Transformers input (eliminate input diff)
    3. Compare Test 2 output with Transformers output to verify ops implementation
    4. Compare Test 1 vs Test 2 to see impact of input difference

    Any exception raised along the way is caught, recorded under
    ``results["error"]`` and the partially filled results are returned.

    Args:
        op_name: Name of the operation (for logging)
        infinicore_op: InfiniCore operation function (e.g., F.rms_norm); called
            with the converted input as its only positional argument
        transformers_input: Input tensor from Transformers model
        transformers_output: Output tensor from Transformers model
        infinicore_input: Input tensor from InfiniLM model
        infinicore_output: Output tensor from InfiniLM model
        infini_device: InfiniCore device object
        op_kwargs: Additional keyword arguments to pass to the InfiniCore op
        tolerance: Tolerance for comparison (default: 1e-5); used as both
            rtol and atol, and as the threshold for "significant" input impact
        debug_callback: Optional callback function for detailed debugging
                       Signature: debug_callback(trans_input, infini_input, trans_output,
                                                  infini_output, test1_output, test2_output)
                       Exceptions raised by the callback are reported (when
                       verbose) and otherwise ignored.
        verbose: Whether to print detailed output (default: True)

    Returns:
        Dictionary containing validation results:
        - test1_match: Whether Test 1 output matches InfiniLM output
        - test2_match: Whether Test 2 output matches Transformers output
        - ops_correct: Whether InfiniCore ops implementation is correct (Test 2 result)
        - input_impact: "significant", "minimal", or "unknown" if not reached
        - test1_stats: Statistics for Test 1 comparison
        - test2_stats: Statistics for Test 2 comparison
        - input_diff_stats: Statistics for input difference analysis
        - error: Only present if an exception occurred; its string form
    """
    if op_kwargs is None:
        op_kwargs = {}

    # Printed in place of a statistic that a comparison could not produce
    # (for example when the compared tensors have different shapes).
    nan = float("nan")

    results = {
        "test1_match": False,
        "test2_match": False,
        "ops_correct": False,
        "input_impact": "unknown",
        "test1_stats": {},
        "test2_stats": {},
        "input_diff_stats": {},
    }

    try:
        if verbose:
            print(f"\n   Validating {op_name} with InfiniCore ops using real data...")

        # Convert inputs to InfiniCore tensors
        infini_input_tensor = torch_to_infinicore_tensor(
            infinicore_input, infini_device
        )
        trans_input_tensor = torch_to_infinicore_tensor(
            transformers_input, infini_device
        )

        # Test 1: Call InfiniCore ops with InfiniCore input (current behavior)
        if verbose:
            print("\n   Test 1: InfiniCore ops with InfiniCore input...")

        # This assumes the op takes input as first arg and kwargs
        test1_inputs = [infini_input_tensor]
        test1_output = infinicore_op(*test1_inputs, **op_kwargs)
        test1_output_torch = infinicore_to_torch_tensor(test1_output, infinicore_output)

        # Compare Test 1 with InfiniLM output
        test1_match, test1_stats = tensor_all_close(
            test1_output_torch, infinicore_output, rtol=tolerance, atol=tolerance
        )
        results["test1_match"] = test1_match
        results["test1_stats"] = test1_stats

        if verbose:
            if test1_match:
                print("      ✓ Test 1: InfiniCore ops matches InfiniLM output")
            else:
                print("      ⚠ Test 1: InfiniCore ops differs from InfiniLM output")
                # .get(): stats may lack these keys on a shape mismatch.
                print(
                    f"         Max abs diff: {test1_stats.get('max_abs_diff', nan):.15f}"
                )
                print(
                    f"         Mean abs diff: {test1_stats.get('mean_abs_diff', nan):.15f}"
                )

        # Test 2: Call InfiniCore ops with Transformers input (to eliminate input diff)
        if verbose:
            print(
                "\n   Test 2: InfiniCore ops with Transformers input (eliminating input diff)..."
            )

        test2_inputs = [trans_input_tensor]
        test2_output = infinicore_op(*test2_inputs, **op_kwargs)
        test2_output_torch = infinicore_to_torch_tensor(
            test2_output, transformers_output
        )

        # Compare Test 2 (InfiniCore ops with Transformers input) vs Transformers output
        if verbose:
            print(
                "\n   Test 2 Results: InfiniCore ops (Transformers input) vs Transformers output:"
            )

        test2_match, test2_stats = tensor_all_close(
            test2_output_torch, transformers_output, rtol=tolerance, atol=tolerance
        )
        results["test2_match"] = test2_match
        results["test2_stats"] = test2_stats
        results["ops_correct"] = test2_match

        if verbose:
            print(f"      Max abs diff: {test2_stats.get('max_abs_diff', nan):.15f}")
            print(f"      Mean abs diff: {test2_stats.get('mean_abs_diff', nan):.15f}")
            print(f"      Max rel diff: {test2_stats.get('max_rel_diff', nan):.15f}")

            if test2_match:
                print(
                    "      ✓ InfiniCore ops matches Transformers when using same input!"
                )
            else:
                print(
                    "      ⚠ InfiniCore ops still differs from Transformers even with same input"
                )
                print(
                    f"         This suggests the {op_name} computation itself differs"
                )

                # Find max diff position; skipped when the comparison reported
                # an error (shape mismatch), where an elementwise diff is not
                # meaningful.
                if "error" not in test2_stats:
                    diff = (test2_output_torch - transformers_output).abs()
                    max_diff_idx = diff.argmax()
                    max_diff_pos = torch.unravel_index(max_diff_idx, diff.shape)
                    print(f"\n      Max diff position {max_diff_pos}:")
                    print(
                        f"         Transformers: {transformers_output[max_diff_pos].item():.15f}"
                    )
                    print(
                        f"         InfiniCore ops (Trans input): {test2_output_torch[max_diff_pos].item():.15f}"
                    )
                    print(f"         Difference: {diff[max_diff_pos].item():.15f}")

        # Compare Test 1 vs Test 2 to see impact of input difference
        if verbose:
            print("\n   Comparing Test 1 vs Test 2 (impact of input difference):")

        test1_vs_test2_diff = (test1_output_torch - test2_output_torch).abs()
        test1_vs_test2_max = test1_vs_test2_diff.max().item()
        test1_vs_test2_mean = test1_vs_test2_diff.mean().item()

        results["input_diff_stats"] = {
            "max_abs_diff": test1_vs_test2_max,
            "mean_abs_diff": test1_vs_test2_mean,
        }

        if verbose:
            print(f"      Max abs diff: {test1_vs_test2_max:.15f}")
            print(f"      Mean abs diff: {test1_vs_test2_mean:.15f}")

        if test1_vs_test2_max > tolerance:
            results["input_impact"] = "significant"
            if verbose:
                print("      ⚠ Input difference causes significant output difference")
        else:
            results["input_impact"] = "minimal"
            if verbose:
                print("      ✓ Input difference has minimal impact on output")

        # Compare input data between Transformers and InfiniCore
        if verbose:
            print("\n   Comparing input data (Transformers vs InfiniCore):")

        input_diff = (transformers_input - infinicore_input).abs()
        input_diff_max = input_diff.max().item()
        input_diff_mean = input_diff.mean().item()

        results["input_diff_stats"]["input_max_diff"] = input_diff_max
        results["input_diff_stats"]["input_mean_diff"] = input_diff_mean

        if verbose:
            print(
                f"   Input diff stats: min={input_diff.min().item():.15f}, "
                f"max={input_diff_max:.15f}, mean={input_diff_mean:.15f}"
            )

            # Fixed 1e-6 threshold for reporting the worst input position
            # (independent of ``tolerance``).
            if input_diff_max > 1e-6:
                max_input_diff_idx = input_diff.argmax()
                max_input_diff_pos = torch.unravel_index(
                    max_input_diff_idx, input_diff.shape
                )
                print(f"   ⚠ Max input diff at position {max_input_diff_pos}:")
                print(
                    f"      Transformers: {transformers_input[max_input_diff_pos].item():.15f}"
                )
                print(
                    f"      InfiniCore: {infinicore_input[max_input_diff_pos].item():.15f}"
                )
                print(f"      Difference: {input_diff[max_input_diff_pos].item():.15f}")
            else:
                print("   ✓ Input data matches (within tolerance)")

        # Call debug callback if provided; a failing callback must not
        # invalidate the results gathered so far.
        if debug_callback is not None:
            try:
                debug_callback(
                    transformers_input,
                    infinicore_input,
                    transformers_output,
                    infinicore_output,
                    test1_output_torch,
                    test2_output_torch,
                )
            except Exception as e:
                if verbose:
                    print(f"   ⚠ Debug callback failed: {e}")

        # Summary
        if verbose:
            print("\n   Summary:")
            print(
                f"      Test 1 (InfiniCore input): {'✓ PASS' if test1_match else '✗ FAIL'}"
            )
            print(
                f"      Test 2 (Transformers input): {'✓ PASS' if test2_match else '✗ FAIL'}"
            )
            print(
                f"      InfiniCore ops correctness: {'✓ CORRECT' if results['ops_correct'] else '✗ INCORRECT'}"
            )
            print(f"      Input impact: {results['input_impact']}")

    except Exception as e:
        # Top-level boundary: report the failure and hand back whatever was
        # collected before it, plus the error text.
        if verbose:
            print(f"   ✗ Validation failed with exception: {e}")
            import traceback

            traceback.print_exc()
        results["error"] = str(e)

    return results