Commit d99506f3 authored by chenzk

v1.0.1

parent 61e92904
Pipeline #2033 canceled
openwebtext-10k @ 152771d7
Subproject commit 152771d7ae284673c3ad7ffdd9b3afc2741f1d00
import torch
from nanotron.fp8 import DTypes, FP8Parameter, FP8Tensor
from nanotron.fp8.meta import FP8Meta


def test_create_fp8_parameter():
    # TODO(xrsrke): test the FP8E5M2 format
    # TODO(xrsrke): test taking a CPU tensor
    tensor = torch.randn(16, 16, device="cuda", dtype=torch.float32)

    fp8_parameter = FP8Parameter(tensor, DTypes.FP8E4M3)

    assert isinstance(fp8_parameter.data, FP8Tensor)
    assert fp8_parameter.requires_grad is True
    assert fp8_parameter.grad is None
    assert isinstance(fp8_parameter.fp8_meta, FP8Meta)
    assert isinstance(fp8_parameter.data.fp8_meta, FP8Meta)


# TODO(xrsrke): add a test that prevents torch autograd from running the backward pass
# on an FP8Parameter
import pytest
import torch
from nanotron.fp8 import DTypes, FP8Linear, FP8Parameter, FP8Tensor
from torch import nn
from torch.optim import Adam


@pytest.mark.parametrize("is_bias", [True, False])
def test_fp8_linear_forward_pass(is_bias):
    input = torch.randn(16, 16, device="cuda", dtype=torch.float32)
    ref_input = input.detach().clone()
    ref_linear = nn.Linear(16, 16, bias=is_bias, device="cuda", dtype=torch.float32)

    fp8_linear = FP8Linear(16, 16, bias=is_bias, device="cuda:0")
    fp8_linear.weight = FP8Parameter(ref_linear.weight.detach().clone(), DTypes.FP8E4M3)
    if is_bias:
        fp8_linear.bias.data = ref_linear.bias.detach().clone()

    ref_output = ref_linear(ref_input)
    output = fp8_linear(input)

    assert isinstance(output, torch.Tensor)
    assert output.dtype == torch.float32
    assert torch.allclose(output, ref_output, rtol=0, atol=0.1)


# TODO(xrsrke): add cases where the input does and does not require grad
@pytest.mark.parametrize("input_requires_grad", [True, False])
@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
def test_fp8_linear_backward_pass(input_requires_grad, device):
    input = torch.randn(16, 16, device=device, dtype=torch.float32, requires_grad=input_requires_grad)
    ref_input = input.detach().clone().requires_grad_(True)
    ref_linear = nn.Linear(16, 16, device=device, dtype=torch.float32)

    fp8_linear = FP8Linear(16, 16, device=device)
    if device == "cpu":
        fp8_linear.weight.data = ref_linear.weight.detach().clone()
    else:
        fp8_linear.weight.data = FP8Tensor(ref_linear.weight.detach().clone(), dtype=DTypes.FP8E4M3)
    fp8_linear.bias.data = ref_linear.bias.detach().clone()

    ref_linear(ref_input).sum().backward()
    fp8_linear(input).sum().backward()

    # TODO(xrsrke): investigate why input.grad needs such a high tolerance
    # assert torch.allclose(input.grad, ref_input.grad, 0.2, 0.2) if input_requires_grad else True
    assert torch.allclose(fp8_linear.weight.grad, ref_linear.weight.grad, 0.1, 0.1)
    assert torch.allclose(fp8_linear.bias.grad, ref_linear.bias.grad, 0, 0.1)


# TODO(xrsrke): test that FP8Linear has all the methods of a torch.nn.Linear
def test_fp8_linear_attrs():
    fp8_linear = FP8Linear(16, 16, device="cuda:0")

    assert next(fp8_linear.parameters()) is not None
    assert all(p.requires_grad for p in fp8_linear.parameters()) is True


# TODO(xrsrke): test computing gradients only for the weight, bias, or input,
# depending on which of them has requires_grad set
def test_fp8_model_bwd():
    HIDDEN_SIZE = 128
    N_LAYERS = 5
    N_EPOCHS = 3

    input = torch.randn(HIDDEN_SIZE, HIDDEN_SIZE, device="cuda", requires_grad=True)
    model = nn.Sequential(
        *[nn.Sequential(FP8Linear(HIDDEN_SIZE, HIDDEN_SIZE, device="cuda"), nn.ReLU()) for _ in range(N_LAYERS)]
    )
    optim = Adam(model.parameters(), lr=1e-3)

    for _ in range(N_EPOCHS):
        optim.zero_grad()
        model(input).sum().backward()
        optim.step()

    assert all(p.grad is not None for p in model.parameters())
from copy import deepcopy

import numpy as np
import pytest
import torch
import transformer_engine as te  # noqa
import transformer_engine_extensions as tex
from nanotron.fp8 import DTypes, FP8Tensor
from nanotron.fp8.meta import FP8Meta
from nanotron.fp8.tensor import convert_tensor_from_fp8


@pytest.mark.parametrize("size", [4, 8, 16, 64])
def test_quantize_and_dequantize_tensor_in_fp8(size):
    tensor = torch.randn((size, size), dtype=torch.float32, device="cuda")
    ref_tensor = deepcopy(tensor)

    fp8_tensor = FP8Tensor(tensor, dtype=DTypes.FP8E4M3)

    assert isinstance(fp8_tensor, FP8Tensor)
    assert isinstance(fp8_tensor.fp8_meta, FP8Meta)
    assert fp8_tensor.device == ref_tensor.device
    assert fp8_tensor.dtype == torch.uint8
    assert fp8_tensor.shape == ref_tensor.shape
    assert fp8_tensor.numel() == ref_tensor.numel()
    assert not np.array_equal(fp8_tensor.cpu().numpy(), ref_tensor.cpu().numpy())

    # TODO(xrsrke): remove the hard-coded factor of 1; it couples with the current
    # FP8Meta implementation because we initialize the scale to 1
    assert fp8_tensor.fp8_meta.amax == ref_tensor.abs().max()
    assert isinstance(fp8_tensor.fp8_meta.inverse_scale, torch.Tensor)
    assert fp8_tensor.fp8_meta.scale != 0.1 and fp8_tensor.fp8_meta.scale != 1
    assert isinstance(fp8_tensor.fp8_meta.te_dtype, tex.DType)

    tensor = convert_tensor_from_fp8(fp8_tensor, fp8_tensor.fp8_meta, torch.float32)

    assert isinstance(tensor, torch.Tensor)
    assert tensor.dtype == ref_tensor.dtype
    assert torch.allclose(tensor, ref_tensor, rtol=1e-1, atol=1e-1)


def test_fp8_tensor_attrs():
    SIZE = 64
    tensor = torch.randn((SIZE, SIZE), dtype=torch.float32, device="cuda:0")
    ref_tensor = tensor.detach().clone()

    fp8_tensor = FP8Tensor(tensor, DTypes.FP8E4M3)

    assert isinstance(fp8_tensor, FP8Tensor)
    assert isinstance(fp8_tensor.fp8_meta, FP8Meta)
    assert fp8_tensor.device == ref_tensor.device
    assert fp8_tensor.dtype == torch.uint8
    assert fp8_tensor.shape == ref_tensor.shape
    assert fp8_tensor.numel() == ref_tensor.numel()
    assert fp8_tensor.device == ref_tensor.device


# TODO(xrsrke): test that it has all the methods of torch.Tensor
# TODO(xrsrke): test that it has all the attributes of its input tensor
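As a rough mental model for what the quantize/dequantize round trip above exercises (a hedged sketch, not nanotron's actual FP8Tensor/FP8Meta implementation): per-tensor FP8 scaling typically derives a scale from the tensor's amax so that its largest magnitude lands near the format's maximum finite value (448 for E4M3), and dequantization multiplies by the inverse scale. The names FP8E4M3_MAX, quantize, and dequantize below are hypothetical helpers for illustration only.

import torch

FP8E4M3_MAX = 448.0  # largest finite value representable in the E4M3 format


def quantize(tensor: torch.Tensor):
    # hypothetical per-tensor scaling; a real kernel would also round/cast to 8 bits
    amax = tensor.abs().max()
    scale = FP8E4M3_MAX / amax
    return tensor * scale, scale


def dequantize(scaled: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # apply the inverse scale to recover an approximation of the original values
    return scaled / scale


x = torch.randn(16, 16)
scaled, scale = quantize(x)
assert torch.allclose(dequantize(scaled, scale), x, rtol=1e-1, atol=1e-1)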
import shutil
import uuid
from functools import lru_cache
from pathlib import Path


class TestContext:
    def __init__(self):
        self._random_string = str(uuid.uuid1())
        self._root_dir = Path(__file__).parent.parent / ".test_cache"
        self._root_dir.mkdir(parents=True, exist_ok=True)

    @lru_cache(maxsize=1)
    def get_auto_remove_tmp_dir(self):
        path = self._root_dir / self._random_string
        path.mkdir(parents=True, exist_ok=True)
        return path

    def __del__(self):
        path = self.get_auto_remove_tmp_dir()
        shutil.rmtree(path)
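For reference, a hypothetical use of the TestContext helper above (not taken from this commit): get_auto_remove_tmp_dir returns the same unique per-instance directory under .test_cache on every call because of lru_cache, and __del__ removes it once the context is garbage-collected.

def test_write_artifact_to_tmp_dir():
    test_context = TestContext()
    tmp_dir = test_context.get_auto_remove_tmp_dir()  # same path on every call for this instance
    (tmp_dir / "checkpoint.txt").write_text("dummy")
    assert (tmp_dir / "checkpoint.txt").exists()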