Commit d99506f3 authored by chenzk

v1.0.1

parent 61e92904
Pipeline #2033 canceled
openwebtext-10k @ 152771d7
Subproject commit 152771d7ae284673c3ad7ffdd9b3afc2741f1d00
import torch
from nanotron.fp8 import DTypes, FP8Parameter, FP8Tensor
from nanotron.fp8.meta import FP8Meta


def test_create_fp8_parameter():
    # TODO(xrsrke): test the FP8E5M2 format
    # TODO(xrsrke): test taking a CPU tensor
    tensor = torch.randn(16, 16, device="cuda", dtype=torch.float32)

    fp8_parameter = FP8Parameter(tensor, DTypes.FP8E4M3)

    assert isinstance(fp8_parameter.data, FP8Tensor)
    assert fp8_parameter.requires_grad is True
    assert fp8_parameter.grad is None
    assert isinstance(fp8_parameter.fp8_meta, FP8Meta)
    assert isinstance(fp8_parameter.data.fp8_meta, FP8Meta)


# TODO(xrsrke): add a test that prevents torch autograd from running the backward pass
# on an FP8Parameter
import pytest
import torch
from nanotron.fp8 import DTypes, FP8Linear, FP8Parameter, FP8Tensor
from torch import nn
from torch.optim import Adam


@pytest.mark.parametrize("is_bias", [True, False])
def test_fp8_linear_forward_pass(is_bias):
    input = torch.randn(16, 16, device="cuda", dtype=torch.float32)
    ref_input = input.detach().clone()
    ref_linear = nn.Linear(16, 16, bias=is_bias, device="cuda", dtype=torch.float32)

    fp8_linear = FP8Linear(16, 16, bias=is_bias, device="cuda:0")
    fp8_linear.weight = FP8Parameter(ref_linear.weight.detach().clone(), DTypes.FP8E4M3)
    if is_bias:
        fp8_linear.bias.data = ref_linear.bias.detach().clone()

    ref_output = ref_linear(ref_input)
    output = fp8_linear(input)

    assert isinstance(output, torch.Tensor)
    assert output.dtype == torch.float32
    assert torch.allclose(output, ref_output, rtol=0, atol=0.1)


# TODO(xrsrke): add cases where the input does and does not require grad
@pytest.mark.parametrize("input_requires_grad", [True, False])
@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
def test_fp8_linear_backward_pass(input_requires_grad, device):
    input = torch.randn(16, 16, device=device, dtype=torch.float32, requires_grad=input_requires_grad)
    ref_input = input.detach().clone().requires_grad_(True)
    ref_linear = nn.Linear(16, 16, device=device, dtype=torch.float32)

    fp8_linear = FP8Linear(16, 16, device=device)
    if device == "cpu":
        fp8_linear.weight.data = ref_linear.weight.detach().clone()
    else:
        fp8_linear.weight.data = FP8Tensor(ref_linear.weight.detach().clone(), dtype=DTypes.FP8E4M3)
    fp8_linear.bias.data = ref_linear.bias.detach().clone()

    ref_linear(ref_input).sum().backward()
    fp8_linear(input).sum().backward()

    # TODO(xrsrke): investigate why input.grad needs such a high tolerance
    # assert torch.allclose(input.grad, ref_input.grad, 0.2, 0.2) if input_requires_grad else True
    assert torch.allclose(fp8_linear.weight.grad, ref_linear.weight.grad, 0.1, 0.1)
    assert torch.allclose(fp8_linear.bias.grad, ref_linear.bias.grad, 0, 0.1)


# TODO(xrsrke): test that FP8Linear has all the methods of a torch.nn.Linear
def test_fp8_linear_attrs():
    fp8_linear = FP8Linear(16, 16, device="cuda:0")

    assert next(fp8_linear.parameters()) is not None
    assert all(p.requires_grad for p in fp8_linear.parameters()) is True


# TODO(xrsrke): test computing gradients only for the weight, bias, or input,
# depending on which of them has requires_grad set
def test_fp8_model_bwd():
    HIDDEN_SIZE = 128
    N_LAYERS = 5
    N_EPOCHS = 3

    input = torch.randn(HIDDEN_SIZE, HIDDEN_SIZE, device="cuda", requires_grad=True)
    model = nn.Sequential(
        *[nn.Sequential(FP8Linear(HIDDEN_SIZE, HIDDEN_SIZE, device="cuda"), nn.ReLU()) for _ in range(N_LAYERS)]
    )
    optim = Adam(model.parameters(), lr=1e-3)

    for _ in range(N_EPOCHS):
        optim.zero_grad()
        model(input).sum().backward()
        optim.step()

    assert all(p.grad is not None for p in model.parameters())
from copy import deepcopy

import numpy as np
import pytest
import torch
import transformer_engine as te  # noqa
import transformer_engine_extensions as tex
from nanotron.fp8 import DTypes, FP8Tensor
from nanotron.fp8.meta import FP8Meta
from nanotron.fp8.tensor import convert_tensor_from_fp8


@pytest.mark.parametrize("size", [4, 8, 16, 64])
def test_quantize_and_dequantize_tensor_in_fp8(size):
    tensor = torch.randn((size, size), dtype=torch.float32, device="cuda")
    ref_tensor = deepcopy(tensor)

    fp8_tensor = FP8Tensor(tensor, dtype=DTypes.FP8E4M3)

    assert isinstance(fp8_tensor, FP8Tensor)
    assert isinstance(fp8_tensor.fp8_meta, FP8Meta)
    assert fp8_tensor.device == ref_tensor.device
    assert fp8_tensor.dtype == torch.uint8
    assert fp8_tensor.shape == ref_tensor.shape
    assert fp8_tensor.numel() == ref_tensor.numel()
    assert not np.array_equal(fp8_tensor.cpu().numpy(), ref_tensor.cpu().numpy())

    # TODO(xrsrke): remove the hard-coded factor of 1; it couples with the current
    # FP8Meta implementation because we initialize the scale to 1
    assert fp8_tensor.fp8_meta.amax == ref_tensor.abs().max()
    assert isinstance(fp8_tensor.fp8_meta.inverse_scale, torch.Tensor)
    assert fp8_tensor.fp8_meta.scale != 0.1 and fp8_tensor.fp8_meta.scale != 1
    assert isinstance(fp8_tensor.fp8_meta.te_dtype, tex.DType)

    tensor = convert_tensor_from_fp8(fp8_tensor, fp8_tensor.fp8_meta, torch.float32)

    assert isinstance(tensor, torch.Tensor)
    assert tensor.dtype == ref_tensor.dtype
    assert torch.allclose(tensor, ref_tensor, rtol=1e-1, atol=1e-1)


def test_fp8_tensor_attrs():
    SIZE = 64
    tensor = torch.randn((SIZE, SIZE), dtype=torch.float32, device="cuda:0")
    ref_tensor = tensor.detach().clone()

    fp8_tensor = FP8Tensor(tensor, DTypes.FP8E4M3)

    assert isinstance(fp8_tensor, FP8Tensor)
    assert isinstance(fp8_tensor.fp8_meta, FP8Meta)
    assert fp8_tensor.device == ref_tensor.device
    assert fp8_tensor.dtype == torch.uint8
    assert fp8_tensor.shape == ref_tensor.shape
    assert fp8_tensor.numel() == ref_tensor.numel()
    assert fp8_tensor.device == ref_tensor.device


# TODO(xrsrke): test that it has all the methods of torch.Tensor
# TODO(xrsrke): test that it has all the attributes of its input tensor
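As a rough mental model for what the quantize/dequantize round trip above exercises (a hedged sketch, not nanotron's actual FP8Tensor/FP8Meta implementation): per-tensor FP8 scaling typically derives a scale from the tensor's amax so that its largest magnitude lands near the format's maximum finite value (448 for E4M3), and dequantization multiplies by the inverse scale. The names FP8E4M3_MAX, quantize, and dequantize below are hypothetical helpers for illustration only.

import torch

FP8E4M3_MAX = 448.0  # largest finite value representable in the E4M3 format


def quantize(tensor: torch.Tensor):
    # hypothetical per-tensor scaling; a real kernel would also round/cast to 8 bits
    amax = tensor.abs().max()
    scale = FP8E4M3_MAX / amax
    return tensor * scale, scale


def dequantize(scaled: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # apply the inverse scale to recover an approximation of the original values
    return scaled / scale


x = torch.randn(16, 16)
scaled, scale = quantize(x)
assert torch.allclose(dequantize(scaled, scale), x, rtol=1e-1, atol=1e-1)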
import shutil
import uuid
from functools import lru_cache
from pathlib import Path


class TestContext:
    def __init__(self):
        self._random_string = str(uuid.uuid1())
        self._root_dir = Path(__file__).parent.parent / ".test_cache"
        self._root_dir.mkdir(parents=True, exist_ok=True)

    @lru_cache(maxsize=1)
    def get_auto_remove_tmp_dir(self):
        path = self._root_dir / self._random_string
        path.mkdir(parents=True, exist_ok=True)
        return path

    def __del__(self):
        path = self.get_auto_remove_tmp_dir()
        shutil.rmtree(path)
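For reference, a hypothetical use of the TestContext helper above (not taken from this commit): get_auto_remove_tmp_dir returns the same unique per-instance directory under .test_cache on every call because of lru_cache, and __del__ removes it once the context is garbage-collected.

def test_write_artifact_to_tmp_dir():
    test_context = TestContext()
    tmp_dir = test_context.get_auto_remove_tmp_dir()  # same path on every call for this instance
    (tmp_dir / "checkpoint.txt").write_text("dummy")
    assert (tmp_dir / "checkpoint.txt").exists()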