CMake build, allowing parent build (#19)

013f0c4f · Luka Govedič · GitHub · 344c988d · 344c988d · 344c988d
Unverified Commit 013f0c4f authored Sep 20, 2024 by Luka Govedič Committed by GitHub Sep 20, 2024
17 changed files
--- a/tests/models/test_llama.py
+++ b/tests/models/test_llama.py
--- a/tests/models/test_opt.py
+++ b/tests/models/test_opt.py
-import re
-import time
-import pytest
-import torch
-from einops import rearrange
-from flash_attn.models.gpt import GPTLMHeadModel
-from flash_attn.models.opt import opt_config_to_gpt2_config, remap_state_dict_hf_opt
-from flash_attn.utils.generation import update_graph_cache
-from flash_attn.utils.pretrained import state_dict_from_pretrained
-from transformers import AutoTokenizer, OPTConfig
-from transformers.models.opt.modeling_opt import OPTForCausalLM
-@pytest.mark.parametrize(
-    "model_name", ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"]
-)
-# @pytest.mark.parametrize('model_name', ["facebook/opt-350m"])
-def test_opt_state_dict(model_name):
-    config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name))
-    pretrained_state_dict = remap_state_dict_hf_opt(state_dict_from_pretrained(model_name), config)
-    model = GPTLMHeadModel(config)
-    state_dict = model.state_dict()
-    assert state_dict.keys() == pretrained_state_dict.keys()
-    for k in state_dict.keys():
-        assert state_dict[k].shape == pretrained_state_dict[k].shape
-@pytest.mark.parametrize(
-    "model_name", ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"]
-)
-# @pytest.mark.parametrize('model_name', ["facebook/opt-350m"])
-def test_opt_optimized(model_name):
-    """Check that our implementation of OPT (without all optimizations enabled) matches the
-    HF implementation: the output of our forward pass in fp16 should be around the same as the HF
-    forward pass in fp16, when compared to the HF forward pass in fp32.
-    """
-    dtype = torch.float16
-    device = "cuda"
-    config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name))
-    config.use_flash_attn = True
-    config.fused_bias_fc = True
-    config.fused_mlp = True
-    config.fused_dropout_add_ln = True
-    # Only prenorm supports residual_in_fp32
-    config.residual_in_fp32 = getattr(config, "prenorm", True)
-    config.pad_vocab_size_multiple = 8
-    model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype)
-    model_ref = OPTForCausalLM.from_pretrained(model_name).to(device=device)
-    model_hf = OPTForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device=device)
-    model.eval()
-    model_ref.eval()
-    model_hf.eval()
-    torch.manual_seed(0)
-    batch_size = 2
-    max_seqlen = 256
-    seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda")
-    input_ids = torch.randint(
-        0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda"
-    )
-    if model_name != "facebook/opt-350m":  # The OPT-350m projects the embeddings to dimension 512
-        out = model.transformer(input_ids)
-        out_hf = model_hf.model(input_ids).last_hidden_state
-        out_ref = model_ref.model(input_ids).last_hidden_state
-        print(f"Output max diff: {(out - out_ref).abs().max().item()}")
-        print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
-        print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}")
-        print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}")
-        assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item()
-    logits = model(input_ids).logits
-    logits_hf = model_hf(input_ids).logits
-    logits_ref = model_ref(input_ids).logits
-    print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}")
-    print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}")
-    print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}")
-    print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}")
-    assert (logits - logits_ref).abs().max().item() < 3 * (
-        logits_hf - logits_ref
-    ).abs().max().item()
-@pytest.mark.parametrize(
-    "model_name",
-    [
-        "facebook/opt-125m",
-        "facebook/opt-350m",
-        "facebook/opt-1.3b",
-        "facebook/opt-2.7b",
-        "facebook/opt-6.7b",
-    ],
-)
-# @pytest.mark.parametrize('model_name', ["facebook/opt-125m"])
-def test_opt_generation(model_name):
-    """Check that our implementation of OPT generation matches the HF implementation:
-    the scores in fp16 should be around the same as the HF scores in fp16, when compared to
-    the HF scores in fp32.
-    """
-    print(f"\nMODEL: {model_name}")
-    verbose = False
-    dtype = torch.float16
-    device = "cuda"
-    rtol, atol = 3e-3, 3e-1
-    config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name))
-    # Only prenorm supports residual_in_fp32
-    config.residual_in_fp32 = getattr(config, "prenorm", True)
-    config.use_flash_attn = True
-    config.fused_bias_fc = True
-    config.fused_mlp = True
-    config.fused_dropout_add_ln = True
-    model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype)
-    model.eval()
-    torch.manual_seed(0)
-    # OPT tokenizer requires use_fast=False
-    # https://huggingface.co/docs/transformers/model_doc/opt
-    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
-    eos_token_id = tokenizer.eos_token_id
-    input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to(
-        device=device
-    )
-    max_length = 25
-    # input_ids = torch.randint(0, 100, (2, 10), dtype=torch.long, device='cuda')
-    # max_length = input_ids.shape[1] + 40
-    # Slow generation for reference
-    sequences = []
-    scores = []
-    cur_input_ids = input_ids
-    with torch.inference_mode():
-        scores.append(model(cur_input_ids).logits[:, -1])
-        sequences.append(scores[-1].argmax(dim=-1))
-        for _ in range(input_ids.shape[1] + 1, max_length):
-            cur_input_ids = torch.cat([cur_input_ids, rearrange(sequences[-1], "b -> b 1")], dim=-1)
-            scores.append(model(cur_input_ids).logits[:, -1])
-            sequences.append(scores[-1].argmax(dim=-1))
-            if eos_token_id is not None and (sequences[-1] == eos_token_id).all():
-                break
-    sequences = torch.cat([input_ids, torch.stack(sequences, dim=1)], dim=1)
-    scores = tuple(scores)
-    print("Without CUDA graph")
-    torch.cuda.synchronize()
-    start = time.time()
-    out = model.generate(
-        input_ids=input_ids,
-        max_length=max_length,
-        eos_token_id=eos_token_id,
-        return_dict_in_generate=True,
-        output_scores=True,
-        enable_timing=True,
-    )
-    torch.cuda.synchronize()
-    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
-    if verbose:
-        print(out.sequences)
-    print(tokenizer.batch_decode(out.sequences.tolist()))
-    if getattr(config, "use_flash_attn", False):
-        # Capture graph outside the timing loop
-        batch_size, seqlen_og = input_ids.shape
-        model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length)
-        print("With CUDA graph")
-        torch.cuda.synchronize()
-        start = time.time()
-        out_cg = model.generate(
-            input_ids=input_ids,
-            max_length=max_length,
-            cg=True,
-            return_dict_in_generate=True,
-            output_scores=True,
-            enable_timing=True,
-        )
-        torch.cuda.synchronize()
-        print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
-        if verbose:
-            print(out_cg.sequences)
-        print(tokenizer.batch_decode(out_cg.sequences.tolist()))
-    del model
-    model_hf = OPTForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device=device)
-    model_hf.eval()
-    print("HF fp16")
-    torch.cuda.synchronize()
-    start = time.time()
-    out_hf = model_hf.generate(
-        input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True
-    )
-    torch.cuda.synchronize()
-    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
-    del model_hf
-    model_ref = OPTForCausalLM.from_pretrained(model_name).to(device=device)
-    model_ref.eval()
-    print("HF fp32")
-    torch.cuda.synchronize()
-    start = time.time()
-    out_ref = model_ref.generate(
-        input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True
-    )
-    torch.cuda.synchronize()
-    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
-    del model_ref
-    print(tokenizer.batch_decode(out_ref.sequences.tolist()))
-    if verbose:
-        print(
-            f"Scores max diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}"
-        )
-        print(
-            f"Scores mean diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}"
-        )
-        print(
-            f"HF fp16 max diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}"
-        )
-        print(
-            f"HF fp16 mean diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}"
-        )
-    assert torch.all(out.sequences == sequences)
-    assert torch.allclose(
-        torch.stack(out.scores, dim=1), torch.stack(scores, dim=1), rtol=rtol, atol=atol
-    )
-    assert torch.all(out.sequences == out_ref.sequences)
-    assert torch.all(out.sequences == out_hf.sequences)
-    assert (torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item() < 3 * (
-        torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)
-    ).abs().max().item()
--- a/tests/models/test_vit.py
+++ b/tests/models/test_vit.py
-import re
-import pytest
-import torch
-from flash_attn.models.vit import vit_base_patch16_224 as flash_vit_base_patch16_224
-from timm.models.vision_transformer import vit_base_patch16_224
-@pytest.mark.parametrize("fused_mlp", [False, True])
-# @pytest.mark.parametrize('fused_mlp', [False])
-@pytest.mark.parametrize("optimized", [False, True])
-# @pytest.mark.parametrize('optimized', [True])
-def test_vit(optimized, fused_mlp):
-    """Check that our implementation of ViT matches the timm's implementation:
-    the output of our forward pass in fp16 should be around the same as
-    timm' forward pass in fp16, when compared to timm's forward pass in fp32.
-    """
-    dtype = torch.float16
-    device = "cuda"
-    kwargs = {}
-    if optimized:
-        kwargs = dict(use_flash_attn=True, fused_bias_fc=True, fused_dropout_add_ln=True)
-    kwargs["fused_mlp"] = fused_mlp
-    model = flash_vit_base_patch16_224(**kwargs).to(device=device, dtype=dtype)
-    model_ref = vit_base_patch16_224(pretrained=True).to(device=device)
-    model_timm = vit_base_patch16_224(pretrained=True).to(device=device, dtype=dtype)
-    model.load_state_dict(model_ref.state_dict())
-    model.eval()
-    model_ref.eval()
-    model_timm.eval()
-    torch.manual_seed(0)
-    batch_size = 2
-    x = torch.randn(batch_size, 3, 224, 224, device=device, dtype=dtype)
-    out = model(x)
-    out_timm = model_timm(x)
-    out_ref = model_ref(x.float())
-    print(f"Output max diff: {(out - out_ref).abs().max().item()}")
-    print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
-    print(f"timm fp16 max diff: {(out_timm - out_ref).abs().max().item()}")
-    print(f"timm fp16 mean diff: {(out_timm - out_ref).abs().mean().item()}")
-    rtol = 2 if not fused_mlp else 8
-    assert (out - out_ref).abs().max().item() < rtol * (out_timm - out_ref).abs().max().item()
--- a/tests/modules/test_block_parallel.py
+++ b/tests/modules/test_block_parallel.py
--- a/tests/modules/test_embedding_parallel.py
+++ b/tests/modules/test_embedding_parallel.py
-# Run test with:
-# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_embedding_parallel.py
-import pytest
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from apex.transformer import parallel_state
-from einops import rearrange
-from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings
-is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8
-@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else []))
-# @pytest.mark.parametrize('dtype', [torch.bfloat16])
-@pytest.mark.parametrize("world_size", [1, 2, 4, 8])
-# @pytest.mark.parametrize('world_size', [2])
-@pytest.mark.parametrize("sequence_parallel", [True, False])
-# @pytest.mark.parametrize('sequence_parallel', [False])
-@pytest.mark.parametrize("has_pos_emb", [True, False])
-# @pytest.mark.parametrize('has_pos_emb', [True])
-@pytest.mark.parametrize("dim", [1024])
-def test_embedding_parallel(dim, has_pos_emb, sequence_parallel, world_size, dtype):
-    vocab_size = 50264
-    seqlen = 2048
-    assert vocab_size % world_size == 0
-    assert dim % world_size == 0
-    rtol, atol = (3e-3, 5e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3)
-    if not torch.distributed.is_initialized():
-        torch.distributed.init_process_group(backend="nccl", init_method="env://")
-    device = f"cuda:{torch.distributed.get_rank()}"
-    assert world_size <= torch.distributed.get_world_size()
-    parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size)
-    rank = parallel_state.get_tensor_model_parallel_rank()
-    # set seed
-    torch.random.manual_seed(0)
-    batch_size = 8
-    seqlen = 1024
-    assert (batch_size * seqlen) % world_size == 0
-    input_ids_pt = torch.randint(0, vocab_size, (batch_size, seqlen), device=device)
-    input_ids = input_ids_pt.detach().clone()
-    model_pt = GPT2Embeddings(
-        dim, vocab_size, seqlen if has_pos_emb else 0, device=device, dtype=dtype
-    )
-    model = ParallelGPT2Embeddings(
-        dim,
-        vocab_size,
-        seqlen if has_pos_emb else 0,
-        parallel_state.get_tensor_model_parallel_group(),
-        sequence_parallel=sequence_parallel,
-        device=device,
-        dtype=dtype,
-    )
-    partition_vocab_size = vocab_size // world_size
-    partition_dim = dim // world_size
-    with torch.no_grad():
-        model.word_embeddings.weight.copy_(
-            model_pt.word_embeddings.weight[
-                rank * partition_vocab_size : (rank + 1) * partition_vocab_size
-            ]
-        )
-        if has_pos_emb:
-            model.position_embeddings.weight.copy_(
-                model_pt.position_embeddings.weight[
-                    :, rank * partition_dim : (rank + 1) * partition_dim
-                ]
-            )
-    out = model(input_ids, combine_batch_seqlen_dim=True)
-    out_pt = rearrange(model_pt(input_ids), "b s d -> (b s) d")
-    partition_batch_dim = batch_size * seqlen // world_size
-    assert torch.allclose(
-        out,
-        out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim]
-        if sequence_parallel
-        else out_pt,
-        rtol=rtol,
-        atol=atol,
-    )
-    g = torch.randn_like(out_pt)
-    out_pt.backward(g)
-    out.backward(
-        g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g
-    )
-    parallel_state.destroy_model_parallel()
-    assert torch.allclose(
-        model.word_embeddings.weight.grad,
-        model_pt.word_embeddings.weight.grad[
-            rank * partition_vocab_size : (rank + 1) * partition_vocab_size
-        ],
-        rtol=rtol,
-        atol=atol,
-    )
-    if has_pos_emb:
-        assert torch.allclose(
-            model.position_embeddings.weight.grad,
-            model_pt.position_embeddings.weight.grad[
-                :, rank * partition_dim : (rank + 1) * partition_dim
-            ],
-            rtol=rtol,
-            atol=atol,
-        )
--- a/tests/modules/test_mha_parallel.py
+++ b/tests/modules/test_mha_parallel.py
--- a/tests/modules/test_mlp_parallel.py
+++ b/tests/modules/test_mlp_parallel.py
--- a/tests/ops/test_dropout_layer_norm.py
+++ b/tests/ops/test_dropout_layer_norm.py
--- a/tests/ops/test_fused_dense.py
+++ b/tests/ops/test_fused_dense.py
--- a/tests/ops/test_fused_dense_parallel.py
+++ b/tests/ops/test_fused_dense_parallel.py
--- a/tests/ops/triton/test_layer_norm.py
+++ b/tests/ops/triton/test_layer_norm.py
--- a/tests/pyproject.toml
+++ b/tests/pyproject.toml
-[tool.black]
-line-length = 100
-target-version = ['py38']
\ No newline at end of file
--- a/tests/test_flash_attn.py
+++ b/tests/test_flash_attn.py
--- a/tests/test_rotary.py
+++ b/tests/test_rotary.py
--- a/tests/test_vllm_flash_attn.py
+++ b/tests/test_vllm_flash_attn.py
--- a/vllm_flash_attn/__init__.py
+++ b/vllm_flash_attn/__init__.py
 __version__ = "2.6.2"
-from vllm_flash_attn.flash_attn_interface import (
+# Use relative import to support build-from-source installation in vLLM
+from .flash_attn_interface import (
    flash_attn_func,
    flash_attn_kvpacked_func,
    flash_attn_qkvpacked_func,

--- a/vllm_flash_attn/flash_attn_interface.py
+++ b/vllm_flash_attn/flash_attn_interface.py
@@ -7,7 +7,8 @@ import torch.nn as nn
 # isort: off
 # We need to import the CUDA kernels after importing torch
-import vllm_flash_attn_2_cuda as flash_attn_cuda
+# Use relative import to support build-from-source installation in vLLM
+from . import vllm_flash_attn_c # noqa: F401
 # isort: on
@@ -49,7 +50,7 @@ def _flash_attn_forward(
    q, k, v, dropout_p, softmax_scale, causal, window_size, softcap, alibi_slopes, return_softmax, *, out=None
 ):
    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
-    out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.fwd(
+    out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = torch.ops.vllm_flash_attn_c.fwd(
        q,
        k,
        v,
@@ -87,7 +88,7 @@ def _flash_attn_varlen_forward(
    out=None
 ):
    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
-    out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(
+    out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = torch.ops.vllm_flash_attn_c.varlen_fwd(
        q,
        k,
        v,
@@ -140,7 +141,7 @@ def _flash_attn_backward(
        dk,
        dv,
        softmax_d,
-    ) = flash_attn_cuda.bwd(
+    ) = torch.ops.vllm_flash_attn_c.bwd(
        dout,
        q,
        k,
@@ -194,7 +195,7 @@ def _flash_attn_varlen_backward(
        dk,
        dv,
        softmax_d,
-    ) = flash_attn_cuda.varlen_bwd(
+    ) = torch.ops.vllm_flash_attn_c.varlen_bwd(
        dout,
        q,
        k,
@@ -1292,7 +1293,7 @@ def flash_attn_with_kvcache(
        cache_seqlens = maybe_contiguous(cache_seqlens)
    cache_batch_idx = maybe_contiguous(cache_batch_idx)
    block_table = maybe_contiguous(block_table)
-    out, softmax_lse = flash_attn_cuda.fwd_kvcache(
+    out, softmax_lse = torch.ops.vllm_flash_attn_c.fwd_kvcache(
        q,
        k_cache,
        v_cache,