Unverified commit 0b98ba15 authored by Woosuk Kwon, committed by GitHub

Change the name to vLLM (#150)

parent e5464ee4
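This commit is a mechanical rename: the `cacheflow` Python package becomes `vllm`, and the attention layer classes `GPTCacheFlowAttention` and `GPTNeoXCacheFlowAttention` become `PagedAttention` and `PagedAttentionWithRoPE`. A minimal sketch of how downstream imports map under the new name, using only module paths that appear in the hunks below (grouping several imports into one statement is illustrative):

```python
# Before: from cacheflow import attention_ops, cache_ops, pos_encoding_ops
from vllm import attention_ops, cache_ops, pos_encoding_ops

# Before: from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
# The classes are renamed along with the package.
from vllm.model_executor.layers.attention import PagedAttention, PagedAttentionWithRoPE

from vllm.sampling_params import SamplingParams
from vllm.sequence import SequenceOutputs
```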
......@@ -2,7 +2,7 @@
import torch
import torch.nn as nn
from cacheflow import activation_ops
from vllm import activation_ops
_ACTIVATION_REGISTRY = {
"gelu": nn.GELU(),
......
......@@ -5,16 +5,16 @@ import torch
import torch.nn as nn
from xformers import ops as xops
from cacheflow import attention_ops
from cacheflow import cache_ops
from cacheflow import pos_encoding_ops
from cacheflow.model_executor.input_metadata import InputMetadata
from vllm import attention_ops
from vllm import cache_ops
from vllm import pos_encoding_ops
from vllm.model_executor.input_metadata import InputMetadata
_SUPPORTED_HEAD_SIZES = [64, 80, 96, 128]
class GPTCacheFlowAttention(nn.Module):
"""GPT-style multi-head attention.
class PagedAttention(nn.Module):
"""GPT-style multi-head PagedAttention.
This class takes flattened 1D query, key, and value tensors as input. The
input 1D tensors can be split into three parts: the prompt tokens, the
......@@ -164,8 +164,8 @@ class GPTCacheFlowAttention(nn.Module):
return output.view(-1, self.num_heads * self.head_size)
class GPTNeoXCacheFlowAttention(GPTCacheFlowAttention):
"""Attention with GPT-NeoX style rotary embedding."""
class PagedAttentionWithRoPE(PagedAttention):
"""PagedAttention with GPT-NeoX style rotary embedding."""
def __init__(
self,
......
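The renamed classes keep the constructor signatures of their CacheFlow predecessors, as the model hunks further down in this diff show. A minimal sketch of constructing them under the new names (argument values are illustrative, and it assumes the `vllm` package from this commit is installed with its compiled ops):

```python
from vllm.model_executor.layers.attention import (PagedAttention,
                                                  PagedAttentionWithRoPE)

num_heads, head_size = 12, 64   # illustrative; 64 is in _SUPPORTED_HEAD_SIZES
scale = head_size ** -0.5

# GPT-2 / OPT style: plain multi-head attention over the paged KV cache.
attn = PagedAttention(num_heads, head_size, scale=scale)

# GPT-NeoX / LLaMA style: the same layer with rotary position embeddings.
rope_attn = PagedAttentionWithRoPE(num_heads, head_size, scale,
                                   rotary_dim=head_size)
```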
......@@ -2,7 +2,7 @@
import torch
import torch.nn as nn
from cacheflow import layernorm_ops
from vllm import layernorm_ops
class RMSNorm(nn.Module):
......
......@@ -5,11 +5,11 @@ import numpy as np
import torch
import torch.nn as nn
from cacheflow.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.parallel_utils.tensor_parallel import (
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.parallel_utils.tensor_parallel import (
gather_from_tensor_model_parallel_region)
from cacheflow.sampling_params import SamplingParams
from cacheflow.sequence import SequenceOutputs
from vllm.sampling_params import SamplingParams
from vllm.sequence import SequenceOutputs
class Sampler(nn.Module):
......
......@@ -5,10 +5,10 @@ import torch
import torch.nn as nn
from transformers import PretrainedConfig
from cacheflow.config import ModelConfig
from cacheflow.model_executor.models import (
GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
from cacheflow.model_executor.weight_utils import initialize_dummy_weights
from vllm.config import ModelConfig
from vllm.model_executor.models import (GPT2LMHeadModel, GPTNeoXForCausalLM,
LlamaForCausalLM, OPTForCausalLM)
from vllm.model_executor.weight_utils import initialize_dummy_weights
# TODO(woosuk): Lazy-load the model classes.
_MODEL_REGISTRY = {
......
from cacheflow.model_executor.models.gpt_neox import GPTNeoXForCausalLM
from cacheflow.model_executor.models.gpt2 import GPT2LMHeadModel
from cacheflow.model_executor.models.llama import LlamaForCausalLM
from cacheflow.model_executor.models.opt import OPTForCausalLM
from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.model_executor.models.opt import OPTForCausalLM
__all__ = [
......
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
......@@ -26,17 +26,17 @@ import torch
from torch import nn
from transformers import GPT2Config
from cacheflow.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.layers.activation import get_act_fn
from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
from cacheflow.model_executor.layers.sampler import Sampler
from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights)
from cacheflow.model_executor.parallel_utils.parallel_state import (
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from cacheflow.model_executor.parallel_utils.tensor_parallel import (
from vllm.model_executor.parallel_utils.tensor_parallel import (
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
from cacheflow.sequence import SequenceOutputs
from vllm.sequence import SequenceOutputs
KVCache = Tuple[torch.Tensor, torch.Tensor]
......@@ -53,14 +53,14 @@ class GPT2Attention(nn.Module):
self.head_dim = self.hidden_size // total_num_heads
self.scale = self.head_dim ** -0.5
self.c_attn = ColumnParallelLinear(self.hidden_size, 3 * self.hidden_size, bias=True,
gather_output=False,
self.c_attn = ColumnParallelLinear(self.hidden_size, 3 * self.hidden_size,
bias=True, gather_output=False,
perform_initialization=False)
self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size, bias=True,
input_is_parallel=True,
self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size,
bias=True, input_is_parallel=True,
perform_initialization=False)
self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim,
scale=self.scale)
self.attn = PagedAttention(self.num_heads, self.head_dim,
scale=self.scale)
def forward(
self,
......
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -25,17 +25,17 @@ import torch
from torch import nn
from transformers import GPTNeoXConfig
from cacheflow.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.layers.activation import get_act_fn
from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
from cacheflow.model_executor.layers.sampler import Sampler
from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights)
from cacheflow.model_executor.parallel_utils.parallel_state import (
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from cacheflow.model_executor.parallel_utils.tensor_parallel import (
from vllm.model_executor.parallel_utils.tensor_parallel import (
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
from cacheflow.sequence import SequenceOutputs
from vllm.sequence import SequenceOutputs
KVCache = Tuple[torch.Tensor, torch.Tensor]
......@@ -63,8 +63,8 @@ class GPTNeoXAttention(nn.Module):
scaling = self.head_size ** -0.5
rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0
self.attn = GPTNeoXCacheFlowAttention(self.num_heads, self.head_size,
scaling, rotary_dim)
self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_size,
scaling, rotary_dim)
def forward(
self,
......@@ -149,6 +149,7 @@ class GPTNeoXLayer(nn.Module):
class GPTNeoXModel(nn.Module):
def __init__(self, config: GPTNeoXConfig):
super().__init__()
self.config = config
......
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
......@@ -30,19 +30,19 @@ import torch
from torch import nn
from transformers import LlamaConfig
from cacheflow.sequence import SequenceOutputs
from cacheflow.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.layers.activation import SiluAndMul
from cacheflow.model_executor.layers.layernorm import RMSNorm
from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
from cacheflow.model_executor.layers.sampler import Sampler
from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights)
from cacheflow.model_executor.parallel_utils.parallel_state import (
from vllm.sequence import SequenceOutputs
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from cacheflow.model_executor.parallel_utils.tensor_parallel import (
from vllm.model_executor.parallel_utils.tensor_parallel import (
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
from cacheflow.sequence import SequenceOutputs
from vllm.sequence import SequenceOutputs
KVCache = Tuple[torch.Tensor, torch.Tensor]
......@@ -104,8 +104,8 @@ class LlamaAttention(nn.Module):
input_is_parallel=True,
perform_initialization=False,
)
self.attn = GPTNeoXCacheFlowAttention(self.num_heads, self.head_dim,
self.scaling, rotary_dim=self.head_dim)
self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_dim,
self.scaling, rotary_dim=self.head_dim)
def forward(
self,
......
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -25,17 +25,17 @@ import torch
from torch import nn
from transformers import OPTConfig
from cacheflow.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.layers.activation import get_act_fn
from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
from cacheflow.model_executor.layers.sampler import Sampler
from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights)
from cacheflow.model_executor.parallel_utils.parallel_state import (
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from cacheflow.model_executor.parallel_utils.tensor_parallel import (
from vllm.model_executor.parallel_utils.tensor_parallel import (
VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
from cacheflow.sequence import SequenceOutputs
from vllm.sequence import SequenceOutputs
KVCache = Tuple[torch.Tensor, torch.Tensor]
......@@ -75,8 +75,8 @@ class OPTAttention(nn.Module):
self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
input_is_parallel=True,
perform_initialization=False)
self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim,
scale=self.scaling)
self.attn = PagedAttention(self.num_heads, self.head_dim,
scale=self.scaling)
def forward(
self,
......
import cacheflow.model_executor.parallel_utils.parallel_state
import cacheflow.model_executor.parallel_utils.tensor_parallel
import vllm.model_executor.parallel_utils.parallel_state
import vllm.model_executor.parallel_utils.tensor_parallel
# Alias parallel_state as mpu, its legacy name
mpu = parallel_state
......
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
......
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
......@@ -11,7 +11,7 @@ import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.parameter import Parameter
from cacheflow.model_executor.parallel_utils.parallel_state import (
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
get_all_reduce_launcher,
......
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/mappings.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import torch
from cacheflow.model_executor.parallel_utils.parallel_state import (
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
get_tensor_model_parallel_group,
......
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/random.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
......@@ -11,7 +11,7 @@ import torch
from torch import _C
from torch.cuda import _lazy_call, device as device_ctx_manager
from cacheflow.model_executor.parallel_utils.parallel_state import (
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank,
)
......
# Copyright 2023 The CacheFlow team.
# Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
......
......@@ -4,8 +4,8 @@ import random
import numpy as np
import torch
from cacheflow.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
from cacheflow.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
from vllm.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
from vllm.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
def set_random_seed(seed: int) -> None:
......