Unverified commit 0b98ba15, authored by Woosuk Kwon, committed by GitHub

Change the name to vLLM (#150)

parent e5464ee4
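This commit is a mechanical package rename from cacheflow to vllm, plus renaming the attention classes to PagedAttention and PagedAttentionWithRoPE. For illustration only, a minimal sketch (not the actual migration script used here) of how such a rename can be applied across a Python source tree; the package directory itself would also have to be moved (cacheflow/ -> vllm/):

# Sketch: rewrite every occurrence of the old package name in .py files.
import re
from pathlib import Path

OLD, NEW = "cacheflow", "vllm"
PATTERN = re.compile(rf"\b{OLD}\b")  # whole-word match only

def rename_package(root: str) -> None:
    for path in Path(root).rglob("*.py"):
        text = path.read_text()
        updated = PATTERN.sub(NEW, text)
        if updated != text:
            path.write_text(updated)

# rename_package(".")  # run from the repository root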
@@ -2,7 +2,7 @@
 import torch
 import torch.nn as nn
-from cacheflow import activation_ops
+from vllm import activation_ops

 _ACTIVATION_REGISTRY = {
     "gelu": nn.GELU(),
...
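Only the custom-ops import changes here; the registry pattern is untouched. A self-contained sketch of the registry-plus-lookup idiom it supports (a get_act_fn helper appears later in this diff; the real implementation may differ in details such as supported names and error handling):

import torch.nn as nn

_ACTIVATION_REGISTRY = {
    "gelu": nn.GELU(),
    "relu": nn.ReLU(),  # illustrative extra entry, not from the diff
}

def get_act_fn(act_fn_name: str) -> nn.Module:
    # Return the activation module registered under the given name.
    if act_fn_name not in _ACTIVATION_REGISTRY:
        raise ValueError(f"Unsupported activation: {act_fn_name!r}")
    return _ACTIVATION_REGISTRY[act_fn_name]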
@@ -5,16 +5,16 @@ import torch
 import torch.nn as nn
 from xformers import ops as xops

-from cacheflow import attention_ops
-from cacheflow import cache_ops
-from cacheflow import pos_encoding_ops
-from cacheflow.model_executor.input_metadata import InputMetadata
+from vllm import attention_ops
+from vllm import cache_ops
+from vllm import pos_encoding_ops
+from vllm.model_executor.input_metadata import InputMetadata

 _SUPPORTED_HEAD_SIZES = [64, 80, 96, 128]

-class GPTCacheFlowAttention(nn.Module):
-    """GPT-style multi-head attention.
+class PagedAttention(nn.Module):
+    """GPT-style multi-head PagedAttention.

     This class takes flattened 1D query, key, and value tensors as input. The
     input 1D tensors can be split into three parts: the prompt tokens, the
@@ -164,8 +164,8 @@ class GPTCacheFlowAttention(nn.Module):
         return output.view(-1, self.num_heads * self.head_size)

-class GPTNeoXCacheFlowAttention(GPTCacheFlowAttention):
-    """Attention with GPT-NeoX style rotary embedding."""
+class PagedAttentionWithRoPE(PagedAttention):
+    """PagedAttention with GPT-NeoX style rotary embedding."""

     def __init__(
         self,
...
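Only the class names change; the constructor signatures stay the same, as the call sites later in this diff show. A sketch of how callers look after the rename, assuming the vllm package from this commit is importable (head counts are illustrative values, not from the diff):

from vllm.model_executor.layers.attention import (PagedAttention,
                                                  PagedAttentionWithRoPE)

num_heads, head_size = 32, 128  # illustrative
attn = PagedAttention(num_heads, head_size, scale=head_size ** -0.5)
rope_attn = PagedAttentionWithRoPE(num_heads, head_size,
                                   head_size ** -0.5, rotary_dim=head_size)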
@@ -2,7 +2,7 @@
 import torch
 import torch.nn as nn
-from cacheflow import layernorm_ops
+from vllm import layernorm_ops

 class RMSNorm(nn.Module):
...
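RMSNorm itself is untouched apart from the layernorm_ops import. For reference, the quantity the fused CUDA kernel computes, written in plain PyTorch (a sketch of the semantics; the class above delegates to the kernel):

import torch
import torch.nn as nn

class RMSNormRef(nn.Module):
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize by the root mean square over the last dimension.
        variance = x.pow(2).mean(dim=-1, keepdim=True)
        return x * torch.rsqrt(variance + self.eps) * self.weight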
@@ -5,11 +5,11 @@ import numpy as np
 import torch
 import torch.nn as nn

-from cacheflow.model_executor.input_metadata import InputMetadata
-from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.parallel_utils.tensor_parallel import (
     gather_from_tensor_model_parallel_region)
-from cacheflow.sampling_params import SamplingParams
-from cacheflow.sequence import SequenceOutputs
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import SequenceOutputs

 class Sampler(nn.Module):
...
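The Sampler imports gather_from_tensor_model_parallel_region because the vocab-parallel logits are sharded across tensor-parallel ranks and must be gathered before sampling. As an illustration of the sampling step itself (not vLLM's implementation), temperature plus top-p over a gathered 1D logits vector:

import torch

def sample(logits: torch.Tensor, temperature: float = 1.0,
           top_p: float = 1.0) -> int:
    probs = torch.softmax(logits / temperature, dim=-1)
    sorted_probs, sorted_idx = probs.sort(descending=True)
    cumulative = sorted_probs.cumsum(dim=-1)
    # Keep the smallest prefix of tokens whose cumulative mass reaches top_p.
    mask = cumulative - sorted_probs > top_p
    sorted_probs[mask] = 0.0
    sorted_probs /= sorted_probs.sum()
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return int(sorted_idx[choice])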
@@ -5,10 +5,10 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig

-from cacheflow.config import ModelConfig
-from cacheflow.model_executor.models import (
-    GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
-from cacheflow.model_executor.weight_utils import initialize_dummy_weights
+from vllm.config import ModelConfig
+from vllm.model_executor.models import (GPT2LMHeadModel, GPTNeoXForCausalLM,
+                                        LlamaForCausalLM, OPTForCausalLM)
+from vllm.model_executor.weight_utils import initialize_dummy_weights

 # TODO(woosuk): Lazy-load the model classes.
 _MODEL_REGISTRY = {
...
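_MODEL_REGISTRY maps architecture names to the four model classes imported above; its body is elided in this diff. A hypothetical sketch of the registry and lookup, where the keys and the error handling are assumptions, not shown in the diff:

from transformers import PretrainedConfig

from vllm.model_executor.models import (GPT2LMHeadModel, GPTNeoXForCausalLM,
                                        LlamaForCausalLM, OPTForCausalLM)

_MODEL_REGISTRY = {  # hypothetical contents
    "GPT2LMHeadModel": GPT2LMHeadModel,
    "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
    "LlamaForCausalLM": LlamaForCausalLM,
    "OPTForCausalLM": OPTForCausalLM,
}

def _get_model_class(config: PretrainedConfig):
    # Pick the first HF architecture name with a registered implementation.
    for arch in getattr(config, "architectures", []) or []:
        if arch in _MODEL_REGISTRY:
            return _MODEL_REGISTRY[arch]
    raise ValueError(f"Unsupported architectures: {config.architectures}")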
-from cacheflow.model_executor.models.gpt_neox import GPTNeoXForCausalLM
-from cacheflow.model_executor.models.gpt2 import GPT2LMHeadModel
-from cacheflow.model_executor.models.llama import LlamaForCausalLM
-from cacheflow.model_executor.models.opt import OPTForCausalLM
+from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
+from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
+from vllm.model_executor.models.llama import LlamaForCausalLM
+from vllm.model_executor.models.opt import OPTForCausalLM

 __all__ = [
...
 # coding=utf-8
 # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
@@ -26,17 +26,17 @@ import torch
 from torch import nn
 from transformers import GPT2Config

-from cacheflow.model_executor.input_metadata import InputMetadata
-from cacheflow.model_executor.layers.activation import get_act_fn
-from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
-from cacheflow.model_executor.layers.sampler import Sampler
-from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
-                                                   load_tensor_parallel_weights)
-from cacheflow.model_executor.parallel_utils.parallel_state import (
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import PagedAttention
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
-from cacheflow.sequence import SequenceOutputs
+from vllm.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -53,14 +53,14 @@ class GPT2Attention(nn.Module):
         self.head_dim = self.hidden_size // total_num_heads
         self.scale = self.head_dim ** -0.5
-        self.c_attn = ColumnParallelLinear(self.hidden_size, 3 * self.hidden_size, bias=True,
-                                           gather_output=False,
+        self.c_attn = ColumnParallelLinear(self.hidden_size, 3 * self.hidden_size,
+                                           bias=True, gather_output=False,
                                            perform_initialization=False)
-        self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size, bias=True,
-                                        input_is_parallel=True,
+        self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size,
+                                        bias=True, input_is_parallel=True,
                                         perform_initialization=False)
-        self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim,
-                                          scale=self.scale)
+        self.attn = PagedAttention(self.num_heads, self.head_dim,
+                                   scale=self.scale)

     def forward(
         self,
...
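The c_attn/c_proj pair above is the standard Megatron layout: a column-parallel projection with gather_output=False feeds a row-parallel projection with input_is_parallel=True, so the intermediate activations stay sharded and only one all-reduce (inside c_proj) runs per attention block. Illustrative per-rank weight shapes under that partitioning (values assumed for illustration):

# hidden_size H sharded across a tensor-parallel world of size W.
H, W = 768, 4
c_attn_weight_per_rank = (3 * H // W, H)  # output dim sharded: (576, 768)
c_proj_weight_per_rank = (H, H // W)      # input dim sharded:  (768, 192)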
 # coding=utf-8
 # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,17 +25,17 @@ import torch
 from torch import nn
 from transformers import GPTNeoXConfig

-from cacheflow.model_executor.input_metadata import InputMetadata
-from cacheflow.model_executor.layers.activation import get_act_fn
-from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
-from cacheflow.model_executor.layers.sampler import Sampler
-from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
-                                                   load_tensor_parallel_weights)
-from cacheflow.model_executor.parallel_utils.parallel_state import (
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
-from cacheflow.sequence import SequenceOutputs
+from vllm.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -63,8 +63,8 @@ class GPTNeoXAttention(nn.Module):
         scaling = self.head_size ** -0.5
         rotary_dim = int(self.head_size * config.rotary_pct)
         assert rotary_dim % 2 == 0
-        self.attn = GPTNeoXCacheFlowAttention(self.num_heads, self.head_size,
-                                              scaling, rotary_dim)
+        self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_size,
+                                           scaling, rotary_dim)

     def forward(
         self,
@@ -149,6 +149,7 @@ class GPTNeoXLayer(nn.Module):
 class GPTNeoXModel(nn.Module):
     def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.config = config
...
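PagedAttentionWithRoPE rotates only the first rotary_dim channels of each head; rotary_pct < 1 leaves the remaining channels untouched, and the evenness assert above makes the half-split valid. A plain-PyTorch reference of the GPT-NeoX-style rotation that the fused pos_encoding_ops kernel is meant to implement (a sketch of the math, not the kernel itself):

import torch

def apply_rope(x: torch.Tensor, position: int, rotary_dim: int,
               base: float = 10000.0) -> torch.Tensor:
    # x: [head_size]; only x[:rotary_dim] is rotated, the rest passes through.
    half = rotary_dim // 2
    freqs = base ** (-torch.arange(0, half, dtype=torch.float32) / half)
    angles = position * freqs
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[:half], x[half:rotary_dim]
    rotated = torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin])
    return torch.cat([rotated, x[rotary_dim:]])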
 # coding=utf-8
 # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -30,19 +30,19 @@ import torch
 from torch import nn
 from transformers import LlamaConfig

-from cacheflow.sequence import SequenceOutputs
-from cacheflow.model_executor.input_metadata import InputMetadata
-from cacheflow.model_executor.layers.activation import SiluAndMul
-from cacheflow.model_executor.layers.layernorm import RMSNorm
-from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
-from cacheflow.model_executor.layers.sampler import Sampler
-from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
-                                                   load_tensor_parallel_weights)
-from cacheflow.model_executor.parallel_utils.parallel_state import (
+from vllm.sequence import SequenceOutputs
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
-from cacheflow.sequence import SequenceOutputs
+from vllm.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -104,8 +104,8 @@ class LlamaAttention(nn.Module):
             input_is_parallel=True,
             perform_initialization=False,
         )
-        self.attn = GPTNeoXCacheFlowAttention(self.num_heads, self.head_dim,
-                                              self.scaling, rotary_dim=self.head_dim)
+        self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_dim,
+                                           self.scaling, rotary_dim=self.head_dim)

     def forward(
         self,
...
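SiluAndMul, imported above for the LLaMA MLP, treats its input as the gate and up projections concatenated on the last dimension. Its reference semantics in plain PyTorch (the layer in this diff is backed by a fused activation_ops kernel but computes the same thing):

import torch
import torch.nn.functional as F

def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    # Split [..., 2d] into gate and up halves, then gate with SiLU.
    gate, up = x.chunk(2, dim=-1)
    return F.silu(gate) * up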
 # coding=utf-8
 # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,17 +25,17 @@ import torch
 from torch import nn
 from transformers import OPTConfig

-from cacheflow.model_executor.input_metadata import InputMetadata
-from cacheflow.model_executor.layers.activation import get_act_fn
-from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
-from cacheflow.model_executor.layers.sampler import Sampler
-from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
-                                                   load_tensor_parallel_weights)
-from cacheflow.model_executor.parallel_utils.parallel_state import (
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import PagedAttention
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
-from cacheflow.sequence import SequenceOutputs
+from vllm.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -75,8 +75,8 @@ class OPTAttention(nn.Module):
         self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
                                           input_is_parallel=True,
                                           perform_initialization=False)
-        self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim,
-                                          scale=self.scaling)
+        self.attn = PagedAttention(self.num_heads, self.head_dim,
+                                   scale=self.scaling)

     def forward(
         self,
...
-import cacheflow.model_executor.parallel_utils.parallel_state
-import cacheflow.model_executor.parallel_utils.tensor_parallel
+import vllm.model_executor.parallel_utils.parallel_state
+import vllm.model_executor.parallel_utils.tensor_parallel

 # Alias parallel_state as mpu, its legacy name
 mpu = parallel_state
...
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
...
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
@@ -11,7 +11,7 @@ import torch.nn.functional as F
 import torch.nn.init as init
 from torch.nn.parameter import Parameter

-from cacheflow.model_executor.parallel_utils.parallel_state import (
+from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_all_reduce_launcher,
...
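The layers module imports the rank and world-size helpers because each rank materializes only its shard of a weight matrix. The core of a column-parallel linear, stripped down to illustrate the partitioning (initialization, bias, and optional output gathering omitted; this is a sketch, not the Megatron-derived implementation above):

import torch
import torch.nn.functional as F

def column_parallel_forward(x: torch.Tensor, full_weight: torch.Tensor,
                            rank: int, world_size: int) -> torch.Tensor:
    # full_weight: [out_dim, in_dim]; each rank owns a slice of out_dim.
    out_dim = full_weight.shape[0]
    shard = out_dim // world_size
    local_weight = full_weight[rank * shard:(rank + 1) * shard]
    return F.linear(x, local_weight)  # each rank computes its output slice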
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/mappings.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

 import torch

-from cacheflow.model_executor.parallel_utils.parallel_state import (
+from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tensor_model_parallel_group,
...
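mappings.py provides the communication primitives such as the gather used by the Sampler earlier in this diff. A minimal sketch with torch.distributed, assuming an initialized process group (the real function operates on the tensor-model-parallel group rather than the default group):

import torch
import torch.distributed as dist

def gather_sketch(x: torch.Tensor) -> torch.Tensor:
    world_size = dist.get_world_size()
    if world_size == 1:
        return x
    parts = [torch.empty_like(x) for _ in range(world_size)]
    dist.all_gather(parts, x)
    return torch.cat(parts, dim=-1)  # concatenate shards on the last dim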
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/random.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
@@ -11,7 +11,7 @@ import torch
 from torch import _C
 from torch.cuda import _lazy_call, device as device_ctx_manager

-from cacheflow.model_executor.parallel_utils.parallel_state import (
+from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
 )
...
-# Copyright 2023 The CacheFlow team.
+# Copyright 2023 The vLLM team.
 # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
...
@@ -4,8 +4,8 @@ import random
 import numpy as np
 import torch

-from cacheflow.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
-from cacheflow.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
+from vllm.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
+from vllm.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed

 def set_random_seed(seed: int) -> None:
...
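Given the two imports above, a plausible body for set_random_seed: seed Python, NumPy, and Torch, then the model-parallel CUDA RNG once it is initialized. The body is elided in this diff, so this is a sketch and the actual function may differ slightly:

import random

import numpy as np
import torch

from vllm.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
from vllm.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed

def set_random_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    if model_parallel_is_initialized():
        model_parallel_cuda_manual_seed(seed)  # per-rank CUDA RNG states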