Commit 55c5f16f authored by zhuwenwen
Browse files

remove xformers deps

parent 32a996c5
......@@ -19,10 +19,10 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PagedAttention
| BloomForCausalLM | BLOOM | Yes | Yes |
| InternLMForCausalLM | InternLM | Yes | Yes |
| InternLM2ForCausalLM | InternLM2 | Yes | Yes |
| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B | Yes | Yes |
| MiniCPMForCausalLM | MiniCPM | Yes | Yes |
| MiniCPM3ForCausalLM | MiniCPM3 | Yes | Yes |
| MixtralForCausalLM | Mixtral-8x7B,Mixtral-8x7B-Instruct | Yes | Yes |
| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B | Yes | Yes |
| Qwen2MoeForCausalLM | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct | Yes | Yes |
| LlavaForConditionalGeneration | LLaMA,LLaMA-2,LLaMA-3 | Yes | Yes |
| Qwen2VLForConditionalGeneration | Qwen2-VL | Yes | Yes |
......@@ -74,7 +74,6 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
2、根据pytorch2.3.0、python、dtk及系统下载对应的依赖包:
- triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
- xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
- flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
- lmslim: [https://cancon.hpccube.com:65024/4/main/lmslim](https://cancon.hpccube.com:65024/4/main/lmslim)
......
......@@ -14,5 +14,4 @@ setuptools_scm>=8
torch == 2.3.0
triton == 2.1.0
flash_attn == 2.6.1
xformers == 0.0.25
lmslim == 0.1.2
\ No newline at end of file
lmslim == 0.1.2 # future version 0.2.0
\ No newline at end of file
......@@ -20,10 +20,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens)
from vllm.sequence import SequenceData
import vllm.envs as envs
try:
from xformers import ops as xops
USE_XFORMERS_OPS = True
if envs.VLLM_ATTENTION_BACKEND=="XFormers":
from xformers import ops as xops
USE_XFORMERS_OPS = True
else:
USE_XFORMERS_OPS = False
except ImportError:
USE_XFORMERS_OPS = False
......
......@@ -21,10 +21,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens)
from vllm.sequence import SequenceData
import vllm.envs as envs
try:
from xformers import ops as xops
USE_XFORMERS_OPS = True
if envs.VLLM_ATTENTION_BACKEND=="XFormers":
from xformers import ops as xops
USE_XFORMERS_OPS = True
else:
USE_XFORMERS_OPS = False
except ImportError:
USE_XFORMERS_OPS = False
......
......@@ -19,10 +19,14 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
import vllm.envs as envs
try:
from xformers import ops as xops
USE_XFORMERS_OPS = True
if envs.VLLM_ATTENTION_BACKEND=="XFormers":
from xformers import ops as xops
USE_XFORMERS_OPS = True
else:
USE_XFORMERS_OPS = False
except ImportError:
USE_XFORMERS_OPS = False
......@@ -200,7 +204,8 @@ class InternSdpaAttention(nn.Module):
v = v.transpose(1, 2)
x = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
x = x.transpose(1, 2).view(B, N, -1)
# x = x.transpose(1, 2).view(B, N, -1)
x = x.transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
return x
......
......@@ -8,8 +8,18 @@ import torch.nn.functional as F
from mistral_common.protocol.instruct.messages import ImageChunk
from PIL import Image
from transformers import PretrainedConfig
from xformers.ops.fmha import memory_efficient_attention
from xformers.ops.fmha.attn_bias import BlockDiagonalMask
# from xformers.ops.fmha import memory_efficient_attention
# from xformers.ops.fmha.attn_bias import BlockDiagonalMask
import vllm.envs as envs
try:
if envs.VLLM_ATTENTION_BACKEND=="XFormers":
from xformers.ops.fmha import memory_efficient_attention
from xformers.ops.fmha.attn_bias import BlockDiagonalMask
else:
print("INFO: VLLM_ATTENTION_BACKEND is not XFormers.\n")
except ImportError:
print("INFO: Please install xformers if you want to infer pixtral.\n")
from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
......
......@@ -25,10 +25,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens)
from vllm.sequence import SequenceData
import vllm.envs as envs
try:
from xformers import ops as xops
USE_XFORMERS_OPS = True
if envs.VLLM_ATTENTION_BACKEND=="XFormers":
from xformers import ops as xops
USE_XFORMERS_OPS = True
else:
USE_XFORMERS_OPS = False
except ImportError:
USE_XFORMERS_OPS = False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment