Commit 55c5f16f authored by zhuwenwen
Browse files

remove xformers deps

parent 32a996c5
...@@ -19,10 +19,10 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention ...@@ -19,10 +19,10 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
| BloomForCausalLM | BLOOM | Yes | Yes | | BloomForCausalLM | BLOOM | Yes | Yes |
| InternLMForCausalLM | InternLM | Yes | Yes | | InternLMForCausalLM | InternLM | Yes | Yes |
| InternLM2ForCausalLM | InternLM2 | Yes | Yes | | InternLM2ForCausalLM | InternLM2 | Yes | Yes |
| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B | Yes | Yes |
| MiniCPMForCausalLM | MiniCPM | Yes | Yes | | MiniCPMForCausalLM | MiniCPM | Yes | Yes |
| MiniCPM3ForCausalLM | MiniCPM3 | Yes | Yes | | MiniCPM3ForCausalLM | MiniCPM3 | Yes | Yes |
| MixtralForCausalLM | Mixtral-8x7B,Mixtral-8x7B-Instruct | Yes | Yes | | MixtralForCausalLM | Mixtral-8x7B,Mixtral-8x7B-Instruct | Yes | Yes |
| TeleChat12BForCausalLM (#TelechatForCausalLM) | TeleChat-12B | Yes | Yes |
| Qwen2MoeForCausalLM | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct | Yes | Yes | | Qwen2MoeForCausalLM | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct | Yes | Yes |
| LlavaForConditionalGeneration | LLaMA,LLaMA-2,LLaMA-3 | Yes | Yes | | LlavaForConditionalGeneration | LLaMA,LLaMA-2,LLaMA-3 | Yes | Yes |
| Qwen2VLForConditionalGeneration | Qwen2-VL | Yes | Yes | | Qwen2VLForConditionalGeneration | Qwen2-VL | Yes | Yes |
...@@ -74,7 +74,6 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install ...@@ -74,7 +74,6 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
2、根据pytorch2.3.0、python、dtk及系统下载对应的依赖包: 2、根据pytorch2.3.0、python、dtk及系统下载对应的依赖包:
- triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/) - triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
- xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
- flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn) - flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
- lmslim: [https://cancon.hpccube.com:65024/4/main/lmslim](https://cancon.hpccube.com:65024/4/main/lmslim) - lmslim: [https://cancon.hpccube.com:65024/4/main/lmslim](https://cancon.hpccube.com:65024/4/main/lmslim)
......
...@@ -14,5 +14,4 @@ setuptools_scm>=8 ...@@ -14,5 +14,4 @@ setuptools_scm>=8
torch == 2.3.0 torch == 2.3.0
triton == 2.1.0 triton == 2.1.0
flash_attn == 2.6.1 flash_attn == 2.6.1
xformers == 0.0.25 lmslim == 0.1.2 # future version 0.2.0
lmslim == 0.1.2 \ No newline at end of file
\ No newline at end of file
...@@ -20,10 +20,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -20,10 +20,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import SequenceData from vllm.sequence import SequenceData
import vllm.envs as envs
try: try:
from xformers import ops as xops if envs.VLLM_ATTENTION_BACKEND=="XFormers":
USE_XFORMERS_OPS = True from xformers import ops as xops
USE_XFORMERS_OPS = True
else:
USE_XFORMERS_OPS = False
except ImportError: except ImportError:
USE_XFORMERS_OPS = False USE_XFORMERS_OPS = False
......
...@@ -21,10 +21,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -21,10 +21,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import SequenceData from vllm.sequence import SequenceData
import vllm.envs as envs
try: try:
from xformers import ops as xops if envs.VLLM_ATTENTION_BACKEND=="XFormers":
USE_XFORMERS_OPS = True from xformers import ops as xops
USE_XFORMERS_OPS = True
else:
USE_XFORMERS_OPS = False
except ImportError: except ImportError:
USE_XFORMERS_OPS = False USE_XFORMERS_OPS = False
......
...@@ -19,10 +19,14 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ...@@ -19,10 +19,14 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear) RowParallelLinear)
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
import vllm.envs as envs
try: try:
from xformers import ops as xops if envs.VLLM_ATTENTION_BACKEND=="XFormers":
USE_XFORMERS_OPS = True from xformers import ops as xops
USE_XFORMERS_OPS = True
else:
USE_XFORMERS_OPS = False
except ImportError: except ImportError:
USE_XFORMERS_OPS = False USE_XFORMERS_OPS = False
...@@ -200,7 +204,8 @@ class InternSdpaAttention(nn.Module): ...@@ -200,7 +204,8 @@ class InternSdpaAttention(nn.Module):
v = v.transpose(1, 2) v = v.transpose(1, 2)
x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) x = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
x = x.transpose(1, 2).view(B, N, -1) # x = x.transpose(1, 2).view(B, N, -1)
x = x.transpose(1, 2).reshape(B, N, -1)
x = self.proj(x) x = self.proj(x)
return x return x
......
...@@ -8,8 +8,18 @@ import torch.nn.functional as F ...@@ -8,8 +8,18 @@ import torch.nn.functional as F
from mistral_common.protocol.instruct.messages import ImageChunk from mistral_common.protocol.instruct.messages import ImageChunk
from PIL import Image from PIL import Image
from transformers import PretrainedConfig from transformers import PretrainedConfig
from xformers.ops.fmha import memory_efficient_attention # from xformers.ops.fmha import memory_efficient_attention
from xformers.ops.fmha.attn_bias import BlockDiagonalMask # from xformers.ops.fmha.attn_bias import BlockDiagonalMask
import vllm.envs as envs
try:
if envs.VLLM_ATTENTION_BACKEND=="XFormers":
from xformers.ops.fmha import memory_efficient_attention
from xformers.ops.fmha.attn_bias import BlockDiagonalMask
else:
print("INFO: VLLM_ATTENTION_BACKEND is not XFormers.\n")
except ImportError:
print("INFO: Please install xformers if you want to infer pixtral.\n")
from vllm.attention import AttentionMetadata from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig from vllm.config import CacheConfig, MultiModalConfig
......
...@@ -25,10 +25,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -25,10 +25,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import SequenceData from vllm.sequence import SequenceData
import vllm.envs as envs
try: try:
from xformers import ops as xops if envs.VLLM_ATTENTION_BACKEND=="XFormers":
USE_XFORMERS_OPS = True from xformers import ops as xops
USE_XFORMERS_OPS = True
else:
USE_XFORMERS_OPS = False
except ImportError: except ImportError:
USE_XFORMERS_OPS = False USE_XFORMERS_OPS = False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment