Unverified Commit c16b33cc authored by Yineng Zhang's avatar Yineng Zhang Committed by GitHub
Browse files

cleanup deps 3/n (#4541)

parent 2d004512
......@@ -152,8 +152,6 @@ class Fp8Config(QuantizationConfig):
def get_quant_method(
self, layer: torch.nn.Module, prefix: str
) -> Optional["QuantizeMethodBase"]:
from vllm.attention.layer import Attention # Avoid circular import
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
if isinstance(layer, LinearBase):
......@@ -162,8 +160,6 @@ class Fp8Config(QuantizationConfig):
return Fp8LinearMethod(self)
elif isinstance(layer, FusedMoE):
return Fp8MoEMethod(self)
elif isinstance(layer, Attention):
return Fp8KVCacheMethod(self)
return None
def get_scaled_act_names(self) -> List[str]:
......
......@@ -18,7 +18,6 @@ from typing import Iterable, Optional, Tuple
import torch
from torch import nn
from transformers import PretrainedConfig
from vllm import _custom_ops as ops
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.linear import ReplicatedLinear
......@@ -48,6 +47,8 @@ _is_cuda = is_cuda()
if _is_cuda:
from sgl_kernel import awq_dequantize
else:
from vllm import _custom_ops as ops
class DeepseekModelNextN(nn.Module):
......
......@@ -23,7 +23,6 @@ import torch
import torch.nn.functional as F
from torch import nn
from transformers import PretrainedConfig
from vllm import _custom_ops as ops
from sglang.srt.distributed import (
get_tensor_model_parallel_world_size,
......@@ -75,6 +74,8 @@ _is_cuda = is_cuda()
if _is_cuda:
from sgl_kernel import awq_dequantize, bmm_fp8
else:
from vllm import _custom_ops as ops
class DeepseekV2MLP(nn.Module):
......
......@@ -531,7 +531,10 @@ def load_image(image_file: Union[str, bytes]):
def suppress_other_loggers():
from vllm.logger import logger as vllm_default_logger
try:
from vllm.logger import logger as vllm_default_logger
except ImportError:
return
vllm_default_logger.setLevel(logging.WARN)
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
......@@ -620,11 +623,14 @@ def monkey_patch_p2p_access_check():
def monkey_patch_vllm_gguf_config():
from vllm.model_executor.layers.quantization.gguf import (
GGUFConfig,
GGUFEmbeddingMethod,
GGUFLinearMethod,
)
try:
from vllm.model_executor.layers.quantization.gguf import (
GGUFConfig,
GGUFEmbeddingMethod,
GGUFLinearMethod,
)
except ImportError:
return
from sglang.srt.layers.linear import LinearBase
from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment