Unverified Commit 9acc6e35 authored by Liangsheng Yin's avatar Liangsheng Yin Committed by GitHub
Browse files

add `.isort.cfg` (#378)

parent cf9d8efd
...@@ -4,6 +4,7 @@ from typing import List ...@@ -4,6 +4,7 @@ from typing import List
import numpy as np import numpy as np
import torch import torch
from sglang.srt.managers.router.radix_cache import RadixCache from sglang.srt.managers.router.radix_cache import RadixCache
from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
......
...@@ -4,6 +4,7 @@ import logging ...@@ -4,6 +4,7 @@ import logging
import uvloop import uvloop
import zmq import zmq
import zmq.asyncio import zmq.asyncio
from sglang.srt.backend_config import GLOBAL_BACKEND_CONFIG from sglang.srt.backend_config import GLOBAL_BACKEND_CONFIG
from sglang.srt.managers.router.model_rpc import ModelRpcClient from sglang.srt.managers.router.model_rpc import ModelRpcClient
from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.server_args import PortArgs, ServerArgs
......
...@@ -10,6 +10,8 @@ import rpyc ...@@ -10,6 +10,8 @@ import rpyc
import torch import torch
from rpyc.utils.classic import obtain from rpyc.utils.classic import obtain
from rpyc.utils.server import ThreadedServer from rpyc.utils.server import ThreadedServer
from vllm.logger import _default_handler as vllm_default_handler
from sglang.srt.constrained.fsm_cache import FSMCache from sglang.srt.constrained.fsm_cache import FSMCache
from sglang.srt.constrained.jump_forward import JumpForwardCache from sglang.srt.constrained.jump_forward import JumpForwardCache
from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
...@@ -30,7 +32,6 @@ from sglang.srt.utils import ( ...@@ -30,7 +32,6 @@ from sglang.srt.utils import (
is_multimodal_model, is_multimodal_model,
set_random_seed, set_random_seed,
) )
from vllm.logger import _default_handler as vllm_default_handler
logger = logging.getLogger("model_rpc") logger = logging.getLogger("model_rpc")
......
...@@ -9,16 +9,17 @@ from typing import List ...@@ -9,16 +9,17 @@ from typing import List
import numpy as np import numpy as np
import torch import torch
from sglang.srt.managers.router.infer_batch import Batch, ForwardMode
from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
from sglang.srt.utils import is_multimodal_model
from sglang.utils import get_available_gpu_memory
from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig
from vllm.model_executor.model_loader import _set_default_torch_dtype from vllm.model_executor.model_loader import _set_default_torch_dtype
from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel
from sglang.srt.managers.router.infer_batch import Batch, ForwardMode
from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
from sglang.srt.utils import is_multimodal_model
from sglang.utils import get_available_gpu_memory
QUANTIZATION_CONFIG_MAPPING = { QUANTIZATION_CONFIG_MAPPING = {
"awq": AWQConfig, "awq": AWQConfig,
"gptq": GPTQConfig, "gptq": GPTQConfig,
......
import heapq import heapq
import time import time
from collections import defaultdict from collections import defaultdict
from dataclasses import dataclass
from typing import Tuple
import torch import torch
......
...@@ -10,6 +10,7 @@ import transformers ...@@ -10,6 +10,7 @@ import transformers
import uvloop import uvloop
import zmq import zmq
import zmq.asyncio import zmq.asyncio
from sglang.srt.hf_transformers_utils import ( from sglang.srt.hf_transformers_utils import (
get_config, get_config,
get_context_length, get_context_length,
......
...@@ -20,13 +20,10 @@ ...@@ -20,13 +20,10 @@
# This file is based on the LLama model definition file in transformers # This file is based on the LLama model definition file in transformers
"""PyTorch Cohere model.""" """PyTorch Cohere model."""
from typing import List, Optional, Tuple from typing import Optional, Tuple
import torch import torch
import torch.utils.checkpoint import torch.utils.checkpoint
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn from torch import nn
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from transformers import PretrainedConfig from transformers import PretrainedConfig
...@@ -49,6 +46,10 @@ from vllm.model_executor.weight_utils import ( ...@@ -49,6 +46,10 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
@torch.compile @torch.compile
def layer_norm_func(hidden_states, weight, variance_epsilon): def layer_norm_func(hidden_states, weight, variance_epsilon):
......
...@@ -5,10 +5,6 @@ from typing import Optional ...@@ -5,10 +5,6 @@ from typing import Optional
import torch import torch
import torch.nn as nn import torch.nn as nn
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from sglang.srt.models.dbrx_config import DbrxConfig
from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
LinearMethodBase, LinearMethodBase,
...@@ -35,6 +31,11 @@ from vllm.model_executor.weight_utils import ( ...@@ -35,6 +31,11 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from sglang.srt.models.dbrx_config import DbrxConfig
class DbrxRouter(nn.Module): class DbrxRouter(nn.Module):
"""A Router implementation for DBRX that returns logits for each expert """A Router implementation for DBRX that returns logits for each expert
......
...@@ -4,9 +4,6 @@ ...@@ -4,9 +4,6 @@
from typing import Optional, Tuple from typing import Optional, Tuple
import torch import torch
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config import LoRAConfig
...@@ -28,6 +25,10 @@ from vllm.model_executor.weight_utils import ( ...@@ -28,6 +25,10 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
class GemmaMLP(nn.Module): class GemmaMLP(nn.Module):
def __init__( def __init__(
......
# Adapted from # Adapted from
# https://github.com/vllm-project/vllm/blob/671af2b1c0b3ed6d856d37c21a561cc429a10701/vllm/model_executor/models/llama.py#L1 # https://github.com/vllm-project/vllm/blob/671af2b1c0b3ed6d856d37c21a561cc429a10701/vllm/model_executor/models/llama.py#L1
"""Inference-only LLaMA model compatible with HuggingFace weights.""" """Inference-only LLaMA model compatible with HuggingFace weights."""
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, Optional, Tuple
import torch import torch
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn from torch import nn
from transformers import LlamaConfig from transformers import LlamaConfig
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
...@@ -30,6 +27,10 @@ from vllm.model_executor.weight_utils import ( ...@@ -30,6 +27,10 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
class LlamaMLP(nn.Module): class LlamaMLP(nn.Module):
def __init__( def __init__(
......
...@@ -4,6 +4,15 @@ from typing import List, Optional ...@@ -4,6 +4,15 @@ from typing import List, Optional
import numpy as np import numpy as np
import torch import torch
from torch import nn
from transformers import CLIPVisionModel, LlavaConfig
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
from vllm.model_executor.layers.linear import LinearMethodBase
from vllm.model_executor.weight_utils import (
default_weight_loader,
hf_model_weights_iterator,
)
from sglang.srt.managers.router.infer_batch import ForwardMode from sglang.srt.managers.router.infer_batch import ForwardMode
from sglang.srt.managers.router.model_runner import InputMetadata from sglang.srt.managers.router.model_runner import InputMetadata
from sglang.srt.mm_utils import ( from sglang.srt.mm_utils import (
...@@ -12,14 +21,6 @@ from sglang.srt.mm_utils import ( ...@@ -12,14 +21,6 @@ from sglang.srt.mm_utils import (
unpad_image_shape, unpad_image_shape,
) )
from sglang.srt.models.llama2 import LlamaForCausalLM from sglang.srt.models.llama2 import LlamaForCausalLM
from torch import nn
from transformers import CLIPVisionModel, LlamaConfig, LlavaConfig
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
from vllm.model_executor.layers.linear import LinearMethodBase
from vllm.model_executor.weight_utils import (
default_weight_loader,
hf_model_weights_iterator,
)
class LlavaLlamaForCausalLM(nn.Module): class LlavaLlamaForCausalLM(nn.Module):
......
# Adapted from # Adapted from
# https://github.com/vllm-project/vllm/blob/d0215a58e78572d91dadafe9d832a2db89b09a13/vllm/model_executor/models/mixtral.py#L1 # https://github.com/vllm-project/vllm/blob/d0215a58e78572d91dadafe9d832a2db89b09a13/vllm/model_executor/models/mixtral.py#L1
"""Inference-only Mixtral model.""" """Inference-only Mixtral model."""
from typing import List, Optional, Tuple from typing import Optional
import numpy as np import numpy as np
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn from torch import nn
from transformers import MixtralConfig from transformers import MixtralConfig
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
...@@ -35,6 +32,10 @@ from vllm.model_executor.weight_utils import ( ...@@ -35,6 +32,10 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
class MixtralMLP(nn.Module): class MixtralMLP(nn.Module):
def __init__( def __init__(
......
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, Optional
import torch import torch
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
...@@ -27,6 +24,10 @@ from vllm.model_executor.weight_utils import ( ...@@ -27,6 +24,10 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
class QWenMLP(nn.Module): class QWenMLP(nn.Module):
def __init__( def __init__(
......
# Adapted from llama2.py # Adapted from llama2.py
# Modify details for the adaptation of Qwen2 model. # Modify details for the adaptation of Qwen2 model.
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" """Inference-only Qwen2 model compatible with HuggingFace weights."""
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, Optional, Tuple
import torch import torch
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn from torch import nn
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
...@@ -29,6 +26,10 @@ from vllm.model_executor.weight_utils import ( ...@@ -29,6 +26,10 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
Qwen2Config = None Qwen2Config = None
......
...@@ -5,9 +5,6 @@ model compatible with HuggingFace weights.""" ...@@ -5,9 +5,6 @@ model compatible with HuggingFace weights."""
from typing import Optional, Tuple from typing import Optional, Tuple
import torch import torch
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
...@@ -30,6 +27,10 @@ from vllm.model_executor.weight_utils import ( ...@@ -30,6 +27,10 @@ from vllm.model_executor.weight_utils import (
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
class StablelmMLP(nn.Module): class StablelmMLP(nn.Module):
def __init__( def __init__(
......
"""Inference-only Yi-VL model.""" """Inference-only Yi-VL model."""
import os from typing import Optional
from typing import List, Optional
import torch import torch
import torch.nn as nn import torch.nn as nn
from sglang.srt.models.llava import (
LlavaLlamaForCausalLM,
clip_vision_embed_forward,
monkey_path_clip_vision_embed_forward,
)
from transformers import CLIPVisionModel, LlavaConfig from transformers import CLIPVisionModel, LlavaConfig
from vllm.model_executor.weight_utils import ( from vllm.model_executor.weight_utils import (
default_weight_loader, default_weight_loader,
hf_model_weights_iterator, hf_model_weights_iterator,
) )
from sglang.srt.models.llava import (
LlavaLlamaForCausalLM,
monkey_path_clip_vision_embed_forward,
)
class YiVLForCausalLM(LlavaLlamaForCausalLM): class YiVLForCausalLM(LlavaLlamaForCausalLM):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
......
...@@ -10,9 +10,6 @@ import threading ...@@ -10,9 +10,6 @@ import threading
import time import time
from typing import List, Optional, Union from typing import List, Optional, Union
# Fix a Python bug
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
import aiohttp import aiohttp
import psutil import psutil
import pydantic import pydantic
...@@ -22,6 +19,9 @@ import uvloop ...@@ -22,6 +19,9 @@ import uvloop
from fastapi import FastAPI, HTTPException, Request from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import Response, StreamingResponse from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel from pydantic import BaseModel
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import JSONResponse
from sglang.backend.runtime_endpoint import RuntimeEndpoint from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.srt.constrained import disable_cache from sglang.srt.constrained import disable_cache
from sglang.srt.conversation import ( from sglang.srt.conversation import (
...@@ -54,8 +54,9 @@ from sglang.srt.managers.router.manager import start_router_process ...@@ -54,8 +54,9 @@ from sglang.srt.managers.router.manager import start_router_process
from sglang.srt.managers.tokenizer_manager import TokenizerManager from sglang.srt.managers.tokenizer_manager import TokenizerManager
from sglang.srt.server_args import PortArgs, ServerArgs from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import enable_show_time_cost, handle_port_init from sglang.srt.utils import enable_show_time_cost, handle_port_init
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import JSONResponse # Fix a Python bug
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
...@@ -618,7 +619,7 @@ def launch_server(server_args, pipe_finish_writer): ...@@ -618,7 +619,7 @@ def launch_server(server_args, pipe_finish_writer):
try: try:
requests.get(url + "/get_model_info", timeout=5, headers=headers) requests.get(url + "/get_model_info", timeout=5, headers=headers)
break break
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException:
pass pass
else: else:
if pipe_finish_writer is not None: if pipe_finish_writer is not None:
......
...@@ -157,7 +157,6 @@ def get_exception_traceback(): ...@@ -157,7 +157,6 @@ def get_exception_traceback():
def get_int_token_logit_bias(tokenizer, vocab_size): def get_int_token_logit_bias(tokenizer, vocab_size):
from transformers import LlamaTokenizer, LlamaTokenizerFast
# a bug when model's vocab size > tokenizer.vocab_size # a bug when model's vocab size > tokenizer.vocab_size
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import numpy as np import numpy as np
import requests import requests
from sglang.backend.openai import OpenAI from sglang.backend.openai import OpenAI
from sglang.backend.runtime_endpoint import RuntimeEndpoint from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.global_config import global_config from sglang.global_config import global_config
......
import argparse import argparse
import glob import glob
import multiprocessing import multiprocessing
import os
import time import time
import unittest import unittest
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment