Commit 72ee382d authored by OlivierDehaene
Browse files

chore: formatting

parent 3a521c92
...@@ -3,7 +3,10 @@ import torch ...@@ -3,7 +3,10 @@ import torch
from typing import Optional from typing import Optional
from text_generation_server.models.flash_mistral import BaseFlashMistral from text_generation_server.models.flash_mistral import BaseFlashMistral
from text_generation_server.models.custom_modeling.flash_mixtral_modeling import MixtralConfig, FlashMixtralForCausalLM from text_generation_server.models.custom_modeling.flash_mixtral_modeling import (
MixtralConfig,
FlashMixtralForCausalLM,
)
class FlashMixtral(BaseFlashMistral): class FlashMixtral(BaseFlashMistral):
...@@ -22,5 +25,5 @@ class FlashMixtral(BaseFlashMistral): ...@@ -22,5 +25,5 @@ class FlashMixtral(BaseFlashMistral):
revision=revision, revision=revision,
quantize=quantize, quantize=quantize,
dtype=dtype, dtype=dtype,
trust_remote_code=trust_remote_code trust_remote_code=trust_remote_code,
) )
...@@ -792,7 +792,10 @@ class IdeficsCausalLM(Model): ...@@ -792,7 +792,10 @@ class IdeficsCausalLM(Model):
skip_special_tokens=False, skip_special_tokens=False,
) )
prefill_tokens = Tokens( prefill_tokens = Tokens(
prefill_token_ids, prefill_logprobs, prefill_texts, is_special=[] prefill_token_ids,
prefill_logprobs,
prefill_texts,
is_special=[],
) )
else: else:
prefill_tokens = None prefill_tokens = None
......
...@@ -56,7 +56,7 @@ class Model(ABC): ...@@ -56,7 +56,7 @@ class Model(ABC):
dtype=str(self.dtype), dtype=str(self.dtype),
device_type=self.device.type, device_type=self.device.type,
window_size=self.sliding_window, window_size=self.sliding_window,
speculate=self.speculate speculate=self.speculate,
) )
@property @property
......
...@@ -736,7 +736,7 @@ class Seq2SeqLM(Model): ...@@ -736,7 +736,7 @@ class Seq2SeqLM(Model):
[self.tokenizer.bos_token_id], [self.tokenizer.bos_token_id],
[float("nan")], [float("nan")],
[self.tokenizer.bos_token], [self.tokenizer.bos_token],
[False] [False],
) )
else: else:
prefill_tokens = None prefill_tokens = None
......
...@@ -66,7 +66,10 @@ class Tokens: ...@@ -66,7 +66,10 @@ class Tokens:
def to_pb(self) -> generate_pb2.Tokens: def to_pb(self) -> generate_pb2.Tokens:
return generate_pb2.Tokens( return generate_pb2.Tokens(
ids=self.token_ids, logprobs=self.logprobs, texts=self.texts, is_special=self.is_special ids=self.token_ids,
logprobs=self.logprobs,
texts=self.texts,
is_special=self.is_special,
) )
def __len__(self): def __len__(self):
......
...@@ -159,7 +159,13 @@ def serve( ...@@ -159,7 +159,13 @@ def serve(
try: try:
model = get_model( model = get_model(
model_id, revision, sharded, quantize, speculate, dtype, trust_remote_code model_id,
revision,
sharded,
quantize,
speculate,
dtype,
trust_remote_code,
) )
except Exception: except Exception:
logger.exception("Error when initializing model") logger.exception("Error when initializing model")
...@@ -207,5 +213,7 @@ def serve( ...@@ -207,5 +213,7 @@ def serve(
await server.stop(0) await server.stop(0)
asyncio.run( asyncio.run(
serve_inner(model_id, revision, sharded, quantize, speculate, dtype, trust_remote_code) serve_inner(
model_id, revision, sharded, quantize, speculate, dtype, trust_remote_code
)
) )
...@@ -51,7 +51,9 @@ except ImportError as e: ...@@ -51,7 +51,9 @@ except ImportError as e:
) from e ) from e
elif IS_ROCM_SYSTEM: elif IS_ROCM_SYSTEM:
for idx in range(torch.cuda.device_count()): for idx in range(torch.cuda.device_count()):
if "MI210" not in torch.cuda.get_device_name(idx) and "MI250" not in torch.cuda.get_device_name(idx): if "MI210" not in torch.cuda.get_device_name(
idx
) and "MI250" not in torch.cuda.get_device_name(idx):
raise ImportError( raise ImportError(
f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention" f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
) )
...@@ -91,7 +93,9 @@ def attention( ...@@ -91,7 +93,9 @@ def attention(
) )
elif HAS_FLASH_ATTN_V2_ROCM: elif HAS_FLASH_ATTN_V2_ROCM:
if window_size_left != -1: if window_size_left != -1:
raise ValueError(f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left}).") raise ValueError(
f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
)
# RoCm flash API does not take the window_size_left and window_size_right arguments. # RoCm flash API does not take the window_size_left and window_size_right arguments.
return flash_attn_2_cuda.varlen_fwd( return flash_attn_2_cuda.varlen_fwd(
......
...@@ -11,20 +11,22 @@ logger = getLogger(__name__) ...@@ -11,20 +11,22 @@ logger = getLogger(__name__)
try: try:
from exllamav2_kernels import make_q_matrix, gemm_half_q_half from exllamav2_kernels import make_q_matrix, gemm_half_q_half
except ImportError: except ImportError:
logger.error('exllamav2_kernels not installed.') logger.error("exllamav2_kernels not installed.")
raise raise
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta") none_tensor = torch.empty((1, 1), device="meta")
def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda): def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
"""Matrix multiplication, returns x @ q4""" """Matrix multiplication, returns x @ q4"""
output_shape = x.shape[:-1] + (q4_width,) output_shape = x.shape[:-1] + (q4_width,)
x = x.view(-1, x.shape[-1]) x = x.view(-1, x.shape[-1])
output = torch.empty((x.shape[0], q4_width), dtype = torch.half, device = x.device) output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device)
gemm_half_q_half(x, q_handle, output, force_cuda) gemm_half_q_half(x, q_handle, output, force_cuda)
return output.view(output_shape) return output.view(output_shape)
def ext_make_q_matrix(w: dict, temp_dq, key: str = None): def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
""" """
Create Q matrix Create Q matrix
...@@ -35,7 +37,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): ...@@ -35,7 +37,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
w["q_scale_max"] /= 256 w["q_scale_max"] /= 256
w["q_perm"] = w["q_perm"].short() w["q_perm"] = w["q_perm"].short()
w["q_invperm"] = w["q_invperm"].short() w["q_invperm"] = w["q_invperm"].short()
return make_q_matrix(w["q_weight"], return make_q_matrix(
w["q_weight"],
w["q_perm"], w["q_perm"],
w["q_invperm"], w["q_invperm"],
w["q_scale"], w["q_scale"],
...@@ -44,7 +47,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): ...@@ -44,7 +47,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
none_tensor, none_tensor,
none_tensor, none_tensor,
none_tensor, none_tensor,
temp_dq) temp_dq,
)
# GPTQ # GPTQ
elif "qweight" in w: elif "qweight" in w:
if w["scales"].dtype == torch.float: if w["scales"].dtype == torch.float:
...@@ -52,10 +56,15 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): ...@@ -52,10 +56,15 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
# GPTQ with g_idx (act_order) # GPTQ with g_idx (act_order)
if w.get("g_idx", None) is not None and not (w["g_idx"] == 0).all().item(): if w.get("g_idx", None) is not None and not (w["g_idx"] == 0).all().item():
w["q_perm"] = torch.empty((w["qweight"].shape[0] * 8,), dtype = torch.short, device = w["qweight"].device) w["q_perm"] = torch.empty(
(w["qweight"].shape[0] * 8,),
dtype=torch.short,
device=w["qweight"].device,
)
w["q_invperm"] = torch.empty_like(w["q_perm"]) w["q_invperm"] = torch.empty_like(w["q_perm"])
# make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx. # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
return make_q_matrix(w["qweight"], return make_q_matrix(
w["qweight"],
w["q_perm"], w["q_perm"],
w["q_invperm"], w["q_invperm"],
none_tensor, none_tensor,
...@@ -64,10 +73,12 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): ...@@ -64,10 +73,12 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
w["qzeros"], w["qzeros"],
w["scales"], w["scales"],
w["g_idx"].cpu(), w["g_idx"].cpu(),
temp_dq) temp_dq,
)
# GPTQ without g_idx # GPTQ without g_idx
else: else:
return make_q_matrix(w["qweight"], return make_q_matrix(
w["qweight"],
none_tensor, none_tensor,
none_tensor, none_tensor,
none_tensor, none_tensor,
...@@ -76,7 +87,9 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): ...@@ -76,7 +87,9 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
w["qzeros"], w["qzeros"],
w["scales"], w["scales"],
none_tensor, none_tensor,
temp_dq) temp_dq,
)
DEVICE = None DEVICE = None
FIXED_BYTES = 0 FIXED_BYTES = 0
...@@ -106,14 +119,15 @@ class QuantLinear(nn.Module): ...@@ -106,14 +119,15 @@ class QuantLinear(nn.Module):
super().__init__() super().__init__()
if bits != 4: if bits != 4:
raise ValueError( raise ValueError(
f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization.") f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
)
self.q_handle = None self.q_handle = None
self.q_tensors = None self.q_tensors = None
self.bits = bits self.bits = bits
self.maxq = 2 ** self.bits - 1 self.maxq = 2**self.bits - 1
self.infeatures = qweight.shape[0] // self.bits * 32 self.infeatures = qweight.shape[0] // self.bits * 32
self.outfeatures = qweight.shape[1] self.outfeatures = qweight.shape[1]
self.padding = - self.outfeatures % 32 self.padding = -self.outfeatures % 32
self.outfeatures = self.outfeatures + self.padding self.outfeatures = self.outfeatures + self.padding
self.device = qweight.device self.device = qweight.device
...@@ -128,9 +142,12 @@ class QuantLinear(nn.Module): ...@@ -128,9 +142,12 @@ class QuantLinear(nn.Module):
outfeatures = self.outfeatures outfeatures = self.outfeatures
assert qweight.shape == (infeatures // 32 * self.bits, outfeatures) assert qweight.shape == (infeatures // 32 * self.bits, outfeatures)
assert infeatures % self.group_size == 0 assert infeatures % self.group_size == 0
assert qzeros.shape == (infeatures // self.group_size, outfeatures // 32 * self.bits) assert qzeros.shape == (
infeatures // self.group_size,
outfeatures // 32 * self.bits,
)
assert scales.shape == (infeatures // self.group_size, outfeatures) assert scales.shape == (infeatures // self.group_size, outfeatures)
assert g_idx.shape == (infeatures, ), f"{g_idx.shape}, {infeatures}" assert g_idx.shape == (infeatures,), f"{g_idx.shape}, {infeatures}"
global FIXED_BYTES, LAYERS global FIXED_BYTES, LAYERS
FIXED_BYTES = max(FIXED_BYTES, self.scratch_space_fixed()) FIXED_BYTES = max(FIXED_BYTES, self.scratch_space_fixed())
...@@ -140,17 +157,15 @@ class QuantLinear(nn.Module): ...@@ -140,17 +157,15 @@ class QuantLinear(nn.Module):
assert self.qweight.device.type == "cuda" assert self.qweight.device.type == "cuda"
assert self.qweight.device.index is not None assert self.qweight.device.index is not None
self.q_tensors = { self.q_tensors = {
"qweight":self.qweight, "qweight": self.qweight,
"qzeros":self.qzeros, "qzeros": self.qzeros,
"scales":self.scales, "scales": self.scales,
"g_idx":self.g_idx "g_idx": self.g_idx,
} }
temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size()) temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
self.q_handle = ext_make_q_matrix( self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
self.q_tensors, temp_dq
)
def forward(self, x, force_cuda = False): def forward(self, x, force_cuda=False):
output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda) output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)
if self.bias is not None: if self.bias is not None:
...@@ -179,11 +194,14 @@ class ExLlamaV2DeviceTensors: ...@@ -179,11 +194,14 @@ class ExLlamaV2DeviceTensors:
self.scratch_bytes = scratch_bytes self.scratch_bytes = scratch_bytes
def prepare(self): def prepare(self):
self.scratch = torch.empty((self.scratch_bytes // 2,), dtype = torch.half, device = self.device) self.scratch = torch.empty(
(self.scratch_bytes // 2,), dtype=torch.half, device=self.device
)
def get_scratch_slice(self, size_bytes): def get_scratch_slice(self, size_bytes):
if self.scratch is None: self.prepare() if self.scratch is None:
self.prepare()
size_bytes = ((size_bytes + 127) // 128) * 128 size_bytes = ((size_bytes + 127) // 128) * 128
size_half = size_bytes // 2 size_half = size_bytes // 2
......
...@@ -35,7 +35,9 @@ HAS_EXLLAMA = False ...@@ -35,7 +35,9 @@ HAS_EXLLAMA = False
CAN_EXLLAMA = major >= 8 CAN_EXLLAMA = major >= 8
V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1: if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
logger.warning("Disabling exllama v2 and using v1 instead because there are issues when sharding") logger.warning(
"Disabling exllama v2 and using v1 instead because there are issues when sharding"
)
V2 = False V2 = False
if os.getenv("DISABLE_EXLLAMA") == "True": if os.getenv("DISABLE_EXLLAMA") == "True":
...@@ -43,14 +45,16 @@ if os.getenv("DISABLE_EXLLAMA") == "True": ...@@ -43,14 +45,16 @@ if os.getenv("DISABLE_EXLLAMA") == "True":
elif CAN_EXLLAMA: elif CAN_EXLLAMA:
try: try:
if V2: if V2:
from text_generation_server.utils.gptq.exllamav2 import (QuantLinear as ExllamaQuantLinear, from text_generation_server.utils.gptq.exllamav2 import (
QuantLinear as ExllamaQuantLinear,
create_exllama_buffers, create_exllama_buffers,
set_device, set_device,
) )
HAS_EXLLAMA = "2" HAS_EXLLAMA = "2"
else: else:
from text_generation_server.utils.gptq.exllama import (Ex4bitLinear as ExllamaQuantLinear, from text_generation_server.utils.gptq.exllama import (
Ex4bitLinear as ExllamaQuantLinear,
create_exllama_buffers, create_exllama_buffers,
set_device, set_device,
) )
...@@ -325,7 +329,9 @@ def get_linear(weight, bias, quantize): ...@@ -325,7 +329,9 @@ def get_linear(weight, bias, quantize):
) )
if use_exllama: if use_exllama:
linear = ExllamaQuantLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize) linear = ExllamaQuantLinear(
qweight, qzeros, scales, g_idx, bias, bits, groupsize
)
else: else:
linear = QuantLinear( linear = QuantLinear(
qweight, qweight,
...@@ -533,7 +539,6 @@ try: ...@@ -533,7 +539,6 @@ try:
else: else:
dropout_layer_norm = None dropout_layer_norm = None
class FastLayerNorm(nn.LayerNorm): class FastLayerNorm(nn.LayerNorm):
def forward(self, hidden_states, residual=None): def forward(self, hidden_states, residual=None):
if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM: if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
...@@ -569,7 +574,6 @@ try: ...@@ -569,7 +574,6 @@ try:
return normed_hidden_states, residual return normed_hidden_states, residual
class FastRMSNorm(nn.Module): class FastRMSNorm(nn.Module):
def __init__(self, weight: torch.Tensor, eps: float): def __init__(self, weight: torch.Tensor, eps: float):
super().__init__() super().__init__()
...@@ -601,7 +605,11 @@ try: ...@@ -601,7 +605,11 @@ try:
return self.weight * hidden_states, residual return self.weight * hidden_states, residual
elif IS_CUDA_SYSTEM: elif IS_CUDA_SYSTEM:
# faster post attention rms norm # faster post attention rms norm
normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd( (
normed_hidden_states,
res,
*rest,
) = dropout_layer_norm.dropout_add_ln_fwd(
hidden_states, hidden_states,
residual, residual,
self.weight, self.weight,
...@@ -638,7 +646,8 @@ try: ...@@ -638,7 +646,8 @@ try:
return out, residual return out, residual
else: else:
raise ValueError( raise ValueError(
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction.") "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
)
except ImportError: except ImportError:
pass pass
...@@ -650,14 +659,12 @@ try: ...@@ -650,14 +659,12 @@ try:
elif IS_ROCM_SYSTEM: elif IS_ROCM_SYSTEM:
from vllm import pos_encoding_ops from vllm import pos_encoding_ops
def _create_inv_freq(dim, base, device): def _create_inv_freq(dim, base, device):
inv_freq = 1.0 / ( inv_freq = 1.0 / (
base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
) )
return inv_freq return inv_freq
def _get_rope_config(config): def _get_rope_config(config):
if os.getenv("ROPE_SCALING", None) is not None: if os.getenv("ROPE_SCALING", None) is not None:
rope_scaling = { rope_scaling = {
...@@ -667,7 +674,6 @@ try: ...@@ -667,7 +674,6 @@ try:
return rope_scaling return rope_scaling
return getattr(config, "rope_scaling", None) return getattr(config, "rope_scaling", None)
class PositionRotaryEmbedding(nn.Module): class PositionRotaryEmbedding(nn.Module):
def __init__(self, inv_freq, scaling_factor): def __init__(self, inv_freq, scaling_factor):
super().__init__() super().__init__()
...@@ -680,17 +686,23 @@ try: ...@@ -680,17 +686,23 @@ try:
self.scaling_factor = scaling_factor self.scaling_factor = scaling_factor
self.dynamic_args = None self.dynamic_args = None
def forward(self, query: torch.Tensor, key: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor): def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
):
# Such controlflows may add some overhead. # Such controlflows may add some overhead.
if IS_CUDA_SYSTEM: if IS_CUDA_SYSTEM:
rotary_dim = cos.shape[-1] rotary_dim = cos.shape[-1]
q1 = query[..., :rotary_dim] q1 = query[..., :rotary_dim]
q2 = query[..., rotary_dim: 2 * rotary_dim] q2 = query[..., rotary_dim : 2 * rotary_dim]
rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
k1 = key[..., :rotary_dim] k1 = key[..., :rotary_dim]
k2 = key[..., rotary_dim: 2 * rotary_dim] k2 = key[..., rotary_dim : 2 * rotary_dim]
rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
elif IS_ROCM_SYSTEM: elif IS_ROCM_SYSTEM:
...@@ -700,17 +712,11 @@ try: ...@@ -700,17 +712,11 @@ try:
head_size = query.shape[-1] head_size = query.shape[-1]
# Inplace operation, updating query and key. # Inplace operation, updating query and key.
pos_encoding_ops.rotary_embedding( pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True)
query,
key,
head_size,
cos,
sin,
True
)
else: else:
raise ValueError( raise ValueError(
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction.") "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
)
@classmethod @classmethod
def static(cls, config, dim, base, device): def static(cls, config, dim, base, device):
...@@ -732,15 +738,16 @@ try: ...@@ -732,15 +738,16 @@ try:
elif rope_scaling["type"] == "yarn": elif rope_scaling["type"] == "yarn":
return YarnPositionRotaryEmbedding( return YarnPositionRotaryEmbedding(
dim=2 * inv_freq.shape[0], dim=2 * inv_freq.shape[0],
max_position_embeddings=rope_scaling["original_max_position_embeddings"], max_position_embeddings=rope_scaling[
"original_max_position_embeddings"
],
base=10000.0, base=10000.0,
device=inv_freq.device, device=inv_freq.device,
scaling_factor=scaling_factor, scaling_factor=scaling_factor,
extrapolation_factor=1, extrapolation_factor=1,
attn_factor=1, attn_factor=1,
beta_fast=32, beta_fast=32,
beta_slow=1 beta_slow=1,
) )
else: else:
raise NotImplementedError( raise NotImplementedError(
...@@ -773,15 +780,16 @@ try: ...@@ -773,15 +780,16 @@ try:
elif rope_scaling["type"] == "yarn": elif rope_scaling["type"] == "yarn":
return YarnPositionRotaryEmbedding( return YarnPositionRotaryEmbedding(
dim=2 * inv_freq.shape[0], dim=2 * inv_freq.shape[0],
max_position_embeddings=rope_scaling["original_max_position_embeddings"], max_position_embeddings=rope_scaling[
"original_max_position_embeddings"
],
base=10000.0, base=10000.0,
device=inv_freq.device, device=inv_freq.device,
scaling_factor=scaling_factor, scaling_factor=scaling_factor,
extrapolation_factor=1, extrapolation_factor=1,
attn_factor=1, attn_factor=1,
beta_fast=32, beta_fast=32,
beta_slow=1 beta_slow=1,
) )
else: else:
raise NotImplementedError( raise NotImplementedError(
...@@ -827,7 +835,6 @@ try: ...@@ -827,7 +835,6 @@ try:
# Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
return cos.unsqueeze(1), sin.unsqueeze(1) return cos.unsqueeze(1), sin.unsqueeze(1)
class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
inv_freq = _create_inv_freq(dim, base, device) inv_freq = _create_inv_freq(dim, base, device)
...@@ -861,24 +868,28 @@ try: ...@@ -861,24 +868,28 @@ try:
self._cos_cached = torch.cos(freqs).to(dtype) self._cos_cached = torch.cos(freqs).to(dtype)
self._sin_cached = torch.sin(freqs).to(dtype) self._sin_cached = torch.sin(freqs).to(dtype)
# Inverse dim formula to find dim based on number of rotations # Inverse dim formula to find dim based on number of rotations
import math import math
def find_correction_dim(
def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): num_rotations, dim, base=10000, max_position_embeddings=2048
return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) ):
return (
dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
) / (2 * math.log(base))
# Find dim range bounds based on rotations # Find dim range bounds based on rotations
def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): def find_correction_range(
low = math.floor(find_correction_dim( low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
low_rot, dim, base, max_position_embeddings)) ):
high = math.ceil(find_correction_dim( low = math.floor(
high_rot, dim, base, max_position_embeddings)) find_correction_dim(low_rot, dim, base, max_position_embeddings)
)
high = math.ceil(
find_correction_dim(high_rot, dim, base, max_position_embeddings)
)
return max(low, 0), min(high, dim - 1) # Clamp values just in case return max(low, 0), min(high, dim - 1) # Clamp values just in case
def linear_ramp_mask(min, max, dim): def linear_ramp_mask(min, max, dim):
if min == max: if min == max:
max += 0.001 # Prevent singularity max += 0.001 # Prevent singularity
...@@ -887,16 +898,25 @@ try: ...@@ -887,16 +898,25 @@ try:
ramp_func = torch.clamp(linear_func, 0, 1) ramp_func = torch.clamp(linear_func, 0, 1)
return ramp_func return ramp_func
def get_mscale(scale=1): def get_mscale(scale=1):
if scale <= 1: if scale <= 1:
return 1.0 return 1.0
return 0.1 * math.log(scale) + 1.0 return 0.1 * math.log(scale) + 1.0
class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
def __init__(self, dim, max_position_embeddings, base, device, scaling_factor, *, extrapolation_factor, def __init__(
attn_factor, beta_fast, beta_slow): self,
dim,
max_position_embeddings,
base,
device,
scaling_factor,
*,
extrapolation_factor,
attn_factor,
beta_fast,
beta_slow,
):
inv_freq = _create_inv_freq(dim, base, device) inv_freq = _create_inv_freq(dim, base, device)
super().__init__(inv_freq, scaling_factor) super().__init__(inv_freq, scaling_factor)
self.dim = dim self.dim = dim
...@@ -906,8 +926,9 @@ try: ...@@ -906,8 +926,9 @@ try:
self.attn_factor = attn_factor self.attn_factor = attn_factor
self.beta_fast = beta_fast self.beta_fast = beta_fast
self.beta_slow = beta_slow self.beta_slow = beta_slow
self.mscale = float(get_mscale( self.mscale = float(
self.scaling_factor) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation get_mscale(self.scaling_factor) * self.attn_factor
) # Get n-d magnitude scaling corrected for interpolation
def _update_cos_sin_cache(self, dtype, device, seqlen): def _update_cos_sin_cache(self, dtype, device, seqlen):
# Reset the tables if the sequence length has changed, # Reset the tables if the sequence length has changed,
...@@ -923,15 +944,26 @@ try: ...@@ -923,15 +944,26 @@ try:
) )
freqs = 1.0 / inv_freq_extrapolation freqs = 1.0 / inv_freq_extrapolation
inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
low, high = find_correction_range(self.beta_fast, self.beta_slow, self.dim, self.base, low, high = find_correction_range(
self.max_position_embeddings) self.beta_fast,
inv_freq_mask = (1 - linear_ramp_mask(low, high, self.dim // 2).float().to( self.beta_slow,
device)) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation self.dim,
inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask self.base,
self.max_position_embeddings,
)
inv_freq_mask = (
1
- linear_ramp_mask(low, high, self.dim // 2).float().to(device)
) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation
inv_freq = (
inv_freq_interpolation * (1 - inv_freq_mask)
+ inv_freq_extrapolation * inv_freq_mask
)
self.inv_freq = inv_freq self.inv_freq = inv_freq
self.mscale = float(get_mscale( self.mscale = float(
self.scaling_factor) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation get_mscale(self.scaling_factor) * self.attn_factor
) # Get n-d magnitude scaling corrected for interpolation
self._seq_len_cached = seqlen self._seq_len_cached = seqlen
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
......
...@@ -2,6 +2,7 @@ import torch ...@@ -2,6 +2,7 @@ import torch
from dataclasses import dataclass from dataclasses import dataclass
from text_generation_server.utils.layers import TensorParallelHead, FastLinear from text_generation_server.utils.layers import TensorParallelHead, FastLinear
@dataclass @dataclass
class Output: class Output:
logits: torch.FloatTensor = None logits: torch.FloatTensor = None
...@@ -11,7 +12,9 @@ class Output: ...@@ -11,7 +12,9 @@ class Output:
class ResBlock(torch.nn.Module): class ResBlock(torch.nn.Module):
def __init__(self, config, prefix, weights): def __init__(self, config, prefix, weights):
super().__init__() super().__init__()
self.linear = FastLinear.load(config, prefix=f"{prefix}.linear", weights=weights, bias=True) self.linear = FastLinear.load(
config, prefix=f"{prefix}.linear", weights=weights, bias=True
)
self.act = torch.nn.SiLU() self.act = torch.nn.SiLU()
def forward(self, x): def forward(self, x):
...@@ -19,15 +22,13 @@ class ResBlock(torch.nn.Module): ...@@ -19,15 +22,13 @@ class ResBlock(torch.nn.Module):
class MedusaModel(torch.nn.Module): class MedusaModel(torch.nn.Module):
def __init__( def __init__(self, config, weights, lm_head):
self,
config,
weights,
lm_head
):
super().__init__() super().__init__()
self.heads = torch.nn.ModuleList( self.heads = torch.nn.ModuleList(
[MedusaHead(config, prefix=f"{i}", weights=weights) for i in range(config["medusa_num_heads"])] [
MedusaHead(config, prefix=f"{i}", weights=weights)
for i in range(config["medusa_num_heads"])
]
) )
self.lm_head = lm_head self.lm_head = lm_head
...@@ -40,9 +41,16 @@ class MedusaModel(torch.nn.Module): ...@@ -40,9 +41,16 @@ class MedusaModel(torch.nn.Module):
class MedusaHead(torch.nn.Module): class MedusaHead(torch.nn.Module):
def __init__(self, config, prefix, weights): def __init__(self, config, prefix, weights):
super().__init__() super().__init__()
self.blocks = torch.nn.ModuleList([ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) for i in range(config["medusa_num_layers"])]) self.blocks = torch.nn.ModuleList(
[
ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
for i in range(config["medusa_num_layers"])
]
)
n = len(self.blocks) n = len(self.blocks)
self.out = FastLinear.load(config, prefix=f"{prefix}.{n}", weights=weights, bias=False) self.out = FastLinear.load(
config, prefix=f"{prefix}.{n}", weights=weights, bias=False
)
def forward(self, x): def forward(self, x):
for block in self.blocks: for block in self.blocks:
......
...@@ -7,11 +7,14 @@ from vllm import attention_ops ...@@ -7,11 +7,14 @@ from vllm import attention_ops
_PARTITION_SIZE = 512 _PARTITION_SIZE = 512
def reshape_and_cache(key: torch.Tensor, value: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, def reshape_and_cache(
slots: torch.Tensor): key: torch.Tensor,
cache_ops.reshape_and_cache( value: torch.Tensor,
key, value, key_cache, value_cache, slots key_cache: torch.Tensor,
) value_cache: torch.Tensor,
slots: torch.Tensor,
):
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots)
def attention( def attention(
...@@ -45,9 +48,7 @@ def attention( ...@@ -45,9 +48,7 @@ def attention(
# value_cache => [num_blocks, num_heads, head_size, block_size] # value_cache => [num_blocks, num_heads, head_size, block_size]
block_size = value_cache.shape[3] block_size = value_cache.shape[3]
num_seqs, num_heads, head_size = query.shape num_seqs, num_heads, head_size = query.shape
max_num_partitions = ( max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
(max_s + _PARTITION_SIZE - 1) //
_PARTITION_SIZE)
# NOTE(woosuk): We use a simple heuristic to decide whether to use # NOTE(woosuk): We use a simple heuristic to decide whether to use
# PagedAttention V1 or V2. If the number of partitions is 1, we use # PagedAttention V1 or V2. If the number of partitions is 1, we use
# V1 to avoid the overhead of reduction. Also, if the number of # V1 to avoid the overhead of reduction. Also, if the number of
......
...@@ -38,7 +38,9 @@ def download_and_unload_peft(model_id, revision, trust_remote_code): ...@@ -38,7 +38,9 @@ def download_and_unload_peft(model_id, revision, trust_remote_code):
os.makedirs(model_id, exist_ok=True) os.makedirs(model_id, exist_ok=True)
cache_dir = model_id cache_dir = model_id
logger.info(f"Saving the newly created merged model to {cache_dir}") logger.info(f"Saving the newly created merged model to {cache_dir}")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=trust_remote_code) tokenizer = AutoTokenizer.from_pretrained(
base_model_id, trust_remote_code=trust_remote_code
)
model.save_pretrained(cache_dir, safe_serialization=True) model.save_pretrained(cache_dir, safe_serialization=True)
model.config.save_pretrained(cache_dir) model.config.save_pretrained(cache_dir)
tokenizer.save_pretrained(cache_dir) tokenizer.save_pretrained(cache_dir)
SPECULATE = None SPECULATE = None
def get_speculate() -> int: def get_speculate() -> int:
global SPECULATE global SPECULATE
return SPECULATE return SPECULATE
def set_speculate(speculate: int): def set_speculate(speculate: int):
global SPECULATE global SPECULATE
SPECULATE = speculate SPECULATE = speculate
...@@ -16,6 +16,7 @@ from text_generation_server.utils.logits_process import ( ...@@ -16,6 +16,7 @@ from text_generation_server.utils.logits_process import (
from text_generation_server.utils.watermark import WatermarkLogitsProcessor from text_generation_server.utils.watermark import WatermarkLogitsProcessor
from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor
class NextTokenChooser: class NextTokenChooser:
def __init__( def __init__(
self, self,
...@@ -145,21 +146,31 @@ class StoppingCriteria: ...@@ -145,21 +146,31 @@ class StoppingCriteria:
pb.ignore_eos_token, pb.ignore_eos_token,
) )
def create_n_gram_speculation(
    input_ids: torch.Tensor,
    next_ids: torch.Tensor,
    accepted_ids: torch.Tensor,
    speculate: int,
    verbose: bool,
):
    """Propose `speculate` candidate token ids per row by copying from `input_ids`.

    For every row, a "seed" token is looked up in that row of `input_ids`, and
    the ids stored right after the matched position are returned as the
    speculation.

    Args:
        input_ids: 2-D tensor of token ids (indexed along dim 1 here).
        next_ids: 1-D tensor indexed by `accepted_ids.cumsum(-1) - 1`;
            presumably the accepted tokens of all rows laid out back to
            back. Verify against the caller.
        accepted_ids: 1-D tensor with one entry per row; its running sum
            gives the position of each row's seed inside `next_ids`.
        speculate: Number of ids to propose per row.
        verbose: Not used by this function.

    Returns:
        Tensor of shape `(accepted_ids.shape[0], speculate)` gathered from
        `input_ids`.
    """
    # Very trivial approach, find first match in the string.
    # This is much less refined than actual n-gram but seems to work
    # relatively OK in grounded mode and is by far much faster with
    # much less worst case complexity as everything happens on device.
    B = accepted_ids.shape[0]
    device = input_ids.device
    # Seed of each row: the entry of `next_ids` at that row's running-sum offset.
    seeds = next_ids[accepted_ids.cumsum(dim=-1) - 1]
    # Position just after the matched seed in each row of `input_ids`.
    indices = (input_ids == seeds.unsqueeze(-1)).max(dim=1).indices + 1
    # Consecutive positions indices, indices + 1, ..., indices + speculate - 1.
    all_indices = indices.unsqueeze(-1).expand(B, speculate) + torch.arange(
        speculate, device=device
    )
    # Positions running past the end of a row are clamped to its last column.
    all_indices = torch.clamp(all_indices, max=input_ids.shape[1] - 1)
    speculative_ids = input_ids.gather(dim=-1, index=all_indices)
    return speculative_ids
class HeterogeneousNextTokenChooser: class HeterogeneousNextTokenChooser:
def __init__( def __init__(
self, self,
...@@ -228,7 +239,15 @@ class HeterogeneousNextTokenChooser: ...@@ -228,7 +239,15 @@ class HeterogeneousNextTokenChooser:
self.dtype = dtype self.dtype = dtype
self.device = device self.device = device
def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor, speculate: int, speculated_ids: Optional[torch.Tensor] = None, speculative_scores: Optional[torch.Tensor] = None, verbose=False): def __call__(
self,
input_ids: torch.Tensor,
scores: torch.Tensor,
speculate: int,
speculated_ids: Optional[torch.Tensor] = None,
speculative_scores: Optional[torch.Tensor] = None,
verbose=False,
):
if speculated_ids is not None: if speculated_ids is not None:
B = scores.shape[0] // (speculated_ids.shape[1] + 1) B = scores.shape[0] // (speculated_ids.shape[1] + 1)
S = speculated_ids.shape[1] + 1 S = speculated_ids.shape[1] + 1
...@@ -249,12 +268,11 @@ class HeterogeneousNextTokenChooser: ...@@ -249,12 +268,11 @@ class HeterogeneousNextTokenChooser:
for warper in self.warpers: for warper in self.warpers:
_scores = warper(input_ids, _scores) _scores = warper(input_ids, _scores)
_next_ids = self.choice(_scores) _next_ids = self.choice(_scores)
scores[:, j] = _scores scores[:, j] = _scores
next_ids[:, j] = _next_ids next_ids[:, j] = _next_ids
next_ids = next_ids.view(B*S) next_ids = next_ids.view(B * S)
scores = scores.view( B* S, -1) scores = scores.view(B * S, -1)
if speculated_ids is not None: if speculated_ids is not None:
accepted_ids = [] accepted_ids = []
...@@ -262,7 +280,7 @@ class HeterogeneousNextTokenChooser: ...@@ -262,7 +280,7 @@ class HeterogeneousNextTokenChooser:
S = speculated_ids.shape[1] + 1 S = speculated_ids.shape[1] + 1
indices = [] indices = []
for i in range(B): for i in range(B):
_next_ids = next_ids[i*S: (i + 1)*S] _next_ids = next_ids[i * S : (i + 1) * S]
_speculated_ids = speculated_ids[i] _speculated_ids = speculated_ids[i]
validate_speculative = _next_ids[:-1] == _speculated_ids validate_speculative = _next_ids[:-1] == _speculated_ids
index = i * S index = i * S
...@@ -278,7 +296,9 @@ class HeterogeneousNextTokenChooser: ...@@ -278,7 +296,9 @@ class HeterogeneousNextTokenChooser:
break break
accepted_ids.append(accepted) accepted_ids.append(accepted)
accepted_ids = torch.tensor(accepted_ids, device=input_ids.device, dtype=input_ids.dtype) accepted_ids = torch.tensor(
accepted_ids, device=input_ids.device, dtype=input_ids.dtype
)
next_ids = next_ids[indices] next_ids = next_ids[indices]
scores = scores[indices] scores = scores[indices]
indices = torch.arange(B, device=input_ids.device) * S indices = torch.arange(B, device=input_ids.device) * S
...@@ -296,7 +316,9 @@ class HeterogeneousNextTokenChooser: ...@@ -296,7 +316,9 @@ class HeterogeneousNextTokenChooser:
speculative_ids = Greedy()(speculative_scores) speculative_ids = Greedy()(speculative_scores)
else: else:
# n-gram # n-gram
speculative_ids = create_n_gram_speculation(input_ids, next_ids, accepted_ids, speculate, verbose) speculative_ids = create_n_gram_speculation(
input_ids, next_ids, accepted_ids, speculate, verbose
)
else: else:
speculative_ids = None speculative_ids = None
......
...@@ -16,7 +16,7 @@ class Weights: ...@@ -16,7 +16,7 @@ class Weights:
dtype, dtype,
process_group, process_group,
aliases: Optional[Dict[str, List[str]]] = None, aliases: Optional[Dict[str, List[str]]] = None,
prefix: Optional[str] = None prefix: Optional[str] = None,
): ):
routing = {} routing = {}
for filename in filenames: for filename in filenames:
...@@ -213,7 +213,8 @@ class Weights: ...@@ -213,7 +213,8 @@ class Weights:
bits, groupsize = self._get_gptq_params() bits, groupsize = self._get_gptq_params()
from text_generation_server.utils.layers import HAS_EXLLAMA from text_generation_server.utils.layers import HAS_EXLLAMA
use_exllama = bits==4 and HAS_EXLLAMA and quantize == "gptq"
use_exllama = bits == 4 and HAS_EXLLAMA and quantize == "gptq"
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
else: else:
w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes] w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
...@@ -283,7 +284,7 @@ class Weights: ...@@ -283,7 +284,7 @@ class Weights:
if use_exllama: if use_exllama:
qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0) qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
scales = self.get_sharded(f"{prefix}.scales", dim=0) scales = self.get_sharded(f"{prefix}.scales", dim=0)
g_idx = self.get_sharded(f"{prefix}.g_idx", dim= 0) g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
g_idx = g_idx - g_idx[0] g_idx = g_idx - g_idx[0]
else: else:
# The triton kernel reorders the scales/zero points instead of the weight/activation. # The triton kernel reorders the scales/zero points instead of the weight/activation.
......
...@@ -21,14 +21,14 @@ def main(): ...@@ -21,14 +21,14 @@ def main():
block = [] block = []
for line in lines: for line in lines:
if line.startswith(" -") or line.startswith(" -"): if line.startswith(" -") or line.startswith(" -"):
rendered_block = '\n'.join(block) rendered_block = "\n".join(block)
if header: if header:
final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
else: else:
final_doc += f"```shell\n{rendered_block}\n```\n" final_doc += f"```shell\n{rendered_block}\n```\n"
block = [] block = []
tokens = line.split("<") tokens = line.split("<")
if len(tokens)>1: if len(tokens) > 1:
header = tokens[-1][:-1] header = tokens[-1][:-1]
else: else:
header = line.split("--")[-1] header = line.split("--")[-1]
...@@ -36,7 +36,7 @@ def main(): ...@@ -36,7 +36,7 @@ def main():
block.append(line) block.append(line)
rendered_block = '\n'.join(block) rendered_block = "\n".join(block)
final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
block = [] block = []
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment