"src/vscode:/vscode.git/clone" did not exist on "25feac9e65ff7a7ca87d75150555bc010f3dfdd0"
Commit 72ee382d authored by OlivierDehaene's avatar OlivierDehaene
Browse files

chore: formatting

parent 3a521c92
...@@ -3,17 +3,20 @@ import torch ...@@ -3,17 +3,20 @@ import torch
from typing import Optional from typing import Optional
from text_generation_server.models.flash_mistral import BaseFlashMistral from text_generation_server.models.flash_mistral import BaseFlashMistral
from text_generation_server.models.custom_modeling.flash_mixtral_modeling import MixtralConfig, FlashMixtralForCausalLM from text_generation_server.models.custom_modeling.flash_mixtral_modeling import (
MixtralConfig,
FlashMixtralForCausalLM,
)
class FlashMixtral(BaseFlashMistral): class FlashMixtral(BaseFlashMistral):
def __init__( def __init__(
self, self,
model_id: str, model_id: str,
revision: Optional[str] = None, revision: Optional[str] = None,
quantize: Optional[str] = None, quantize: Optional[str] = None,
dtype: Optional[torch.dtype] = None, dtype: Optional[torch.dtype] = None,
trust_remote_code: bool = False, trust_remote_code: bool = False,
): ):
super(FlashMixtral, self).__init__( super(FlashMixtral, self).__init__(
config_cls=MixtralConfig, config_cls=MixtralConfig,
...@@ -22,5 +25,5 @@ class FlashMixtral(BaseFlashMistral): ...@@ -22,5 +25,5 @@ class FlashMixtral(BaseFlashMistral):
revision=revision, revision=revision,
quantize=quantize, quantize=quantize,
dtype=dtype, dtype=dtype,
trust_remote_code=trust_remote_code trust_remote_code=trust_remote_code,
) )
...@@ -792,7 +792,10 @@ class IdeficsCausalLM(Model): ...@@ -792,7 +792,10 @@ class IdeficsCausalLM(Model):
skip_special_tokens=False, skip_special_tokens=False,
) )
prefill_tokens = Tokens( prefill_tokens = Tokens(
prefill_token_ids, prefill_logprobs, prefill_texts, is_special=[] prefill_token_ids,
prefill_logprobs,
prefill_texts,
is_special=[],
) )
else: else:
prefill_tokens = None prefill_tokens = None
...@@ -803,10 +806,10 @@ class IdeficsCausalLM(Model): ...@@ -803,10 +806,10 @@ class IdeficsCausalLM(Model):
request.id, request.id,
prefill_tokens, prefill_tokens,
Tokens( Tokens(
[next_token_id_squeezed], [next_token_id_squeezed],
[next_token_logprob], [next_token_logprob],
[next_token_text], [next_token_text],
[next_token_id_squeezed.item() in self.all_special_ids], [next_token_id_squeezed.item() in self.all_special_ids],
), ),
generated_text, generated_text,
top_tokens, top_tokens,
......
...@@ -56,7 +56,7 @@ class Model(ABC): ...@@ -56,7 +56,7 @@ class Model(ABC):
dtype=str(self.dtype), dtype=str(self.dtype),
device_type=self.device.type, device_type=self.device.type,
window_size=self.sliding_window, window_size=self.sliding_window,
speculate=self.speculate speculate=self.speculate,
) )
@property @property
......
...@@ -736,7 +736,7 @@ class Seq2SeqLM(Model): ...@@ -736,7 +736,7 @@ class Seq2SeqLM(Model):
[self.tokenizer.bos_token_id], [self.tokenizer.bos_token_id],
[float("nan")], [float("nan")],
[self.tokenizer.bos_token], [self.tokenizer.bos_token],
[False] [False],
) )
else: else:
prefill_tokens = None prefill_tokens = None
...@@ -763,10 +763,10 @@ class Seq2SeqLM(Model): ...@@ -763,10 +763,10 @@ class Seq2SeqLM(Model):
request.id, request.id,
prefill_tokens, prefill_tokens,
Tokens( Tokens(
[next_token_id_squeezed], [next_token_id_squeezed],
[next_token_logprob], [next_token_logprob],
[next_token_text], [next_token_text],
[next_token_id_squeezed.item() in self.all_special_ids], [next_token_id_squeezed.item() in self.all_special_ids],
), ),
generated_text, generated_text,
top_tokens, top_tokens,
......
...@@ -66,7 +66,10 @@ class Tokens: ...@@ -66,7 +66,10 @@ class Tokens:
def to_pb(self) -> generate_pb2.Tokens: def to_pb(self) -> generate_pb2.Tokens:
return generate_pb2.Tokens( return generate_pb2.Tokens(
ids=self.token_ids, logprobs=self.logprobs, texts=self.texts, is_special=self.is_special ids=self.token_ids,
logprobs=self.logprobs,
texts=self.texts,
is_special=self.is_special,
) )
def __len__(self): def __len__(self):
......
...@@ -159,7 +159,13 @@ def serve( ...@@ -159,7 +159,13 @@ def serve(
try: try:
model = get_model( model = get_model(
model_id, revision, sharded, quantize, speculate, dtype, trust_remote_code model_id,
revision,
sharded,
quantize,
speculate,
dtype,
trust_remote_code,
) )
except Exception: except Exception:
logger.exception("Error when initializing model") logger.exception("Error when initializing model")
...@@ -207,5 +213,7 @@ def serve( ...@@ -207,5 +213,7 @@ def serve(
await server.stop(0) await server.stop(0)
asyncio.run( asyncio.run(
serve_inner(model_id, revision, sharded, quantize, speculate, dtype, trust_remote_code) serve_inner(
model_id, revision, sharded, quantize, speculate, dtype, trust_remote_code
)
) )
...@@ -51,7 +51,9 @@ except ImportError as e: ...@@ -51,7 +51,9 @@ except ImportError as e:
) from e ) from e
elif IS_ROCM_SYSTEM: elif IS_ROCM_SYSTEM:
for idx in range(torch.cuda.device_count()): for idx in range(torch.cuda.device_count()):
if "MI210" not in torch.cuda.get_device_name(idx) and "MI250" not in torch.cuda.get_device_name(idx): if "MI210" not in torch.cuda.get_device_name(
idx
) and "MI250" not in torch.cuda.get_device_name(idx):
raise ImportError( raise ImportError(
f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention" f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
) )
...@@ -91,8 +93,10 @@ def attention( ...@@ -91,8 +93,10 @@ def attention(
) )
elif HAS_FLASH_ATTN_V2_ROCM: elif HAS_FLASH_ATTN_V2_ROCM:
if window_size_left != -1: if window_size_left != -1:
raise ValueError(f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left}).") raise ValueError(
f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
)
# RoCm flash API does not take the window_size_left and window_size_right arguments. # RoCm flash API does not take the window_size_left and window_size_right arguments.
return flash_attn_2_cuda.varlen_fwd( return flash_attn_2_cuda.varlen_fwd(
q, q,
......
...@@ -11,40 +11,44 @@ logger = getLogger(__name__) ...@@ -11,40 +11,44 @@ logger = getLogger(__name__)
try: try:
from exllamav2_kernels import make_q_matrix, gemm_half_q_half from exllamav2_kernels import make_q_matrix, gemm_half_q_half
except ImportError: except ImportError:
logger.error('exllamav2_kernels not installed.') logger.error("exllamav2_kernels not installed.")
raise raise
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta") none_tensor = torch.empty((1, 1), device="meta")
def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda): def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
"""Matrix multiplication, returns x @ q4""" """Matrix multiplication, returns x @ q4"""
output_shape = x.shape[:-1] + (q4_width,) output_shape = x.shape[:-1] + (q4_width,)
x = x.view(-1, x.shape[-1]) x = x.view(-1, x.shape[-1])
output = torch.empty((x.shape[0], q4_width), dtype = torch.half, device = x.device) output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device)
gemm_half_q_half(x, q_handle, output, force_cuda) gemm_half_q_half(x, q_handle, output, force_cuda)
return output.view(output_shape) return output.view(output_shape)
def ext_make_q_matrix(w: dict, temp_dq, key: str = None): def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
""" """
Create Q matrix Create Q matrix
""" """
# EXL2 # EXL2
# won't work as the moment because the tensors are not the same. # won't work as the moment because the tensors are not the same.
if "q_weight" in w: if "q_weight" in w:
w["q_scale_max"] /= 256 w["q_scale_max"] /= 256
w["q_perm"] = w["q_perm"].short() w["q_perm"] = w["q_perm"].short()
w["q_invperm"] = w["q_invperm"].short() w["q_invperm"] = w["q_invperm"].short()
return make_q_matrix(w["q_weight"], return make_q_matrix(
w["q_perm"], w["q_weight"],
w["q_invperm"], w["q_perm"],
w["q_scale"], w["q_invperm"],
w["q_scale_max"], w["q_scale"],
w["q_groups"], w["q_scale_max"],
none_tensor, w["q_groups"],
none_tensor, none_tensor,
none_tensor, none_tensor,
temp_dq) none_tensor,
temp_dq,
)
# GPTQ # GPTQ
elif "qweight" in w: elif "qweight" in w:
if w["scales"].dtype == torch.float: if w["scales"].dtype == torch.float:
...@@ -52,31 +56,40 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): ...@@ -52,31 +56,40 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
# GPTQ with g_idx (act_order) # GPTQ with g_idx (act_order)
if w.get("g_idx", None) is not None and not (w["g_idx"] == 0).all().item(): if w.get("g_idx", None) is not None and not (w["g_idx"] == 0).all().item():
w["q_perm"] = torch.empty((w["qweight"].shape[0] * 8,), dtype = torch.short, device = w["qweight"].device) w["q_perm"] = torch.empty(
(w["qweight"].shape[0] * 8,),
dtype=torch.short,
device=w["qweight"].device,
)
w["q_invperm"] = torch.empty_like(w["q_perm"]) w["q_invperm"] = torch.empty_like(w["q_perm"])
# make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx. # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
return make_q_matrix(w["qweight"], return make_q_matrix(
w["q_perm"], w["qweight"],
w["q_invperm"], w["q_perm"],
none_tensor, w["q_invperm"],
none_tensor, none_tensor,
none_tensor, none_tensor,
w["qzeros"], none_tensor,
w["scales"], w["qzeros"],
w["g_idx"].cpu(), w["scales"],
temp_dq) w["g_idx"].cpu(),
temp_dq,
)
# GPTQ without g_idx # GPTQ without g_idx
else: else:
return make_q_matrix(w["qweight"], return make_q_matrix(
none_tensor, w["qweight"],
none_tensor, none_tensor,
none_tensor, none_tensor,
none_tensor, none_tensor,
none_tensor, none_tensor,
w["qzeros"], none_tensor,
w["scales"], w["qzeros"],
none_tensor, w["scales"],
temp_dq) none_tensor,
temp_dq,
)
DEVICE = None DEVICE = None
FIXED_BYTES = 0 FIXED_BYTES = 0
...@@ -106,14 +119,15 @@ class QuantLinear(nn.Module): ...@@ -106,14 +119,15 @@ class QuantLinear(nn.Module):
super().__init__() super().__init__()
if bits != 4: if bits != 4:
raise ValueError( raise ValueError(
f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization.") f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
)
self.q_handle = None self.q_handle = None
self.q_tensors = None self.q_tensors = None
self.bits = bits self.bits = bits
self.maxq = 2 ** self.bits - 1 self.maxq = 2**self.bits - 1
self.infeatures = qweight.shape[0] // self.bits * 32 self.infeatures = qweight.shape[0] // self.bits * 32
self.outfeatures = qweight.shape[1] self.outfeatures = qweight.shape[1]
self.padding = - self.outfeatures % 32 self.padding = -self.outfeatures % 32
self.outfeatures = self.outfeatures + self.padding self.outfeatures = self.outfeatures + self.padding
self.device = qweight.device self.device = qweight.device
...@@ -128,9 +142,12 @@ class QuantLinear(nn.Module): ...@@ -128,9 +142,12 @@ class QuantLinear(nn.Module):
outfeatures = self.outfeatures outfeatures = self.outfeatures
assert qweight.shape == (infeatures // 32 * self.bits, outfeatures) assert qweight.shape == (infeatures // 32 * self.bits, outfeatures)
assert infeatures % self.group_size == 0 assert infeatures % self.group_size == 0
assert qzeros.shape == (infeatures // self.group_size, outfeatures // 32 * self.bits) assert qzeros.shape == (
infeatures // self.group_size,
outfeatures // 32 * self.bits,
)
assert scales.shape == (infeatures // self.group_size, outfeatures) assert scales.shape == (infeatures // self.group_size, outfeatures)
assert g_idx.shape == (infeatures, ), f"{g_idx.shape}, {infeatures}" assert g_idx.shape == (infeatures,), f"{g_idx.shape}, {infeatures}"
global FIXED_BYTES, LAYERS global FIXED_BYTES, LAYERS
FIXED_BYTES = max(FIXED_BYTES, self.scratch_space_fixed()) FIXED_BYTES = max(FIXED_BYTES, self.scratch_space_fixed())
...@@ -140,33 +157,31 @@ class QuantLinear(nn.Module): ...@@ -140,33 +157,31 @@ class QuantLinear(nn.Module):
assert self.qweight.device.type == "cuda" assert self.qweight.device.type == "cuda"
assert self.qweight.device.index is not None assert self.qweight.device.index is not None
self.q_tensors = { self.q_tensors = {
"qweight":self.qweight, "qweight": self.qweight,
"qzeros":self.qzeros, "qzeros": self.qzeros,
"scales":self.scales, "scales": self.scales,
"g_idx":self.g_idx "g_idx": self.g_idx,
} }
temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size()) temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
self.q_handle = ext_make_q_matrix( self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
self.q_tensors, temp_dq
) def forward(self, x, force_cuda=False):
def forward(self, x, force_cuda = False):
output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda) output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)
if self.bias is not None: if self.bias is not None:
output.add_(self.bias) output.add_(self.bias)
return output return output
def temp_dq_size(self): def temp_dq_size(self):
return self.infeatures * self.outfeatures * 2 + 128 return self.infeatures * self.outfeatures * 2 + 128
def temp_fwd_size(self, max_input_len, max_batch_size): def temp_fwd_size(self, max_input_len, max_batch_size):
return self.outfeatures * max_input_len * max_batch_size * 4 + 128 return self.outfeatures * max_input_len * max_batch_size * 4 + 128
def scratch_space_fixed(self, max_input_len=4096, max_batch_size=16): def scratch_space_fixed(self, max_input_len=4096, max_batch_size=16):
return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size) return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size)
class ExLlamaV2DeviceTensors: class ExLlamaV2DeviceTensors:
device_idx: int device_idx: int
...@@ -177,13 +192,16 @@ class ExLlamaV2DeviceTensors: ...@@ -177,13 +192,16 @@ class ExLlamaV2DeviceTensors:
def __init__(self, device, scratch_bytes): def __init__(self, device, scratch_bytes):
self.device = device self.device = device
self.scratch_bytes = scratch_bytes self.scratch_bytes = scratch_bytes
def prepare(self): def prepare(self):
self.scratch = torch.empty((self.scratch_bytes // 2,), dtype = torch.half, device = self.device) self.scratch = torch.empty(
(self.scratch_bytes // 2,), dtype=torch.half, device=self.device
)
def get_scratch_slice(self, size_bytes): def get_scratch_slice(self, size_bytes):
if self.scratch is None: self.prepare() if self.scratch is None:
self.prepare()
size_bytes = ((size_bytes + 127) // 128) * 128 size_bytes = ((size_bytes + 127) // 128) * 128
size_half = size_bytes // 2 size_half = size_bytes // 2
......
...@@ -35,7 +35,9 @@ HAS_EXLLAMA = False ...@@ -35,7 +35,9 @@ HAS_EXLLAMA = False
CAN_EXLLAMA = major >= 8 CAN_EXLLAMA = major >= 8
V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1: if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
logger.warning("Disabling exllama v2 and using v1 instead because there are issues when sharding") logger.warning(
"Disabling exllama v2 and using v1 instead because there are issues when sharding"
)
V2 = False V2 = False
if os.getenv("DISABLE_EXLLAMA") == "True": if os.getenv("DISABLE_EXLLAMA") == "True":
...@@ -43,17 +45,19 @@ if os.getenv("DISABLE_EXLLAMA") == "True": ...@@ -43,17 +45,19 @@ if os.getenv("DISABLE_EXLLAMA") == "True":
elif CAN_EXLLAMA: elif CAN_EXLLAMA:
try: try:
if V2: if V2:
from text_generation_server.utils.gptq.exllamav2 import (QuantLinear as ExllamaQuantLinear, from text_generation_server.utils.gptq.exllamav2 import (
create_exllama_buffers, QuantLinear as ExllamaQuantLinear,
set_device, create_exllama_buffers,
) set_device,
)
HAS_EXLLAMA = "2" HAS_EXLLAMA = "2"
else: else:
from text_generation_server.utils.gptq.exllama import (Ex4bitLinear as ExllamaQuantLinear, from text_generation_server.utils.gptq.exllama import (
create_exllama_buffers, Ex4bitLinear as ExllamaQuantLinear,
set_device, create_exllama_buffers,
) set_device,
)
HAS_EXLLAMA = "1" HAS_EXLLAMA = "1"
...@@ -114,7 +118,7 @@ def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, st ...@@ -114,7 +118,7 @@ def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, st
@classmethod @classmethod
def load_conv2d_no_bias( def load_conv2d_no_bias(
cls, prefix, weights, in_channels, out_channels, kernel_size, stride cls, prefix, weights, in_channels, out_channels, kernel_size, stride
): ):
weight = weights.get_tensor(f"{prefix}.weight") weight = weights.get_tensor(f"{prefix}.weight")
with init_empty_weights(): with init_empty_weights():
...@@ -138,9 +142,9 @@ torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias ...@@ -138,9 +142,9 @@ torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
class FastLinear(nn.Module): class FastLinear(nn.Module):
def __init__( def __init__(
self, self,
weight, weight,
bias, bias,
) -> None: ) -> None:
super().__init__() super().__init__()
self.weight = nn.Parameter(weight) self.weight = nn.Parameter(weight)
...@@ -164,9 +168,9 @@ class FastLinear(nn.Module): ...@@ -164,9 +168,9 @@ class FastLinear(nn.Module):
class EETQLinear(nn.Module): class EETQLinear(nn.Module):
def __init__( def __init__(
self, self,
weight, weight,
bias, bias,
) -> None: ) -> None:
super().__init__() super().__init__()
device = weight.device device = weight.device
...@@ -185,13 +189,13 @@ class EETQLinear(nn.Module): ...@@ -185,13 +189,13 @@ class EETQLinear(nn.Module):
class Linear8bitLt(nn.Module): class Linear8bitLt(nn.Module):
def __init__( def __init__(
self, self,
weight, weight,
bias, bias,
has_fp16_weights=True, has_fp16_weights=True,
memory_efficient_backward=False, memory_efficient_backward=False,
threshold=0.0, threshold=0.0,
index=None, index=None,
): ):
super().__init__() super().__init__()
assert ( assert (
...@@ -325,7 +329,9 @@ def get_linear(weight, bias, quantize): ...@@ -325,7 +329,9 @@ def get_linear(weight, bias, quantize):
) )
if use_exllama: if use_exllama:
linear = ExllamaQuantLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize) linear = ExllamaQuantLinear(
qweight, qzeros, scales, g_idx, bias, bits, groupsize
)
else: else:
linear = QuantLinear( linear = QuantLinear(
qweight, qweight,
...@@ -533,7 +539,6 @@ try: ...@@ -533,7 +539,6 @@ try:
else: else:
dropout_layer_norm = None dropout_layer_norm = None
class FastLayerNorm(nn.LayerNorm): class FastLayerNorm(nn.LayerNorm):
def forward(self, hidden_states, residual=None): def forward(self, hidden_states, residual=None):
if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM: if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
...@@ -569,7 +574,6 @@ try: ...@@ -569,7 +574,6 @@ try:
return normed_hidden_states, residual return normed_hidden_states, residual
class FastRMSNorm(nn.Module): class FastRMSNorm(nn.Module):
def __init__(self, weight: torch.Tensor, eps: float): def __init__(self, weight: torch.Tensor, eps: float):
super().__init__() super().__init__()
...@@ -601,7 +605,11 @@ try: ...@@ -601,7 +605,11 @@ try:
return self.weight * hidden_states, residual return self.weight * hidden_states, residual
elif IS_CUDA_SYSTEM: elif IS_CUDA_SYSTEM:
# faster post attention rms norm # faster post attention rms norm
normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd( (
normed_hidden_states,
res,
*rest,
) = dropout_layer_norm.dropout_add_ln_fwd(
hidden_states, hidden_states,
residual, residual,
self.weight, self.weight,
...@@ -638,7 +646,8 @@ try: ...@@ -638,7 +646,8 @@ try:
return out, residual return out, residual
else: else:
raise ValueError( raise ValueError(
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction.") "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
)
except ImportError: except ImportError:
pass pass
...@@ -650,14 +659,12 @@ try: ...@@ -650,14 +659,12 @@ try:
elif IS_ROCM_SYSTEM: elif IS_ROCM_SYSTEM:
from vllm import pos_encoding_ops from vllm import pos_encoding_ops
def _create_inv_freq(dim, base, device): def _create_inv_freq(dim, base, device):
inv_freq = 1.0 / ( inv_freq = 1.0 / (
base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
) )
return inv_freq return inv_freq
def _get_rope_config(config): def _get_rope_config(config):
if os.getenv("ROPE_SCALING", None) is not None: if os.getenv("ROPE_SCALING", None) is not None:
rope_scaling = { rope_scaling = {
...@@ -667,7 +674,6 @@ try: ...@@ -667,7 +674,6 @@ try:
return rope_scaling return rope_scaling
return getattr(config, "rope_scaling", None) return getattr(config, "rope_scaling", None)
class PositionRotaryEmbedding(nn.Module): class PositionRotaryEmbedding(nn.Module):
def __init__(self, inv_freq, scaling_factor): def __init__(self, inv_freq, scaling_factor):
super().__init__() super().__init__()
...@@ -680,17 +686,23 @@ try: ...@@ -680,17 +686,23 @@ try:
self.scaling_factor = scaling_factor self.scaling_factor = scaling_factor
self.dynamic_args = None self.dynamic_args = None
def forward(self, query: torch.Tensor, key: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor): def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
):
# Such controlflows may add some overhead. # Such controlflows may add some overhead.
if IS_CUDA_SYSTEM: if IS_CUDA_SYSTEM:
rotary_dim = cos.shape[-1] rotary_dim = cos.shape[-1]
q1 = query[..., :rotary_dim] q1 = query[..., :rotary_dim]
q2 = query[..., rotary_dim: 2 * rotary_dim] q2 = query[..., rotary_dim : 2 * rotary_dim]
rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
k1 = key[..., :rotary_dim] k1 = key[..., :rotary_dim]
k2 = key[..., rotary_dim: 2 * rotary_dim] k2 = key[..., rotary_dim : 2 * rotary_dim]
rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
elif IS_ROCM_SYSTEM: elif IS_ROCM_SYSTEM:
...@@ -700,17 +712,11 @@ try: ...@@ -700,17 +712,11 @@ try:
head_size = query.shape[-1] head_size = query.shape[-1]
# Inplace operation, updating query and key. # Inplace operation, updating query and key.
pos_encoding_ops.rotary_embedding( pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True)
query,
key,
head_size,
cos,
sin,
True
)
else: else:
raise ValueError( raise ValueError(
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction.") "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
)
@classmethod @classmethod
def static(cls, config, dim, base, device): def static(cls, config, dim, base, device):
...@@ -732,15 +738,16 @@ try: ...@@ -732,15 +738,16 @@ try:
elif rope_scaling["type"] == "yarn": elif rope_scaling["type"] == "yarn":
return YarnPositionRotaryEmbedding( return YarnPositionRotaryEmbedding(
dim=2 * inv_freq.shape[0], dim=2 * inv_freq.shape[0],
max_position_embeddings=rope_scaling["original_max_position_embeddings"], max_position_embeddings=rope_scaling[
"original_max_position_embeddings"
],
base=10000.0, base=10000.0,
device=inv_freq.device, device=inv_freq.device,
scaling_factor=scaling_factor, scaling_factor=scaling_factor,
extrapolation_factor=1, extrapolation_factor=1,
attn_factor=1, attn_factor=1,
beta_fast=32, beta_fast=32,
beta_slow=1 beta_slow=1,
) )
else: else:
raise NotImplementedError( raise NotImplementedError(
...@@ -773,15 +780,16 @@ try: ...@@ -773,15 +780,16 @@ try:
elif rope_scaling["type"] == "yarn": elif rope_scaling["type"] == "yarn":
return YarnPositionRotaryEmbedding( return YarnPositionRotaryEmbedding(
dim=2 * inv_freq.shape[0], dim=2 * inv_freq.shape[0],
max_position_embeddings=rope_scaling["original_max_position_embeddings"], max_position_embeddings=rope_scaling[
"original_max_position_embeddings"
],
base=10000.0, base=10000.0,
device=inv_freq.device, device=inv_freq.device,
scaling_factor=scaling_factor, scaling_factor=scaling_factor,
extrapolation_factor=1, extrapolation_factor=1,
attn_factor=1, attn_factor=1,
beta_fast=32, beta_fast=32,
beta_slow=1 beta_slow=1,
) )
else: else:
raise NotImplementedError( raise NotImplementedError(
...@@ -793,9 +801,9 @@ try: ...@@ -793,9 +801,9 @@ try:
# Reset the tables if the sequence length has changed, # Reset the tables if the sequence length has changed,
# or if we're on a new device (possibly due to tracing for instance) # or if we're on a new device (possibly due to tracing for instance)
if ( if (
seqlen > self._seq_len_cached seqlen > self._seq_len_cached
or self._cos_cached.device != device or self._cos_cached.device != device
or self._cos_cached.dtype != dtype or self._cos_cached.dtype != dtype
): ):
self._seq_len_cached = seqlen self._seq_len_cached = seqlen
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
...@@ -809,7 +817,7 @@ try: ...@@ -809,7 +817,7 @@ try:
self._sin_cached = torch.sin(freqs).to(dtype) self._sin_cached = torch.sin(freqs).to(dtype)
def get_cos_sin( def get_cos_sin(
self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype
): ):
""" """
Return cos and sin for the asked position ids Return cos and sin for the asked position ids
...@@ -827,7 +835,6 @@ try: ...@@ -827,7 +835,6 @@ try:
# Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
return cos.unsqueeze(1), sin.unsqueeze(1) return cos.unsqueeze(1), sin.unsqueeze(1)
class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
inv_freq = _create_inv_freq(dim, base, device) inv_freq = _create_inv_freq(dim, base, device)
...@@ -840,14 +847,14 @@ try: ...@@ -840,14 +847,14 @@ try:
# Reset the tables if the sequence length has changed, # Reset the tables if the sequence length has changed,
# or if we're on a new device (possibly due to tracing for instance) # or if we're on a new device (possibly due to tracing for instance)
if ( if (
seqlen > self._seq_len_cached seqlen > self._seq_len_cached
or self._cos_cached.device != device or self._cos_cached.device != device
or self._cos_cached.dtype != dtype or self._cos_cached.dtype != dtype
): ):
if seqlen > self.max_position_embeddings: if seqlen > self.max_position_embeddings:
newbase = self.base * ( newbase = self.base * (
(self.scaling_factor * seqlen / self.max_position_embeddings) (self.scaling_factor * seqlen / self.max_position_embeddings)
- (self.scaling_factor - 1) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2)) ) ** (self.dim / (self.dim - 2))
self.inv_freq = _create_inv_freq( self.inv_freq = _create_inv_freq(
self.dim, newbase, self.inv_freq.device self.dim, newbase, self.inv_freq.device
...@@ -861,24 +868,28 @@ try: ...@@ -861,24 +868,28 @@ try:
self._cos_cached = torch.cos(freqs).to(dtype) self._cos_cached = torch.cos(freqs).to(dtype)
self._sin_cached = torch.sin(freqs).to(dtype) self._sin_cached = torch.sin(freqs).to(dtype)
# Inverse dim formula to find dim based on number of rotations # Inverse dim formula to find dim based on number of rotations
import math import math
def find_correction_dim(
def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): num_rotations, dim, base=10000, max_position_embeddings=2048
return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) ):
return (
dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
) / (2 * math.log(base))
# Find dim range bounds based on rotations # Find dim range bounds based on rotations
def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): def find_correction_range(
low = math.floor(find_correction_dim( low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
low_rot, dim, base, max_position_embeddings)) ):
high = math.ceil(find_correction_dim( low = math.floor(
high_rot, dim, base, max_position_embeddings)) find_correction_dim(low_rot, dim, base, max_position_embeddings)
)
high = math.ceil(
find_correction_dim(high_rot, dim, base, max_position_embeddings)
)
return max(low, 0), min(high, dim - 1) # Clamp values just in case return max(low, 0), min(high, dim - 1) # Clamp values just in case
def linear_ramp_mask(min, max, dim): def linear_ramp_mask(min, max, dim):
if min == max: if min == max:
max += 0.001 # Prevent singularity max += 0.001 # Prevent singularity
...@@ -887,16 +898,25 @@ try: ...@@ -887,16 +898,25 @@ try:
ramp_func = torch.clamp(linear_func, 0, 1) ramp_func = torch.clamp(linear_func, 0, 1)
return ramp_func return ramp_func
def get_mscale(scale=1): def get_mscale(scale=1):
if scale <= 1: if scale <= 1:
return 1.0 return 1.0
return 0.1 * math.log(scale) + 1.0 return 0.1 * math.log(scale) + 1.0
class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
def __init__(self, dim, max_position_embeddings, base, device, scaling_factor, *, extrapolation_factor, def __init__(
attn_factor, beta_fast, beta_slow): self,
dim,
max_position_embeddings,
base,
device,
scaling_factor,
*,
extrapolation_factor,
attn_factor,
beta_fast,
beta_slow,
):
inv_freq = _create_inv_freq(dim, base, device) inv_freq = _create_inv_freq(dim, base, device)
super().__init__(inv_freq, scaling_factor) super().__init__(inv_freq, scaling_factor)
self.dim = dim self.dim = dim
...@@ -906,16 +926,17 @@ try: ...@@ -906,16 +926,17 @@ try:
self.attn_factor = attn_factor self.attn_factor = attn_factor
self.beta_fast = beta_fast self.beta_fast = beta_fast
self.beta_slow = beta_slow self.beta_slow = beta_slow
self.mscale = float(get_mscale( self.mscale = float(
self.scaling_factor) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation get_mscale(self.scaling_factor) * self.attn_factor
) # Get n-d magnitude scaling corrected for interpolation
def _update_cos_sin_cache(self, dtype, device, seqlen): def _update_cos_sin_cache(self, dtype, device, seqlen):
# Reset the tables if the sequence length has changed, # Reset the tables if the sequence length has changed,
# or if we're on a new device (possibly due to tracing for instance) # or if we're on a new device (possibly due to tracing for instance)
if ( if (
seqlen > self._seq_len_cached seqlen > self._seq_len_cached
or self._cos_cached.device != device or self._cos_cached.device != device
or self._cos_cached.dtype != dtype or self._cos_cached.dtype != dtype
): ):
if seqlen > self.max_position_embeddings: if seqlen > self.max_position_embeddings:
inv_freq_extrapolation = _create_inv_freq( inv_freq_extrapolation = _create_inv_freq(
...@@ -923,15 +944,26 @@ try: ...@@ -923,15 +944,26 @@ try:
) )
freqs = 1.0 / inv_freq_extrapolation freqs = 1.0 / inv_freq_extrapolation
inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
low, high = find_correction_range(self.beta_fast, self.beta_slow, self.dim, self.base, low, high = find_correction_range(
self.max_position_embeddings) self.beta_fast,
inv_freq_mask = (1 - linear_ramp_mask(low, high, self.dim // 2).float().to( self.beta_slow,
device)) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation self.dim,
inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask self.base,
self.max_position_embeddings,
)
inv_freq_mask = (
1
- linear_ramp_mask(low, high, self.dim // 2).float().to(device)
) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation
inv_freq = (
inv_freq_interpolation * (1 - inv_freq_mask)
+ inv_freq_extrapolation * inv_freq_mask
)
self.inv_freq = inv_freq self.inv_freq = inv_freq
self.mscale = float(get_mscale( self.mscale = float(
self.scaling_factor) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation get_mscale(self.scaling_factor) * self.attn_factor
) # Get n-d magnitude scaling corrected for interpolation
self._seq_len_cached = seqlen self._seq_len_cached = seqlen
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
......
...@@ -2,6 +2,7 @@ import torch ...@@ -2,6 +2,7 @@ import torch
from dataclasses import dataclass from dataclasses import dataclass
from text_generation_server.utils.layers import TensorParallelHead, FastLinear from text_generation_server.utils.layers import TensorParallelHead, FastLinear
@dataclass @dataclass
class Output: class Output:
logits: torch.FloatTensor = None logits: torch.FloatTensor = None
...@@ -11,7 +12,9 @@ class Output: ...@@ -11,7 +12,9 @@ class Output:
class ResBlock(torch.nn.Module): class ResBlock(torch.nn.Module):
def __init__(self, config, prefix, weights): def __init__(self, config, prefix, weights):
super().__init__() super().__init__()
self.linear = FastLinear.load(config, prefix=f"{prefix}.linear", weights=weights, bias=True) self.linear = FastLinear.load(
config, prefix=f"{prefix}.linear", weights=weights, bias=True
)
self.act = torch.nn.SiLU() self.act = torch.nn.SiLU()
def forward(self, x): def forward(self, x):
...@@ -19,15 +22,13 @@ class ResBlock(torch.nn.Module): ...@@ -19,15 +22,13 @@ class ResBlock(torch.nn.Module):
class MedusaModel(torch.nn.Module): class MedusaModel(torch.nn.Module):
def __init__( def __init__(self, config, weights, lm_head):
self,
config,
weights,
lm_head
):
super().__init__() super().__init__()
self.heads = torch.nn.ModuleList( self.heads = torch.nn.ModuleList(
[MedusaHead(config, prefix=f"{i}", weights=weights) for i in range(config["medusa_num_heads"])] [
MedusaHead(config, prefix=f"{i}", weights=weights)
for i in range(config["medusa_num_heads"])
]
) )
self.lm_head = lm_head self.lm_head = lm_head
...@@ -40,9 +41,16 @@ class MedusaModel(torch.nn.Module): ...@@ -40,9 +41,16 @@ class MedusaModel(torch.nn.Module):
class MedusaHead(torch.nn.Module): class MedusaHead(torch.nn.Module):
def __init__(self, config, prefix, weights): def __init__(self, config, prefix, weights):
super().__init__() super().__init__()
self.blocks = torch.nn.ModuleList([ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) for i in range(config["medusa_num_layers"])]) self.blocks = torch.nn.ModuleList(
[
ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
for i in range(config["medusa_num_layers"])
]
)
n = len(self.blocks) n = len(self.blocks)
self.out = FastLinear.load(config, prefix=f"{prefix}.{n}", weights=weights, bias=False) self.out = FastLinear.load(
config, prefix=f"{prefix}.{n}", weights=weights, bias=False
)
def forward(self, x): def forward(self, x):
for block in self.blocks: for block in self.blocks:
......
...@@ -7,23 +7,26 @@ from vllm import attention_ops ...@@ -7,23 +7,26 @@ from vllm import attention_ops
_PARTITION_SIZE = 512 _PARTITION_SIZE = 512
def reshape_and_cache(key: torch.Tensor, value: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, def reshape_and_cache(
slots: torch.Tensor): key: torch.Tensor,
cache_ops.reshape_and_cache( value: torch.Tensor,
key, value, key_cache, value_cache, slots key_cache: torch.Tensor,
) value_cache: torch.Tensor,
slots: torch.Tensor,
):
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots)
def attention( def attention(
out: torch.Tensor, out: torch.Tensor,
query: torch.Tensor, query: torch.Tensor,
key_cache: torch.Tensor, key_cache: torch.Tensor,
value_cache: torch.Tensor, value_cache: torch.Tensor,
kv_head_mapping: torch.Tensor, kv_head_mapping: torch.Tensor,
softmax_scale: float, softmax_scale: float,
block_tables: torch.Tensor, block_tables: torch.Tensor,
input_lengths: torch.Tensor, input_lengths: torch.Tensor,
max_s: int, max_s: int,
): ):
# Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
# Copyright 2023 The vLLM team. All rights # Copyright 2023 The vLLM team. All rights
...@@ -45,9 +48,7 @@ def attention( ...@@ -45,9 +48,7 @@ def attention(
# value_cache => [num_blocks, num_heads, head_size, block_size] # value_cache => [num_blocks, num_heads, head_size, block_size]
block_size = value_cache.shape[3] block_size = value_cache.shape[3]
num_seqs, num_heads, head_size = query.shape num_seqs, num_heads, head_size = query.shape
max_num_partitions = ( max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
(max_s + _PARTITION_SIZE - 1) //
_PARTITION_SIZE)
# NOTE(woosuk): We use a simple heuristic to decide whether to use # NOTE(woosuk): We use a simple heuristic to decide whether to use
# PagedAttention V1 or V2. If the number of partitions is 1, we use # PagedAttention V1 or V2. If the number of partitions is 1, we use
# V1 to avoid the overhead of reduction. Also, if the number of # V1 to avoid the overhead of reduction. Also, if the number of
......
...@@ -38,7 +38,9 @@ def download_and_unload_peft(model_id, revision, trust_remote_code): ...@@ -38,7 +38,9 @@ def download_and_unload_peft(model_id, revision, trust_remote_code):
os.makedirs(model_id, exist_ok=True) os.makedirs(model_id, exist_ok=True)
cache_dir = model_id cache_dir = model_id
logger.info(f"Saving the newly created merged model to {cache_dir}") logger.info(f"Saving the newly created merged model to {cache_dir}")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=trust_remote_code) tokenizer = AutoTokenizer.from_pretrained(
base_model_id, trust_remote_code=trust_remote_code
)
model.save_pretrained(cache_dir, safe_serialization=True) model.save_pretrained(cache_dir, safe_serialization=True)
model.config.save_pretrained(cache_dir) model.config.save_pretrained(cache_dir)
tokenizer.save_pretrained(cache_dir) tokenizer.save_pretrained(cache_dir)
SPECULATE = None SPECULATE = None
def get_speculate() -> int: def get_speculate() -> int:
global SPECULATE global SPECULATE
return SPECULATE return SPECULATE
def set_speculate(speculate: int): def set_speculate(speculate: int):
global SPECULATE global SPECULATE
SPECULATE = speculate SPECULATE = speculate
...@@ -16,6 +16,7 @@ from text_generation_server.utils.logits_process import ( ...@@ -16,6 +16,7 @@ from text_generation_server.utils.logits_process import (
from text_generation_server.utils.watermark import WatermarkLogitsProcessor from text_generation_server.utils.watermark import WatermarkLogitsProcessor
from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor
class NextTokenChooser: class NextTokenChooser:
def __init__( def __init__(
self, self,
...@@ -145,21 +146,31 @@ class StoppingCriteria: ...@@ -145,21 +146,31 @@ class StoppingCriteria:
pb.ignore_eos_token, pb.ignore_eos_token,
) )
def create_n_gram_speculation(input_ids: torch.Tensor, next_ids: torch.Tensor, accepted_ids: torch.Tensor, speculate: int, verbose: bool):
def create_n_gram_speculation(
input_ids: torch.Tensor,
next_ids: torch.Tensor,
accepted_ids: torch.Tensor,
speculate: int,
verbose: bool,
):
# Very trivial approach, find first match in the string. # Very trivial approach, find first match in the string.
# This is much less refined than actual n-gram but seems to work # This is much less refined than actual n-gram but seems to work
# relatively OK in grounded mode and is by far much faster with # relatively OK in grounded mode and is by far much faster with
# much less worst case complexity as everything happens on device. # much less worst case complexity as everything happens on device.
B = accepted_ids.shape[0] B = accepted_ids.shape[0]
device = input_ids.device device = input_ids.device
seeds = next_ids[accepted_ids.cumsum(dim=-1) -1 ] seeds = next_ids[accepted_ids.cumsum(dim=-1) - 1]
indices = (input_ids == seeds.unsqueeze(-1)).max(dim=1).indices + 1 indices = (input_ids == seeds.unsqueeze(-1)).max(dim=1).indices + 1
all_indices = indices.unsqueeze(-1).expand(B, speculate) + torch.arange(speculate, device=device) all_indices = indices.unsqueeze(-1).expand(B, speculate) + torch.arange(
speculate, device=device
)
all_indices = torch.clamp(all_indices, max=input_ids.shape[1] - 1) all_indices = torch.clamp(all_indices, max=input_ids.shape[1] - 1)
speculative_ids = input_ids.gather(dim=-1, index=all_indices) speculative_ids = input_ids.gather(dim=-1, index=all_indices)
return speculative_ids return speculative_ids
class HeterogeneousNextTokenChooser: class HeterogeneousNextTokenChooser:
def __init__( def __init__(
self, self,
...@@ -228,7 +239,15 @@ class HeterogeneousNextTokenChooser: ...@@ -228,7 +239,15 @@ class HeterogeneousNextTokenChooser:
self.dtype = dtype self.dtype = dtype
self.device = device self.device = device
def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor, speculate: int, speculated_ids: Optional[torch.Tensor] = None, speculative_scores: Optional[torch.Tensor] = None, verbose=False): def __call__(
self,
input_ids: torch.Tensor,
scores: torch.Tensor,
speculate: int,
speculated_ids: Optional[torch.Tensor] = None,
speculative_scores: Optional[torch.Tensor] = None,
verbose=False,
):
if speculated_ids is not None: if speculated_ids is not None:
B = scores.shape[0] // (speculated_ids.shape[1] + 1) B = scores.shape[0] // (speculated_ids.shape[1] + 1)
S = speculated_ids.shape[1] + 1 S = speculated_ids.shape[1] + 1
...@@ -249,12 +268,11 @@ class HeterogeneousNextTokenChooser: ...@@ -249,12 +268,11 @@ class HeterogeneousNextTokenChooser:
for warper in self.warpers: for warper in self.warpers:
_scores = warper(input_ids, _scores) _scores = warper(input_ids, _scores)
_next_ids = self.choice(_scores) _next_ids = self.choice(_scores)
scores[:, j] = _scores scores[:, j] = _scores
next_ids[:, j] = _next_ids next_ids[:, j] = _next_ids
next_ids = next_ids.view(B*S) next_ids = next_ids.view(B * S)
scores = scores.view( B* S, -1) scores = scores.view(B * S, -1)
if speculated_ids is not None: if speculated_ids is not None:
accepted_ids = [] accepted_ids = []
...@@ -262,7 +280,7 @@ class HeterogeneousNextTokenChooser: ...@@ -262,7 +280,7 @@ class HeterogeneousNextTokenChooser:
S = speculated_ids.shape[1] + 1 S = speculated_ids.shape[1] + 1
indices = [] indices = []
for i in range(B): for i in range(B):
_next_ids = next_ids[i*S: (i + 1)*S] _next_ids = next_ids[i * S : (i + 1) * S]
_speculated_ids = speculated_ids[i] _speculated_ids = speculated_ids[i]
validate_speculative = _next_ids[:-1] == _speculated_ids validate_speculative = _next_ids[:-1] == _speculated_ids
index = i * S index = i * S
...@@ -278,7 +296,9 @@ class HeterogeneousNextTokenChooser: ...@@ -278,7 +296,9 @@ class HeterogeneousNextTokenChooser:
break break
accepted_ids.append(accepted) accepted_ids.append(accepted)
accepted_ids = torch.tensor(accepted_ids, device=input_ids.device, dtype=input_ids.dtype) accepted_ids = torch.tensor(
accepted_ids, device=input_ids.device, dtype=input_ids.dtype
)
next_ids = next_ids[indices] next_ids = next_ids[indices]
scores = scores[indices] scores = scores[indices]
indices = torch.arange(B, device=input_ids.device) * S indices = torch.arange(B, device=input_ids.device) * S
...@@ -296,7 +316,9 @@ class HeterogeneousNextTokenChooser: ...@@ -296,7 +316,9 @@ class HeterogeneousNextTokenChooser:
speculative_ids = Greedy()(speculative_scores) speculative_ids = Greedy()(speculative_scores)
else: else:
# n-gram # n-gram
speculative_ids = create_n_gram_speculation(input_ids, next_ids, accepted_ids, speculate, verbose) speculative_ids = create_n_gram_speculation(
input_ids, next_ids, accepted_ids, speculate, verbose
)
else: else:
speculative_ids = None speculative_ids = None
......
...@@ -16,7 +16,7 @@ class Weights: ...@@ -16,7 +16,7 @@ class Weights:
dtype, dtype,
process_group, process_group,
aliases: Optional[Dict[str, List[str]]] = None, aliases: Optional[Dict[str, List[str]]] = None,
prefix: Optional[str] = None prefix: Optional[str] = None,
): ):
routing = {} routing = {}
for filename in filenames: for filename in filenames:
...@@ -213,7 +213,8 @@ class Weights: ...@@ -213,7 +213,8 @@ class Weights:
bits, groupsize = self._get_gptq_params() bits, groupsize = self._get_gptq_params()
from text_generation_server.utils.layers import HAS_EXLLAMA from text_generation_server.utils.layers import HAS_EXLLAMA
use_exllama = bits==4 and HAS_EXLLAMA and quantize == "gptq"
use_exllama = bits == 4 and HAS_EXLLAMA and quantize == "gptq"
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
else: else:
w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes] w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
...@@ -283,7 +284,7 @@ class Weights: ...@@ -283,7 +284,7 @@ class Weights:
if use_exllama: if use_exllama:
qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0) qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
scales = self.get_sharded(f"{prefix}.scales", dim=0) scales = self.get_sharded(f"{prefix}.scales", dim=0)
g_idx = self.get_sharded(f"{prefix}.g_idx", dim= 0) g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
g_idx = g_idx - g_idx[0] g_idx = g_idx - g_idx[0]
else: else:
# The triton kernel reorders the scales/zero points instead of the weight/activation. # The triton kernel reorders the scales/zero points instead of the weight/activation.
......
...@@ -21,14 +21,14 @@ def main(): ...@@ -21,14 +21,14 @@ def main():
block = [] block = []
for line in lines: for line in lines:
if line.startswith(" -") or line.startswith(" -"): if line.startswith(" -") or line.startswith(" -"):
rendered_block = '\n'.join(block) rendered_block = "\n".join(block)
if header: if header:
final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
else: else:
final_doc += f"```shell\n{rendered_block}\n```\n" final_doc += f"```shell\n{rendered_block}\n```\n"
block = [] block = []
tokens = line.split("<") tokens = line.split("<")
if len(tokens)>1: if len(tokens) > 1:
header = tokens[-1][:-1] header = tokens[-1][:-1]
else: else:
header = line.split("--")[-1] header = line.split("--")[-1]
...@@ -36,7 +36,7 @@ def main(): ...@@ -36,7 +36,7 @@ def main():
block.append(line) block.append(line)
rendered_block = '\n'.join(block) rendered_block = "\n".join(block)
final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
block = [] block = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment