Unverified commit cc796b13 authored by Woosuk Kwon, committed by GitHub

Convert before transpose (#1073)

parent f029ef94
...
@@ -43,8 +43,8 @@ from vllm.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding)
 from vllm.model_executor.quantization_utils import QuantizationConfig
 from vllm.model_executor.weight_utils import (
-    load_tensor_parallel_weights, load_padded_tensor_parallel_vocab,
-    hf_model_weights_iterator)
+    convert_pyslice_to_tensor, hf_model_weights_iterator,
+    load_tensor_parallel_weights, load_padded_tensor_parallel_vocab)
 from vllm.sequence import SamplerOutput
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -337,6 +337,7 @@ class LlamaForCausalLM(nn.Module):
                 is_packed = self.quant_config.is_packed(name)
                 is_transposed = self.quant_config.is_transposed(name)
             if is_transposed:
+                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
                 loaded_weight = loaded_weight.T
 
             is_attention_weight = False
...
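Context for the fix, as a sketch rather than a verbatim copy of vLLM's helper: when loading safetensors checkpoints, `hf_model_weights_iterator` can yield a lazy slice object instead of a real `torch.Tensor`, and such a lazy slice supports full-slice indexing but not tensor operations like `.T`. The function body below is a minimal sketch of what `convert_pyslice_to_tensor` plausibly does under that assumption; the exact implementation in `vllm/model_executor/weight_utils.py` may differ.

    import torch

    def convert_pyslice_to_tensor(x) -> torch.Tensor:
        # Minimal sketch, assuming the safetensors lazy-slice interface:
        # the object supports a full read via `x[:]` but not torch ops
        # such as `.T`, so it must be materialized before transposing.
        if isinstance(x, torch.Tensor):
            return x  # already a real tensor; nothing to do
        return x[:]  # full-slice read materializes a torch.Tensor

Hence the commit title: transposing the lazy slice directly would fail (no `.T` on the slice object), so the weight is converted to a tensor first and transposed after.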