Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
86a677de
Unverified
Commit
86a677de
authored
Aug 29, 2024
by
Dipika Sikka
Committed by
GitHub
Aug 29, 2024
Browse files
[misc] update tpu int8 to use new vLLM Parameters (#7973)
parent
d78789ac
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
13 additions
and
11 deletions
+13
-11
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+2
-1
vllm/model_executor/layers/quantization/tpu_int8.py
vllm/model_executor/layers/quantization/tpu_int8.py
+11
-10
No files found.
vllm/model_executor/layers/linear.py
View file @
86a677de
...
...
@@ -23,7 +23,8 @@ logger = init_logger(__name__)
WEIGHT_LOADER_V2_SUPPORTED
=
[
"CompressedTensorsLinearMethod"
,
"AWQMarlinLinearMethod"
,
"AWQLinearMethod"
,
"GPTQMarlinLinearMethod"
,
"Fp8LinearMethod"
,
"MarlinLinearMethod"
,
"QQQLinearMethod"
,
"GPTQMarlin24LinearMethod"
"MarlinLinearMethod"
,
"QQQLinearMethod"
,
"GPTQMarlin24LinearMethod"
,
"TPUInt8LinearMethod"
]
...
...
vllm/model_executor/layers/quantization/tpu_int8.py
View file @
86a677de
...
...
@@ -7,7 +7,7 @@ from torch.nn.parameter import Parameter
from
vllm.model_executor.layers.linear
import
LinearBase
,
LinearMethodBase
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.
utils
import
set_weight_attrs
from
vllm.model_executor.
parameter
import
ModelWeightParameter
ACTIVATION_SCHEMES
=
[
"none"
]
...
...
@@ -64,16 +64,16 @@ class TPUInt8LinearMethod(LinearMethodBase):
output_partition_sizes
:
List
[
int
],
input_size
:
int
,
output_size
:
int
,
params_dtype
:
torch
.
dtype
,
**
extra_weight_attrs
):
weight
=
Parameter
(
torch
.
empty
(
sum
(
output_partition_sizes
),
weight_loader
=
extra_weight_attrs
.
get
(
"weight_loader"
)
weight
=
ModelWeightParameter
(
data
=
torch
.
empty
(
sum
(
output_partition_sizes
),
input_size_per_partition
,
dtype
=
params_dtype
),
requires_grad
=
False
)
input_dim
=
1
,
output_dim
=
0
,
weight_loader
=
weight_loader
)
layer
.
register_parameter
(
"weight"
,
weight
)
set_weight_attrs
(
weight
,
{
**
extra_weight_attrs
,
"input_dim"
:
1
,
"output_dim"
:
0
,
})
def
_quantize_weight
(
self
,
weight
:
torch
.
Tensor
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
...
...
@@ -92,6 +92,7 @@ class TPUInt8LinearMethod(LinearMethodBase):
return
qweight
,
qscale
def
process_weights_after_loading
(
self
,
layer
:
Module
)
->
None
:
layer
.
weight
=
Parameter
(
layer
.
weight
.
data
,
requires_grad
=
False
)
device
=
layer
.
weight
.
device
qweight
,
qscale
=
self
.
_quantize_weight
(
layer
.
weight
)
qweight
=
qweight
.
to
(
device
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment