chenpangpang/transformers · commit 7c8dd88d (unverified)
Authored Jan 15, 2024 by Marc Sun; committed via GitHub Jan 15, 2024
Parent: 366c0327

[GPTQ] Fix test (#28018)

* fix test
* reduce length
* smaller model
Showing 1 changed file with 13 additions and 14 deletions:
tests/quantization/gptq/test_gptq.py (+13, -14)
tests/quantization/gptq/test_gptq.py (view file @ 7c8dd88d)

@@ -217,7 +217,9 @@ class GPTQTest(unittest.TestCase):
         with tempfile.TemporaryDirectory() as tmpdirname:
             self.quantized_model.save_pretrained(tmpdirname)
             if not self.use_exllama:
-                quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname).to(0)
+                quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
+                    tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4)
+                ).to(0)
                 self.check_quantized_layers_type(quantized_model_from_saved, "cuda-old")
             else:
                 # we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
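The first hunk makes the reload explicit: instead of relying on the quantization settings serialized into the checkpoint, the test now passes a fresh GPTQConfig at load time so the older CUDA ("cuda-old") kernels are selected. A minimal sketch of the same round-trip outside the test harness, assuming a CUDA GPU and a GPTQ backend (optimum + auto-gptq) installed; the checkpoint id is borrowed from the test classes later in this diff, not from GPTQTest itself:

    import tempfile

    from transformers import AutoModelForCausalLM, GPTQConfig

    # Load an already-quantized GPTQ checkpoint, then round-trip it through
    # save_pretrained/from_pretrained the way the fixed test does.
    model_id = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"  # assumption: any GPTQ checkpoint works here
    quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"": 0})

    with tempfile.TemporaryDirectory() as tmpdirname:
        quantized_model.save_pretrained(tmpdirname)
        # An explicit GPTQConfig at load time pins the kernel choice to the
        # non-exllama path; .to(0) then moves the module onto the first GPU.
        reloaded = AutoModelForCausalLM.from_pretrained(
            tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4)
        ).to(0)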
@@ -242,12 +244,11 @@ class GPTQTest(unittest.TestCase):
         with tempfile.TemporaryDirectory() as tmpdirname:
             self.quantized_model.save_pretrained(tmpdirname)
             if not self.use_exllama:
-                self.assertEqual(self.quantized_model.config.quantization_config.use_exllama, False)
+                self.check_quantized_layers_type(self.quantized_model, "cuda-old")
                 # we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
                 quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                     tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map={"": 0}
                 )
-                self.assertEqual(quantized_model_from_saved.config.quantization_config.use_exllama, True)
                 self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits)
                 self.check_quantized_layers_type(quantized_model_from_saved, "exllama")
                 self.check_inference_correctness(quantized_model_from_saved)
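The exllama branch keeps the constraint stated in the comment: the reloaded weights must be placed directly on the GPU via device_map, otherwise the exllama kernels cannot be initialized. A hedged sketch of that load path against a hub checkpoint (the checkpoint id is again borrowed from later in this diff; the assertions mirror the test's checks):

    from transformers import AutoModelForCausalLM, GPTQConfig

    # The exllama kernels are initialized against weights that already live on
    # the GPU, so the model is mapped there at load time (device_map={"": 0})
    # rather than moved afterwards with .to(0).
    model = AutoModelForCausalLM.from_pretrained(
        "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        quantization_config=GPTQConfig(use_exllama=True, bits=4),
        device_map={"": 0},
    )
    assert model.config.quantization_config.use_exllama
    assert model.config.quantization_config.bits == 4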
@@ -279,10 +280,10 @@ class GPTQTestActOrderExllama(unittest.TestCase):
     """

     EXPECTED_OUTPUTS = set()
-    EXPECTED_OUTPUTS.add("Hello my name is Katie and I am a 20 year")
-    model_name = "hf-internal-testing/Llama-2-7B-GPTQ"
+    EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
     # 4bit + act_order + 128g
-    revision = "gptq-4bit-128g-actorder_True"
-    input_text = "Hello my name is"
+    model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
+    input_text = "Hello, how are you ?"

     @classmethod
     def setUpClass(cls):
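This is the "smaller model" part of the commit message: the 7B Llama-2 GPTQ checkpoint (loaded from a dedicated revision) is replaced by a TinyLlama one, and the expected generation is updated to match. The constants feed check_inference_correctness, whose body is not part of this diff; under the assumption that it generates from input_text and compares the decoded text against EXPECTED_OUTPUTS, the check looks roughly like this (max_new_tokens is illustrative):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
    input_text = "Hello, how are you ?"
    expected = "Hello, how are you ? I'm doing good, thanks for asking."

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.float16, device_map={"": 0}
    )

    inp = tokenizer(input_text, return_tensors="pt").to(0)
    out = model.generate(**inp, max_new_tokens=20)
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    # With greedy decoding the continuation should match the expected string;
    # kernel choice and decoding settings can change the exact tokens.
    print(decoded)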
@@ -292,7 +293,6 @@ class GPTQTestActOrderExllama(unittest.TestCase):
         cls.quantization_config = GPTQConfig(bits=4, max_input_length=4028)
         cls.quantized_model = AutoModelForCausalLM.from_pretrained(
             cls.model_name,
-            revision=cls.revision,
             torch_dtype=torch.float16,
             device_map={"": 0},
             quantization_config=cls.quantization_config,
@@ -336,7 +336,7 @@ class GPTQTestActOrderExllama(unittest.TestCase):
             self.quantized_model.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
             self.assertTrue("temp_state buffer is too small" in str(cm.exception))

-        prompt = "I am in Paris and" * 500
+        prompt = "I am in Paris and"
         inp = self.tokenizer(prompt, return_tensors="pt").to(0)
         self.assertTrue(inp["input_ids"].shape[1] < 4028)
         self.quantized_model.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
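This is the "reduce length" part of the fix: the under-the-limit prompt no longer needs 500 repetitions, a single "I am in Paris and" already exercises the short path. Together with the max_input_length=4028 config from setUpClass, the test expects over-long inputs to overflow the exllama temp buffers and short ones to generate normally. A hedged sketch of that behavior (the repeat count for the long prompt is illustrative, as that line is outside this hunk):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

    model_id = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map={"": 0},
        quantization_config=GPTQConfig(bits=4, max_input_length=4028),
    )

    # Past the cap: exllama's temporary buffers are sized for max_input_length,
    # so generation should fail with "temp_state buffer is too small".
    inp = tokenizer("I am in Paris and" * 1000, return_tensors="pt").to(0)
    try:
        model.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
    except RuntimeError as e:
        assert "temp_state buffer is too small" in str(e)

    # Under the cap: generation runs normally.
    inp = tokenizer("I am in Paris and", return_tensors="pt").to(0)
    assert inp["input_ids"].shape[1] < 4028
    model.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)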
@@ -355,10 +355,10 @@ class GPTQTestExllamaV2(unittest.TestCase):
     """

     EXPECTED_OUTPUTS = set()
-    EXPECTED_OUTPUTS.add("Hello my name is Katie and I am a 20 year")
-    model_name = "hf-internal-testing/Llama-2-7B-GPTQ"
+    EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
     # 4bit + act_order + 128g
-    revision = "gptq-4bit-128g-actorder_True"
-    input_text = "Hello my name is"
+    model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
+    input_text = "Hello, how are you ?"

     @classmethod
     def setUpClass(cls):
@@ -368,7 +368,6 @@ class GPTQTestExllamaV2(unittest.TestCase):
         cls.quantization_config = GPTQConfig(bits=4, exllama_config={"version": 2})
         cls.quantized_model = AutoModelForCausalLM.from_pretrained(
             cls.model_name,
-            revision=cls.revision,
             torch_dtype=torch.float16,
             device_map={"": 0},
             quantization_config=cls.quantization_config,
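As the last hunk shows, the ExllamaV2 class differs from the act-order one only in its GPTQConfig: exllama_config={"version": 2} selects the v2 kernels instead of the default exllama version. A minimal load sketch, assuming a CUDA GPU and the exllamav2 backend available:

    import torch
    from transformers import AutoModelForCausalLM, GPTQConfig

    # Selecting the exllamav2 kernels is purely a config switch: version 2
    # in exllama_config. The model still has to be mapped straight to the GPU.
    model = AutoModelForCausalLM.from_pretrained(
        "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        torch_dtype=torch.float16,
        device_map={"": 0},
        quantization_config=GPTQConfig(bits=4, exllama_config={"version": 2}),
    )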