Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
23e5e50f
Unverified
Commit
23e5e50f
authored
Dec 22, 2024
by
Lianmin Zheng
Committed by
GitHub
Dec 22, 2024
Browse files
Fix gemlite import (#2553)
parent
25e5d589
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
39 deletions
+11
-39
python/sglang/bench_offline_throughput.py
python/sglang/bench_offline_throughput.py
+0
-12
python/sglang/bench_one_batch.py
python/sglang/bench_one_batch.py
+0
-12
python/sglang/srt/layers/torchao_utils.py
python/sglang/srt/layers/torchao_utils.py
+11
-15
No files found.
python/sglang/bench_offline_throughput.py
View file @
23e5e50f
...
...
@@ -322,18 +322,6 @@ def throughput_test(
)
time
.
sleep
(
0.5
)
try
:
import
os
import
pwd
from
gemlite.core
import
GemLiteLinearTriton
GemLiteLinearTriton
.
cache_config
(
f
"/tmp/
{
pwd
.
getpwuid
(
os
.
getuid
()).
pw_gecos
}
_gemlite.json"
)
except
ImportError
:
pass
logging
.
info
(
"
\n
Benchmark..."
)
result
=
throughput_test_once
(
backend_name
=
bench_args
.
backend
,
...
...
python/sglang/bench_one_batch.py
View file @
23e5e50f
...
...
@@ -386,18 +386,6 @@ def latency_test(
server_args
.
device
,
)
try
:
import
os
import
pwd
from
gemlite.core
import
GemLiteLinearTriton
GemLiteLinearTriton
.
cache_config
(
f
"/tmp/
{
pwd
.
getpwuid
(
os
.
getuid
()).
pw_gecos
}
_gemlite.json"
)
except
ImportError
:
pass
rank_print
(
"Benchmark ..."
)
# Run the sweep
...
...
python/sglang/srt/layers/torchao_utils.py
View file @
23e5e50f
...
...
@@ -2,8 +2,14 @@
Common utilities for torchao.
"""
import
logging
import
os
import
pwd
import
torch
logger
=
logging
.
getLogger
(
__name__
)
def
apply_torchao_config_to_model
(
model
:
torch
.
nn
.
Module
,
torchao_config
:
str
,
filter_fn
=
None
...
...
@@ -50,27 +56,17 @@ def apply_torchao_config_to_model(
elif
"gemlite"
in
torchao_config
:
# gemlite-<packing_bitwidth>-<bit_width>-<group_size> or
# gemlite-<bit_width>-<group_size> (packing_bitwidth defaults to 32)
import
os
import
pwd
import
gemlite
from
gemlite.core
import
GemLiteLinearTriton
,
set_autotune
try
:
from
torchao.quantization
import
gemlite_uintx_weight_only
except
:
print
(
f
"import `gemlite_uintx_weight_only` failed, please use torchao nightly to use gemlite quantization"
)
return
model
from
gemlite.core
import
GemLiteLinearTriton
from
torchao.quantization
import
gemlite_uintx_weight_only
_quant_args
=
torchao_config
.
split
(
"-"
)
bit_width
=
int
(
_quant_args
[
-
2
])
group_size
=
None
if
_quant_args
[
-
1
]
==
"None"
else
int
(
_quant_args
[
-
1
])
try
:
packing_bitwidth
=
int
(
_quant_args
[
-
3
])
except
:
# if only 2 inputs found, use default value
except
(
ValueError
,
IndexError
)
:
# if only 2 inputs found
or conversion fails
, use default value
packing_bitwidth
=
32
quantize_
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment