Unverified commit bcaa8a36, authored by Casper and committed by GitHub

v0.2.0 (#330)


Co-authored-by: jinz2014 <7799920+jinz2014@users.noreply.github.com>
Co-authored-by: Jin Z <5zj@cousteau.ftpn.ornl.gov>
parent c69d3b65
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'mistralai/Mixtral-8x7B-Instruct-v0.1'
quant_path = 'mixtral-instruct-awq'
modules_to_not_convert = ["gate"]
quant_config = {
    "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM",
    "modules_to_not_convert": modules_to_not_convert
}
# Load model
# NOTE: pass safetensors=True to load safetensors
model = AutoAWQForCausalLM.from_pretrained(
    model_path, safetensors=True, **{"low_cpu_mem_usage": True}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize
model.quantize(
    tokenizer,
    quant_config=quant_config,
    modules_to_not_convert=modules_to_not_convert
)
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
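# Note (a sketch, not part of the committed example): once saved, the quantized
# checkpoint can be reloaded for a quick smoke test with from_quantized(), mirroring
# the generation example further down. Assumes a CUDA device and a Mistral-style
# [INST] prompt; set fuse_layers=False to skip fused modules.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

tokens = tokenizer(
    "[INST] What does AWQ quantization change about a model? [/INST]",
    return_tensors="pt"
).input_ids.to("cuda")

model.generate(tokens, streamer=streamer, max_new_tokens=64)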
from datasets import load_dataset
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'lmsys/vicuna-7b-v1.5'
quant_path = 'vicuna-7b-v1.5-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
# Load model
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Define data loading methods
def load_dolly():
    data = load_dataset('databricks/databricks-dolly-15k', split="train")

    # concatenate instruction, context and response into a single text field
    def concatenate_data(x):
        return {"text": x['instruction'] + '\n' + x['context'] + '\n' + x['response']}

    concatenated = data.map(concatenate_data)
    return [text for text in concatenated["text"]]

def load_wikitext():
    data = load_dataset('wikitext', 'wikitext-2-raw-v1', split="train")
    return [text for text in data["text"] if text.strip() != '' and len(text.split(' ')) > 20]
# Quantize
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_wikitext())
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
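# Note (a sketch, not part of the committed example): quantize() accepts any list of
# strings as calib_data, so the Dolly loader defined above can be swapped in for
# WikiText when the target domain is instruction-following, e.g.:
# model.quantize(tokenizer, quant_config=quant_config, calib_data=load_dolly())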
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
-model_path = 'lmsys/vicuna-7b-v1.5'
-quant_path = 'vicuna-7b-v1.5-awq'
+model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
+quant_path = 'mistral-instruct-v0.2-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
# Load model
# NOTE: pass safetensors=True to load safetensors
model = AutoAWQForCausalLM.from_pretrained(
    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
......
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer
quant_path = "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"
# Load model
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Convert prompt to tokens
prompt_template = """\
<|im_start|>system
{system}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""
system = "You are a helpful assistant that answers precisely."
prompt = "You're standing on the surface of the Earth. "\
"You walk one mile south, one mile west and one mile north. "\
"You end up exactly where you started. Where are you?"
tokens = tokenizer(
    prompt_template.format(system=system, prompt=prompt),
    return_tensors='pt'
).input_ids.to("cuda")
# Generate output
generation_output = model.generate(
    tokens,
    streamer=streamer,
    max_new_tokens=64
)
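# Alternative prompt construction (a sketch, not part of the committed example): a
# tokenizer that ships a chat template can build the same ChatML prompt with
# apply_chat_template instead of a hand-written template string:
messages = [
    {"role": "system", "content": system},
    {"role": "user", "content": prompt},
]
tokens = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to("cuda")  # move the ids to the device the model runs on
model.generate(tokens, streamer=streamer, max_new_tokens=64)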
......@@ -10,11 +10,10 @@ from peft import get_peft_model, LoraConfig, TaskType
def prepare_split(tokenizer):
    data = datasets.load_dataset("mhenrichsen/alpaca_2k_test", split="train")
-    prompt_template = "<s>[INST] {system} {prompt} [/INST] {output}</s>"
+    prompt_template = "<s>[INST] {prompt} [/INST] {output}</s>"

    def format_prompt(x):
        return prompt_template.format(
-            system="",
            prompt=x["instruction"],
            output=x["output"]
        )
......@@ -26,7 +25,7 @@ def prepare_split(tokenizer):
    return data

-model_path = "ybelkada/opt-125m-awq"
+model_path = "TheBloke/Mistral-7B-v0.1-AWQ"

# Load model
model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=False)
......@@ -56,7 +55,6 @@ training_arguments = TrainingArguments(
optim="adamw_torch",
num_train_epochs=1,
learning_rate=1e-4,
# fp16=True,
evaluation_strategy="no",
save_strategy="epoch",
save_steps=100,
......
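# For context around this hunk (a sketch with hypothetical LoRA settings, not the
# values used in the training example): the AWQ model loaded via from_quantized()
# is typically wrapped with a PEFT adapter before the TrainingArguments shown above
# are passed to a Trainer, using the peft imports from the first hunk.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                                   # hypothetical rank
    lora_alpha=16,                         # hypothetical scaling factor
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],   # hypothetical target projections
)
peft_model = get_peft_model(model.model, lora_config)  # assumes the HF module is exposed as model.model
peft_model.print_trainable_parameters()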
site_name: AutoAWQ
repo_name: casper-hansen/AutoAWQ
repo_url: https://github.com/casper-hansen/AutoAWQ
nav:
- index.md
- Examples: examples.md
- Reference:
  - reference/index.md
markdown_extensions:
  toc:
    permalink: true
  markdown.extensions.codehilite:
    guess_lang: false
  admonition: null
  codehilite: null
  extra: null
  pymdownx.superfences:
    custom_fences:
    - name: mermaid
      class: mermaid
      format: !!python/name:pymdownx.superfences.fence_code_format ''
  pymdownx.tabbed:
    alternate_style: true
  pymdownx.tilde: null
  attr_list: null
  md_in_html: null
plugins:
  search: null
  mkdocstrings:
    handlers:
      python:
        paths: [awq]
        options:
          extensions:
          - griffe_typingdoc
          show_root_heading: true
          show_if_no_docstring: true
          inherited_members: true
          members_order: source
          separate_signature: true
          unwrap_annotated: true
          filters:
          - '!^_'
          merge_init_into_class: true
          docstring_section_style: spacy
          signature_crossrefs: true
          show_symbol_type_heading: true
          show_symbol_type_toc: true
theme:
  name: material
  palette:
  - media: '(prefers-color-scheme: light)'
    scheme: default
    primary: teal
    accent: amber
    toggle:
      icon: material/lightbulb
      name: Switch to dark mode
  - media: '(prefers-color-scheme: dark)'
    scheme: slate
    primary: teal
    accent: amber
    toggle:
      icon: material/lightbulb-outline
      name: Switch to light mode
  features:
  - search.suggest
  - search.highlight
  - content.tabs.link
  - navigation.indexes
  - content.tooltips
  - navigation.path
  - content.code.annotate
  - content.code.copy
  - content.code.select
  - navigation.tabs
  icon:
    repo: fontawesome/brands/github-alt
......@@ -31,7 +31,7 @@ def get_kernels_whl_url(
return f"https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v{release_version}/autoawq_kernels-{release_version}+{gpu_system_version}-cp{python_version}-cp{python_version}-{platform}_{architecture}.whl"
AUTOAWQ_VERSION = "0.1.8"
AUTOAWQ_VERSION = "0.2.0"
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
......@@ -90,6 +90,7 @@ requirements = [
"tokenizers>=0.12.1",
"accelerate",
"datasets",
"zstandard",
]
try:
......@@ -101,9 +102,9 @@ except importlib.metadata.PackageNotFoundError:
# kernels can be downloaded from pypi for cuda+121 only
# for everything else, we need to download the wheels from github
if not KERNELS_INSTALLED and (CUDA_VERSION or ROCM_VERSION):
-    if CUDA_VERSION.startswith("12"):
+    if CUDA_VERSION and CUDA_VERSION.startswith("12"):
        requirements.append("autoawq-kernels")
-    elif CUDA_VERSION.startswith("11") or ROCM_VERSION in ["561", "571"]:
+    elif CUDA_VERSION and CUDA_VERSION.startswith("11") or ROCM_VERSION in ["561", "571"]:
        gpu_system_version = (
            f"cu{CUDA_VERSION}" if CUDA_VERSION else f"rocm{ROCM_VERSION}"
        )
......@@ -130,6 +131,7 @@ setup(
    install_requires=requirements,
    extras_require={
        "eval": ["lm_eval>=0.4.0", "tabulate", "protobuf", "evaluate", "scipy"],
+        "dev": ["black", "mkdocstrings-python", "mkdocs-material", "griffe-typingdoc"]
    },
    **common_setup_kwargs,
)
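# Illustration (hypothetical values, not part of the diff): for a CUDA 11.8 wheel on
# CPython 3.10 / Linux x86_64, the f-string above resolves to
# https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.2.0/autoawq_kernels-0.2.0+cu118-cp310-cp310-linux_x86_64.whl
# The keyword names below are inferred from that f-string and may differ from the real
# signature of get_kernels_whl_url().
wheel_url = get_kernels_whl_url(
    release_version=AUTOAWQ_VERSION,
    gpu_system_version="cu118",
    python_version="310",
    platform="linux",
    architecture="x86_64",
)
requirements.append(f"autoawq-kernels@{wheel_url}")  # direct-reference requirement, as a sketch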