Unverified commit bcaa8a36, authored by Casper and committed by GitHub

v0.2.0 (#330)


Co-authored-by: jinz2014 <7799920+jinz2014@users.noreply.github.com>
Co-authored-by: Jin Z <5zj@cousteau.ftpn.ornl.gov>
parent c69d3b65
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'mistralai/Mixtral-8x7B-Instruct-v0.1'
quant_path = 'mixtral-instruct-awq'
modules_to_not_convert = ["gate"]
quant_config = {
    "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM",
    "modules_to_not_convert": modules_to_not_convert
}
# Load model
# NOTE: pass safetensors=True to load safetensors
model = AutoAWQForCausalLM.from_pretrained(
    model_path, safetensors=True, **{"low_cpu_mem_usage": True}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize
model.quantize(
    tokenizer,
    quant_config=quant_config,
    modules_to_not_convert=modules_to_not_convert
)
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
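# Note (a sketch, not part of the committed example): once saved, the quantized
# checkpoint can be reloaded for a quick smoke test with from_quantized(), mirroring
# the generation example further down. Assumes a CUDA device and a Mistral-style
# [INST] prompt; set fuse_layers=False to skip fused modules.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

tokens = tokenizer(
    "[INST] What does AWQ quantization change about a model? [/INST]",
    return_tensors="pt"
).input_ids.to("cuda")

model.generate(tokens, streamer=streamer, max_new_tokens=64)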
from datasets import load_dataset
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'lmsys/vicuna-7b-v1.5'
quant_path = 'vicuna-7b-v1.5-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
# Load model
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Define data loading methods
def load_dolly():
    data = load_dataset('databricks/databricks-dolly-15k', split="train")

    # concatenate instruction, context and response into a single text field
    def concatenate_data(x):
        return {"text": x['instruction'] + '\n' + x['context'] + '\n' + x['response']}

    concatenated = data.map(concatenate_data)
    return [text for text in concatenated["text"]]

def load_wikitext():
    data = load_dataset('wikitext', 'wikitext-2-raw-v1', split="train")
    return [text for text in data["text"] if text.strip() != '' and len(text.split(' ')) > 20]
# Quantize
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_wikitext())
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
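# Note (a sketch, not part of the committed example): quantize() accepts any list of
# strings as calib_data, so the Dolly loader defined above can be swapped in for
# WikiText when the target domain is instruction-following, e.g.:
# model.quantize(tokenizer, quant_config=quant_config, calib_data=load_dolly())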
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
-model_path = 'lmsys/vicuna-7b-v1.5'
-quant_path = 'vicuna-7b-v1.5-awq'
+model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
+quant_path = 'mistral-instruct-v0.2-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
# Load model
# NOTE: pass safetensors=True to load safetensors
model = AutoAWQForCausalLM.from_pretrained(
    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
......
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer
quant_path = "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"
# Load model
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Convert prompt to tokens
prompt_template = """\
<|im_start|>system
{system}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""
system = "You are a helpful assistant that answers precisely."
prompt = "You're standing on the surface of the Earth. "\
"You walk one mile south, one mile west and one mile north. "\
"You end up exactly where you started. Where are you?"
tokens = tokenizer(
    prompt_template.format(system=system, prompt=prompt),
    return_tensors='pt'
).input_ids.to("cuda")
# Generate output
generation_output = model.generate(
    tokens,
    streamer=streamer,
    max_new_tokens=64
)
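# Alternative prompt construction (a sketch, not part of the committed example): a
# tokenizer that ships a chat template can build the same ChatML prompt with
# apply_chat_template instead of a hand-written template string:
messages = [
    {"role": "system", "content": system},
    {"role": "user", "content": prompt},
]
tokens = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to("cuda")  # move the ids to the device the model runs on
model.generate(tokens, streamer=streamer, max_new_tokens=64)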
......@@ -10,11 +10,10 @@ from peft import get_peft_model, LoraConfig, TaskType
def prepare_split(tokenizer):
    data = datasets.load_dataset("mhenrichsen/alpaca_2k_test", split="train")
-    prompt_template = "<s>[INST] {system} {prompt} [/INST] {output}</s>"
+    prompt_template = "<s>[INST] {prompt} [/INST] {output}</s>"

    def format_prompt(x):
        return prompt_template.format(
-            system="",
            prompt=x["instruction"],
            output=x["output"]
        )
......@@ -26,7 +25,7 @@ def prepare_split(tokenizer):
    return data

-model_path = "ybelkada/opt-125m-awq"
+model_path = "TheBloke/Mistral-7B-v0.1-AWQ"

# Load model
model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=False)
......@@ -56,7 +55,6 @@ training_arguments = TrainingArguments(
optim="adamw_torch",
num_train_epochs=1,
learning_rate=1e-4,
# fp16=True,
evaluation_strategy="no",
save_strategy="epoch",
save_steps=100,
......
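# For context around this hunk (a sketch with hypothetical LoRA settings, not the
# values used in the training example): the AWQ model loaded via from_quantized()
# is typically wrapped with a PEFT adapter before the TrainingArguments shown above
# are passed to a Trainer, using the peft imports from the first hunk.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                                   # hypothetical rank
    lora_alpha=16,                         # hypothetical scaling factor
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],   # hypothetical target projections
)
peft_model = get_peft_model(model.model, lora_config)  # assumes the HF module is exposed as model.model
peft_model.print_trainable_parameters()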
site_name: AutoAWQ
repo_name: casper-hansen/AutoAWQ
repo_url: https://github.com/casper-hansen/AutoAWQ
nav:
- index.md
- Examples: examples.md
- Reference:
  - reference/index.md
markdown_extensions:
  toc:
    permalink: true
  markdown.extensions.codehilite:
    guess_lang: false
  admonition: null
  codehilite: null
  extra: null
  pymdownx.superfences:
    custom_fences:
    - name: mermaid
      class: mermaid
      format: !!python/name:pymdownx.superfences.fence_code_format ''
  pymdownx.tabbed:
    alternate_style: true
  pymdownx.tilde: null
  attr_list: null
  md_in_html: null
plugins:
  search: null
  mkdocstrings:
    handlers:
      python:
        paths: [awq]
        options:
          extensions:
          - griffe_typingdoc
          show_root_heading: true
          show_if_no_docstring: true
          inherited_members: true
          members_order: source
          separate_signature: true
          unwrap_annotated: true
          filters:
          - '!^_'
          merge_init_into_class: true
          docstring_section_style: spacy
          signature_crossrefs: true
          show_symbol_type_heading: true
          show_symbol_type_toc: true
theme:
  name: material
  palette:
  - media: '(prefers-color-scheme: light)'
    scheme: default
    primary: teal
    accent: amber
    toggle:
      icon: material/lightbulb
      name: Switch to dark mode
  - media: '(prefers-color-scheme: dark)'
    scheme: slate
    primary: teal
    accent: amber
    toggle:
      icon: material/lightbulb-outline
      name: Switch to light mode
  features:
  - search.suggest
  - search.highlight
  - content.tabs.link
  - navigation.indexes
  - content.tooltips
  - navigation.path
  - content.code.annotate
  - content.code.copy
  - content.code.select
  - navigation.tabs
  icon:
    repo: fontawesome/brands/github-alt
......@@ -31,7 +31,7 @@ def get_kernels_whl_url(
return f"https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v{release_version}/autoawq_kernels-{release_version}+{gpu_system_version}-cp{python_version}-cp{python_version}-{platform}_{architecture}.whl"
AUTOAWQ_VERSION = "0.1.8"
AUTOAWQ_VERSION = "0.2.0"
PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
......@@ -90,6 +90,7 @@ requirements = [
"tokenizers>=0.12.1",
"accelerate",
"datasets",
"zstandard",
]
try:
......@@ -101,9 +102,9 @@ except importlib.metadata.PackageNotFoundError:
# kernels can be downloaded from pypi for cuda+121 only
# for everything else, we need to download the wheels from github
if not KERNELS_INSTALLED and (CUDA_VERSION or ROCM_VERSION):
-    if CUDA_VERSION.startswith("12"):
+    if CUDA_VERSION and CUDA_VERSION.startswith("12"):
        requirements.append("autoawq-kernels")
-    elif CUDA_VERSION.startswith("11") or ROCM_VERSION in ["561", "571"]:
+    elif CUDA_VERSION and CUDA_VERSION.startswith("11") or ROCM_VERSION in ["561", "571"]:
        gpu_system_version = (
            f"cu{CUDA_VERSION}" if CUDA_VERSION else f"rocm{ROCM_VERSION}"
        )
......@@ -130,6 +131,7 @@ setup(
    install_requires=requirements,
    extras_require={
        "eval": ["lm_eval>=0.4.0", "tabulate", "protobuf", "evaluate", "scipy"],
+        "dev": ["black", "mkdocstrings-python", "mkdocs-material", "griffe-typingdoc"]
    },
    **common_setup_kwargs,
)
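# Illustration (hypothetical values, not part of the diff): for a CUDA 11.8 wheel on
# CPython 3.10 / Linux x86_64, the f-string above resolves to
# https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.2.0/autoawq_kernels-0.2.0+cu118-cp310-cp310-linux_x86_64.whl
# The keyword names below are inferred from that f-string and may differ from the real
# signature of get_kernels_whl_url().
wheel_url = get_kernels_whl_url(
    release_version=AUTOAWQ_VERSION,
    gpu_system_version="cu118",
    python_version="310",
    platform="linux",
    architecture="x86_64",
)
requirements.append(f"autoawq-kernels@{wheel_url}")  # direct-reference requirement, as a sketch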