from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer
import torch
import logging
import json
import time


def gptq():
    # Specify paths and hyperparameters for quantization
    model_path = "../../models/qwen/Qwen1___5-7B-Chat"
    quant_path = "./Qwen1.5-7B-4bit-gptq-4"
    quantize_config = BaseQuantizeConfig(
        bits=4,                # quantization bit-width: 4 or 8
        group_size=128,        # weights are quantized in groups of 128 columns
        damp_percent=0.01,     # dampening factor for the Hessian used by GPTQ
        desc_act=True,         # setting this to False can significantly speed up inference, but perplexity may be slightly worse
        static_groups=False,
        sym=True,              # symmetric quantization
        true_sequential=True,  # quantize layers one at a time, in order
        model_name_or_path=None,
        model_file_base_name="model",
    )
    max_len = 8192  # maximum number of tokens per calibration example

    # Load the tokenizer and the full-precision model with AutoGPTQ
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config)

    # Calibration data:
    # https://huggingface.co/datasets/llm-wizard/alpaca-gpt4-data-zh
    file_path = '../alpaca_gpt4_data_zh.json'
    # file_path = 'oaast_rm_zh.json'
    messages = []
    with open(file_path, 'r', encoding='UTF-8') as fcc_file:
        fcc_data = json.load(fcc_file)
        for tmp_data in fcc_data:
            instruction = tmp_data['instruction']
            input_text = tmp_data['input']  # renamed from 'input' to avoid shadowing the builtin
            output = tmp_data['output']
            # In the alpaca data 'output' is a single string, while in the
            # oaast_rm data it is a list of candidate answers; normalize to a
            # list so we never iterate over a string character by character.
            outputs = output if isinstance(output, list) else [output]
            for op in outputs:
                msg = [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": instruction + input_text},
                    {"role": "assistant", "content": op},
                ]
                messages.append(msg)
    print('len(messages):', len(messages))
    messages = messages[:500]  # keep 500 conversations for calibration
    print('len(messages):', len(messages))

    # Render each conversation with the chat template and tokenize it
    data = []
    for msg in messages:
        text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
        # print(text + '\n')
        model_inputs = tokenizer([text])
        # Truncate on the token dimension; the original [:max_len] sliced the
        # batch dimension (always of size 1) and never actually truncated.
        input_ids = torch.tensor([model_inputs.input_ids[0][:max_len]], dtype=torch.int)
        data.append(dict(input_ids=input_ids, attention_mask=input_ids.ne(tokenizer.pad_token_id)))
    print('len(data):', len(data))

    t1 = time.time()
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    # Run GPTQ calibration; keeping examples off the GPU lowers peak memory
    model.quantize(data, cache_examples_on_gpu=False)
    model.save_quantized(quant_path, use_safetensors=True)
    tokenizer.save_pretrained(quant_path)
    t2 = time.time()
    print('time: {:.2f}s'.format(t2 - t1))


gptq()
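

# --- Optional smoke test (not part of the original script) ---
# A minimal sketch of loading the quantized checkpoint back for inference,
# assuming the quantization above completed and a CUDA device is available.
# The prompt text and max_new_tokens value below are purely illustrative.
def verify_quantized(quant_path="./Qwen1.5-7B-4bit-gptq-4"):
    tokenizer = AutoTokenizer.from_pretrained(quant_path)
    model = AutoGPTQForCausalLM.from_quantized(quant_path, device="cuda:0", use_safetensors=True)
    msg = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Give me a short introduction to large language models."},
    ]
    # Unlike calibration, add the generation prompt so the model answers
    text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    generated = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens and decode only the newly generated answer
    print(tokenizer.decode(generated[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))

# verify_quantized()  # uncomment to run once quantization has finished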