"sgl-kernel/vscode:/vscode.git/clone" did not exist on "f3817cb0b20595ac9abf77855cc9e0883d72c194"
Commit c009512a authored by Azure-Tang's avatar Azure-Tang
Browse files

Merge branch 'main' into hip

parents c1f13a69 4f22d726
from typing import List, Optional
from typing_extensions import Literal
from enum import Enum
from pydantic import BaseModel
from ktransformers.server.schemas.base import Object
from openai.types.completion_usage import CompletionUsage
from openai.types.chat.chat_completion_chunk import Choice
class Role(Enum):
system = 'system'
user = 'user'
......@@ -25,54 +30,31 @@ class ChatCompletionCreate(BaseModel):
messages: List[Message]
model : str
stream : bool = False
temperature: Optional[float] = None
top_p: Optional[float] = None
def get_tokenizer_messages(self):
return [m.to_tokenizer_message() for m in self.messages]
class FinishReason(Enum):
stop = 'stop'
length = 'length'
class Choice(BaseModel):
index: int
message: Message
logprobs: Optional[str] = None
    finish_reason: Optional[FinishReason] = None
class DeltaChoice(BaseModel):
index: int
delta: Message
logprobs: Optional[str] = None
    finish_reason: Optional[FinishReason] = None
class Usage(BaseModel):
completion_tokens:int
prompt_tokens:int
total_tokens:int
class ChatCompletionChunk(BaseModel):
id: str
choices: List[Choice]
created: int
model: str
object: Literal["chat.completion.chunk"]
service_tier: Optional[Literal["scale", "default"]] = None
system_fingerprint: Optional[str] = None
usage: Optional[CompletionUsage] = None
class ChatCompletionBase(Object):
created:int
    model: str = 'not implemented'
    system_fingerprint: str = 'not implemented'
usage: Optional[Usage] = None
class ChatCompletionObject(ChatCompletionBase):
choices:List[Choice] = []
def append_token(self,token:str):
if len(self.choices) == 0:
self.choices.append(Choice(index=0,message=Message(content='',role=Role.assistant)))
self.choices[0].message.content += token
class ChatCompletionChunk(ChatCompletionBase):
    choices: List[DeltaChoice] = []

    def set_token(self, token: str):
        self.choices = [
            DeltaChoice(index=0, delta=Message(content=token, role=Role.assistant))
        ]

    def to_stream_reply(self):
        # SSE framing: each event is "data: <json>" followed by a blank line
        return f"data: {self.model_dump_json()}\n\n"
class RawUsage(BaseModel):
tokenize_time: float
prefill_time: float
decode_time: float
prefill_count: int
decode_count: int
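# Illustrative sketch (not part of the original file): how a handler might
# stream tokens with ChatCompletionChunk above. The constructor arguments are
# assumptions -- the required fields come from ktransformers' Object base schema.
def example_stream(tokens):
    chunk = ChatCompletionChunk(id="chatcmpl-0", created=0)  # hypothetical args
    for tok in tokens:
        chunk.set_token(tok)
        yield chunk.to_stream_reply()  # -> 'data: {...}\n\n' SSE frames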
......@@ -9,6 +9,8 @@ class CompletionCreate(BaseModel):
model: str
prompt: str | List[str]
stream: bool = False
temperature: Optional[float] = None
top_p: Optional[float] = None
def get_tokenizer_messages(self):
        if isinstance(self.prompt, list):
......
results/
\ No newline at end of file
# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file
import argparse
import json
import os
import time
import requests
import tqdm
from evaluation import filter_answer
from prompts import instruct_prompt
import pandas as pd
from datasets import load_dataset
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
def generate_text(api_url,question , model_name, stream=False, auth_token=None):
headers = {
'accept': 'application/json',
'Content-Type': 'application/json',
        # attach the API key if provided
'Authorization' : 'Bearer ' + auth_token if auth_token else ''
}
question = instruct_prompt(question)
data = {
"messages": [{"content": question, "role": "user"}],
"model": model_name,
"stream": stream,
"temperature": 0.6,
"max_tokens": 10240,
}
print(f"content: {question}")
response = requests.post(api_url, headers=headers, json=data,verify=False)
if response.status_code == 200:
result = response.json()
results = result.get('choices', [{}])[0].get('message', {}).get('content', '')
return filter_answer(results)
else:
print(f"API Request failed with status code {response.status_code}")
return None
def load_data(file_path):
    """
    Load a Hugging Face dataset into a list of records.
    `file_path` is a dataset name or path accepted by `datasets.load_dataset`.
    """
    # load the dataset (Parquet files are also supported via load_dataset)
# dataset = load_dataset('parquet', data_files=file_path)
data = []
ds = load_dataset(file_path)
df = pd.DataFrame(ds['train'])
for _, row in df.iterrows():
data.append(row.to_dict())
return data
def get_score(pred, answer):
    """
    Exact-match scoring between the prediction and the answer, with a numeric
    fallback when both strings parse as floats.
    :param pred: The predicted string.
    :param answer: The reference answer string.
    :return: 1 if the prediction matches the answer, else 0.
    """
if pred == answer:
return 1
    # if we need to compare a string with a number, convert the string to a number
    try:
        pred = float(pred)
        answer = float(answer)
    except (ValueError, TypeError):
        pass
if pred == answer:
return 1
return 0
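# Quick illustration of get_score's numeric fallback (safe to delete):
# "42" and "42.0" differ as strings but compare equal after float conversion.
def _example_get_score():
    assert get_score("42", "42.0") == 1
    assert get_score("foo", "bar") == 0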
def run_eval_api(
api_url: str,
model_name: str,
out_path: str,
format_tabs: bool = False,
auth_token: str = None,
problem_file: str = None,
append: bool = False,
skip: int = 0
):
data = load_data(problem_file)
    pbar = tqdm.tqdm(total=len(data))
    pbar.update(skip)
    if not append:
        # truncate the output file once; results are appended one per line below
        open(out_path, "w").close()
    for i in range(skip, len(data)):
        data_item = data[i]
question = data_item['Problem']
# Start the timer for this evaluation
start_time = time.time()
try:
completion = generate_text(api_url, question, model_name, auth_token=auth_token)
if completion is None:
raise Exception(f"Failed to get prediction for {question}")
answer = data_item['Answer']
score = get_score(completion, answer)
elapsed_time = time.time() - start_time
result = {
"index": i,
"question_id": data_item["ID"],
"answer": answer,
"prediction": completion,
"score": score,
"time": elapsed_time
}
with open(out_path, "a" if append else "w") as f:
f.write(json.dumps(result) + "\n")
except Exception as e:
print(f"Failed to get prediction for {question}")
print(e)
continue
pbar.update(1)
def main(output_path, api_url, model_name, auth_token, format_tabs,problem_file, append,skip):
os.makedirs(os.path.dirname(output_path), exist_ok=True)
run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file,append,skip)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="API Generate Tester")
parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-R1", help="Model Name")
parser.add_argument("--out_path", type=str, default="results/api/eval_aime.jsonl", help="Output Path")
parser.add_argument("--auth_token", type=str, default=None, help="Auth Token")
parser.add_argument("--format_tabs", action="store_true", help="Format Tabs")
parser.add_argument("--problem_file", type=str, default="Maxwell-Jia/AIME_2024", help="Evalset File")
parser.add_argument("--no_append", action="store_false", help="Append to existing file")
parser.add_argument("--skip", type=int, default=0, help="Skip some tasks")
args = parser.parse_args()
# api_url = "https://api.siliconflow.cn/v1/chat/completions"
main(args.out_path, args.api_url, args.model_name, args.auth_token, args.format_tabs, args.problem_file, args.no_append, args.skip)
\ No newline at end of file
# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
def filter_answer(completion: str) -> str:
    # the answer is the last part of the completion; it's an int64 number
# get the last line
completion = completion.strip().split("\n")[-1]
# handle the $\\boxed{...}$ format
if "$\\boxed{" in completion:
return completion.split("}")[0].split("{")[-1]
return completion.split()[-1]
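# Illustration of filter_answer on two typical completions (safe to delete):
def _example_filter_answer():
    assert filter_answer("Step 1 ...\nThe answer is $\\boxed{204}$") == "204"
    assert filter_answer("Step 1 ...\nFinal answer: 204") == "204"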
def instruct_prompt(prompt: str) -> str:
return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nSolve the following math problem without any tests or explanation only one answer surrounede by '$\\boxed{{}}$'\n{prompt}\n\n### Response:"""
# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file
import argparse
import os
import requests
from human_eval.data import write_jsonl, read_problems
import tqdm
from evaluation import filter_code, fix_indents
from prompts import instruct_prompt
def generate_text(api_url,question , model_name, stream=False, auth_token=None):
headers = {
'accept': 'application/json',
'Content-Type': 'application/json',
        # attach the API key if provided
'Authorization' : 'Bearer ' + auth_token if auth_token else ''
}
question = instruct_prompt(question)
data = {
"messages": [{"content": question, "role": "user"}],
"model": model_name,
"stream": stream,
"temperature": 0.6
}
print(f"content: {question}")
response = requests.post(api_url, headers=headers, json=data,verify=False)
if response.status_code == 200:
result = response.json()
results = result.get('choices', [{}])[0].get('message', {}).get('content', '')
return [filter_code(fix_indents(results))]
else:
print(f"API Request failed with status code {response.status_code}")
return None
def run_eval_api(
api_url: str,
model_name: str,
out_path: str,
format_tabs: bool = False,
auth_token: str = None,
problem_file: str = None,
append: bool = False,
skip: int = 0
):
    if problem_file is None:
problems = read_problems()
else:
problems = read_problems(problem_file)
samples = []
    pbar = tqdm.tqdm(total=len(problems))
pbar.update(skip)
try:
for task_id in problems:
# skip some tasks
if skip > 0:
skip -= 1
continue
if format_tabs:
prompt = problems[task_id]["prompt"].replace(" ", "\t")
else:
prompt = problems[task_id]["prompt"]
completion = generate_text(api_url, prompt, model_name, auth_token=auth_token)
# samples.append({"task_id": task_id, "completion": completion})
for sample in completion:
result = dict(
task_id=task_id,
completion=sample,
)
samples += [result]
if append:
write_jsonl(out_path, [result],append=append)
pbar.update(1)
if not append:
write_jsonl(out_path, samples,append=append)
except Exception as e:
if not append:
write_jsonl(out_path, samples,append=append)
print(f"Error: {e}")
def main(output_path, api_url, model_name, auth_token, format_tabs,problem_file, append,skip):
os.makedirs(os.path.dirname(output_path), exist_ok=True)
run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file,append,skip)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="API Generate Tester")
#parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")
parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model Name")
parser.add_argument("--out_path", type=str, default="results/api/eval_b.jsonl", help="Output Path")
parser.add_argument("--auth_token", type=str, default=None, help="Auth Token")
parser.add_argument("--format_tabs", action="store_true", help="Format Tabs")
parser.add_argument("--problem_file", type=str, default=None, help="Evalset File")
parser.add_argument("--no_append", action="store_false", help="Append to existing file")
parser.add_argument("--skip", type=int, default=0, help="Skip first n problems")
args = parser.parse_args()
# api_url = "https://api.siliconflow.cn/v1/chat/completions"
main(args.out_path, args.api_url, args.model_name, args.auth_token, args.format_tabs, args.problem_file, args.no_append,args.skip)
\ No newline at end of file
# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
def filter_code(completion: str) -> str:
    # The model tends to over-generate, so we only keep the first function
completion = completion.lstrip("\n")
# we also remove ```python\n and ```
completion = completion.replace("```python\n", "").replace("```", "")
if 'if __name__ == "__main__":' in completion:
completion = completion.split('if __name__ == "__main__":')[0]
if "# Example usage" in completion:
completion = completion.split("# Example usage")[0]
return completion
def fix_indents(text: str) -> str:
return text.replace("\t", " ")
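# Illustration of the post-processing pipeline (safe to delete): markdown
# fences are stripped and trailing example scaffolding is dropped.
def _example_filter_code():
    raw = "```python\ndef add(a, b):\n\treturn a + b\n```\n# Example usage\nprint(add(1, 2))"
    assert filter_code(fix_indents(raw)).startswith("def add(a, b):")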
def instruct_prompt(prompt: str) -> str:
return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following Python code without any tests or explanation\n{prompt}\n\n### Response:"""
def standard_prompt(prompt: str) -> str:
return f"""Complete the following Python code without any tests or explanation\n{prompt}"""
def write_prompt(prompt: str) -> str:
return f"""Write a python program to complete the following code:\n{prompt}"""
def replit_glaive_prompt(prompt: str) -> str:
return f"""Below is an instruction that describes a task, paired with an input that provides further context.\n Write a response that appropriately completes the request.\n\n ### Instruction:\nWrite a program to perform the given task.\n\n Input:\n{prompt}\n\n### Response:"""
import argparse
import random
import time
import json
import requests
import pandas as pd
from datasets import load_dataset
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['https_proxy'] = ''
os.environ['http_proxy'] = ''
hint = 'There is a single choice question. Answer the question by replying A, B, C, D, E, F, G, H, I, J. No other answers are accepted. Just the letter.'
class DataEvaluator:
def __init__(self):
# self.template_prompt = template_prompt
self.data = []
    def load_data(self, file_path):
        """
        Load the TIGER-Lab/MMLU-Pro test split into a list of records.
        (`file_path` is currently unused; the dataset name is hard-coded below.)
        """
        # load the dataset and convert the test split to a DataFrame
        ds = load_dataset("TIGER-Lab/MMLU-Pro")
        df = pd.DataFrame(ds['test'])
        for _, row in df.iterrows():
            self.data.append(row.to_dict())
def get_prompt(self, record):
"""
Combine fields from a record with the template prompt to create a full prompt.
:param record: Dictionary containing fields to populate the template.
:return: A formatted prompt string.
"""
        # build the lettered option list (A, B, C, ...)
options_str = "\n".join([f"{chr(65+i)}. {opt}" for i, opt in enumerate(record['options'])])
prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
return prompt
def post_processing(self, text):
"""
Perform post-processing on the prediction string.
:param text: The raw prediction string.
:return: Processed prediction string.
"""
text = text.lstrip('\n').split('\n')[-1]
return text[-1:]
    def score(self, pred, answers):
        """
        Exact-match scoring between the prediction and the reference answer(s).
        :param pred: The predicted answer letter.
        :param answers: The reference answer letter(s).
        :return: 1 if the prediction matches any reference answer, else 0.
        """
for answer in answers:
if pred == answer:
return 1
return 0
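    # Illustration (safe to delete): post_processing keeps only the last
    # character of the last line, and score is an exact letter match.
    def _example(self):
        assert self.post_processing("Reasoning...\nThe answer is B") == "B"
        assert self.score("B", "B") == 1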
# Function to generate text using API
def generate_text(api_url, question, model_name, stream=False):
headers = {
'accept': 'application/json',
'Content-Type': 'application/json',
        # attach the API key here if required
'Authorization' : 'Bearer '
}
data = {
"messages": [{"content": question, "role": "user"}],
"model": model_name,
"stream": stream,
# "temperature": 0.0
}
print("POST data:", data)
response = requests.post(api_url, headers=headers, json=data)
if response.status_code == 200:
result = response.json()
return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
else:
print(f"API Request failed with status code {response.status_code}")
return None
# Main function to handle multiple evaluations
def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
start_total_time = time.time()
total_score = 0
results = []
    # fix the random seed for a reproducible shuffle
random.seed(42)
random.shuffle(data_evaluator.data)
    num_requests = min(concurrent_requests, len(data_evaluator.data))
    for i in range(num_requests):
        # iterate over the shuffled data, one item per request
data_item = data_evaluator.data[i]
question = data_evaluator.get_prompt(data_item)
# print(question)
# Start the timer for this evaluation
start_time = time.time()
try:
# Generate prediction using the API
prediction = generate_text(api_url, question, model_name)
if prediction is None:
raise Exception(f"Failed to get prediction for {question}")
answer = data_item['answer']
# Compute score
score = data_evaluator.score(data_evaluator.post_processing(prediction), answer)
# Calculate the time taken
elapsed_time = time.time() - start_time
# Collect the result data
result_data = {
"question_id": data_item['question_id'],
"answer": answer,
"prediction": data_evaluator.post_processing(prediction),
"score": score,
"time": elapsed_time
}
# Write results to result.json with each field on a new line
with open(result_file, 'a', encoding='utf-8') as f:
json.dump(result_data, f, ensure_ascii=False, indent=4)
f.write("\n") # Ensure each JSON object is on a new line
results.append(result_data)
# Aggregate scores
total_score += score
except Exception as e:
print(f"Error processing request {i}: {e}")
    # Calculate total time and throughput
    total_time = time.time() - start_total_time
    throughput = num_requests / total_time
    # Log the total time, throughput, and average score
    with open(log_file, 'a', encoding='utf-8') as log_f:
        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
        log_f.write(f"Average Scores: {total_score / num_requests}\n")
        log_f.write('-' * 40 + '\n')
print(f"Results saved to {result_file}")
print(f"Log saved to {log_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="API Generate Tester")
parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
parser.add_argument("--file", type=str, default="TIGER-Lab/MMLU-Pro", help="Path to the mmlu.jsonl file")
parser.add_argument("--result", type=str, default="./mmlu_result_pro.json", help="Path to save the result JSON file")
parser.add_argument("--log", type=str, default="./mmlu_result_pro.log", help="Path to save the log file")
parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
parser.add_argument("--api_url", type=str, default="http://localhost:15488/v1/chat/completions", help="API URL")
# parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
args = parser.parse_args()
    # Load the data from the provided file
data_evaluator = DataEvaluator()
data_evaluator.load_data(args.file)
# Run the main function with the specified number of concurrent evaluations
main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)
\ No newline at end of file
import argparse
import random
import time
import json
import requests
import pandas as pd
from datasets import load_dataset
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['https_proxy'] = ''
os.environ['http_proxy'] = ''
hint = 'There is a single choice question. Answer the question by replying A, B, C, D. No other answers are accepted. Just the letter.'
class DataEvaluator:
def __init__(self):
# self.template_prompt = template_prompt
self.data = []
    def load_data(self, file_path):
        """
        Load the MMLU test split into a list of records.
        `file_path` is a dataset name or path accepted by `datasets.load_dataset`.
        """
        # load the "all" config and convert the test split to a DataFrame
        ds = load_dataset(file_path, "all")
        df = pd.DataFrame(ds['test'])
for _, row in df.iterrows():
self.data.append(row.to_dict())
def get_prompt(self, record):
"""
Combine fields from a record with the template prompt to create a full prompt.
:param record: Dictionary containing fields to populate the template.
:return: A formatted prompt string.
"""
        # build the lettered option list (A, B, C, ...)
options_str = "\n".join([f"{chr(65 + i)}. {opt}" for i, opt in enumerate(record['choices'])])
prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '"
return prompt
def post_processing(self, text):
"""
Perform post-processing on the prediction string.
:param text: The raw prediction string.
:return: Processed prediction string.
"""
text = text.lstrip('\n').split('\n')[-1]
return text[-1:]
    def score(self, pred, answers):
        """
        Exact-match scoring between the prediction and the reference answer(s).
        :param pred: The predicted answer letter.
        :param answers: The reference answer letter(s).
        :return: 1 if the prediction matches any reference answer, else 0.
        """
for answer in answers:
if pred == answer:
return 1
return 0
# Function to generate text using API
def generate_text(api_url, question, model_name, stream=False):
headers = {
'accept': 'application/json',
'Content-Type': 'application/json',
        # attach the API key here if required
'Authorization' : 'Bearer '
}
data = {
"messages": [{"content": question, "role": "user"}],
"model": model_name,
"stream": stream,
# "temperature": 0.0
}
print("POST data:", data)
response = requests.post(api_url, headers=headers, json=data)
if response.status_code == 200:
result = response.json()
return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
else:
print(f"API Request failed with status code {response.status_code}")
return None
# Main function to handle multiple evaluations
def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name):
start_total_time = time.time()
total_score = 0
results = []
    # fix the random seed for a reproducible shuffle
random.seed(42)
random.shuffle(data_evaluator.data)
    num_requests = min(concurrent_requests, len(data_evaluator.data))
    for i in range(num_requests):
        # iterate over the shuffled data, one item per request
data_item = data_evaluator.data[i]
question = data_evaluator.get_prompt(data_item)
# print(question)
# Start the timer for this evaluation
start_time = time.time()
try:
# Generate prediction using the API
prediction = generate_text(api_url, question, model_name)
if prediction is None:
raise Exception(f"Failed to get prediction for {question}")
answer = chr(data_item['answer'] + 65)
# Compute score
score = data_evaluator.score(data_evaluator.post_processing(prediction), answer)
# Calculate the time taken
elapsed_time = time.time() - start_time
# Collect the result data
result_data = {
"question_id": i,
"answer": answer,
"prediction": data_evaluator.post_processing(prediction),
"score": score,
"time": elapsed_time
}
# Write results to result.json with each field on a new line
with open(result_file, 'a', encoding='utf-8') as f:
json.dump(result_data, f, ensure_ascii=False, indent=4)
f.write("\n") # Ensure each JSON object is on a new line
results.append(result_data)
# Aggregate scores
total_score += score
except Exception as e:
print(f"Error processing request {i}: {e}")
    # Calculate total time and throughput
    total_time = time.time() - start_total_time
    throughput = num_requests / total_time
    # Log the total time, throughput, and average score
    with open(log_file, 'a', encoding='utf-8') as log_f:
        log_f.write(f"Total Time: {total_time:.2f} seconds\n")
        log_f.write(f"Throughput: {throughput:.2f} requests per second\n")
        log_f.write(f"Average Scores: {total_score / num_requests}\n")
        log_f.write('-' * 40 + '\n')
print(f"Results saved to {result_file}")
print(f"Log saved to {log_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="API Generate Tester")
parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations")
parser.add_argument("--file", type=str, default="cais/mmlu", help="Path to the mmlu.jsonl file")
parser.add_argument("--result", type=str, default="./mmlu_result_silicon.json", help="Path to save the result JSON file")
parser.add_argument("--log", type=str, default="./mmlu_result_silicon.log", help="Path to save the log file")
parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path")
parser.add_argument("--api_url", type=str, default="http://localhost:10003/v1/chat/completions", help="API URL")
# parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL")
args = parser.parse_args()
    # Load the data from the provided file
data_evaluator = DataEvaluator()
data_evaluator.load_data(args.file)
# Run the main function with the specified number of concurrent evaluations
main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)
\ No newline at end of file
import torch
import torch.nn.functional as F
import pytest
from typing import Tuple, Optional, Literal
import time
# make the project root importable before importing ktransformers modules
import os
import sys
sys.path.insert(0, "/home/azure/ktransformers")
print(sys.path)
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from safetensors import safe_open
world_size = 1
rank = 0
block_size = 128
gemm_impl: Literal["bf16", "fp8"] = "bf16"
# Assuming `fp8_gemm`, `act_quant`, `weight_dequant` and other relevant functions are already defined
def test_fp8_gemm_vs_torch_matmul():
# Test case 1: Create random matrices of size (M, K) and (K, N)
M, K, N = 64, 128, 256 # Matrix dimensions
x = torch.randn(M, K, dtype=torch.bfloat16, device='cuda')
weight = torch.randn(N, K, dtype=torch.bfloat16, device='cuda')
# Apply act_quant to both matrices
x_quantized, scale_x = act_quant(x, block_size)
weight_quantized, scale_w = act_quant(weight, block_size)
    # make the tensors contiguous
x_quantized = x_quantized.contiguous()
weight_quantized = weight_quantized.contiguous()
scale_x = scale_x.contiguous()
scale_w = scale_w.contiguous()
# Perform fp8_gemm using the quantized tensors
result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight_quantized, scale_w)
# Perform torch.matmul using the original floating point tensors
result_torch_matmul = torch.matmul(x, weight.T)
print(f'result_torch_matmul: {result_torch_matmul.shape}')
print(f'result_fp8_gemm: {result_fp8_gemm.shape}')
print(f"result_fp8_gemm:\n {result_fp8_gemm}")
print(f"result_torch_matmul:\n {result_torch_matmul}")
def test_fp8_gemm_vs_torch_matmul_load():
file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
with safe_open(file_path, framework="pt", device=0) as f:
weight = f.get_tensor("model.layers.0.mlp.down_proj.weight")
scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")
# weight_dequant
weight_dequantized = weight_dequant(weight, scale)
print(f"weight_dequantized: {weight_dequantized.shape}")
N, K = weight_dequantized.shape
M = 64
x = torch.randn(2 ,M, K, dtype=torch.bfloat16, device='cuda')
x_quantized, scale_x = act_quant(x, block_size)
    # Test case 1: quantized x matmul with the still-quantized weight
result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
print(f"result_fp8_gemm:\n {result_fp8_gemm}")
print(f"dtype {result_fp8_gemm.dtype}")
# Perform torch.matmul using the original floating point tensors
result_torch_matmul = torch.matmul(x, weight_dequantized.to(torch.bfloat16).T)
print(f"result_torch_matmul:\n {result_torch_matmul}")
def test_fp8_gemm_tflops():
file_path = "/mnt/data/model/DeepSeek-V3/model-00001-of-000163.safetensors"
with safe_open(file_path, framework="pt", device=0) as f:
weight = f.get_tensor("model.layers.0.mlp.down_proj.weight")
scale = f.get_tensor("model.layers.0.mlp.down_proj.weight_scale_inv")
# weight_dequant
weight_dequantized = weight_dequant(weight, scale)
print(f"weight_dequantized: {weight_dequantized.shape}")
N, K = weight_dequantized.shape
M = 6400
x = torch.randn(2 ,M, K, dtype=torch.bfloat16, device='cuda')
# x_quantized, scale_x = act_quant(x, block_size)
    # Time num_iters iterations of act_quant + fp8_gemm
    num_iters = 10
    flops_per_gemm = 2 * x.shape[0] * M * N * K  # batched GEMM: 2*B*M*N*K FLOPs per call
    total_flops = num_iters * flops_per_gemm
    # warm up twice so compilation/caching does not skew the timing
    x_quantized, scale_x = act_quant(x, block_size)
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
    x_quantized, scale_x = act_quant(x, block_size)
    result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(num_iters):
x_quantized, scale_x = act_quant(x, block_size)
result_fp8_gemm = fp8_gemm(x_quantized, scale_x, weight, scale)
torch.cuda.synchronize()
t1 = time.time()
total_time = t1 - t0
tflops = total_flops / total_time / 1e12
print(f"total_time: {total_time}")
print(f"tflops: {tflops}")
if __name__ == "__main__":
test_fp8_gemm_vs_torch_matmul()
test_fp8_gemm_vs_torch_matmul_load()
    test_fp8_gemm_tflops()
\ No newline at end of file
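# Worked example of the throughput arithmetic above (shapes are assumptions:
# DeepSeek-V3's dense down_proj has N=7168, K=18432). x is (2, M, K), so one
# call does 2 * 2 * M * N * K FLOPs; ten calls with M=6400 in 0.5 s would give
#   10 * 2 * 2 * 6400 * 7168 * 18432 / 0.5 / 1e12 ~= 67.6 TFLOPS.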
......@@ -25,6 +25,9 @@ import os
from enum import IntEnum
import torch
import KTransformersOps
from .custom_loader import SafeTensorLoader
import ctypes
import math
class GGMLQuantizationType(IntEnum):
F32 = 0
......@@ -109,6 +112,7 @@ GGML_TYPES = {
"Q5_K": 13,
"Q6_K": 14,
"IQ4_XS": 23,
"BF16": 30,
}
GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
......@@ -116,6 +120,7 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
GGML_BLOCK_SIZES = {
"F32": 4,
"F16": 2,
"BF16": 2,
"Q4_0": 2 + 16,
"Q5_0": 2 + 4 + 16,
"Q8_0": 2 + 32,
......@@ -125,11 +130,13 @@ GGML_BLOCK_SIZES = {
"Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
"Q6_K": 256 // 2 + 256 // 4 + 256 // 16 + 2,
"IQ4_XS": 2 + 2 + 256 // 2 + 256 // 64,
"FP8": 1,
}
GGML_ELEMENTS_PER_BLOCK = {
"F32": 1,
"F16": 1,
"BF16": 1,
"Q4_0": 32,
"Q5_0": 32,
"Q8_0": 32,
......@@ -139,6 +146,7 @@ GGML_ELEMENTS_PER_BLOCK = {
"Q5_K": 256,
"Q6_K": 256,
"IQ4_XS": 256,
"FP8": 1,
}
DATA_TYPES = {
......@@ -155,6 +163,7 @@ DATA_TYPES = {
"uint64": 10,
"int64": 11,
"float64": 12,
"FP8": 13,
}
class GGUFLoader:
......@@ -162,10 +171,15 @@ class GGUFLoader:
gguf_path: str
tensor_file_map: dict # {tensor_name: tensor_file_path}
gguf_file_meta: dict
safetensor_loader: SafeTensorLoader
def __init__(self, gguf_path: str):
# Check dir exist
if not os.path.exists(gguf_path):
raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
if os.path.isfile(gguf_path):
gguf_path = os.path.dirname(gguf_path)
self.safetensor_loader = None
self.tensor_info = {}
self.gguf_path = gguf_path
......@@ -173,16 +187,26 @@ class GGUFLoader:
self.file_data_map = {}
self.gguf_file_meta = {}
self.tensor_device_map = {}
# I know this is ugly, but I don't want to change the original code too much
# TODO: merge gguf load and other loads.
safetensor_loader = SafeTensorLoader(gguf_path)
if safetensor_loader.tensor_file_map:
self.safetensor_loader = safetensor_loader
return
# Walk through all the .gguf files in the directory
found_gguf = False
for root, dirs, files in os.walk(gguf_path):
for file in files:
if file.endswith(".gguf"):
found_gguf = True
file_name = os.path.join(root, file)
with open(file_name, "rb") as f:
self.load_gguf(f)
if file_name not in self.file_data_map:
self.file_data_map[file_name] = np.memmap(file_name, mode = 'r')
if not found_gguf:
raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")
def load_gguf(self, f):
f.seek(0)
......@@ -207,7 +231,7 @@ class GGUFLoader:
shape = [read_value(f, DATA_TYPES["uint64"]) for _ in range(shape_len)]
ggml_type = read_value(f, DATA_TYPES["uint32"])
bad_offset = read_value(f, DATA_TYPES["uint64"])
n_elems = int(math.prod(shape))
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
n_bytes = n_elems * type_size // block_size
np_dims = tuple(reversed(shape))
......@@ -276,8 +300,49 @@ class GGUFLoader:
itemsize = int(np.empty([], dtype = item_type).itemsize)
return mmap_data[offset : offset + itemsize * item_count]
def load_gguf_tensor(self, name: str, device:str = "cpu")->torch.Tensor:
def get_undequanted_tensor_and_ggml_type(self, name):
t = self.tensor_info[name]
data = self.get_mmap_tensor(name)
ggml_type = t["ggml_type"]
data = torch.from_numpy(data)
return data, ggml_type
def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype())->torch.Tensor:
t = self.tensor_info[name]
if device.lower() == "cpu":
print(f"loading expert {expert_id} of {name} with CPU")
shape = t["shape"]
ggml_type = t["ggml_type"]
if ggml_type not in GGML_NAMES:
raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
ggml_name = GGML_NAMES[ggml_type]
# TODO: experts may fused in quant block, split it
assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may fused in quant block, please use CPU dequant"
blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]
block_size = GGML_BLOCK_SIZES[ggml_name]
offset = expert_id * block_size * blocks_per_experts
data = data[offset: offset + block_size * blocks_per_experts]
if "cuda" in device.lower():
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values.copy())
if ggml_name == "BF16":
values = values.view(torch.bfloat16)
values = values.view(shape[-2::-1])
return values
def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = None)->torch.Tensor:
t = self.tensor_info[name]
if device.lower() == "cpu":
print(f"loading {name} with CPU")
        if target_dtype is None:
target_dtype = torch.get_default_dtype()
shape = t["shape"]
ggml_type = t["ggml_type"]
......@@ -289,14 +354,38 @@ class GGUFLoader:
data = self.get_mmap_tensor(name)
if "cuda" in device.lower():
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
#values = GGML_DEQUANTIZE[ggml_name](data)
#print("load_gguf_tensor")
#values = torch.from_numpy(values).to(device = device)
block_size = GGML_BLOCK_SIZES[ggml_name]
elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
num_elements = int(np.prod(shape))
num_blocks = num_elements // elements_per_block
blocks_per_iter = 16384
if num_blocks > blocks_per_iter: # dequant large tensor
values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
for i in range( (num_blocks + blocks_per_iter - 1) // blocks_per_iter):
blocks_begin = i * blocks_per_iter
blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
if "cuda" in device.lower():
cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin*block_size : blocks_end*block_size], device, target_dtype)
else:
cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin*block_size : blocks_end*block_size])
cur_values = torch.from_numpy(cur_values.copy())
cur_values = cur_values.view(-1, elements_per_block)
if ggml_name == "BF16":
cur_values = cur_values.view(torch.bfloat16)
values[blocks_begin : blocks_end] = cur_values
else:
if "cuda" in device.lower():
values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
else:
values = GGML_DEQUANTIZE[ggml_name](data)
values = torch.from_numpy(values)
if ggml_name == "BF16":
values = values.view(torch.bfloat16)
values = values.view(shape[::-1])
if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
n_head = self.gguf_file_meta['llama.attention.head_count']
......@@ -352,6 +441,9 @@ def read_value(f, data_type):
elem_type, count = struct.unpack("<IQ", f.read(4 + 8))
return [read_value(f, elem_type) for _ in range(count)]
elif data_type == DATA_TYPES["FP8"]:
return struct.unpack("<B", f.read(1))[0]
else:
raise NotImplementedError(f"Data type {data_type} not implemented")
......@@ -392,14 +484,15 @@ def dequantize_q2_k(data):
return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)
def dequantize_q2_k_gpu(data, device:str ="cuda"):
def dequantize_q2_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
block_size = GGML_BLOCK_SIZES["Q2_K"]
ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q2_K"]
data = np.frombuffer(data, dtype=data.dtype)
device = torch.device(device)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
# the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
return KTransformersOps.dequantize_q2_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)
def dequantize_q3_k(data):
# C implementation
......@@ -443,14 +536,15 @@ def dequantize_q3_k(data):
(((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7])
], axis=1)
def dequantize_q3_k_gpu(data, device:str ="cuda"):
def dequantize_q3_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
block_size = GGML_BLOCK_SIZES["Q3_K"]
ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q3_K"]
data = np.frombuffer(data, dtype=data.dtype)
device = torch.device(device)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
# the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
return KTransformersOps.dequantize_q3_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)
def dequantize_q4_k(data):
# C implementation
......@@ -474,13 +568,15 @@ def dequantize_q4_k(data):
# Dequantize final weights using scales and offsets
return factors * qs2 - offsets
def dequantize_q4_k_gpu(data, device:str ="cuda"):
def dequantize_q4_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
block_size = GGML_BLOCK_SIZES["Q4_K"]
ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q4_K"]
data = np.frombuffer(data, dtype=data.dtype)
device = torch.device(device)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
# the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
return KTransformersOps.dequantize_q4_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)
def dequantize_q5_k(data):
# C implementation
......@@ -538,14 +634,15 @@ def dequantize_q5_k(data):
d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
], axis=1)
def dequantize_q5_k_gpu(data, device:str ="cuda"):
def dequantize_q5_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
block_size = GGML_BLOCK_SIZES["Q5_K"]
ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q5_K"]
data = np.frombuffer(data, dtype=data.dtype)
device = torch.device(device)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
# the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
return KTransformersOps.dequantize_q5_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)
def dequantize_q6_k(data):
# C implementation
......@@ -596,13 +693,14 @@ def dequantize_q6_k(data):
], axis=1)
# @torch.jit.script
def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
block_size = GGML_BLOCK_SIZES["Q6_K"]
ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q6_K"]
device = torch.device(device)
num_blocks = len(data) // block_size
data = np.frombuffer(data, dtype=data.dtype)
c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
return KTransformersOps.dequantize_q6_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)
kvalues_iq4nl = np.array([-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113], dtype=np.int8)
......@@ -636,13 +734,14 @@ def dequantize_iq4_xs(data):
return y.flatten()
def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda"):
def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
block_size = GGML_BLOCK_SIZES["IQ4_XS"]
ele_per_blk = GGML_ELEMENTS_PER_BLOCK["IQ4_XS"]
device = torch.device(device)
num_blocks = len(data) // block_size
data = np.frombuffer(data, dtype=data.dtype)
c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
return KTransformersOps.dequantize_iq4_xs(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)
def dequantize_q4_0(data):
# C implementation
......@@ -659,7 +758,7 @@ def dequantize_q4_0(data):
scales * ((qs >> 4).astype(np.int8) - 8),
], axis=1)
def dequantize_q4_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
raise NotImplementedError()
def dequantize_q5_0(data):
......@@ -683,7 +782,7 @@ def dequantize_q5_0(data):
scales * x1,
], axis=1)
def dequantize_q5_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
raise NotImplementedError()
def dequantize_q8_0(data):
......@@ -695,32 +794,41 @@ def dequantize_q8_0(data):
qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
return scales * qs
def dequantize_q8_0_gpu(data, device:str = "cuda"):
def dequantize_q8_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
num_blocks = len(data) // GGML_BLOCK_SIZES["Q8_0"]
block_size = GGML_BLOCK_SIZES["Q8_0"]
ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q8_0"]
device = torch.device(device)
data = np.frombuffer(data, dtype=data.dtype)
c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
return KTransformersOps.dequantize_q8_0(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)
def dequantize_f32(data):
return np.frombuffer(data, dtype=np.float32)
def dequantize_f32_gpu(data, device, target_dtype = torch.get_default_dtype()):
data = np.frombuffer(data, dtype=np.float32)
res = torch.from_numpy(data.copy())
res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
res_gpu.copy_(res)
return res_gpu
def dequantize_f16(data):
return np.frombuffer(data, dtype=np.float16)
def dequantize_f16_gpu(data, device, target_dtype = torch.get_default_dtype()):
data = np.frombuffer(data, dtype=np.float16)
res = torch.from_numpy(data.copy())
res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
res_gpu.copy_(res)
return res_gpu
def dequantize_bf16_gpu(data, device, target_dtype = torch.get_default_dtype()):
data = np.frombuffer(data, dtype=np.float16)
res = torch.from_numpy(data.copy())
res_gpu = torch.empty_like(res, device=device)
res_gpu.copy_(res)
return res_gpu
......@@ -728,6 +836,7 @@ def dequantize_f16_gpu(data, device):
GGML_DEQUANTIZE = {
"F32": dequantize_f32,
"F16": dequantize_f16,
"BF16": dequantize_f16,
"Q4_0": dequantize_q4_0,
"Q5_0": dequantize_q5_0,
"Q8_0": dequantize_q8_0,
......@@ -742,6 +851,7 @@ GGML_DEQUANTIZE = {
GGML_DEQUANTIZE_GPU = {
"F32": dequantize_f32_gpu,
"F16": dequantize_f16_gpu,
"BF16": dequantize_bf16_gpu,
"Q4_0": dequantize_q4_0_gpu,
"Q5_0": dequantize_q5_0_gpu,
"Q8_0": dequantize_q8_0_gpu,
......
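# Illustrative sketch (not part of the diff): how callers dispatch through the
# tables above for one chunk of raw block data, mirroring load_gguf_tensor.
def example_dequant(ggml_name, raw_bytes, device="cuda", target_dtype=torch.float16):
    if "cuda" in device:
        # the GPU kernels return a torch.Tensor already on `device`
        return GGML_DEQUANTIZE_GPU[ggml_name](raw_bytes, device, target_dtype)
    # the CPU paths return numpy; copy before wrapping to silence the
    # non-writable-array warning noted in the TODOs above
    return torch.from_numpy(GGML_DEQUANTIZE[ggml_name](raw_bytes).copy())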
import struct
import warnings
import numpy as np
import re
import numpy.typing as npt
from typing import Sequence
import os
from enum import IntEnum
import torch
import KTransformersOps
from safetensors import safe_open
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from safetensors.torch import save_file
class SafeTensorLoader:
tensor_file_map = {}
tensor_type_map = {}
file_handle_map = {}
def __init__(self, file_path: str):
self.__load_tensor_file_map(file_path)
def __load_tensor_file_map(self, file_path: str):
        # normalize the input path: accept either a file or its containing folder
if not os.path.exists(file_path):
raise FileNotFoundError(f"Path not found: {file_path}")
if os.path.isfile(file_path):
folder_path = os.path.dirname(file_path)
else:
folder_path = file_path
found_safetensor = False
for root, _, files in os.walk(folder_path):
files = sorted(files)
for file in files:
if file.endswith(".safetensors"):
found_safetensor = True
file_path = os.path.join(root, file)
if file not in self.file_handle_map:
try:
handle = safe_open(file_path, framework="pt")
self.file_handle_map[file] = handle
except Exception as e:
print(f"Error opening Safetensor file {file_path}: {e}")
continue
f = self.file_handle_map.get(file)
if f is None:
continue
try:
for key in f.keys():
self.tensor_file_map[key] = file
except Exception as e:
print(f"Error reading Safetensor file {file_path}: {e}")
# if not found_safetensor:
# raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
def load_tensor(self, key: str, device: str="cpu"):
if key not in self.tensor_file_map:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
f = self.file_handle_map.get(file)
if f is None:
raise FileNotFoundError(f"File {file} not found in Safetensor files")
tensor = f.get_tensor(key)
return tensor.to(device)
def close_all_handles(self):
for handle in self.file_handle_map.values():
handle.close()
self.file_handle_map.clear()
def load_dequantized_tensor(self, key:str, device: str="cpu"):
if key not in self.tensor_file_map:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
f = self.file_handle_map.get(file)
if f is None:
raise FileNotFoundError(f"File {file} not found in Safetensor files")
tensor = f.get_tensor(key).to(device)
if key.endswith(".weight"):
if key[:-7] + ".weight_scale_inv" in self.tensor_file_map:
weight_scale_inv = f.get_tensor(key[:-7] + ".weight_scale_inv").to(device)
tensor = weight_dequant(tensor, weight_scale_inv)
return tensor.to(device)
\ No newline at end of file
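# Illustrative usage sketch (path and key are assumptions):
#   loader = SafeTensorLoader("/path/to/DeepSeek-V3")
#   w = loader.load_dequantized_tensor("model.layers.0.mlp.down_proj.weight", device="cuda")
#   loader.close_all_handles()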
......@@ -17,6 +17,21 @@ from ktransformers.operators import base_operator
from ktransformers.models.custom_cache import StaticCache
from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
from ktransformers.util.textstream import TextStreamer
from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton
warm_uped = False
def get_compute_capability(device: torch.device = None):
    if torch.cuda.is_available():
        if device is None:
            # return the minimum compute capability major version across all GPUs
            num_gpus = torch.cuda.device_count()
            min_compute_capability_major = 100
            for gpu_id in range(num_gpus):
                gpu_props = torch.cuda.get_device_properties(gpu_id)
                min_compute_capability_major = min(min_compute_capability_major, gpu_props.major)
            return min_compute_capability_major
        else:
            return torch.cuda.get_device_properties(device).major
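# Example (illustrative): on a box with one SM80 GPU and one SM75 GPU,
# get_compute_capability() returns 7, the minimum major version across devices.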
def set_module(model, submodule_key, module):
tokens = submodule_key.split('.')
......@@ -63,12 +78,22 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
for name, param in local_state.items():
key = prefix + name
translated_key = translate_name_to_gguf(key)
# TODO: Merge all loader.
# I know this is ugly but lets do it for now.
if gguf_loader.safetensor_loader is not None:
load_dequantized_tensor = gguf_loader.safetensor_loader.load_dequantized_tensor
tensor_file_map = gguf_loader.safetensor_loader.tensor_file_map
else:
load_dequantized_tensor = gguf_loader.load_gguf_tensor
tensor_file_map = gguf_loader.tensor_file_map
if translated_key in tensor_file_map:
target_dtype = torch.get_default_dtype()
device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
print(f"loading {translated_key} to {device}")
# device = "cpu" if "embd" in translated_key else "cuda"
torch.cuda.empty_cache()
weights = load_dequantized_tensor(translated_key, device=device).to(dtype=target_dtype)
set_param(module, name, weights)
del weights
else:
......@@ -76,7 +101,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
raise Exception(f"can't find {translated_key} in GGUF file!")
def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
# print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
#print(f"recursively loading weights {prefix}")
if not isinstance(module, base_operator.BaseInjectedModule):
load_cur_state_dict(module, gguf_loader, prefix)
for name, child in module._modules.items():
......@@ -85,7 +110,8 @@ def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
module.load()
def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True,
mode = 'normal', force_think: bool = False, chunk_prefill_size = 16384, use_flashinfer_mla = False,
num_heads = None, head_dim_ckv = None, head_dim_kpe = None, q_head_dim = None):
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch._dynamo.config.suppress_errors = True
......@@ -98,7 +124,9 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
tokens = []
def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True):
if cuda_graph_runner is None:
use_cuda_graph = False
if use_cuda_graph:
logits = cuda_graph_runner(cur_token, position_ids, cache_position)
else:
......@@ -124,8 +152,25 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
next_token = torch.argmax(next_token_scores, dim=-1)
return next_token
# TODO: use CUDA Graph for chunk prefill, may get small improvement
def chunk_prefill(inputs, cache_position, past_key_values):
if mode == "long_context":
inputs_embeds = model.model.embed_tokens(inputs.to("cpu"))
else:
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
if use_flashinfer_mla:
MLAWrapperSingleton.update_buffer(past_key_values.max_pages)
MLAWrapperSingleton.need_plan_all()
logits = model(
inputs_embeds = inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
)[0][:,-1,:].unsqueeze(0).clone().to(torch_device)
return logits
torch.cuda.set_device(torch_device)
with torch.no_grad():
stream = TextStreamer(tokenizer)
if mode != 'long_context':
past_key_values = StaticCache(
......@@ -133,26 +178,11 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
)
else:
past_key_values = None
generation_config, model_kwargs = model._prepare_generation_config(
None, do_sample=True
# change this to modify generate config
#top_k=5, top_p=0.85, temperature=0.1
)
try: # transformers==4.43
logits_warper = (
......@@ -162,41 +192,66 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud
logits_warper = (
model._get_logits_warper(generation_config)
)
cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.int32)
generated_ids = torch.zeros(
batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device
)
generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
start_time = time.time()
chunk_start = 0
while chunk_start < seq_length:
chunk_end = min(chunk_start + chunk_prefill_size, seq_length)
if past_key_values != None:
past_key_values.cur_idx=cache_position[chunk_start:chunk_end]
logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values)
chunk_start += chunk_prefill_size
next_token_scores = logits_warper(inputs, logits[:, -1, :])
if generation_config.do_sample:
probs = nn.functional.softmax(next_token_scores, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
else:
next_token = torch.argmax(next_token_scores, dim=-1)
first_token_time = time.time() - start_time
if use_flashinfer_mla:
MLAWrapperSingleton.reset_buffer()
prefill_count = seq_length
prefill_time = first_token_time
if force_think:
print("<think>\n")
print("<think>")
print(stream.put(next_token.item()), end="", flush=True)
generated_ids[:, seq_length] = next_token
tokens.append(int(next_token))
inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32)
position_ids = cache_position.unsqueeze(0)
seq_length += 1
cuda_graph_runner = None
start_time = time.time()
for i in range(1, max_new_tokens):
if use_flashinfer_mla:
MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,
num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size,
model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16)
global warm_uped
if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ):
warm_uped = True
cuda_graph_runner = CUDAGraphRunner()
cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True)
next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph).to(torch_device)
inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
generated_ids[:, cache_position] = next_token.int()
tokens.append(int(next_token))
seq_length += 1
if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>':
print(stream.end(), end="", flush=True)
break
else:
......
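# Illustrative sketch of the chunked-prefill split above: a 40,000-token prompt
# with chunk_prefill_size=16384 is prefilled as [0, 16384), [16384, 32768),
# [32768, 40000), one forward pass per chunk filling the KV cache in place.
def example_chunk_bounds(seq_length, chunk_prefill_size=16384):
    bounds, start = [], 0
    while start < seq_length:
        bounds.append((start, min(start + chunk_prefill_size, seq_length)))
        start += chunk_prefill_size
    return bounds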
# This script merges the FP8 safetensor weights with the GGUF quantized tensors.
import os
# insert the path of the project
import sys
sys.path.insert(0, "/home/azure/ktransformers")
import argparse
import torch
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
from safetensors import safe_open
from safetensors.torch import save_file
import re
from collections import defaultdict
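# Typical invocation (the script name here is illustrative; the default paths
# below come from the argparse definitions in main()):
#   python merge_safetensor_gguf.py \
#       --safetensor_path /mnt/data/model/DeepSeek-V3 \
#       --gguf_path /mnt/data/model/DeepseekV3-q4km-gguf \
#       --output_path /mnt/data/model/ktrans-safetensors/DeepSeek-V3-q4km-fp8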
def read_safetensor_keys_from_folder(folder_path)->dict:
"""
:param folder_path: path to a folder containing .safetensors shards (a file path is also accepted; its parent folder is used)
:return: mapping from tensor key to the safetensor file that stores it
"""
# check that the folder path exists
if not os.path.exists(folder_path):
raise FileNotFoundError(f"Safetensor dir not found: {folder_path}")
if os.path.isfile(folder_path):
folder_path = os.path.dirname(folder_path)
key_to_file_map = {}
found_safetensor = False
for root, dirs, files in os.walk(folder_path):
# sort files
files = sorted(files)
for file in files:
if file.endswith(".safetensors"):
found_safetensor = True
file_path = os.path.join(root, file)
try:
with safe_open(file_path, framework="pt") as f:
for key in f.keys():
if "model.layers.61" in key:
# skip the MTP (multi-token prediction) layer
continue
# try:
# if int(key.split('.')[2]) > 4:
# continue
# except:
# pass
key_to_file_map[key] = file_path
except Exception as e:
print(f"Error reading Safetensor file {file_path}: {e}")
if not found_safetensor:
raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
return key_to_file_map
tensor_from_gguf = []  # TODO: add keys whose tensors should be taken from the GGUF rather than the safetensors
def translate_name(name:str)->str:
"""
:param name: tensor name in safetensor convention
:return: the corresponding GGUF tensor name
"""
name = translate_name_to_gguf(name)
name = name.replace(".up_proj.", ".ffn_up_exps.")
name = name.replace(".down_proj.", ".ffn_down_exps.")
name = name.replace(".gate_proj.", ".ffn_gate_exps.")
name = name.replace(".ffn_gate_inp.e_score_correction_bias", ".exp_probs_b.bias")
return name
def combine_tensor_sources(safetensor_path:str, gguf_path:str):
gguf_loader = GGUFLoader(gguf_path)
gguf_tensor_file_map = gguf_loader.tensor_file_map
safetensor_tensor_file_map = read_safetensor_keys_from_folder(safetensor_path)
# build a map for the key to the tensor
# according to the key, we can get the tensor from the file
target_tensor_map = {}
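# Routing rule: MoE expert weights come from the quantized GGUF, everything
# else (attention, norms, embeddings, ...) from the fp8 safetensors, unless a
# key is explicitly listed in tensor_from_gguf above.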
for key in safetensor_tensor_file_map.keys():
# for all experts, we use the gguf tensor
if ".mlp.experts." in key:
if '.weight_scale_inv' in key:
continue
key = '.'.join(key.split('.')[:5]+key.split('.')[-2:])
translated_key = translate_name(key)
target_tensor_map[key] = gguf_tensor_file_map[translated_key]
continue
if any(target_key in key for target_key in tensor_from_gguf):
target_tensor_map[key] = gguf_tensor_file_map[translate_name(key)]
else:
target_tensor_map[key] = safetensor_tensor_file_map[key]
return target_tensor_map, gguf_loader
def write_combined_tensor(target_tensor_map: dict, output_path: str, gguf_loader: GGUFLoader):
# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)
# Cache for safetensor file handles and GGUF loaders
safetensors_cache = {}
gguf_cache = {}
# Group tensors by layer
layer_groups = defaultdict(list)
non_layer_keys = []
layer_pattern = re.compile(r'\.layers\.(\d+)\.')
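# e.g. "model.layers.12.self_attn.q_proj.weight" falls into layer group 12;
# keys without a ".layers.N." component (embeddings, lm_head, final norm, ...)
# are collected in non_layer_keys and written to the first shard.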
for key in target_tensor_map:
match = layer_pattern.search(key)
if match:
layer_num = int(match.group(1))
layer_groups[layer_num].append(key)
else:
non_layer_keys.append(key)
# Calculate total shards
total_shards = len(layer_groups) + (1 if non_layer_keys else 0) - 1
if total_shards == 0:
raise ValueError("No tensors to save")
shard_idx = 0
# Save non-layer tensors to the first shard if they exist
if non_layer_keys:
tensors = {}
for key in non_layer_keys:
file_path = target_tensor_map[key]
tensor = None
ggml_type = None
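# Safetensor-backed keys are stored as ready-to-use tensors; GGUF-backed keys
# keep their raw quantized payload plus a companion "<name>.ggml_type" scalar
# (written below) so a loader can dequantize them later.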
if file_path.endswith('.safetensors'):
if file_path not in safetensors_cache:
safetensors_cache[file_path] = safe_open(file_path, framework='pt')
f = safetensors_cache[file_path]
tensor = f.get_tensor(key)
elif file_path.endswith('.gguf'):
gguf_name = translate_name(key)
tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name)
else:
raise ValueError(f"Unsupported file format: {file_path}")
tensors[translate_name(key)] = tensor
if ggml_type:
ggml_type = torch.tensor(ggml_type)
ggml_key = gguf_name[:-7] + ".ggml_type" if gguf_name.endswith(".weight") else gguf_name + ".ggml_type"
tensors[ggml_key] = ggml_type
output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
print(f"Saving non-layer tensors to {output_file}")
save_file(tensors, output_file)
print(tensors.keys())
shard_idx += 1
# Save each layer's tensors to subsequent shards
for layer_num in sorted(layer_groups.keys()):
layer_keys = layer_groups[layer_num]
tensors = {}
for key in layer_keys:
file_path = target_tensor_map[key]
tensor = None
ggml_type = None
if file_path.endswith('.safetensors'):
if file_path not in safetensors_cache:
safetensors_cache[file_path] = safe_open(file_path, framework='pt')
f = safetensors_cache[file_path]
tensor = f.get_tensor(key)
tensor_info = tensor.shape
elif file_path.endswith('.gguf'):
gguf_name = translate_name(key)
tensor, ggml_type = gguf_loader.get_undequanted_tensor_and_ggml_type(gguf_name)
# tensor_info = gguf_loader.tensor_info[gguf_name]
# ggml_type = gguf_loader.tensor_info[gguf_name]['ggml_type']
else:
raise ValueError(f"Unsupported file format: {file_path}")
tensors[translate_name(key)] = tensor
if ggml_type:
ggml_type = torch.tensor(ggml_type)
ggml_key = gguf_name[:-7] + ".ggml_type" if gguf_name.endswith(".weight") else gguf_name + ".ggml_type"
tensors[ggml_key] = ggml_type
output_file = os.path.join(output_path, f"model-{shard_idx:05}-of-{total_shards:05}.safetensors")
print(f"Saving layer {layer_num} to {output_file}")
# print(tensors.keys())
save_file(tensors, output_file)
shard_idx += 1
return
def main():
# create the command-line argument parser
parser = argparse.ArgumentParser(description="Read parameters from Safetensor and GGUF files")
parser.add_argument("--safetensor_path", type=str, help="Path to the Safetensor file", default="/mnt/data/model/DeepSeek-V3")
parser.add_argument("--gguf_path", type=str, help="Path to the GGUF file", default="/mnt/data/model/DeepseekV3-q4km-gguf")
parser.add_argument("--output_path", type=str, help="Path to the output file", default="/mnt/data/model/ktrans-safetensors/DeepSeek-V3-q4km-fp8")
# parse the command-line arguments
args = parser.parse_args()
# print all the arguments
print("All the arguments:")
print(args)
safetensor_path = args.safetensor_path
gguf_path = args.gguf_path
output_path = args.output_path
target_tensor_map, gguf_loader = combine_tensor_sources(safetensor_path, gguf_path)
write_combined_tensor(target_tensor_map, output_path, gguf_loader)
return
if __name__ == "__main__":
main()
......@@ -4,4 +4,6 @@ numpy
torch>=2.3.0
packaging
cpufeature
protobuf
tiktoken
blobfile
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenxl
Date : 2024-07-27 16:15:27
Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2024-08-14 16:36:19
Adapted from:
https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
Copyright (c) 2023, Tri Dao.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os
......@@ -29,7 +29,12 @@ import torch.version
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
from setuptools import setup, Extension
from cpufeature.extension import CPUFeature
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
try:
from torch_musa.utils.simple_porting import SimplePorting
from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
except ImportError:
MUSA_HOME = None
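# Backend detection relies on exactly one of CUDA_HOME / MUSA_HOME / ROCM_HOME
# being available: CUDA_HOME and ROCM_HOME come from torch.utils.cpp_extension,
# while MUSA_HOME falls back to None when torch_musa is not installed.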
class CpuInstructInfo:
CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
......@@ -40,7 +45,7 @@ class CpuInstructInfo:
CMAKE_FANCY = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON"
CMAKE_AVX512 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON"
CMAKE_AVX2 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON"
class VersionInfo:
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
PACKAGE_NAME = "ktransformers"
......@@ -49,6 +54,80 @@ class VersionInfo:
)
FORCE_BUILD = os.getenv("KTRANSFORMERS_FORCE_BUILD", "FALSE") == "TRUE"
def get_musa_bare_metal_version(self, musa_dir):
raw_output = subprocess.run(
[musa_dir + "/bin/mcc", "-v"], check=True,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode("utf-8")
output = raw_output.split()
release_idx = output.index("version") + 1
bare_metal_version = parse(output[release_idx].split(",")[0])
musa_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
return musa_version
def get_rocm_bare_metal_version(self, rocm_dir):
"""
Get the ROCm version from the ROCm installation directory.
Args:
rocm_dir: Path to the ROCm installation directory
Returns:
A string representation of the ROCm version (e.g., "63" for ROCm 6.3)
"""
try:
# Try rocminfo --version to get the version info
raw_output = subprocess.check_output(
[rocm_dir + "/bin/rocminfo", "--version"],
universal_newlines=True,
stderr=subprocess.STDOUT)
# Extract version number from output
match = re.search(r'(\d+\.\d+)', raw_output)
if match:
version_str = match.group(1)
version = parse(version_str)
rocm_version = f"{version.major}{version.minor}"
return rocm_version
except (subprocess.CalledProcessError, FileNotFoundError):
# If rocminfo --version fails, try alternative methods
pass
try:
# Try reading version from release file
with open(os.path.join(rocm_dir, "share/doc/hip/version.txt"), "r") as f:
version_str = f.read().strip()
version = parse(version_str)
rocm_version = f"{version.major}{version.minor}"
return rocm_version
except (FileNotFoundError, IOError):
pass
# If all else fails, try to extract from directory name
dir_name = os.path.basename(os.path.normpath(rocm_dir))
match = re.search(r'rocm-(\d+\.\d+)', dir_name)
if match:
version_str = match.group(1)
version = parse(version_str)
rocm_version = f"{version.major}{version.minor}"
return rocm_version
# Fallback to extracting from hipcc version
try:
raw_output = subprocess.check_output(
[rocm_dir + "/bin/hipcc", "--version"],
universal_newlines=True,
stderr=subprocess.STDOUT)
match = re.search(r'HIP version: (\d+\.\d+)', raw_output)
if match:
version_str = match.group(1)
version = parse(version_str)
rocm_version = f"{version.major}{version.minor}"
return rocm_version
except (subprocess.CalledProcessError, FileNotFoundError):
pass
# If we still can't determine the version, raise an error
raise ValueError(f"Could not determine ROCm version from directory: {rocm_dir}")
def get_cuda_bare_metal_version(self, cuda_dir):
raw_output = subprocess.check_output(
[cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
......@@ -58,7 +137,7 @@ class VersionInfo:
cuda_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
return cuda_version
def get_cuda_version_of_torch(self):
torch_cuda_version = parse(torch.version.cuda)
cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
return cuda_version
......@@ -117,7 +196,7 @@ class VersionInfo:
torch_version_raw = parse(torch.__version__)
torch_version = f"{torch_version_raw.major}{torch_version_raw.minor}"
return torch_version
def get_flash_version(self):
version_file = os.path.join(
Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
......@@ -128,12 +207,23 @@ class VersionInfo:
return flash_version
def get_package_version(self, full_version=False):
flash_version = str(self.get_flash_version())
torch_version = self.get_torch_version()
cpu_instruct = self.get_cpu_instruct()
backend_version = ""
if CUDA_HOME is not None:
backend_version = f""
elif MUSA_HOME is not None:
backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
elif ROCM_HOME is not None:
backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}"
else:
raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set.")
package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
if full_version:
return package_version
if not VersionInfo.FORCE_BUILD:
return flash_version
return package_version
......@@ -218,11 +308,23 @@ class CMakeBuild(BuildExtension):
f"-DPYTHON_EXECUTABLE={sys.executable}",
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
]
if CUDA_HOME is not None:
cmake_args += ["-DKTRANSFORMERS_USE_CUDA=ON"]
elif MUSA_HOME is not None:
cmake_args += ["-DKTRANSFORMERS_USE_MUSA=ON"]
elif ROCM_HOME is not None:
cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"]
else:
raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
# log cmake_args
print("CMake args:", cmake_args)
build_args = []
if "CMAKE_ARGS" in os.environ:
cmake_args += [
item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
cpu_args = CpuInstructInfo.CMAKE_FANCY
elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
......@@ -231,7 +333,7 @@ class CMakeBuild(BuildExtension):
cpu_args = CpuInstructInfo.CMAKE_AVX2
else:
cpu_args = CpuInstructInfo.CMAKE_NATIVE
cmake_args += [
item for item in cpu_args.split(" ") if item
]
......@@ -258,7 +360,7 @@ class CMakeBuild(BuildExtension):
# CMake allows an arch-in-generator style for backward compatibility
contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
if not single_config and not contains_arch and cmake_generator:
cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]
# Multi-config generators have a different way to specify configs
......@@ -276,8 +378,13 @@ class CMakeBuild(BuildExtension):
"-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
cpu_count = os.cpu_count()
if cpu_count is None:
cpu_count = 1
if hasattr(self, "parallel") and self.parallel:
build_args += [f"-j{self.parallel}"]
build_args += [f"--parallel={self.parallel}"]
else:
build_args += [f"--parallel={cpu_count}"]
print("CMake args:", cmake_args)
build_temp = Path(ext.sourcedir) / "build"
if not build_temp.exists():
......@@ -288,28 +395,57 @@ class CMakeBuild(BuildExtension):
print("Standard output:", result.stdout)
print("Standard error:", result.stderr)
subprocess.run(
["cmake", "--build", ".", *build_args], cwd=build_temp, check=True
["cmake", "--build", ".", "--verbose", *build_args], cwd=build_temp, check=True
)
if CUDA_HOME is not None or ROCM_HOME is not None:
ops_module = CUDAExtension('KTransformersOps', [
'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
'ktransformers/ktransformers_ext/cuda/binding.cpp',
'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
],
extra_compile_args={
'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
'nvcc': [
'-O3',
# '--use_fast_math',
'-Xcompiler', '-fPIC',
'-DKTRANSFORMERS_USE_CUDA',
]
}
)
elif MUSA_HOME is not None:
SimplePorting(cuda_dir_path="ktransformers/ktransformers_ext/cuda", mapping_rule={
# Common rules
"at::cuda": "at::musa",
"#include <ATen/cuda/CUDAContext.h>": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"",
"#include <c10/cuda/CUDAGuard.h>": "#include \"torch_musa/csrc/core/MUSAGuard.h\"",
"nv_bfloat16": "mt_bfloat16",
}).run()
ops_module = MUSAExtension('KTransformersOps', [
'ktransformers/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu',
'ktransformers/ktransformers_ext/cuda_musa/binding.cpp',
# TODO: Add Marlin support for MUSA.
# 'ktransformers/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu'
],
extra_compile_args={
'cxx': ['force_mcc'],
'mcc': [
'-O3',
'-DKTRANSFORMERS_USE_MUSA',
'-DTHRUST_IGNORE_CUB_VERSION_CHECK',
]
}
)
else:
raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
setup(
name=VersionInfo.PACKAGE_NAME,
version=VersionInfo().get_package_version(),
cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
ext_modules=[
CMakeExtension("cpuinfer_ext"),
ops_module,
]
)
......@@ -69,6 +69,10 @@
#endif
constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast<ggml_type>(98);
constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast<ggml_type>(99);
namespace {
typedef struct {
......@@ -106,13 +110,36 @@ struct DataInfo {
}
};
/*
moonll:
changed the parameters of set_mul_mat and added func16
*/
typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);
struct MulMat {
std::array<mul_mat_t, 8> funcs = {};
mul_mat_t func16 = nullptr;
//inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)
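// Fast path: when a 16-row kernel (func16) is available and at least 16 rows
// of B remain, walk the x range in k_x_step tiles and let func16 consume 16
// rows per call; the generic funcs[] path below handles the remainder.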
if (func16 && nrc_y >= 16) {
int n_step = (nrc_y - info.cur_y)/16;
for (int ix = 0; ix < nrc_x; ix += k_x_step) {
auto this_info = info;
this_info.s += ix;
int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
for (int iy = 0; iy < n_step; ++iy) {
func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
this_info.cur_y += 16;
}
}
info.cur_y += 16 * n_step;
if (info.cur_y == nrc_y) return;
}
int n_step = (nrc_y - info.cur_y)/funcs.size();
if (n_step > 0) {
for (int ix = 0; ix < nrc_x; ix += k_x_step) {
......@@ -131,7 +158,7 @@ struct MulMat {
funcs[n_left-1](n, vx, bx, info, nrc_x);
}
}
static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny);
private:
template <typename Dequantizer> static IQK_NOINLINE void set_functions(MulMat& m);
};
......@@ -147,6 +174,787 @@ inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) {
aux32[0] = a0 & 0x3f3f3f3f;
}
/*
moonll:
decoding tables for IQ1_S
*/
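// iq1s_grid_us maps the 2048 IQ1_S grid indices to packed point coordinates
// (one digit in {0,1,2} per byte, by the look of the constants); the AVX2
// build widens each entry to a uint64_t so a full grid point loads as a
// single 8-byte word, while the fallback packs the same data into uint32_t.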
#ifdef __AVX2__
static const uint64_t iq1s_grid_us[2048] = {
0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200,
0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000,
0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101,
0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101,
0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202,
0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200,
0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001,
0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202,
0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201,
0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001,
0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101,
0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101,
0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202,
0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200,
0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201,
0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002,
0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101,
0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200,
0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102,
0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101,
0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001,
0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100,
0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200,
0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101,
0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100,
0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000,
0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202,
0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200,
0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101,
0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201,
0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002,
0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001,
0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001,
0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002,
0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000,
0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101,
0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000,
0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101,
0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202,
0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201,
0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000,
0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100,
0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102,
0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002,
0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000,
0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101,
0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101,
0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200,
0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002,
0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001,
0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101,
0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101,
0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101,
0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102,
0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100,
0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002,
0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100,
0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000,
0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101,
0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101,
0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001,
0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102,
0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201,
0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202,
0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001,
0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001,
0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101,
0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102,
0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200,
0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101,
0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101,
0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000,
0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201,
0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101,
0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202,
0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102,
0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101,
0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100,
0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002,
0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201,
0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101,
0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002,
0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202,
0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101,
0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000,
0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100,
0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102,
0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102,
0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101,
0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101,
0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001,
0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201,
0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002,
0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001,
0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100,
0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101,
0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001,
0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101,
0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000,
0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001,
0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101,
0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101,
0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000,
0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001,
0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001,
0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102,
0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102,
0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101,
0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201,
0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202,
0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202,
0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101,
0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001,
0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000,
0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101,
0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200,
0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100,
0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100,
0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202,
0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102,
0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201,
0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202,
0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002,
0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001,
0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001,
0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101,
0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202,
0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201,
0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102,
0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200,
0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001,
0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101,
0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201,
0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001,
0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002,
0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000,
0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202,
0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201,
0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201,
0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101,
0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100,
0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000,
0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101,
0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202,
0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101,
0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202,
0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202,
0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201,
0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002,
0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102,
0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102,
0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000,
0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000,
0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101,
0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101,
0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202,
0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200,
0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102,
0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101,
0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100,
0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001,
0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100,
0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101,
0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001,
0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200,
0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101,
0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101,
0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100,
0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101,
0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101,
0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101,
0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202,
0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100,
0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201,
0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202,
0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102,
0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200,
0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201,
0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000,
0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002,
0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100,
0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000,
0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100,
0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000,
0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102,
0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100,
0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002,
0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001,
0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201,
0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202,
0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100,
0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001,
0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002,
0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001,
0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201,
0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001,
0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101,
0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101,
0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101,
0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101,
0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102,
0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100,
0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001,
0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000,
0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001,
0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101,
0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100,
0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000,
0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202,
0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101,
0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100,
0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100,
0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200,
0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100,
0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101,
0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101,
0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201,
0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001,
0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201,
0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201,
0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001,
0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200,
0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100,
0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201,
0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200,
0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101,
0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001,
0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102,
0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001,
0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201,
0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100,
0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000,
0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102,
0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001,
0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202,
0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102,
0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101,
0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201,
0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101,
0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102,
0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101,
0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100,
0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202,
0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101,
0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202,
0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101,
0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200,
0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101,
0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100,
0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002,
0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201,
0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100,
0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202,
0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102,
0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002,
0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200,
0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002,
0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200,
0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001,
0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200,
0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100,
0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000,
0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102,
0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100,
0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000,
0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102,
0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100,
0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000,
0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101,
0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001,
0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201,
0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002,
0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200,
0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100,
0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101,
0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202,
0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002,
0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201,
0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201,
0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001,
0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202,
0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102,
0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002,
0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201,
0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200,
0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002,
0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100,
0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101,
0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102,
0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002,
0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200,
0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100,
0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001,
0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100,
0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201,
0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101,
0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102,
0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201,
0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200,
0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200,
0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002,
0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202,
0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102,
0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000,
0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202,
0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201,
0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001,
0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002,
0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102,
0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001,
0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101,
0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202,
0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102,
0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201,
0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101,
0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101,
0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001,
0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202,
0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000,
0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202,
0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102,
0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002,
0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201,
0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101,
0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001,
0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200,
0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102,
0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102,
0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100,
0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001,
0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201,
0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001,
0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202,
0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200,
0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000,
0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000,
0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001,
0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200,
0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200,
0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202,
0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201,
0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202,
0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001,
0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001,
0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200,
0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000,
0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102,
0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101,
0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100,
0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000,
0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100,
0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100,
0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102,
0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201,
0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202,
0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102,
0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102,
0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202,
0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202,
0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100,
0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000,
0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101,
0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202,
0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102,
0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100,
0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101,
0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100,
0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201,
0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101,
0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202,
0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200,
0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201,
0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200,
0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002,
0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201,
0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101,
0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201,
0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201,
0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102,
0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101,
0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101,
0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101,
0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001,
0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000,
0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102,
0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101,
0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202,
0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202,
0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101,
0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000,
0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101,
0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202,
0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100,
0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000,
0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101,
0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202,
0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100,
0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100,
0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002,
0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100,
0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101,
0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202,
0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200,
0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100,
0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200,
0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002,
0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001,
0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101,
0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101,
0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202,
0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102,
0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100,
0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101,
0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100,
0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101,
0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101,
0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101,
0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101,
0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102,
0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100,
0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102,
0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101,
0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101,
0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001,
0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101,
0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202,
0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102,
0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001,
0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102,
0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200,
0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101,
0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001,
0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201,
0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202,
0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102,
0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002,
0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200,
0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100,
0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001,
0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002,
0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201,
0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101,
0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100,
0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000,
0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200,
0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101,
0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200,
0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202,
0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100,
0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102,
0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102,
0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102,
0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101,
0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101,
0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000,
0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202,
0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102,
0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200,
0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101,
0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101,
0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100,
0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202,
0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101,
0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201,
0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001,
0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101,
0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200,
0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002,
0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001,
0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000,
0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101,
0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202,
0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100,
0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102,
0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200,
0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101,
0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201,
0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000,
0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202,
0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201,
0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200,
0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002,
0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101,
0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100,
0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001,
0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201,
0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000,
0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102,
0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001,
0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201,
0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100,
0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002,
0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001,
0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101,
0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002,
0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000,
0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101,
0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100,
0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200,
0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200,
0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102,
0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200,
0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002,
0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100,
0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001,
0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001,
0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102,
0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202,
0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202,
0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000,
0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101,
0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202,
};
#else
static const uint32_t iq1s_grid_us[2048] = {
0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
};
#endif
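// Note (added for readability): each entry in the grid tables above packs eight
// codebook values at three levels (0, 1 or 2) into one integer: one byte per
// value in the 64-bit table, one nibble per value in the 32-bit iq1s_grid_us
// table. A single table lookup therefore reconstructs a group of eight quants.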
#ifndef HAVE_FANCY_SIMD
const uint64_t keven_signs[128] = {
0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff,
0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff,
@@ -181,31 +989,41 @@ const uint64_t keven_signs[128] = {
0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
};
#endif
}
/* moonll change mulmat: add typeB and strideB */
bool iqk_mul_mat(long Nx, long Ny, long ne00,
int typeA, const void * A, long strideA,
int typeB, const void * B, long strideB,
float * C, long stride_C, int ith, int nth) {
MulMat mm;
if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) {
return false;
}
size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA));
size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB));
auto nrc_x = (Nx + nth - 1)/nth;
auto first_x = ith*nrc_x;
if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};
mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
return true;
}
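// A small worked example (illustration only, not part of the original sources)
// of the row partitioning above: each of the nth threads takes a contiguous
// chunk of ceil(Nx/nth) rows of A, with the last chunk clipped to Nx. For
// Nx = 10 and nth = 4, nrc_x = 3 and threads 0..3 compute rows [0,3), [3,6),
// [6,9) and [9,10).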
bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
@@ -213,9 +1031,11 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
MulMat mm;
int row_size_q8;
/* moonll: old single-type dispatch, disabled
if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
return false;
}
*/
int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
int nrc_x = (Nx + nth - 1)/nth;
int first_x = ith*nrc_x;
@@ -233,6 +1053,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#define HAVE_FANCY_SIMD
#endif
//#define HAVE_FANCY_SIMD
namespace {
@@ -257,10 +1078,9 @@ template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
}
#ifdef HAVE_FANCY_SIMD
inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
#endif
inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
inline float scale(int iy, int i) const { return y[iy][i].d; }
@@ -353,6 +1173,23 @@ struct ScaleIQ4XS {
const __m128i m32 = _mm_set1_epi16(-32);
};
struct Scales8KBase {
template <typename Q8>
inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i q8s = q8.load_bsums(iy, i);
const __m256i prod = _mm256_madd_epi16(mins, q8s);
accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
}
}
inline __m256i shuffle(__m128i mins) const {
return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0]));
}
const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
_mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};
};
template <typename Block>
struct BaseDequantizer {
BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {}
@@ -367,6 +1204,16 @@ struct BaseDequantizer {
float d;
};
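// The 16 constants below appear to be the IQ4_NL non-linear codebook stored
// with a +128 bias so the entries fit unsigned bytes: 1 == -127 + 128 at the
// low end, 241 == 113 + 128 at the high end.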
__m128i inline load_iq4nl_values_128() {
static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
return _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}
__m256i inline load_iq4nl_values_256() {
auto val128 = load_iq4nl_values_128();
return MM256_SET_M128I(val128, val128);
}
#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================
@@ -434,8 +1281,17 @@ struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
Scales8K s8k;
};
/*
moonll: DequantizerIQ4XS, reworked to use the shared iq4nl value loaders
*/
__m512i inline load_iq4nl_values_512() {
auto val256 = load_iq4nl_values_256();
return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
}
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {}
template <typename Q8>
inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
d = GGML_FP16_TO_FP32(x[i].d);
@@ -444,14 +1300,10 @@ struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
auto scales256 = MM256_SET_M128I(scales128, scales128);
auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]);
scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]);
scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]);
scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]);
}
inline void prepare(const uint8_t * q4) {
bits.prepare64(q4);
@@ -467,11 +1319,17 @@ struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
}
Q4Bits bits;
Scales8KBase s8k;
ScaleIQ4XS siq4;
const __m512i values;
const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
const __m512i shuffles[4] = {
_mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1),
_mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1),
_mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1),
_mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1),
};
};
struct HighBit5 {
@@ -646,6 +1504,149 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
}
}
template <typename Q8>
inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) {
const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0));
const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1));
const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2));
const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3));
auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}
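// Note on the intrinsic above: _mm512_dpbusd_epi32 treats its second operand
// as unsigned bytes and its third as signed bytes, which is why the dequantized
// x values are kept in the unsigned range while the q8 quants stay signed.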
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y> q8(info);
Dequantizer deq(vx, bx);
__m256 accm[nrc_y];
__m512 accd[nrc_y];
__m512i scales[2];
for (int ix = 0; ix < nrc_x; ++ix) {
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
deq.new_block(i, q8, accm, scales);
for (int iy = 0; iy < nrc_y; ++iy) {
const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0));
const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1));
const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2));
const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3));
auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
}
}
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
Q8<nrc_y> q8(info);
Dequantizer deq(vx, bx);
__m256 accm[nrc_y];
__m512 accd[nrc_y];
__m512i scales[4];
for (int ix = 0; ix < nrc_x; ++ix) {
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
deq.new_block(i, q8, accm, scales);
for (int iy = 0; iy < nrc_y; ++iy) {
const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0));
const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1));
const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2));
const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3));
auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(),
p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]);
accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
}
}
}
template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
const int nb = n / QK_K;
constexpr int k_nx = 2;
Q8<1> q8(info);
Dequantizer deq1(vx, bx);
Dequantizer deq2(vx, bx);
Dequantizer * deq[k_nx];
deq[0] = &deq1;
deq[1] = &deq2;
__m512i scales[2*k_nx];
for (int ix = 0; ix < nrc_x; ++ix) {
auto accd = _mm512_setzero_ps();
auto accm = _mm256_setzero_ps();
for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix);
for (int i = 0; i < nb/k_nx; ++i) {
for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx);
for (int kx = 0; kx < k_nx; ++kx) {
compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd);
}
}
if (2*(nb/2) < nb) {
int i0 = 2*(nb/2);
deq[0]->new_block(i0, q8, &accm, scales);
compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd);
}
auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1));
info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256)));
}
}
#else
// ===================================== Vanilla AVX2 =====================================
@@ -724,17 +1725,8 @@ struct HighBit3 {
__m256i hbits;
};
/*
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
if (j == 0) {
@@ -755,7 +1747,7 @@ inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i,
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
}
}
}
}*/
struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
@@ -889,22 +1881,14 @@ struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
const __m256i mh = _mm256_set1_epi8(0x30);
};
inline __m256i get_scale_shuffle_8(int i);
inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales);
inline __m256i get_scale_shuffle_16(int i);
inline void set_scales_16(const __m256i& all_scales, __m256i* scales);
template <typename Dequantizer, int nrc_y>
static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
@@ -1000,6 +1984,8 @@ static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInf
}
#endif // Zen4 or vanilla AVX2
//
// ============================== Legacy quants
//
@@ -1075,6 +2061,28 @@ struct ScaleHelperQ_0 {
template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
};
template <int min_value>
struct ScaleHelperQ_0_1 {
ggml_half scales8[4];
template <typename Q>
inline __m256 prepare4(const Q * y) {
for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
return _mm256_set_m128(_mm_mul_ps(s4, min), s4);
}
template <typename Q>
inline __m256 prepare4(__m256 other_scales, const Q * y) {
return _mm256_mul_ps(other_scales, prepare4<Q>(y));
}
template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
float d = GGML_FP16_TO_FP32(y->d);
return std::make_pair(d, -d*float(min_value));
}
std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
}
const __m128 min = _mm_set1_ps(float(-min_value));
};
struct ScaleHelperQ_1 {
uint32_t scales8[4];
@@ -1235,6 +2243,12 @@ struct Q8_0_Dequantizer {
}
};
struct Q8_0_1_Dequantizer {
inline __m256i dequant(const block_q8_0 * x) const {
return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs));
}
};
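// How this pairs up (my reading of the code): q8_0 stores signed int8 quants,
// and adding 127 maps them into the unsigned range required by the
// unsigned-by-signed dot products; the constant shift is undone by the
// -d*min_value term that ScaleHelperQ_0_1<127> produces in prepare1().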
struct Q4_0_Dequantizer {
Dequantizer4bit b4;
const __m256i m8 = _mm256_set1_epi8(-8);
@@ -1320,6 +2334,11 @@ struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_
Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
inline static int block_size() { return QK4_0; }
};
struct Q8_0_1_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0_1<127>, Q8_0_1_Dequantizer> {
Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
// using Sum4T = Sum4TypeQ81;
inline static int block_size() { return QK8_0; }
};
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
inline static int block_size() { return QK4_0; }
@@ -1353,8 +2372,488 @@ void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info
}
}
/*
moonll: add the helper structs used by DequantizerIQ2XXS,
SimpleBits and EvenSignHelper
*/
struct SimpleBits {
__m256i values[4];
};
// fix for #829: add detection of the AVX512VPOPCNTDQ extension
#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
#define HAVE_AVX512_POPCNT 1
#else
#define HAVE_AVX512_POPCNT 0
#endif
struct EvenSignHelper {
#if defined HAVE_FANCY_SIMD
// #pragma message("Using AVX512VPOPCNTDQ in even sign helper")
union sbits_t {
__m128i vec;
__mmask32 mask[4];
};
IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
// fix for #829: CPUs such as Intel Cascade Lake have AVX512 but not the
// AVX512VPOPCNTDQ extension; use a fallback implementation there
#if HAVE_AVX512_POPCNT
auto pcnt = _mm256_popcnt_epi32(aux);
#else
// fallback implementation using the compiler's builtin bit counting
__m256i pcnt;
int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
int* aux_ptr = reinterpret_cast<int*>(&aux); // read aux in place to avoid an unnecessary copy
#pragma unroll 8 // hint the compiler to unroll the loop to improve SIMD throughput
for (int i = 0; i < 8; i++) {
pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // builtin popcount per 32-bit lane
}
#endif
sbits_t sbits;
sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]);
//auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
//const __mmask32 * m32 = (const __mmask32 *)&sign_bits;
//values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]);
//values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]);
}
const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0);
const __m256i mask = _mm256_set1_epi32(127);
const __m256i mone = _mm256_set1_epi32(1);
#else
inline void sign_value(uint32_t aux32, __m256i& value) const {
auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127],
keven_signs[(aux32 >> 7) & 127], keven_signs[(aux32 >> 0) & 127]);
value = _mm256_sign_epi8(value, signs);
}
#endif
};
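// For reference, a scalar sketch (illustration only, not used by the kernels)
// of the even-sign encoding both branches implement: seven sign bits are stored
// per group of eight values and the eighth is their parity, so the count of
// negative signs in a group is always even.
//
// static inline uint32_t iq_full_signs(uint32_t bits7) {
//     bits7 &= 127;
//     return bits7 | ((__builtin_popcount(bits7) & 1u) << 7); // parity bit
// }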
/*
moonll: add multiply_add for mul_mat_qX_K_q8_K_IQ_1, along with the helpers
get_scale_shuffle_8, set_scales_8, get_scale_shuffle_16 and set_scales_16
*/
inline __m256i get_scale_shuffle_8(int i) {
return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
}
inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
}
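// What the two helpers above do: get_scale_shuffle_8(i) builds a vpshufb mask
// that replicates bytes 2*i and 2*i+1, i.e. the i-th 16-bit scale of each
// 128-bit half, across every 16-bit lane, so set_scales_8 broadcasts four
// consecutive block scales into the four vectors used by multiply_add.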
inline __m256i get_scale_shuffle_16(int i) {
static const uint8_t k_shuffle[128] = {
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
};
return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
}
inline void set_scales_16(const __m256i& all_scales, __m256i * scales) {
scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0));
scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1));
scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2));
scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3));
}
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
if (j == 0) {
#ifdef HAVE_FANCY_SIMD
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
}
#else
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
}
#endif
} else {
#ifdef HAVE_FANCY_SIMD
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
}
#else
for (int iy = 0; iy < Q8::nrc_y; ++iy) {
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
}
#endif
}
}
/*
moonll: add multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1, the helpers
set_scales_8_iq and set_scales_16_iq, and the matmul kernels
mul_mat_qX_K_q8_K_IQ_1, mul_mat_qX_K_q8_K_IQ_N and mul_mat_qX_K_q8_K_IQ
*/
template <typename Bits>
inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) {
if (j == 0) {
#ifdef HAVE_FANCY_SIMD
auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2));
sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4));
#else
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
sumi[0] = _mm256_add_epi32(p1, p3);
sumi[1] = _mm256_add_epi32(p2, p4);
#endif
} else {
#ifdef HAVE_FANCY_SIMD
auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]);
auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]);
auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]);
auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]);
sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2));
sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4));
#else
const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0]));
const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1]));
const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2]));
const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3]));
sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3));
sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4));
#endif
}
}
inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) {
//#ifdef HAVE_FANCY_SIMD
auto shuffle = j == 0 ? _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100)
: _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908);
scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)));
//#else
// set_scales_8(all_scales, j, scales);
//#endif
}
inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) {
#ifdef HAVE_FANCY_SIMD
auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100);
scales[0] = _mm256_shuffle_epi8(all_scales, shuffle);
scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8)));
#else
set_scales_16(all_scales, scales);
#endif
}
template <typename Dequantizer>
static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
const int nb = n / QK_K;
Q8<1> q8(info);
Dequantizer deq(vx, bx);
__m256i scales[2];
__m256i q8_quants[4];
for (int ix = 0; ix < nrc_x; ++ix) {
__m256 accd = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
__m256i sumi[2], all_scales[Dequantizer::num_blocks/8];
deq.new_block(i, all_scales);
for (int j = 0; j < QK_K/128; ++j) {
deq.prepare(i, j, q8, q8_quants);
if constexpr (Dequantizer::num_blocks == 8) {
set_scales_8_iq(j, all_scales[0], scales);
} else {
set_scales_16_iq(all_scales[j], scales);
}
multiply_add_1(j, deq.bits, scales, q8_quants, sumi);
}
accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd);
}
info.store(ix, 0, hsum_float_8(accd));
}
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
const int nb = n / QK_K;
Q8<nrc_y> q8(info);
Dequantizer deq(vx, bx);
__m256i scales[4];
__m256 accd[nrc_y];
for (int ix = 0; ix < nrc_x; ++ix) {
for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();
deq.new_row(ix);
for (int i = 0; i < nb; ++i) {
__m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8];
//for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256();
__m256i mins;
float dmin = deq.new_block(i, all_scales, mins);
for (int iy = 0; iy < nrc_y; ++iy) {
auto bsums = q8.load_bsums(iy, i);
auto prod = _mm256_madd_epi16(mins, bsums);
accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
}
for (int j = 0; j < QK_K/128; ++j) {
deq.prepare(i, j);
if constexpr (Dequantizer::num_blocks == 8) {
set_scales_8(all_scales[0], j, scales);
} else {
set_scales_16(all_scales[j], scales);
}
//multiply_add_iq(deq.bits, scales, j, i, q8, sumi);
multiply_add(deq.bits, scales, j, i, q8, sumi);
}
for (int iy = 0; iy < nrc_y; ++iy) {
const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, hsum_float_8(accd[iy]));
}
}
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
assert(n % QK_K == 0);
#ifdef HAVE_FANCY_SIMD
if constexpr (nrc_y == 1) {
mul_mat_qX_K_q8_K_IQ_1<Dequantizer>(n, vx, bx, info, nrc_x);
} else {
mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
}
#else
mul_mat_qX_K_q8_K_IQ_N<Dequantizer, nrc_y>(n, vx, bx, info, nrc_x);
#endif
}
/*
moonll: core IQ1_S kernel, mul_mat_iq1_s_q8_K
*/
template <int nrc_y>
static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(n%QK_K == 0);
Q8<nrc_y, block_q8_K> q8(info);
__m256i qx[8];
__m256i scales[4];
__m256 acc[nrc_y] = {};
auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000
__m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100);
for (int ix = 0; ix < nrc_x; ++ix) {
auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx);
for (int ibl = 0; ibl < n/QK_K; ++ibl) {
float d = GGML_FP16_TO_FP32(iq1s[ibl].d);
auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh);
auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7));
scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1));
#ifdef HAVE_FANCY_SIMD
auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask);
auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9));
#else
auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask);
auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7)));
#endif
deltas128 = _mm_mullo_epi16(scales128, deltas128);
scales128 = _mm_slli_epi16(scales128, 3);
auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128);
auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128);
auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7
auto all_scales = MM256_SET_M128I(scales128, scales128);
auto shuffle = shuffle0;
for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle);
shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4));
}
const uint8_t * qs = iq1s[ibl].qs;
const uint16_t * qh = iq1s[ibl].qh;
for (int ib = 0; ib < QK_K/32; ib += 2) {
qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)],
iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)],
iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
qs += 8;
}
for (int iy = 0; iy < nrc_y; ++iy) {
auto bsums = q8.load_bsums(iy, ibl);
auto sumi = _mm256_setzero_si256();
for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0);
auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1);
#ifdef HAVE_FANCY_SIMD
auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1);
auto dot2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2);
sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2));
#else
auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1);
auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2);
auto dot = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2));
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot));
#endif
}
#ifdef HAVE_FANCY_SIMD
sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas);
#else
sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas));
#endif
acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]);
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, 0.125f*hsum_float_8(acc[iy]));
acc[iy] = _mm256_setzero_ps();
}
}
}
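/*
Illustrative scalar reference for the IQ1_S sub-block layout decoded by
mul_mat_iq1_s_q8_K above (the helper name and this summary are ours, not part
of the kernel): each 16-bit qh entry describes one 32-weight sub-block.
*/
static inline void iq1s_decode_subblock_ref(const uint8_t * qs, uint16_t qh,
                                            uint16_t idx[4], int * scale, int * delta) {
    // bits 0..11: three extra grid-index bits for each of the four qs bytes
    idx[0] = qs[0] | ((qh << 8) & 0x700);
    idx[1] = qs[1] | ((qh << 5) & 0x700);
    idx[2] = qs[2] | ((qh << 2) & 0x700);
    idx[3] = qs[3] | ((qh >> 1) & 0x700);
    // bits 12..14: 3-bit scale s, applied as the odd value 2*s + 1
    *scale = 2*((qh >> 12) & 7) + 1;
    // bit 15: selects the per-sub-block delta applied via the Q8 block sums
    *delta = (qh & 0x8000) ? -9 : -7;
}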
/*
moonll, IQ1_S:
DequantizerIQ2XXS -- the IQ2_XXS dequantizer, added alongside the IQ1_S support.
*/
struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
constexpr static int num_blocks = 8;
union Data {
__m256i vec;
uint32_t val[8];
};
inline __m128i load_scales(int i) {
d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
const uint16_t * a16 = (const uint16_t *)x[i].qs;
auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12);
return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1));
}
inline void new_block(int i, __m256i * scales) {
auto sc16 = load_scales(i);
scales[0] = MM256_SET_M128I(sc16, sc16);
}
inline float new_block(int i, __m256i * scales, __m256i& mins) {
auto sc16 = load_scales(i);
mins = scb.shuffle(sc16);
scales[0] = MM256_SET_M128I(sc16, sc16);
return -d*minv;
}
inline static void make4(const uint32_t * aux32, __m256i * values) {
const uint8_t * aux8 = (const uint8_t *)aux32;
values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]);
values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]);
values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]);
values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]);
}
IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const {
#ifdef HAVE_FANCY_SIMD
esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0);
esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2);
#else
esh.sign_value(aux32[1], values[0]);
esh.sign_value(aux32[3], values[1]);
esh.sign_value(aux32[5], values[2]);
esh.sign_value(aux32[7], values[3]);
#endif
}
inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const {
make4(aux32, values);
sign_values(aux32, values);
for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value);
}
inline void make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const {
make4(aux32, values);
sign_values(aux32, q8);
}
inline void prepare(int i, int j) {
Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
make4_signed(data.val, min_value, bits.values);
}
inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) {
for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k);
Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j);
make4(data.val, bits.values, q8_quants);
}
constexpr static int minv = 43;
SimpleBits bits;
Scales8KBase scb;
EvenSignHelper esh;
const __m256i min_value = _mm256_set1_epi8(minv);
const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1);
};
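/*
Illustrative scalar view of DequantizerIQ2XXS::load_scales above (the helper
is ours): sub-block k of an IQ2_XXS block keeps its 4-bit scale in the top
nibble of the fourth uint16 of its 8-byte qs group; the kernel uses 2*s + 1
and folds the remaining 1/8 factor into d.
*/
static inline int iq2xxs_subblock_scale_ref(const block_iq2_xxs * x, int k) {
    const uint16_t * a16 = (const uint16_t *)x->qs;
    return 2*(a16[4*k + 3] >> 12) + 1;   // k = 0..7 picks a16[3], a16[7], ..., a16[31]
}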
/*
moonll:
add Q8_0_Unpacker and DequantizerIQ2XXS support,
and route the IQ types through the new mul_mat_qX_K_q8_K_IQ kernels.
*/
template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker> ||
std::is_same_v<Dequantizer, Q8_0_Unpacker>) {
m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
}
else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>|| std::is_same_v<Dequantizer, Q8_0_1_Unpacker>) {
m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_1_q8_1_T<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_1_q8_1_T<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_1_q8_1_T<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>;
}
else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS>) {
m.funcs[0] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 1>;
m.funcs[1] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_K_q8_K_IQ<Dequantizer, 8>;
}
else {
#ifdef HAVE_FANCY_SIMD
if constexpr (std::is_same_v<Dequantizer, DequantizerIQ4XS>) {
m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 1>;
m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 2>;
m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 3>;
m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 4>;
m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 5>;
m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 6>;
m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 7>;
m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512<Dequantizer, 8>;
} else {
m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1<Dequantizer>;
m.funcs[1] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 2>;
m.funcs[2] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 3>;
m.funcs[3] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 4>;
m.funcs[4] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 5>;
m.funcs[5] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 6>;
m.funcs[6] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 7>;
m.funcs[7] = mul_mat_qX_K_q8_K_AVX512<Dequantizer, 8>;
}
#else
if constexpr (std::is_same_v<Dequantizer, DequantizerQ2K> ||
std::is_same_v<Dequantizer, DequantizerQ3K> ||
@@ -1410,11 +2930,260 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
}
}
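/*
Minimal dispatch sketch (an assumed caller, not part of this diff): funcs[k]
handles k+1 right-hand-side rows, so the multiplication walks the RHS in
strips of at most 8; DataInfo::cur_y is assumed to track the strip offset.
*/
static void apply_mul_mat_sketch(const MulMat& mm, int n, const void * vx, size_t bx,
                                 DataInfo& info, int nrc_x, int Ny) {
    for (int y = 0; y < Ny; ) {
        int ny = Ny - y < 8 ? Ny - y : 8;   // at most 8 RHS rows per call
        mm.funcs[ny - 1](n, vx, bx, info, nrc_x);
        info.cur_y += ny;                   // assumed: advances the RHS offset
        y += ny;
    }
}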
struct QFBase {
#ifdef __AVX512F__
constexpr static int k_step = 16;
using Data = __m512;
using Acc = __m512;
static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); }
static inline Data load(const float * x) { return _mm512_loadu_ps(x); }
static inline Data load(const ggml_bf16_t * x) {
return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16));
}
static inline Acc acc(Acc prev, const Data& y, const Data& x) {
return _mm512_fmadd_ps(y, x, prev);
}
static inline Acc acc_first(const Data& y, const Data& x) {
return _mm512_mul_ps(y, x);
}
static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); }
static inline float hsum(Acc acc) {
return _mm512_reduce_add_ps(acc);
}
template <typename Float>
static inline Data load4Floats(const Float * x) {
return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0);
}
static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc);
acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
return acc;
}
static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00));
acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc);
acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc);
acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc);
return acc;
}
static inline __m128 hsum_r4(Acc acc) {
auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1));
auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3));
return _mm_add_ps(sum1, sum2);
}
#else
constexpr static int k_step = 8;
using Data = __m256;
using Acc = __m256;
static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); }
static inline Data load(const float * x) { return _mm256_loadu_ps(x); }
static inline Data load(const ggml_bf16_t * x) {
return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16));
}
static inline Acc acc(Acc prev, const Data& y, const Data& x) {
return _mm256_fmadd_ps(y, x, prev);
}
static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); }
static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) {
acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc);
acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
return acc;
}
static inline Acc acc_r4_first(const Data * xv, const Data& yv) {
auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00));
acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc);
acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc);
acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc);
return acc;
}
static inline Acc acc_first(const Data& y, const Data& x) {
return _mm256_mul_ps(y, x);
}
static inline float hsum(Acc acc) {
return hsum_float_8(acc);
}
static inline __m128 hsum_r4(Acc acc) {
return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
}
template <typename Float>
static inline Data load4Floats(const Float * x) {
return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0);
}
#endif
static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); }
static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); }
static inline __m128 load128(const ggml_bf16_t * x) {
return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16));
}
};
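/*
Scalar equivalent of the ggml_bf16_t loads above (the helper is ours, and
assumes <cstring> is available): bf16 is the upper half of an IEEE-754
binary32, so widening is a 16-bit left shift.
*/
static inline float bf16_to_f32_ref(uint16_t h) {
    uint32_t bits = (uint32_t)h << 16;
    float f;
    memcpy(&f, &bits, sizeof f);   // bit-exact reinterpretation
    return f;
}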
template <typename Float, int nrc_in> struct QFT final : public QFBase {
constexpr static int nrc = nrc_in;
QFT(const DataInfo& info) {
for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy);
}
QFT(const char * cx, size_t bx) {
for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx);
}
IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); }
IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); }
IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const {
xv[0] = load1(ix+0, i);
xv[1] = load1(ix+1, i);
xv[2] = load1(ix+2, i);
xv[3] = load1(ix+3, i);
#ifdef __AVX512F__
auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]);
auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]);
auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]);
auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]);
xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1)));
xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3)));
#else
auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]);
auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]);
auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]);
auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]);
xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1)));
xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3)));
#endif
}
const Float * y[nrc];
};
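/*
Scalar reference for the r4 helpers (ours, for illustration): after load_r4's
in-lane 4x4 transposes, accumulator lane r (mod 4) gathers the dot product of
x-row r with y, so a single hsum_r4 finishes four rows at once.
*/
static inline void qft_dot4_rows_ref(const float * const x[4], const float * y,
                                     int n, float out[4]) {
    for (int r = 0; r < 4; ++r) {
        float sum = 0;
        for (int j = 0; j < n; ++j) sum += x[r][j]*y[j];
        out[r] = sum;
    }
}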
template <typename Qy, typename Qx>
IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) {
int nb = n/QFBase::k_step;
int nb4 = n/4;
Qy y(info);
Qx x(cx + ix0*bx, bx);
QFBase::Data xv[Qx::nrc];
QFBase::Acc acc[Qx::nrc*Qy::nrc];
auto yv = y.load1(0, 0);
for (int ix = 0; ix < Qx::nrc; ++ix) {
xv[ix] = x.load1(ix, 0);
acc[ix] = QFBase::acc_first(yv, xv[ix]);
}
for (int iy = 1; iy < Qy::nrc; ++iy) {
yv = y.load1(iy, 0);
for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]);
}
for (int i = 1; i < nb; ++i) {
yv = y.load1(0, i);
for (int ix = 0; ix < Qx::nrc; ++ix) {
xv[ix] = x.load1(ix, i);
acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
}
for (int iy = 1; iy < Qy::nrc; ++iy) {
yv = y.load1(iy, i);
for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
}
}
for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) {
yv = y.load_tail(0, i);
for (int ix = 0; ix < Qx::nrc; ++ix) {
xv[ix] = x.load_tail(ix, i);
acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]);
}
for (int iy = 1; iy < Qy::nrc; ++iy) {
yv = y.load_tail(iy, i);
for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]);
}
}
for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix]));
}
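/*
Tail handling in mul_mat_Qx_Qy_MxN above, restated as a small helper (ours,
illustrative): after nb full k_step-wide steps, the remaining floats are
consumed in 4-float chunks. E.g. with AVX2 (k_step = 8) and n = 100: nb = 12
covers 96 floats, and one tail chunk (index 24 of the 4-float view) covers
floats 96..99; n is assumed to be a multiple of 4.
*/
static inline int qf_tail_chunks(int n, int k_step) {
    int nb  = n / k_step;          // full SIMD steps taken by the main loop
    int nb4 = n / 4;               // total 4-float chunks in a row
    return nb4 - (k_step/4)*nb;    // chunks left for the tail loop
}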
// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done
// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in
// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now.
template <int nrc_y, typename FloatX, typename FloatY>
void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
const char * cx = (const char *)vx;
// TBD if we want this
//if constexpr (nrc_y == 1) {
// constexpr int k_nx = 2;
// for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
// mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
// }
// if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) {
// int nx = nrc_x - lastx;
// switch (nx) {
// case 1: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info); break;
// case 2: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, lastx, info); break;
// case 3: mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, lastx, info); break;
// }
// //mul_mat_Qx_Qy_Mx1<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, lastx, info);
// }
// return;
//}
#ifdef __AVX512F__
constexpr int k_nx = 5;
#else
constexpr int k_nx = nrc_y == 1 ? 4 : 2;
#endif
for (int ix = 0; ix < nrc_x/k_nx; ++ix) {
mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, k_nx>>(n, cx, bx, ix*k_nx, info);
}
int last_x = k_nx*(nrc_x/k_nx);
if (last_x == nrc_x) return;
int nx = nrc_x - last_x;
#ifdef __AVX512F__
switch (nx) {
case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
case 4: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 4>>(n, cx, bx, last_x, info); break;
}
#else
if constexpr (nrc_y == 1) {
switch (nx) {
case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
case 2: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 2>>(n, cx, bx, last_x, info); break;
case 3: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 3>>(n, cx, bx, last_x, info); break;
}
} else {
switch (nx) {
case 1: mul_mat_Qx_Qy_MxN<QFT<FloatY, nrc_y>, QFT<FloatX, 1>>(n, cx, bx, last_x, info); break;
}
}
#endif
}
template <typename FloatX, typename FloatY>
void set_mul_mat_f(MulMat& mm) {
for (auto& f : mm.funcs) f = nullptr;
mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>;
mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>;
mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>;
mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>;
mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>;
#ifndef __AVX512F__
mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>;
#endif
}
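/*
Hypothetical instantiation pattern for the float kernels (not shown in this
hunk): FloatX is the weight element type and FloatY the activation element
type, covering the four combinations named in the comment above mul_mat_fX_fY_T.
*/
static void set_mul_mat_f_example(MulMat& mm, ggml_type typeA, ggml_type typeB) {
    if      (typeA == GGML_TYPE_F16 && typeB == GGML_TYPE_F32) set_mul_mat_f<ggml_half, float>(mm);
    else if (typeA == GGML_TYPE_F32 && typeB == GGML_TYPE_F16) set_mul_mat_f<float, ggml_half>(mm);
    else if (typeA == GGML_TYPE_F16 && typeB == GGML_TYPE_F16) set_mul_mat_f<ggml_half, ggml_half>(mm);
    else                                                       set_mul_mat_f<float, float>(mm);
}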
/*
moonll:
take typeB as well, and return false when it does not match the activation
type expected for the given weight type;
add IQ2_XXS, IQ1_S, and GGML_TYPE_IQ4_XS support.
*/
bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
(void)Ny;
auto expected_typeB = GGML_TYPE_Q8_K;
switch (typeA) {
case GGML_TYPE_Q2_K:
assert (ne00 % QK_K == 0);
@@ -1440,37 +3209,75 @@ bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int)
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerIQ4XS>(mm);
break;
case GGML_TYPE_IQ2_XXS:
assert (ne00 % QK_K == 0);
MulMat::set_functions<DequantizerIQ2XXS>(mm);
break;
case GGML_TYPE_Q4_0:
assert (ne00 % QK4_0 == 0);
MulMat::set_functions<Q4_0_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_0;
break;
case GGML_TYPE_Q4_1:
assert (ne00 % QK4_1 == 0);
MulMat::set_functions<Q4_1_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_1_X4;
break;
case GGML_TYPE_Q5_0:
assert (ne00 % QK5_0 == 0);
MulMat::set_functions<Q5_0_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_0;
break;
case GGML_TYPE_Q5_1:
assert (ne00 % QK5_1 == 0);
MulMat::set_functions<Q5_1_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_1_X4;
break;
case GGML_TYPE_Q8_0:
assert (ne00 % QK8_0 == 0);
#ifdef HAVE_FANCY_SIMD
MulMat::set_functions<Q8_0_1_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_1_X4;
#else
MulMat::set_functions<Q8_0_Unpacker>(mm);
expected_typeB = GGML_TYPE_Q8_0_X4;
#endif
break;
case GGML_TYPE_IQ1_S:
mm.funcs[0] = mul_mat_iq1_s_q8_K<1>;
mm.funcs[1] = mul_mat_iq1_s_q8_K<2>;
mm.funcs[2] = mul_mat_iq1_s_q8_K<3>;
mm.funcs[3] = mul_mat_iq1_s_q8_K<4>;
mm.funcs[4] = mul_mat_iq1_s_q8_K<5>;
mm.funcs[5] = mul_mat_iq1_s_q8_K<6>;
mm.funcs[6] = mul_mat_iq1_s_q8_K<7>;
mm.funcs[7] = mul_mat_iq1_s_q8_K<8>;
#ifdef HAVE_FANCY_SIMD
mm.func16 = mul_mat_iq1_s_q8_K<16>;
#endif
// row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
expected_typeB = GGML_TYPE_Q8_K;
break;
default:
{
printf("MulMat::set_mul_mat: unsupported typeA = %d\n", typeA);
return false;
}
}
return ggml_type(typeB) == expected_typeB;
}
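/*
Assumed call pattern for the extended API (illustrative, not part of this
diff): the caller now passes both tensor types and treats a false return as
"no fast path", which happens either for an unsupported typeA or for a typeB
that is not the expected quantized activation type.
*/
static bool try_fast_mul_mat(int typeA, int typeB, int ne00, int Ny) {
    MulMat mm;
    return MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny);
}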
} // namespace
/*
IQ1_S is not supported on ARM.
*/
#else // __aarch64__
namespace {
@@ -12,10 +12,15 @@ extern "C" {
struct ggml_tensor;
struct ggml_compute_params;
/*
moonll:
extend these declarations with additional parameters (typeB and the A/B row strides).
*/
bool iqk_mul_mat(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
bool iqk_mul_mat_zen4(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
bool iqk_mul_mat_arm82(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);