Commit 0ce8bcfd authored by xuxzh1's avatar xuxzh1 🎱
Browse files

init

parent b0135f4b
...@@ -476,7 +476,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par ...@@ -476,7 +476,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
} }
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
...@@ -1032,7 +1032,7 @@ struct winogrande_entry { ...@@ -1032,7 +1032,7 @@ struct winogrande_entry {
std::vector<llama_token> seq_tokens[2]; std::vector<llama_token> seq_tokens[2];
}; };
static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) { static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
std::vector<winogrande_entry> result; std::vector<winogrande_entry> result;
std::istringstream in(prompt); std::istringstream in(prompt);
std::string line; std::string line;
...@@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { ...@@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
params.n_ctx = 512;
params.logits_all = true;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
params.logits_all = true;
const int32_t n_ctx = params.n_ctx; const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) { if (n_ctx <= 0) {
...@@ -1989,6 +1991,12 @@ int main(int argc, char ** argv) { ...@@ -1989,6 +1991,12 @@ int main(int argc, char ** argv) {
params.n_batch = std::min(params.n_batch, n_kv); params.n_batch = std::min(params.n_batch, n_kv);
} else { } else {
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.kl_divergence) {
params.n_parallel = 1;
} else {
// ensure there's at least enough seq_ids for HellaSwag
params.n_parallel = std::max(4, params.n_parallel);
}
} }
if (params.ppl_stride > 0) { if (params.ppl_stride > 0) {
...@@ -2006,27 +2014,22 @@ int main(int argc, char ** argv) { ...@@ -2006,27 +2014,22 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
// ensure there's at least enough seq_ids for HellaSwag
params.n_parallel = std::max(4, params.n_parallel);
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params); llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) { if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx); __func__, n_ctx_train, params.n_ctx);
......
...@@ -9,7 +9,7 @@ from inspect import getdoc, isclass ...@@ -9,7 +9,7 @@ from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
from docstring_parser import parse from docstring_parser import parse
from pydantic import BaseModel, Field, create_model from pydantic import BaseModel, create_model
if TYPE_CHECKING: if TYPE_CHECKING:
from types import GenericAlias from types import GenericAlias
...@@ -17,6 +17,9 @@ else: ...@@ -17,6 +17,9 @@ else:
# python 3.8 compat # python 3.8 compat
from typing import _GenericAlias as GenericAlias from typing import _GenericAlias as GenericAlias
# TODO: fix this
# pyright: reportAttributeAccessIssue=information
class PydanticDataType(Enum): class PydanticDataType(Enum):
""" """
...@@ -50,35 +53,38 @@ class PydanticDataType(Enum): ...@@ -50,35 +53,38 @@ class PydanticDataType(Enum):
def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str: def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
if isclass(pydantic_type) and issubclass(pydantic_type, str): origin_type = get_origin(pydantic_type)
origin_type = pydantic_type if origin_type is None else origin_type
if isclass(origin_type) and issubclass(origin_type, str):
return PydanticDataType.STRING.value return PydanticDataType.STRING.value
elif isclass(pydantic_type) and issubclass(pydantic_type, bool): elif isclass(origin_type) and issubclass(origin_type, bool):
return PydanticDataType.BOOLEAN.value return PydanticDataType.BOOLEAN.value
elif isclass(pydantic_type) and issubclass(pydantic_type, int): elif isclass(origin_type) and issubclass(origin_type, int):
return PydanticDataType.INTEGER.value return PydanticDataType.INTEGER.value
elif isclass(pydantic_type) and issubclass(pydantic_type, float): elif isclass(origin_type) and issubclass(origin_type, float):
return PydanticDataType.FLOAT.value return PydanticDataType.FLOAT.value
elif isclass(pydantic_type) and issubclass(pydantic_type, Enum): elif isclass(origin_type) and issubclass(origin_type, Enum):
return PydanticDataType.ENUM.value return PydanticDataType.ENUM.value
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel): elif isclass(origin_type) and issubclass(origin_type, BaseModel):
return format_model_and_field_name(pydantic_type.__name__) return format_model_and_field_name(origin_type.__name__)
elif get_origin(pydantic_type) is list: elif origin_type is list:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-list" return f"{map_pydantic_type_to_gbnf(element_type)}-list"
elif get_origin(pydantic_type) is set: elif origin_type is set:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-set" return f"{map_pydantic_type_to_gbnf(element_type)}-set"
elif get_origin(pydantic_type) is Union: elif origin_type is Union:
union_types = get_args(pydantic_type) union_types = get_args(pydantic_type)
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types] union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
return f"union-{'-or-'.join(union_rules)}" return f"union-{'-or-'.join(union_rules)}"
elif get_origin(pydantic_type) is Optional: elif origin_type is Optional:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"optional-{map_pydantic_type_to_gbnf(element_type)}" return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
elif isclass(pydantic_type): elif isclass(origin_type):
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}" return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(origin_type.__name__)}"
elif get_origin(pydantic_type) is dict: elif origin_type is dict:
key_type, value_type = get_args(pydantic_type) key_type, value_type = get_args(pydantic_type)
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}" return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
else: else:
...@@ -115,7 +121,7 @@ def get_members_structure(cls, rule_name): ...@@ -115,7 +121,7 @@ def get_members_structure(cls, rule_name):
# Modify this comprehension # Modify this comprehension
members = [ members = [
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}' f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
for name, param_type in cls.__annotations__.items() for name, param_type in get_type_hints(cls).items()
if name != "self" if name != "self"
] ]
...@@ -234,8 +240,9 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None ...@@ -234,8 +240,9 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
# Define the integer part rule # Define the integer part rule
integer_part_rule = ( integer_part_rule = (
"integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( "integer-part"
f"-min{min_digit}" if min_digit is not None else "") + (f"-max{max_digit}" if max_digit is not None else "")
+ (f"-min{min_digit}" if min_digit is not None else "")
) )
# Define the fractional part rule based on precision constraints # Define the fractional part rule based on precision constraints
...@@ -293,17 +300,20 @@ def generate_gbnf_rule_for_type( ...@@ -293,17 +300,20 @@ def generate_gbnf_rule_for_type(
field_name = format_model_and_field_name(field_name) field_name = format_model_and_field_name(field_name)
gbnf_type = map_pydantic_type_to_gbnf(field_type) gbnf_type = map_pydantic_type_to_gbnf(field_type)
if isclass(field_type) and issubclass(field_type, BaseModel): origin_type = get_origin(field_type)
origin_type = field_type if origin_type is None else origin_type
if isclass(origin_type) and issubclass(origin_type, BaseModel):
nested_model_name = format_model_and_field_name(field_type.__name__) nested_model_name = format_model_and_field_name(field_type.__name__)
nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules) nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules)
rules.extend(nested_model_rules) rules.extend(nested_model_rules)
gbnf_type, rules = nested_model_name, rules gbnf_type, rules = nested_model_name, rules
elif isclass(field_type) and issubclass(field_type, Enum): elif isclass(origin_type) and issubclass(origin_type, Enum):
enum_values = [f'"\\"{e.value}\\""' for e in field_type] # Adding escaped quotes enum_values = [f'"\\"{e.value}\\""' for e in field_type] # Adding escaped quotes
enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}" enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
rules.append(enum_rule) rules.append(enum_rule)
gbnf_type, rules = model_name + "-" + field_name, rules gbnf_type, rules = model_name + "-" + field_name, rules
elif get_origin(field_type) == list: # Array elif origin_type is list: # Array
element_type = get_args(field_type)[0] element_type = get_args(field_type)[0]
element_rule_name, additional_rules = generate_gbnf_rule_for_type( element_rule_name, additional_rules = generate_gbnf_rule_for_type(
model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
...@@ -313,7 +323,7 @@ def generate_gbnf_rule_for_type( ...@@ -313,7 +323,7 @@ def generate_gbnf_rule_for_type(
rules.append(array_rule) rules.append(array_rule)
gbnf_type, rules = model_name + "-" + field_name, rules gbnf_type, rules = model_name + "-" + field_name, rules
elif get_origin(field_type) == set or field_type == set: # Array elif origin_type is set: # Array
element_type = get_args(field_type)[0] element_type = get_args(field_type)[0]
element_rule_name, additional_rules = generate_gbnf_rule_for_type( element_rule_name, additional_rules = generate_gbnf_rule_for_type(
model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
...@@ -367,7 +377,7 @@ def generate_gbnf_rule_for_type( ...@@ -367,7 +377,7 @@ def generate_gbnf_rule_for_type(
gbnf_type = f"{model_name}-{field_name}-optional" gbnf_type = f"{model_name}-{field_name}-optional"
else: else:
gbnf_type = f"{model_name}-{field_name}-union" gbnf_type = f"{model_name}-{field_name}-union"
elif isclass(field_type) and issubclass(field_type, str): elif isclass(origin_type) and issubclass(origin_type, str):
if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None: if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None:
triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False) triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False)
markdown_string = field_info.json_schema_extra.get("markdown_code_block", False) markdown_string = field_info.json_schema_extra.get("markdown_code_block", False)
...@@ -383,8 +393,8 @@ def generate_gbnf_rule_for_type( ...@@ -383,8 +393,8 @@ def generate_gbnf_rule_for_type(
gbnf_type = PydanticDataType.STRING.value gbnf_type = PydanticDataType.STRING.value
elif ( elif (
isclass(field_type) isclass(origin_type)
and issubclass(field_type, float) and issubclass(origin_type, float)
and field_info and field_info
and hasattr(field_info, "json_schema_extra") and hasattr(field_info, "json_schema_extra")
and field_info.json_schema_extra is not None and field_info.json_schema_extra is not None
...@@ -409,8 +419,8 @@ def generate_gbnf_rule_for_type( ...@@ -409,8 +419,8 @@ def generate_gbnf_rule_for_type(
) )
elif ( elif (
isclass(field_type) isclass(origin_type)
and issubclass(field_type, int) and issubclass(origin_type, int)
and field_info and field_info
and hasattr(field_info, "json_schema_extra") and hasattr(field_info, "json_schema_extra")
and field_info.json_schema_extra is not None and field_info.json_schema_extra is not None
...@@ -458,7 +468,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas ...@@ -458,7 +468,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
if not issubclass(model, BaseModel): if not issubclass(model, BaseModel):
# For non-Pydantic classes, generate model_fields from __annotations__ or __init__ # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
if hasattr(model, "__annotations__") and model.__annotations__: if hasattr(model, "__annotations__") and model.__annotations__:
model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} model_fields = {name: (typ, ...) for name, typ in get_type_hints(model).items()}
else: else:
init_signature = inspect.signature(model.__init__) init_signature = inspect.signature(model.__init__)
parameters = init_signature.parameters parameters = init_signature.parameters
...@@ -466,7 +476,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas ...@@ -466,7 +476,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
name != "self"} name != "self"}
else: else:
# For Pydantic models, use model_fields and check for ellipsis (required fields) # For Pydantic models, use model_fields and check for ellipsis (required fields)
model_fields = model.__annotations__ model_fields = get_type_hints(model)
model_rule_parts = [] model_rule_parts = []
nested_rules = [] nested_rules = []
...@@ -624,7 +634,7 @@ string ::= "\"" ( ...@@ -624,7 +634,7 @@ string ::= "\"" (
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" ws )* "\"" ws
ws ::= ([ \t\n] ws)? ws ::= ([ \t\n] ws)?
float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
integer ::= [0-9]+""" integer ::= [0-9]+"""
...@@ -680,7 +690,7 @@ def generate_markdown_documentation( ...@@ -680,7 +690,7 @@ def generate_markdown_documentation(
str: Generated text documentation. str: Generated text documentation.
""" """
documentation = "" documentation = ""
pyd_models = [(model, True) for model in pydantic_models] pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
for model, add_prefix in pyd_models: for model, add_prefix in pyd_models:
if add_prefix: if add_prefix:
documentation += f"{model_prefix}: {model.__name__}\n" documentation += f"{model_prefix}: {model.__name__}\n"
...@@ -700,9 +710,9 @@ def generate_markdown_documentation( ...@@ -700,9 +710,9 @@ def generate_markdown_documentation(
# Indenting the fields section # Indenting the fields section
documentation += f" {fields_prefix}:\n" documentation += f" {fields_prefix}:\n"
else: else:
documentation += f" Fields:\n" documentation += f" Fields:\n" # noqa: F541
if isclass(model) and issubclass(model, BaseModel): if isclass(model) and issubclass(model, BaseModel):
for name, field_type in model.__annotations__.items(): for name, field_type in get_type_hints(model).items():
# if name == "markdown_code_block": # if name == "markdown_code_block":
# continue # continue
if get_origin(field_type) == list: if get_origin(field_type) == list:
...@@ -750,14 +760,17 @@ def generate_field_markdown( ...@@ -750,14 +760,17 @@ def generate_field_markdown(
field_info = model.model_fields.get(field_name) field_info = model.model_fields.get(field_name)
field_description = field_info.description if field_info and field_info.description else "" field_description = field_info.description if field_info and field_info.description else ""
if get_origin(field_type) == list: origin_type = get_origin(field_type)
origin_type = field_type if origin_type is None else origin_type
if origin_type == list:
element_type = get_args(field_type)[0] element_type = get_args(field_type)[0]
field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})" field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
if field_description != "": if field_description != "":
field_text += ":\n" field_text += ":\n"
else: else:
field_text += "\n" field_text += "\n"
elif get_origin(field_type) == Union: elif origin_type == Union:
element_types = get_args(field_type) element_types = get_args(field_type)
types = [] types = []
for element_type in element_types: for element_type in element_types:
...@@ -778,7 +791,7 @@ def generate_field_markdown( ...@@ -778,7 +791,7 @@ def generate_field_markdown(
return field_text return field_text
if field_description != "": if field_description != "":
field_text += f" Description: " + field_description + "\n" field_text += f" Description: {field_description}\n"
# Check for and include field-specific examples if available # Check for and include field-specific examples if available
if hasattr(model, "Config") and hasattr(model.Config, if hasattr(model, "Config") and hasattr(model.Config,
...@@ -788,9 +801,9 @@ def generate_field_markdown( ...@@ -788,9 +801,9 @@ def generate_field_markdown(
example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
field_text += f"{indent} Example: {example_text}\n" field_text += f"{indent} Example: {example_text}\n"
if isclass(field_type) and issubclass(field_type, BaseModel): if isclass(origin_type) and issubclass(origin_type, BaseModel):
field_text += f"{indent} Details:\n" field_text += f"{indent} Details:\n"
for name, type_ in field_type.__annotations__.items(): for name, type_ in get_type_hints(field_type).items():
field_text += generate_field_markdown(name, type_, field_type, depth + 2) field_text += generate_field_markdown(name, type_, field_type, depth + 2)
return field_text return field_text
...@@ -833,7 +846,7 @@ def generate_text_documentation( ...@@ -833,7 +846,7 @@ def generate_text_documentation(
str: Generated text documentation. str: Generated text documentation.
""" """
documentation = "" documentation = ""
pyd_models = [(model, True) for model in pydantic_models] pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
for model, add_prefix in pyd_models: for model, add_prefix in pyd_models:
if add_prefix: if add_prefix:
documentation += f"{model_prefix}: {model.__name__}\n" documentation += f"{model_prefix}: {model.__name__}\n"
...@@ -851,7 +864,7 @@ def generate_text_documentation( ...@@ -851,7 +864,7 @@ def generate_text_documentation(
if isclass(model) and issubclass(model, BaseModel): if isclass(model) and issubclass(model, BaseModel):
documentation_fields = "" documentation_fields = ""
for name, field_type in model.__annotations__.items(): for name, field_type in get_type_hints(model).items():
# if name == "markdown_code_block": # if name == "markdown_code_block":
# continue # continue
if get_origin(field_type) == list: if get_origin(field_type) == list:
...@@ -944,7 +957,7 @@ def generate_field_text( ...@@ -944,7 +957,7 @@ def generate_field_text(
if isclass(field_type) and issubclass(field_type, BaseModel): if isclass(field_type) and issubclass(field_type, BaseModel):
field_text += f"{indent} Details:\n" field_text += f"{indent} Details:\n"
for name, type_ in field_type.__annotations__.items(): for name, type_ in get_type_hints(field_type).items():
field_text += generate_field_text(name, type_, field_type, depth + 2) field_text += generate_field_text(name, type_, field_type, depth + 2)
return field_text return field_text
...@@ -1164,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): ...@@ -1164,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
dynamic_fields[param.name] = ( dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model # Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload] dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
for name, param_doc in param_docs: for name, param_doc in param_docs:
dynamic_model.model_fields[name].description = param_doc.description dynamic_model.model_fields[name].description = param_doc.description
...@@ -1228,9 +1241,6 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list): ...@@ -1228,9 +1241,6 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list):
return output return output
from enum import Enum
def json_schema_to_python_types(schema): def json_schema_to_python_types(schema):
type_map = { type_map = {
"any": Any, "any": Any,
...@@ -1275,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: ...@@ -1275,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
if items != {}: if items != {}:
array = {"properties": items} array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...) # type: ignore[valid-type] fields[field_name] = (List[array_type], ...)
else: else:
fields[field_name] = (list, ...) fields[field_name] = (list, ...)
elif field_type == "object": elif field_type == "object":
...@@ -1285,7 +1295,8 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: ...@@ -1285,7 +1295,8 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
required = field_data.get("enum", []) required = field_data.get("enum", [])
for key, field in fields.items(): for key, field in fields.items():
if key not in required: if key not in required:
fields[key] = (Optional[fields[key][0]], ...) optional_type = fields[key][0]
fields[key] = (Optional[optional_type], ...)
else: else:
field_type = json_schema_to_python_types(field_type) field_type = json_schema_to_python_types(field_type)
fields[field_name] = (field_type, ...) fields[field_name] = (field_type, ...)
...@@ -1305,6 +1316,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: ...@@ -1305,6 +1316,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
required = dictionary.get("required", []) required = dictionary.get("required", [])
for key, field in fields.items(): for key, field in fields.items():
if key not in required: if key not in required:
fields[key] = (Optional[fields[key][0]], ...) optional_type = fields[key][0]
fields[key] = (Optional[optional_type], ...)
custom_model = create_model(model_name, **fields) custom_model = create_model(model_name, **fields)
return custom_model return custom_model
#!/usr/bin/env python3
"""Function calling example using pydantic models."""
from __future__ import annotations
import argparse
import datetime
import json
import logging
import textwrap
import sys
from enum import Enum
from typing import Optional, Union
import requests
from pydantic import BaseModel, Field
from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
def create_completion(host, prompt, gbnf_grammar):
    """Calls the /completion API on llama-server.

    See
    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints

    Args:
        host: Server address, interpolated as ``http://{host}/completion``.
        prompt: Prompt text sent as the ``"prompt"`` field of the request.
        gbnf_grammar: Grammar text sent as the ``"grammar"`` field of the request.

    Returns:
        The ``"content"`` value of the server's JSON reply.

    Raises:
        AssertionError: If the server's reply carries a non-null ``"error"`` field.
    """
    print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
    headers = {"Content-Type": "application/json"}
    data = {"prompt": prompt, "grammar": gbnf_grammar}
    result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
    # Check the server's reply, not the request payload: `data` only ever holds
    # "prompt" and "grammar", so testing it could never reveal a failed request.
    assert result.get("error") is None, result
    logging.info("Result: %s", result)
    content = result["content"]
    print(f" Model: {result['model']}")
    # The content is re-parsed and re-serialized only to pretty-print it.
    print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
    return content
# Tool model the agent can pick to deliver a plain-text message to the user.
# The class docstring and the Field descriptions are kept verbatim: this model
# is handed to generate_gbnf_grammar_and_documentation by the examples below.
class SendMessageToUser(BaseModel):
    """Send a message to the User."""

    chain_of_thought: str = Field(
        ...,
        description="Your chain of thought while sending the message.",
    )
    message: str = Field(
        ...,
        description="Message you want to send to the user.",
    )

    def run(self):
        # "Running" this tool simply echoes the message to stdout.
        outgoing = self.message
        print(f"SendMessageToUser: {outgoing}")
def example_rce(host):
    """Smallest function-calling demo: the LLM selects a Python callable to run.

    Only SendMessageToUser is offered as a tool. The completion is parsed as
    JSON, the tool named in its "function" entry is looked up, instantiated
    from "function_parameters" and run.

    Args:
        host: Server address forwarded to create_completion.

    Returns:
        0 on success, 1 when the reply names a tool that was not offered.
    """
    print("- example_rce")
    available_tools = [SendMessageToUser]
    grammar, docs = generate_gbnf_grammar_and_documentation(
        pydantic_model_list=available_tools,
        outer_object_name="function",
        outer_object_content="function_parameters",
        model_prefix="Function",
        fields_prefix="Parameters",
    )
    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + docs
    user_message = "What is 42 * 42?"
    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
    reply = json.loads(create_completion(host, prompt, grammar))

    # Index the offered tools by class name; with this tool list the only
    # resolvable name is "SendMessageToUser".
    by_name = {}
    for candidate in available_tools:
        by_name[candidate.__name__] = candidate

    requested = reply["function"]
    chosen = by_name.get(requested)
    if not chosen:
        print(f"Error: unknown tool {requested}")
        return 1

    chosen(**reply["function_parameters"]).run()
    return 0
# Operations supported by the Calculator tool.
# Each member's value is the lowercase string used for that operation in the
# JSON exchanged with the model (e.g. "multiply").
class MathOperation(Enum):
    ADD = "add"
    SUBTRACT = "subtract"
    MULTIPLY = "multiply"
    DIVIDE = "divide"
# Calculator tool for the agent: add, subtract, multiply or divide two numbers.
# The class docstring and the field descriptions end up in the system prompt,
# so their wording is part of the program's behavior.
class Calculator(BaseModel):
    """Perform a math operation on two numbers."""

    number_one: Union[int, float] = Field(..., description="First number.")
    operation: MathOperation = Field(..., description="Math operation to perform.")
    number_two: Union[int, float] = Field(..., description="Second number.")

    def run(self):
        # Apply the selected operation to the operands and hand back the result.
        lhs = self.number_one
        rhs = self.number_two
        selected = self.operation

        if selected == MathOperation.ADD:
            return lhs + rhs
        if selected == MathOperation.SUBTRACT:
            return lhs - rhs
        if selected == MathOperation.MULTIPLY:
            return lhs * rhs
        if selected == MathOperation.DIVIDE:
            return lhs / rhs
        # Reached only when `operation` is not one of the four members above.
        raise ValueError("Unknown operation.")
def example_calculator(host):
    """Ask the LLM to request a calculation and execute the chosen tool.

    The grammar and the accompanying documentation for the LLM are produced by
    generate_gbnf_grammar_and_documentation from the list of tool models:

    - pydantic_model_list: the pydantic models to expose.
    - outer_object_name: optional name of an object wrapped around the actual
      model object, e.g. a "function" object with "function_parameters" holding
      the model object. If None, no outer object is generated.
    - outer_object_content: name of the outer object's content entry.
    - model_prefix: optional prefix for models in the documentation
      (Default="Output Model").
    - fields_prefix: prefix for the model fields in the documentation
      (Default="Output Fields").

    Args:
        host: Server address forwarded to create_completion.

    Returns:
        0 on success, 1 when the reply names a tool that was not offered.
    """
    print("- example_calculator")
    available_tools = [SendMessageToUser, Calculator]
    grammar, docs = generate_gbnf_grammar_and_documentation(
        pydantic_model_list=available_tools,
        outer_object_name="function",
        outer_object_content="function_parameters",
        model_prefix="Function",
        fields_prefix="Parameters",
    )
    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + docs
    user_message1 = "What is 42 * 42?"
    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
    reply = json.loads(create_completion(host, prompt, grammar))

    # A deviation from the ideal call is only reported; the example carries on.
    ideal_call = {
        "function": "Calculator",
        "function_parameters": {
            "number_one": 42,
            "operation": "multiply",
            "number_two": 42,
        },
    }
    if reply != ideal_call:
        print(" Result is not as expected!")

    # Index the offered tools by class name; the expected match is "Calculator".
    by_name = {}
    for candidate in available_tools:
        by_name[candidate.__name__] = candidate

    requested = reply["function"]
    chosen = by_name.get(requested)
    if not chosen:
        print(f"Error: unknown tool {requested}")
        return 1

    result = chosen(**reply["function_parameters"]).run()
    print(f" Call {requested} gave result {result}")
    return 0
# Closed set of values accepted for Book.category.
class Category(Enum):
    """The category of the book."""

    Fiction = "Fiction"
    NonFiction = "Non-Fiction"  # the value is hyphenated, unlike the member name
# Output model for the structured-output example (example_struct).
# Every field is declared with `...` as its default, including the Optional
# published_year. Field order and descriptions are kept exactly as they were.
class Book(BaseModel):
    """Represents an entry about a book."""

    title: str = Field(
        ...,
        description="Title of the book.",
    )
    author: str = Field(
        ...,
        description="Author of the book.",
    )
    published_year: Optional[int] = Field(
        ...,
        description="Publishing year of the book.",
    )
    keywords: list[str] = Field(
        ...,
        description="A list of keywords.",
    )
    category: Category = Field(
        ...,
        description="Category of the book.",
    )
    summary: str = Field(
        ...,
        description="Summary of the book.",
    )
def example_struct(host):
    """An example of structured output based on pydantic models.

    The LLM will create an entry for a Book database out of an unstructured
    text. We need no additional parameters other than our list of pydantic
    models.

    Args:
        host: server address, forwarded unchanged to create_completion().

    Returns:
        0 on success, 1 when the keys of the JSON object returned by the
        model are not exactly the six Book fields.
    """
    print("- example_struct")
    tools = [Book]
    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
    system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
    # Unstructured input the model has to turn into a Book entry.
    source_text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{source_text}<|im_end|>\n<|im_start|>assistant"
    # Kept in its own variable so the input text is not overwritten by the answer.
    completion = create_completion(host, prompt, gbnf_grammar)
    json_data = json.loads(completion)
    # In this case, there's no function nor function_parameters.
    # Here the result will vary based on the LLM used.
    expected_keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
    actual_keys = sorted(json_data.keys())
    if expected_keys != actual_keys:
        print(f"Unexpected result: {actual_keys}")
        return 1
    # Constructing the model validates the field values as well.
    book = Book(**json_data)
    # Single formatting mechanism instead of an f-string without placeholders
    # combined with the % operator.
    print(f" As a Book object: {book}")
    return 0
def get_current_datetime(output_format: Optional[str] = None):
    """Get the current date and time in the given format.
    Args:
        output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
    """
    # NOTE(review): the docstring above is presumably parsed when this function
    # is turned into a model via create_dynamic_model_from_function - keep its
    # wording stable.
    # Any falsy format (None or an empty string) selects the default layout.
    if not output_format:
        output_format = "%Y-%m-%d %H:%M:%S"
    now = datetime.datetime.now()
    return now.strftime(output_format)
# Example function to get the weather.
def get_current_weather(location, unit):
    """Get the current weather in a given location

    Args:
        location: free-form location string; it is matched by substring
            against the known cities, in the order London, New York,
            North Pole.
        unit: temperature unit. May be an Enum member (its ``.value`` is
            reported), a plain string (reported as is) or None.

    Returns:
        A JSON string. For a known city it holds "location", "temperature"
        and "unit"; otherwise it echoes the location with temperature
        "unknown" and no unit.
    """
    # The tool definition only lists "location" as required, so the unit can
    # be missing; fall back to the raw value instead of failing on `.value`.
    unit_name = getattr(unit, "value", unit)
    # (substring to look for, canned temperature) - first match wins.
    canned_temperatures = (
        ("London", "42"),
        ("New York", "24"),
        ("North Pole", "-42"),
    )
    for city, temperature in canned_temperatures:
        if city in location:
            return json.dumps({"location": city, "temperature": temperature, "unit": unit_name})
    return json.dumps({"location": location, "temperature": "unknown"})
def example_concurrent(host):
    """Demonstrate several function calls requested in a single completion.

    Three kinds of tools are mixed: pydantic models, a plain Python function
    and an OpenAI-style function definition. The model is asked for a list of
    calls, and every returned call is executed in order.

    Returns 1 when the answer differs from the expected call list or names an
    unknown tool, otherwise 0.
    """
    print("- example_concurrent")
    # Function definition in OpenAI style.
    weather_properties = {
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
    }
    weather_spec = {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": weather_properties,
                "required": ["location"],
            },
        },
    }
    # OpenAI definition -> pydantic model, then attach the real implementation.
    weather_model = add_run_method_to_dynamic_model(
        convert_dictionary_to_pydantic_model(weather_spec), get_current_weather)
    # Plain Python function -> pydantic model.
    datetime_model = create_dynamic_model_from_function(get_current_datetime)
    tools = [SendMessageToUser, Calculator, datetime_model, weather_model]
    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
        pydantic_model_list=tools,
        outer_object_name="function",
        outer_object_content="params",
        model_prefix="Function",
        fields_prefix="Parameters",
        list_of_outputs=True)
    system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
    user_message = "Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"
    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
    completion = create_completion(host, prompt, gbnf_grammar)
    json_data = json.loads(completion)
    expected_calls = [
        {"function": "get_current_datetime", "params": {"output_format": "%Y-%m-%d %H:%M:%S"}},
        {"function": "get_current_weather", "params": {"location": "London", "unit": "celsius"}},
        {"function": "Calculator", "params": {"number_one": 42, "operation": "multiply", "number_two": 42}},
    ]
    status = 0
    if json_data != expected_calls:
        # A mismatch is reported but the returned calls are still executed.
        print(" Result is not as expected!")
        print(" This can happen on highly quantized models")
        status = 1
    # Look tools up by class name, which is what the model emits as "function".
    tools_by_name = {}
    for candidate in tools:
        tools_by_name[candidate.__name__] = candidate
    for call in json_data:
        requested = call["function"]
        tool = tools_by_name.get(requested)
        if not tool:
            print(f"Error: unknown tool {requested}")
            return 1
        outcome = tool(**call["params"]).run()
        print(f" Call {requested} returned {outcome}")
    # Should output something like this:
    # Call get_current_datetime returned 2024-07-15 09:50:38
    # Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
    # Call Calculator returned 1764
    return status
def main():
    """Parse the command line and run the examples one after another.

    Execution stops at the first example that returns a truthy status; that
    status is returned. If every example succeeds the last status is returned.
    """
    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
    parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
    parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
    args = parser.parse_args()
    log_level = logging.INFO if args.verbose else logging.ERROR
    logging.basicConfig(level=log_level)
    # Comment out entries below to only run the example you want.
    examples = (
        example_rce,
        example_calculator,
        example_struct,
        example_concurrent,
    )
    ret = 0
    for example in examples:
        ret = example(args.host)
        if ret:
            # A failing example prevents the remaining ones from running.
            break
    return ret
# Script entry point: main()'s return value is passed to sys.exit() and so
# becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
set(TARGET quantize-stats) set(TARGET llama-quantize-stats)
add_executable(${TARGET} quantize-stats.cpp) add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
......
...@@ -154,7 +154,7 @@ static void test_roundtrip_on_chunk( ...@@ -154,7 +154,7 @@ static void test_roundtrip_on_chunk(
} }
if (use_reference) { if (use_reference) {
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size); qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
} else { } else {
qfns.from_float(input_scratch, quantized_scratch, chunk_size); qfns.from_float(input_scratch, quantized_scratch, chunk_size);
} }
......
set(TARGET quantize) set(TARGET llama-quantize)
add_executable(${TARGET} quantize.cpp) add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
......
...@@ -4,7 +4,89 @@ You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf- ...@@ -4,7 +4,89 @@ You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-
Note: It is synced from llama.cpp `main` every 6 hours. Note: It is synced from llama.cpp `main` every 6 hours.
## Llama 2 7B Example usage:
```bash
# obtain the official LLaMA model weights and place them in ./models
ls ./models
llama-2-7b tokenizer_checklist.chk tokenizer.model
# [Optional] for models using BPE tokenizers
ls ./models
<folder containing weights and tokenizer json> vocab.json
# [Optional] for PyTorch .bin models like Mistral-7B
ls ./models
<folder containing weights and tokenizer json>
# install Python dependencies
python3 -m pip install -r requirements.txt
# convert the model to GGUF FP16 format
python3 convert_hf_to_gguf.py models/mymodel/
# quantize the model to 4-bits (using Q4_K_M method)
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
# update the gguf filetype to current version if older version is now unsupported
./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```
Run the quantized model:
```bash
# start inference on a gguf model
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
## Memory/Disk Requirements
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (Q4_0) |
|------:|--------------:|----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
## Quantization
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
- recent k-quants improvements and new i-quants
- [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
- [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
- [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
- [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
- [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
- [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
- [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4957 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
**Llama 2 7B**
| Quantization | Bits per Weight (BPW) | | Quantization | Bits per Weight (BPW) |
|--------------|-----------------------| |--------------|-----------------------|
...@@ -18,7 +100,8 @@ Note: It is synced from llama.cpp `main` every 6 hours. ...@@ -18,7 +100,8 @@ Note: It is synced from llama.cpp `main` every 6 hours.
| Q5_K_M | 5.68 | | Q5_K_M | 5.68 |
| Q6_K | 6.56 | | Q6_K | 6.56 |
## Llama 2 13B **Llama 2 13B**
Quantization | Bits per Weight (BPW) Quantization | Bits per Weight (BPW)
-- | -- -- | --
Q2_K | 3.34 Q2_K | 3.34
...@@ -31,7 +114,7 @@ Q5_K_S | 5.51 ...@@ -31,7 +114,7 @@ Q5_K_S | 5.51
Q5_K_M | 5.67 Q5_K_M | 5.67
Q6_K | 6.56 Q6_K | 6.56
# Llama 2 70B **Llama 2 70B**
Quantization | Bits per Weight (BPW) Quantization | Bits per Weight (BPW)
-- | -- -- | --
......
...@@ -16,41 +16,44 @@ struct quant_option { ...@@ -16,41 +16,44 @@ struct quant_option {
}; };
static const std::vector<struct quant_option> QUANT_OPTIONS = { static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", }, { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", }, { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", }, { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , }, { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", }, { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", }, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", }, { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
}; };
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
......
...@@ -18,9 +18,9 @@ fi ...@@ -18,9 +18,9 @@ fi
set -x set -x
SPLIT=$1/gguf-split SPLIT=$1/llama-gguf-split
QUANTIZE=$1/quantize QUANTIZE=$1/llama-quantize
MAIN=$1/main MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/quantize WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../) ROOT_DIR=$(realpath $(dirname $0)/../../)
...@@ -47,7 +47,7 @@ echo PASS ...@@ -47,7 +47,7 @@ echo PASS
echo echo
# 3a. Test the requanted model is loading properly # 3a. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
echo PASS echo PASS
echo echo
...@@ -57,7 +57,7 @@ echo PASS ...@@ -57,7 +57,7 @@ echo PASS
echo echo
# 4b. Test the requanted model is loading properly # 4b. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
echo PASS echo PASS
echo echo
......
...@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then ...@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
MODEL="-m $2 " MODEL="-m $2 "
fi fi
./main $MODEL --color \ ./llama-cli $MODEL --color \
-f ./prompts/reason-act.txt \ -f ./prompts/reason-act.txt \
-i --interactive-first \ -i --interactive-first \
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \ --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
......
import json, subprocess, sys, os

# Usage: <this script> PATTERN [options forwarded to json_schema_to_grammar.py]
#
# Wraps PATTERN in a minimal JSON schema ({"type": "string", "pattern": PATTERN}),
# feeds it on stdin to the json_schema_to_grammar.py located next to this file
# (invoked with "-" and "--raw-pattern") and prints whatever that script outputs.

# Explicit check instead of `assert`, which is removed when Python runs with -O.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} PATTERN [json_schema_to_grammar.py options...]")

[_, pattern, *rest] = sys.argv

converter = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    "json_schema_to_grammar.py")

schema = json.dumps({
    "type": "string",
    "pattern": pattern,
}, indent=2)

print(subprocess.check_output(
    [
        # Use the interpreter running this script: a bare "python" may be
        # absent from PATH or resolve to a different installation.
        sys.executable,
        converter,
        *rest,
        "-",
        "--raw-pattern",
    ],
    text=True,
    input=schema))
set(TARGET retrieval) set(TARGET llama-retrieval)
add_executable(${TARGET} retrieval.cpp) add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
......
...@@ -15,7 +15,7 @@ https://github.com/ggerganov/llama.cpp/pull/6193 ...@@ -15,7 +15,7 @@ https://github.com/ggerganov/llama.cpp/pull/6193
`retrieval` example can be tested as follows: `retrieval` example can be tested as follows:
```bash ```bash
make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator . make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
``` ```
This chunks and embeds all given files and starts a loop requesting query inputs: This chunks and embeds all given files and starts a loop requesting query inputs:
......
...@@ -4,72 +4,12 @@ ...@@ -4,72 +4,12 @@
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
struct retrieval_params { static void print_usage(int argc, char ** argv, const gpt_params & params) {
std::vector<std::string> context_files; // context files to embed gpt_params_print_usage(argc, argv, params);
int32_t chunk_size = 64; // chunk size for context embedding
std::string chunk_separator = "\n"; // chunk separator for context embedding
};
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
gpt_params_print_usage(argc, argv, gpt_params);
printf("retrieval options:\n");
printf(" --context-file FNAME file containing context to embed.\n");
printf(" specify multiple files by providing --context-file option multiple times.\n");
printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
printf(" --chunk-separator STRING\n");
printf(" string to separate chunks (default: \"\\n\")\n");
printf("\n");
}
static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) { LOG_TEE("\nexample usage:\n");
int i = 1; LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
std::string arg; LOG_TEE("\n");
while (i < argc) {
arg = argv[i];
bool invalid_gpt_param = false;
if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
if (invalid_gpt_param) {
fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
// option was parsed by gpt_params_find_arg
} else if (arg == "--context-file") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --context-file\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
// store the external file name in params
retrieval_params.context_files.push_back(argv[i]);
} else if (arg == "--chunk-size") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --chunk-size\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
retrieval_params.chunk_size = std::stoi(argv[i]);
} else if (arg == "--chunk-separator") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --chunk-separator\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
retrieval_params.chunk_separator = argv[i];
} else {
// unknown argument
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
i++;
}
} }
struct chunk { struct chunk {
...@@ -133,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz ...@@ -133,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
return chunks; return chunks;
} }
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) { static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
for (size_t i = 0; i < tokens.size(); i++) { size_t n_tokens = tokens.size();
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); for (size_t i = 0; i < n_tokens; i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
} }
} }
...@@ -171,33 +112,35 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu ...@@ -171,33 +112,35 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
retrieval_params retrieval_params;
retrieval_params_parse(argc, argv, params, retrieval_params); if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
// For BERT models, batch size must be equal to ubatch size // For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch; params.n_ubatch = params.n_batch;
params.embedding = true;
if (retrieval_params.chunk_size <= 0) { if (params.chunk_size <= 0) {
fprintf(stderr, "chunk_size must be positive\n"); fprintf(stderr, "chunk_size must be positive\n");
return 1; return 1;
} }
if (retrieval_params.context_files.empty()) { if (params.context_files.empty()) {
fprintf(stderr, "context_files must be specified\n"); fprintf(stderr, "context_files must be specified\n");
return 1; return 1;
} }
params.embedding = true;
print_build_info(); print_build_info();
printf("processing files:\n"); printf("processing files:\n");
for (auto & context_file : retrieval_params.context_files) { for (auto & context_file : params.context_files) {
printf("%s\n", context_file.c_str()); printf("%s\n", context_file.c_str());
} }
std::vector<chunk> chunks; std::vector<chunk> chunks;
for (auto & context_file : retrieval_params.context_files) { for (auto & context_file : params.context_files) {
std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator); std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
} }
printf("Number of chunks: %ld\n", chunks.size()); printf("Number of chunks: %ld\n", chunks.size());
...@@ -205,11 +148,12 @@ int main(int argc, char ** argv) { ...@@ -205,11 +148,12 @@ int main(int argc, char ** argv) {
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
// load the model // load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params); llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1; return 1;
...@@ -218,6 +162,12 @@ int main(int argc, char ** argv) { ...@@ -218,6 +162,12 @@ int main(int argc, char ** argv) {
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) { if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx); __func__, n_ctx_train, n_ctx);
...@@ -242,7 +192,7 @@ int main(int argc, char ** argv) { ...@@ -242,7 +192,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
// add eos if not present // add eos if not present
if (inp.empty() || inp.back() != llama_token_eos(model)) { if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
inp.push_back(llama_token_eos(model)); inp.push_back(llama_token_eos(model));
} }
chunk.tokens = inp; chunk.tokens = inp;
......
...@@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d ...@@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d
## Usage ## Usage
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options. On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support: For example, to build the CUDA backend with RPC support:
```bash ```bash
mkdir build-rpc-cuda mkdir build-rpc-cuda
cd build-rpc-cuda cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
cmake --build . --config Release cmake --build . --config Release
``` ```
...@@ -58,17 +58,17 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 ...@@ -58,17 +58,17 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`: On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
```bash ```bash
mkdir build-rpc mkdir build-rpc
cd build-rpc cd build-rpc
cmake .. -DLLAMA_RPC=ON cmake .. -DGGML_RPC=ON
cmake --build . --config Release cmake --build . --config Release
``` ```
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash ```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
``` ```
set(TARGET save-load-state) set(TARGET llama-save-load-state)
add_executable(${TARGET} save-load-state.cpp) add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
......
...@@ -11,6 +11,7 @@ int main(int argc, char ** argv) { ...@@ -11,6 +11,7 @@ int main(int argc, char ** argv) {
params.prompt = "The quick brown fox"; params.prompt = "The quick brown fox";
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1; return 1;
} }
...@@ -27,10 +28,11 @@ int main(int argc, char ** argv) { ...@@ -27,10 +28,11 @@ int main(int argc, char ** argv) {
std::string result2; std::string result2;
// init // init
llama_model * model; llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_context * ctx;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__); fprintf(stderr, "%s : failed to init\n", __func__);
return 1; return 1;
...@@ -46,7 +48,7 @@ int main(int argc, char ** argv) { ...@@ -46,7 +48,7 @@ int main(int argc, char ** argv) {
// save state (rng, logits, embedding and kv_cache) to file // save state (rng, logits, embedding and kv_cache) to file
{ {
std::vector<uint8_t> state_mem(llama_state_get_size(ctx)); std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
const size_t written = llama_state_get_data(ctx, state_mem.data()); const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
FILE *fp_write = fopen("dump_state.bin", "wb"); FILE *fp_write = fopen("dump_state.bin", "wb");
fwrite(state_mem.data(), 1, written, fp_write); fwrite(state_mem.data(), 1, written, fp_write);
...@@ -98,13 +100,16 @@ int main(int argc, char ** argv) { ...@@ -98,13 +100,16 @@ int main(int argc, char ** argv) {
// load state (rng, logits, embedding and kv_cache) from file // load state (rng, logits, embedding and kv_cache) from file
{ {
std::vector<uint8_t> state_mem(llama_state_get_size(ctx2)); std::vector<uint8_t> state_mem;
FILE * fp_read = fopen("dump_state.bin", "rb"); FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read); fclose(fp_read);
if (read != llama_state_set_data(ctx2, state_mem.data())) { if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__); fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx2); llama_free(ctx2);
llama_free_model(model); llama_free_model(model);
...@@ -158,13 +163,16 @@ int main(int argc, char ** argv) { ...@@ -158,13 +163,16 @@ int main(int argc, char ** argv) {
// load state (rng, logits, embedding and kv_cache) from file // load state (rng, logits, embedding and kv_cache) from file
{ {
std::vector<uint8_t> state_mem(llama_state_get_size(ctx3)); std::vector<uint8_t> state_mem;
FILE * fp_read = fopen("dump_state.bin", "rb"); FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read); fclose(fp_read);
if (read != llama_state_set_data(ctx3, state_mem.data())) { if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__); fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx3); llama_free(ctx3);
llama_free_model(model); llama_free_model(model);
...@@ -181,7 +189,7 @@ int main(int argc, char ** argv) { ...@@ -181,7 +189,7 @@ int main(int argc, char ** argv) {
{ {
// save kv of seq 0 // save kv of seq 0
std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0)); std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0); const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
if (ncopy != seq_store.size()) { if (ncopy != seq_store.size()) {
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
llama_free(ctx3); llama_free(ctx3);
...@@ -195,7 +203,7 @@ int main(int argc, char ** argv) { ...@@ -195,7 +203,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s : kv cache cleared\n", __func__); fprintf(stderr, "%s : kv cache cleared\n", __func__);
// restore kv into seq 1 // restore kv into seq 1
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1); const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
if (nset != seq_store.size()) { if (nset != seq_store.size()) {
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
llama_free(ctx3); llama_free(ctx3);
......
...@@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}" ...@@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./server $GEN_OPTIONS \ ./llama-server $GEN_OPTIONS \
--model "$MODEL" \ --model "$MODEL" \
--threads "$N_THREAD" \ --threads "$N_THREAD" \
--rope-freq-scale 1.0 \ --rope-freq-scale 1.0 \
......
set(TARGET server) set(TARGET llama-server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
if (MINGW)
# fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
set(TARGET_SRCS set(TARGET_SRCS
server.cpp server.cpp
utils.hpp utils.hpp
httplib.h httplib.h
) )
set(PUBLIC_ASSETS set(PUBLIC_ASSETS
colorthemes.css
style.css
theme-beeninorder.css
theme-ketivah.css
theme-mangotango.css
theme-playground.css
theme-polarnight.css
theme-snowstorm.css
index.html index.html
index-new.html
index.js index.js
completion.js completion.js
system-prompts.js
prompt-formats.js
json-schema-to-grammar.mjs json-schema-to-grammar.mjs
) )
foreach(asset ${PUBLIC_ASSETS}) foreach(asset ${PUBLIC_ASSETS})
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}") set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
...@@ -23,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS}) ...@@ -23,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS})
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
) )
endforeach() endforeach()
add_executable(${TARGET} ${TARGET_SRCS}) add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
) )
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL) if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED) find_package(OpenSSL REQUIRED)
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT) target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
endif() endif()
if (WIN32) if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif() endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
...@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/ ...@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
Set of LLM REST APIs and a simple web front end to interact with llama.cpp. Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
**Features:** **Features:**
* LLM inference of F16 and quantum models on GPU and CPU * LLM inference of F16 and quantized models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
* Parallel decoding with multi-user support * Parallel decoding with multi-user support
* Continuous batching * Continuous batching
...@@ -15,91 +15,261 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. ...@@ -15,91 +15,261 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
**Command line options:** ## Usage
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response. ```
- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores). usage: ./llama-server [options]
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` general:
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused -h, --help, --usage print usage and exit
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused --version show version and build info
- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused -v, --verbose print verbose information
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. --verbosity N set specific verbosity level (default: 0)
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of `4096`. --verbose-prompt print a verbose prompt before generation (default: false)
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. --no-display-prompt don't print prompt at generation (default: false)
- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used. -co, --color colorise output to distinguish prompt and user input from generations (default: false)
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. -s, --seed SEED RNG seed (default: -1, use random seed for < 0)
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048` -t, --threads N number of threads to use during generation (default: 8)
- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512` -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. -td, --threads-draft N number of threads to use during generation (default: same as --threads)
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft)
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems --draft N number of tokens to draft for speculative decoding (default: 5)
- `--numa distribute`: Spread execution evenly over all nodes -ps, --p-split N speculative decoding split probability (default: 0.1)
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on -lcs, --lookup-cache-static FNAME
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437 path to static lookup cache to use for lookup decoding (not updated by generation)
- `--numa`: Attempt optimizations that may help on some NUMA systems. -lcd, --lookup-cache-dynamic FNAME
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. path to dynamic lookup cache to use for lookup decoding (updated by generation)
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600` -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1` -b, --batch-size N logical maximum batch size (default: 2048)
- `--port`: Set the port to listen. Default: `8080` -ub, --ubatch-size N physical maximum batch size (default: 512)
- `--path`: Path from which to serve static files. Default: disabled --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all)
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. --chunks N max number of chunks to process (default: -1, -1 = all)
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. -fa, --flash-attn enable Flash Attention (default: disabled)
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled -p, --prompt PROMPT prompt to start generation with
- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error. in conversation mode, this will be used as system prompt
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled (default: '')
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) -f, --file FNAME a file containing the prompt (default: none)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. --in-file FNAME an input file (repeat to specify multiple files)
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled. -bf, --binary-file FNAME binary file containing the prompt (default: none)
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512` -e, --escape process escape sequences (\n, \r, \t, \', \", \\) (default: true)
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1` --no-escape do not process escape sequences
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. -ptc, --print-token-count N print token count every N tokens (default: -1)
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled --prompt-cache FNAME file to cache prompt state for faster startup (default: none)
- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled. --prompt-cache-all if specified, saves user input and generations to cache as well
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) not supported with --interactive or other interactive options
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled --prompt-cache-ro if specified, uses the prompt cache but does not update it
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json` -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn` can be specified more than once for multiple prompts
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model) -sp, --special special tokens output enabled (default: false)
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25) -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation) if suffix/prefix are not specified, default chat template will be used
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0) (default: false)
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0) -i, --interactive run in interactive mode (default: false)
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0) -if, --interactive-first run in interactive mode and wait for input right away (default: false)
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls` -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\'
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled) --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string
- `-fa`, `--flash-attn` : enable flash attention (default: disabled). --in-prefix STRING string to prefix user inputs with (default: empty)
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) --in-suffix STRING string to suffix after user inputs with (default: empty)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
**If compiled with `LLAMA_SERVER_SSL=ON`** sampling:
- `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key
- `--ssl-cert-file FNAME`: path to a file containing a PEM-encoded SSL certificate --samplers SAMPLERS samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
--sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt)
--ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
--penalize-nl penalize newline tokens (default: false)
--temp N temperature (default: 0.8)
--top-k N top-k sampling (default: 40, 0 = disabled)
--top-p N top-p sampling (default: 0.9, 1.0 = disabled)
--min-p N min-p sampling (default: 0.1, 0.0 = disabled)
--tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
--typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
--repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
--presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled)
--dynatemp-exp N dynamic temperature exponent (default: 1.0)
--mirostat N use Mirostat sampling.
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1)
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0)
-l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
--cfg-negative-prompt PROMPT
negative prompt to use for guidance (default: '')
--cfg-negative-prompt-file FNAME
negative prompt file to use for guidance
--cfg-scale N strength of guidance (default: 1.0, 1.0 = disable)
--chat-template JINJA_TEMPLATE
set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
grammar:
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
--grammar-file FNAME file to read grammar from
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
embedding:
--pooling {none,mean,cls,last}
pooling type for embeddings, use model default if unspecified
--attention {causal,non-causal}
attention type for embeddings, use model default if unspecified
context hacking:
--rope-scaling {none,linear,yarn}
RoPE frequency scaling method, defaults to linear unless specified by the model
--rope-scale N RoPE context scaling factor, expands context by a factor of N
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
--yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0)
--yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0)
-gan, --grp-attn-n N group-attention factor (default: 1)
-gaw, --grp-attn-w N group-attention width (default: 512.0)
-dkvc, --dump-kv-cache verbose print of the KV cache
-nkvo, --no-kv-offload disable KV offload
-ctk, --cache-type-k TYPE KV cache data type for K (default: f16)
-ctv, --cache-type-v TYPE KV cache data type for V (default: f16)
perplexity:
--all-logits return logits for all tokens in the batch (default: false)
--hellaswag compute HellaSwag score over random tasks from datafile supplied with -f
--hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400)
--winogrande compute Winogrande score over random tasks from datafile supplied with -f
--winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0)
--multiple-choice compute multiple choice score over random tasks from datafile supplied with -f
--multiple-choice-tasks N
number of tasks to use when computing the multiple choice score (default: 0)
--kl-divergence computes KL-divergence to logits provided via --kl-divergence-base
--ppl-stride N stride for perplexity calculation (default: 0)
--ppl-output-type {0,1} output type for perplexity calculation (default: 0)
parallel:
-dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
-np, --parallel N number of parallel sequences to decode (default: 1)
-ns, --sequences N number of sequences to decode (default: 1)
-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)
multi-modality:
--mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md
--image FILE path to an image file. use with multimodal models. Specify multiple times for batching
backend:
--rpc SERVERS comma separated list of RPC servers
--mlock force system to keep model in RAM rather than swapping or compressing
--no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)
--numa TYPE attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
model:
--check-tensors check model tensor data for invalid values (default: false)
--override-kv KEY=TYPE:VALUE
advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
--lora FNAME apply LoRA adapter (implies --no-mmap)
--lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)
--lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter
--control-vector FNAME add a control vector
note: this argument can be repeated to add multiple control vectors
--control-vector-scaled FNAME SCALE
add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors
--control-vector-layer-range START END
layer range to apply the control vector(s) to, start and end inclusive
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
-mu, --model-url MODEL_URL model download url (default: unused)
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
-hff, --hf-file FILE Hugging Face model file (default: unused)
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable)
server:
--host HOST ip address to listen (default: 127.0.0.1)
--port PORT port to listen (default: 8080)
--path PATH path to serve static files from (default: )
--embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
--api-key KEY API key to use for authentication (default: none)
--api-key-file FNAME path to file containing API keys (default: none)
--ssl-key-file FNAME path to a file containing a PEM-encoded SSL private key
--ssl-cert-file FNAME path to a file containing a PEM-encoded SSL certificate
--timeout N server read/write timeout in seconds (default: 600)
--threads-http N number of threads used to process HTTP requests (default: -1)
--system-prompt-file FNAME
set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
--log-format {text,json}
log output format: json or text (default: json)
--metrics enable prometheus compatible metrics endpoint (default: disabled)
--no-slots disables slots monitoring endpoint (default: enabled)
--slot-save-path PATH path to save slot kv cache (default: disabled)
--chat-template JINJA_TEMPLATE
set custom jinja chat template (default: template taken from model's metadata)
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
-sps, --slot-prompt-similarity SIMILARITY
how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
--lora-init-without-apply
load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
logging:
--simple-io use basic IO for better compatibility in subprocesses and limited consoles
-ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset)
--log-test Run simple logging test
--log-disable Disable trace logs
--log-enable Enable trace logs
--log-file FNAME Specify a log filename (without extension)
--log-new Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
--log-append Don't truncate the old log file.
```
## Build ## Build
`server` is built alongside everything else from the root of the project `llama-server` is built alongside everything else from the root of the project
- Using `make`: - Using `make`:
```bash ```bash
make server make llama-server
``` ```
- Using `CMake`: - Using `CMake`:
```bash ```bash
cmake -B build cmake -B build
cmake --build build --config Release -t server cmake --build build --config Release -t llama-server
``` ```
Binary is at `./build/bin/server` Binary is at `./build/bin/llama-server`
## Build with SSL ## Build with SSL
`server` can also be built with SSL support using OpenSSL 3 `llama-server` can also be built with SSL support using OpenSSL 3
- Using `make`: - Using `make`:
...@@ -107,14 +277,14 @@ The project is under active development, and we are [looking for feedback and co ...@@ -107,14 +277,14 @@ The project is under active development, and we are [looking for feedback and co
# NOTE: For non-system openssl, use the following: # NOTE: For non-system openssl, use the following:
# CXXFLAGS="-I /path/to/openssl/include" # CXXFLAGS="-I /path/to/openssl/include"
# LDFLAGS="-L /path/to/openssl/lib" # LDFLAGS="-L /path/to/openssl/lib"
make LLAMA_SERVER_SSL=true server make LLAMA_SERVER_SSL=true llama-server
``` ```
- Using `CMake`: - Using `CMake`:
```bash ```bash
cmake -B build -DLLAMA_SERVER_SSL=ON cmake -B build -DLLAMA_SERVER_SSL=ON
cmake --build build --config Release -t server cmake --build build --config Release -t llama-server
``` ```
## Quick Start ## Quick Start
...@@ -124,13 +294,13 @@ To get started right away, run the following command, making sure to use the cor ...@@ -124,13 +294,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.) ### Unix-based systems (Linux, macOS, etc.)
```bash ```bash
./server -m models/7B/ggml-model.gguf -c 2048 ./llama-server -m models/7B/ggml-model.gguf -c 2048
``` ```
### Windows ### Windows
```powershell ```powershell
server.exe -m models\7B\ggml-model.gguf -c 2048 llama-server.exe -m models\7B\ggml-model.gguf -c 2048
``` ```
The above command will start a server that by default listens on `127.0.0.1:8080`. The above command will start a server that by default listens on `127.0.0.1:8080`.
...@@ -198,7 +368,8 @@ node index.js ...@@ -198,7 +368,8 @@ node index.js
## API Endpoints ## API Endpoints
- **GET** `/health`: Returns the current state of the server: ### GET `/health`: Returns the current state of the server
- 503 -> `{"status": "loading model"}` if the model is still being loaded. - 503 -> `{"status": "loading model"}` if the model is still being loaded.
- 500 -> `{"status": "error"}` if the model failed to load. - 500 -> `{"status": "error"}` if the model failed to load.
- 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
...@@ -207,7 +378,7 @@ node index.js ...@@ -207,7 +378,7 @@ node index.js
If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. ### POST `/completion`: Given a `prompt`, it returns the predicted completion.
*Options:* *Options:*
...@@ -231,7 +402,7 @@ node index.js ...@@ -231,7 +402,7 @@ node index.js
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
...@@ -279,13 +450,13 @@ node index.js ...@@ -279,13 +450,13 @@ node index.js
`id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1` `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1`
`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false` `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
### Result JSON **Response format**
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. - Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
...@@ -324,7 +495,7 @@ Notice that each `probs` is an array of length `n_probs`. ...@@ -324,7 +495,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`) - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)
- **POST** `/tokenize`: Tokenize a given text. ### POST `/tokenize`: Tokenize a given text
*Options:* *Options:*
...@@ -332,13 +503,15 @@ Notice that each `probs` is an array of length `n_probs`. ...@@ -332,13 +503,15 @@ Notice that each `probs` is an array of length `n_probs`.
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
- **POST** `/detokenize`: Convert tokens to text. ### POST `/detokenize`: Convert tokens to text
*Options:* *Options:*
`tokens`: Set the tokens to detokenize. `tokens`: Set the tokens to detokenize.
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. ### POST `/embedding`: Generate embedding of a given text
This endpoint works the same way as [the embedding example](../embedding).
*Options:* *Options:*
...@@ -346,7 +519,9 @@ Notice that each `probs` is an array of length `n_probs`. ...@@ -346,7 +519,9 @@ Notice that each `probs` is an array of length `n_probs`.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. ### POST `/infill`: For code infilling.
Takes a prefix and a suffix and returns the predicted completion as stream.
*Options:* *Options:*
...@@ -358,14 +533,15 @@ Notice that each `probs` is an array of length `n_probs`. ...@@ -358,14 +533,15 @@ Notice that each `probs` is an array of length `n_probs`.
- **GET** `/props`: Return current server settings. - **GET** `/props`: Return current server settings.
### Result JSON **Response format**
```json ```json
{ {
"assistant_name": "", "assistant_name": "",
"user_name": "", "user_name": "",
"default_generation_settings": { ... }, "default_generation_settings": { ... },
"total_slots": 1 "total_slots": 1,
"chat_template": ""
} }
``` ```
...@@ -373,8 +549,11 @@ Notice that each `probs` is an array of length `n_probs`. ...@@ -373,8 +549,11 @@ Notice that each `probs` is an array of length `n_probs`.
- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots available to process requests (defined by the `--parallel` option) - `total_slots` - the total number of slots available to process requests (defined by the `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used. Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:* *Options:*
...@@ -426,7 +605,7 @@ Notice that each `probs` is an array of length `n_probs`. ...@@ -426,7 +605,7 @@ Notice that each `probs` is an array of length `n_probs`.
}' }'
``` ```
- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API. ### POST `/v1/embeddings`: OpenAI-compatible embeddings API
*Options:* *Options:*
...@@ -460,9 +639,9 @@ Notice that each `probs` is an array of length `n_probs`. ...@@ -460,9 +639,9 @@ Notice that each `probs` is an array of length `n_probs`.
}' }'
``` ```
- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. ### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
### Result JSON **Response format**
```json ```json
[ [
...@@ -523,7 +702,7 @@ Notice that each `probs` is an array of length `n_probs`. ...@@ -523,7 +702,7 @@ Notice that each `probs` is an array of length `n_probs`.
] ]
``` ```
- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled: ### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled:
Available metrics: Available metrics:
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
...@@ -535,13 +714,13 @@ Available metrics: ...@@ -535,13 +714,13 @@ Available metrics:
- `llamacpp:requests_processing`: Number of requests processing. - `llamacpp:requests_processing`: Number of requests processing.
- `llamacpp:requests_deferred`: Number of requests deferred. - `llamacpp:requests_deferred`: Number of requests deferred.
- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
*Options:* *Options:*
`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter. `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
### Result JSON **Response format**
```json ```json
{ {
...@@ -555,13 +734,13 @@ Available metrics: ...@@ -555,13 +734,13 @@ Available metrics:
} }
``` ```
- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. ### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
*Options:* *Options:*
`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter. `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
### Result JSON **Response format**
```json ```json
{ {
...@@ -575,9 +754,9 @@ Available metrics: ...@@ -575,9 +754,9 @@ Available metrics:
} }
``` ```
- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. ### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
### Result JSON **Response format**
```json ```json
{ {
...@@ -586,6 +765,42 @@ Available metrics: ...@@ -586,6 +765,42 @@ Available metrics:
} }
``` ```
### GET `/lora-adapters`: Get list of all LoRA adapters
If an adapter is disabled, the scale will be set to 0.
**Response format**
```json
[
{
"id": 0,
"path": "my_adapter_1.gguf",
"scale": 0.0
},
{
"id": 1,
"path": "my_adapter_2.gguf",
"scale": 0.0
}
]
```
### POST `/lora-adapters`: Set list of LoRA adapters
To disable an adapter, either remove it from the list below, or set scale to 0.
**Request format**
To find the `id` of an adapter, use GET `/lora-adapters`.
```json
[
{"id": 0, "scale": 0.2},
{"id": 1, "scale": 0.8}
]
```
## More examples ## More examples
### Change system prompt on runtime ### Change system prompt on runtime
...@@ -629,11 +844,11 @@ bash chat.sh ...@@ -629,11 +844,11 @@ bash chat.sh
### OAI-like API ### OAI-like API
The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
### API errors ### API errors
`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi `llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
Example of an error: Example of an error:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment