Commit b2b270ad authored by Devon Rifkin

Merge branch 'main' into drifkin/array-head-count-simple

parents 20c5fd39 2bb69b40
@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 		}
 
 		cache.SetLayer(0)
-		tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
+		tensor := context.FromFloatSlice(test.in, test.inShape...)
 		cache.Put(context, tensor, tensor)
 
 		out, _, mask := cache.Get(context)
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
 	}
 
 	cache.SetLayer(0)
-	tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
+	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
 	cache.Put(context, tensor, tensor)
 
 	// with window size 4, nothing has slid out of the window yet
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
 	}
 
 	cache.SetLayer(0)
-	tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
+	tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
 	cache.Put(context, tensor, tensor)
 
 	// only the latest position has overlapping windows
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	return c.Empty(dtype, shape...)
 }
 
-func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
+func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
 	t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
 	copy(t.data, s)
-	return t, nil
+	return t
 }
 
-func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
+func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
 	f := make([]float32, len(s))
 	for i := range f {
 		f[i] = float32(s[i])
 	}
-	out, _ := c.FromFloatSlice(f, shape...)
+	out := c.FromFloatSlice(f, shape...)
 	out.(*testTensor).dtype = ml.DTypeI32
-	return out, nil
+	return out
 }
 
 func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
 		s = append(s, i)
 	}
 
-	out, _ := c.FromFloatSlice(s, len(s))
+	out := c.FromFloatSlice(s, len(s))
 	out.(*testTensor).dtype = dtype
 	return out
 }
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
 
 func (c *testContext) Compute(...ml.Tensor) {}
 
-func (c *testContext) Reserve() error { return nil }
+func (c *testContext) Reserve() {}
 
 func (c *testContext) MaxGraphNodes() int {
 	return 10
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
+char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
@@ -10,11 +10,11 @@ include common/stb_image.*
 include include/
 include include/llama.*
 include include/llama-*.*
-include examples/
-include examples/llava/
-include examples/llava/clip.*
-include examples/llava/clip-impl.*
-include examples/llava/llava.*
+include tools/
+include tools/mtmd/
+include tools/mtmd/clip.*
+include tools/mtmd/clip-impl.*
+include tools/mtmd/llava.*
 include src/
 include src/llama.*
 include src/llama-*.*
@@ -1096,7 +1096,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                     params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
@@ -1114,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn  = params.flash_attn;
     cparams.no_perf     = params.no_perf;
+    cparams.op_offload  = !params.no_op_offload;
 
     if (params.reranking) {
         cparams.embeddings = true;
@@ -1565,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result  = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data   = (llama_token *) ggml_opt_dataset_data  (result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
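Editor's note: the new common_opt_dataset_init helper above packs a flat token stream into next-token-prediction pairs. Datapoint i copies n_ctx tokens starting at offset i*stride into the inputs, and the same window shifted by one token into the labels. Below is a minimal standalone sketch of that windowing on a plain std::vector instead of ggml_opt_dataset, so the layout can be compiled and inspected in isolation; the token values and the n_ctx/stride constants are made up for illustration and are not part of the commit.

```cpp
// Standalone sketch (not the commit's code): reproduce the stride-based
// windowing of common_opt_dataset_init on plain std::vector.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n_ctx  = 4; // tokens per datapoint (the real helper uses llama_n_ctx(ctx))
    const int64_t stride = 2; // offset between consecutive datapoints
    const std::vector<int32_t> tokens = {10, 11, 12, 13, 14, 15, 16, 17, 18};

    // same bound as the commit: each datapoint needs n_ctx input tokens plus one
    // extra token, because the labels are the inputs shifted by one position
    const int64_t ndata = ((int64_t) tokens.size() - n_ctx - 1) / stride;

    for (int64_t i = 0; i < ndata; ++i) {
        printf("datapoint %lld\n  data  :", (long long) i);
        for (int64_t j = 0; j < n_ctx; ++j) printf(" %d", (int) tokens[i*stride + j]);
        printf("\n  labels:");
        for (int64_t j = 0; j < n_ctx; ++j) printf(" %d", (int) tokens[i*stride + j + 1]);
        printf("\n");
    }
    return 0;
}
```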
@@ -66,7 +66,6 @@ enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -96,6 +95,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -161,6 +161,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
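Editor's note: the two hunks above register COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11 and slot it into the default sampler chain between DRY and top-k. As a rough standalone illustration of the top-n-sigma idea (not the llama.cpp sampler implementation): candidates whose logit falls more than n standard deviations below the maximum logit are masked out before softmax. The function name and the example logits below are illustrative only.

```cpp
// Rough sketch of top-n-sigma filtering on raw logits; assumes the common
// definition: keep tokens with logit >= max_logit - n * stddev(logits).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<float> top_n_sigma_filter(std::vector<float> logits, float n) {
    const float max_logit = *std::max_element(logits.begin(), logits.end());

    // mean and standard deviation of the logits
    float mean = 0.0f;
    for (float l : logits) mean += l;
    mean /= (float) logits.size();

    float var = 0.0f;
    for (float l : logits) var += (l - mean) * (l - mean);
    const float sigma = std::sqrt(var / (float) logits.size());

    // mask everything below the threshold; a later softmax gives these zero probability
    const float threshold = max_logit - n * sigma;
    for (float & l : logits) {
        if (l < threshold) l = -INFINITY;
    }
    return logits;
}

int main() {
    const auto filtered = top_n_sigma_filter({2.0f, 1.5f, 0.1f, -3.0f, -8.0f}, /*n=*/1.0f);
    for (size_t i = 0; i < filtered.size(); ++i) {
        printf("token %zu: logit %f\n", i, filtered[i]);
    }
    return 0;
}
```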
@@ -323,7 +324,6 @@ struct common_params {
     bool ctx_shift         = true;  // context shift on inifinite text generation
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all        = false; // return logits for all tokens in the batch
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
     bool verbose_prompt    = false; // print prompt tokens before generation
@@ -332,6 +332,7 @@ struct common_params {
     bool no_kv_offload     = false; // disable KV offloading
     bool warmup            = true;  // warmup run
     bool check_tensors     = false; // validate tensor data
+    bool no_op_offload     = false; // globally disable offload host tensor operations to device
 
     bool single_turn       = false; // single turn chat conversation
@@ -340,7 +341,7 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
-    // multimodal models (see examples/llava)
+    // multimodal models (see tools/mtmd)
     struct common_params_model mmproj;
     bool mmproj_use_gpu = true;  // use GPU for multimodal model
     bool no_mmproj      = false; // explicitly disable multimodal model
@@ -409,13 +410,14 @@ struct common_params {
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl    = true;  // whether to compute perplexity
+    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
@@ -664,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 }
+
+//
+// training utils
+//
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     std::vector<ggml_backend_buffer_type_t> buft_extra;
     {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (!cpu_dev) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
 
         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
             LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
 
             auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!cpu_dev) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
             buft = ggml_backend_dev_buffer_type(cpu_dev);
 
             break;
@@ -11,7 +11,6 @@
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
-    LLM_ARCH_MLLAMA,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
@@ -149,7 +148,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
-    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -351,14 +349,6 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_BSKCN_TV,
-    LLM_TENSOR_CROSS_ATTN_K_NORM,
-    LLM_TENSOR_CROSS_ATTN_K_PROJ,
-    LLM_TENSOR_CROSS_ATTN_O_PROJ,
-    LLM_TENSOR_CROSS_ATTN_Q_NORM,
-    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
-    LLM_TENSOR_CROSS_ATTN_V_PROJ,
-    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
-    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
     LLM_TENSOR_CONV1D,
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);
 
-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };
 
 // temporary allocate memory for the input batch if needed