"ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp" did not exist on "4cc1a6143387f41e2466536abcd6a2620b63a35b"
Commit 920a4b07 authored by Daniel Hiltgen's avatar Daniel Hiltgen
Browse files

Merge remote-tracking branch 'upstream/main' into pr3702

parents c496967e ee49844d
...@@ -132,7 +132,7 @@ func (m *MistralModel) LoadVocab() error { ...@@ -132,7 +132,7 @@ func (m *MistralModel) LoadVocab() error {
return nil return nil
} }
func (m *MistralModel) WriteGGUF() (string, error) { func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
kv := llm.KV{ kv := llm.KV{
"general.architecture": "llama", "general.architecture": "llama",
"general.name": m.Name, "general.name": m.Name,
...@@ -158,16 +158,5 @@ func (m *MistralModel) WriteGGUF() (string, error) { ...@@ -158,16 +158,5 @@ func (m *MistralModel) WriteGGUF() (string, error) {
"tokenizer.ggml.unknown_token_id": uint32(0), "tokenizer.ggml.unknown_token_id": uint32(0),
} }
f, err := os.CreateTemp("", "ollama-gguf") return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
if err != nil {
return "", err
}
defer f.Close()
mod := llm.NewGGUFV3(m.Params.ByteOrder)
if err := mod.Encode(f, kv, m.Tensors); err != nil {
return "", err
}
return f.Name(), nil
} }
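The signature change above moves temp-file handling out of `WriteGGUF`: the caller now supplies any `io.WriteSeeker` and the method only encodes. A minimal sketch of what a call site might look like under the new API (the `writeTempGGUF` helper is illustrative, not part of this commit):

```go
package convert

import "os"

// writeTempGGUF is a hypothetical helper showing the new calling convention:
// the caller creates the destination (here a temp file, as the old code did
// internally) and passes it to WriteGGUF, which only encodes the GGUF data.
func writeTempGGUF(m *MistralModel) (string, error) {
	f, err := os.CreateTemp("", "ollama-gguf")
	if err != nil {
		return "", err
	}
	defer f.Close()

	// *os.File satisfies io.WriteSeeker, so it can be passed directly.
	if err := m.WriteGGUF(f); err != nil {
		return "", err
	}
	return f.Name(), nil
}
```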
package convert
import (
"io"
"regexp"
"github.com/ollama/ollama/llm"
)
type MixtralModel struct {
ModelData
}
func (m *MixtralModel) GetTensors() error {
t, err := m.Format.GetTensors(m.Path, m.Params)
if err != nil {
return err
}
m.Tensors = []llm.Tensor{}
pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
re, err := regexp.Compile(pattern)
if err != nil {
return err
}
for _, l := range t {
matches := re.FindAllStringSubmatch(l.Name, -1)
if len(matches) > 0 {
wt := l.WriterTo.(safetensorWriterTo)
wt.handler = mistralLayerHandler
l.WriterTo = wt
}
m.Tensors = append(m.Tensors, l)
}
return nil
}
func (m *MixtralModel) LoadVocab() error {
v, err := LoadSentencePieceTokens(m.Path, m.Params)
if err != nil {
return err
}
m.Vocab = v
return nil
}
func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
kv := llm.KV{
"general.architecture": "llama",
"general.name": m.Name,
"llama.block_count": uint32(m.Params.HiddenLayers),
"llama.context_length": uint32(m.Params.ContextSize),
"llama.embedding_length": uint32(m.Params.HiddenSize),
"llama.feed_forward_length": uint32(m.Params.IntermediateSize),
"llama.attention.head_count": uint32(m.Params.AttentionHeads),
"llama.attention.head_count_kv": uint32(m.Params.KeyValHeads),
"llama.rope.freq_base": float32(m.Params.RopeFrequencyBase),
"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
"llama.expert_count": uint32(m.Params.Experts),
"llama.expert_used_count": uint32(m.Params.ExpertsUsed),
"llama.vocab_size": uint32(len(m.Vocab.Tokens)),
"llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
"general.file_type": uint32(1),
"tokenizer.ggml.model": "llama",
"tokenizer.ggml.tokens": m.Vocab.Tokens,
"tokenizer.ggml.scores": m.Vocab.Scores,
"tokenizer.ggml.token_type": m.Vocab.Types,
"tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID),
"tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID),
"tokenizer.ggml.unknown_token_id": uint32(0),
"tokenizer.ggml.add_bos_token": true,
"tokenizer.ggml.add_eos_token": false,
}
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
}
...@@ -53,7 +53,7 @@ func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Ten ...@@ -53,7 +53,7 @@ func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Ten
var err error var err error
t, offset, err = m.readTensors(f, offset, params) t, offset, err = m.readTensors(f, offset, params)
if err != nil { if err != nil {
slog.Error("%v", err) slog.Error(err.Error())
return nil, err return nil, err
} }
tensors = append(tensors, t...) tensors = append(tensors, t...)
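The logging change in this hunk reflects that `log/slog` is structured rather than printf-style: `slog.Error("%v", err)` would log the literal string `%v` as the message and treat `err` as a malformed attribute. A standalone sketch of the two idioms (not code from this commit):

```go
package main

import (
	"errors"
	"log/slog"
)

func main() {
	err := errors.New("failed to read tensors")

	// What the diff switches to: the error text becomes the log message.
	slog.Error(err.Error())

	// An equivalent structured alternative (an assumption, not what the
	// commit uses): a fixed message plus the error as a key/value attribute.
	slog.Error("reading tensors failed", "error", err)
}
```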
...@@ -93,7 +93,6 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ...@@ -93,7 +93,6 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
} }
slices.Sort(keys) slices.Sort(keys)
slog.Info("converting layers") slog.Info("converting layers")
var tensors []llm.Tensor var tensors []llm.Tensor
...@@ -105,7 +104,6 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ...@@ -105,7 +104,6 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
return nil, 0, err return nil, 0, err
} }
slog.Debug(fmt.Sprintf("metadata = %#v", data))
var size uint64 var size uint64
var kind uint32 var kind uint32
switch len(data.Shape) { switch len(data.Shape) {
...@@ -124,7 +122,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ...@@ -124,7 +122,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
ggufName, err := m.GetLayerName(k) ggufName, err := m.GetLayerName(k)
if err != nil { if err != nil {
slog.Error("%v", err) slog.Error(err.Error())
return nil, 0, err return nil, 0, err
} }
...@@ -150,11 +148,13 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ...@@ -150,11 +148,13 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
padding: 8 + jsonSize, padding: 8 + jsonSize,
} }
tensors = append(tensors, t)
offset += size offset += size
tensors = append(tensors, t)
} }
slog.Debug(fmt.Sprintf("total tensors for file = %d", len(tensors))) slog.Debug(fmt.Sprintf("total tensors for file = %d", len(tensors)))
slog.Debug(fmt.Sprintf("offset = %d", offset)) slog.Debug(fmt.Sprintf("offset = %d", offset))
return tensors, offset, nil return tensors, offset, nil
} }
...@@ -194,6 +194,10 @@ func (m *SafetensorFormat) GetLayerName(n string) (string, error) { ...@@ -194,6 +194,10 @@ func (m *SafetensorFormat) GetLayerName(n string) (string, error) {
"model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight", "model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight",
"model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight", "model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight",
"model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight", "model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight",
"model.layers.(\\d+).block_sparse_moe.gate.weight": "blk.$1.ffn_gate_inp.weight",
"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w1.weight": "blk.$1.ffn_gate.$2.weight",
"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w2.weight": "blk.$1.ffn_down.$2.weight",
"model.layers.(\\d+).block_sparse_moe.experts.(\\d+).w3.weight": "blk.$1.ffn_up.$2.weight",
} }
v, ok := directMap[n] v, ok := directMap[n]
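The newly added `block_sparse_moe` entries rename safetensors tensors to their GGUF equivalents via capture groups, with `$1` the layer index and `$2` the expert index. A standalone sketch of how one such pattern rewrites a name, assuming the map is applied with `regexp.ReplaceAllString` (the application code is not shown in this hunk):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// One of the newly added MoE patterns and its replacement template.
	re := regexp.MustCompile(`model.layers.(\d+).block_sparse_moe.experts.(\d+).w1.weight`)

	name := "model.layers.0.block_sparse_moe.experts.3.w1.weight"
	fmt.Println(re.ReplaceAllString(name, "blk.$1.ffn_gate.$2.weight"))
	// Output: blk.0.ffn_gate.3.weight
}
```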
...@@ -286,6 +290,15 @@ func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (M ...@@ -286,6 +290,15 @@ func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (M
Format: m, Format: m,
}, },
}, nil }, nil
case "MixtralForCausalLM":
return &MixtralModel{
ModelData{
Name: name,
Path: dirPath,
Params: params,
Format: m,
},
}, nil
case "GemmaForCausalLM": case "GemmaForCausalLM":
return &GemmaModel{ return &GemmaModel{
ModelData{ ModelData{
......
...@@ -74,7 +74,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, ...@@ -74,7 +74,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,
ggufName, err := tf.GetLayerName(k.(string)) ggufName, err := tf.GetLayerName(k.(string))
if err != nil { if err != nil {
slog.Error("%v", err) slog.Error(err.Error())
return nil, err return nil, err
} }
slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName)) slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
### Model names ### Model names
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version. Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
### Durations ### Durations
...@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur ...@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
```shell ```shell
curl http://localhost:11434/api/generate -d '{ curl http://localhost:11434/api/generate -d '{
"model": "llama2", "model": "llama3",
"prompt": "Why is the sky blue?" "prompt": "Why is the sky blue?"
}' }'
``` ```
...@@ -77,7 +77,7 @@ A stream of JSON objects is returned: ...@@ -77,7 +77,7 @@ A stream of JSON objects is returned:
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-08-04T08:52:19.385406455-07:00", "created_at": "2023-08-04T08:52:19.385406455-07:00",
"response": "The", "response": "The",
"done": false "done": false
...@@ -90,16 +90,16 @@ The final response in the stream also includes additional data about the generat ...@@ -90,16 +90,16 @@ The final response in the stream also includes additional data about the generat
- `load_duration`: time spent in nanoseconds loading the model - `load_duration`: time spent in nanoseconds loading the model
- `prompt_eval_count`: number of tokens in the prompt - `prompt_eval_count`: number of tokens in the prompt
- `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt - `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
- `eval_count`: number of tokens the response - `eval_count`: number of tokens in the response
- `eval_duration`: time in nanoseconds spent generating the response - `eval_duration`: time in nanoseconds spent generating the response
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
- `response`: empty if the response was streamed, if not streamed, this will contain the full response - `response`: empty if the response was streamed, if not streamed, this will contain the full response
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`. To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` * `10^9`.
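A quick sketch of that calculation in Go, using illustrative values of the kind returned in the final streamed object:

```go
package main

import "fmt"

func main() {
	// Example values (illustrative, not taken from this page):
	// eval_count tokens generated over eval_duration nanoseconds.
	evalCount := 290
	evalDuration := 4709213000

	tokensPerSecond := float64(evalCount) / float64(evalDuration) * 1e9
	fmt.Printf("%.1f tokens/s\n", tokensPerSecond) // ~61.6 tokens/s
}
```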
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z", "created_at": "2023-08-04T19:22:45.499127Z",
"response": "", "response": "",
"done": true, "done": true,
...@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off. ...@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off.
```shell ```shell
curl http://localhost:11434/api/generate -d '{ curl http://localhost:11434/api/generate -d '{
"model": "llama2", "model": "llama3",
"prompt": "Why is the sky blue?", "prompt": "Why is the sky blue?",
"stream": false "stream": false
}' }'
...@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object: ...@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z", "created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.", "response": "The sky is blue because it is the color of the sky.",
"done": true, "done": true,
...@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object: ...@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```shell ```shell
curl http://localhost:11434/api/generate -d '{ curl http://localhost:11434/api/generate -d '{
"model": "llama2", "model": "llama3",
"prompt": "What color is the sky at different times of the day? Respond using JSON", "prompt": "What color is the sky at different times of the day? Respond using JSON",
"format": "json", "format": "json",
"stream": false "stream": false
...@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{ ...@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-11-09T21:07:55.186497Z", "created_at": "2023-11-09T21:07:55.186497Z",
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n", "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
"done": true, "done": true,
...@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo ...@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
```shell ```shell
curl http://localhost:11434/api/generate -d '{ curl http://localhost:11434/api/generate -d '{
"model": "llama2", "model": "llama3",
"prompt": "Why is the sky blue?", "prompt": "Why is the sky blue?",
"stream": false, "stream": false,
"options": { "options": {
...@@ -332,7 +332,7 @@ curl http://localhost:11434/api/generate -d '{ ...@@ -332,7 +332,7 @@ curl http://localhost:11434/api/generate -d '{
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z", "created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.", "response": "The sky is blue because it is the color of the sky.",
"done": true, "done": true,
...@@ -354,7 +354,7 @@ If an empty prompt is provided, the model will be loaded into memory. ...@@ -354,7 +354,7 @@ If an empty prompt is provided, the model will be loaded into memory.
```shell ```shell
curl http://localhost:11434/api/generate -d '{ curl http://localhost:11434/api/generate -d '{
"model": "llama2" "model": "llama3"
}' }'
``` ```
...@@ -364,7 +364,7 @@ A single JSON object is returned: ...@@ -364,7 +364,7 @@ A single JSON object is returned:
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-12-18T19:52:07.071755Z", "created_at": "2023-12-18T19:52:07.071755Z",
"response": "", "response": "",
"done": true "done": true
...@@ -407,7 +407,7 @@ Send a chat message with a streaming response. ...@@ -407,7 +407,7 @@ Send a chat message with a streaming response.
```shell ```shell
curl http://localhost:11434/api/chat -d '{ curl http://localhost:11434/api/chat -d '{
"model": "llama2", "model": "llama3",
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
...@@ -423,7 +423,7 @@ A stream of JSON objects is returned: ...@@ -423,7 +423,7 @@ A stream of JSON objects is returned:
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-08-04T08:52:19.385406455-07:00", "created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": { "message": {
"role": "assistant", "role": "assistant",
...@@ -438,7 +438,7 @@ Final response: ...@@ -438,7 +438,7 @@ Final response:
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z", "created_at": "2023-08-04T19:22:45.499127Z",
"done": true, "done": true,
"total_duration": 4883583458, "total_duration": 4883583458,
...@@ -456,7 +456,7 @@ Final response: ...@@ -456,7 +456,7 @@ Final response:
```shell ```shell
curl http://localhost:11434/api/chat -d '{ curl http://localhost:11434/api/chat -d '{
"model": "llama2", "model": "llama3",
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
...@@ -471,7 +471,7 @@ curl http://localhost:11434/api/chat -d '{ ...@@ -471,7 +471,7 @@ curl http://localhost:11434/api/chat -d '{
```json ```json
{ {
"model": "registry.ollama.ai/library/llama2:latest", "model": "registry.ollama.ai/library/llama3:latest",
"created_at": "2023-12-12T14:13:43.416799Z", "created_at": "2023-12-12T14:13:43.416799Z",
"message": { "message": {
"role": "assistant", "role": "assistant",
...@@ -495,7 +495,7 @@ Send a chat message with a conversation history. You can use this same approach ...@@ -495,7 +495,7 @@ Send a chat message with a conversation history. You can use this same approach
```shell ```shell
curl http://localhost:11434/api/chat -d '{ curl http://localhost:11434/api/chat -d '{
"model": "llama2", "model": "llama3",
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
...@@ -519,7 +519,7 @@ A stream of JSON objects is returned: ...@@ -519,7 +519,7 @@ A stream of JSON objects is returned:
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-08-04T08:52:19.385406455-07:00", "created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": { "message": {
"role": "assistant", "role": "assistant",
...@@ -533,7 +533,7 @@ Final response: ...@@ -533,7 +533,7 @@ Final response:
```json ```json
{ {
"model": "llama2", "model": "llama3",
"created_at": "2023-08-04T19:22:45.499127Z", "created_at": "2023-08-04T19:22:45.499127Z",
"done": true, "done": true,
"total_duration": 8113331500, "total_duration": 8113331500,
...@@ -591,7 +591,7 @@ curl http://localhost:11434/api/chat -d '{ ...@@ -591,7 +591,7 @@ curl http://localhost:11434/api/chat -d '{
```shell ```shell
curl http://localhost:11434/api/chat -d '{ curl http://localhost:11434/api/chat -d '{
"model": "llama2", "model": "llama3",
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
...@@ -609,7 +609,7 @@ curl http://localhost:11434/api/chat -d '{ ...@@ -609,7 +609,7 @@ curl http://localhost:11434/api/chat -d '{
```json ```json
{ {
"model": "registry.ollama.ai/library/llama2:latest", "model": "registry.ollama.ai/library/llama3:latest",
"created_at": "2023-12-12T14:13:43.416799Z", "created_at": "2023-12-12T14:13:43.416799Z",
"message": { "message": {
"role": "assistant", "role": "assistant",
...@@ -651,7 +651,7 @@ Create a new model from a `Modelfile`. ...@@ -651,7 +651,7 @@ Create a new model from a `Modelfile`.
```shell ```shell
curl http://localhost:11434/api/create -d '{ curl http://localhost:11434/api/create -d '{
"name": "mario", "name": "mario",
"modelfile": "FROM llama2\nSYSTEM You are mario from Super Mario Bros." "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
}' }'
``` ```
...@@ -758,7 +758,7 @@ A single JSON object will be returned. ...@@ -758,7 +758,7 @@ A single JSON object will be returned.
} }
}, },
{ {
"name": "llama2:latest", "name": "llama3:latest",
"modified_at": "2023-12-07T09:32:18.757212583-08:00", "modified_at": "2023-12-07T09:32:18.757212583-08:00",
"size": 3825819519, "size": 3825819519,
"digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e", "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
...@@ -792,7 +792,7 @@ Show information about a model including details, modelfile, template, parameter ...@@ -792,7 +792,7 @@ Show information about a model including details, modelfile, template, parameter
```shell ```shell
curl http://localhost:11434/api/show -d '{ curl http://localhost:11434/api/show -d '{
"name": "llama2" "name": "llama3"
}' }'
``` ```
...@@ -827,8 +827,8 @@ Copy a model. Creates a model with another name from an existing model. ...@@ -827,8 +827,8 @@ Copy a model. Creates a model with another name from an existing model.
```shell ```shell
curl http://localhost:11434/api/copy -d '{ curl http://localhost:11434/api/copy -d '{
"source": "llama2", "source": "llama3",
"destination": "llama2-backup" "destination": "llama3-backup"
}' }'
``` ```
...@@ -854,7 +854,7 @@ Delete a model and its data. ...@@ -854,7 +854,7 @@ Delete a model and its data.
```shell ```shell
curl -X DELETE http://localhost:11434/api/delete -d '{ curl -X DELETE http://localhost:11434/api/delete -d '{
"name": "llama2:13b" "name": "llama3:13b"
}' }'
``` ```
...@@ -882,7 +882,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where ...@@ -882,7 +882,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
```shell ```shell
curl http://localhost:11434/api/pull -d '{ curl http://localhost:11434/api/pull -d '{
"name": "llama2" "name": "llama3"
}' }'
``` ```
......
...@@ -51,7 +51,7 @@ Typically the build scripts will auto-detect CUDA, however, if your Linux distro ...@@ -51,7 +51,7 @@ Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70") a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
Then generate dependencies: Then generate dependencies:
......
...@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter: ...@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
``` ```
curl http://localhost:11434/api/generate -d '{ curl http://localhost:11434/api/generate -d '{
"model": "llama2", "model": "llama3",
"prompt": "Why is the sky blue?", "prompt": "Why is the sky blue?",
"options": { "options": {
"num_ctx": 4096 "num_ctx": 4096
...@@ -140,7 +140,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e ...@@ -140,7 +140,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
- macOS: `~/.ollama/models` - macOS: `~/.ollama/models`
- Linux: `/usr/share/ollama/.ollama/models` - Linux: `/usr/share/ollama/.ollama/models`
- Windows: `C:\Users\<username>\.ollama\models` - Windows: `C:\Users\%username%\.ollama\models`
### How do I set them to a different location? ### How do I set them to a different location?
...@@ -221,10 +221,20 @@ The `keep_alive` parameter can be set to: ...@@ -221,10 +221,20 @@ The `keep_alive` parameter can be set to:
For example, to preload a model and leave it in memory use: For example, to preload a model and leave it in memory use:
```shell ```shell
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}' curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
``` ```
To unload the model and free up memory use: To unload the model and free up memory use:
```shell ```shell
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}' curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
``` ```
Alternatively, you can change how long all models stay loaded in memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable accepts the same value types as the `keep_alive` parameter mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
## How do I manage the maximum number of requests the server can queue?
If too many requests are sent to the server, it will respond with a 503 error
indicating the server is overloaded. You can adjust how many requests may be
queued by setting `OLLAMA_MAX_QUEUE`.
\ No newline at end of file
...@@ -125,7 +125,7 @@ Publishing models is in early alpha. If you'd like to publish your model to shar ...@@ -125,7 +125,7 @@ Publishing models is in early alpha. If you'd like to publish your model to shar
1. Create [an account](https://ollama.com/signup) 1. Create [an account](https://ollama.com/signup)
2. Copy your Ollama public key: 2. Copy your Ollama public key:
- macOS: `cat ~/.ollama/id_ed25519.pub` - macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy`
- Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub` - Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
- Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub` - Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
3. Add your public key to your [Ollama account](https://ollama.com/settings/keys) 3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)
...@@ -136,6 +136,8 @@ Next, copy your model to your username's namespace: ...@@ -136,6 +136,8 @@ Next, copy your model to your username's namespace:
ollama cp example <your username>/example ollama cp example <your username>/example
``` ```
> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`.
Then push the model: Then push the model:
``` ```
......
...@@ -105,7 +105,7 @@ sudo chmod +x /usr/bin/ollama ...@@ -105,7 +105,7 @@ sudo chmod +x /usr/bin/ollama
To view logs of Ollama running as a startup service, run: To view logs of Ollama running as a startup service, run:
```bash ```bash
journalctl -u ollama journalctl -e -u ollama
``` ```
## Uninstall ## Uninstall
......
...@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama. ...@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama.
- [Examples](#examples) - [Examples](#examples)
- [Instructions](#instructions) - [Instructions](#instructions)
- [FROM (Required)](#from-required) - [FROM (Required)](#from-required)
- [Build from llama2](#build-from-llama2) - [Build from llama3](#build-from-llama3)
- [Build from a bin file](#build-from-a-bin-file) - [Build from a bin file](#build-from-a-bin-file)
- [PARAMETER](#parameter) - [PARAMETER](#parameter)
- [Valid Parameters and Values](#valid-parameters-and-values) - [Valid Parameters and Values](#valid-parameters-and-values)
...@@ -48,7 +48,7 @@ INSTRUCTION arguments ...@@ -48,7 +48,7 @@ INSTRUCTION arguments
An example of a `Modelfile` creating a mario blueprint: An example of a `Modelfile` creating a mario blueprint:
```modelfile ```modelfile
FROM llama2 FROM llama3
# sets the temperature to 1 [higher is more creative, lower is more coherent] # sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1 PARAMETER temperature 1
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
...@@ -67,33 +67,25 @@ To use this: ...@@ -67,33 +67,25 @@ To use this:
More examples are available in the [examples directory](../examples). More examples are available in the [examples directory](../examples).
### `Modelfile`s in [ollama.com/library][1] To view the Modelfile of a given model, use the `ollama show --modelfile` command.
There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
- Option 1: view a details page from a model's tags page:
1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
3. Scroll down to "Layers"
- Note: if the [`FROM` instruction](#from-required) is not present,
it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
```bash ```bash
> ollama show --modelfile llama2:13b > ollama show --modelfile llama3
# Modelfile generated by "ollama show" # Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with: # To build a new Modelfile based on this one, replace the FROM line with:
# FROM llama2:13b # FROM llama3:latest
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
FROM /root/.ollama/models/blobs/sha256:123abc {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>
{{ end }}{{ .Prompt }} [/INST] """ {{ .Response }}<|eot_id|>"""
SYSTEM """""" PARAMETER stop "<|start_header_id|>"
PARAMETER stop [INST] PARAMETER stop "<|end_header_id|>"
PARAMETER stop [/INST] PARAMETER stop "<|eot_id|>"
PARAMETER stop <<SYS>> PARAMETER stop "<|reserved_special_token"
PARAMETER stop <</SYS>>
``` ```
## Instructions ## Instructions
...@@ -106,10 +98,10 @@ The `FROM` instruction defines the base model to use when creating a model. ...@@ -106,10 +98,10 @@ The `FROM` instruction defines the base model to use when creating a model.
FROM <model name>:<tag> FROM <model name>:<tag>
``` ```
#### Build from llama2 #### Build from llama3
```modelfile ```modelfile
FROM llama2 FROM llama3
``` ```
A list of available base models: A list of available base models:
......
...@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create( ...@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
'content': 'Say this is a test', 'content': 'Say this is a test',
} }
], ],
model='llama2', model='llama3',
) )
``` ```
...@@ -43,7 +43,7 @@ const openai = new OpenAI({ ...@@ -43,7 +43,7 @@ const openai = new OpenAI({
const chatCompletion = await openai.chat.completions.create({ const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }], messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama2', model: 'llama3',
}) })
``` ```
...@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({ ...@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({
curl http://localhost:11434/v1/chat/completions \ curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "llama2", "model": "llama3",
"messages": [ "messages": [
{ {
"role": "system", "role": "system",
...@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \ ...@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \
Before using a model, pull it locally `ollama pull`: Before using a model, pull it locally `ollama pull`:
```shell ```shell
ollama pull llama2 ollama pull llama3
``` ```
### Default model names ### Default model names
...@@ -121,7 +121,7 @@ ollama pull llama2 ...@@ -121,7 +121,7 @@ ollama pull llama2
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name: For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
``` ```
ollama cp llama2 gpt-3.5-turbo ollama cp llama3 gpt-3.5-turbo
``` ```
Afterwards, this new model name can be specified the `model` field: Afterwards, this new model name can be specified the `model` field:
......
...@@ -15,7 +15,7 @@ import { Ollama } from "langchain/llms/ollama"; ...@@ -15,7 +15,7 @@ import { Ollama } from "langchain/llms/ollama";
const ollama = new Ollama({ const ollama = new Ollama({
baseUrl: "http://localhost:11434", baseUrl: "http://localhost:11434",
model: "llama2", model: "llama3",
}); });
const answer = await ollama.invoke(`why is the sky blue?`); const answer = await ollama.invoke(`why is the sky blue?`);
...@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`); ...@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
console.log(answer); console.log(answer);
``` ```
That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app. That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
```bash ```bash
npm install cheerio npm install cheerio
......
...@@ -12,15 +12,17 @@ So let's figure out how we can use **LangChain** with Ollama to ask our question ...@@ -12,15 +12,17 @@ So let's figure out how we can use **LangChain** with Ollama to ask our question
Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package: Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
`pip install langchain` `pip install langchain_community`
Then we can create a model and ask the question: Then we can create a model and ask the question:
```python ```python
from langchain.llms import Ollama from langchain_community.llms import Ollama
ollama = Ollama(base_url='http://localhost:11434', ollama = Ollama(
model="llama2") base_url='http://localhost:11434',
print(ollama("why is the sky blue")) model="llama3"
)
print(ollama.invoke("why is the sky blue"))
``` ```
Notice that we are defining the model and the base URL for Ollama. Notice that we are defining the model and the base URL for Ollama.
......
# Running Ollama on NVIDIA Jetson Devices # Running Ollama on NVIDIA Jetson Devices
With some minor configuration, Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/). The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack). Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions.
NVIDIA Jetson devices are Linux-based embedded AI computers that are purpose-built for AI applications. The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0.
Jetsons have an integrated GPU that is wired directly to the memory controller of the machine. For this reason, the `nvidia-smi` command is unrecognized, and Ollama proceeds to operate in "CPU only"
mode. This can be verified by using a monitoring tool like jtop.
In order to address this, we simply pass the path to the Jetson's pre-installed CUDA libraries into `ollama serve` (while in a tmux session). We then hardcode the num_gpu parameters into a cloned
version of our target model.
Prerequisites:
- curl
- tmux
Here are the steps:
- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh` - Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
- Stop the Ollama service: `sudo systemctl stop ollama`
- Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson
'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`
- Pull the model you want to use (e.g. mistral): `ollama pull mistral` - Pull the model you want to use (e.g. mistral): `ollama pull mistral`
- Create a new Modelfile specifically for enabling GPU support on the Jetson: `touch ModelfileMistralJetson` - Start an interactive session: `ollama run mistral`
- In the ModelfileMistralJetson file, specify the FROM model and the num_gpu PARAMETER as shown below:
```
FROM mistral
PARAMETER num_gpu 999
```
- Create a new model from your Modelfile: `ollama create mistral-jetson -f ./ModelfileMistralJetson` And that's it!
- Run the new model: `ollama run mistral-jetson`
If you run a monitoring tool like jtop you should now see that Ollama is using the Jetson's integrated GPU. # Running Ollama in Docker
And that's it! When running GPU accelerated applications in Docker, it is highly recommended to use [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers).
\ No newline at end of file
...@@ -14,7 +14,7 @@ As this is a preview release, you should expect a few bugs here and there. If ...@@ -14,7 +14,7 @@ As this is a preview release, you should expect a few bugs here and there. If
you run into a problem you can reach out on you run into a problem you can reach out on
[Discord](https://discord.gg/ollama), or file an [Discord](https://discord.gg/ollama), or file an
[issue](https://github.com/ollama/ollama/issues). [issue](https://github.com/ollama/ollama/issues).
Logs will often be helpful in dianosing the problem (see Logs will often be helpful in diagnosing the problem (see
[Troubleshooting](#troubleshooting) below) [Troubleshooting](#troubleshooting) below)
## System Requirements ## System Requirements
...@@ -27,7 +27,7 @@ Logs will often be helpful in dianosing the problem (see ...@@ -27,7 +27,7 @@ Logs will often be helpful in dianosing the problem (see
Here's a quick example showing API access from `powershell` Here's a quick example showing API access from `powershell`
```powershell ```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json (Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
``` ```
## Troubleshooting ## Troubleshooting
...@@ -45,3 +45,17 @@ the explorer window by hitting `<cmd>+R` and type in: ...@@ -45,3 +45,17 @@ the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH) - `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration - `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories - `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
## Standalone CLI
The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
installer. It installs in your account without requiring Administrator rights.
We update Ollama regularly to support the latest models, and this installer will
help you keep up to date.
If you'd like to install or integrate Ollama as a service, a standalone
`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
and GPU library dependencies for Nvidia and AMD. This allows for embedding
Ollama in existing applications, or running it as a system service via `ollama
serve` with tools such as [NSSM](https://nssm.cc/).
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other: When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
`ollama run llama2 < sourcequestions.txt` `ollama run llama3 < sourcequestions.txt`
This concept is used in the following example. This concept is used in the following example.
......
# Deploy Ollama to Fly.io
> Note: this example exposes a public endpoint and does not configure authentication. Use with care.
## Prerequisites
- Ollama: https://ollama.com/download
- Fly.io account. Sign up for a free account: https://fly.io/app/sign-up
## Steps
1. Login to Fly.io
```bash
fly auth login
```
1. Create a new Fly app
```bash
fly launch --name <name> --image ollama/ollama --internal-port 11434 --vm-size shared-cpu-8x --now
```
1. Pull and run `orca-mini:3b`
```bash
OLLAMA_HOST=https://<name>.fly.dev ollama run orca-mini:3b
```
`shared-cpu-8x` is a free-tier eligible machine type. For better performance, switch to a `performance` or `dedicated` machine type or attach a GPU for hardware acceleration (see below).
## (Optional) Persistent Volume
By default, Fly Machines use ephemeral storage, which is problematic if you want to use the same model across restarts without pulling it again. Create and attach a persistent volume to store the downloaded models:
1. Create the Fly Volume
```bash
fly volume create ollama
```
1. Update `fly.toml` and add `[mounts]`
```toml
[mounts]
source = "ollama"
destination = "/mnt/ollama/models"
```
1. Update `fly.toml` and add `[env]`
```toml
[env]
OLLAMA_MODELS = "/mnt/ollama/models"
```
1. Deploy your app
```bash
fly deploy
```
## (Optional) Hardware Acceleration
Fly.io GPUs are currently waitlist-only. Sign up for the waitlist: https://fly.io/gpu
Once you've been accepted, create the app with the additional flags `--vm-gpu-kind a100-pcie-40gb` or `--vm-gpu-kind a100-pcie-80gb`.
...@@ -35,7 +35,7 @@ func main() { ...@@ -35,7 +35,7 @@ func main() {
ctx := context.Background() ctx := context.Background()
req := &api.ChatRequest{ req := &api.ChatRequest{
Model: "llama2", Model: "llama3",
Messages: messages, Messages: messages,
} }
......