Merge pull request #17 from jmorganca/ctransformers

use ctransformers as backup to llama.cpp

Merge pull request #17 from jmorganca/ctransformers
use ctransformers as backup to llama.cpp
e69a08f1 · Michael Yang · GitHub · 9dfef44a · 07d8d561 · e69a08f1
Unverified commit e69a08f1, authored Jun 30, 2023 by Michael Yang; committed by GitHub on Jun 30, 2023
12 changed files
--- a/ollama/cmd/cli.py
+++ b/ollama/cmd/cli.py
@@ -31,7 +31,7 @@ def main():
    )
    # create models home if it doesn't exist
-    os.makedirs(model.models_home, exist_ok=True)
+    os.makedirs(model.MODELS_CACHE_PATH, exist_ok=True)
    subparsers = parser.add_subparsers(
        title='commands',
@@ -111,7 +111,7 @@ def generate_oneshot(*args, **kwargs):
    spinner.start()
    spinner_running = True
    try:
-        for output in engine.generate(*args, **kwargs):
+        for output in engine.generate(model_name=kwargs.pop('model'), *args, **kwargs):
            choices = output.get("choices", [])
            if len(choices) > 0:
                if spinner_running:
@@ -147,7 +147,7 @@ def generate_batch(*args, **kwargs):
 def pull(*args, **kwargs):
-    model.pull(*args, **kwargs)
+    model.pull(model_name=kwargs.pop('model'), *args, **kwargs)
 def run(*args, **kwargs):

--- a/ollama/cmd/server.py
+++ b/ollama/cmd/server.py
@@ -38,7 +38,7 @@ def serve(*args, **kwargs):
    app.update(
        {
-            "llms": {},
+            "models": {},
        }
    )
@@ -47,32 +47,32 @@ def serve(*args, **kwargs):
 async def load(request):
    body = await request.json()
-    model = body.get("model")
+    name = body.get("model")
-    if not model:
+    if not name:
        raise web.HTTPBadRequest()
    kwargs = {
-        "llms": request.app.get("llms"),
+        "models": request.app.get("models"),
    }
-    engine.load(model, **kwargs)
+    engine.load(name, **kwargs)
    return web.Response()
 async def unload(request):
    body = await request.json()
-    model = body.get("model")
+    name = body.get("model")
-    if not model:
+    if not name:
        raise web.HTTPBadRequest()
-    engine.unload(model, llms=request.app.get("llms"))
+    engine.unload(name, models=request.app.get("models"))
    return web.Response()
 async def generate(request):
    body = await request.json()
-    model = body.get("model")
+    name = body.get("model")
-    if not model:
+    if not name:
        raise web.HTTPBadRequest()
    prompt = body.get("prompt")
@@ -83,10 +83,10 @@ async def generate(request):
    await response.prepare(request)
    kwargs = {
-        "llms": request.app.get("llms"),
+        "models": request.app.get("models"),
    }
-    for output in engine.generate(model, prompt, **kwargs):
+    for output in engine.generate(name, prompt, **kwargs):
        output = json.dumps(output).encode('utf-8')
        await response.write(output)
        await response.write(b"\n")

--- a/ollama/engine.py
+++ b/ollama/engine.py
@@ -2,63 +2,120 @@ import os
 import sys
 from os import path
 from contextlib import contextmanager
-from llama_cpp import Llama as LLM
+from fuzzywuzzy import process
+from llama_cpp import Llama
+from ctransformers import AutoModelForCausalLM
-import ollama.model
 import ollama.prompt
+from ollama.model import MODELS_CACHE_PATH
 @contextmanager
-def suppress_stderr():
+def suppress(file):
-    stderr = os.dup(sys.stderr.fileno())
+    original = os.dup(file.fileno())
    with open(os.devnull, "w") as devnull:
-        os.dup2(devnull.fileno(), sys.stderr.fileno())
+        os.dup2(devnull.fileno(), file.fileno())
        yield
-    os.dup2(stderr, sys.stderr.fileno())
+    os.dup2(original, file.fileno())
-def generate(model, prompt, llms={}, *args, **kwargs):
+def generate(model_name, prompt, models={}, *args, **kwargs):
-    llm = load(model, llms=llms)
+    model = load(model_name, models=models)
+    inputs = ollama.prompt.template(model_name, prompt)
+    return model.generate(inputs, *args, **kwargs)
-    prompt = ollama.prompt.template(model, prompt)
-    if "max_tokens" not in kwargs:
-        kwargs.update({"max_tokens": 16384})
-    if "stop" not in kwargs:
+def load(model_name, models={}):
-        kwargs.update({"stop": ["Q:"]})
+    if not models.get(model_name, None):
+        model_path = path.expanduser(model_name)
+        if not path.exists(model_path):
+            model_path = path.join(MODELS_CACHE_PATH, model_name + ".bin")
-    if "stream" not in kwargs:
+        runners = {
-        kwargs.update({"stream": True})
+            model_type: cls
+            for cls in [LlamaCppRunner, CtransformerRunner]
+            for model_type in cls.model_types()
+        }
-    for output in llm(prompt, *args, **kwargs):
+        best_match, _ = process.extractOne(model_path, runners.keys())
-        yield output
+        model = runners.get(best_match, LlamaCppRunner)
+        models.update({model_name: model(model_path, best_match)})
-def load(model, llms={}):
+    return models.get(model_name)
-    llm = llms.get(model, None)
-    if not llm:
-        stored_model_path = path.join(ollama.model.models_home, model) + ".bin"
-        if path.exists(stored_model_path):
-            model_path = stored_model_path
-        else:
-            # try loading this as a path to a model, rather than a model name
-            model_path = path.abspath(model)
-        if not path.exists(model_path):
-            raise Exception(f"Model not found: {model}")
+def unload(model_name, models={}):
+    if model_name in models:
+        models.pop(model_name)
+class LlamaCppRunner:
+    def __init__(self, model_path, model_type):
        try:
-            # suppress LLM's output
+            with suppress(sys.stderr), suppress(sys.stdout):
-            with suppress_stderr():
+                self.model = Llama(model_path,
-                llm = LLM(model_path, verbose=False)
+                                   verbose=False,
-                llms.update({model: llm})
+                                   n_gpu_layers=1,
-        except Exception as e:
+                                   seed=-1)
-            # e is sent to devnull, so create a generic exception
+        except Exception:
-            raise Exception(f"Failed to load model: {model}")
+            raise Exception("Failed to load model", model_path, model_type)
-    return llm
+    @staticmethod
+    def model_types():
-def unload(model, llms={}):
+        return [
-    if model in llms:
+            'llama',
-        llms.pop(model)
+            'orca',
+            'vicuna',
+            'ultralm',
+        ]
+    def generate(self, prompt, *args, **kwargs):
+        if "max_tokens" not in kwargs:
+            kwargs.update({"max_tokens": 512})
+        if "stop" not in kwargs:
+            kwargs.update({"stop": ["Q:"]})
+        if "stream" not in kwargs:
+            kwargs.update({"stream": True})
+        with suppress(sys.stderr):
+            for output in self.model(prompt, *args, **kwargs):
+                yield output
+class CtransformerRunner:
+    def __init__(self, model_path, model_type):
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, model_type=model_type, local_files_only=True)
+    @staticmethod
+    def model_types():
+        return [
+            'falcon',
+            'mpt',
+            'starcoder',
+        ]
+    def generate(self, prompt, *args, **kwargs):
+        if "max_new_tokens" not in kwargs:
+            kwargs.update({"max_new_tokens": 512})
+        if "stop" not in kwargs:
+            kwargs.update({"stop": ["User"]})
+        if "stream" not in kwargs:
+            kwargs.update({"stream": True})
+        for output in self.model(prompt, *args, **kwargs):
+            yield {
+                'choices': [
+                    {
+                        'text': output,
+                    },
+                ],
+            }
--- a/ollama/model.py
+++ b/ollama/model.py
@@ -6,12 +6,12 @@ from urllib.parse import urlsplit, urlunsplit
 from tqdm import tqdm
-models_endpoint_url = 'https://ollama.ai/api/models'
+MODELS_MANIFEST = 'https://ollama.ai/api/models'
-models_home = path.join(Path.home(), '.ollama', 'models')
+MODELS_CACHE_PATH = path.join(Path.home(), '.ollama', 'models')
 def models(*args, **kwargs):
-    for _, _, files in walk(models_home):
+    for _, _, files in walk(MODELS_CACHE_PATH):
        for file in files:
            base, ext = path.splitext(file)
            if ext == '.bin':
@@ -20,7 +20,7 @@ def models(*args, **kwargs):
 # get the url of the model from our curated directory
 def get_url_from_directory(model):
-    response = requests.get(models_endpoint_url)
+    response = requests.get(MODELS_MANIFEST)
    response.raise_for_status()
    directory = response.json()
    for model_info in directory:
@@ -78,7 +78,7 @@ def find_bin_file(json_response, location, branch):
 def download_file(download_url, file_name, file_size):
-    local_filename = path.join(models_home, file_name) + '.bin'
+    local_filename = path.join(MODELS_CACHE_PATH, file_name) + '.bin'
    first_byte = path.getsize(local_filename) if path.exists(local_filename) else 0
@@ -125,7 +125,7 @@ def pull(model, *args, **kwargs):
        url = f'https://{url}'
    if not validators.url(url):
-        if model in models(models_home):
+        if model in models(MODELS_CACHE_PATH):
            # the model is already downloaded, and specified by name
            return model
        raise Exception(f'Unknown model {model}')

--- a/ollama/prompt.py
+++ b/ollama/prompt.py
-import os
+from os import path
 from difflib import SequenceMatcher
 from jinja2 import Environment, PackageLoader
-def template(model, prompt):
+def template(name, prompt):
    best_ratio = 0
    best_template = ''
    environment = Environment(loader=PackageLoader(__name__, 'templates'))
    for template in environment.list_templates():
-        base, _ = os.path.splitext(template)
+        base, _ = path.splitext(template)
-        ratio = SequenceMatcher(None, os.path.basename(model.lower()), base).ratio()
+        ratio = SequenceMatcher(None, path.basename(name).lower(), base).ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            best_template = template

--- a/ollama/templates/falcon.prompt
+++ b/ollama/templates/falcon.prompt
+A helpful assistant who helps the user with any questions asked.
+User: {{ prompt }}
+Assistant:
--- a/ollama/templates/mpt.prompt
+++ b/ollama/templates/mpt.prompt
+Below is an instruction that describes a task. Write a response that appropriately completes the request. Be concise. Once the request is completed, include no other text.
+### Instruction:
+{{ prompt }}
+### Response:
--- a/ollama/templates/ultralm.prompt
+++ b/ollama/templates/ultralm.prompt
+USER: {{ prompt }}
+ASSISTANT:
--- a/ollama/templates/wizardcoder.prompt
+++ b/ollama/templates/wizardcoder.prompt
+Below is an instruction that describes a task. Write a response that appropriately completes the request
+### Instruction: {{ prompt }}
+### Response:
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,6 @@ scripts = {ollama = "ollama.cmd.cli:main"}
 [tool.poetry.dependencies]
 python = "^3.8"
-llama-cpp-python = "^0.1.66"
 aiohttp = "^3.8.4"
 aiohttp-cors = "^0.7.0"
 jinja2 = "^3.1.2"
@@ -17,6 +16,9 @@ requests = "^2.31.0"
 tqdm = "^4.65.0"
 validators = "^0.20.0"
 yaspin = "^2.3.0"
+llama-cpp-python = "^0.1.67"
+ctransformers = "^0.2.10"
+fuzzywuzzy = {extras = ["speedup"], version = "^0.18.0"}
 [build-system]
 requires = ["poetry-core"]

--- a/requirements.txt
+++ b/requirements.txt