Commit 4d0eb763 authored by Michael Yang

use ctransformers as backup to llama-cpp

parent 5cea13ce
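The backend choice this commit introduces is driven by fuzzy-matching the model path against the model types each runner supports, with ctransformers acting as the backup for families such as falcon, mpt, and starcoder. A minimal standalone sketch of that selection idea (the function name and example paths are illustrative, not from the repo):

from fuzzywuzzy import process

# Model families handled by each backend, mirroring model_types() in the diff below.
LLAMA_CPP_TYPES = ["llama", "orca", "vicuna", "ultralm"]
CTRANSFORMERS_TYPES = ["falcon", "mpt", "starcoder"]

def pick_backend(model_path):
    # extractOne returns the closest match and a similarity score.
    best_match, _score = process.extractOne(model_path, LLAMA_CPP_TYPES + CTRANSFORMERS_TYPES)
    return "ctransformers" if best_match in CTRANSFORMERS_TYPES else "llama-cpp"

print(pick_backend("falcon-7b.bin"))   # expected: ctransformers
print(pick_backend("vicuna-13b.bin"))  # expected: llama-cpp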
@@ -2,58 +2,120 @@ import os
 import sys
 from os import path
 from contextlib import contextmanager
-from llama_cpp import Llama as LLM
+from fuzzywuzzy import process
+from llama_cpp import Llama
+from ctransformers import AutoModelForCausalLM

 import ollama.prompt
 from ollama.model import models_home


 @contextmanager
-def suppress_stderr():
-    stderr = os.dup(sys.stderr.fileno())
+def suppress(file):
+    original = os.dup(file.fileno())
     with open(os.devnull, "w") as devnull:
-        os.dup2(devnull.fileno(), sys.stderr.fileno())
+        os.dup2(devnull.fileno(), file.fileno())
         yield
-        os.dup2(stderr, sys.stderr.fileno())
+        os.dup2(original, file.fileno())


 def generate(model_name, prompt, models={}, *args, **kwargs):
-    if "max_tokens" not in kwargs:
-        kwargs.update({"max_tokens": 16384})
-
-    if "stop" not in kwargs:
-        kwargs.update({"stop": ["Q:"]})
-
-    if "stream" not in kwargs:
-        kwargs.update({"stream": True})
-
-    prompt = ollama.prompt.template(model_name, prompt)
-
     model = load(model_name, models=models)
-    for output in model.create_completion(prompt, *args, **kwargs):
-        yield output
+    inputs = ollama.prompt.template(model_name, prompt)
+    return model.generate(inputs, *args, **kwargs)


 def load(model_name, models={}):
-    model = models.get(model_name, None)
-    if not model:
+    if not models.get(model_name, None):
         model_path = path.expanduser(model_name)
         if not path.exists(model_path):
             model_path = path.join(models_home, model_name + ".bin")

-        try:
-            # suppress LLM's output
-            with suppress_stderr():
-                model = LLM(model_path, verbose=False)
-                models.update({model_name: model})
-        except Exception:
-            # e is sent to devnull, so create a generic exception
-            raise Exception(f"Failed to load model: {model}")
+        runners = {
+            model_type: cls
+            for cls in [LlamaCppRunner, CtransformerRunner]
+            for model_type in cls.model_types()
+        }
+
+        best_match, _ = process.extractOne(model_path, runners.keys())
+        model = runners.get(best_match, LlamaCppRunner)
+        models.update({model_name: model(model_path, best_match)})

-    return model
+    return models.get(model_name)


 def unload(model_name, models={}):
     if model_name in models:
         models.pop(model_name)
+
+
+class LlamaCppRunner:
+    def __init__(self, model_path, model_type):
+        try:
+            with suppress(sys.stderr), suppress(sys.stdout):
+                self.model = Llama(model_path,
+                                   verbose=False,
+                                   n_gpu_layers=1,
+                                   seed=-1)
+        except Exception:
+            raise Exception("Failed to load model", model_path, model_type)
+
+    @staticmethod
+    def model_types():
+        return [
+            'llama',
+            'orca',
+            'vicuna',
+            'ultralm',
+        ]
+
+    def generate(self, prompt, *args, **kwargs):
+        if "max_tokens" not in kwargs:
+            kwargs.update({"max_tokens": 512})
+
+        if "stop" not in kwargs:
+            kwargs.update({"stop": ["Q:"]})
+
+        if "stream" not in kwargs:
+            kwargs.update({"stream": True})
+
+        with suppress(sys.stderr):
+            for output in self.model(prompt, *args, **kwargs):
+                yield output
+
+
+class CtransformerRunner:
+    def __init__(self, model_path, model_type):
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, model_type=model_type, local_files_only=True)
+
+    @staticmethod
+    def model_types():
+        return [
+            'falcon',
+            'mpt',
+            'starcoder',
+        ]
+
+    def generate(self, prompt, *args, **kwargs):
+        if "max_new_tokens" not in kwargs:
+            kwargs.update({"max_new_tokens": 512})
+
+        if "stop" not in kwargs:
+            kwargs.update({"stop": ["User"]})
+
+        if "stream" not in kwargs:
+            kwargs.update({"stream": True})
+
+        for output in self.model(prompt, *args, **kwargs):
+            yield {
+                'choices': [
+                    {
+                        'text': output,
+                    },
+                ],
+            }
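For reference, a hypothetical caller of the new streaming interface could look like the following; the module path and model name below are assumptions, but both runners yield llama-cpp-style completion chunks, so the same loop works regardless of which backend was picked.

# Hypothetical usage sketch; "ollama.engine" and the model name are assumed, not confirmed by this diff.
from ollama import engine

for chunk in engine.generate("vicuna-7b", "Why is the sky blue?"):
    # Each chunk mimics llama-cpp-python's streaming format: {"choices": [{"text": ...}]}
    print(chunk["choices"][0]["text"], end="", flush=True)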
A helpful assistant who helps the user with any questions asked.
User: {{ prompt }}
Assistant:

Below is an instruction that describes a task. Write a response that appropriately completes the request. Be concise. Once the request is completed, include no other text.
### Instruction:
{{ prompt }}
### Response:

USER: {{ prompt }}
ASSISTANT:

Below is an instruction that describes a task. Write a response that appropriately completes the request
### Instruction: {{ prompt }}
### Response:
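The prompt templates above use {{ prompt }} placeholders; how ollama.prompt.template fills them is not shown in this diff, but since jinja2 is a project dependency, rendering presumably resembles this rough sketch (an assumption, not the project's confirmed code path):

from jinja2 import Template

# Render one of the templates above, substituting the user's prompt.
template = Template("USER: {{ prompt }}\nASSISTANT:")
print(template.render(prompt="Why is the sky blue?"))
# USER: Why is the sky blue?
# ASSISTANT: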
pyproject.toml
@@ -9,7 +9,6 @@ scripts = {ollama = "ollama.cmd.cli:main"}

 [tool.poetry.dependencies]
 python = "^3.8"
-llama-cpp-python = "^0.1.66"
 aiohttp = "^3.8.4"
 aiohttp-cors = "^0.7.0"
 jinja2 = "^3.1.2"
@@ -17,6 +16,9 @@ requests = "^2.31.0"
 tqdm = "^4.65.0"
 validators = "^0.20.0"
 yaspin = "^2.3.0"
+llama-cpp-python = "^0.1.67"
+ctransformers = "^0.2.10"
+fuzzywuzzy = {extras = ["speedup"], version = "^0.18.0"}

 [build-system]
 requires = ["poetry-core"]