Unverified Commit e69a08f1 authored by Michael Yang, committed by GitHub

Merge pull request #17 from jmorganca/ctransformers

use ctransformers as a backup to llama.cpp
parents 9dfef44a 07d8d561
@@ -31,7 +31,7 @@ def main():
     )
 
     # create models home if it doesn't exist
-    os.makedirs(model.models_home, exist_ok=True)
+    os.makedirs(model.MODELS_CACHE_PATH, exist_ok=True)
 
     subparsers = parser.add_subparsers(
         title='commands',
@@ -111,7 +111,7 @@ def generate_oneshot(*args, **kwargs):
     spinner.start()
     spinner_running = True
     try:
-        for output in engine.generate(*args, **kwargs):
+        for output in engine.generate(model_name=kwargs.pop('model'), *args, **kwargs):
            choices = output.get("choices", [])
            if len(choices) > 0:
                if spinner_running:
@@ -147,7 +147,7 @@ def generate_batch(*args, **kwargs):
 def pull(*args, **kwargs):
-    model.pull(*args, **kwargs)
+    model.pull(model_name=kwargs.pop('model'), *args, **kwargs)
 
 
 def run(*args, **kwargs):
...
@@ -38,7 +38,7 @@ def serve(*args, **kwargs):
     app.update(
         {
-            "llms": {},
+            "models": {},
         }
     )
@@ -47,32 +47,32 @@ def serve(*args, **kwargs):
     async def load(request):
         body = await request.json()
-        model = body.get("model")
-        if not model:
+        name = body.get("model")
+        if not name:
             raise web.HTTPBadRequest()
 
         kwargs = {
-            "llms": request.app.get("llms"),
+            "models": request.app.get("models"),
         }
 
-        engine.load(model, **kwargs)
+        engine.load(name, **kwargs)
 
         return web.Response()
 
     async def unload(request):
         body = await request.json()
-        model = body.get("model")
-        if not model:
+        name = body.get("model")
+        if not name:
             raise web.HTTPBadRequest()
 
-        engine.unload(model, llms=request.app.get("llms"))
+        engine.unload(name, models=request.app.get("models"))
 
         return web.Response()
 
     async def generate(request):
         body = await request.json()
-        model = body.get("model")
-        if not model:
+        name = body.get("model")
+        if not name:
             raise web.HTTPBadRequest()
 
         prompt = body.get("prompt")
@@ -83,10 +83,10 @@ async def generate(request):
         await response.prepare(request)
 
         kwargs = {
-            "llms": request.app.get("llms"),
+            "models": request.app.get("models"),
         }
 
-        for output in engine.generate(model, prompt, **kwargs):
+        for output in engine.generate(name, prompt, **kwargs):
             output = json.dumps(output).encode('utf-8')
             await response.write(output)
             await response.write(b"\n")
...
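
The generate handler above streams one JSON object per line. A rough client-side sketch of consuming that stream follows; the route path, host, port, and model name are illustrative assumptions and are not part of this diff:

    import json
    import requests

    def stream_generate(name, prompt, base_url="http://localhost:8080"):
        # POST the same body shape the handler reads: {"model": ..., "prompt": ...}
        with requests.post(f"{base_url}/generate",
                           json={"model": name, "prompt": prompt},
                           stream=True) as response:
            response.raise_for_status()
            # the server writes newline-delimited JSON, so read it line by line
            for line in response.iter_lines():
                if line:
                    output = json.loads(line)
                    for choice in output.get("choices", []):
                        print(choice.get("text", ""), end="", flush=True)

    stream_generate("orca", "Why is the sky blue?")
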
@@ -2,63 +2,120 @@ import os
 import sys
 from os import path
 from contextlib import contextmanager
-from llama_cpp import Llama as LLM
+from fuzzywuzzy import process
+from llama_cpp import Llama
+from ctransformers import AutoModelForCausalLM
 
-import ollama.model
 import ollama.prompt
+from ollama.model import MODELS_CACHE_PATH
 
 
 @contextmanager
-def suppress_stderr():
-    stderr = os.dup(sys.stderr.fileno())
+def suppress(file):
+    original = os.dup(file.fileno())
     with open(os.devnull, "w") as devnull:
-        os.dup2(devnull.fileno(), sys.stderr.fileno())
+        os.dup2(devnull.fileno(), file.fileno())
         yield
-        os.dup2(stderr, sys.stderr.fileno())
+        os.dup2(original, file.fileno())
 
 
-def generate(model, prompt, llms={}, *args, **kwargs):
-    llm = load(model, llms=llms)
-
-    prompt = ollama.prompt.template(model, prompt)
-
-    if "max_tokens" not in kwargs:
-        kwargs.update({"max_tokens": 16384})
-
-    if "stop" not in kwargs:
-        kwargs.update({"stop": ["Q:"]})
-
-    if "stream" not in kwargs:
-        kwargs.update({"stream": True})
-
-    for output in llm(prompt, *args, **kwargs):
-        yield output
-
-
-def load(model, llms={}):
-    llm = llms.get(model, None)
-    if not llm:
-        stored_model_path = path.join(ollama.model.models_home, model) + ".bin"
-        if path.exists(stored_model_path):
-            model_path = stored_model_path
-        else:
-            # try loading this as a path to a model, rather than a model name
-            model_path = path.abspath(model)
-            if not path.exists(model_path):
-                raise Exception(f"Model not found: {model}")
-
-        try:
-            # suppress LLM's output
-            with suppress_stderr():
-                llm = LLM(model_path, verbose=False)
-                llms.update({model: llm})
-        except Exception as e:
-            # e is sent to devnull, so create a generic exception
-            raise Exception(f"Failed to load model: {model}")
-
-    return llm
-
-
-def unload(model, llms={}):
-    if model in llms:
-        llms.pop(model)
+def generate(model_name, prompt, models={}, *args, **kwargs):
+    model = load(model_name, models=models)
+    inputs = ollama.prompt.template(model_name, prompt)
+    return model.generate(inputs, *args, **kwargs)
+
+
+def load(model_name, models={}):
+    if not models.get(model_name, None):
+        model_path = path.expanduser(model_name)
+        if not path.exists(model_path):
+            model_path = path.join(MODELS_CACHE_PATH, model_name + ".bin")
+
+        runners = {
+            model_type: cls
+            for cls in [LlamaCppRunner, CtransformerRunner]
+            for model_type in cls.model_types()
+        }
+
+        best_match, _ = process.extractOne(model_path, runners.keys())
+        model = runners.get(best_match, LlamaCppRunner)
+        models.update({model_name: model(model_path, best_match)})
+
+    return models.get(model_name)
+
+
+def unload(model_name, models={}):
+    if model_name in models:
+        models.pop(model_name)
+
+
+class LlamaCppRunner:
+    def __init__(self, model_path, model_type):
+        try:
+            with suppress(sys.stderr), suppress(sys.stdout):
+                self.model = Llama(model_path,
+                                   verbose=False,
+                                   n_gpu_layers=1,
+                                   seed=-1)
+        except Exception:
+            raise Exception("Failed to load model", model_path, model_type)
+
+    @staticmethod
+    def model_types():
+        return [
+            'llama',
+            'orca',
+            'vicuna',
+            'ultralm',
+        ]
+
+    def generate(self, prompt, *args, **kwargs):
+        if "max_tokens" not in kwargs:
+            kwargs.update({"max_tokens": 512})
+
+        if "stop" not in kwargs:
+            kwargs.update({"stop": ["Q:"]})
+
+        if "stream" not in kwargs:
+            kwargs.update({"stream": True})
+
+        with suppress(sys.stderr):
+            for output in self.model(prompt, *args, **kwargs):
+                yield output
+
+
+class CtransformerRunner:
+    def __init__(self, model_path, model_type):
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, model_type=model_type, local_files_only=True)
+
+    @staticmethod
+    def model_types():
+        return [
+            'falcon',
+            'mpt',
+            'starcoder',
+        ]
+
+    def generate(self, prompt, *args, **kwargs):
+        if "max_new_tokens" not in kwargs:
+            kwargs.update({"max_new_tokens": 512})
+
+        if "stop" not in kwargs:
+            kwargs.update({"stop": ["User"]})
+
+        if "stream" not in kwargs:
+            kwargs.update({"stream": True})
+
+        for output in self.model(prompt, *args, **kwargs):
+            yield {
+                'choices': [
+                    {
+                        'text': output,
+                    },
+                ],
+            }
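
The new load() picks a backend by fuzzy-matching the model path against every model type the two runners declare, with llama.cpp as the default. A standalone sketch of just that dispatch step; the runner classes here are stubs for illustration that only mirror the model_types() lists above:

    from fuzzywuzzy import process

    class LlamaCppRunner:
        @staticmethod
        def model_types():
            return ['llama', 'orca', 'vicuna', 'ultralm']

    class CtransformerRunner:
        @staticmethod
        def model_types():
            return ['falcon', 'mpt', 'starcoder']

    # map every known model type to the runner class that handles it
    runners = {
        model_type: cls
        for cls in [LlamaCppRunner, CtransformerRunner]
        for model_type in cls.model_types()
    }

    # a file name like "falcon-7b.bin" fuzzily matches 'falcon',
    # so the ctransformers runner is chosen for it
    best_match, score = process.extractOne("falcon-7b.bin", runners.keys())
    runner = runners.get(best_match, LlamaCppRunner)
    print(best_match, score, runner.__name__)
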
@@ -6,12 +6,12 @@ from urllib.parse import urlsplit, urlunsplit
 from tqdm import tqdm
 
-models_endpoint_url = 'https://ollama.ai/api/models'
-models_home = path.join(Path.home(), '.ollama', 'models')
+MODELS_MANIFEST = 'https://ollama.ai/api/models'
+MODELS_CACHE_PATH = path.join(Path.home(), '.ollama', 'models')
 
 
 def models(*args, **kwargs):
-    for _, _, files in walk(models_home):
+    for _, _, files in walk(MODELS_CACHE_PATH):
         for file in files:
             base, ext = path.splitext(file)
             if ext == '.bin':
@@ -20,7 +20,7 @@ def models(*args, **kwargs):
 
 # get the url of the model from our curated directory
 def get_url_from_directory(model):
-    response = requests.get(models_endpoint_url)
+    response = requests.get(MODELS_MANIFEST)
     response.raise_for_status()
     directory = response.json()
     for model_info in directory:
@@ -78,7 +78,7 @@ def find_bin_file(json_response, location, branch):
 
 def download_file(download_url, file_name, file_size):
-    local_filename = path.join(models_home, file_name) + '.bin'
+    local_filename = path.join(MODELS_CACHE_PATH, file_name) + '.bin'
 
     first_byte = path.getsize(local_filename) if path.exists(local_filename) else 0
@@ -125,7 +125,7 @@ def pull(model, *args, **kwargs):
         url = f'https://{url}'
 
     if not validators.url(url):
-        if model in models(models_home):
+        if model in models(MODELS_CACHE_PATH):
            # the model is already downloaded, and specified by name
            return model
        raise Exception(f'Unknown model {model}')
...
-import os
+from os import path
 from difflib import SequenceMatcher
 from jinja2 import Environment, PackageLoader
 
 
-def template(model, prompt):
+def template(name, prompt):
     best_ratio = 0
     best_template = ''
 
     environment = Environment(loader=PackageLoader(__name__, 'templates'))
     for template in environment.list_templates():
-        base, _ = os.path.splitext(template)
-        ratio = SequenceMatcher(None, os.path.basename(model.lower()), base).ratio()
+        base, _ = path.splitext(template)
+        ratio = SequenceMatcher(None, path.basename(name).lower(), base).ratio()
         if ratio > best_ratio:
             best_ratio = ratio
             best_template = template
...
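
template() above scores each template's base filename against the model name with difflib's SequenceMatcher and keeps the highest ratio. A small illustration of that scoring; the candidate names are assumptions, since the template filenames themselves are not visible in this diff:

    from difflib import SequenceMatcher

    candidates = ['falcon', 'alpaca', 'vicuna', 'mpt']
    name = 'vicuna-7b-v1.3.ggmlv3.q4_0'

    # pick the template whose base name is most similar to the model name
    best = max(candidates, key=lambda base: SequenceMatcher(None, name.lower(), base).ratio())
    print(best)  # -> 'vicuna'
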
A helpful assistant who helps the user with any questions asked.
User: {{ prompt }}
Assistant:

Below is an instruction that describes a task. Write a response that appropriately completes the request. Be concise. Once the request is completed, include no other text.
### Instruction:
{{ prompt }}
### Response:

USER: {{ prompt }}
ASSISTANT:

Below is an instruction that describes a task. Write a response that appropriately completes the request
### Instruction: {{ prompt }}
### Response:
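
Each of the new templates is a Jinja2 file with a single {{ prompt }} placeholder. A minimal rendering sketch, using an inline Template in place of the PackageLoader wiring in the prompt module; the inline string mirrors the USER/ASSISTANT template added above:

    from jinja2 import Template

    template = Template("USER: {{ prompt }}\nASSISTANT:")
    print(template.render(prompt="Why is the sky blue?"))
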
This diff is collapsed.
@@ -9,7 +9,6 @@ scripts = {ollama = "ollama.cmd.cli:main"}
 
 [tool.poetry.dependencies]
 python = "^3.8"
-llama-cpp-python = "^0.1.66"
 aiohttp = "^3.8.4"
 aiohttp-cors = "^0.7.0"
 jinja2 = "^3.1.2"
@@ -17,6 +16,9 @@ requests = "^2.31.0"
 tqdm = "^4.65.0"
 validators = "^0.20.0"
 yaspin = "^2.3.0"
+llama-cpp-python = "^0.1.67"
+ctransformers = "^0.2.10"
+fuzzywuzzy = {extras = ["speedup"], version = "^0.18.0"}
 
 [build-system]
 requires = ["poetry-core"]
...
This diff is collapsed.