Commit d7117b95 authored by zhouxiang

Sync 0.2.6 code

parent 5f83e392
......@@ -17,6 +17,13 @@ target_link_libraries(${PROJECT_NAME} PRIVATE TransformerTritonBackend
LlamaTritonBackend)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14)
set(_INSTALL_CUDA_RPATH
"\$ORIGIN"
"\$ORIGIN/../../nvidia/nccl/lib/"
"\$ORIGIN/../../nvidia/cuda_runtime/lib/"
"\$ORIGIN/../../nvidia/cublas/lib/"
)
set_target_properties(${PROJECT_NAME} PROPERTIES
BUILD_RPATH "\$ORIGIN"
INSTALL_RPATH "\$ORIGIN;\$ORIGIN/../../nvidia/nccl/lib/")
BUILD_RPATH "\$ORIGIN"
INSTALL_RPATH "${_INSTALL_CUDA_RPATH}"
)
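
The hunk above replaces the single hard-coded nccl entry with the _INSTALL_CUDA_RPATH list, so the installed extension can also resolve the cuda_runtime and cublas pip wheels through $ORIGIN-relative paths. One way to check what actually got baked into the installed library is to dump its dynamic section; a minimal sketch follows (the .so path is an assumption, point it at your own build artifact):

# Sketch: print the RPATH/RUNPATH entries of the installed extension.
import subprocess

lib = 'lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so'  # assumed path
dyn = subprocess.run(['readelf', '-d', lib], capture_output=True, text=True)
for line in dyn.stdout.splitlines():
    if 'RPATH' in line or 'RUNPATH' in line:
        # expect $ORIGIN plus ../../nvidia/{nccl,cuda_runtime,cublas}/lib/
        print(line.strip())
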
......@@ -47,7 +47,6 @@ std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createLlamaM
reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0),
model_dir);
}
#ifdef ENABLE_BF16
else if (data_type == "bf16") {
#ifdef ENABLE_BF16
return std::make_shared<LlamaTritonModel<__nv_bfloat16>>(
......@@ -60,7 +59,6 @@ std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createLlamaM
ft::FT_CHECK(false);
#endif
}
#endif
else {
return std::make_shared<LlamaTritonModel<float>>(
reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"),
......@@ -146,7 +144,17 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
enable_custom_all_reduce_(enable_custom_all_reduce)
{
INIReader reader;
FT_CHECK_WITH_INFO((config.empty() ^ model_dir.empty()), "invalid init options");
FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options");
if (!model_dir.empty()) {
model_dir_ = model_dir;
const std::string inifile{model_dir + "/config.ini"};
reader = INIReader(inifile);
if (reader.ParseError() < 0) {
TM_LOG_ERROR("[ERROR] Can't load %s", inifile.c_str());
ft::FT_CHECK(false);
}
}
if (!config.empty()) {
std::FILE* tmpf = std::tmpfile();
......@@ -159,16 +167,6 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
}
}
if (!model_dir.empty()) {
model_dir_ = model_dir;
const std::string inifile{model_dir + "/config.ini"};
reader = INIReader(inifile);
if (reader.ParseError() < 0) {
TM_LOG_ERROR("[ERROR] Can't load %s", inifile.c_str());
ft::FT_CHECK(false);
}
}
model_name_ = reader.Get("llama", "model_name");
head_num_ = reader.GetInteger("llama", "head_num");
kv_head_num_ = reader.GetInteger("llama", "kv_head_num", 0);
......
......@@ -67,7 +67,8 @@ add_library(mpi_utils STATIC mpi_utils.cc)
#set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if (BUILD_MULTI_GPU)
target_link_libraries(mpi_utils PUBLIC mpi logger)
#target_link_libraries(mpi_utils PUBLIC mpi logger)
target_link_libraries(mpi_utils PUBLIC ${MPI_CXX_LIBRARIES} logger)
endif()
add_library(nccl_utils STATIC nccl_utils.cc)
......
......@@ -386,6 +386,7 @@ protected:
T* d_bias_;
int* d_output_ids_;
int* d_input_lengths_;
int* d_penalty_workspace_;
float* d_repetition_penalties_;
......@@ -410,6 +411,8 @@ protected:
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_));
d_penalty_workspace_ =
reinterpret_cast<int*>(allocator->malloc((sizeof(int) + sizeof(float)) * batch_size_ * step_));
cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream);
......@@ -501,6 +504,7 @@ public:
else {
invokeBatchApplyRepetitionPenalty(d_logits_,
d_repetition_penalties_,
d_penalty_workspace_,
d_output_ids_,
batch_size_,
batch_size_,
......@@ -559,6 +563,7 @@ public:
cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
invokeBatchApplyRepetitionPenalty(d_logits_batch,
d_repetition_penalties_,
d_penalty_workspace_,
d_output_ids_,
batch_size_,
batch_size_,
......
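
The new d_penalty_workspace_ above gives invokeBatchApplyRepetitionPenalty (sizeof(int) + sizeof(float)) * batch * step bytes of scratch, presumably to stage generated token ids and their logits while the penalty is applied. For reference, here is a minimal NumPy sketch of the CTRL-style repetition penalty the kernel computes; it illustrates the math only and is not the CUDA code:

import numpy as np

def apply_repetition_penalty(logits, penalties, output_ids, step):
    """logits: [batch, vocab]; penalties: [batch]; output_ids: [step, batch]."""
    out = logits.copy()
    for b in range(out.shape[0]):
        # penalize every previously generated id exactly once
        for tok in {int(output_ids[t, b]) for t in range(step)}:
            val = out[b, tok]
            out[b, tok] = val / penalties[b] if val > 0 else val * penalties[b]
    return out
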
import os
import numpy as np
import torch
from transformers import AutoTokenizer
from lmdeploy.pytorch.decode import Engine, decode_single
from lmdeploy.pytorch.model import accel_model, init_model
def _test_decode_dist(model_path, prompt):
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
inputs = tokenizer(prompt)
input_ids = inputs.input_ids
engine = Engine(model_path, tokenizer=tokenizer)
probs = engine.decode(input_ids, sort=False, max_bs=1, pad=True)
return probs
def _test_decode_single(model_path, prompt):
model, tokenizer = init_model(model_path)
model = accel_model(model)
model = model.eval()
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
inputs = tokenizer(prompt, return_tensors='pt', padding=True)
input_ids = inputs.input_ids.cuda()
attention_mask = inputs.attention_mask.cuda()
probs: torch.Tensor = decode_single(model, input_ids, attention_mask)
return probs.numpy()
def test_compare(output_outliers=True):
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
# https://github.com/pytorch/pytorch/issues/37377#issuecomment-629529611
model_path = 'llama2/huggingface/llama-2-7b'
prompts = [
'I believe the meaning of life is to find your gift. The purpose of life is to give it away.', # noqa: E501
'Simply put, the theory of relativity states that ',
'Building a website can be done in 10 simple steps:'
]
p_single = _test_decode_single(model_path, prompts)
p_dist = _test_decode_dist(model_path, prompts)
rtol = 2.0e-2
atol = 2.0e-2
if output_outliers:
np.set_printoptions(linewidth=150, edgeitems=5)
failed = (abs(p_dist - p_single) > atol + rtol * abs(p_single))
idx = failed.nonzero()
print(f'Num outliers: {len(idx[0])}')
print(p_dist[idx])
print(p_single[idx])
assert np.allclose(p_dist, p_single, rtol=rtol, atol=atol)
import unittest
import torch
from lmdeploy.pytorch.dist import (get_rank, master_only,
master_only_and_broadcast_general,
master_only_and_broadcast_tensor)
class SimpleTest(unittest.TestCase):
@master_only
def fake_input(self):
print(f'Evaluate fake input 1 on {get_rank()}')
return 'master only or none'
@master_only_and_broadcast_general
def fake_input21(self):
print(f'Evaluate fake input 21 (str) on {get_rank()}')
return 'master only and_broadcast'
@master_only_and_broadcast_general
def fake_input22(self):
print(f'Evaluate fake input 22 (cpu tensor) on {get_rank()}')
return torch.tensor([6, 66, 666])
@master_only_and_broadcast_tensor
def fake_input3(self):
print(f'Evaluate fake input 3 (gpu tensor) on {get_rank()}')
return torch.tensor([6, 66, 666]).cuda()
def test(self):
torch.distributed.init_process_group(backend='nccl')
rank = get_rank()
# unittest discards --local_rank, so set the device manually
torch.cuda.set_device(rank)
in1 = self.fake_input()
in21 = self.fake_input21()
in22 = self.fake_input22()
in3 = self.fake_input3(dtype=torch.long, size=(1, 3))
if rank == 0:
self.assertEqual(in1, 'master only or none')
else:
self.assertEqual(in1, None)
self.assertEqual(in21, 'master only and_broadcast')
self.assertTrue(torch.allclose(in22, torch.tensor([6, 66, 666])))
self.assertFalse(torch.allclose(in3.cpu(), torch.tensor([6, 6, 666])))
self.assertTrue(torch.allclose(in3.cpu(), torch.tensor([6, 66, 666])))
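
The decorators exercised here follow a master-compute-then-broadcast pattern: only rank 0 evaluates the wrapped function and the result is broadcast to the other ranks, which is why fake_input3 is called with dtype and size so that non-master ranks can pre-allocate a receive buffer. A rough sketch of that pattern, given only as an assumption for illustration and not lmdeploy's actual implementation:

import functools

import torch
import torch.distributed as dist

def master_only_and_broadcast_tensor(func):
    """Evaluate on rank 0 only, then broadcast the CUDA tensor to every rank."""

    @functools.wraps(func)
    def wrapper(*args, dtype, size, **kwargs):
        if dist.get_rank() == 0:
            result = func(*args, **kwargs)              # returns a CUDA tensor
        else:
            result = torch.empty(size, dtype=dtype, device='cuda')
        dist.broadcast(result, src=0)                   # everyone gets rank 0's value
        return result

    return wrapper
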
from lmdeploy.pytorch.model import accel_model, init_model
def test_init_model():
cprint = lambda x: print(f'\033[92m{x}\033[0m') # noqa: E731
# Test llama2-7b
for model_path in ['llama2/huggingface/llama-2-7b', 'internlm-7b']:
model, tokenizer = init_model(model_path)
assert tokenizer.is_fast
cprint('llama2 on CPU')
print(model)
model1 = accel_model(model)
cprint('llama2 on GPU')
print(model1)
cprint('llama2 with kernel injection')
model2 = accel_model(model, accel='deepspeed')
assert 'DeepSpeedSelfAttention' in repr(model2)
assert 'DeepSpeedMLP' in repr(model2)
from lmdeploy.pytorch.utils import BasicStreamer, TerminalIO
def test_terminal_io(monkeypatch):
import io
tio = TerminalIO()
inputs = 'hello\n\n'
# inputs = 'hello\n\n\x1B[A\n\n'
monkeypatch.setattr('sys.stdin', io.StringIO(inputs))
string = tio.input()
tio.output(string)
assert string == 'hello'
# string = tio.input()
# tio.output(string)
# assert string == 'hello'
def test_basic_streamer():
output = []
def decode_func(value):
return value + 10
def output_func(value):
output.append(value)
streamer = BasicStreamer(decode_func, output_func)
for i in range(10):
streamer.put(i)
if i == 5:
streamer.end()
streamer.end()
assert output == [11, 12, 13, 14, 15, '\n', 17, 18, 19, '\n']
output.clear()
streamer = BasicStreamer(decode_func, output_func, skip_prompt=False)
for i in range(10):
streamer.put(i)
if i == 5:
streamer.end()
streamer.end()
assert output == [10, 11, 12, 13, 14, 15, '\n', 16, 17, 18, 19, '\n']
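
The assertions above pin down the streamer contract: with skip_prompt=True the first value after construction or after end() is swallowed, every other value is decoded and forwarded, and end() emits a newline and re-arms the prompt skip. A stand-in that satisfies exactly these assertions (a sketch, not the real BasicStreamer) could look like this:

class SketchStreamer:
    """Minimal stand-in matching the behaviour asserted in test_basic_streamer."""

    def __init__(self, decode_func, output_func, skip_prompt=True):
        self.decode = decode_func
        self.output = output_func
        self.skip_prompt = skip_prompt
        self._awaiting_prompt = skip_prompt

    def put(self, value):
        if self._awaiting_prompt:
            self._awaiting_prompt = False   # drop the prompt token
            return
        self.output(self.decode(value))

    def end(self):
        self.output('\n')                   # sequence finished
        self._awaiting_prompt = self.skip_prompt
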
import inspect
def compare_func(class_method, function):
"""Compare if a class method has same arguments as a function."""
argspec_cls = inspect.getfullargspec(class_method)
argspec_func = inspect.getfullargspec(function)
assert argspec_cls.args[1:] == argspec_func.args
assert argspec_cls.defaults == argspec_func.defaults
assert argspec_cls.annotations == argspec_func.annotations
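
The args[1:] slice drops the implicit self that getfullargspec reports for methods; a tiny illustration with hypothetical Demo/convert names:

import inspect

class Demo:
    def convert(self, path: str, dst: str = 'workspace'):
        ...

def convert(path: str, dst: str = 'workspace'):
    ...

assert inspect.getfullargspec(Demo.convert).args == ['self', 'path', 'dst']
assert inspect.getfullargspec(convert).args == ['path', 'dst']
assert inspect.getfullargspec(Demo.convert).defaults == ('workspace',)
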
def test_cli():
from lmdeploy.cli.cli import CLI
from lmdeploy.serve.turbomind.deploy import main as convert
compare_func(CLI.convert, convert)
def test_subcli_chat():
from lmdeploy.cli.chat import SubCliChat
from lmdeploy.pytorch.chat import main as run_torch_model
from lmdeploy.turbomind.chat import main as run_turbomind_model
compare_func(SubCliChat.torch, run_torch_model)
compare_func(SubCliChat.turbomind, run_turbomind_model)
def test_subcli_lite():
from lmdeploy.cli.lite import SubCliLite
from lmdeploy.lite.apis.auto_awq import auto_awq
from lmdeploy.lite.apis.calibrate import calibrate
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
compare_func(SubCliLite.auto_awq, auto_awq)
compare_func(SubCliLite.calibrate, calibrate)
compare_func(SubCliLite.kv_qparams, run_kv_qparams)
def test_subcli_serve():
from lmdeploy.cli.serve import SubCliServe
from lmdeploy.serve.client import main as run_triton_client
from lmdeploy.serve.gradio.app import run as run_gradio
from lmdeploy.serve.openai.api_client import main as run_api_client
from lmdeploy.serve.openai.api_server import serve as run_api_server
compare_func(SubCliServe.gradio, run_gradio)
compare_func(SubCliServe.api_server, run_api_server)
compare_func(SubCliServe.api_client, run_api_client)
compare_func(SubCliServe.triton_client, run_triton_client)
import pytest
from lmdeploy.model import MODELS, SamplingParam
from lmdeploy.model import MODELS, best_match_model
@pytest.mark.parametrize(
'model_path_and_name',
[('internlm/internlm-chat-7b', ['internlm']),
('internlm/internlm2-1_8b', ['base']),
('models--internlm--internlm-chat-7b/snapshots/1234567', ['internlm']),
('Qwen/Qwen-7B-Chat', ['qwen']),
('codellama/CodeLlama-7b-hf', ['codellama']),
('upstage/SOLAR-0-70b', ['solar', 'solar-70b']),
('meta-llama/Llama-2-7b-chat-hf', ['llama2']),
('THUDM/chatglm2-6b', ['chatglm']),
('01-ai/Yi-6B-200k', ['yi', 'yi-200k']), ('01-ai/Yi-34B-Chat', ['yi']),
('01-ai/Yi-6B-Chat', ['yi', 'yi-chat']),
('WizardLM/WizardLM-70B-V1.0', ['wizardlm']),
('codellama/CodeLlama-34b-Instruct-hf', ['codellama']),
('tiiuae/falcon-7b', ['falcon']), ('workspace', [None])])
@pytest.mark.parametrize('suffix', ['', '-w4', '-4bit', '-16bit'])
def test_best_match_model(model_path_and_name, suffix):
if model_path_and_name[0] == 'internlm/internlm2-1_8b' and suffix:
return  # internlm/internlm2-1_8b with any suffix would be deduced as None
deduced_name = best_match_model(model_path_and_name[0] + suffix)
if deduced_name is not None:
assert deduced_name in model_path_and_name[
1], f'expect {model_path_and_name[1]}, but got {deduced_name}'
else:
assert deduced_name in model_path_and_name[
1], f'expect {model_path_and_name[1]}, but got {deduced_name}'
@pytest.mark.parametrize('model_name',
['llama2', 'base', 'yi', 'qwen-7b', 'vicuna'])
@pytest.mark.parametrize('meta_instruction', ['[fake meta_instruction]'])
def test_model_config(model_name, meta_instruction):
from lmdeploy.model import ChatTemplateConfig
chat_template = ChatTemplateConfig(
model_name, meta_instruction=meta_instruction).chat_template
prompt = chat_template.get_prompt('')
if model_name == 'base':
assert prompt == ''
else:
assert meta_instruction in prompt
def test_base_model():
......@@ -21,8 +63,6 @@ def test_vicuna():
model = MODELS.get('vicuna')(capability='completion')
assert model.get_prompt(prompt, sequence_start=True) == prompt
assert model.get_prompt(prompt, sequence_start=False) == prompt
assert model.stop_words is None
assert model.system is not None
model = MODELS.get('vicuna')(capability='chat',
system='Provide answers in Python')
......@@ -34,7 +74,7 @@ def test_vicuna():
_prompt = None
with pytest.raises(AssertionError):
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt is None
def test_internlm_chat():
......@@ -43,7 +83,7 @@ def test_internlm_chat():
assert model.get_prompt(prompt, sequence_start=True) == prompt
assert model.get_prompt(prompt, sequence_start=False) == prompt
assert model.stop_words is not None
assert model.system == ''
assert model.system == '<|System|>:'
assert model.session_len == 2048
model = MODELS.get('internlm-chat-7b')(capability='chat',
......@@ -56,7 +96,7 @@ def test_internlm_chat():
_prompt = None
with pytest.raises(AssertionError):
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt is None
model = MODELS.get('internlm-chat-7b-8k')()
assert model.session_len == 8192
......@@ -68,7 +108,6 @@ def test_baichuan():
assert model.get_prompt(prompt, sequence_start=True) == prompt
assert model.get_prompt(prompt, sequence_start=False) == prompt
assert model.stop_words is None
assert model.repetition_penalty == 1.1
model = MODELS.get('baichuan-7b')(capability='chat')
_prompt = model.get_prompt(prompt, sequence_start=True)
......@@ -81,19 +120,19 @@ def test_llama2():
assert model.get_prompt(prompt, sequence_start=True) == prompt
assert model.get_prompt(prompt, sequence_start=False) == prompt
assert model.stop_words is None
assert model.default_sys_prompt is not None
assert model.meta_instruction is not None
model = MODELS.get('llama2')(capability='chat',
system='Provide answers in Python')
meta_instruction='Provide answers in Python')
assert model.get_prompt(prompt, sequence_start=True) != prompt
assert model.get_prompt(prompt, sequence_start=False) != prompt
assert model.default_sys_prompt == 'Provide answers in Python'
assert model.meta_instruction == 'Provide answers in Python'
model = MODELS.get('llama2')(capability='voice')
_prompt = None
with pytest.raises(AssertionError):
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt is None
def test_qwen():
......@@ -111,7 +150,7 @@ def test_qwen():
_prompt = None
with pytest.raises(AssertionError):
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt is None
def test_codellama_completion():
......@@ -167,39 +206,3 @@ def test_codellama_others():
with pytest.raises(AssertionError):
model = MODELS.get('codellama')(capability='java')
assert model is None
def test_sampling_param():
model = MODELS.get('llama')()
default_sampling_param = SamplingParam()
assert model.sampling_param == default_sampling_param
model = MODELS.get('llama')(top_p=0.1, top_k=10)
assert model.sampling_param.top_p == 0.1 and \
model.sampling_param.top_k == 10
assert model.sampling_param.temperature == 0.8 and \
model.sampling_param.repetition_penalty == 1.0
model = MODELS.get('codellama')(capability='completion')
assert model.sampling_param.top_p == 0.9 and \
model.sampling_param.top_k is None and \
model.sampling_param.temperature == 0.2 and \
model.sampling_param.repetition_penalty == 1.0
model = MODELS.get('codellama')(capability='chat')
assert model.sampling_param.top_p == 0.95 and \
model.sampling_param.top_k is None and \
model.sampling_param.temperature == 0.2 and \
model.sampling_param.repetition_penalty == 1.0
model = MODELS.get('codellama')(capability='infilling')
assert model.sampling_param.top_p == 0.9 and \
model.sampling_param.top_k is None and \
model.sampling_param.temperature == 0.0 and \
model.sampling_param.repetition_penalty == 1.0
model = MODELS.get('codellama')(capability='python')
assert model.sampling_param.top_p == 0.9 and \
model.sampling_param.top_k is None and \
model.sampling_param.temperature == 0.2 and \
model.sampling_param.repetition_penalty == 1.0
import random
import pytest
from lmdeploy.tokenizer import HuggingFaceTokenizer
from lmdeploy.tokenizer import DetokenizeState, HuggingFaceTokenizer
@pytest.mark.parametrize('model_path', [
'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat',
'baichuan-inc/Baichuan2-7B-Chat', 'upstage/SOLAR-0-70b-16bit',
'baichuan-inc/Baichuan-7B', 'codellama/CodeLlama-7b-hf',
'upstage/SOLAR-0-70b-16bit',
'THUDM/chatglm2-6b', '01-ai/Yi-6B-200k', '01-ai/Yi-34B-Chat',
'01-ai/Yi-6B-Chat', 'WizardLM/WizardLM-70B-V1.0',
'codellama/CodeLlama-34b-Instruct-hf', 'tiiuae/falcon-7b'
])
@pytest.mark.parametrize('input', [
'hi, this is a test 😆😆! ' * 5, '為什麼我還在用繁體字 😆😆 gg! ' * 5,
' License at\n#\n#' + ' ' * 100 + 'ht', ' '
])
@pytest.mark.parametrize(
'input', ['hi, this is a test 😆😆! ' * 5, '為什麼我還在用繁體字 😆😆 gg! ' * 5])
def test_tokenizer(model_path, input):
@pytest.mark.parametrize('interval', [1, 3])
@pytest.mark.parametrize('skip_special_tokens', [True, False])
def test_tokenizer(model_path, input, interval, skip_special_tokens):
tokenizer = HuggingFaceTokenizer(model_path)
encoded = tokenizer.encode(input)
encoded = tokenizer.encode(input, False, add_special_tokens=False)
output = ''
offset = 0
for i in range(1, len(encoded) + 1):
decoded = tokenizer.decode(encoded[:i], offset)
if decoded.endswith('�'):
continue
state = DetokenizeState()
for i in range(0, len(encoded), interval):
offset = i + interval
if offset < len(encoded):
# lmdeploy may decode nothing when concurrency is high
if random.randint(1, 10) < 4:
offset -= interval
decoded, state = tokenizer.detokenize_incrementally(
encoded[:offset], state, skip_special_tokens)
output += decoded
offset = i
assert input == output, 'input string should equal the output after encode-decode'
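
Incremental detokenization has to carry state because multi-byte characters (and sentencepiece whitespace) can be split across tokens: a prefix may temporarily decode to '�', and the newly emitted text must be computed as a diff against what was already yielded. A plain-transformers sketch of the idea (lmdeploy's DetokenizeState additionally avoids re-decoding the whole prefix on every step):

from transformers import AutoTokenizer

def stream_decode(tokenizer, token_ids):
    """Yield only the newly completed text as tokens arrive one by one."""
    emitted = ''
    for i in range(1, len(token_ids) + 1):
        text = tokenizer.decode(token_ids[:i])
        if text.endswith('\ufffd'):
            continue                      # incomplete UTF-8, wait for more tokens
        new_text = text[len(emitted):]
        emitted = text
        if new_text:
            yield new_text

# usage (downloads a tokenizer):
# tok = AutoTokenizer.from_pretrained('internlm/internlm-chat-7b', trust_remote_code=True)
# print(''.join(stream_decode(tok, tok.encode('為什麼 😆', add_special_tokens=False))))
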
@pytest.mark.parametrize('model_path', [
'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat',
'baichuan-inc/Baichuan2-7B-Chat', 'codellama/CodeLlama-7b-hf',
'upstage/SOLAR-0-70b-16bit'
])
@pytest.mark.parametrize('stop_words', ['.', ' ', '?', ''])
def test_tokenizer_with_stop_words(model_path, stop_words):
tokenizer = HuggingFaceTokenizer(model_path)
indexes = tokenizer.indexes_containing_token(stop_words)
assert indexes is not None
def test_qwen_vl_decode_special():
from lmdeploy.tokenizer import Tokenizer
tok = Tokenizer('Qwen/Qwen-VL-Chat')
try:
tok.decode([151857])
assert (0)
except Exception as e:
assert str(e) == 'Unclosed image token'