Unverified Commit a6ac981d authored by WRH's avatar WRH Committed by GitHub

[Fix] Remaining Issues in #19 (#75)

* previous merged

* add chinese

* support torch<2

* add a docstring

* fix typo

* rename torch submodule

* rename to pytorch

* rename in readme
parent cfb3b75d
......@@ -127,7 +127,7 @@ For the deployment of other supported models, such as LLaMA, vicuna, you can fin
#### Single GPU
```shell
python3 -m lmdeploy.torch.chat $NAME_OR_PATH_TO_HF_MODEL\
python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
......@@ -137,7 +137,7 @@ python3 -m lmdeploy.torch.chat $NAME_OR_PATH_TO_HF_MODEL\
#### Tensor Parallel with DeepSpeed
```shell
deepspeed --module --num_gpus 2 lmdeploy.torch.chat \
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
$NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
......
......@@ -121,6 +121,29 @@ python3 lmdeploy.app {server_ip_addresss}:33337 internlm
For the deployment of other models, such as LLaMA and vicuna, please refer to [here](docs/zh_cn/serving.md)
### Inference with PyTorch
#### Single GPU
```shell
python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
--seed 0
```
#### Tensor Parallel with DeepSpeed
```shell
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
$NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
--seed 0
```
## Quantized Deployment
In fp16 mode, kv_cache int8 quantization can be enabled, allowing a single GPU to serve more users.
......
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat with torch models."""
# Copyright (c) OpenMMLab. All rights reserved.
import torch


class LoadNoInit:
    """Initialize model without parameter initialization."""

    def __init__(self):
        # Keep references to the original initializers so they can be
        # restored when the context manager exits.
        self.constant_ = torch.nn.init.constant_
        self.zeros_ = torch.nn.init.zeros_
        self.ones_ = torch.nn.init.ones_
        self.uniform_ = torch.nn.init.uniform_
        self.normal_ = torch.nn.init.normal_
        self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_
        self.kaiming_normal_ = torch.nn.init.kaiming_normal_

    def __enter__(self, *args, **kwargs):
        # Replace the initializers with no-ops so that building the model
        # skeleton skips random initialization that the checkpoint weights
        # would overwrite anyway.
        torch.nn.init.constant_ = lambda *args, **kwargs: None
        torch.nn.init.zeros_ = lambda *args, **kwargs: None
        torch.nn.init.ones_ = lambda *args, **kwargs: None
        torch.nn.init.uniform_ = lambda *args, **kwargs: None
        torch.nn.init.normal_ = lambda *args, **kwargs: None
        torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None
        torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None

    def __exit__(self, *args, **kwargs):
        # Restore the original initializers on exit.
        torch.nn.init.constant_ = self.constant_
        torch.nn.init.zeros_ = self.zeros_
        torch.nn.init.ones_ = self.ones_
        torch.nn.init.uniform_ = self.uniform_
        torch.nn.init.normal_ = self.normal_
        torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_
        torch.nn.init.kaiming_normal_ = self.kaiming_normal_
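A minimal usage sketch of this context manager, mirroring how the hunks below wire it into `init_model`; the import path `lmdeploy.pytorch.accel` follows the rename in this commit, and the model id is only a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM

from lmdeploy.pytorch.accel import LoadNoInit  # assumed import path after the rename

# Inside the context, torch.nn.init.* are no-ops, so the freshly built module
# skeleton is not randomly initialized before the checkpoint weights replace it.
with LoadNoInit():
    model = AutoModelForCausalLM.from_pretrained(
        'internlm/internlm-chat-7b',  # placeholder model id
        torch_dtype=torch.float16,
        trust_remote_code=True)
```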
......@@ -17,6 +17,7 @@ try:
    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              GenerationConfig)
    from .accel import LoadNoInit
    from .utils import get_utils
    _is_transformers_available = True
......@@ -51,10 +52,14 @@ def init_model(
                                              use_fast=use_fast_tokenizer,
                                              trust_remote_code=True)
    torch.set_default_device(local_rank)
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 torch_dtype=torch.float16,
                                                 trust_remote_code=True)
    if torch.__version__ >= '2':
        torch.set_default_device(local_rank)
    with LoadNoInit():
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     torch_dtype=torch.float16,
                                                     trust_remote_code=True)
    model = model.cuda(local_rank)
    if not _is_deepspeed_available:
        warnings.warn('deepspeed is not installed, ',
......@@ -69,8 +74,6 @@ def init_model(
        max_out_tokens=2048,
    )
    # print(f"model is loaded on device {model.device}")
    return tokenizer, model
......@@ -115,7 +118,7 @@ def main(
        temperature=temperature,
        top_p=top_p,
    )
    model.generate(torch.tensor([[1]]), warmup_config)
    model.generate(torch.tensor([[1]], device=local_rank), warmup_config)
    # print("READY ...")
    _on_master = local_rank == 0
......@@ -154,7 +157,7 @@ def main(
            prompt = Decorator.decorate(prompt)
            ids = tokenizer.encode(prompt, return_tensors='pt')
            model.generate(ids,
            model.generate(ids.cuda(local_rank),
                           gen_config,
                           streamer=streamer,
                           stopping_criteria=stop_criteria)
......
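The hunks above implement the commit's "support torch<2" point: `torch.set_default_device` only exists in PyTorch 2.x, so on older releases the inputs have to be moved onto the target GPU explicitly. A hedged sketch of that pattern, with `local_rank` as a placeholder for a single-GPU run:

```python
import torch

local_rank = 0  # placeholder rank for a single-GPU run

# torch.set_default_device was introduced in PyTorch 2.0, hence the version
# guard in the diff above; older releases simply skip it.
if torch.__version__ >= '2':
    torch.set_default_device(local_rank)

# Passing the device explicitly (or calling .cuda(local_rank)) keeps inputs
# on the same GPU as the model regardless of the PyTorch version.
warmup_ids = torch.tensor([[1]], device=local_rank)
```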