Unverified Commit a6ac981d authored by WRH's avatar WRH Committed by GitHub

[Fix] Remaining Issues in #19 (#75)

* previous merged

* add chinese

* support torch<2

* add a docstring

* fix typo

* rename torch submodule

* rename to pytorch

* rename in readme
parent cfb3b75d
......@@ -127,7 +127,7 @@ For the deployment of other supported models, such as LLaMA, vicuna, you can fin
#### Single GPU
```shell
python3 -m lmdeploy.torch.chat $NAME_OR_PATH_TO_HF_MODEL\
python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
......@@ -137,7 +137,7 @@ python3 -m lmdeploy.torch.chat $NAME_OR_PATH_TO_HF_MODEL\
#### Tensor Parallel with DeepSpeed
```shell
deepspeed --module --num_gpus 2 lmdeploy.torch.chat \
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
$NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
......
......@@ -121,6 +121,29 @@ python3 lmdeploy.app {server_ip_addresss}:33337 internlm
For the deployment of other models, such as LLaMA and vicuna, please refer to [here](docs/zh_cn/serving.md)
### Inference with PyTorch
#### Single GPU
```shell
python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
--seed 0
```
#### Tensor Parallel with DeepSpeed
```shell
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
$NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperature 0.8 \
--top_p 0.95 \
--seed 0
```
## Quantized Deployment
In fp16 mode, kv_cache int8 quantization can be enabled, allowing a single GPU to serve more users.
......
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat with torch models."""
# Copyright (c) OpenMMLab. All rights reserved.
import torch


class LoadNoInit:
    """Initialize model without parameter initialization."""

    def __init__(self):
        # Keep references to the original initializers so they can be
        # restored when the context manager exits.
        self.constant_ = torch.nn.init.constant_
        self.zeros_ = torch.nn.init.zeros_
        self.ones_ = torch.nn.init.ones_
        self.uniform_ = torch.nn.init.uniform_
        self.normal_ = torch.nn.init.normal_
        self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_
        self.kaiming_normal_ = torch.nn.init.kaiming_normal_

    def __enter__(self, *args, **kwargs):
        # Replace the initializers with no-ops so that building the model
        # skeleton skips random initialization that the checkpoint weights
        # would overwrite anyway.
        torch.nn.init.constant_ = lambda *args, **kwargs: None
        torch.nn.init.zeros_ = lambda *args, **kwargs: None
        torch.nn.init.ones_ = lambda *args, **kwargs: None
        torch.nn.init.uniform_ = lambda *args, **kwargs: None
        torch.nn.init.normal_ = lambda *args, **kwargs: None
        torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None
        torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None

    def __exit__(self, *args, **kwargs):
        # Restore the original initializers on exit.
        torch.nn.init.constant_ = self.constant_
        torch.nn.init.zeros_ = self.zeros_
        torch.nn.init.ones_ = self.ones_
        torch.nn.init.uniform_ = self.uniform_
        torch.nn.init.normal_ = self.normal_
        torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_
        torch.nn.init.kaiming_normal_ = self.kaiming_normal_
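A minimal usage sketch of this context manager, mirroring how the hunks below wire it into `init_model`; the import path `lmdeploy.pytorch.accel` follows the rename in this commit, and the model id is only a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM

from lmdeploy.pytorch.accel import LoadNoInit  # assumed import path after the rename

# Inside the context, torch.nn.init.* are no-ops, so the freshly built module
# skeleton is not randomly initialized before the checkpoint weights replace it.
with LoadNoInit():
    model = AutoModelForCausalLM.from_pretrained(
        'internlm/internlm-chat-7b',  # placeholder model id
        torch_dtype=torch.float16,
        trust_remote_code=True)
```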
......@@ -17,6 +17,7 @@ try:
    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              GenerationConfig)
    from .accel import LoadNoInit
    from .utils import get_utils
    _is_transformers_available = True
......@@ -51,10 +52,14 @@ def init_model(
                                              use_fast=use_fast_tokenizer,
                                              trust_remote_code=True)
    torch.set_default_device(local_rank)
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 torch_dtype=torch.float16,
                                                 trust_remote_code=True)
    if torch.__version__ >= '2':
        torch.set_default_device(local_rank)
    with LoadNoInit():
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     torch_dtype=torch.float16,
                                                     trust_remote_code=True)
    model = model.cuda(local_rank)
    if not _is_deepspeed_available:
        warnings.warn('deepspeed is not installed, ',
......@@ -69,8 +74,6 @@ def init_model(
        max_out_tokens=2048,
    )
    # print(f"model is loaded on device {model.device}")
    return tokenizer, model
......@@ -115,7 +118,7 @@ def main(
        temperature=temperature,
        top_p=top_p,
    )
    model.generate(torch.tensor([[1]]), warmup_config)
    model.generate(torch.tensor([[1]], device=local_rank), warmup_config)
    # print("READY ...")
    _on_master = local_rank == 0
......@@ -154,7 +157,7 @@ def main(
            prompt = Decorator.decorate(prompt)
            ids = tokenizer.encode(prompt, return_tensors='pt')
            model.generate(ids,
            model.generate(ids.cuda(local_rank),
                           gen_config,
                           streamer=streamer,
                           stopping_criteria=stop_criteria)
......
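The hunks above implement the commit's "support torch<2" point: `torch.set_default_device` only exists in PyTorch 2.x, so on older releases the inputs have to be moved onto the target GPU explicitly. A hedged sketch of that pattern, with `local_rank` as a placeholder for a single-GPU run:

```python
import torch

local_rank = 0  # placeholder rank for a single-GPU run

# torch.set_default_device was introduced in PyTorch 2.0, hence the version
# guard in the diff above; older releases simply skip it.
if torch.__version__ >= '2':
    torch.set_default_device(local_rank)

# Passing the device explicitly (or calling .cuda(local_rank)) keeps inputs
# on the same GPU as the model regardless of the PyTorch version.
warmup_ids = torch.tensor([[1]], device=local_rank)
```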