Unverified Commit 3d328d61 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

issue/92 Add InferEngine with multi-threaded inference support (添加InferEngine,支持多线程推理)

parent 0794f307
......@@ -13,6 +13,7 @@ class AutoLlamaModel:
device: infinicore.device,
dtype=infinicore.dtype,
backend="python",
**kwargs,
):
if backend == "python":
from . import modeling_llama
......@@ -21,6 +22,7 @@ class AutoLlamaModel:
model_path,
device=device,
dtype=dtype,
**kwargs,
)
elif backend == "cpp":
......@@ -30,6 +32,7 @@ class AutoLlamaModel:
model_path,
device=device,
dtype=dtype,
**kwargs,
)
raise KeyError("invalid backend")
......@@ -2,6 +2,7 @@ from ....generation.utils import GenerationMixin
import infinicore
from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig
from infinilm.lib import _infinilm
from infinilm.distributed import DistConfig
import json
import os
from typing import Optional, Union
......@@ -85,7 +86,13 @@ class LlamaConfig:
class LlamaForCausalLM(GenerationMixin):
"""Llama model for causal language modeling"""
def __init__(self, config, device=None, dtype=None):
def __init__(
self,
config,
device=None,
dtype=None,
distributed_config=DistConfig(1),
):
"""
Create LlamaForCausalLM
......@@ -96,10 +103,7 @@ class LlamaForCausalLM(GenerationMixin):
"""
super().__init__()
if isinstance(config, dict):
config = LlamaConfig(**config)
elif not isinstance(config, LlamaConfig):
config = LlamaConfig(**config)
self.config = config
if device is None:
device = infinicore.device()
......@@ -107,8 +111,11 @@ class LlamaForCausalLM(GenerationMixin):
self.use_cache = False
self._device = device
self._model = _infinilm.LlamaForCausalLM(
config._underlying, device._underlying, dtype
# self._model = _infinilm.LlamaForCausalLM(
# config._underlying, device._underlying, dtype
# )
self._model = _infinilm.InferEngine(
config._underlying, distributed_config._underlying, device._underlying.type
)
def state_dict(self):
......@@ -122,7 +129,9 @@ class LlamaForCausalLM(GenerationMixin):
Args:
state_dict: Dictionary mapping parameter names to InfiniCore tensors, numpy arrays, or torch tensors
"""
self._model.load_state_dict(state_dict, self._device._underlying)
# self._model.load_state_dict(state_dict, self._device._underlying)
for name, param in state_dict.items():
self._model.load_param(name, param._underlying)
def get_parameter(self, name):
"""
......@@ -136,15 +145,21 @@ class LlamaForCausalLM(GenerationMixin):
"""
return self._model.get_parameter(name)
# NOTE(review): the read-only `config` property was removed in this change
# because `__init__` now assigns `self.config = config` directly; a
# setter-less property on the class would make that instance assignment
# raise AttributeError. Kept here commented out for reference only.
# @property
# def config(self):
#     """Get model configuration"""
#     return self._model.config()
def forward(self, input_ids, position_ids, *args, **kwargs):
    """Run one inference step through the underlying InferEngine.

    Args:
        input_ids: infinicore tensor of token ids; its native handle is
            passed via ``_underlying``.
        position_ids: infinicore tensor of position ids, same convention.
        *args, **kwargs: accepted for GenerationMixin compatibility and
            ignored here.

    Returns:
        infinicore.Tensor wrapping the result of the engine's
        ``generate`` call.

    NOTE(review): kv-cache management moved into the C++ InferEngine with
    this change, so the old ``kv_caches`` argument (previously always
    ``None``) is no longer forwarded; the dead local and the stale
    commented-out ``self._model.forward(...)`` call were removed.
    """
    return infinicore.Tensor(
        self._model.generate(
            input_ids._underlying,
            position_ids._underlying,
        )
    )
def __call__(self, input_ids, position_ids, *args, **kwargs):
......@@ -156,6 +171,7 @@ class LlamaForCausalLM(GenerationMixin):
model_path: Union[str, os.PathLike],
device: Optional[infinicore.device] = None,
dtype: Optional[infinicore.dtype] = None,
**kwargs,
):
"""
Load a pretrained LlamaForCausalLM model from a directory.
......@@ -176,4 +192,4 @@ class LlamaForCausalLM(GenerationMixin):
config_dict = json.load(f)
config = LlamaConfig(config_dict)
return cls(config, device=device, dtype=dtype)
return cls(config, device=device, dtype=dtype, **kwargs)
......@@ -17,7 +17,6 @@ import json
import os
from typing import Optional, Union
from transformers.utils import logging
import infinicore
......@@ -25,8 +24,6 @@ from ...cache_utils import Cache, DynamicCache
from ...generation.utils import GenerationMixin
from .configuration_llama import LlamaConfig
logger = logging.get_logger(__name__)
def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
total_seq_len, num_heads, head_dim = keys.shape
......@@ -399,6 +396,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
bias=False,
**kwargs,
)
self.device = kwargs.get("device", infinicore.device("cpu"))
def forward(
self,
......@@ -410,7 +408,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
):
last_token = self.model(
input_ids,
position_ids,
position_ids.to(self.device),
past_key_values=past_key_values,
use_cache=use_cache,
**kwargs,
......
......@@ -37,7 +37,7 @@ setup(
version="0.1.0",
description="InfiniLM model implementations",
package_dir={"": "python"},
packages=["infinilm", "infinilm.models", "infinilm.lib"],
packages=["infinilm", "infinilm.models", "infinilm.lib", "infinilm.distributed"],
cmdclass={
"build": Build,
"develop": Develop,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment