"backend/vscode:/vscode.git/clone" did not exist on "4c490132ba77e78433d3b7b2474b69b07bf60eb8"
Unverified commit 3d328d61, authored by PanZezhong1725, committed by GitHub
Browse files

issue/92 添加InferEngine,支持多线程推理

parent 0794f307
...@@ -13,6 +13,7 @@ class AutoLlamaModel: ...@@ -13,6 +13,7 @@ class AutoLlamaModel:
device: infinicore.device, device: infinicore.device,
dtype=infinicore.dtype, dtype=infinicore.dtype,
backend="python", backend="python",
**kwargs,
): ):
if backend == "python": if backend == "python":
from . import modeling_llama from . import modeling_llama
...@@ -21,6 +22,7 @@ class AutoLlamaModel: ...@@ -21,6 +22,7 @@ class AutoLlamaModel:
model_path, model_path,
device=device, device=device,
dtype=dtype, dtype=dtype,
**kwargs,
) )
elif backend == "cpp": elif backend == "cpp":
...@@ -30,6 +32,7 @@ class AutoLlamaModel: ...@@ -30,6 +32,7 @@ class AutoLlamaModel:
model_path, model_path,
device=device, device=device,
dtype=dtype, dtype=dtype,
**kwargs,
) )
raise KeyError("invalid backend") raise KeyError("invalid backend")
...@@ -2,6 +2,7 @@ from ....generation.utils import GenerationMixin ...@@ -2,6 +2,7 @@ from ....generation.utils import GenerationMixin
import infinicore import infinicore
from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig
from infinilm.lib import _infinilm from infinilm.lib import _infinilm
from infinilm.distributed import DistConfig
import json import json
import os import os
from typing import Optional, Union from typing import Optional, Union
...@@ -85,7 +86,13 @@ class LlamaConfig: ...@@ -85,7 +86,13 @@ class LlamaConfig:
class LlamaForCausalLM(GenerationMixin): class LlamaForCausalLM(GenerationMixin):
"""Llama model for causal language modeling""" """Llama model for causal language modeling"""
def __init__(self, config, device=None, dtype=None): def __init__(
self,
config,
device=None,
dtype=None,
distributed_config=DistConfig(1),
):
""" """
Create LlamaForCausalLM Create LlamaForCausalLM
...@@ -96,10 +103,7 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -96,10 +103,7 @@ class LlamaForCausalLM(GenerationMixin):
""" """
super().__init__() super().__init__()
if isinstance(config, dict): self.config = config
config = LlamaConfig(**config)
elif not isinstance(config, LlamaConfig):
config = LlamaConfig(**config)
if device is None: if device is None:
device = infinicore.device() device = infinicore.device()
...@@ -107,8 +111,11 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -107,8 +111,11 @@ class LlamaForCausalLM(GenerationMixin):
self.use_cache = False self.use_cache = False
self._device = device self._device = device
self._model = _infinilm.LlamaForCausalLM( # self._model = _infinilm.LlamaForCausalLM(
config._underlying, device._underlying, dtype # config._underlying, device._underlying, dtype
# )
self._model = _infinilm.InferEngine(
config._underlying, distributed_config._underlying, device._underlying.type
) )
def state_dict(self): def state_dict(self):
...@@ -122,7 +129,9 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -122,7 +129,9 @@ class LlamaForCausalLM(GenerationMixin):
Args: Args:
state_dict: Dictionary mapping parameter names to InfiniCore tensors, numpy arrays, or torch tensors state_dict: Dictionary mapping parameter names to InfiniCore tensors, numpy arrays, or torch tensors
""" """
self._model.load_state_dict(state_dict, self._device._underlying) # self._model.load_state_dict(state_dict, self._device._underlying)
for name, param in state_dict.items():
self._model.load_param(name, param._underlying)
def get_parameter(self, name): def get_parameter(self, name):
""" """
...@@ -136,15 +145,21 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -136,15 +145,21 @@ class LlamaForCausalLM(GenerationMixin):
""" """
return self._model.get_parameter(name) return self._model.get_parameter(name)
@property # @property
def config(self): # def config(self):
"""Get model configuration""" # """Get model configuration"""
return self._model.config() # return self._model.config()
def forward(self, input_ids, position_ids, *args, **kwargs): def forward(self, input_ids, position_ids, *args, **kwargs):
kv_caches = None kv_caches = None
# return infinicore.Tensor(
# self._model.forward(input_ids, position_ids, kv_caches)
# )
return infinicore.Tensor( return infinicore.Tensor(
self._model.forward(input_ids, position_ids, kv_caches) self._model.generate(
input_ids._underlying,
position_ids._underlying,
)
) )
def __call__(self, input_ids, position_ids, *args, **kwargs): def __call__(self, input_ids, position_ids, *args, **kwargs):
...@@ -156,6 +171,7 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -156,6 +171,7 @@ class LlamaForCausalLM(GenerationMixin):
model_path: Union[str, os.PathLike], model_path: Union[str, os.PathLike],
device: Optional[infinicore.device] = None, device: Optional[infinicore.device] = None,
dtype: Optional[infinicore.dtype] = None, dtype: Optional[infinicore.dtype] = None,
**kwargs,
): ):
""" """
Load a pretrained LlamaForCausalLM model from a directory. Load a pretrained LlamaForCausalLM model from a directory.
...@@ -176,4 +192,4 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -176,4 +192,4 @@ class LlamaForCausalLM(GenerationMixin):
config_dict = json.load(f) config_dict = json.load(f)
config = LlamaConfig(config_dict) config = LlamaConfig(config_dict)
return cls(config, device=device, dtype=dtype) return cls(config, device=device, dtype=dtype, **kwargs)
...@@ -17,7 +17,6 @@ import json ...@@ -17,7 +17,6 @@ import json
import os import os
from typing import Optional, Union from typing import Optional, Union
from transformers.utils import logging
import infinicore import infinicore
...@@ -25,8 +24,6 @@ from ...cache_utils import Cache, DynamicCache ...@@ -25,8 +24,6 @@ from ...cache_utils import Cache, DynamicCache
from ...generation.utils import GenerationMixin from ...generation.utils import GenerationMixin
from .configuration_llama import LlamaConfig from .configuration_llama import LlamaConfig
logger = logging.get_logger(__name__)
def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int): def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
total_seq_len, num_heads, head_dim = keys.shape total_seq_len, num_heads, head_dim = keys.shape
...@@ -399,6 +396,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin): ...@@ -399,6 +396,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
bias=False, bias=False,
**kwargs, **kwargs,
) )
self.device = kwargs.get("device", infinicore.device("cpu"))
def forward( def forward(
self, self,
...@@ -410,7 +408,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin): ...@@ -410,7 +408,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
): ):
last_token = self.model( last_token = self.model(
input_ids, input_ids,
position_ids, position_ids.to(self.device),
past_key_values=past_key_values, past_key_values=past_key_values,
use_cache=use_cache, use_cache=use_cache,
**kwargs, **kwargs,
......
...@@ -37,7 +37,7 @@ setup( ...@@ -37,7 +37,7 @@ setup(
version="0.1.0", version="0.1.0",
description="InfiniLM model implementations", description="InfiniLM model implementations",
package_dir={"": "python"}, package_dir={"": "python"},
packages=["infinilm", "infinilm.models", "infinilm.lib"], packages=["infinilm", "infinilm.models", "infinilm.lib", "infinilm.distributed"],
cmdclass={ cmdclass={
"build": Build, "build": Build,
"develop": Develop, "develop": Develop,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment