Unverified Commit 079bf3cb authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit

* [misc] run pre-commit

* [misc] remove useless configuration files

* [misc] ignore cuda for clang-format
parent 3c6b831c
...@@ -7,9 +7,9 @@ from coati.experience_maker.base import Experience ...@@ -7,9 +7,9 @@ from coati.experience_maker.base import Experience
class ExperienceBuffer(ABC): class ExperienceBuffer(ABC):
"""Experience buffer base class. It stores experience. """Experience buffer base class. It stores experience.
Args: Args:
sample_batch_size (int): Batch size when sampling. sample_batch_size (int): Batch size when sampling.
limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0. limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
""" """
def __init__(self, sample_batch_size: int, limit: int = 0) -> None: def __init__(self, sample_batch_size: int, limit: int = 0) -> None:
......
...@@ -11,23 +11,23 @@ from .utils import BufferItem, make_experience_batch, split_experience_batch ...@@ -11,23 +11,23 @@ from .utils import BufferItem, make_experience_batch, split_experience_batch
class NaiveExperienceBuffer(ExperienceBuffer): class NaiveExperienceBuffer(ExperienceBuffer):
"""Naive experience buffer class. It stores experience. """Naive experience buffer class. It stores experience.
Args: Args:
sample_batch_size (int): Batch size when sampling. sample_batch_size (int): Batch size when sampling.
limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0. limit (int, optional): Limit of number of experience samples. A number <= 0 means unlimited. Defaults to 0.
cpu_offload (bool, optional): Whether to offload experience to cpu when sampling. Defaults to True. cpu_offload (bool, optional): Whether to offload experience to cpu when sampling. Defaults to True.
""" """
def __init__(self, sample_batch_size: int, limit: int = 0, cpu_offload: bool = True) -> None: def __init__(self, sample_batch_size: int, limit: int = 0, cpu_offload: bool = True) -> None:
super().__init__(sample_batch_size, limit) super().__init__(sample_batch_size, limit)
self.cpu_offload = cpu_offload self.cpu_offload = cpu_offload
self.target_device = torch.device(f'cuda:{torch.cuda.current_device()}') self.target_device = torch.device(f"cuda:{torch.cuda.current_device()}")
# TODO(ver217): add prefetch # TODO(ver217): add prefetch
self.items: List[BufferItem] = [] self.items: List[BufferItem] = []
@torch.no_grad() @torch.no_grad()
def append(self, experience: Experience) -> None: def append(self, experience: Experience) -> None:
if self.cpu_offload: if self.cpu_offload:
experience.to_device(torch.device('cpu')) experience.to_device(torch.device("cpu"))
items = split_experience_batch(experience) items = split_experience_batch(experience)
self.items.extend(items) self.items.extend(items)
if self.limit > 0: if self.limit > 0:
......
...@@ -21,6 +21,7 @@ class BufferItem: ...@@ -21,6 +21,7 @@ class BufferItem:
"A" is the number of actions. "A" is the number of actions.
""" """
sequences: torch.Tensor sequences: torch.Tensor
action_log_probs: torch.Tensor action_log_probs: torch.Tensor
values: torch.Tensor values: torch.Tensor
...@@ -33,8 +34,7 @@ class BufferItem: ...@@ -33,8 +34,7 @@ class BufferItem:
def split_experience_batch(experience: Experience) -> List[BufferItem]: def split_experience_batch(experience: Experience) -> List[BufferItem]:
batch_size = experience.sequences.size(0) batch_size = experience.sequences.size(0)
batch_kwargs = [{} for _ in range(batch_size)] batch_kwargs = [{} for _ in range(batch_size)]
keys = ('sequences', 'action_log_probs', 'values', keys = ("sequences", "action_log_probs", "values", "reward", "advantages", "attention_mask", "action_mask")
'reward', 'advantages', 'attention_mask', 'action_mask')
for key in keys: for key in keys:
value = getattr(experience, key) value = getattr(experience, key)
if isinstance(value, torch.Tensor): if isinstance(value, torch.Tensor):
...@@ -49,22 +49,21 @@ def split_experience_batch(experience: Experience) -> List[BufferItem]: ...@@ -49,22 +49,21 @@ def split_experience_batch(experience: Experience) -> List[BufferItem]:
return items return items
def _zero_pad_sequences(sequences: List[torch.Tensor], side: str = 'left') -> torch.Tensor: def _zero_pad_sequences(sequences: List[torch.Tensor], side: str = "left") -> torch.Tensor:
assert side in ('left', 'right') assert side in ("left", "right")
max_len = max(seq.size(0) for seq in sequences) max_len = max(seq.size(0) for seq in sequences)
padded_sequences = [] padded_sequences = []
for seq in sequences: for seq in sequences:
pad_len = max_len - seq.size(0) pad_len = max_len - seq.size(0)
padding = (pad_len, 0) if side == 'left' else (0, pad_len) padding = (pad_len, 0) if side == "left" else (0, pad_len)
padded_sequences.append(F.pad(seq, padding)) padded_sequences.append(F.pad(seq, padding))
return torch.stack(padded_sequences, dim=0) return torch.stack(padded_sequences, dim=0)
def make_experience_batch(items: List[BufferItem]) -> Experience: def make_experience_batch(items: List[BufferItem]) -> Experience:
kwargs = {} kwargs = {}
to_pad_keys = set(('action_log_probs', 'action_mask')) to_pad_keys = set(("action_log_probs", "action_mask"))
keys = ('sequences', 'action_log_probs', 'values', keys = ("sequences", "action_log_probs", "values", "reward", "advantages", "attention_mask", "action_mask")
'reward', 'advantages', 'attention_mask', 'action_mask')
for key in keys: for key in keys:
vals = [getattr(item, key) for item in items] vals = [getattr(item, key) for item in items]
if key in to_pad_keys: if key in to_pad_keys:
......
from .base import Experience, ExperienceMaker from .base import Experience, ExperienceMaker
from .naive import NaiveExperienceMaker from .naive import NaiveExperienceMaker
__all__ = ['Experience', 'ExperienceMaker', 'NaiveExperienceMaker'] __all__ = ["Experience", "ExperienceMaker", "NaiveExperienceMaker"]
...@@ -24,6 +24,7 @@ class Experience: ...@@ -24,6 +24,7 @@ class Experience:
"A" is the number of actions. "A" is the number of actions.
""" """
sequences: torch.Tensor sequences: torch.Tensor
action_log_probs: torch.Tensor action_log_probs: torch.Tensor
values: torch.Tensor values: torch.Tensor
...@@ -58,13 +59,9 @@ class Experience: ...@@ -58,13 +59,9 @@ class Experience:
class ExperienceMaker(ABC): class ExperienceMaker(ABC):
def __init__(
def __init__(self, self, actor: Actor, critic: nn.Module, reward_model: nn.Module, initial_model: Actor, kl_coef: float = 0.1
actor: Actor, ) -> None:
critic: nn.Module,
reward_model: nn.Module,
initial_model: Actor,
kl_coef: float = 0.1) -> None:
super().__init__() super().__init__()
self.actor = actor self.actor = actor
self.critic = critic self.critic = critic
......
...@@ -23,22 +23,21 @@ class NaiveExperienceMaker(ExperienceMaker): ...@@ -23,22 +23,21 @@ class NaiveExperienceMaker(ExperienceMaker):
# calculate auxiliary tensors # calculate auxiliary tensors
attention_mask = None attention_mask = None
pad_token_id = generate_kwargs.get('pad_token_id', None) pad_token_id = generate_kwargs.get("pad_token_id", None)
if pad_token_id is not None: if pad_token_id is not None:
attention_mask = sequences.not_equal(pad_token_id)\ attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
.to(dtype=torch.long, device=sequences.device)
input_len = input_ids.size(1) input_len = input_ids.size(1)
eos_token_id = generate_kwargs.get('eos_token_id', None) eos_token_id = generate_kwargs.get("eos_token_id", None)
if eos_token_id is None: if eos_token_id is None:
action_mask = torch.ones_like(sequences, dtype=torch.bool) action_mask = torch.ones_like(sequences, dtype=torch.bool)
else: else:
# left padding may be applied, only mask action # left padding may be applied, only mask action
action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0 action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
action_mask = F.pad(action_mask, (1 + input_len, -1), value=True) # include eos token and input action_mask = F.pad(action_mask, (1 + input_len, -1), value=True) # include eos token and input
action_mask[:, :input_len] = False action_mask[:, :input_len] = False
action_mask = action_mask[:, 1:] action_mask = action_mask[:, 1:]
action_mask = action_mask[:, -(sequences.size(1) - input_len):] action_mask = action_mask[:, -(sequences.size(1) - input_len) :]
num_actions = action_mask.size(1) num_actions = action_mask.size(1)
actor_output = self.actor(sequences, attention_mask) actor_output = self.actor(sequences, attention_mask)
......
from .wrapper import convert_to_xformer_model, recover_from_xformer_model from .wrapper import convert_to_xformer_model, recover_from_xformer_model
__all__ = [ __all__ = [
'convert_to_xformer_model', "convert_to_xformer_model",
'recover_from_xformer_model', "recover_from_xformer_model",
] ]
...@@ -21,11 +21,12 @@ class XOPTAttention(OPTAttention): ...@@ -21,11 +21,12 @@ class XOPTAttention(OPTAttention):
output_attentions: bool = False, output_attentions: bool = False,
) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]:
if not self.training: if not self.training:
return super().forward(hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, return super().forward(
output_attentions) hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, output_attentions
)
"""Input shape: Batch x Time x Channel""" """Input shape: Batch x Time x Channel"""
assert layer_head_mask is None, 'Xformers attention does not support layer_head_mask' assert layer_head_mask is None, "Xformers attention does not support layer_head_mask"
assert not output_attentions, 'Xformers attention does not support output_attentions' assert not output_attentions, "Xformers attention does not support output_attentions"
# if key_value_states are provided this layer is used as a cross-attention layer # if key_value_states are provided this layer is used as a cross-attention layer
# for the decoder # for the decoder
...@@ -69,12 +70,14 @@ class XOPTAttention(OPTAttention): ...@@ -69,12 +70,14 @@ class XOPTAttention(OPTAttention):
key_states = key_states.transpose(1, 2) key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2) value_states = value_states.transpose(1, 2)
attn_output = xops.memory_efficient_attention(query_states, attn_output = xops.memory_efficient_attention(
key_states, query_states,
value_states, key_states,
attn_bias=xops.LowerTriangularMask(), value_states,
p=self.dropout if self.training else 0.0, attn_bias=xops.LowerTriangularMask(),
scale=self.scaling) p=self.dropout if self.training else 0.0,
scale=self.scaling,
)
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
# partitioned across GPUs when using tensor-parallelism. # partitioned across GPUs when using tensor-parallelism.
......
...@@ -3,6 +3,13 @@ from .lora import LoRAModule, convert_to_lora_module ...@@ -3,6 +3,13 @@ from .lora import LoRAModule, convert_to_lora_module
from .loss import LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss from .loss import LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
__all__ = [ __all__ = [
'Actor', 'Critic', 'RewardModel', 'PolicyLoss', 'ValueLoss', 'LogSigLoss', 'LogExpLoss', "Actor",
'LoRAModule', 'convert_to_lora_module' "Critic",
"RewardModel",
"PolicyLoss",
"ValueLoss",
"LogSigLoss",
"LogExpLoss",
"LoRAModule",
"convert_to_lora_module",
] ]
...@@ -9,7 +9,7 @@ from .reward_model import RewardModel ...@@ -9,7 +9,7 @@ from .reward_model import RewardModel
def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module: def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module:
"""Get the base model of our wrapper classes. """Get the base model of our wrapper classes.
For Actor, Critic and RewardModel, return ``model.model``, For Actor, Critic and RewardModel, return ``model.model``,
it's usually a ``transformers.PreTrainedModel``. it's usually a ``transformers.PreTrainedModel``.
Args: Args:
...@@ -18,9 +18,10 @@ def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module: ...@@ -18,9 +18,10 @@ def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module:
Returns: Returns:
nn.Module: the base model nn.Module: the base model
""" """
assert isinstance(model, (Actor, Critic, RewardModel)), \ assert isinstance(
f'Expect Actor, Critic or RewardModel, got {type(model)}, use unwrap_model first.' model, (Actor, Critic, RewardModel)
), f"Expect Actor, Critic or RewardModel, got {type(model)}, use unwrap_model first."
return model.model return model.model
__all__ = ['Actor', 'Critic', 'RewardModel', 'get_base_model'] __all__ = ["Actor", "Critic", "RewardModel", "get_base_model"]
...@@ -16,18 +16,17 @@ class Actor(LoRAModule): ...@@ -16,18 +16,17 @@ class Actor(LoRAModule):
lora_train_bias (str): LoRA bias training mode. lora_train_bias (str): LoRA bias training mode.
""" """
def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = 'none') -> None: def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias) super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model self.model = model
self.convert_to_lora() self.convert_to_lora()
def forward( def forward(
self, self,
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
attention_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
**model_kwargs, # HACK: `generate` method may pass more kwargs **model_kwargs, # HACK: `generate` method may pass more kwargs
) -> torch.Tensor: ) -> torch.Tensor:
"""Returns model output. """Returns model output."""
"""
output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs) output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
return output return output
...@@ -23,22 +23,23 @@ class Critic(LoRAModule): ...@@ -23,22 +23,23 @@ class Critic(LoRAModule):
model: nn.Module, model: nn.Module,
value_head: nn.Module, value_head: nn.Module,
lora_rank: int = 0, lora_rank: int = 0,
lora_train_bias: str = 'none', lora_train_bias: str = "none",
use_action_mask: bool = False, use_action_mask: bool = False,
) -> None: ) -> None:
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias) super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model self.model = model
self.value_head = value_head self.value_head = value_head
self.use_action_mask = use_action_mask self.use_action_mask = use_action_mask
self.convert_to_lora() self.convert_to_lora()
def forward(self, def forward(
sequences: torch.LongTensor, self,
action_mask: Optional[torch.Tensor] = None, sequences: torch.LongTensor,
attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor: action_mask: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
outputs = self.model(sequences, attention_mask=attention_mask) outputs = self.model(sequences, attention_mask=attention_mask)
last_hidden_states = outputs['last_hidden_state'] last_hidden_states = outputs["last_hidden_state"]
values = self.value_head(last_hidden_states).squeeze(-1) values = self.value_head(last_hidden_states).squeeze(-1)
......
...@@ -17,11 +17,13 @@ class RewardModel(LoRAModule): ...@@ -17,11 +17,13 @@ class RewardModel(LoRAModule):
lora_train_bias (str): LoRA bias training mode. lora_train_bias (str): LoRA bias training mode.
""" """
def __init__(self, def __init__(
model: nn.Module, self,
value_head: Optional[nn.Module] = None, model: nn.Module,
lora_rank: int = 0, value_head: Optional[nn.Module] = None,
lora_train_bias: str = 'none') -> None: lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias) super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model self.model = model
self.convert_to_lora() self.convert_to_lora()
...@@ -35,7 +37,7 @@ class RewardModel(LoRAModule): ...@@ -35,7 +37,7 @@ class RewardModel(LoRAModule):
def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor: def forward(self, sequences: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
outputs = self.model(sequences, attention_mask=attention_mask) outputs = self.model(sequences, attention_mask=attention_mask)
last_hidden_states = outputs['last_hidden_state'] last_hidden_states = outputs["last_hidden_state"]
values = self.value_head(last_hidden_states)[:, :-1] values = self.value_head(last_hidden_states)[:, :-1]
value = values.mean(dim=1).squeeze(1) # ensure shape is (B) value = values.mean(dim=1).squeeze(1) # ensure shape is (B)
return value return value
...@@ -2,4 +2,4 @@ from .bloom_actor import BLOOMActor ...@@ -2,4 +2,4 @@ from .bloom_actor import BLOOMActor
from .bloom_critic import BLOOMCritic from .bloom_critic import BLOOMCritic
from .bloom_rm import BLOOMRM from .bloom_rm import BLOOMRM
__all__ = ['BLOOMActor', 'BLOOMCritic', 'BLOOMRM'] __all__ = ["BLOOMActor", "BLOOMCritic", "BLOOMRM"]
from typing import Optional from typing import Optional
import torch from transformers import BloomConfig, BloomForCausalLM
from transformers import BloomConfig, BloomForCausalLM, BloomModel
from ..base import Actor from ..base import Actor
...@@ -18,12 +17,14 @@ class BLOOMActor(Actor): ...@@ -18,12 +17,14 @@ class BLOOMActor(Actor):
lora_train_bias (str): LoRA bias training mode. lora_train_bias (str): LoRA bias training mode.
""" """
def __init__(self, def __init__(
pretrained: str = None, self,
config: Optional[BloomConfig] = None, pretrained: str = None,
checkpoint: bool = False, config: Optional[BloomConfig] = None,
lora_rank: int = 0, checkpoint: bool = False,
lora_train_bias: str = 'none') -> None: lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None: if pretrained is not None:
model = BloomForCausalLM.from_pretrained(pretrained) model = BloomForCausalLM.from_pretrained(pretrained)
elif config is not None: elif config is not None:
......
from typing import Optional from typing import Optional
import torch
import torch.nn as nn import torch.nn as nn
from transformers import BloomConfig, BloomForCausalLM, BloomModel from transformers import BloomConfig, BloomModel
from ..base import Critic from ..base import Critic
...@@ -18,12 +17,14 @@ class BLOOMCritic(Critic): ...@@ -18,12 +17,14 @@ class BLOOMCritic(Critic):
lora_train_bias (str): LoRA bias training mode. lora_train_bias (str): LoRA bias training mode.
""" """
def __init__(self, def __init__(
pretrained: str = None, self,
config: Optional[BloomConfig] = None, pretrained: str = None,
lora_rank: int = 0, config: Optional[BloomConfig] = None,
lora_train_bias: str = 'none', lora_rank: int = 0,
**kwargs) -> None: lora_train_bias: str = "none",
**kwargs,
) -> None:
if pretrained is not None: if pretrained is not None:
model = BloomModel.from_pretrained(pretrained) model = BloomModel.from_pretrained(pretrained)
elif config is not None: elif config is not None:
......
from typing import Optional from typing import Optional
import torch.nn as nn import torch.nn as nn
from transformers import BloomConfig, BloomForCausalLM, BloomModel from transformers import BloomConfig, BloomModel
from ..base import RewardModel from ..base import RewardModel
...@@ -17,11 +17,13 @@ class BLOOMRM(RewardModel): ...@@ -17,11 +17,13 @@ class BLOOMRM(RewardModel):
lora_train_bias (str): LoRA bias training mode. lora_train_bias (str): LoRA bias training mode.
""" """
def __init__(self, def __init__(
pretrained: str = None, self,
config: Optional[BloomConfig] = None, pretrained: str = None,
lora_rank: int = 0, config: Optional[BloomConfig] = None,
lora_train_bias: str = 'none') -> None: lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None: if pretrained is not None:
model = BloomModel.from_pretrained(pretrained) model = BloomModel.from_pretrained(pretrained)
elif config is not None: elif config is not None:
......
from .chatglm_actor import ChatGLMActor from .chatglm_actor import ChatGLMActor
__all__ = ['ChatGLMActor'] __all__ = ["ChatGLMActor"]
\ No newline at end of file
from typing import Optional from typing import Optional
import torch from ..base import Actor
from .configuration_chatglm import ChatGLMConfig from .configuration_chatglm import ChatGLMConfig
from .modeling_chatglm import ChatGLMForConditionalGeneration from .modeling_chatglm import ChatGLMForConditionalGeneration
from ..base import Actor
class ChatGLMActor(Actor): class ChatGLMActor(Actor):
""" """
...@@ -19,10 +17,9 @@ class ChatGLMActor(Actor): ...@@ -19,10 +17,9 @@ class ChatGLMActor(Actor):
do not support lora for now. do not support lora for now.
""" """
def __init__(self, def __init__(
pretrained: str = None, self, pretrained: str = None, config: Optional[ChatGLMConfig] = None, checkpoint: bool = False
config: Optional[ChatGLMConfig] = None, ) -> None:
checkpoint: bool = False) -> None:
if pretrained is not None: if pretrained is not None:
model = ChatGLMForConditionalGeneration.from_pretrained(pretrained) model = ChatGLMForConditionalGeneration.from_pretrained(pretrained)
elif config is not None: elif config is not None:
...@@ -31,4 +28,4 @@ class ChatGLMActor(Actor): ...@@ -31,4 +28,4 @@ class ChatGLMActor(Actor):
model = ChatGLMForConditionalGeneration(ChatGLMConfig()) model = ChatGLMForConditionalGeneration(ChatGLMConfig())
if checkpoint: if checkpoint:
model.gradient_checkpointing_enable() model.gradient_checkpointing_enable()
super().__init__(model, lora_rank=0, lora_train_bias='none') super().__init__(model, lora_rank=0, lora_train_bias="none")
...@@ -2,15 +2,14 @@ ...@@ -2,15 +2,14 @@
This code is copied from https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py This code is copied from https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py
""" """
"""Tokenization classes for ChatGLM.""" """Tokenization classes for ChatGLM."""
from typing import List, Optional, Union
import os import os
from typing import Dict, List, Optional, Union
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
from typing import Dict
import sentencepiece as spm
import numpy as np import numpy as np
import sentencepiece as spm
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy, logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
...@@ -52,11 +51,11 @@ class TextTokenizer: ...@@ -52,11 +51,11 @@ class TextTokenizer:
class SPTokenizer: class SPTokenizer:
def __init__( def __init__(
self, self,
vocab_file, vocab_file,
num_image_tokens=20000, num_image_tokens=20000,
max_blank_length=80, max_blank_length=80,
byte_fallback=True, byte_fallback=True,
): ):
assert vocab_file is not None assert vocab_file is not None
self.vocab_file = vocab_file self.vocab_file = vocab_file
...@@ -100,9 +99,7 @@ class SPTokenizer: ...@@ -100,9 +99,7 @@ class SPTokenizer:
text = self._encode_whitespaces(text, max_len=self.max_blank_length) text = self._encode_whitespaces(text, max_len=self.max_blank_length)
return text return text
def encode( def encode(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[int]:
self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
) -> List[int]:
""" """
@param text: Text to encode. @param text: Text to encode.
@param linebreak: Whether to encode newline (\n) in text. @param linebreak: Whether to encode newline (\n) in text.
...@@ -136,9 +133,7 @@ class SPTokenizer: ...@@ -136,9 +133,7 @@ class SPTokenizer:
text = self.postprocess(text) text = self.postprocess(text)
return text return text
def tokenize( def tokenize(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[str]:
self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
) -> List[str]:
""" """
@param text: Text to encode. @param text: Text to encode.
@param linebreak: Whether to encode newline (\n) in text. @param linebreak: Whether to encode newline (\n) in text.
...@@ -181,20 +176,20 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -181,20 +176,20 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
model_input_names = ["input_ids", "attention_mask", "position_ids"] model_input_names = ["input_ids", "attention_mask", "position_ids"]
def __init__( def __init__(
self, self,
vocab_file, vocab_file,
do_lower_case=False, do_lower_case=False,
remove_space=False, remove_space=False,
bos_token='<sop>', bos_token="<sop>",
eos_token='<eop>', eos_token="<eop>",
end_token='</s>', end_token="</s>",
mask_token='[MASK]', mask_token="[MASK]",
gmask_token='[gMASK]', gmask_token="[gMASK]",
padding_side="left", padding_side="left",
pad_token="<pad>", pad_token="<pad>",
unk_token="<unk>", unk_token="<unk>",
num_image_tokens=20000, num_image_tokens=20000,
**kwargs **kwargs,
) -> None: ) -> None:
super().__init__( super().__init__(
do_lower_case=do_lower_case, do_lower_case=do_lower_case,
...@@ -208,7 +203,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -208,7 +203,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
pad_token=pad_token, pad_token=pad_token,
unk_token=unk_token, unk_token=unk_token,
num_image_tokens=num_image_tokens, num_image_tokens=num_image_tokens,
**kwargs **kwargs,
) )
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
...@@ -243,11 +238,11 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -243,11 +238,11 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
@property @property
def vocab_size(self): def vocab_size(self):
""" Returns vocab size """ """Returns vocab size"""
return self.sp_tokenizer.num_tokens return self.sp_tokenizer.num_tokens
def get_vocab(self): def get_vocab(self):
""" Returns vocab as a dict """ """Returns vocab as a dict"""
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder) vocab.update(self.added_tokens_encoder)
return vocab return vocab
...@@ -264,7 +259,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -264,7 +259,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
return outputs return outputs
def _tokenize(self, text, **kwargs): def _tokenize(self, text, **kwargs):
""" Returns a tokenized string. """ """Returns a tokenized string."""
text = self.preprocess_text(text) text = self.preprocess_text(text)
seq = self.sp_tokenizer.tokenize(text) seq = self.sp_tokenizer.tokenize(text)
...@@ -274,11 +269,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -274,11 +269,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens: List[str]) -> str: def convert_tokens_to_string(self, tokens: List[str]) -> str:
return self.sp_tokenizer.decode_tokens(tokens) return self.sp_tokenizer.decode_tokens(tokens)
def _decode( def _decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
self,
token_ids: Union[int, List[int]],
**kwargs
) -> str:
if isinstance(token_ids, int): if isinstance(token_ids, int):
token_ids = [token_ids] token_ids = [token_ids]
if len(token_ids) == 0: if len(token_ids) == 0:
...@@ -288,7 +279,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -288,7 +279,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
return super()._decode(token_ids, **kwargs) return super()._decode(token_ids, **kwargs)
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """ """Converts a token (str) in an id using the vocab."""
return self.sp_tokenizer[token] return self.sp_tokenizer[token]
def _convert_id_to_token(self, index): def _convert_id_to_token(self, index):
...@@ -309,13 +300,11 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -309,13 +300,11 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
`Tuple(str)`: Paths to the files saved. `Tuple(str)`: Paths to the files saved.
""" """
if os.path.isdir(save_directory): if os.path.isdir(save_directory):
vocab_file = os.path.join( vocab_file = os.path.join(save_directory, self.vocab_files_names["vocab_file"])
save_directory, self.vocab_files_names["vocab_file"]
)
else: else:
vocab_file = save_directory vocab_file = save_directory
with open(self.vocab_file, 'rb') as fin: with open(self.vocab_file, "rb") as fin:
proto_str = fin.read() proto_str = fin.read()
with open(vocab_file, "wb") as writer: with open(vocab_file, "wb") as writer:
...@@ -324,7 +313,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -324,7 +313,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
return (vocab_file,) return (vocab_file,)
def build_inputs_with_special_tokens( def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
...@@ -343,19 +332,19 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -343,19 +332,19 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
""" """
gmask_id = self.sp_tokenizer[self.gmask_token] gmask_id = self.sp_tokenizer[self.gmask_token]
eos_id = self.sp_tokenizer[self.eos_token] self.sp_tokenizer[self.eos_token]
token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]] token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
if token_ids_1 is not None: if token_ids_1 is not None:
token_ids_0 = token_ids_0 + token_ids_1 token_ids_0 = token_ids_0 + token_ids_1
return token_ids_0 return token_ids_0
def _pad( def _pad(
self, self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None, max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None, pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None, return_attention_mask: Optional[bool] = None,
) -> dict: ) -> dict:
""" """
Pad encoded inputs (on left/right and up to predefined length or max length in the batch) Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
...@@ -421,17 +410,23 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -421,17 +410,23 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
mask_position = required_input.index(mask_token) mask_position = required_input.index(mask_token)
position_ids[context_length:] = mask_position position_ids[context_length:] = mask_position
block_position_ids = np.concatenate( block_position_ids = np.concatenate(
[np.zeros(context_length, dtype=np.int64), [
np.arange(1, seq_length - context_length + 1, dtype=np.int64)]) np.zeros(context_length, dtype=np.int64),
np.arange(1, seq_length - context_length + 1, dtype=np.int64),
]
)
encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0) encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
if needs_to_be_padded: if needs_to_be_padded:
difference = max_length - len(required_input) difference = max_length - len(required_input)
if "attention_mask" in encoded_inputs: if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"], encoded_inputs["attention_mask"] = np.pad(
pad_width=[(0, 0), (difference, 0), (difference, 0)], encoded_inputs["attention_mask"],
mode='constant', constant_values=True) pad_width=[(0, 0), (difference, 0), (difference, 0)],
mode="constant",
constant_values=True,
)
if "token_type_ids" in encoded_inputs: if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
"token_type_ids" "token_type_ids"
...@@ -439,8 +434,9 @@ class ChatGLMTokenizer(PreTrainedTokenizer): ...@@ -439,8 +434,9 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
if "special_tokens_mask" in encoded_inputs: if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
if "position_ids" in encoded_inputs: if "position_ids" in encoded_inputs:
encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"], encoded_inputs["position_ids"] = np.pad(
pad_width=[(0, 0), (difference, 0)]) encoded_inputs["position_ids"], pad_width=[(0, 0), (difference, 0)]
)
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
return encoded_inputs return encoded_inputs
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment