gaoqiong / lm-evaluation-harness · Commits

Commit db1f55ff, authored Jan 17, 2023 by haileyschoelkopf
Parent: 25780307

first step: add HF+accelerate implementations

Showing 2 changed files with 639 additions and 0 deletions (+639, -0)
lm_eval/models/__init__.py     +3    -0
lm_eval/models/huggingface.py  +636  -0
lm_eval/models/__init__.py  (view file @ db1f55ff)
 from . import gpt2
 from . import gpt3
+from . import huggingface
 from . import textsynth
 from . import dummy

 MODEL_REGISTRY = {
     "hf": gpt2.HFLM,
+    "hf-causal": huggingface.AutoCausalLM,
+    "hf-seq2seq": huggingface.AutoSeq2SeqLM,
     "gpt2": gpt2.GPT2LM,
     "gpt3": gpt3.GPT3LM,
     "textsynth": textsynth.TextSynthLM,
     ...
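
The two new registry keys route to the classes defined in huggingface.py below. As an illustrative sketch (not part of this commit; the model name is a placeholder), they can be looked up and instantiated like any other registered model:

# Illustrative usage sketch, not part of the committed diff.
from lm_eval.models import MODEL_REGISTRY

causal_cls = MODEL_REGISTRY["hf-causal"]     # huggingface.AutoCausalLM
seq2seq_cls = MODEL_REGISTRY["hf-seq2seq"]   # huggingface.AutoSeq2SeqLM

# Constructor arguments mirror HuggingFaceAutoLM.__init__ in huggingface.py;
# running this downloads weights from the HF Hub.
lm = causal_cls(pretrained="gpt2", batch_size=4, device="cuda")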
lm_eval/models/huggingface.py  (new file, 0 → 100644; view file @ db1f55ff)
import math

import torch
import torch.nn.functional as F
import transformers
from typing import List, Mapping, NewType, Optional, Tuple, Union
from tqdm import tqdm
from transformers import BatchEncoding

from lm_eval import utils
from lm_eval.base import BaseLM

TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding]

_DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.device]])
def _get_accelerate_args(
    device_map_option: Optional[str] = "auto",
    max_memory_per_gpu: Optional[Union[int, str]] = None,
    max_cpu_memory: Optional[Union[int, str]] = None,
    offload_folder: Optional[str] = "./offload",
) -> dict:
    """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
    max_memory = {}
    if max_memory_per_gpu is not None:
        max_memory_per_gpu_map = {
            device_idx: max_memory_per_gpu
            for device_idx in range(torch.cuda.device_count())
        }
        max_memory.update(max_memory_per_gpu_map)
    if max_cpu_memory is not None:
        max_memory["cpu"] = max_cpu_memory

    args = {}
    if max_memory:
        args["max_memory"] = max_memory
    args["device_map"] = device_map_option
    args["offload_folder"] = offload_folder
    return args
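
A minimal sketch of what this helper returns (illustrative, not part of the committed file), assuming a machine with two visible GPUs:

# Illustrative sketch only.
from lm_eval.models.huggingface import _get_accelerate_args

kwargs = _get_accelerate_args(
    device_map_option="auto",
    max_memory_per_gpu="20GB",
    max_cpu_memory="64GB",
    offload_folder="./offload",
)
# With 2 GPUs visible, `kwargs` is:
# {
#     "max_memory": {0: "20GB", 1: "20GB", "cpu": "64GB"},
#     "device_map": "auto",
#     "offload_folder": "./offload",
# }
# The HuggingFaceAutoLM constructor passes these through `_create_auto_model`
# into `AUTO_MODEL_CLASS.from_pretrained(...)` when `use_accelerate=True`.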
def _get_dtype(
    dtype: Union[str, torch.dtype], config: Optional[transformers.AutoConfig] = None
) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible."""
    if dtype is None and config is not None:
        _torch_dtype = config.torch_dtype
    elif isinstance(dtype, str) and dtype != "auto":
        # Convert `str` args torch dtype: `float16` -> `torch.float16`
        _torch_dtype = getattr(torch, dtype)
    else:
        _torch_dtype = dtype
    return _torch_dtype
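
The conversion rules above amount to the following (a quick illustrative check, not part of the committed file):

# Illustrative sketch only.
import torch
from lm_eval.models.huggingface import _get_dtype

assert _get_dtype("float16") is torch.float16        # str -> torch.dtype via getattr
assert _get_dtype("auto") == "auto"                  # "auto" is passed through unchanged
assert _get_dtype(torch.bfloat16) is torch.bfloat16  # torch.dtype passes through
# _get_dtype(None, config) falls back to config.torch_dtype when a config is given.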
class HuggingFaceAutoLM(BaseLM):
    AUTO_CONFIG_CLASS: transformers.AutoConfig = transformers.AutoConfig
    AUTO_TOKENIZER_CLASS: transformers.AutoTokenizer = transformers.AutoTokenizer
    AUTO_MODEL_CLASS: transformers.AutoModel = None

    # Default max sequence length setting for when no `max_length` is provided
    # or no max length config setting is found in the model or tokenizer.
    _DEFAULT_MAX_LENGTH: int = 2048
    def __init__(
        self,
        pretrained: str,
        tokenizer: Optional[str] = None,
        subfolder: Optional[str] = None,
        revision: Optional[str] = "main",
        batch_size: Optional[int] = 1,
        max_gen_toks: Optional[int] = 256,
        max_length: Optional[int] = None,
        add_special_tokens: Optional[bool] = None,
        use_accelerate: Optional[bool] = False,
        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[str] = "./offload",
        dtype: Optional[Union[str, torch.dtype]] = None,
        device: Optional[Union[int, str]] = "cuda",
    ):
        """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.

        Args:
            pretrained (str):
                The HuggingFace Hub model ID name or the path to a pre-trained
                model to load. This is effectively the `pretrained_model_name_or_path`
                argument of `from_pretrained` in the HuggingFace `transformers` API.
            add_special_tokens (bool, optional, defaults to True):
                Whether to add special tokens to the input sequences. If `None`, the
                default value will be set to `True` for seq2seq models (e.g. T5) and
                `False` for causal models.
                WARNING: Evaluating causal models with `add_special_tokens=True` is
                currently __not__ supported.

            > Large model loading `accelerate` arguments

            use_accelerate (bool, optional, defaults to False):
                If True, uses the `accelerate` library to load a large model across
                multiple devices.
            device_map_option (str, optional, defaults to "auto"):
                The device map option to use when loading the model with
                `accelerate`.
                Options:
                    "auto", "balanced", "balanced_low_0", "sequential"
                See the `accelerate` docs for more details on these options:
                https://huggingface.co/docs/accelerate/v0.12.0/en/usage_guides/big_modeling#designing-a-device-map
            max_memory_per_gpu (Union[int, str], optional, defaults to None):
                The maximum memory available for each GPU in bytes as `int` or in
                the format f"{significand}{unit_symbol}" where {unit_symbol} is
                any of ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in
                the "Parameters for big model inference" section of the following
                docs:
                https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/model#large-model-loading
            max_cpu_memory (Union[int, str], optional, defaults to None):
                The maximum available CPU RAM in bytes as `int` or in the format
                f"{significand}{unit_symbol}" where {unit_symbol} is any of
                ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in the
                "Parameters for big model inference" section of the following docs:
                https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/model#large-model-loading
            offload_folder (str, optional, defaults to "./offload"):
                The folder to offload weights into if `device_map` contains any
                "disk" value.
            dtype (Union[str, torch.dtype], optional, defaults to None):
                Converts the model weights to `dtype`, if specified. Strings get
                converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`).
                Use `dtype="auto"` to derive the type from the model's weights.
        """
        super().__init__()

        assert isinstance(pretrained, str)
        assert isinstance(device, str)
        assert isinstance(batch_size, int)

        if (
            add_special_tokens is not None
            and self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM
        ):
            # TODO: Support evaluating causal models with special tokens. Currently,
            # this is not possible because the `_loglikelihood_tokens()` method for
            # causal LMs makes a no-special-tokens assumption given that contexts
            # and labels/continuations are tokenized separately without special
            # tokens, concatenated, and then processed as inputs.
            assert (
                not add_special_tokens
            ), "Evaluating causal models with `add_special_tokens=True` is currently not supported."

        self._batch_size = batch_size  # TODO: Adaptive batch size
        self._max_gen_toks = max_gen_toks
        self._max_length = max_length
        self._config = self.AUTO_CONFIG_CLASS.from_pretrained(
            pretrained,
            revision=revision + ("/" + subfolder if subfolder is not None else ""),
        )

        self._add_special_tokens = add_special_tokens
        self.tokenizer = self._create_auto_tokenizer(
            pretrained=pretrained,
            revision=revision,
            subfolder=subfolder,
            tokenizer=tokenizer,
        )
        self.tokenizer.model_max_length = self.max_length

        accelerate_kwargs = {}
        if use_accelerate:
            accelerate_kwargs = _get_accelerate_args(
                device_map_option,
                max_memory_per_gpu,
                max_cpu_memory,
                offload_folder,
            )
        self.model = self._create_auto_model(
            pretrained=pretrained,
            revision=revision,
            subfolder=subfolder,
            torch_dtype=_get_dtype(dtype, self._config),
            **accelerate_kwargs,
        )
        self.model.eval()
        torch.set_grad_enabled(False)

        self._device = device
        if use_accelerate and "lm_head" in self.model.hf_device_map:
            # `accelerate` can place `lm_head` weights on a different device than
            # the user specified one so we force `self._device` to be the same as
            # `lm_head`'s.
            self._device = self.model.hf_device_map["lm_head"]
        if not use_accelerate:
            self.model.to(self._device)
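
As a usage sketch of the constructor and the `accelerate` arguments above (illustrative only, not part of the committed file; the checkpoint name is a placeholder and loading it downloads weights from the HF Hub):

# Illustrative sketch only.
from lm_eval.models.huggingface import AutoCausalLM

lm = AutoCausalLM(
    pretrained="EleutherAI/gpt-j-6B",  # placeholder Hub model ID
    use_accelerate=True,               # shard the model across available devices
    device_map_option="auto",
    max_memory_per_gpu="20GB",
    max_cpu_memory="64GB",
    offload_folder="./offload",
    dtype="float16",
    batch_size=4,
)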
    def _create_auto_model(
        self,
        *,
        pretrained: str,
        revision: str,
        subfolder: str,
        device_map: Optional[Union[str, _DeviceMapping]] = None,
        max_memory: Optional[dict] = None,
        offload_folder: Optional[str] = None,
        torch_dtype: Optional[Union[str, torch.dtype]] = None,
    ) -> transformers.AutoModel:
        """Returns a pre-trained pytorch model from a pre-trained model configuration."""
        model = self.AUTO_MODEL_CLASS.from_pretrained(
            pretrained,
            revision=revision + ("/" + subfolder if subfolder is not None else ""),
            device_map=device_map,
            max_memory=max_memory,
            offload_folder=offload_folder,
            torch_dtype=torch_dtype,
        )
        return model

    def _create_auto_tokenizer(
        self,
        *,
        pretrained: str,
        revision: str,
        subfolder: str,
        tokenizer: Optional[str] = None,
    ) -> transformers.PreTrainedTokenizer:
        """Returns a pre-trained tokenizer from a pre-trained tokenizer configuration."""
        tokenizer = self.AUTO_TOKENIZER_CLASS.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
            revision=revision + ("/" + subfolder if subfolder is not None else ""),
        )
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer
    @property
    def add_special_tokens(self) -> bool:
        """Whether to include special tokens in encoded text. This should be
        determined by whether or not the model was trained with special tokens.
        TODO: Remove these conditionals once HuggingFace supports a way to
        check whether or not an arbitrary model was trained with special tokens.
        """
        if self._add_special_tokens is not None:
            return self._add_special_tokens
        elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
            return False
        elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM:
            return True
        else:
            raise ValueError(
                "Could not determine `add_special_tokens` value from the model "
                "class. Set to `True` or `False` depending on whether the model "
                "was pre-trained with special tokens."
            )

    @property
    def eot_token(self) -> str:
        return self.tokenizer.eos_token

    @property
    def eot_token_id(self) -> int:
        return self.tokenizer.eos_token_id

    @property
    def max_gen_toks(self) -> int:
        return self._max_gen_toks

    @property
    def max_length(self) -> int:
        """Return the maximum sequence length of the model.
        NOTE: Different model configurations have different max sequence length
        attribute names.
            - n_positions: (CTRLConfig)
            - max_position_embeddings: (BartConfig, RoFormerConfig)
            - n_ctx: (GPT2Config)
        NOTE: For relative position encoded models you should specify the max
        sequence length of the model in the constructor via `max_length`.
        """
        if self._max_length is not None:
            return self._max_length
        # Try to get the sequence length from the model config.
        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
        for attr in seqlen_config_attrs:
            if hasattr(self._config, attr):
                return getattr(self._config, attr)
        if hasattr(self.tokenizer, "model_max_length"):
            return self.tokenizer.model_max_length
        return self._DEFAULT_MAX_LENGTH

    @property
    def batch_size(self) -> int:
        # TODO: Add adaptive batch size.
        return self._batch_size  # * gpus

    @property
    def device(self) -> Union[int, str, torch.device]:
        return self._device
    def tok_encode(self, string: str) -> TokenSequence:
        # TODO: Merge `tok_encode_batch` here.
        return self.tokenizer.encode(string, add_special_tokens=self.add_special_tokens)

    def tok_encode_batch(self, strings: List[str]) -> TokenSequence:
        return self.tokenizer(
            strings,
            padding=True,
            add_special_tokens=self.add_special_tokens,
            return_tensors="pt",
        )

    def tok_decode(self, tokens: torch.LongTensor) -> List[str]:
        return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)
    def greedy_until(self, requests: List[Tuple[str, dict]]) -> List[str]:
        def _collate(x):
            tokens = self.tok_encode(x[0])
            return len(tokens), x[0]

        results = []
        reorder = utils.Reorderer(requests, _collate)
        for chunk in utils.chunks(
            tqdm(reorder.get_reordered(), disable=False), self.batch_size
        ):
            context = [c[0] for c in chunk]
            request_args = chunk[0][1]
            stop_sequences = request_args["stop_sequences"]
            max_generation_length = request_args["max_generation_length"]
            num_fewshot = request_args["num_fewshot"]

            assert (
                isinstance(max_generation_length, int) or max_generation_length is None
            )
            assert isinstance(stop_sequences, list) or stop_sequences is None
            assert isinstance(num_fewshot, int) or num_fewshot is None

            # TODO: Find a better way to handle stop sequences for 0-shot.
            if stop_sequences is None or num_fewshot == 0:
                until = [self.eot_token]
            else:
                until = stop_sequences + [self.eot_token]

            if max_generation_length is None:
                max_tokens = self.max_gen_toks
            else:
                max_tokens = max_generation_length

            token_context = self.tok_encode_batch(context)

            responses = self._model_generate(
                inputs=token_context,
                max_tokens=max_tokens,
                stop=until,
            )
            responses = self.tok_decode(responses.tolist())

            for response in responses:
                # Ensure the generated responses do not contain the stop sequences.
                for term in until:
                    response = response.split(term)[0]
                # partial caching
                self.cache_hook.add_partial("greedy_until", (context, until), response)
                results.append(response)
        return reorder.get_original(results)
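
For reference, a `greedy_until` request is a `(context, request_args)` tuple whose dict carries exactly the keys read above; a hypothetical example (illustrative, not part of the committed file; in the harness these tuples come from task definitions):

# Illustrative request format only.
requests = [
    (
        "Question: What is the capital of France?\nAnswer:",
        {
            "stop_sequences": ["\n"],       # extra stop strings (besides the EOT token)
            "max_generation_length": None,  # fall back to `max_gen_toks`
            "num_fewshot": 0,               # 0-shot: only the EOT token is used as a stop
        },
    ),
]
# completions = lm.greedy_until(requests)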
class AutoCausalLM(HuggingFaceAutoLM):
    """Causal language modeling.
    You can find a set of supported models in the HF documentation:
    https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForCausalLM
    """

    AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM

    def _create_auto_tokenizer(
        self,
        *,
        pretrained: str,
        revision: str,
        subfolder: str,
        tokenizer: Optional[str] = None,
    ) -> transformers.PreTrainedTokenizer:
        tokenizer = super()._create_auto_tokenizer(
            pretrained=pretrained,
            revision=revision,
            subfolder=subfolder,
            tokenizer=tokenizer,
        )
        tokenizer.padding_side = "left"
        return tokenizer

    def _model_call(
        self, inputs: TokenSequence, labels: Optional[TokenSequence] = None
    ) -> TokenSequence:
        return self.model(inputs)["logits"]

    def _model_generate(
        self,
        inputs: transformers.BatchEncoding,
        max_tokens: int,
        stop: Optional[List[str]] = None,
    ) -> TokenSequence:
        # Ensure that the context does not encroach into the `space`
        # for the generation.
        input_ids = inputs["input_ids"][:, self.max_gen_toks - self.max_length :]
        attention_mask = inputs["attention_mask"][
            :, self.max_gen_toks - self.max_length :
        ]
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)

        stopping_criteria = stop_sequences_criteria(
            self.tokenizer, stop, input_ids.shape[1], input_ids.shape[0]
        )

        generations = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # GPT style models require the `generate` `max_length` arg to include the
            # context length, so we instead set `max_new_tokens` which is the number
            # of new tokens to generate, excluding the current number of tokens.
            max_new_tokens=max_tokens,
            stopping_criteria=stopping_criteria,
            do_sample=False,
        )
        return utils.select_continuation_from_batch_left_padding(
            generations, max_context_size=inputs["input_ids"].size(1)
        )
class AutoSeq2SeqLM(HuggingFaceAutoLM):
    """Seq2Seq language modeling.
    You can find a set of supported models in the following documentation:
    https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForSeq2SeqLM
    """

    AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM

    @property
    def max_length(self) -> int:
        """Return the maximum sequence length of the model.
        TODO: Currently only works for relative position encoded Seq2Seq models.
        """
        if self._max_length is not None:
            return self._max_length
        return self._DEFAULT_MAX_LENGTH
    def loglikelihood(
        self, requests: List[Tuple[str, str]]
    ) -> List[Tuple[float, bool]]:
        new_requests = []
        for chunk in utils.chunks(requests, self.batch_size):
            context, continuation = zip(*chunk)

            # Fill empty contexts with the EOT token.
            context = [
                f"{self.eot_token}" if len(text) == 0 else text for text in context
            ]
            context_enc = self.tok_encode_batch(context)
            for key in context_enc:
                context_enc[key] = context_enc[key][:, -self.max_length :]

            # Remove leading whitespace introduced by the default
            # `text_target_separator` since the context and continuation
            # will not be concatenated as a single (decoder) input.
            continuation = [text.lstrip() for text in continuation]
            continuation_enc = self.tok_encode_batch(list(continuation))
            for key in continuation_enc:
                continuation_enc[key] = continuation_enc[key][:, -self.max_length :]

            new_requests.append(
                ((context, continuation), context_enc, continuation_enc)
            )
        return self._loglikelihood_tokens(new_requests)
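
Each `loglikelihood` request is a `(context, continuation)` string pair; a hypothetical example of the call and its result shape (illustrative, not part of the committed file):

# Illustrative request format only.
requests = [
    ("The capital of France is", " Paris"),
    ("The capital of France is", " London"),
]
# scores = lm.loglikelihood(requests)
# -> [(sum_logprob, is_greedy), ...] with one tuple per request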
    def loglikelihood_rolling(self, requests: List[Tuple[str, str]]) -> List[float]:
        loglikelihoods = []
        for (string,) in tqdm(requests):
            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
                    utils.get_rolling_token_windows(
                        token_list=self.tok_encode(string),
                        prefix_token=self.eot_token_id,
                        max_seq_len=self.max_length,
                        context_len=1,
                    ),
                )
            )
            contexts, conts = utils.split_and_pad_windows(
                rolling_token_windows,
                pad_token_id=self.eot_token_id,
                max_seq_len=self.max_length,
            )
            # Manually create BatchEncoding tensors with attention masks as
            # expected by `self._model_call` in `self._loglikelihood_tokens`.
            contexts_enc = torch.Tensor(contexts).long()
            contexts_enc = transformers.tokenization_utils_base.BatchEncoding(
                {
                    "input_ids": contexts_enc,
                    "attention_mask": (contexts_enc != self.eot_token_id).long(),
                }
            )
            conts_enc = torch.Tensor(conts).long()
            conts_enc = transformers.tokenization_utils_base.BatchEncoding(
                {
                    "input_ids": conts_enc,
                    "attention_mask": (conts_enc != self.eot_token_id).long(),
                }
            )
            # TODO: Extract out this call so it only gets called once and also
            # somehow figure out partial caching for.
            rolling_token_windows_request = [
                ((contexts, conts), contexts_enc, conts_enc)
            ]
            string_nll = self._loglikelihood_tokens(
                rolling_token_windows_request, disable_tqdm=True
            )
            string_nll = [x[0] for x in string_nll]  # discard is_greedy
            string_nll = sum(string_nll)
            loglikelihoods.append(string_nll)
        return loglikelihoods
    def _loglikelihood_tokens(
        self,
        requests: List[Tuple[Tuple[str, str], TokenSequence, TokenSequence]],
        disable_tqdm: Optional[bool] = False,
    ) -> List[Tuple[float, bool]]:
        results = []
        for chunk in tqdm(
            requests, total=math.ceil(len(requests)), disable=disable_tqdm
        ):
            cache_keys, inputs_tokens, targets_tokens = chunk
            inputs_tokens = inputs_tokens.to(self.device)
            targets_tokens = targets_tokens.to(self.device)
            outputs = self._model_call(inputs=inputs_tokens, labels=targets_tokens)
            log_softmaxes = F.log_softmax(outputs.logits, dim=-1)

            output_iterator = zip(
                zip(cache_keys[0], cache_keys[1]),
                log_softmaxes,
                targets_tokens["input_ids"],
                targets_tokens["attention_mask"],
            )
            for cache_key, log_softmax, target_tokens, target_mask in output_iterator:
                length = target_mask.sum()
                log_softmax = log_softmax[:length]
                target_tokens = target_tokens[:length]
                greedy_tokens = log_softmax.argmax(dim=-1)
                max_equal = (greedy_tokens == target_tokens).all()
                target_logits = torch.gather(
                    log_softmax, 1, target_tokens.unsqueeze(-1)
                ).squeeze(-1)
                answer = (float(target_logits.sum()), bool(max_equal))
                results.append(answer)

                if cache_key is not None:
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
        return results

    def _model_call(
        self, inputs: TokenSequence, labels: Optional[TokenSequence] = None
    ) -> TokenSequence:
        return self.model(**inputs, labels=labels["input_ids"])
    def _model_generate(
        self,
        inputs: transformers.BatchEncoding,
        max_tokens: int,
        stop: Optional[List[str]] = None,
    ) -> TokenSequence:
        input_ids = inputs["input_ids"][:, -self.max_length :].to(self.device)
        attention_mask = inputs["attention_mask"][:, -self.max_length :].to(self.device)

        # Generate one token to calculate the number of start tokens prepended to decoder_input_ids
        # (leaving this here in case the below assumption is violated in the future)
        # one_tok_gen = self.model.generate(
        #    input_ids=torch.zeros((1, 1), dtype=torch.int),
        #    min_length=2,
        #    max_new_tokens=1,
        # ).squeeze()
        # initial_decoder_input_length = len(one_tok_gen) - 1

        # Assume that there will always only be one token in the decoder inputs, assumption holds for existing HF models
        stopping_criteria = stop_sequences_criteria(
            self.tokenizer, stop, 1, input_ids.shape[0]
        )
        generations = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            stopping_criteria=stopping_criteria,
            do_sample=False,
        )
        return generations
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence."""

    def __init__(
        self,
        sequence: str,
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ):
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.sequence_id_len = len(self.sequence_ids)
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][
            :, -self.sequence_id_len :
        ]
        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
        for i, done in enumerate(self.done_tracker):
            if not done:
                self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
        return False not in self.done_tracker
def stop_sequences_criteria(
    tokenizer: transformers.PreTrainedTokenizer,
    stop_sequences: List[str],
    initial_decoder_input_length: int,
    batch_size: int,
) -> transformers.StoppingCriteriaList:
    return transformers.StoppingCriteriaList(
        [
            *[
                MultiTokenEOSCriteria(
                    sequence, tokenizer, initial_decoder_input_length, batch_size
                )
                for sequence in stop_sequences
            ],
        ]
    )
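
A minimal sketch of how these stopping criteria behave in isolation (illustrative, not part of the committed file; during generation, `generate` invokes them after every decoding step):

# Illustrative sketch only.
import torch
import transformers
from lm_eval.models.huggingface import stop_sequences_criteria

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
criteria = stop_sequences_criteria(
    tokenizer, ["\n\n"], initial_decoder_input_length=0, batch_size=1
)
ids = torch.tensor([tokenizer.encode("Answer: 42\n\n")])
print(criteria[0](ids, scores=None))  # True once the stop string has been produced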