"tests/models/t5/test_modeling_t5.py" did not exist on "cabcc75171650f9131a4cf31c62e1f102589014e"
Commit b5efc813 authored by gk's avatar gk
Browse files

Merge branch 'master' into big-refactor-merge

parents 7dec84a0 b018a7d5
@@ -167,13 +167,22 @@ def evaluate(
     # get lists of each type of request
     for task_name, task in task_dict.items():
         versions[task_name] = task.VERSION
-        configs[task_name] = dict(task.dump_config()) # TODO: don't access a private attribute here ; for non-YAML tasks handle this case
+        # TODO: don't access a private attribute here ; for non-YAML tasks handle this case
+        configs[task_name] = dict(task.dump_config())
         # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
         # task_docs = list(task_doc_func())
         # rnd = random.Random()
         # rnd.seed(42)
         # rnd.shuffle(task_docs)
+        if limit is not None:
+            if task.has_test_docs():
+                task_docs = task.test_docs()
+            elif task.has_validation_docs():
+                task_docs = task.validation_docs()
+            else:
+                raise RuntimeError("Task has neither test_docs nor validation_docs")
+            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
         task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
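(With this change, a fractional `limit` is resolved against the selected docs split: for example, 1,000 task docs with `limit=0.1` gives `int(1000 * 0.1) == 100` documents, while `limit=50` is used as an absolute count of 50.)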
......
import math
from abc import abstractmethod
from pathlib import Path
from typing import Iterable, List, Mapping, NewType, Optional, Tuple, Union
import peft
import torch
import torch.nn.functional as F  # used for log-softmax over model logits below
import transformers
from accelerate import find_executable_batch_size
from peft import __version__ as PEFT_VERSION
from tqdm import tqdm
from transformers import BatchEncoding
from lm_eval import utils
from lm_eval.api.model import LM
class BaseLM(LM):
def __init__(self):
super().__init__()
self.batch_schedule = 1
self.batch_sizes = {}
self.max_batch_size = 512
@property
@abstractmethod
def eot_token_id(self):
pass
@property
@abstractmethod
def max_length(self):
pass
@property
@abstractmethod
def max_gen_toks(self):
pass
@property
@abstractmethod
def batch_size(self):
pass
@property
@abstractmethod
def device(self):
pass
@abstractmethod
def tok_encode(self, string: str):
pass
@abstractmethod
def tok_decode(self, tokens: Iterable[int]):
pass
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id):
pass
@abstractmethod
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
pass
def _detect_batch_size(self, requests=None, pos=0):
if requests:
_, context_enc, continuation_enc = requests[pos]
max_length = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1])
else:
max_length = self.max_length
# if OOM, then halves batch_size and tries again
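# (`find_executable_batch_size` from `accelerate` calls the decorated function with
# `starting_batch_size` and, on a CUDA out-of-memory error, retries with the batch size halved.)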
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
test_batch = torch.ones((batch_size, max_length), device=self.device).long()
for _ in range(5):
_ = F.log_softmax(self._model_call(test_batch), dim=-1).cpu()
return batch_size
batch_size = forward_batch()
utils.clear_torch_cache()
return batch_size
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
def _encode_pair(self, context, continuation):
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
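# e.g. context "Question: ... Answer: " / continuation "Paris" becomes
# context "Question: ... Answer:" / continuation " Paris", so the boundary
# whitespace is tokenized together with the continuation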
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def loglikelihood(self, requests):
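# requests: list of (context, continuation) string pairs.
# returns: list of (log_likelihood, is_greedy) tuples in the same order as `requests`,
# where `is_greedy` is True iff every continuation token matches the model's greedy (argmax) prediction.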
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(continuation)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# automatic batch size detection for vectorization
adaptive_batch_size = None
if self.batch_size == "auto":
# using rolling window with maximum context
print("Passed argument batch_size = auto. Detecting largest batch size")
batch_size = self._detect_batch_size()
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
loglikelihoods = []
for (string,) in tqdm(requests):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
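# prepend a `None` cache key: individual rolling windows are not written to the partial
# cache (see the `if cache_key is not None` guard in `_loglikelihood_tokens`)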
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
disable_tqdm=True,
override_bs=adaptive_batch_size,
)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False, override_bs=None):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
reordered_requests = re_ord.get_reordered()
n_reordered_requests = len(reordered_requests)
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
def _batch_scheduler(pos):
sched = pos // int(n_reordered_requests / self.batch_schedule)
if sched in self.batch_sizes:
return self.batch_sizes[sched]
print(f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size")
self.batch_sizes[sched] = self._detect_batch_size(reordered_requests, pos)
print(f"Determined largest batch size: {self.batch_sizes[sched]}")
return self.batch_sizes[sched]
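# e.g. with 1,000 reordered requests and batch_size="auto:4", the batch size is re-detected
# at positions 0, 250, 500 and 750; since requests are sorted longest-first, later segments
# can usually fit a larger batch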
for chunk in utils.chunks(
tqdm(reordered_requests, disable=disable_tqdm),
n=self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0,
fn=_batch_scheduler if self.batch_size == "auto" and n_reordered_requests > 0 else None,
):
inps = []
cont_toks_list = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
).to(self.device)
(inplen,) = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = (
padding_length if padding_length is not None else inplen
)
# pad length from seq to padding_length
inp = torch.cat(
[
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(
inp.device
), # [padding_length - seq]
],
dim=0,
)
inps.append(inp.unsqueeze(0)) # [1, padding_length]
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length]
multi_logits = F.log_softmax(
self._model_call(batched_inps), dim=-1
).cpu() # [batch, padding_length, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(
chunk, multi_logits, inps, inplens, cont_toks_list
):
# Slice to original seq length
contlen = len(cont_toks)
logits = logits[inplen - contlen : inplen].unsqueeze(
0
) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(
0
) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-1
) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return re_ord.get_original(res)
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles until that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
res = []
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
re_ord = utils.Reorderer(requests, _collate)
for context, request_args in tqdm(re_ord.get_reordered()):
until = request_args["until"]
if isinstance(until, str):
until = [until]
if until:
(primary_until,) = self.tok_encode(until[0])
else:
primary_until = None
context_enc = torch.tensor(
[self.tok_encode(context)[self.max_gen_toks - self.max_length :]]
).to(self.device)
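# the negative slice keeps only the last (max_length - max_gen_toks) context tokens,
# leaving room in the model window for up to max_gen_toks generated tokens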
max_gen_tokens = min(
self.max_gen_toks, request_args.get("max_length", self.max_gen_toks)
)
cont = self._model_generate(
context_enc, context_enc.shape[1] + max_gen_tokens, primary_until
)
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return re_ord.get_original(res)
def _get_dtype(
dtype: Union[str, torch.dtype]
) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
_torch_dtype = getattr(torch, dtype)
else:
_torch_dtype = dtype
return _torch_dtype
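# e.g. _get_dtype("float16") -> torch.float16; "auto" and torch.dtype values pass through unchanged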
class HFLM(BaseLM):
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
device="cuda",
pretrained="gpt2",
revision="main",
low_cpu_mem_usage=None,
subfolder=None,
tokenizer=None,
batch_size=1,
max_length=None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
dtype: Optional[Union[str, torch.dtype]]="auto",
):
super().__init__()
assert isinstance(device, str)
assert isinstance(pretrained, str)
assert isinstance(batch_size, (int, str))
device_list = set(
["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
)
if device and device in device_list:
self._device = torch.device(device)
print(f"Using device '{device}'")
else:
print("Device not specified")
print(f"Cuda Available? {torch.cuda.is_available()}")
self._device = (
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained,
load_in_8bit=load_in_8bit,
low_cpu_mem_usage=low_cpu_mem_usage,
revision=revision,
torch_dtype=_get_dtype(dtype),
trust_remote_code=trust_remote_code,
).eval()
if not load_in_8bit:
try:
self.gpt2.to(self.device)
except Exception:
print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
)
self.vocab_size = self.tokenizer.vocab_size
# setup for automatic batch size detection
if batch_size == "auto":
self.batch_size_per_gpu = batch_size
else:
self.batch_size_per_gpu = int(batch_size)
self._max_length = max_length
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
if hasattr(self.gpt2.config, attr):
return getattr(self.gpt2.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.gpt2(inps)[0]
def _model_generate(self, context, max_length, eos_token_id):
generation_kwargs = {"do_sample": False, "max_length": max_length}
if eos_token_id is not None:
generation_kwargs['eos_token_id'] = eos_token_id
generation_kwargs['pad_token_id'] = eos_token_id # setting eos_token_id as pad token
return self.gpt2.generate(context, **generation_kwargs)
TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding]
_DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.device]])
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {}
if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu
for device_idx in range(torch.cuda.device_count())
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args = {}
if max_memory:
args["max_memory"] = max_memory
args["device_map"] = device_map_option
args["offload_folder"] = offload_folder
return args
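# Illustrative example: on a 2-GPU machine, _get_accelerate_args(max_memory_per_gpu="40GIB") would
# return {"max_memory": {0: "40GIB", 1: "40GIB"}, "device_map": "auto", "offload_folder": "./offload"}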
def _get_dtype(
dtype: Union[str, torch.dtype], config: Optional[transformers.AutoConfig] = None
) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible."""
if dtype is None and config is not None:
_torch_dtype = config.torch_dtype
elif isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
_torch_dtype = getattr(torch, dtype)
else:
_torch_dtype = dtype
return _torch_dtype
class HuggingFaceAutoLM(BaseLM):
AUTO_CONFIG_CLASS: transformers.AutoConfig = transformers.AutoConfig
AUTO_TOKENIZER_CLASS: transformers.AutoTokenizer = transformers.AutoTokenizer
AUTO_MODEL_CLASS: transformers.AutoModel = None
AUTO_PEFT_CLASS: peft.PeftModel = None
# Default max sequence length setting for when no `max_length` is provided
# or no max length config setting is found in the model or tokenizer.
_DEFAULT_MAX_LENGTH: int = 2048
def __init__(
self,
pretrained: str,
quantized: Optional[Union[bool, str]] = False,
tokenizer: Optional[str] = None,
subfolder: Optional[str] = None,
revision: Optional[str] = "main",
batch_size: Optional[Union[int, str]] = 1,
max_batch_size: Optional[int] = 512,
max_gen_toks: Optional[int] = 256,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None,
use_accelerate: Optional[bool] = False,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
dtype: Optional[Union[str, torch.dtype]] = None,
device: Optional[Union[int, str]] = "cuda",
peft: str = None,
load_in_8bit: Optional[bool] = False,
load_in_4bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
gptq_use_triton: Optional[bool] = False,
):
"""Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
Args:
pretrained (str):
The HuggingFace Hub model ID name or the path to a pre-trained
model to load. This is effectively the `pretrained_model_name_or_path`
argument of `from_pretrained` in the HuggingFace `transformers` API.
quantized (str or bool, optional, defaults to False):
File name of a GPTQ quantized model to load. Set to `True` to use the
default name of the quantized model.
add_special_tokens (bool, optional, defaults to True):
Whether to add special tokens to the input sequences. If `None`, the
default value will be set to `True` for seq2seq models (e.g. T5) and
`False` for causal models.
WARNING: Evaluating causal models with `add_special_tokens=True` is
currently __not__ supported.
> Large model loading `accelerate` arguments
use_accelerate (bool, optional, defaults to False):
If True, uses the `accelerate` library to load a large model across
multiple devices.
device_map_option (str, optional, defaults to "auto"):
The device map option to use when loading the model with
`accelerate`.
Options:
"auto", "balanced", "balanced_low_0", "sequential"
See the `accelerate` docs for more details on these options:
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.device_map
max_memory_per_gpu (Union[int, str], optional, defaults to None):
The maximum memory available for each GPU in bytes as `int` or in
the format f"{significand}{unit_symbol}" where {unit_symbol} is
any of ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in
the "Parameters for big model inference" section of the following
docs:
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
max_cpu_memory (Union[int, str], optional, defaults to None):
The maximum available CPU RAM in bytes as `int` or in the format
f"{significand}{unit_symbol}" where {unit_symbol} is any of
["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in the
"Parameters for big model inference" section of the following docs:
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
offload_folder (str, optional, defaults to "./offload"):
The folder to offload weights into if `device_map` contains any
"disk" value.
dtype (Union[str, torch.dtype], optional, defaults to None):
Converts the model weights to `dtype`, if specified. Strings get
converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`).
Use `dtype="auto"` to derive the type from the model’s weights.
peft (str, optional, defaults to None):
Path of the adapter weights to load from Huggingface. This will usually
include a directory that includes the files `adapter_config.json` and
`adapter_model.bin`. Compatible with [PEFT](https://github.com/huggingface/peft)
load_in_8bit (bool, optional, defaults to False):
If True, will convert the loaded model into mixed-8bit quantized model. See:
https://huggingface.co/docs/transformers/main/en/main_classes/quantization#load-a-large-model-in-8bit
load_in_4bit (bool, optional, defaults to False):
If True, will convert the loaded model into mixed-4bit quantized model. See:
https://huggingface.co/docs/transformers/main/en/main_classes/quantization#load-a-large-model-in-4bit
trust_remote_code (bool, optional, defaults to False):
If True, will trust the remote code when loading the model.
gptq_use_triton (bool, optional, defaults to False):
Use Triton for GPTQ inference.
"""
super().__init__()
assert isinstance(pretrained, str)
assert isinstance(device, str)
assert isinstance(batch_size, (int, str))
if (
add_special_tokens is not None
and self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM
):
# TODO: Support evaluating causal models with special tokens. Currently,
# this is not possible because the `_loglikelihood_tokens()` method for
# causal LMs makes a no-special-tokens assumption given that contexts
# and labels/continuations are tokenized separately without special
# tokens, concatenated, and then processed as inputs.
assert (
not add_special_tokens
), "Evaluating causal models with `add_special_tokens=True` is currently not supported."
# setup for automatic batch size detection
if str(batch_size).startswith("auto"):
batch_size = batch_size.split(":")
self._batch_size = batch_size[0]
self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
else:
self._batch_size = int(batch_size)
self.max_batch_size = max_batch_size
self._max_gen_toks = max_gen_toks
self._max_length = max_length
self._config = self.AUTO_CONFIG_CLASS.from_pretrained(
pretrained,
trust_remote_code=trust_remote_code,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
)
self._add_special_tokens = add_special_tokens
self.tokenizer = self._create_auto_tokenizer(
pretrained=pretrained,
revision=revision,
subfolder=subfolder,
tokenizer=tokenizer,
)
self.tokenizer.model_max_length = self.max_length
model_kwargs = {}
if use_accelerate:
model_kwargs = _get_accelerate_args(
device_map_option,
max_memory_per_gpu,
max_cpu_memory,
offload_folder,
)
self.model = self._create_auto_model(
pretrained=pretrained,
quantized=quantized,
trust_remote_code=trust_remote_code,
revision=revision,
subfolder=subfolder,
torch_dtype=_get_dtype(dtype, self._config),
gptq_use_triton=gptq_use_triton,
load_in_8bit=load_in_8bit,
load_in_4bit=load_in_4bit,
**model_kwargs,
)
# note: peft_path can be different than pretrained model path
if peft is not None:
self.model = self._create_auto_model_peft(
model=self.model,
peft=peft,
revision=revision,
subfolder=subfolder,
load_in_4bit=load_in_4bit,
)
self.model.eval()
torch.set_grad_enabled(False)
self._device = device
if use_accelerate and "lm_head" in self.model.hf_device_map:
# `accelerate` can place `lm_head` weights on a different device than
# the user specified one so we force `self._device` to be the same as
# `lm_head`'s.
self._device = self.model.hf_device_map["lm_head"]
if not use_accelerate and not (load_in_4bit or load_in_8bit):
try:
self.model.to(self._device)
except Exception:
print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")
def _create_auto_model(
self,
*,
pretrained: str,
quantized: Optional[Union[bool, str]] = False,
revision: str,
subfolder: str,
device_map: Optional[Union[str, _DeviceMapping]] = None,
max_memory: Optional[dict] = None,
offload_folder: Optional[str] = None,
load_in_8bit: Optional[bool] = False,
load_in_4bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
torch_dtype: Optional[Union[str, torch.dtype]] = None,
gptq_use_triton: Optional[bool] = False,
) -> transformers.AutoModel:
"""Returns a pre-trained pytorch model from a pre-trained model configuration."""
if not quantized:
if load_in_4bit:
assert transformers.__version__ >= "4.30.0", "load_in_4bit requires transformers >= 4.30.0"
model_kwargs = {}
if transformers.__version__ >= "4.30.0":
model_kwargs["load_in_4bit"] = load_in_4bit
model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
device_map=device_map,
max_memory=max_memory,
offload_folder=offload_folder,
load_in_8bit=load_in_8bit,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype,
**model_kwargs,
)
else:
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
pretrained,
model_basename=None if quantized == True else Path(quantized).stem,
device_map=device_map,
max_memory=max_memory,
trust_remote_code=trust_remote_code,
use_safetensors=True if quantized == True else quantized.endswith('.safetensors'),
use_triton=gptq_use_triton,
warmup_triton=gptq_use_triton,
)
return model
def _create_auto_model_peft(
self,
*,
model: transformers.PreTrainedModel,
peft: str,
revision: str,
subfolder: str,
load_in_4bit: Optional[bool] = False,
):
if load_in_4bit:
assert PEFT_VERSION >= "0.4.0", "load_in_4bit requires peft >= 0.4.0"
model = self.AUTO_PEFT_CLASS.from_pretrained(
model,
peft,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
)
return model
def _create_auto_tokenizer(
self,
*,
pretrained: str,
revision: str,
subfolder: str,
tokenizer: Optional[str] = None,
) -> transformers.PreTrainedTokenizer:
"""Returns a pre-trained tokenizer from a pre-trained tokenizer configuration."""
tokenizer = self.AUTO_TOKENIZER_CLASS.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
)
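# many tokenizers (e.g. GPT-2's) ship without a pad token; reuse EOS so batched padding works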
tokenizer.pad_token = tokenizer.eos_token
return tokenizer
@property
def add_special_tokens(self) -> bool:
"""Whether to include special tokens in encoded text. This should be
determined by whether or not the model was trained with special tokens.
TODO: Remove these conditionals once HuggingFace supports a way to
check whether or not an arbitrary model was trained with special tokens.
"""
if self._add_special_tokens is not None:
return self._add_special_tokens
elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
return False
elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM:
return True
else:
raise ValueError(
"Could not determine `add_special_tokens` value from the model "
"class. Set to `True` or `False` depending on whether the model "
"was pre-trained with special tokens."
)
@property
def eot_token(self) -> str:
return self.tokenizer.eos_token
@property
def eot_token_id(self) -> int:
return self.tokenizer.eos_token_id
@property
def max_gen_toks(self) -> int:
return self._max_gen_toks
@property
def max_length(self) -> int:
"""Return the maximum sequence length of the model.
NOTE: Different model configurations have different max sequence length
attribute names.
- n_positions: (CTRLConfig, T5Config)
- max_position_embeddings: (BartConfig, RoFormerConfig)
- n_ctx: (GPT2Config)
NOTE: For relative position encoded models you should specify the max
sequence length of the model in the constructor via `max_length`.
"""
if self._max_length is not None:
return self._max_length
# Try to get the sequence length from the model config.
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
if hasattr(self._config, attr):
return getattr(self._config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
@property
def batch_size(self) -> int:
# TODO: Add adaptive batch size.
return self._batch_size # * gpus
@property
def device(self) -> Union[int, str, torch.device]:
return self._device
def tok_encode(self, string: str) -> TokenSequence:
# TODO: Merge `tok_encode_batch` here.
return self.tokenizer.encode(string, add_special_tokens=self.add_special_tokens)
def tok_encode_batch(self, strings: List[str]) -> TokenSequence:
return self.tokenizer(
strings,
padding=True,
add_special_tokens=self.add_special_tokens,
return_tensors="pt",
)
def tok_decode(self, tokens: torch.LongTensor) -> List[str]:
return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)
def greedy_until(
self, requests: List[Tuple[str, Union[List[str], str]]]
) -> List[str]:
def _collate(x):
tokens = self.tok_encode(x[0])
return len(tokens), x[0]
results = []
reorder = utils.Reorderer(requests, _collate)
adaptive_batch_size = None
if self.batch_size == "auto":
# using rolling window with maximum context
print("Passed argument batch_size = auto. Detecting largest batch size")
batch_size = self._detect_batch_size()
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
for chunk in utils.chunks(
tqdm(reorder.get_reordered(), disable=False),
self.batch_size if self.batch_size != "auto" else adaptive_batch_size,
):
context = [c[0] for c in chunk]
request_args = chunk[0][1]
stop = request_args.get("until", None)
stop_sequences = stop if isinstance(stop, list) else [stop]
max_generation_length = request_args.get("max_length", None)
assert (
isinstance(max_generation_length, int) or max_generation_length is None
)
assert isinstance(stop_sequences, list) or stop_sequences is None
# TODO: Find a better way to handle stop sequences for 0-shot.
if stop_sequences is None:
until = [self.eot_token]
else:
until = stop_sequences + [self.eot_token]
if max_generation_length is None:
max_tokens = self.max_gen_toks
else:
max_tokens = max_generation_length
token_context = self.tok_encode_batch(context)
responses = self._model_generate(
inputs=token_context,
max_tokens=max_tokens,
stop=until,
)
responses = self.tok_decode(responses.tolist())
for response in responses:
# Ensure the generated responses do not contain the stop sequences.
for term in until:
response = response.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), response)
results.append(response)
return reorder.get_original(results)
class AutoCausalLM(HuggingFaceAutoLM):
"""Causal language modeling.
You can find a set of supported models in the HF documentation:
https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForCausalLM
"""
AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
AUTO_PEFT_CLASS = peft.PeftModel
def _create_auto_tokenizer(
self,
*,
pretrained: str,
revision: str,
subfolder: str,
tokenizer: Optional[str] = None,
) -> transformers.PreTrainedTokenizer:
tokenizer = super()._create_auto_tokenizer(
pretrained=pretrained,
revision=revision,
subfolder=subfolder,
tokenizer=tokenizer,
)
tokenizer.padding_side = "left"
return tokenizer
def _model_call(
self, inputs: TokenSequence, labels: Optional[TokenSequence] = None
) -> TokenSequence:
return self.model(inputs)["logits"]
def _model_generate(
self,
inputs: transformers.BatchEncoding,
max_tokens: int,
stop: Optional[List[str]] = None,
) -> TokenSequence:
# Ensure that the context does not encroach into the `space`
# for the generation.
input_ids = inputs["input_ids"][:, self.max_gen_toks - self.max_length :]
attention_mask = inputs["attention_mask"][
:, self.max_gen_toks - self.max_length :
]
input_ids = input_ids.to(self.device)
attention_mask = attention_mask.to(self.device)
stopping_criteria = stop_sequences_criteria(
self.tokenizer, stop, input_ids.shape[1], input_ids.shape[0]
)
generations = self.model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
# GPT style models require the `generate` `max_length` arg to include the
# context length, so we instead set `max_new_tokens` which is the number
# of new tokens to generate, excluding the current number of tokens.
max_new_tokens=max_tokens,
stopping_criteria=stopping_criteria,
do_sample=False,
)
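# with left padding (set in `_create_auto_tokenizer`), every prompt ends at the same index,
# so dropping the first `max_context_size` positions keeps only the newly generated tokens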
return utils.select_continuation_from_batch_left_padding(
generations, max_context_size=inputs["input_ids"].size(1)
)
class AutoSeq2SeqLM(HuggingFaceAutoLM):
"""Seq2Seq language modeling.
You can find a set of supported models in the following documentation:
https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForSeq2SeqLM
"""
AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
AUTO_PEFT_CLASS = peft.PeftModel
def loglikelihood(
self, requests: List[Tuple[str, str]]
) -> List[Tuple[float, bool]]:
new_requests = []
for chunk in utils.chunks(requests, self.batch_size):
context, continuation = zip(*chunk)
# Fill empty contexts with the EOT token.
context = [
f"{self.eot_token}" if len(text) == 0 else text for text in context
]
context_enc = self.tok_encode_batch(context)
for key in context_enc:
context_enc[key] = context_enc[key][:, -self.max_length :]
# Remove leading whitespace introduced by the default
# `text_target_separator` since the context and continuation
# will not be concatenated as a single (decoder) input.
continuation = [text.lstrip() for text in continuation]
continuation_enc = self.tok_encode_batch(list(continuation))
for key in continuation_enc:
continuation_enc[key] = continuation_enc[key][:, -self.max_length :]
new_requests.append(
((context, continuation), context_enc, continuation_enc)
)
return self._loglikelihood_tokens(new_requests)
def loglikelihood_rolling(self, requests: List[Tuple[str, str]]) -> List[float]:
loglikelihoods = []
for (string,) in tqdm(requests):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
contexts, conts = utils.split_and_pad_windows(
rolling_token_windows,
pad_token_id=self.eot_token_id,
max_seq_len=self.max_length,
)
# Manually create BatchEncoding tensors with attention masks as
# expected by `self._model_call` in `self._loglikelihood_tokens`.
contexts_enc = torch.Tensor(contexts).long()
contexts_enc = transformers.tokenization_utils_base.BatchEncoding(
{
"input_ids": contexts_enc,
"attention_mask": (contexts_enc != self.eot_token_id).long(),
}
)
conts_enc = torch.Tensor(conts).long()
conts_enc = transformers.tokenization_utils_base.BatchEncoding(
{
"input_ids": conts_enc,
"attention_mask": (conts_enc != self.eot_token_id).long(),
}
)
# TODO: Extract out this call so it only gets called once and also
# somehow figure out partial caching for.
rolling_token_windows_request = [
((contexts, conts), contexts_enc, conts_enc)
]
string_nll = self._loglikelihood_tokens(
rolling_token_windows_request, disable_tqdm=True
)
string_nll = [x[0] for x in string_nll] # discard is_greedy
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], TokenSequence, TokenSequence]],
disable_tqdm: Optional[bool] = False,
) -> List[Tuple[float, bool]]:
results = []
for chunk in tqdm(
requests, total=math.ceil(len(requests)), disable=disable_tqdm
):
cache_keys, inputs_tokens, targets_tokens = chunk
inputs_tokens = inputs_tokens.to(self.device)
targets_tokens = targets_tokens.to(self.device)
outputs = self._model_call(inputs=inputs_tokens, labels=targets_tokens)
log_softmaxes = F.log_softmax(outputs.logits, dim=-1)
output_iterator = zip(
zip(cache_keys[0], cache_keys[1]),
log_softmaxes,
targets_tokens["input_ids"],
targets_tokens["attention_mask"],
)
for cache_key, log_softmax, target_tokens, target_mask in output_iterator:
length = target_mask.sum()
log_softmax = log_softmax[:length]
target_tokens = target_tokens[:length]
greedy_tokens = log_softmax.argmax(dim=-1)
max_equal = (greedy_tokens == target_tokens).all()
target_logits = torch.gather(
log_softmax, 1, target_tokens.unsqueeze(-1)
).squeeze(-1)
answer = (float(target_logits.sum()), bool(max_equal))
results.append(answer)
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return results
def _model_call(
self, inputs: TokenSequence, labels: Optional[TokenSequence] = None
) -> TokenSequence:
return self.model(**inputs, labels=labels["input_ids"])
def _model_generate(
self,
inputs: transformers.BatchEncoding,
max_tokens: int,
stop: Optional[List[str]] = None,
) -> TokenSequence:
input_ids = inputs["input_ids"][:, -self.max_length :].to(self.device)
attention_mask = inputs["attention_mask"][:, -self.max_length :].to(self.device)
# Generate one token to calculate the number of start tokens prepended to decoder_input_ids
# (leaving this here in case the below assumption is violated in the future)
# one_tok_gen = self.model.generate(
# input_ids=torch.zeros((1, 1), dtype=torch.int),
# min_length=2,
# max_new_tokens=1,
# ).squeeze()
# initial_decoder_input_length = len(one_tok_gen) - 1
# Assume that there will always only be one token in the decoder inputs, assumption holds for existing HF models
stopping_criteria = stop_sequences_criteria(
self.tokenizer, stop, 1, input_ids.shape[0]
)
generations = self.model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=max_tokens,
stopping_criteria=stopping_criteria,
do_sample=False,
)
return generations
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
initial_decoder_input_length: int,
batch_size: int,
):
self.initial_decoder_input_length = initial_decoder_input_length
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][
:, -self.sequence_id_len :
]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if not done:
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
def stop_sequences_criteria(
tokenizer: transformers.PreTrainedTokenizer,
stop_sequences: List[str],
initial_decoder_input_length: int,
batch_size: int,
) -> transformers.StoppingCriteriaList:
return transformers.StoppingCriteriaList(
[
*[
MultiTokenEOSCriteria(
sequence, tokenizer, initial_decoder_input_length, batch_size
)
for sequence in stop_sequences
],
]
)
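A minimal usage sketch for these stopping criteria (illustrative only; it assumes an already-loaded `model` and `tokenizer`, left-padded `input_ids`/`attention_mask` tensors, and a hypothetical stop sequence "\n\n"), mirroring how `_model_generate` wires them into `generate` above:
stopping_criteria = stop_sequences_criteria(tokenizer, ["\n\n"], input_ids.shape[1], input_ids.shape[0])
generations = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=32,
    stopping_criteria=stopping_criteria,
    do_sample=False,
)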
@@ -16,9 +16,6 @@ from lm_eval.api.registry import (
 )
-ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
 def get_task_name_from_config(task_config):
     return "{dataset_path}_{dataset_name}".format(**task_config)
......
@@ -19,7 +19,6 @@ from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
-from lm_eval import tasks
 from lm_eval.logger import eval_logger
......
@@ -16,7 +16,7 @@ def parse_args():
     parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS)))
     parser.add_argument("--config", default=None)
     parser.add_argument("--num_fewshot", type=int, default=0)
-    parser.add_argument("--batch_size", type=str, default=None)
+    parser.add_argument("--batch_size", type=int, default=1)
     parser.add_argument("--max_batch_size", type=int, default=None,
                         help="Maximal batch size to try with --batch_size auto")
     parser.add_argument("--device", type=str, default=None)
......
@@ -3,7 +3,6 @@ Usage:
    python make_table_tasks.py --output <markdown_filename>
 """
 import logging
-from lm_eval import tasks
 from pytablewriter import MarkdownTableWriter, LatexTableWriter
 import os
 import json
@@ -54,8 +53,6 @@ def make_table(result_dict):
 if __name__ == "__main__":
-    task_names = tasks.ALL_TASKS
     # loop dirs and subdirs in results dir
     # for each dir, load json files
     for dirpath, dirnames, filenames in os.walk("../results"):
......
@@ -5,7 +5,8 @@ import subprocess
 import time
 from pathlib import Path
-from lm_eval import tasks, utils
+from lm_eval import evaluator, utils
+from lm_eval.api.registry import ALL_TASKS
 seq2seq_models = ["google/flan-t5-small"]
@@ -31,7 +32,7 @@ def parse_args():
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--limit", type=float, default=None)
     # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
-    parser.add_argument("--model", default="hf-causal-experimental")
+    parser.add_argument("--model", default="hf-causal")
     # Use whatever is faster here
     parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True")
     parser.add_argument("--batch_size", default="auto")
@@ -50,14 +51,14 @@ def eval_models(args, branch=None):
     results = {}
     for model in args.models:
-        model_type = "hf-causal-experimental" if model in causal_models \
+        model_type = "hf-causal" if model in causal_models \
             else "hf-seq2seq" if model in seq2seq_models else args.model
         model_args = f"pretrained={model},{args.model_args}"
         # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
-        tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
+        tasks = args.tasks if model in causal_models or model_type == "hf-causal" \
             else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
         # TODO: OOM with auto for seq2seq models, also can OOM with llama
-        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
+        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal" \
            else 64 if args.batch_size == "auto" else args.batch_size
        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
@@ -83,12 +84,12 @@ def extract_value(args, results, model, task, err=False):
     if task not in results:
         return 0
     results = results[task]
-    if args.acc_norm and "acc_norm" in results:
-        return results["acc_norm"] if not err else results["acc_norm_stderr"]
-    if "acc" in results:
-        return results["acc"] if not err else results["acc_stderr"]
-    if (args.perplexity or "word_perplexity") in results:
-        return results[args.perplexity or "word_perplexity"] if not err else 0
+    if args.acc_norm and "acc_norm,none" in results:
+        return results["acc_norm,none"] if not err else results["acc_norm_stderr,none"]
+    if "acc,none" in results:
+        return results["acc,none"] if not err else results["acc_stderr,none"]
+    if (args.perplexity or "word_perplexity") + ",none" in results:
+        return results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0
     return 0
@@ -110,8 +111,8 @@ def main():
     args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
     args.models = args.models.split(",") if type(args.models) == str else args.models
-    args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
-        else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
+    args.tasks = ALL_TASKS if args.tasks == "all_tasks" \
+        else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if type(args.tasks) == str else args.tasks
     global initial_branch
     initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
......