Commit 8d59330b authored by lintangsutawika

resolved merge conflict

parents 110e5a28 d4a913c4
......@@ -301,10 +301,23 @@ lm_eval --model hf \
We support wildcards in task names: for example, you can run all of the machine-translated lambada tasks via `--tasks lambada_openai_mt_*`.
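For instance (the model here is just an example; quoting the pattern keeps your shell from expanding the glob):
```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks "lambada_openai_mt_*"
```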
## Saving Results
To save evaluation results, provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.
Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.
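For example, the following run caches model responses under an illustrative `lm_cache` path, so an identical re-run can re-score without repeating inference:
```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks hellaswag \
    --use_cache lm_cache \
    --output_path results
```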
To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub (see an [example output](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo/tree/main/microsoft__phi-2)). For instance:
```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
--tasks hellaswag \
--log_samples \
--output_path results \
--hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False
```
For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!
## Visualizing Results
......
......@@ -14,7 +14,7 @@ This mode supports a number of command-line arguments, the details of which can
- `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, for example `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)
- `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups.
- `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. A list of supported tasks can be viewed with `--tasks list`.
- `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
......
......@@ -155,6 +155,21 @@ Our final filter pipeline, "maj@8", does majority voting across the first 8 of t
Thus, given the 64 responses from our LM on each document, we can report metrics on these responses in these 3 different ways, as defined by our filter pipelines.
### Adding a custom filter
Just as a custom model can be registered with the `register_model` decorator, a custom filter can be registered too, for example:
```python
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("new_filter")
class NewFilter(Filter):
...
```
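Once registered, the filter can be retrieved by name from the registry. A minimal end-to-end sketch (the lowercasing behavior is purely illustrative; the `apply(resps, docs)` signature follows the `Filter` base class):
```python
from lm_eval.api.filter import Filter
from lm_eval.api.registry import get_filter, register_filter


@register_filter("lowercase_demo")
class LowercaseDemoFilter(Filter):
    def apply(self, resps, docs):
        # resps holds one list of model responses per document; return the
        # same structure with every response lowercased.
        return [[r.lower() for r in doc_resps] for doc_resps in resps]


filter_cls = get_filter("lowercase_demo")
print(filter_cls().apply([["Hello", "WORLD"]], [{}]))
# [['hello', 'world']]
```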
## Embedded Python Code
You can use Python functions for certain arguments by using the `!function` operator after the argument name, followed by `<filename>.<pythonfunctionname>`. This feature can be used for the following arguments:
......
......@@ -2,34 +2,20 @@ import argparse
import json
import logging
import os
import re
import sys
from functools import partial
from pathlib import Path
from typing import Union
import numpy as np
from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.logging_utils import WandbLogger
from lm_eval.logging import EvaluationTracker, WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import make_table, simple_parse_args_string
from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string
DEFAULT_RESULTS_FILE = "results.json"
def _handle_non_serializable(o):
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
def _int_or_none_list_arg_type(
min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
):
def parse_value(item):
item = item.strip().lower()
if item == "none":
......@@ -45,10 +31,19 @@ def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
if num_items == 1:
# Makes downstream handling the same for single and multiple values
items = items * max_len
elif num_items != max_len:
elif num_items < min_len or num_items > max_len:
raise argparse.ArgumentTypeError(
f"Argument requires {max_len} integers or None, separated by '{split_char}'"
)
elif num_items != max_len:
logging.warning(
f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
"Missing values will be filled with defaults."
)
default_items = [parse_value(v) for v in defaults.split(split_char)]
items.extend(
default_items[num_items:]
) # extend items list with missing defaults
return items
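# Illustrative behavior of the new (min_len, max_len, defaults) signature,
# using the partial binding wired up for `--seed` below:
#   parse_seed = partial(_int_or_none_list_arg_type, 3, 4, "0,1234,1234,1234")
#   parse_seed("42")        -> [42, 42, 42, 42]    (single value broadcast to max_len)
#   parse_seed("0,None,8")  -> [0, None, 8, 1234]  (missing 4th value back-filled from defaults)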
......@@ -203,6 +198,12 @@ def setup_parser() -> argparse.ArgumentParser:
default="",
help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval",
)
parser.add_argument(
"--hf_hub_log_args",
type=str,
default="",
help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
)
parser.add_argument(
"--predict_only",
"-x",
......@@ -210,17 +211,20 @@ def setup_parser() -> argparse.ArgumentParser:
default=False,
help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
)
default_seed_string = "0,1234,1234,1234"
parser.add_argument(
"--seed",
type=partial(_int_or_none_list_arg_type, 3),
default="0,1234,1234", # for backward compatibility
type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
default=default_seed_string, # for backward compatibility
help=(
"Set seed for python's random, numpy and torch.\n"
"Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, "
"or a single integer to set the same seed for all three.\n"
"The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility).\n"
"E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`.\n"
"E.g, `--seed 42` sets all three seeds to 42."
"Set seed for python's random, numpy, torch, and fewshot sampling.\n"
"Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
"respectively, or a single integer to set the same seed for all three.\n"
f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
"(for backward compatibility).\n"
"E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
"Here numpy's seed is not set since the second value is `None`.\n"
"E.g, `--seed 42` sets all four seeds to 42."
),
)
parser.add_argument(
......@@ -228,7 +232,6 @@ def setup_parser() -> argparse.ArgumentParser:
action="store_true",
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
)
return parser
......@@ -251,6 +254,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# update the evaluation tracker args with the output path and the HF token
args.hf_hub_log_args = f"output_path={args.output_path},token={os.environ.get('HF_TOKEN')},{args.hf_hub_log_args}"
evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
evaluation_tracker.general_config_tracker.log_experiment_args(
model_source=args.model,
model_args=args.model_args,
)
if args.predict_only:
args.log_samples = True
if (args.log_samples or args.predict_only) and not args.output_path:
......@@ -262,6 +274,18 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
if (
"push_results_to_hub" in evaluation_tracker_args
or "push_samples_to_hub" in evaluation_tracker_args
) and "hub_results_org" not in evaluation_tracker_args:
raise ValueError(
"If push_results_to_hub or push_samples_to_hub is set, results_org must be specified."
)
if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
eval_logger.warning(
"Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
)
if args.limit:
eval_logger.warning(
" --limit SHOULD ONLY BE USED FOR TESTING."
......@@ -306,24 +330,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
)
if args.output_path:
path = Path(args.output_path)
# check if file or 'dir/results.json' exists
if path.is_file():
raise FileExistsError(f"File already exists at {path}")
output_path_file = path.joinpath(DEFAULT_RESULTS_FILE)
if output_path_file.is_file():
eval_logger.warning(
f"File {output_path_file} already exists. Results will be overwritten."
)
# if path json then get parent dir
elif path.suffix in (".json", ".jsonl"):
output_path_file = path
path.parent.mkdir(parents=True, exist_ok=True)
path = path.parent
else:
path.mkdir(parents=True, exist_ok=True)
# Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
if args.trust_remote_code:
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
......@@ -358,6 +364,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
random_seed=args.seed[0],
numpy_random_seed=args.seed[1],
torch_random_seed=args.seed[2],
fewshot_random_seed=args.seed[3],
**request_caching_args,
)
......@@ -365,7 +372,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.log_samples:
samples = results.pop("samples")
dumped = json.dumps(
results, indent=2, default=_handle_non_serializable, ensure_ascii=False
results, indent=2, default=handle_non_serializable, ensure_ascii=False
)
if args.show_config:
print(dumped)
......@@ -382,23 +389,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
except Exception as e:
eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
if args.output_path:
output_path_file.open("w", encoding="utf-8").write(dumped)
if args.log_samples:
for task_name, config in results["configs"].items():
output_name = "{}_{}".format(
re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", args.model_args),
task_name,
)
filename = path.joinpath(f"{output_name}.jsonl")
samples_dumped = json.dumps(
samples[task_name],
indent=2,
default=_handle_non_serializable,
ensure_ascii=False,
)
filename.write_text(samples_dumped, encoding="utf-8")
evaluation_tracker.save_results_aggregated(
results=results, samples=samples if args.log_samples else None
)
if args.log_samples:
for task_name, config in results["configs"].items():
evaluation_tracker.save_results_samples(
task_name=task_name, samples=samples[task_name]
)
print(
f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
......
......@@ -78,6 +78,7 @@ METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
HIGHER_IS_BETTER_REGISTRY = {}
FILTER_REGISTRY = {}
DEFAULT_METRIC_REGISTRY = {
"loglikelihood": [
......@@ -170,3 +171,22 @@ def is_higher_better(metric_name) -> bool:
eval_logger.warning(
f"higher_is_better not specified for metric '{metric_name}'!"
)
def register_filter(name):
def decorate(cls):
if name in FILTER_REGISTRY:
eval_logger.info(
f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}"
)
FILTER_REGISTRY[name] = cls
return cls
return decorate
def get_filter(filter_name: str) -> type:
try:
return FILTER_REGISTRY[filter_name]
except KeyError:
eval_logger.warning(f"filter `{filter_name}` is not registered!")
class ContextSampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
self.rnd = rnd
assert self.rnd, "must pass rnd to FewShotSampler!"
if not self.rnd:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
)
self.task = task
self.config = task._config
......
......@@ -312,6 +312,9 @@ class Task(abc.ABC):
self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()
self._filters = [build_filter_ensemble("none", [["take_first", None]])]
self.fewshot_rnd: Optional[
random.Random
] = None # purposely induce errors in case of improper usage
def download(
self,
......@@ -603,7 +606,7 @@ class Task(abc.ABC):
self,
doc,
num_fewshot,
rnd=random.Random(1234),
rnd=None,
description=None,
):
"""Returns a fewshot context string that is made up of a prepended description
......@@ -622,9 +625,12 @@ class Task(abc.ABC):
The fewshot context.
"""
if rnd is None:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd`"
)
if self.fewshot_rnd is not None:
rnd = self.fewshot_rnd
else:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd`"
)
description = description if description else ""
......@@ -715,6 +721,11 @@ class Task(abc.ABC):
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
def set_fewshot_seed(self, seed: Optional[int] = None) -> None:
self.fewshot_rnd = random.Random(seed)
if hasattr(self, "sampler"):
self.sampler.rnd = self.fewshot_rnd
@property
def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
if self.has_test_docs():
......@@ -891,11 +902,29 @@ class ConfigurableTask(Task):
self.prompt = None
if self.fewshot_docs() is not None:
self.sampler = samplers.get_sampler(
self.fewshot_rnd = (
random.Random()
) # setting with no seed, to be overridden at a later time
config_sampler: Union[str, Callable] = (
self.config.fewshot_config.get("sampler", "default")
if self.config.fewshot_config
else "default"
)(list(self.fewshot_docs()), self, rnd=random.Random(1234))
)
if isinstance(config_sampler, str):
self.sampler = samplers.get_sampler(config_sampler)(
list(self.fewshot_docs()), self, rnd=self.fewshot_rnd
)
elif callable(config_sampler) and issubclass(
config_sampler, samplers.ContextSampler
):
self.sampler = config_sampler(
docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd
)
else:
raise TypeError(
f"fewshot_config.sampler should be a string or callable of ContextSampler type, "
f"not {type(config_sampler)}"
)
self.task_docs = self.eval_docs
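# A custom sampler can now be supplied through a task's fewshot_config, e.g.
# (hypothetical `utils.py` referenced from the task YAML as
# `fewshot_config: {sampler: !function utils.FirstNSampler}`):
#
#   class FirstNSampler(samplers.ContextSampler):
#       def sample(self, n):
#           return self.docs[:n]  # deterministic: first n fewshot docs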
......
import itertools
import json
import logging
import random
import time
......@@ -28,7 +29,13 @@ from lm_eval.tasks import (
TaskManager,
get_task_dict,
)
from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
positional_deprecated,
simple_parse_args_string,
)
if TYPE_CHECKING:
......@@ -61,6 +68,7 @@ def simple_evaluate(
random_seed: int = 0,
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -108,6 +116,8 @@ def simple_evaluate(
Random seed for numpy. If set to None, the seed will not be set.
:param torch_random_seed: int
Random seed for torch. If set to None, the seed will not be set.
:param fewshot_random_seed: int
Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
:return
Dictionary of results
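# Illustrative Python-API call exercising the new argument (model and task
# choices are examples only):
#   lm_eval.simple_evaluate(model="hf", model_args="pretrained=EleutherAI/pythia-160m",
#                           tasks=["hellaswag"], num_fewshot=5, fewshot_random_seed=1234)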
......@@ -156,15 +166,6 @@ def simple_evaluate(
if model_args is None:
eval_logger.warning("model_args not specified. Using defaults.")
model_args = ""
if "pretrained" not in model_args and model in [
"hf-auto",
"hf",
"huggingface",
"vllm",
]:
eval_logger.warning(
"pretrained not specified. Using default pretrained=gpt2."
)
if isinstance(model_args, dict):
eval_logger.info(
......@@ -217,7 +218,7 @@ def simple_evaluate(
task_dict = get_task_dict(tasks, task_manager)
def _adjust_config(task_dict):
def _adjust_config(task_dict, predict_only):
adjusted_task_dict = {}
for task_name, task_obj in task_dict.items():
if isinstance(task_obj, dict):
......@@ -252,6 +253,10 @@ def simple_evaluate(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (
......@@ -263,7 +268,7 @@ def simple_evaluate(
return adjusted_task_dict
task_dict = _adjust_config(task_dict)
task_dict = _adjust_config(task_dict, predict_only)
results = evaluate(
lm=lm,
task_dict=task_dict,
......@@ -288,16 +293,28 @@ def simple_evaluate(
results["config"] = {
"model": model_name,
"model_args": model_args,
"batch_size": batch_size,
"batch_sizes": (
list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
),
"device": device,
"use_cache": use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"gen_kwargs": gen_kwargs,
}
# add more detailed model info if available
if isinstance(lm, lm_eval.models.huggingface.HFLM):
results["config"].update(lm.get_model_info())
# add info about execution
results["config"].update(
{
"batch_size": batch_size,
"batch_sizes": (
list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
),
"device": device,
"use_cache": use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"gen_kwargs": gen_kwargs,
"random_seed": random_seed,
"numpy_seed": numpy_random_seed,
"torch_seed": torch_random_seed,
"fewshot_seed": fewshot_random_seed,
}
)
results["git_hash"] = get_git_commit_hash()
results["date"] = start_date
add_env_info(results) # additional environment info to results
......@@ -365,7 +382,6 @@ def evaluate(
eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
)
if write_out:
print_writeout(task)
# aggregate Instances by LM method requested to get output.
......@@ -451,6 +467,16 @@ def evaluate(
"filtered_resps": [
req.filtered_resps[filter_key] for req in requests
],
"doc_hash": hash_string(
json.dumps(
requests[0].doc,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
),
"prompt_hash": hash_string(requests[0].arguments[0]),
"target_hash": hash_string(str(target)),
}
example.update(metrics)
task_output.logged_samples.append(example)
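# `hash_string` is imported from lm_eval.utils; a plausible implementation
# (a sketch, assuming a SHA-256 hex digest for the fingerprints above):
#   def hash_string(string: str) -> str:
#       return hashlib.sha256(string.encode("utf-8")).hexdigest()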
......@@ -612,6 +638,16 @@ def evaluate(
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())),
"n-samples": {
task_output.task_name: {
"original": len(task_output.task.eval_docs),
"effective": min(
limit if limit else len(task_output.task.eval_docs),
len(task_output.task.eval_docs),
),
}
for task_output in eval_tasks
},
}
if log_samples:
results_dict["samples"] = dict(samples)
......
......@@ -5,8 +5,9 @@ import sys
from typing import List, Optional, Tuple, Union
from lm_eval.api import metrics
from lm_eval.utils import eval_logger, positional_deprecated
from lm_eval.tasks import ConfigurableGroup
from lm_eval.utils import eval_logger, positional_deprecated
class TaskOutput:
"""
......@@ -198,9 +199,7 @@ def prepare_print_tasks(
task_agg[name].pop("samples")
if from_configurable_group and (" " not in results[name]):
group_tab_string = (
" " * group_depth + "- " if group_depth > 0 else ""
)
group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
group_agg[name] = results[name].copy()
group_agg[name]["alias"] = group_tab_string + alias
if "samples" in group_agg[name]:
......
from functools import partial
from typing import List, Union
from typing import List
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.registry import get_filter
from . import extraction, selection, transformation
FILTER_REGISTRY = {
"take_first": selection.TakeFirstFilter,
"regex": extraction.RegexFilter,
"majority_vote": selection.MajorityVoteFilter,
"take_first_k": selection.TakeKFilter,
"remove_whitespace": extraction.WhitespaceFilter,
"lowercase": transformation.LowercaseFilter,
"uppercase": transformation.UppercaseFilter,
"map": transformation.MapFilter,
"multi_choice_regex": extraction.MultiChoiceRegexFilter,
# TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
# "arg_max": selection.ArgMaxFilter,
}
def get_filter(filter_name: str) -> Union[type, str]:
if filter_name in FILTER_REGISTRY:
return FILTER_REGISTRY[filter_name]
else:
return filter_name
def build_filter_ensemble(
filter_name: str, components: List[List[str]]
) -> FilterEnsemble:
......
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("decontaminate")
class DecontaminationFilter(Filter):
"""
......
......@@ -3,8 +3,10 @@ import sys
import unicodedata
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("regex")
class RegexFilter(Filter):
""" """
......@@ -49,6 +51,7 @@ class RegexFilter(Filter):
return filtered_resps
@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
""" """
......@@ -71,6 +74,7 @@ class WhitespaceFilter(Filter):
return filtered_resps
@register_filter("multi_choice_regex")
class MultiChoiceRegexFilter(RegexFilter):
"""
A filter used to extract a model's answer on multiple choice questions with
......
from collections import Counter
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
@register_filter("take_first")
class TakeFirstFilter(Filter):
def __init__(self) -> None:
"""
......@@ -16,6 +23,7 @@ class TakeFirstFilter(Filter):
return map(lambda r: r[0], resps)
@register_filter("take_first_k")
class TakeKFilter(Filter):
def __init__(self, **kwargs) -> None:
self.k = kwargs.pop("k")
......@@ -32,6 +40,7 @@ class TakeKFilter(Filter):
return map(lambda r: r[: self.k], resps)
@register_filter("majority_vote")
class MajorityVoteFilter(Filter):
def __init__(self) -> None:
"""
......
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("lowercase")
class LowercaseFilter(Filter):
def __init__(self) -> None:
pass
......@@ -12,6 +14,7 @@ class LowercaseFilter(Filter):
return [filter_set(resp) for resp in resps]
@register_filter("uppercase")
class UppercaseFilter(Filter):
def __init__(self) -> None:
pass
......@@ -23,6 +26,7 @@ class UppercaseFilter(Filter):
return [filter_set(resp) for resp in resps]
@register_filter("map")
class MapFilter(Filter):
def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
"""
......
from .evaluation_tracker import EvaluationTracker
from .wandb_logger import WandbLogger
import json
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
)
@dataclass(init=False)
class GeneralConfigTracker:
"""
Tracker for the evaluation parameters.
Attributes:
model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
model_name (str): Name of the model.
model_name_sanitized (str): Sanitized model name for directory creation.
start_time (float): Start time of the experiment. Logged at class init.
end_time (float): End time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`].
total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
"""
model_source: str = None
model_name: str = None
model_name_sanitized: str = None
start_time: float = None
end_time: float = None
total_evaluation_time_seconds: str = None
def __init__(self) -> None:
"""Starts the evaluation timer."""
self.start_time = time.perf_counter()
@staticmethod
def _get_model_name(model_args: str) -> str:
"""Extracts the model name from the model arguments."""
def extract_model_name(model_args: str, key: str) -> str:
"""Extracts the model name from the model arguments using a key."""
args_after_key = model_args.split(key)[1]
return args_after_key.split(",")[0]
# order does matter, e.g. peft and delta are provided together with pretrained
prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
for prefix in prefixes:
if prefix in model_args:
return extract_model_name(model_args, prefix)
return ""
def log_experiment_args(
self,
model_source: str,
model_args: str,
) -> None:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
)
def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time."""
self.end_time = time.perf_counter()
self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
class EvaluationTracker:
"""
Keeps track and saves relevant information of the evaluation process.
Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
def __init__(
self,
output_path: str = None,
hub_results_org: str = "",
hub_repo_name: str = "",
push_results_to_hub: bool = False,
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
"""
self.general_config_tracker = GeneralConfigTracker()
self.output_path = output_path
self.hub_results_org = hub_results_org
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
self.push_results_to_hub = push_results_to_hub
self.push_samples_to_hub = push_samples_to_hub
self.public_repo = public_repo
self.api = HfApi(token=token) if token else None
def save_results_aggregated(
self,
results: dict,
samples: dict,
) -> None:
"""
Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
Args:
results (dict): The aggregated results to save.
samples (dict): The samples results to save.
"""
self.general_config_tracker.log_end_time()
if self.output_path:
try:
eval_logger.info("Saving results aggregated")
# calculate cumulative hash for each task - only if samples are provided
task_hashes = {}
if samples:
for task_name, task_samples in samples.items():
sample_hashes = [
s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
for s in task_samples
]
task_hashes[task_name] = hash_string("".join(sample_hashes))
# update initial results dict
results.update({"task_hashes": task_hashes})
results.update(asdict(self.general_config_tracker))
dumped = json.dumps(
results,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
self.date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
self.api.create_repo(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save results aggregated")
eval_logger.info(repr(e))
else:
eval_logger.info(
"Output path not provided, skipping saving results aggregated"
)
def save_results_samples(
self,
task_name: str,
samples: dict,
) -> None:
"""
Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
Args:
task_name (str): The task name to save the samples for.
samples (dict): The samples results to save.
"""
if self.output_path:
try:
eval_logger.info("Saving samples results")
samples_dumped = json.dumps(
samples,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.json"
)
file_results_samples.write_text(samples_dumped, encoding="utf-8")
if self.api and self.push_samples_to_hub:
self.api.create_repo(
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save sample results")
eval_logger.info(repr(e))
else:
eval_logger.info("Output path not provided, skipping saving sample results")
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
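# Typical use is as a `json.dumps` fallback, mirroring the calls elsewhere in
# this commit (illustrative; set ordering in the output may vary):
#   json.dumps({"count": np.int64(3), "tags": {"a", "b"}},
#              default=_handle_non_serializable)
#   # -> '{"count": 3, "tags": ["a", "b"]}'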
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
import copy
import json
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from typing import Any, Dict, List, Literal, Tuple
import numpy as np
import pandas as pd
from packaging.version import Version
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
from lm_eval.logging.utils import _handle_non_serializable, remove_none_pattern
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
logger = logging.getLogger(__name__)
def get_wandb_printer() -> Literal["Printer"]:
......@@ -395,61 +350,3 @@ class WandbLogger:
self._log_samples_as_artifact(eval_preds, task_name)
self.run.log({f"{group}_eval_results": grouped_df})
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
......@@ -13,6 +13,7 @@ from accelerate import (
InitProcessGroupKwargs,
find_executable_batch_size,
)
from huggingface_hub import HfApi
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
......@@ -77,7 +78,7 @@ class HFLM(TemplateLM):
def __init__(
self,
pretrained: Optional[Union[str, transformers.PreTrainedModel]] = "gpt2",
pretrained: Union[str, transformers.PreTrainedModel],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main",
......@@ -278,7 +279,10 @@ class HFLM(TemplateLM):
)
self._max_length = max_length
self.pretrained = pretrained
self.delta = delta
self.peft = peft
self.revision = revision
self.batch_schedule = 1
self.batch_sizes = {}
self.max_batch_size = max_batch_size
......@@ -663,6 +667,8 @@ class HFLM(TemplateLM):
max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
else:
max_length = self.max_length
max_context_enc = max_length
max_cont_enc = max_length
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
......@@ -1272,3 +1278,44 @@ class HFLM(TemplateLM):
pbar.close()
return res
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
"""
def get_model_num_params(model) -> int:
if hasattr(model, "num_parameters"):
return model.num_parameters()
if hasattr(model, "parameters"):
return sum(p.numel() for p in model.parameters())
else:
return -1
def get_model_dtype(model) -> str:
if hasattr(model, "dtype"):
return model.dtype
else:
return ""
def get_model_sha(pretrained: str, revision: str) -> str:
try:
model_info = HfApi().model_info(repo_id=pretrained, revision=revision)
return model_info.sha
except Exception as e:
eval_logger.warning(
f"Failed to get model SHA for {pretrained} at revision {revision}. Error: {e}"
)
return ""
model_info = {
"model_num_parameters": get_model_num_params(self._model),
"model_dtype": get_model_dtype(self._model),
"model_revision": self.revision,
"model_sha": get_model_sha(self.pretrained, self.revision),
}
if self.peft:
model_info["peft_sha"] = get_model_sha(self.peft, self.revision)
if self.delta:
model_info["delta_sha"] = get_model_sha(self.delta, self.revision)
return model_info
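# Shape of the returned dict (values are placeholders, not real output):
#   {"model_num_parameters": <int>, "model_dtype": torch.float16,
#    "model_revision": "main", "model_sha": "<commit sha>"}
# plus "peft_sha" / "delta_sha" when peft or delta checkpoints are used.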
......@@ -14,13 +14,11 @@ from lm_eval.models.utils import retry_on_specific_exceptions
from lm_eval.utils import eval_logger
def get_result(response, ctxlen: int) -> Tuple[float, bool]:
def get_result(response) -> Tuple[float, bool]:
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
......@@ -29,9 +27,9 @@ def get_result(response, ctxlen: int) -> Tuple[float, bool]:
"""
is_greedy = True
logprobs = response.logprobs.token_logprobs
continuation_logprobs = sum(logprobs[ctxlen:])
continuation_logprobs = sum(logprobs)
for i in range(ctxlen, len(response.logprobs.token_logprobs)):
for i in range(len(response.logprobs.token_logprobs)):
token = response.logprobs.token_logprobs[i]
top_tokens = response.logprobs.top_logprobs[i]
top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
......@@ -212,7 +210,6 @@ class OpenaiCompletionsLM(TemplateLM):
client=self.client,
model=self.model,
prompt=inps,
echo=True,
max_tokens=0,
temperature=0.0,
logprobs=10,
......@@ -222,7 +219,7 @@ class OpenaiCompletionsLM(TemplateLM):
for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
response.choices, ctxlens, chunk
):
answer = get_result(resp, ctxlen)
answer = get_result(resp)
res.append(answer)
......@@ -433,7 +430,7 @@ class OpenaiChatCompletionsLM(LM):
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [kwargs]
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
......