Unverified commit ef332026 authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #931 from EleutherAI/fix-generate-until

[Refactor] Generate_until rename
parents a8d130ab e66ba123
......@@ -155,14 +155,14 @@ A full accounting of the supported and planned libraries + APIs can be seen belo
| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|-----------------------------|---------------------------------|----------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------|
| OpenAI Completions | :heavy_check_mark: | `openai`, `openai-completions`, `gooseai` | up to `code-davinci-002` | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :x: Not yet - needs help! | N/A | (link here?) | `greedy_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `greedy_until` (no logprobs) |
| GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
| Textsynth | Needs testing | `textsynth` | ??? | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
| Cohere | :hourglass: - blocked on Cohere API bug | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
| GGML | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617) | N/A | ??? | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
| vLLM | :x: Not yet - needs help! | N/A | All HF models | `greedy_until` (no logprobs) |
| OpenAI Completions | :heavy_check_mark: | `openai`, `openai-completions`, `gooseai` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :x: Not yet - needs help! | N/A | (link here?) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
| GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Textsynth | Needs testing | `textsynth` | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Cohere | :hourglass: - blocked on Cohere API bug | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| GGML | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617) | N/A | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| vLLM | :x: Not yet - needs help! | N/A | All HF models | `generate_until` (no logprobs) |
| Your inference server here! | ... | ... | ... | ... |

It is on our roadmap to create task variants designed to enable models that do not serve logprobs/loglikelihoods to be compared with the generation performance of open-source models.
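For orientation, a hedged sketch of driving one of the API backends from the table above through the Python API; the task choice and the `engine` argument key are illustrative assumptions rather than documented values (check the backend's LM class for the arguments it actually accepts):

```python
# Sketch only: evaluating an API backend from the table above.
import lm_eval

results = lm_eval.simple_evaluate(
    model="openai-completions",            # `--model <xxx>` name from the table
    model_args="engine=code-davinci-002",  # backend-specific args; the key name is an assumption
    tasks=["lambada_openai"],              # hypothetical task choice
)
print(results["results"])
```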
......
......@@ -57,7 +57,7 @@ import lm_eval
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`
results = lm_eval.simple_evaluate( # call simple_evaluate
model=lm_obj,
......@@ -83,7 +83,7 @@ from my_tasks import MyTask1 # suppose you've defined a custom lm_eval.api.Task
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`
......
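To make the truncated snippet above easier to follow, here is a hedged sketch of the full call; `Your_LM`, `my_model`, and the task names are placeholders carried over from the example, not real classes:

```python
# Minimal sketch, assuming Your_LM subclasses lm_eval.api.model.LM and implements
# loglikelihood(), loglikelihood_rolling(), and generate_until().
import lm_eval

my_model = initialize_my_model()      # your own model construction code
lm_obj = Your_LM(model=my_model, batch_size=16)

results = lm_eval.simple_evaluate(
    model=lm_obj,                     # pass the LM object directly
    tasks=["hellaswag", "arc_easy"],  # task names assumed to exist in the registry
    num_fewshot=0,
)
print(results["results"])             # dict of per-task metrics
```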
......@@ -44,26 +44,24 @@ class MyCustomLM(LM):
#...
def greedy_until(self, requests: list[Instance]) -> list[str]:
def generate_until(self, requests: list[Instance]) -> list[str]:
#...
#...
```
Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with property `args` which returns a tuple of (context, continuation).
We support
We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.
The three types of
All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name.
- `generate_until`
  - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
  - Using this input and these generation parameters, text is sampled from the language model (typically until a maximum output length or one of the specified stop sequences is reached), and the resulting string is returned.
- `loglikelihood`
  - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string. The result is the log probability of the target string conditioned on the input, along with a flag indicating whether the target would be produced by greedy decoding.
smth smth tokenizer-agnostic
3 reqtypes
- greedy_until, and the arguments passed to it
- loglikelihood, and args passed to it
- loglikelihood_rolling, and args passed to it
- `loglikelihood_rolling`
  - Each request contains `Instance.args : Tuple[str]`: a single input string whose full loglikelihood is computed (conditioned only on the EOT token). This is used to evaluate perplexity on a data distribution.
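As a hedged illustration of how a backend might consume these three request types, here is a minimal subclass sketch; `my_generate` and `my_score` are hypothetical stand-ins for your own inference code, not harness APIs:

```python
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM


def my_generate(context: str, stop: list[str], max_new_tokens: int) -> str:
    ...  # hypothetical: call your model's generation here


def my_score(context: str, continuation: str) -> tuple[float, bool]:
    ...  # hypothetical: return (loglikelihood of continuation given context, is_greedy)


class MyCustomLM(LM):
    def generate_until(self, requests: list[Instance]) -> list[str]:
        out = []
        for request in requests:
            context, gen_kwargs = request.args  # (input string, generation-control kwargs)
            out.append(
                my_generate(
                    context,
                    stop=gen_kwargs.get("until", []),
                    max_new_tokens=gen_kwargs.get("max_gen_toks", 256),
                )
            )
        return out

    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        # (context, continuation) pairs -> (logprob of continuation, would greedy decoding produce it?)
        return [my_score(ctx, cont) for ctx, cont in (r.args for r in requests)]

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        # full-string loglikelihood, used for perplexity-style tasks
        return [my_score("", text)[0] for (text,) in (r.args for r in requests)]
```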
## Registration
......
......@@ -32,7 +32,7 @@ Prompting / in-context formatting options:
- **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, will overwrite `doc_to_text`, `doc_to_target`, and `doc_to_choice`.
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model.
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the list of choices given by `doc_to_choice`. (An example of the callable form follows this list.)
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `greedy_until` tasks.
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
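Where the callable form of `doc_to_text` / `doc_to_target` is used, a hedged example (the `question` and `answer` keys depend entirely on the dataset and are assumptions here):

```python
# Hypothetical callables mapping a dataset sample (a dict) to model input and target.
def doc_to_text(doc: dict) -> str:
    return f"Question: {doc['question']}\nAnswer:"


def doc_to_target(doc: dict) -> str:
    return doc["answer"]
```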
......@@ -42,7 +42,7 @@ Runtime configuration options:
Scoring details:
- **metric_list** (`str`, *optional*, defaults to None) — A list of metrics to use for evaluation. See docs for expected format.
- **output_type** (`str`, *optional*, defaults to "greedy_until") — Selects the type of model output for the given task. Options are `greedy_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
- **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
- **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments passed to the `generate` function from the HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes. (An illustrative example follows this list.)
- **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through the model for each sample. Can be used for cases such as self-consistency.
- **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API.
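For `generate_until` tasks, `generation_kwargs` might look like the hedged example below; `until` and `max_gen_toks` are harness-level controls, while the remaining keys are passed through to HF `generate()`, so support varies by backend:

```python
# Illustrative values only; supported keys depend on the model backend.
generation_kwargs = {
    "until": ["\n\n"],    # stop sequences that end generation
    "max_gen_toks": 256,  # cap on generated tokens
    "do_sample": False,   # greedy decoding
    "temperature": 0.0,
}
```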
......
......@@ -4,7 +4,7 @@ from typing import Literal, Tuple
@dataclass
class Instance:
request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
request_type: Literal["loglikelihood", "loglikelihood_rolling", "generate_until"]
doc: dict
arguments: tuple
idx: int
......
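A hedged example of constructing such an `Instance` by hand; the document contents and stop sequence are invented for illustration:

```python
from lm_eval.api.instance import Instance

# A single generation request: `arguments` carries the prompt string and the
# generation-control kwargs, mirroring the `generate_until` request type.
req = Instance(
    request_type="generate_until",
    doc={"question": "2 + 2 = ?"},                      # the source example, as a dict
    arguments=("Q: 2 + 2 = ?\nA:", {"until": ["\n"]}),  # (prompt, generation kwargs)
    idx=0,                                              # index within this doc's requests
)
```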
......@@ -212,7 +212,7 @@ def f1_fn(items): # This is a passthrough function
@register_metric(
metric="bleu",
higher_is_better=True,
output_type="greedy_until",
output_type="generate_until",
aggregation="bleu",
)
def bleu_fn(items): # This is a passthrough function
......@@ -222,7 +222,7 @@ def bleu_fn(items): # This is a passthrough function
@register_metric(
metric="chrf",
higher_is_better=True,
output_type="greedy_until",
output_type="generate_until",
aggregation="chrf",
)
def chrf_fn(items): # This is a passthrough function
......@@ -232,7 +232,7 @@ def chrf_fn(items): # This is a passthrough function
@register_metric(
metric="ter",
higher_is_better=True,
output_type="greedy_until",
output_type="generate_until",
aggregation="ter",
)
def ter_fn(items): # This is a passthrough function
......
......@@ -211,12 +211,12 @@ class CachingLM:
)
for req in tqdm(requests):
hsh = hash_args(attr, req.args)
if attr == "greedy_until" and req.args[1].get("do_sample", False):
if attr == "generate_until" and req.args[1].get("do_sample", False):
# when we are doing non-greedy generation, don't use the cache
# (else every "randomly sampled" generation would be identical for repeats > 1).
if not warned:
eval_logger.warning(
f"Arguments to lm.greedy_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
)
warned = True
res.append(None)
......
......@@ -81,7 +81,7 @@ DEFAULT_METRIC_REGISTRY = {
],
"loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
"multiple_choice": ["acc", "acc_norm"],
"greedy_until": ["exact_match"],
"generate_until": ["exact_match"],
}
......@@ -171,7 +171,6 @@ def is_higher_better(metric_name):
try:
return HIGHER_IS_BETTER_REGISTRY[metric_name]
except KeyError:
raise Warning(f"higher_is_better not specified for metric '{metric_name}'!")
eval_logger.warning(
f"higher_is_better not specified for metric '{metric_name}'!"
)
......@@ -23,7 +23,7 @@ class DryrunLM(LM):
return res
def greedy_until(self, requests):
def generate_until(self, requests):
res = []
for ctx, _ in requests:
......
......@@ -15,10 +15,10 @@ class Test_HFLM:
multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
greedy_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")() # type: ignore
greedy_until_task.build_all_requests(limit=10, rank=0, world_size=1)
greedy_until_task._config.generation_kwargs["max_gen_toks"] = 10
GREEDY_UNTIL: list[Instance] = greedy_until_task.instances
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")() # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until: list[Instance] = generate_until_task.instances
rolling_task = tasks.TASK_REGISTRY.get("wikitext")() # type: ignore
rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
ROLLING: list[Instance] = rolling_task.instances
......@@ -65,7 +65,7 @@ class Test_HFLM:
-52.70050811767578,
-56.25089645385742,
]
GREEDY_UNTIL_RES = [
generate_until_RES = [
" The average of $2.50 each is $",
" A robe takes 2 bolts of blue fiber and half",
" $50,000 in repairs.",
......@@ -109,9 +109,9 @@ class Test_HFLM:
), np.argmax(np.array(_res).reshape(-1, 4), axis=1)
assert (argmax_RES == argmax_res).all()
def test_greedy_until(self) -> None:
res = self.LM.greedy_until(self.GREEDY_UNTIL)
assert res == self.GREEDY_UNTIL_RES
def test_generate_until(self) -> None:
res = self.LM.generate_until(self.generate_until)
assert res == self.generate_until_RES
def test_logliklihood_rolling(self) -> None:
res = self.LM.loglikelihood_rolling(self.ROLLING)
......
......@@ -78,7 +78,7 @@ def test_gpt2():
# test empty context
gpt2.loglikelihood([("", "test")])
(gen,) = gpt2.greedy_until(
(gen,) = gpt2.generate_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
......@@ -204,7 +204,7 @@ def test_gpt3():
# test empty context
gpt3.loglikelihood([("", "test")])
(gen,) = gpt3.greedy_until(
(gen,) = gpt3.generate_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
......@@ -300,7 +300,7 @@ def test_textsynth():
# test empty context
textsynth.loglikelihood([("", "test")])
(gen,) = textsynth.greedy_until(
(gen,) = textsynth.generate_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
......
......@@ -98,9 +98,9 @@ def test_versions_stable(taskname, task_class):
return res
def greedy_until(reqs):
def generate_until(reqs):
res = []
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-generate_until", reqs)
for ctx, _ in [req.args for req in reqs]:
res.append("lol")
......@@ -110,7 +110,7 @@ def test_versions_stable(taskname, task_class):
lm.loglikelihood = ll_fn
lm.loglikelihood_rolling = ll_perp_fn
lm.greedy_until = greedy_until
lm.generate_until = generate_until
limit = None
result = evaluator.evaluate(
......