fix codespell

d95a4333 · Fabrizio Milo · 121b7096 · d95a4333 · d95a4333 · d95a4333
Commit d95a4333 authored May 02, 2022 by Fabrizio Milo
14 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -35,8 +35,8 @@ repos:
    rev: v2.1.0
    hooks:
      - id: codespell
-        args: [
-            "--ignore-words-list=reord", # Word used in error messages that need rewording
-            --check-filenames,
-            --check-hidden,
-          ]
+        exclude: >
+          (?x)^(
+              .*\.json|ignore.txt
+          )$
+        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
--- a/ignore.txt
+++ b/ignore.txt
+ROUGE
+rouge
+nin
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
@@ -51,7 +51,7 @@ class LM(abc.ABC):
        - We will use the full max context length of the model.
        - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
        the max context length.
-        - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementaitons
+        - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
          which may simply concatenate multiple documents together.
        - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
          multiple chunks, the last input will still a full-sized context.
@@ -234,9 +234,9 @@ class BaseLM(LM):
            return -len(toks), tuple(toks)

        # TODO: automatic (variable) batch size detection for vectorization
-        reord = utils.Reorderer(requests, _collate)
+        re_ord = utils.Reorderer(requests, _collate)
        for chunk in utils.chunks(
-            tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size
+            tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
        ):
            inps = []
            cont_toks_list = []
@@ -327,10 +327,10 @@ class BaseLM(LM):

                res.append(answer)

-        return reord.get_original(res)
+        return re_ord.get_original(res)

    def greedy_until(self, requests):
-        # TODO: implement fully general `until` that handles untils that are
+        # TODO: implement fully general `until` that handles `until` values that are
        #       multiple tokens or that span multiple tokens correctly

        # TODO: extract to TokenizedLM?
@@ -340,9 +340,9 @@ class BaseLM(LM):
            toks = self.tok_encode(x[0])
            return len(toks), x[0]

-        reord = utils.Reorderer(requests, _collate)
+        re_ord = utils.Reorderer(requests, _collate)

-        for context, until in tqdm(reord.get_reordered()):
+        for context, until in tqdm(re_ord.get_reordered()):
            if isinstance(until, str):
                until = [until]

@@ -366,7 +366,7 @@ class BaseLM(LM):

            res.append(s)

-        return reord.get_original(res)
+        return re_ord.get_original(res)


 class Task(abc.ABC):

--- a/lm_eval/datasets/drop/drop.py
+++ b/lm_eval/datasets/drop/drop.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Custom DROP dataet that, unlike HF, keeps all question-answer pairs
+# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
 # even if there are multiple types of answers for the same question.
 """DROP dataset."""


--- a/lm_eval/datasets/sat_analogies/sat_analogies.py
+++ b/lm_eval/datasets/sat_analogies/sat_analogies.py
@@ -61,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
        return (
            "To use SAT Analogy Questions you have to download it manually. Please "
            "email Peter Turney to request the data (https://www.apperceptual.com). "
-            "Once you recieve a download link for the dataset, supply the local path "
+            "Once you receive a download link for the dataset, supply the local path "
            "as the `data_dir` arg: "
            "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
        )

--- a/lm_eval/decontamination/janitor.py
+++ b/lm_eval/decontamination/janitor.py
@@ -158,7 +158,7 @@ class Janitor:

    def clean(self, dirty_string):
        """Clean a string (e.g. a training set) by removing all ngrams previously
-        reigstered as contaminants. Returns a list of clean chunks, or empty if
+        registered as contaminants. Returns a list of clean chunks, or empty if
        the string was too dirty"""
        if JANITOR_CPP:
            return self.clean_cpp(dirty_string)
@@ -275,7 +275,7 @@ class Janitor:
 #         ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958.  At first,
 #         oil money had a marginal impact.  A few lowrise concete buildings were erected, and the first
 #         paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
-#         would last, took a cautious approach, prefering to save the revenue rather than investing it in
+#         would last, took a cautious approach, preferring to save the revenue rather than investing it in
 #         development.  His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
 #         to transform Abu Dhabi.  The ruling Al Nahayan family decided that Sheikh Zayed should replace his
 #         brother as Ruler and carry out his vision of developing the country.  On [[August 6]], [[1966]],

--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
@@ -25,7 +25,7 @@ class HFLM(BaseLM):
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
-            print("Device not specificed")
+            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")

--- a/lm_eval/models/gpt3.py
+++ b/lm_eval/models/gpt3.py
@@ -124,10 +124,10 @@ class GPT3LM(BaseLM):
            toks = x[1] + x[2]
            return -len(toks), tuple(toks)

-        reord = utils.Reorderer(requests, _collate)
+        re_ord = utils.Reorderer(requests, _collate)

        for chunk in tqdm(
-            list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)),
+            list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
            disable=disable_tqdm,
        ):
            inps = []
@@ -163,7 +163,7 @@ class GPT3LM(BaseLM):
                if cache_key is not None:
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)

-        return reord.get_original(res)
+        return re_ord.get_original(res)

    def greedy_until(self, requests):
        if not requests:
@@ -174,7 +174,7 @@ class GPT3LM(BaseLM):
            toks = self.tok_encode(x[0])
            return len(toks), x[0]

-        reord = utils.Reorderer(requests, _collate)
+        re_ord = utils.Reorderer(requests, _collate)

        def sameuntil_chunks(xs, size):
            ret = []
@@ -191,7 +191,7 @@ class GPT3LM(BaseLM):

        # todo: more intelligent batching for heterogeneous `until`
        for chunk, until in tqdm(
-            list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))
+            list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
        ):
            inps = []
            for context, _ in chunk:
@@ -219,7 +219,7 @@ class GPT3LM(BaseLM):

                res.append(s)

-        return reord.get_original(res)
+        return re_ord.get_original(res)

    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens

--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
@@ -74,16 +74,16 @@ class DROP(Task):
            {"number": ['1', '8'], ...}
            -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
            """
-            vas = []
+            valid_answers = []
            for i in range(len(validated_answers["number"])):
-                vas.append(
+                valid_answers.append(
                    {
                        "number": validated_answers["number"][i],
                        "date": validated_answers["date"][i],
                        "spans": validated_answers["spans"][i],
                    }
                )
-            return vas
+            return valid_answers

        answers = []
        answers_set = set()

--- a/lm_eval/tasks/hendrycks_ethics.py
+++ b/lm_eval/tasks/hendrycks_ethics.py
@@ -10,7 +10,7 @@ to steer chatbot outputs or eventually regularize open-ended reinforcement
 learning agents.

 NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
-tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
+tasks are referred to in this work as the `em` sub-metric. See Section 3. Metrics.
 of the paper.

 Homepage: https://github.com/hendrycks/ethics
@@ -323,7 +323,7 @@ class EthicsUtilitarianism(Ethics):
        }

    def doc_to_text(self, doc):
-        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferrable?\nAnswer:".format(
+        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferable?\nAnswer:".format(
            doc["scenarios"][0], doc["scenarios"][1]
        )


--- a/scripts/clean_training_data/README.md
+++ b/scripts/clean_training_data/README.md
@@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
 1) Collects all contamination text files that are to be removed from training data
 2) Filters training data by finding `N`gram matches between the training data
   and any contamination
-   1) `N`grams ignore case and punctation and are split on whitespace.
+   1) `N`grams ignore case and punctuation and are split on whitespace.
   2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
    the match, splitting the training data into chunks
   3) Any chunks less than `minimum_slice_length` are removed
@@ -20,7 +20,7 @@ minimum_slice_length = 200
 too_dirty_cutoff = 10
 ```

-## Compling
+## Compiling

 Janitor can be used as a pure python program, but it is much faster if the ngram
 code is run in C++. To compile the C++ code, run

--- a/scripts/clean_training_data/process_sorted_buckets.py
+++ b/scripts/clean_training_data/process_sorted_buckets.py
@@ -63,7 +63,7 @@ def process_bucket(
    for line in bucket.read():
        [ngram, document_id] = line.rsplit(" ", 1)

-        # Write ngram if more then 10 unique document occurences
+        # Write ngram if more than 10 unique document occurrences
        if ngram != current_ngram:
            if len(current_ngram_document_ids) > 10:
                output_archive.add_data(

--- a/templates/new_multiple_choice_task.py
+++ b/templates/new_multiple_choice_task.py
 # TODO: Remove all TODO comments once the implementation is complete.
 """
 TODO: Add the Paper Title on this line.
-TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
+TODO: Add the paper's PDF URL (preferably from arXiv) on this line.

 TODO: Write a Short Description of the task.


--- a/templates/new_task.py
+++ b/templates/new_task.py
 # TODO: Remove all TODO comments once the implementation is complete.
 """
 TODO: Add the Paper Title on this line.
-TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
+TODO: Add the paper's PDF URL (preferably from arXiv) on this line.

 TODO: Write a Short Description of the task.

@@ -45,7 +45,7 @@ class NewTask(Task):
            if self._training_docs is None:
                # TODO: Return the training document generator from `self.dataset`.
                # If you need to process the data, `map` over the documents with
-                # the custom procesing function, `self._process_doc`. E.g.
+                # the custom processing function, `self._process_doc`. E.g.
                # `map(self._process_doc, self.dataset["validation"])`
                # In most case you can leave this as is unless the dataset split is
                # named differently than the default `"train"`.
@@ -56,7 +56,7 @@ class NewTask(Task):
        if self.has_validation_docs():
            # TODO: Return the validation document generator from `self.dataset`.
            # If you need to process the data, `map` over the documents with the
-            # custom procesing function, `self._process_doc`. E.g.
+            # custom processing function, `self._process_doc`. E.g.
            # `map(self._process_doc, self.dataset["validation"])`
            # In most case you can leave this as is unless the dataset split is
            # named differently than the default `"validation"`.