Commit d95a4333 authored by Fabrizio Milo's avatar Fabrizio Milo
Browse files

fix codespell

parent 121b7096
......@@ -35,8 +35,8 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
args: [
"--ignore-words-list=reord", # Word used in error messages that need rewording
--check-filenames,
--check-hidden,
]
exclude: >
(?x)^(
.*\.json|ignore.txt
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
ROUGE
rouge
nin
......@@ -51,7 +51,7 @@ class LM(abc.ABC):
- We will use the full max context length of the model.
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
the max context length.
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementaitons
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
which may simply concatenate multiple documents together.
- IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
multiple chunks, the last input will still be a full-sized context.
......@@ -234,9 +234,9 @@ class BaseLM(LM):
return -len(toks), tuple(toks)
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(
tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size
tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
):
inps = []
cont_toks_list = []
......@@ -327,10 +327,10 @@ class BaseLM(LM):
res.append(answer)
return reord.get_original(res)
return re_ord.get_original(res)
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# TODO: implement fully general `until` that handles `until` values that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
......@@ -340,9 +340,9 @@ class BaseLM(LM):
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
for context, until in tqdm(re_ord.get_reordered()):
if isinstance(until, str):
until = [until]
......@@ -366,7 +366,7 @@ class BaseLM(LM):
res.append(s)
return reord.get_original(res)
return re_ord.get_original(res)
class Task(abc.ABC):
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Custom DROP dataet that, unlike HF, keeps all question-answer pairs
# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
# even if there are multiple types of answers for the same question.
"""DROP dataset."""
......
......@@ -61,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
return (
"To use SAT Analogy Questions you have to download it manually. Please "
"email Peter Turney to request the data (https://www.apperceptual.com). "
"Once you recieve a download link for the dataset, supply the local path "
"Once you receive a download link for the dataset, supply the local path "
"as the `data_dir` arg: "
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
)
......
......@@ -158,7 +158,7 @@ class Janitor:
def clean(self, dirty_string):
"""Clean a string (e.g. a training set) by removing all ngrams previously
reigstered as contaminants. Returns a list of clean chunks, or empty if
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
if JANITOR_CPP:
return self.clean_cpp(dirty_string)
......@@ -275,7 +275,7 @@ class Janitor:
# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
# would last, took a cautious approach, prefering to save the revenue rather than investing it in
# would last, took a cautious approach, preferring to save the revenue rather than investing it in
# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
......
......@@ -25,7 +25,7 @@ class HFLM(BaseLM):
self._device = torch.device(device)
print(f"Using device '{device}'")
else:
print("Device not specificed")
print("Device not specified")
print(f"Cuda Available? {torch.cuda.is_available()}")
self._device = (
torch.device("cuda")
......
......@@ -124,10 +124,10 @@ class GPT3LM(BaseLM):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)),
list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
disable=disable_tqdm,
):
inps = []
......@@ -163,7 +163,7 @@ class GPT3LM(BaseLM):
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return reord.get_original(res)
return re_ord.get_original(res)
def greedy_until(self, requests):
if not requests:
......@@ -174,7 +174,7 @@ class GPT3LM(BaseLM):
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
re_ord = utils.Reorderer(requests, _collate)
def sameuntil_chunks(xs, size):
ret = []
......@@ -191,7 +191,7 @@ class GPT3LM(BaseLM):
# todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(
list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))
list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
):
inps = []
for context, _ in chunk:
......@@ -219,7 +219,7 @@ class GPT3LM(BaseLM):
res.append(s)
return reord.get_original(res)
return re_ord.get_original(res)
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
......
......@@ -74,16 +74,16 @@ class DROP(Task):
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
vas = []
valid_answers = []
for i in range(len(validated_answers["number"])):
vas.append(
valid_answers.append(
{
"number": validated_answers["number"][i],
"date": validated_answers["date"][i],
"spans": validated_answers["spans"][i],
}
)
return vas
return valid_answers
answers = []
answers_set = set()
......
......@@ -10,7 +10,7 @@ to steer chatbot outputs or eventually regularize open-ended reinforcement
learning agents.
NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
tasks are referred to in this work as the `em` sub-metric. See Section 3. Metrics.
of the paper.
Homepage: https://github.com/hendrycks/ethics
......@@ -323,7 +323,7 @@ class EthicsUtilitarianism(Ethics):
}
def doc_to_text(self, doc):
return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferrable?\nAnswer:".format(
return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferable?\nAnswer:".format(
doc["scenarios"][0], doc["scenarios"][1]
)
......
......@@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
1) Collects all contamination text files that are to be removed from training data
2) Filters training data by finding `N`gram matches between the training data
and any contamination
1) `N`grams ignore case and punctation and are split on whitespace.
1) `N`grams ignore case and punctuation and are split on whitespace.
2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
......@@ -20,7 +20,7 @@ minimum_slice_length = 200
too_dirty_cutoff = 10
```
## Compling
## Compiling
Janitor can be used as a pure python program, but it is much faster if the ngram
code is run in C++. To compile the C++ code, run
......
......@@ -63,7 +63,7 @@ def process_bucket(
for line in bucket.read():
[ngram, document_id] = line.rsplit(" ", 1)
# Write ngram if more then 10 unique document occurences
# Write ngram if more than 10 unique document occurrences
if ngram != current_ngram:
if len(current_ngram_document_ids) > 10:
output_archive.add_data(
......
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
......
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
......@@ -45,7 +45,7 @@ class NewTask(Task):
if self._training_docs is None:
# TODO: Return the training document generator from `self.dataset`.
# If you need to process the data, `map` over the documents with
# the custom procesing function, `self._process_doc`. E.g.
# the custom processing function, `self._process_doc`. E.g.
# `map(self._process_doc, self.dataset["validation"])`
# In most cases you can leave this as is unless the dataset split is
# named differently than the default `"train"`.
......@@ -56,7 +56,7 @@ class NewTask(Task):
if self.has_validation_docs():
# TODO: Return the validation document generator from `self.dataset`.
# If you need to process the data, `map` over the documents with the
# custom procesing function, `self._process_doc`. E.g.
# custom processing function, `self._process_doc`. E.g.
# `map(self._process_doc, self.dataset["validation"])`
# In most cases you can leave this as is unless the dataset split is
# named differently than the default `"validation"`.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment