Commit d95a4333 authored by Fabrizio Milo

fix codespell

parent 121b7096
@@ -35,8 +35,8 @@ repos:
     rev: v2.1.0
     hooks:
       - id: codespell
-        args: [
-            "--ignore-words-list=reord", # Word used in error messages that need rewording
-            --check-filenames,
-            --check-hidden,
-        ]
+        exclude: >
+            (?x)^(
+                .*\.json|ignore.txt
+            )$
+        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
+ROUGE
+rouge
+nin
@@ -51,7 +51,7 @@ class LM(abc.ABC):
         - We will use the full max context length of the model.
         - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
           the max context length.
-        - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementaitons
+        - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
           which may simply concatenate multiple documents together.
         - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
           multiple chunks, the last input will still a full-sized context.
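An aside on the docstring in this hunk: it describes splitting an over-long tokenized document into max-context-length chunks while pulling the final chunk back so it still gets a full-sized context. A minimal, self-contained sketch of that windowing idea (illustrative only, not the harness's actual helper):

```python
def rolling_chunks(tokens, max_len):
    """Split `tokens` into chunks of at most `max_len` tokens.

    Chunks advance by `max_len`, except the final one, which is pulled back so it is
    still full-sized; the second element of each pair counts how many trailing tokens
    of that chunk are newly scored (the rest serve as context only).
    """
    chunks, start = [], 0
    while start < len(tokens):
        end = min(start + max_len, len(tokens))
        chunk_start = max(0, end - max_len)  # pull the last chunk back to full size
        chunks.append((tokens[chunk_start:end], end - start))
        start = end
    return chunks


print(rolling_chunks(list(range(10)), 4))
# [([0, 1, 2, 3], 4), ([4, 5, 6, 7], 4), ([6, 7, 8, 9], 2)]
```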
@@ -234,9 +234,9 @@ class BaseLM(LM):
             return -len(toks), tuple(toks)
 
         # TODO: automatic (variable) batch size detection for vectorization
-        reord = utils.Reorderer(requests, _collate)
+        re_ord = utils.Reorderer(requests, _collate)
         for chunk in utils.chunks(
-            tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size
+            tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
         ):
             inps = []
             cont_toks_list = []
@@ -327,10 +327,10 @@ class BaseLM(LM):
             res.append(answer)
 
-        return reord.get_original(res)
+        return re_ord.get_original(res)
 
     def greedy_until(self, requests):
-        # TODO: implement fully general `until` that handles untils that are
+        # TODO: implement fully general `until` that handles until that are
         # multiple tokens or that span multiple tokens correctly
         # TODO: extract to TokenizedLM?
@@ -340,9 +340,9 @@ class BaseLM(LM):
             toks = self.tok_encode(x[0])
             return len(toks), x[0]
 
-        reord = utils.Reorderer(requests, _collate)
+        re_ord = utils.Reorderer(requests, _collate)
 
-        for context, until in tqdm(reord.get_reordered()):
+        for context, until in tqdm(re_ord.get_reordered()):
             if isinstance(until, str):
                 until = [until]
@@ -366,7 +366,7 @@ class BaseLM(LM):
             res.append(s)
 
-        return reord.get_original(res)
+        return re_ord.get_original(res)
 
 
 class Task(abc.ABC):
...
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Custom DROP dataet that, unlike HF, keeps all question-answer pairs
+# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
 # even if there are multiple types of answers for the same question.
 """DROP dataset."""
...
@@ -61,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
         return (
             "To use SAT Analogy Questions you have to download it manually. Please "
             "email Peter Turney to request the data (https://www.apperceptual.com). "
-            "Once you recieve a download link for the dataset, supply the local path "
+            "Once you receive a download link for the dataset, supply the local path "
             "as the `data_dir` arg: "
             "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
         )
...
@@ -158,7 +158,7 @@ class Janitor:
     def clean(self, dirty_string):
         """Clean a string (e.g. a training set) by removing all ngrams previously
-        reigstered as contaminants. Returns a list of clean chunks, or empty if
+        registered as contaminants. Returns a list of clean chunks, or empty if
         the string was too dirty"""
         if JANITOR_CPP:
             return self.clean_cpp(dirty_string)
@@ -275,7 +275,7 @@ class Janitor:
 # ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
 # oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
 # paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
-# would last, took a cautious approach, prefering to save the revenue rather than investing it in
+# would last, took a cautious approach, preferring to save the revenue rather than investing it in
 # development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
 # to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
 # brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
...
@@ -25,7 +25,7 @@ class HFLM(BaseLM):
             self._device = torch.device(device)
             print(f"Using device '{device}'")
         else:
-            print("Device not specificed")
+            print("Device not specified")
             print(f"Cuda Available? {torch.cuda.is_available()}")
             self._device = (
                 torch.device("cuda")
...
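The hunk above is cut off in the middle of the fallback branch. A self-contained sketch of that kind of CUDA-or-CPU fallback; only the printed messages and the `self._device` attribute name come from the hunk, everything else here is illustrative:

```python
import torch

device = None  # e.g. the value passed in by the caller; None means "not specified"
if device:
    selected_device = torch.device(device)
    print(f"Using device '{device}'")
else:
    print("Device not specified")
    print(f"Cuda Available? {torch.cuda.is_available()}")
    selected_device = (
        torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    )
print(selected_device)
```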
@@ -124,10 +124,10 @@ class GPT3LM(BaseLM):
             toks = x[1] + x[2]
             return -len(toks), tuple(toks)
 
-        reord = utils.Reorderer(requests, _collate)
+        re_ord = utils.Reorderer(requests, _collate)
 
         for chunk in tqdm(
-            list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)),
+            list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
             disable=disable_tqdm,
         ):
             inps = []
@@ -163,7 +163,7 @@ class GPT3LM(BaseLM):
                 if cache_key is not None:
                     self.cache_hook.add_partial("loglikelihood", cache_key, answer)
 
-        return reord.get_original(res)
+        return re_ord.get_original(res)
 
     def greedy_until(self, requests):
         if not requests:
@@ -174,7 +174,7 @@ class GPT3LM(BaseLM):
             toks = self.tok_encode(x[0])
             return len(toks), x[0]
 
-        reord = utils.Reorderer(requests, _collate)
+        re_ord = utils.Reorderer(requests, _collate)
 
         def sameuntil_chunks(xs, size):
             ret = []
@@ -191,7 +191,7 @@ class GPT3LM(BaseLM):
         # todo: more intelligent batching for heterogeneous `until`
         for chunk, until in tqdm(
-            list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))
+            list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
         ):
             inps = []
             for context, _ in chunk:
@@ -219,7 +219,7 @@ class GPT3LM(BaseLM):
             res.append(s)
 
-        return reord.get_original(res)
+        return re_ord.get_original(res)
 
     def _model_call(self, inps):
         # Isn't used because we override _loglikelihood_tokens
...
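The body of `sameuntil_chunks` is cut off in the hunks above. As a reading aid, here is a plausible standalone rendering of what such a helper does, grouping consecutive requests that share the same `until` value and capping each group at `size` items (an assumption for illustration, not code from this commit):

```python
def sameuntil_chunks(xs, size):
    """Yield (group, until) pairs of consecutive requests that share the same `until`."""
    ret = []
    lastuntil = xs[0][1] if xs else None
    for x in xs:
        if len(ret) >= size or x[1] != lastuntil:
            yield ret, lastuntil
            ret = []
            lastuntil = x[1]
        ret.append(x)
    if ret:
        yield ret, lastuntil


# list(sameuntil_chunks([("a", "\n"), ("b", "\n"), ("c", ".")], size=20))
# -> [([('a', '\n'), ('b', '\n')], '\n'), ([('c', '.')], '.')]
```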
@@ -74,16 +74,16 @@ class DROP(Task):
                 {"number": ['1', '8'], ...}
             -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
             """
-            vas = []
+            valid_answers = []
             for i in range(len(validated_answers["number"])):
-                vas.append(
+                valid_answers.append(
                     {
                         "number": validated_answers["number"][i],
                         "date": validated_answers["date"][i],
                         "spans": validated_answers["spans"][i],
                     }
                 )
-            return vas
+            return valid_answers
 
         answers = []
         answers_set = set()
...
@@ -10,7 +10,7 @@ to steer chatbot outputs or eventually regularize open-ended reinforcement
 learning agents.
 
 NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
-tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
+tasks are referred to in this work as the `em` sub-metric. See Section 3. Metrics.
 of the paper.
 
 Homepage: https://github.com/hendrycks/ethics
@@ -323,7 +323,7 @@ class EthicsUtilitarianism(Ethics):
         }
 
     def doc_to_text(self, doc):
-        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferrable?\nAnswer:".format(
+        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferable?\nAnswer:".format(
             doc["scenarios"][0], doc["scenarios"][1]
         )
...
@@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
 1) Collects all contamination text files that are to be removed from training data
 2) Filters training data by finding `N`gram matches between the training data
    and any contamination
-   1) `N`grams ignore case and punctation and are split on whitespace.
+   1) `N`grams ignore case and punctuation and are split on whitespace.
    2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
       the match, splitting the training data into chunks
    3) Any chunks less than `minimum_slice_length` are removed
@@ -20,7 +20,7 @@ minimum_slice_length = 200
 too_dirty_cutoff = 10
 ```
 
-## Compling
+## Compiling
 
 Janitor can be used as a pure python program, but it is much faster if the ngram
 code is run in C++. To compile the C++ code, run
...
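To make the numbered filtering steps in the README hunk above concrete, here is a self-contained sketch that applies them to a normalized copy of a document. The real janitor maps matches back to the raw text and is far more efficient, so treat the helper below as illustrative only, with the parameter names borrowed from the config values shown above:

```python
import re


def decontaminate(document, contaminant_ngrams, n=13, window_to_remove=200,
                  minimum_slice_length=200, too_dirty_cutoff=10):
    """Illustrative version of the README's filter, run on a normalized copy."""
    # 1) ngrams ignore case and punctuation and are split on whitespace
    words = re.sub(r"[^\w\s]", "", document.lower()).split()
    text = " ".join(words)

    # character offset of each word within the normalized text
    offsets, pos = [], 0
    for w in words:
        offsets.append(pos)
        pos += len(w) + 1

    # locate every contaminated ngram as a character span
    spans = []
    for i in range(len(words) - n + 1):
        ngram = " ".join(words[i:i + n])
        if ngram in contaminant_ngrams:
            spans.append((offsets[i], offsets[i] + len(ngram)))
    if len(spans) > too_dirty_cutoff:
        return []  # the document is too dirty: drop it entirely

    # 2) remove each match plus a `window_to_remove` character window around it
    chunks, cursor = [], 0
    for start, end in sorted(spans):
        chunks.append(text[cursor:max(cursor, start - window_to_remove)])
        cursor = max(cursor, end + window_to_remove)
    chunks.append(text[cursor:])

    # 3) drop any remaining slice shorter than `minimum_slice_length`
    return [c for c in chunks if len(c) >= minimum_slice_length]
```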
@@ -63,7 +63,7 @@ def process_bucket(
     for line in bucket.read():
         [ngram, document_id] = line.rsplit(" ", 1)
 
-        # Write ngram if more then 10 unique document occurences
+        # Write ngram if more then 10 unique document occurrences
         if ngram != current_ngram:
             if len(current_ngram_document_ids) > 10:
                 output_archive.add_data(
...
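The hunk above only shows the top of the grouping loop. As a reading aid, a standalone sketch of the thresholding it performs: the input lines are sorted by ngram, so consecutive runs can be grouped and an ngram kept once it appears in more than 10 distinct documents. The function name and the yield-based shape are illustrative, not the script's actual code:

```python
def frequent_ngrams(sorted_lines, min_docs=10):
    """Yield each ngram that occurs in more than `min_docs` distinct documents.

    `sorted_lines` must be sorted by ngram, with each line of the form
    "<ngram> <document_id>", so a change of ngram closes the previous group.
    """
    current_ngram, doc_ids = None, set()
    for line in sorted_lines:
        ngram, document_id = line.rsplit(" ", 1)
        if ngram != current_ngram:
            if current_ngram is not None and len(doc_ids) > min_docs:
                yield current_ngram
            current_ngram, doc_ids = ngram, set()
        doc_ids.add(document_id)
    if current_ngram is not None and len(doc_ids) > min_docs:
        yield current_ngram
```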
 # TODO: Remove all TODO comments once the implementation is complete.
 """
 TODO: Add the Paper Title on this line.
-TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
+TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
 TODO: Write a Short Description of the task.
...
 # TODO: Remove all TODO comments once the implementation is complete.
 """
 TODO: Add the Paper Title on this line.
-TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
+TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
 TODO: Write a Short Description of the task.
@@ -45,7 +45,7 @@ class NewTask(Task):
         if self._training_docs is None:
             # TODO: Return the training document generator from `self.dataset`.
             # If you need to process the data, `map` over the documents with
-            # the custom procesing function, `self._process_doc`. E.g.
+            # the custom processing function, `self._process_doc`. E.g.
             # `map(self._process_doc, self.dataset["validation"])`
             # In most case you can leave this as is unless the dataset split is
             # named differently than the default `"train"`.
@@ -56,7 +56,7 @@ class NewTask(Task):
         if self.has_validation_docs():
             # TODO: Return the validation document generator from `self.dataset`.
             # If you need to process the data, `map` over the documents with the
-            # custom procesing function, `self._process_doc`. E.g.
+            # custom processing function, `self._process_doc`. E.g.
             # `map(self._process_doc, self.dataset["validation"])`
             # In most case you can leave this as is unless the dataset split is
             # named differently than the default `"validation"`.
...