Unverified Commit 1a02d9df authored by Hailey Schoelkopf's avatar Hailey Schoelkopf Committed by GitHub
Browse files

Merge pull request #833 from EleutherAI/fix-ppl

[Refactor] Fix wikitext task
parents 8f448eed 9bca36a9
...@@ -296,14 +296,14 @@ class HFLM(LM): ...@@ -296,14 +296,14 @@ class HFLM(LM):
) )
else: else:
assert accelerator.distributed_type in [ assert accelerator.distributed_type in [
DistributedType.FSDP, DistributedType.FSDP,
DistributedType.MULTI_GPU DistributedType.MULTI_GPU,
], "Unsupported distributed type provided. Only DDP and FSDP are supported." ], "Unsupported distributed type provided. Only DDP and FSDP are supported."
if accelerator.distributed_type == DistributedType.FSDP: if accelerator.distributed_type == DistributedType.FSDP:
self._model = accelerator.prepare(self.model) self._model = accelerator.prepare(self.model)
else: else:
self._model = accelerator.prepare_model( self._model = accelerator.prepare_model(
self.model, evaluation_mode=True self.model, evaluation_mode=True
) )
self._device = torch.device(f"cuda:{accelerator.local_process_index}") self._device = torch.device(f"cuda:{accelerator.local_process_index}")
self.accelerator = accelerator self.accelerator = accelerator
......
...@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc): ...@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc):
string = string.replace(" 's", "'s") string = string.replace(" 's", "'s")
return string return string
def process_results(doc, results):
    """Turn a single loglikelihood result into perplexity metric inputs.

    Args:
        doc: task document; ``doc["page"]`` holds the raw (pre-detokenization)
            wikitext page string.
        results: one-element sequence containing the loglikelihood of the page.

    Returns:
        dict mapping each metric name to a ``(loglikelihood, weight)`` pair,
        where the weight is the word count (for word perplexity) or the
        UTF-8 byte count (for byte perplexity / bits-per-byte).
    """
    (loglikelihood,) = results
    page = doc["page"]
    # IMPORTANT: wikitext counts number of words in *original doc before
    # detokenization* — so we split the raw page, not the detokenized target.
    word_count = len(re.split(r"\s+", page))
    byte_count = len(page.encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, word_count),
        "byte_perplexity": (loglikelihood, byte_count),
        "bits_per_byte": (loglikelihood, byte_count),
    }
...@@ -7,6 +7,7 @@ validation_split: validation ...@@ -7,6 +7,7 @@ validation_split: validation
test_split: test test_split: test
doc_to_text: "" doc_to_text: ""
doc_to_target: !function preprocess_wikitext.wikitext_detokenizer doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
process_results: !function preprocess_wikitext.process_results
should_decontaminate: true should_decontaminate: true
doc_to_decontamination_query: "{{page}}" doc_to_decontamination_query: "{{page}}"
metric_list: metric_list:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment