Commit a7286607 authored by haileyschoelkopf
Browse files

change words/bytes calc for wikitext

parent cc7828dd
...@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc): ...@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc):
string = string.replace(" 's", "'s") string = string.replace(" 's", "'s")
return string return string
def process_results(doc, results):
    """Pair the document log-likelihood with its word and byte counts.

    Args:
        doc: Mapping whose ``"page"`` entry holds the document text. The
            counts are taken from this text as-is.
        results: Sequence holding exactly one item, the log-likelihood.

    Returns:
        A dict with the keys ``"word_perplexity"``, ``"byte_perplexity"`` and
        ``"bits_per_byte"``. Each value is a ``(loglikelihood, count)`` tuple:
        the word count for the first key, the UTF-8 byte count for the other
        two.

    Raises:
        ValueError: If ``results`` does not contain exactly one item.
        KeyError: If ``doc`` has no ``"page"`` entry.
    """
    (loglikelihood,) = results
    page = doc["page"]
    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*
    # Splitting on whitespace runs means leading/trailing whitespace adds
    # empty strings, and those are included in the count.
    word_count = len(re.split(r"\s+", page))
    byte_count = len(page.encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, word_count),
        "byte_perplexity": (loglikelihood, byte_count),
        "bits_per_byte": (loglikelihood, byte_count),
    }
...@@ -7,6 +7,7 @@ validation_split: validation ...@@ -7,6 +7,7 @@ validation_split: validation
test_split: test test_split: test
doc_to_text: "" doc_to_text: ""
doc_to_target: !function preprocess_wikitext.wikitext_detokenizer doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
process_results: !function preprocess_wikitext.process_results
should_decontaminate: true should_decontaminate: true
doc_to_decontamination_query: "{{page}}" doc_to_decontamination_query: "{{page}}"
metric_list: metric_list:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment