Unverified Commit 94785906 authored by Denny's avatar Denny Committed by GitHub
Browse files

Update run_lm_finetuning.py

The previous method, just as phrased, did not exist in the class.
parent ca559826
...@@ -75,7 +75,7 @@ class TextDataset(Dataset): ...@@ -75,7 +75,7 @@ class TextDataset(Dataset):
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[i:i+block_size])) self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
# Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
# If your dataset is small, first you should loook for a bigger one :-) and second you # If your dataset is small, first you should loook for a bigger one :-) and second you
# can change this behavior by adding (model specific) padding. # can change this behavior by adding (model specific) padding.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment