Commit 89b5c982 authored by chenzk
Browse files

v1.0.55

parent f0ee8b73
......@@ -64,7 +64,7 @@ class AllamoDataset:
self.processed_files.append(load_dataset_file)
new_data = None
if load_dataset_file.endswith('.bin'):
assert self.training_type == 'pre', 'NumPy format is supported only for pre-training'
# assert self.training_type == 'pre', 'NumPy format is supported only for pre-training'
step_size = self.world_size * self.sample_size
new_data = torch.from_numpy(np.fromfile(load_dataset_file, dtype=np.uint16).astype(np.int16))
if step_size > len(new_data):
......
......@@ -68,7 +68,7 @@ class AllamoSampler:
else:
raise Exception('Tokenizer is not provided. Please specify either a Tiktoken tokenizer or a HuggingFace tokenizer')
# ensure that the tokenizer and model vocabulary sizes are equal
assert len(tokenizer) == self.model.config.vocab_size
# assert len(tokenizer) == self.model.config.vocab_size
self.tokenizer = tokenizer
def tokenize_prompt(self, text: str):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment