{ "training_type": "pre", "init_from": "scratch", "data_dir": "../data/", "out_dir": "../data/out-allamo-1B/", "checkpoint_interval": 1000, "save_best_checkpoint": false, "eval_interval": 1000, "eval_iters": 200, "log_interval": 1, "vocab_size": 50307, "custom_tokenizer_path": "../data/allamo_1B_dataset/tokenizer.json", "wandb_log": false, "wandb_project": "allamo", "wandb_run_name": "allamo-1B", "dataset": "allamo_1B_dataset", "batch_size": 1, "block_size": 2048, "gradient_accumulation_steps": 264, "dataset_seq_train": true, "grad_accum_schedule": false, "n_layer": 20, "n_head": 16, "head_size": 128, "n_embd": 2048, "dropout": 0, "weight_decay": 0.1, "multiple_of": 256, "norm_eps": 0.000001, "learning_rate": 0.0003, "max_iters": 38000, "decay_lr": true, "lr_decay_iters": 38000, "lr_decay_reset_iters": 3800, "min_lr": 0.0002, "warmup_iters": 3800, "device": "cuda:0", "dtype": "float16", "compile": true }