#!/bin/bash

# Dataset presets for the retrieval pipeline. Each getter assigns its
# settings as shell variables in the caller's scope; callers invoke exactly
# one of these before launching indexing/training.

# Small English Wikipedia dataset (~2M chunks).
get_wiki_tiny_config() {
  # Faiss index spec and search parameters.
  RETRO_INDEX_STR="IVF4096_HNSW4,Flat"
  RETRO_EF_SEARCH=4
  RETRO_NPROBE=64
  # GPT training/eval schedule.
  RETRO_GPT_TRAIN_SAMPLES=31250
  LR_DECAY_SAMPLES=2
  LR_WARMUP_SAMPLES=1
  RETRO_GPT_EVAL_INTERVAL=2000
  RETRO_GPT_EVAL_ITERS=100
  DATALOADER_TYPE=cyclic
}

# English Wikipedia dataset (~67M chunks).
get_wiki_config() {
  # Faiss index spec and search parameters.
  RETRO_INDEX_STR="IVF262144_HNSW32,Flat"
  RETRO_EF_SEARCH=16
  RETRO_NPROBE=4096
  # GPT training/eval schedule.
  RETRO_GPT_TRAIN_SAMPLES=2037248
  LR_DECAY_SAMPLES=2
  LR_WARMUP_SAMPLES=1
  RETRO_GPT_EVAL_INTERVAL=2000
  RETRO_GPT_EVAL_ITERS=100
  DATALOADER_TYPE=cyclic
}

# Full corpus (~5B chunks).
get_corpus_config() {
  # Faiss index spec and search parameters (product-quantized at this scale).
  RETRO_INDEX_STR="OPQ32_256,IVF4194304_HNSW32,PQ32"
  RETRO_EF_SEARCH=32
  RETRO_NPROBE=4096
  # GPT training/eval schedule.
  RETRO_GPT_TRAIN_SAMPLES=192000000
  LR_DECAY_SAMPLES=166400000
  LR_WARMUP_SAMPLES=162761
  RETRO_GPT_EVAL_INTERVAL=2000
  RETRO_GPT_EVAL_ITERS=50
  DATALOADER_TYPE=single
}