#!/bin/bash

# Launcher for Megatron-LM Retro preprocessing (tools/retro/preprocess_data.py).
#
# Usage:
#   ./preprocess_data.sh <task>
#
# Required environment:
#   NODE_RANK    - rank of this node (0 for single-node runs)
#   MASTER_ADDR  - address of the rank-0 node
#
# Before running, fill in REPO_DIR, RETRO_PROJECT_DIR, and DATA_BLEND below.

set -u

# NCCL_DEBUG output is noisy during preprocessing; make sure it is off.
unset NCCL_DEBUG

######## Megatron, Retro dirs. ########

REPO_DIR="<path/to/megatron/repo>"
RETRO_PROJECT_DIR="<path/to/retro/project/directory>"

######## Task (e.g., db, index, query). ########

# This script takes a single argument, which specifies the retro task to be
# performed. The available tasks are: db-build, index-train, index-add, and
# query-neighbors.

# ~~ Examples ~~
# RETRO_TASKS="db-build"          # Build the retrieval database
# RETRO_TASKS="index-train"       # Train the index
# RETRO_TASKS="index-add"         # Add data to the index
# RETRO_TASKS="query-neighbors"   # Perform query pretraining for neighbors

# You can also provide the task as a command-line argument when executing the
# script. Example: ./preprocess_data.sh index-add
#
# ${1:?...} aborts with a usage message instead of set -u's opaque
# "unbound variable" error when no task is supplied.
RETRO_TASKS=${1:?"usage: $0 <db-build|index-train|index-add|query-neighbors>"}

######## Data. ########

DATA_BLEND="<see --data-path in arguments.py>"

######## Index. ########

# Faiss index factory string (OPQ pretransform, IVF+HNSW coarse quantizer, PQ codes).
RETRO_INDEX_STR="OPQ32_64,IVF65536_HNSW8,PQ32"
RETRO_INDEX_NTRAIN=66625331
RETRO_INDEX_TRAIN_LOAD_FRACTION=0.97
RETRO_INDEX_ADD_LOAD_FRACTION=0.95

######## GPT. ########

RETRO_GPT_SEED=1234
RETRO_GPT_SPLIT="98,2,0"
RETRO_GPT_DATA_PATH=${DATA_BLEND}
RETRO_GPT_TRAIN_SAMPLES=200000
RETRO_GPT_EVAL_INTERVAL=2000
RETRO_GPT_EVAL_ITERS=50
RETRO_GPT_LR_DECAY_SAMPLES=175000
RETRO_GPT_LR_WARMUP_SAMPLES=10000
RETRO_GPT_SEQ_LENGTH=2048
RETRO_GPT_GLOBAL_BATCH_SIZE=256
RETRO_GPT_CHUNK_LENGTH=64

######## Query. ########

RETRO_QUERY_NUM_NEIGHBORS_QUERY=200
RETRO_QUERY_NUM_NEIGHBORS_SAVE=20
RETRO_QUERY_EF_SEARCH=32
RETRO_QUERY_NPROBE=4096

######## Args. ########

# Model/optimizer flags describe the Bert embedder used during preprocessing;
# --retro-* flags configure the retrieval database, index, and query stages.
ARGS=" \
    --distributed-timeout-minutes 600 \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --micro-batch-size 1 \
    --global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
    --seq-length 512 \
    --max-position-embeddings 512 \
    --load ${RETRO_PROJECT_DIR}/checkpoints/bert \
    --exit-on-missing-checkpoint \
    --no-load-optim \
    --data-path [null] \
    --tokenizer-type BertWordPieceLowerCase \
    --vocab-file ${RETRO_PROJECT_DIR}/tokenizer/bert-large-uncased-vocab.txt \
    --split ${RETRO_GPT_SPLIT} \
    --distributed-backend nccl \
    --lr 0.0001 \
    --lr-decay-style linear \
    --min-lr 1.0e-5 \
    --train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
    --lr-decay-samples ${RETRO_GPT_LR_DECAY_SAMPLES} \
    --lr-warmup-samples ${RETRO_GPT_LR_WARMUP_SAMPLES} \
    --weight-decay 1e-2 \
    --clip-grad 1.0 \
    --eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
    --eval-iters ${RETRO_GPT_EVAL_ITERS} \
    --bf16 \
    --no-data-sharding \
    --no-gradient-accumulation-fusion \
    --no-async-tensor-model-parallel-allreduce \
    --bert-embedder-type megatron \
    --output-bert-embeddings \
    \
    --retro-project-dir ${RETRO_PROJECT_DIR} \
    --retro-tasks ${RETRO_TASKS} \
    --retro-bert-vocab-file tokenizer/bert-large-uncased-vocab.txt \
    --retro-bert-tokenizer-type BertWordPieceLowerCase \
    \
    --retro-gpt-seed ${RETRO_GPT_SEED} \
    --retro-gpt-tokenizer-type GPTSentencePieceTokenizer \
    --retro-gpt-tokenizer-model /path/to/tokenizer/model \
    --retro-gpt-seq-length ${RETRO_GPT_SEQ_LENGTH} \
    --retro-gpt-chunk-length ${RETRO_GPT_CHUNK_LENGTH} \
    --retro-gpt-global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
    --retro-gpt-eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
    --retro-gpt-eval-iters ${RETRO_GPT_EVAL_ITERS} \
    --retro-gpt-split ${RETRO_GPT_SPLIT} \
    --retro-gpt-data-path ${RETRO_GPT_DATA_PATH} \
    --retro-gpt-train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
    \
    --retro-index-str ${RETRO_INDEX_STR} \
    --retro-index-ntrain ${RETRO_INDEX_NTRAIN} \
    --retro-index-train-load-fraction ${RETRO_INDEX_TRAIN_LOAD_FRACTION} \
    --retro-index-add-load-fraction ${RETRO_INDEX_ADD_LOAD_FRACTION} \
    --no-retro-index-delete-training-embeddings \
    --no-retro-index-delete-added-codes \
    \
    --retro-query-num-neighbors-query ${RETRO_QUERY_NUM_NEIGHBORS_QUERY} \
    --retro-query-num-neighbors-save ${RETRO_QUERY_NUM_NEIGHBORS_SAVE} \
    --retro-query-ef-search ${RETRO_QUERY_EF_SEARCH} \
    --retro-query-nprobe ${RETRO_QUERY_NPROBE} \
"

######## Command. ########

NPROCS=8 # Number of GPUs.

# Fail early with a clear message (rather than a bare set -u error) when the
# multi-node rendezvous variables are missing.
: "${NODE_RANK:?NODE_RANK must be set (0 for single-node runs)}"
: "${MASTER_ADDR:?MASTER_ADDR must be set (address of the rank-0 node)}"

# ${PYTHONPATH:-} keeps the expansion safe under set -u when PYTHONPATH is
# unset in the calling environment.
CMD="\
    cd ${REPO_DIR} && pwd && \
    export PYTHONPATH=${PYTHONPATH:-}:${REPO_DIR} && \
    python -m torch.distributed.run \
    --nproc_per_node ${NPROCS} \
    --nnodes 1 \
    --node_rank ${NODE_RANK} \
    --master_addr ${MASTER_ADDR} \
    --master_port 6000 \
    tools/retro/preprocess_data.py ${ARGS} \
"
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
echo "CMD = '$CMD'."
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
eval "$CMD"