#!/bin/bash
#
# Downloads the MovieLens dataset and then runs ncf_main.py five times with
# the --ml_perf flag, logging each run.
#
# Optional environment variables:
#   BUCKET  Root location for data and model directories. When unset or
#           empty, everything lives under /tmp/MLPerf_NCF. When set, logs are
#           still written locally under /tmp/MLPerf_NCF.
#   TPU     When set, passed to ncf_main.py as `--tpu <value> --num_gpus 0`;
#           otherwise `--num_gpus -1` is used.
#
# Abort on the first command that fails.
set -e

# Name of the dataset passed to both the download script and ncf_main.py.
DATASET="ml-20m"

# Remote root (optional). An empty BUCKET means "keep everything in /tmp".
BUCKET="${BUCKET:-}"
ROOT_DIR="${BUCKET:-/tmp}/MLPerf_NCF"
printf 'Root directory: %s\n' "${ROOT_DIR}"

# LOCAL_ROOT is where log files go. It coincides with ROOT_DIR unless a
# bucket was supplied, in which case a fixed directory under /tmp is used.
LOCAL_ROOT="${ROOT_DIR}"
if [[ -n "${BUCKET}" ]]; then
  LOCAL_ROOT="/tmp/MLPerf_NCF"
  mkdir -p "${LOCAL_ROOT}"
  printf 'Local root (for files which cannot use GCS): %s\n' "${LOCAL_ROOT}"
fi

# Each invocation gets its own timestamped directory, both under the
# (possibly remote) root and under the local root. Only the local one is
# created here.
DATE="$(date +'%Y-%m-%d_%H:%M:%S')"
TEST_DIR="${ROOT_DIR}/${DATE}"
LOCAL_TEST_DIR="${LOCAL_ROOT}/${DATE}"
mkdir -p "${LOCAL_TEST_DIR}"

# Choose the device arguments for ncf_main.py. DEVICE_FLAG is kept as a single
# space-separated string; it is expanded unquoted at the call site so that it
# splits into separate arguments.
TPU="${TPU:-}"
case "${TPU}" in
  "") DEVICE_FLAG="--num_gpus -1" ;;
  *)  DEVICE_FLAG="--tpu ${TPU} --num_gpus 0" ;;
esac

# Fetch the dataset into DATA_DIR. The script path is relative, so this must
# be run from the directory that contains ncf_main.py.
# Expansions are quoted because ROOT_DIR derives from the user-supplied BUCKET.
DATA_DIR="${ROOT_DIR}/movielens_data"
python ../datasets/movielens.py --data_dir "${DATA_DIR}" --dataset "${DATASET}"

# Run the benchmark five times (runs 0 through 4). Everything printed inside
# the group (stdout and stderr) is shown on the terminal and also saved to
# summary.log in the local test directory.
{

for i in {0..4}; do
  START_TIME=$(date +%s)
  MODEL_DIR="${TEST_DIR}/model_dir_${i}"

  RUN_LOG="${LOCAL_TEST_DIR}/run_${i}.log"
  echo ""
  echo "Beginning run ${i}"
  echo "  Complete logs are in ${RUN_LOG}"

  # To reduce variation set the seed flag:
  #   --seed ${i}
  #
  # And to confirm that the pipeline is deterministic pass the flag:
  #   --hash_pipeline
  #
  # (`--hash_pipeline` will slow down training, though not as much as one might imagine.)
  #
  # The full output (stdout + stderr) of the run goes to RUN_LOG; only the
  # per-iteration HR/NDCG lines and pipeline_hash lines pass through grep.
  #
  # NOTE(review): without `pipefail` the pipeline's exit status is grep's, so
  # a failing python run is not detected here, and under `set -e` a run that
  # prints no matching line (grep exits 1) stops the remaining runs.
  #
  # ${DEVICE_FLAG} is intentionally unquoted so it splits into several
  # arguments.
  # shellcheck disable=SC2086
  python ncf_main.py --model_dir "${MODEL_DIR}" \
                     --data_dir "${DATA_DIR}" \
                     --dataset "${DATASET}" --hooks "" \
                     ${DEVICE_FLAG} \
                     --clean \
                     --train_epochs 20 \
                     --batch_size 2048 \
                     --eval_batch_size 100000 \
                     --learning_rate 0.0005 \
                     --layers 256,256,128,64 --num_factors 64 \
                     --hr_threshold 0.635 \
                     --ml_perf \
    |& tee "${RUN_LOG}" \
    | grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+)|(pipeline_hash)"

  END_TIME=$(date +%s)
  echo "Run ${i} complete: $(( END_TIME - START_TIME )) seconds."

  # Don't fill up the local hard drive. With no BUCKET the model directory
  # lives under /tmp, so it is deleted once the run has finished.
  if [[ -z "${BUCKET}" ]]; then
    echo "Removing model directory to save space."
    rm -r -- "${MODEL_DIR}"
  fi

done

} |& tee "${LOCAL_TEST_DIR}/summary.log"