run.sh 3.15 KB
Newer Older
Hongkun Yu's avatar
Hongkun Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#!/bin/bash
set -e

# Clearing filesystem caches later in the run requires root. Prompt for
# sudo credentials up front so subsequent sudo calls do not block mid-run.
if [[ "$(id -u)" != 0 ]]; then
  echo "Calling sudo to gain root for this shell. (Needed to clear caches.)"
  sudo echo "Success"
fi

# Resolve the directory containing this script and put the repository root
# on PYTHONPATH so the NCF python modules can be imported.
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
export PYTHONPATH="${SCRIPT_DIR}/../../"

# Default implementation; may be switched to the Keras variant below.
MAIN_SCRIPT="ncf_estimator_main.py"

DATASET="ml-20m"

# Output roots. If BUCKET (e.g. a GCS bucket path) is set, results go there
# and a local scratch directory holds files that cannot live on GCS;
# otherwise everything stays under /tmp.
BUCKET=${BUCKET:-""}
ROOT_DIR="${BUCKET:-/tmp}/MLPerf_NCF"
echo "Root directory: ${ROOT_DIR}"

if [[ -z ${BUCKET} ]]; then
  LOCAL_ROOT=${ROOT_DIR}
else
  LOCAL_ROOT="/tmp/MLPerf_NCF"
  mkdir -p "${LOCAL_ROOT}"
  echo "Local root (for files which cannot use GCS): ${LOCAL_ROOT}"
fi

# Each invocation gets its own timestamped subdirectory.
DATE=$(date '+%Y-%m-%d_%H:%M:%S')
TEST_DIR="${ROOT_DIR}/${DATE}"
LOCAL_TEST_DIR="${LOCAL_ROOT}/${DATE}"
mkdir -p "${LOCAL_TEST_DIR}"
# Device selection: target the TPU named in $TPU when it is set; otherwise
# fall back to GPU mode (the -1 value is passed through to the main script
# unchanged — presumably "use all available GPUs"; confirm there).
TPU=${TPU:-""}
if [[ -n ${TPU} ]]; then
  DEVICE_FLAG="--tpu ${TPU} --num_gpus 0"
else
  DEVICE_FLAG="--num_gpus -1" # --use_xla_for_gpu"
fi

# Download/prepare the MovieLens dataset before any training run starts.
DATA_DIR="${ROOT_DIR}/movielens_data"
python "${SCRIPT_DIR}/movielens.py" --data_dir "${DATA_DIR}" --dataset "${DATASET}"

# Passing "keras" as the first argument selects the Keras implementation,
# which here runs on a single GPU and uses a slightly larger batch size.
if [[ "$1" == "keras" ]]; then
  MAIN_SCRIPT="ncf_keras_main.py"
  BATCH_SIZE=99000
  DEVICE_FLAG="--num_gpus 1"
else
  BATCH_SIZE=98340
fi


# Run the benchmark five times; each run gets its own model directory and
# log files, and the whole loop's output is mirrored into summary.log.
{

for i in {0..4}; do
  START_TIME=$(date +%s)
  MODEL_DIR="${TEST_DIR}/model_dir_${i}"

  RUN_LOG="${LOCAL_TEST_DIR}/run_${i}.log"

  # The MLPerf compliance logger reads these environment variables.
  export COMPLIANCE_FILE="${LOCAL_TEST_DIR}/run_${i}_compliance_raw.log"
  export STITCHED_COMPLIANCE_FILE="${LOCAL_TEST_DIR}/run_${i}_compliance_submission.log"

  echo ""
  echo "Beginning run ${i}"
  echo "  Complete output logs are in ${RUN_LOG}"
  echo "  Compliance logs: (submission log is created after run.)"
  echo "    ${COMPLIANCE_FILE}"
  echo "    ${STITCHED_COMPLIANCE_FILE}"

  # To reduce variation set the seed flag:
  #   --seed ${i}

  # NOTE: ${DEVICE_FLAG} is intentionally unquoted — it holds multiple
  # words (e.g. "--num_gpus -1") that must split into separate arguments.
  python -u "${SCRIPT_DIR}/${MAIN_SCRIPT}" \
      --model_dir "${MODEL_DIR}" \
      --data_dir "${DATA_DIR}" \
      --dataset "${DATASET}" --hooks "" \
      ${DEVICE_FLAG} \
      --clean \
      --train_epochs 14 \
      --batch_size "${BATCH_SIZE}" \
      --eval_batch_size 160000 \
      --learning_rate 0.00382059 \
      --beta1 0.783529 \
      --beta2 0.909003 \
      --epsilon 1.45439e-07 \
      --layers 256,256,128,64 --num_factors 64 \
      --hr_threshold 0.635 \
      --ml_perf \
 |& tee "${RUN_LOG}" \
 | grep --line-buffered  -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+, Loss = [0-9\.]+)|(pipeline_hash)|(MLPerf time:)"

  END_TIME=$(date +%s)
  echo "Run ${i} complete: $(( END_TIME - START_TIME )) seconds."

  # Don't fill up the local hard drive.
  if [[ -z ${BUCKET} ]]; then
    echo "Removing model directory to save space."
    # ':?' aborts the script if MODEL_DIR is somehow empty, which would
    # otherwise make this an 'rm -r /'-style disaster.
    rm -r -- "${MODEL_DIR:?}"
  fi

done

} |& tee "${LOCAL_TEST_DIR}/summary.log"