# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Train MiniGo with several iterations of RL learning.

One iteration of RL learning consists of bootstrap, selfplay, gather and train:
  bootstrap: Initialize a random model
  selfplay: Play games with the latest model to produce data used for training
  gather: Group games played with the same model into larger files of
    TF examples.
  train: Train a new model with the selfplay results from the most recent
    N generations.
After training, validation can be performed on the holdout data.
Given two models, evaluation can be applied to choose a stronger model.
The training pipeline consists of multiple RL learning iterations to achieve
better models.
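
Example usage (all flags are defined at the bottom of this file):
  python minigo.py --base_dir=/tmp/minigo/ --board_size=9 --test
Pass --validation to validate the latest model on the holdout data,
--evaluation to compare the latest model against the best model so far, and
--selfplay to run self-play only.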
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import random
import socket
import sys
import time

import tensorflow as tf  # pylint: disable=g-bad-import-order

import dualnet
import evaluation
import go
import model_params
import preprocessing
import selfplay_mcts
import utils

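# Suffix of the TFRecord files written by selfplay; the '.zz' extension
# indicates zlib-compressed records.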
_TF_RECORD_SUFFIX = '.tfrecord.zz'


def _ensure_dir_exists(directory):
  """Check if directory exists. If not, create it.

  Args:
    directory: A given directory
  """
  if not os.path.isdir(directory):
    tf.gfile.MakeDirs(directory)


def bootstrap(estimator_model_dir, trained_models_dir, params):
  """Initialize the model with random weights.

  Args:
    estimator_model_dir: tf.estimator model directory.
    trained_models_dir: Directory to save the trained models; the first
      bootstrapped generation is exported here.
    params: A MiniGoParams instance of hyperparameters for the model.
  """
  bootstrap_name = utils.generate_model_name(0)
  _ensure_dir_exists(trained_models_dir)
  bootstrap_model_path = os.path.join(trained_models_dir, bootstrap_name)
  _ensure_dir_exists(estimator_model_dir)

  print('Bootstrapping with working dir {}\n Model 0 exported to {}'.format(
      estimator_model_dir, bootstrap_model_path))
  dualnet.bootstrap(estimator_model_dir, params)
  dualnet.export_model(estimator_model_dir, bootstrap_model_path)


def selfplay(selfplay_dirs, selfplay_model, params):
  """Perform selfplay with a specific model.

  Args:
    selfplay_dirs: A dict to specify the directories used in selfplay.
      selfplay_dirs = {
          'output_dir': output_dir,
          'holdout_dir': holdout_dir,
          'clean_sgf': clean_sgf,
          'full_sgf': full_sgf
      }
    selfplay_model: The actual Dualnet runner for selfplay.
    params: A MiniGoParams instance of hyperparameters for the model.
  """
  with utils.logged_timer('Playing game'):
    player = selfplay_mcts.play(
        params.board_size, selfplay_model, params.selfplay_readouts,
        params.selfplay_resign_threshold, params.simultaneous_leaves,
        params.selfplay_verbose)

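  # Name output files by timestamp and hostname so that games generated by
  # concurrent selfplay workers do not collide.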
  output_name = '{}-{}'.format(int(time.time()), socket.gethostname())

  def _write_sgf_data(dir_sgf, use_comments):
    with tf.gfile.GFile(
        os.path.join(dir_sgf, '{}.sgf'.format(output_name)), 'w') as f:
      f.write(player.to_sgf(use_comments=use_comments))

  _write_sgf_data(selfplay_dirs['clean_sgf'], use_comments=False)
  _write_sgf_data(selfplay_dirs['full_sgf'], use_comments=True)

  game_data = player.extract_data()
  tf_examples = preprocessing.make_dataset_from_selfplay(game_data, params)

  # Hold out a fraction (params.holdout_pct) of games for validation.
  if random.random() < params.holdout_pct:
    fname = os.path.join(
        selfplay_dirs['holdout_dir'], output_name + _TF_RECORD_SUFFIX)
  else:
    fname = os.path.join(
        selfplay_dirs['output_dir'], output_name + _TF_RECORD_SUFFIX)

  preprocessing.write_tf_examples(fname, tf_examples)


def gather(selfplay_dir, training_chunk_dir, params):
  """Gather selfplay data into large training chunk.

  Args:
    selfplay_dir: Where to look for games. Set as 'base_dir/data/selfplay/'.
    training_chunk_dir: where to put collected games. Set as
      'base_dir/data/training_chunks/'.
    params: A MiniGoParams instance of hyperparameters for the model.
  """
  # Check the selfplay data from the most recent params.gather_generation
  # models.
  _ensure_dir_exists(training_chunk_dir)
  sorted_model_dirs = sorted(tf.gfile.ListDirectory(selfplay_dir))
  models = [model_dir.strip('/')
            for model_dir in sorted_model_dirs[-params.gather_generation:]]

  with utils.logged_timer('Finding existing tfrecords...'):
    model_gamedata = {
        model: tf.gfile.Glob(
            os.path.join(selfplay_dir, model, '*'+_TF_RECORD_SUFFIX))
        for model in models
    }
  print('Found {} models'.format(len(models)))
  for model_name, record_files in sorted(model_gamedata.items()):
    print('    {}: {} files'.format(model_name, len(record_files)))

  meta_file = os.path.join(training_chunk_dir, 'meta.txt')
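  # meta.txt records which selfplay files have already been gathered, so
  # repeated runs of gather() skip games that were processed earlier.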
  try:
    with tf.gfile.GFile(meta_file, 'r') as f:
      already_processed = set(f.read().split())
  except tf.errors.NotFoundError:
    already_processed = set()

  num_already_processed = len(already_processed)

  for model_name, record_files in sorted(model_gamedata.items()):
    if set(record_files) <= already_processed:
      continue
    print('Gathering files from {}:'.format(model_name))
    tf_examples = preprocessing.shuffle_tf_examples(
        params.shuffle_buffer_size, params.examples_per_chunk, record_files)
    # Write each shuffled batch of examples out as one training chunk file.
    for i, example_batch in enumerate(tf_examples):
      output_record = os.path.join(
          training_chunk_dir,
          ('{}-{}' + _TF_RECORD_SUFFIX).format(model_name, i))
      preprocessing.write_tf_examples(
          output_record, example_batch, serialize=False)
    already_processed.update(record_files)

  print('Processed {} new files'.format(
      len(already_processed) - num_already_processed))
  with tf.gfile.GFile(meta_file, 'w') as f:
    f.write('\n'.join(sorted(already_processed)))


def train(trained_models_dir, estimator_model_dir, training_chunk_dir,
          generation, params):
  """Train the latest model from gathered data.

  Args:
    trained_models_dir: Where to export the completed generation.
    estimator_model_dir: tf.estimator model directory.
    training_chunk_dir: Directory where gathered training chunks are.
    generation: Which generation you are training.
    params: A MiniGoParams instance of hyperparameters for the model.
  """
  new_model_name = utils.generate_model_name(generation)
  print('New model will be {}'.format(new_model_name))
  new_model = os.path.join(trained_models_dir, new_model_name)

  print('Training on gathered game data...')
  tf_records = sorted(
      tf.gfile.Glob(os.path.join(training_chunk_dir, '*'+_TF_RECORD_SUFFIX)))
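  # params.train_window_size is measured in examples, so dividing by
  # params.examples_per_chunk gives the number of most recent chunks to keep.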
  tf_records = tf_records[
      -(params.train_window_size // params.examples_per_chunk):]

  print('Training from: {} to {}'.format(tf_records[0], tf_records[-1]))
  with utils.logged_timer('Training'):
    dualnet.train(estimator_model_dir, tf_records, generation, params)
    dualnet.export_model(estimator_model_dir, new_model)


def validate(trained_models_dir, holdout_dir, estimator_model_dir, params):
  """Validate the latest model on the holdout dataset.

  Args:
    trained_models_dir: Directory where the completed generations/models are.
    holdout_dir: Directory where the holdout data are.
    estimator_model_dir: tf.estimator model directory.
    params: A MiniGoParams instance of hyperparameters for the model.
  """
  model_num, _ = utils.get_latest_model(trained_models_dir)

  # Get the holdout game data
  nums_names = utils.get_models(trained_models_dir)

  # Model N was trained on games up through model N-1, so the validation set
  # should only contain data from models through N-1 as well; hence the
  # (num_name[0] < model_num) filter below.
  models = [num_name for num_name in nums_names if num_name[0] < model_num]

  # pair is a tuple of (model_num, model_name), like (13, 000013-modelname)
  holdout_dirs = [os.path.join(holdout_dir, pair[1])
                  for pair in models[-params.holdout_generation:]]
  tf_records = []
  with utils.logged_timer('Building lists of holdout files'):
    for record_dir in holdout_dirs:
      if os.path.exists(record_dir):  # make sure holdout dir exists
        tf_records.extend(
            tf.gfile.Glob(os.path.join(record_dir, '*'+_TF_RECORD_SUFFIX)))

  if not tf_records:
    print('No holdout dataset for validation! '
          'Please check your holdout directory: {}'.format(holdout_dir))
    return

  print('Validating on {} holdout tf_records.'.format(len(tf_records)))
  first_tf_record = os.path.basename(tf_records[0])
  last_tf_record = os.path.basename(tf_records[-1])
  with utils.logged_timer('Validating from {} to {}'.format(
      first_tf_record, last_tf_record)):
    dualnet.validate(estimator_model_dir, tf_records, params)


def evaluate(black_model_name, black_net, white_model_name, white_net,
             evaluate_dir, params):
  """Evaluate with two models.

  Two DualNetRunners play as black and white in a series of Go games. The
  model that wins by a margin of 55% is declared the winner.

  Args:
    black_model_name: The name of the model playing black.
    black_net: The DualNetRunner model for black.
    white_model_name: The name of the model playing white.
    white_net: The DualNetRunner model for white.
    evaluate_dir: Where to write the evaluation results. Set as
      'base_dir/sgf/evaluate/'.
    params: A MiniGoParams instance of hyperparameters for the model.

  Returns:
    The model name of the winner.

  Raises:
    ValueError: If neither `WHITE` nor `BLACK` is returned.
  """
  with utils.logged_timer('{} games'.format(params.eval_games)):
    winner = evaluation.play_match(
        params, black_net, white_net, params.eval_games,
        params.eval_readouts, evaluate_dir, params.eval_verbose)

  if winner != go.WHITE_NAME and winner != go.BLACK_NAME:
    raise ValueError('Winner should be either White or Black!')

  return black_model_name if winner == go.BLACK_NAME else white_model_name


def _set_params(flags):
  """Set hyperparameters from board size.

  Args:
    flags: Flags parsed by argparse.

  Returns:
    A MiniGoParams instance of hyperparameters.
  """
  params = model_params.MiniGoParams()
  k = utils.round_power_of_two(flags.board_size ** 2 / 3)
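  # Assuming round_power_of_two rounds to the nearest power of two, a 9x9
  # board gives k = 32 (from 81/3 = 27) and a 19x19 board gives k = 128
  # (from 361/3 = ~120).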
  params.num_filters = k  # Number of filters in the convolution layer
  params.fc_width = 2 * k  # Width of each fully connected layer
  params.num_shared_layers = flags.board_size  # Number of shared trunk layers
  params.board_size = flags.board_size  # Board size

  # How many positions can fit on a graphics card: 256 for 9x9 boards,
  # 16 or 32 for 19x19 boards.
  if flags.batch_size is None:
    if flags.board_size == 9:
      params.batch_size = 256
    else:
      params.batch_size = 32
  else:
    params.batch_size = flags.batch_size

  return params


def _prepare_selfplay(
    model_name, trained_models_dir, selfplay_dir, holdout_dir, sgf_dir, params):
  """Set directories and load the network for selfplay.

  Args:
    model_name: The name of the model for self-play.
    trained_models_dir: Directory where the completed generations/models are.
    selfplay_dir: Where to write the games. Set as 'base_dir/data/selfplay/'.
    holdout_dir: Where to write the holdout data. Set as
      'base_dir/data/holdout/'.
    sgf_dir: Where to write the sgf (Smart Game Format) files. Set as
      'base_dir/sgf/'.
    params: A MiniGoParams instance of hyperparameters for the model.

  Returns:
    The directories and network model for selfplay.
  """
  # Set paths for the model with 'model_name'
  model_path = os.path.join(trained_models_dir, model_name)
  output_dir = os.path.join(selfplay_dir, model_name)
  holdout_dir = os.path.join(holdout_dir, model_name)
  # clean_sgf holds SGF files written without comments;
  # full_sgf holds SGF files written with comments.
  clean_sgf = os.path.join(sgf_dir, model_name, 'clean')
  full_sgf = os.path.join(sgf_dir, model_name, 'full')

  _ensure_dir_exists(output_dir)
  _ensure_dir_exists(holdout_dir)
  _ensure_dir_exists(clean_sgf)
  _ensure_dir_exists(full_sgf)
  selfplay_dirs = {
      'output_dir': output_dir,
      'holdout_dir': holdout_dir,
      'clean_sgf': clean_sgf,
      'full_sgf': full_sgf
  }
  # cache the network model for self-play
  with utils.logged_timer('Loading weights from {} ... '.format(model_path)):
    network = dualnet.DualNetRunner(model_path, params)
  return selfplay_dirs, network


def run_selfplay(selfplay_model, selfplay_games, dirs, params):
  """Run selfplay to generate training data.

  Args:
    selfplay_model: The model name for selfplay.
    selfplay_games: The number of selfplay games.
    dirs: A MiniGoDirectory instance of directories used in each step.
    params: A MiniGoParams instance of hyperparameters for the model.
  """
  selfplay_dirs, network = _prepare_selfplay(
      selfplay_model, dirs.trained_models_dir, dirs.selfplay_dir,
      dirs.holdout_dir, dirs.sgf_dir, params)

  print('Self-play with model: {}'.format(selfplay_model))
  for _ in range(selfplay_games):
    selfplay(selfplay_dirs, network, params)


def main(_):
  """Run the reinforcement learning loop."""
  tf.logging.set_verbosity(tf.logging.INFO)

  params = _set_params(FLAGS)

  # A dummy model for debugging/testing purposes, with fewer games and
  # iterations.
  if FLAGS.test:
    params = model_params.DummyMiniGoParams()
    base_dir = FLAGS.base_dir + str(FLAGS.board_size) + '_size_dummy/'
  else:
    # Set directories for models and datasets
    base_dir = FLAGS.base_dir + str(FLAGS.board_size) + '_size/'

  dirs = utils.MiniGoDirectory(base_dir)

  # Run selfplay only if user specifies the argument.
  if FLAGS.selfplay:
    selfplay_model_name = FLAGS.selfplay_model_name or utils.get_latest_model(
        dirs.trained_models_dir)[1]
    max_games = FLAGS.selfplay_max_games or params.max_games_per_generation
    run_selfplay(selfplay_model_name, max_games, dirs, params)
    return

  # Run the RL pipeline
  # If no models have been trained, start from the bootstrap model.

  if not os.path.isdir(dirs.trained_models_dir):
    print('No trained model exists! Starting from bootstrap...')
    print('Creating random initial weights...')
    bootstrap(dirs.estimator_model_dir, dirs.trained_models_dir, params)
  else:
    print('A MiniGo base directory has been found!')
    print('Starting from the last checkpoint...')

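  # Each RL iteration: run selfplay with the best model so far, gather the
  # new games into training chunks, train the next generation, then
  # optionally validate it and evaluate it against the current best model.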
  _, best_model_so_far = utils.get_latest_model(dirs.trained_models_dir)
  for rl_iter in range(params.max_iters_per_pipeline):
    print('RL_iteration: {}'.format(rl_iter))
    # Self-play with the best model to generate training data
    run_selfplay(
        best_model_so_far, params.max_games_per_generation, dirs, params)

    # Gather selfplay data for training.
    print('Gathering game output...')
    gather(dirs.selfplay_dir, dirs.training_chunk_dir, params)

    # Train the next generation model.
    model_num, _ = utils.get_latest_model(dirs.trained_models_dir)
    print('Training on gathered game data...')
    train(dirs.trained_models_dir, dirs.estimator_model_dir,
          dirs.training_chunk_dir, model_num + 1, params)

    # Validate the latest model if needed.
    if FLAGS.validation:
      print('Validating on the holdout game data...')
      validate(dirs.trained_models_dir, dirs.holdout_dir,
               dirs.estimator_model_dir, params)

    _, current_model = utils.get_latest_model(dirs.trained_models_dir)

    if FLAGS.evaluation:  # Perform evaluation if needed
      print('Evaluating models {} and {}...'.format(
          best_model_so_far, current_model))
      black_model = os.path.join(dirs.trained_models_dir, best_model_so_far)
      white_model = os.path.join(dirs.trained_models_dir, current_model)
      _ensure_dir_exists(dirs.evaluate_dir)
      with utils.logged_timer('Loading weights'):
        black_net = dualnet.DualNetRunner(black_model, params)
        white_net = dualnet.DualNetRunner(white_model, params)

      best_model_so_far = evaluate(
          best_model_so_far, black_net, current_model, white_net,
          dirs.evaluate_dir, params)
      print('Winner of evaluation: {}!'.format(best_model_so_far))
    else:
      best_model_so_far = current_model


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  # Flags to run the RL pipeline
  parser.add_argument(
      '--base_dir',
      type=str,
      default='/tmp/minigo/',
      metavar='BD',
      help='Base directory for the MiniGo models and datasets.')
  parser.add_argument(
      '--board_size',
      type=int,
      default=9,
      metavar='N',
      choices=[9, 19],
      help='Go board size. The default size is 9.')
  parser.add_argument(
      '--batch_size',
      type=int,
      default=None,
      metavar='BS',
      help='Batch size for training. If not set, defaults to 256 for board '
           'size 9 and 32 for board size 19.')
  # Test the pipeline with a dummy model
  parser.add_argument(
      '--test',
      action='store_true',
      help='A boolean to test the RL pipeline with a dummy model.')
  # Run RL pipeline with the validation step
  parser.add_argument(
      '--validation',
      action='store_true',
      help='A boolean to specify validation in the RL pipeline.')
  # Run RL pipeline with the evaluation step
  parser.add_argument(
      '--evaluation',
      action='store_true',
      help='A boolean to specify evaluation in the RL pipeline.')

  # Flags for running self-play only
  parser.add_argument(
      '--selfplay',
      action='store_true',
      help='A boolean to run self-play only.')
  parser.add_argument(
      '--selfplay_model_name',
      type=str,
      default=None,
      metavar='SM',
      help='The model used for self-play only.')
  parser.add_argument(
      '--selfplay_max_games',
      type=int,
      default=None,
      metavar='SMG',
      help='The number of selfplay games to generate when running '
           'self-play only.')

  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)