# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines DualNet model, the architecture of the policy and value network.

The input to the neural network is a [board_size * board_size * 17] image stack
comprising 17 binary feature planes. 8 feature planes consist of binary values
indicating the presence of the current player's stones; A further 8 feature
planes represent the corresponding features for the opponent's stones; The final
feature plane represents the color to play, and has a constant value of either 1
if black is to play or 0 if white to play. Check 'features.py' for more details.

In MiniGo implementation, the input features are processed by a residual tower
that consists of a single convolutional block followed by either 9 or 19
residual blocks.
The convolutional block applies the following modules:
  1. A convolution of num_filter filters of kernel size 3 x 3 with stride 1
  2. Batch normalization
  3. A rectifier non-linearity
Each residual block applies the following modules sequentially to its input:
  1. A convolution of num_filter filters of kernel size 3 x 3 with stride 1
  2. Batch normalization
  3. A rectifier non-linearity
  4. A convolution of num_filter filters of kernel size 3 x 3 with stride 1
  5. Batch normalization
  6. A skip connection that adds the input to the block
  7. A rectifier non-linearity
Note: num_filter is 128 for 19 x 19 board size, and 32 for 9 x 9 board size.

The output of the residual tower is passed into two separate "heads" for
computing the policy and value respectively. The policy head applies the
following modules:
  1. A convolution of 2 filters of kernel size 1 x 1 with stride 1
  2. Batch normalization
  3. A rectifier non-linearity
  4. A fully connected linear layer that outputs a vector of size 19^2 + 1 = 362
  corresponding to logit probabilities for all intersections and the pass move
The value head applies the following modules:
  1. A convolution of 1 filter of kernel size 1 x 1 with stride 1
  2. Batch normalization
  3. A rectifier non-linearity
  4. A fully connected linear layer to a hidden layer of size 256 for 19 x 19
    board size and 64 for 9x9 board size
  5. A rectifier non-linearity
  6. A fully connected linear layer to a scalar
  7. A tanh non-linearity outputting a scalar in the range [-1, 1]

The overall network depth, in the 10 or 20 block network, is 19 or 39
parameterized layers respectively for the residual tower, plus an additional 2
layers for the policy head and 3 layers for the value head.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

_BATCH_NORM_DECAY = 0.997
_BATCH_NORM_EPSILON = 1e-5


def _batch_norm(inputs, training, center=True, scale=True):
  """Performs a batch normalization using a standard set of parameters."""
  return tf.layers.batch_normalization(
      inputs=inputs, momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON,
      center=center, scale=scale, fused=True, training=training)


def _conv2d(inputs, filters, kernel_size):
  """Performs 2D convolution with a standard set of parameters."""
  return tf.layers.conv2d(
      inputs=inputs, filters=filters, kernel_size=kernel_size,
      padding='same')


def _conv_block(inputs, filters, kernel_size, training):
  """A convolutional block.

  Args:
    inputs: A tensor representing a batch of input features with shape
      [BATCH_SIZE, board_size, board_size, features.NEW_FEATURES_PLANES].
    filters: The number of filters for network layers in residual tower.
    kernel_size: The kernel to be used in conv2d.
    training: Either True or False, whether we are currently training the
      model. Needed for batch norm.

  Returns:
    The output tensor of the convolutional block layer.
  """
  conv = _conv2d(inputs, filters, kernel_size)
  batchn = _batch_norm(conv, training)

  output = tf.nn.relu(batchn)

  return output


def _res_block(inputs, filters, kernel_size, training):
  """A residual block.

  Args:
    inputs: A tensor representing a batch of input features with shape
      [BATCH_SIZE, board_size, board_size, features.NEW_FEATURES_PLANES].
    filters: The number of filters for network layers in residual tower.
    kernel_size: The kernel to be used in conv2d.
    training: Either True or False, whether we are currently training the
      model. Needed for batch norm.

  Returns:
    The output tensor of the residual block layer.
  """

  initial_output = _conv_block(inputs, filters, kernel_size, training)

  int_layer2_conv = _conv2d(initial_output, filters, kernel_size)
  int_layer2_batchn = _batch_norm(int_layer2_conv, training)

  output = tf.nn.relu(inputs + int_layer2_batchn)

  return output


class Model(object):
  """Base class for building the DualNet Model."""

  def __init__(self, num_filters, num_shared_layers, fc_width, board_size):
    """Initialize a model for computing the policy and value in RL.

    Args:
      num_filters: Number of filters (AlphaGoZero used 256). We use 128 by
        default for a 19x19 go board, and 32 for 9x9 size.
      num_shared_layers: Number of shared residual blocks.  AGZ used both 19
        and 39. Here we use 19 for 19x19 size and 9 for 9x9 size because it's
        faster to train.
      fc_width: Dimensionality of the fully connected linear layer.
      board_size: A single integer for the board size.
    """
    self.num_filters = num_filters
    self.num_shared_layers = num_shared_layers
    self.fc_width = fc_width
    self.board_size = board_size
    self.kernel_size = [3, 3]  # kernel size is from AGZ paper

  def __call__(self, inputs, training):
    """Add operations to classify a batch of input Go features.

    Args:
      inputs: A Tensor representing a batch of input Go features with shape
        [BATCH_SIZE, board_size, board_size, features.NEW_FEATURES_PLANES]
      training: A boolean. Set to True to add operations required only when
        training the classifier.

    Returns:
      policy_logits: A vector of size self.board_size * self.board_size + 1
        corresponding to the policy logit probabilities for all intersections
        and the pass move.
      value_logits: A scalar for the value logits output
    """
    initial_output = _conv_block(
        inputs=inputs, filters=self.num_filters,
        kernel_size=self.kernel_size, training=training)
    # the shared stack
    shared_output = initial_output
    for _ in range(self.num_shared_layers):
      shared_output = _res_block(
          inputs=shared_output, filters=self.num_filters,
          kernel_size=self.kernel_size, training=training)

    # policy head
    policy_conv2d = _conv2d(inputs=shared_output, filters=2, kernel_size=[1, 1])
    policy_batchn = _batch_norm(inputs=policy_conv2d, training=training,
                                center=False, scale=False)
    policy_relu = tf.nn.relu(policy_batchn)
    policy_logits = tf.layers.dense(
        tf.reshape(policy_relu, [-1, self.board_size * self.board_size * 2]),
        self.board_size * self.board_size + 1)

    # value head
    value_conv2d = _conv2d(shared_output, filters=1, kernel_size=[1, 1])
    value_batchn = _batch_norm(value_conv2d, training,
                               center=False, scale=False)
    value_relu = tf.nn.relu(value_batchn)
    value_fc_hidden = tf.nn.relu(tf.layers.dense(
        tf.reshape(value_relu, [-1, self.board_size * self.board_size]),
        self.fc_width))
    value_logits = tf.reshape(tf.layers.dense(value_fc_hidden, 1), [-1])

    return policy_logits, value_logits


def model_fn(features, labels, mode, params, config=None):  # pylint: disable=unused-argument
  """DualNet model function.

  Args:
    features: tensor with shape
      [BATCH_SIZE, self.board_size, self.board_size,
      features.NEW_FEATURES_PLANES]
    labels: dict from string to tensor with shape
      'pi_tensor': [BATCH_SIZE, self.board_size * self.board_size + 1]
      'value_tensor': [BATCH_SIZE]
    mode: a tf.estimator.ModeKeys (batchnorm params update for TRAIN only)
    params: an object of hyperparams
    config: ignored; is required by Estimator API.
  Returns:
    EstimatorSpec parameterized according to the input params and the current
    mode.
  """
  model = Model(params.num_filters, params.num_shared_layers, params.fc_width,
                params.board_size)
  policy_logits, value_logits = model(
      features, mode == tf.estimator.ModeKeys.TRAIN)

  policy_output = tf.nn.softmax(policy_logits, name='policy_output')
  value_output = tf.nn.tanh(value_logits, name='value_output')

  # Calculate model loss. The loss function sums over the mean-squared error,
  # the cross-entropy losses and the l2 regularization term.
  # Cross-entropy of policy
  policy_entropy = -tf.reduce_mean(tf.reduce_sum(
      policy_output * tf.log(policy_output), axis=1))
  policy_cost = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
          logits=policy_logits, labels=labels['pi_tensor']))
  # Mean squared error
  value_cost = tf.reduce_mean(
      tf.square(value_output - labels['value_tensor']))
  # L2 term
  l2_cost = params.l2_strength * tf.add_n(
      [tf.nn.l2_loss(v) for v in tf.trainable_variables()
       if 'bias' not in v.name])
  # The loss function
  combined_cost = policy_cost + value_cost + l2_cost

  # Get model train ops
  global_step = tf.train.get_or_create_global_step()
  boundaries = [int(1e6), int(2e6)]
  values = [1e-2, 1e-3, 1e-4]
  learning_rate = tf.train.piecewise_constant(
      global_step, boundaries, values)
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_op = tf.train.MomentumOptimizer(
        learning_rate, params.momentum).minimize(
            combined_cost, global_step=global_step)

  # Create multiple tensors for logging purpose
  metric_ops = {
      'accuracy': tf.metrics.accuracy(labels=labels['pi_tensor'],
                                      predictions=policy_output,
                                      name='accuracy_op'),
      'policy_cost': tf.metrics.mean(policy_cost),
      'value_cost': tf.metrics.mean(value_cost),
      'l2_cost': tf.metrics.mean(l2_cost),
      'policy_entropy': tf.metrics.mean(policy_entropy),
      'combined_cost': tf.metrics.mean(combined_cost),
  }
  for metric_name, metric_op in metric_ops.items():
    tf.summary.scalar(metric_name, metric_op[1])

  # Return tf.estimator.EstimatorSpec
  return tf.estimator.EstimatorSpec(
      mode=mode,
      predictions={
          'policy_output': policy_output,
          'value_output': value_output,
      },
      loss=combined_cost,
      train_op=train_op,
      eval_metric_ops=metric_ops)