# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from argparse import ArgumentParser, REMAINDER


def parse_args():
    """
    Helper function parsing the command line options
    @retval ArgumentParser
    """
    parser = ArgumentParser(
        description="The script to print PaddlePaddle CPU binding cmd.")

    # Optional arguments for the launch helper
    parser.add_argument(
        "--nnodes",
        type=int,
        default=1,
        help="The number of nodes to use for distributed "
        "training")
    parser.add_argument(
        "--node_rank",
        type=int,
        default=0,
        help="The rank of the node for multi-node distributed "
        "training")
    parser.add_argument(
        "--local_rank", type=int, default=0, help="The local rank.")
    parser.add_argument(
        "--nproc_per_node",
        type=int,
        default=1,
        help="The number of processes to launch on each node, "
        "for GPU training, this is recommended to be set "
        "to the number of GPUs in your system so that "
        "each process can be bound to a single GPU.")
    parser.add_argument(
        '--no_hyperthreads',
        action='store_true',
        help='Flag to disable binding to hyperthreads')
    parser.add_argument(
        '--no_membind',
        action='store_true',
        help='Flag to disable memory binding')

    # non-optional arguments for binding
    parser.add_argument(
        "--nsockets_per_node",
        type=int,
        required=True,
        help="Number of CPU sockets on a node")
    parser.add_argument(
        "--ncores_per_socket",
        type=int,
        required=True,
        help="Number of CPU cores per socket")

    return parser.parse_args()


def main():
    args = parse_args()

    # variables for numactrl binding

    NSOCKETS = args.nsockets_per_node
    NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (
        args.nproc_per_node % args.nsockets_per_node) else 0)
    NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET

    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes

    local_rank = args.local_rank

    # each process's rank
    dist_rank = args.nproc_per_node * args.node_rank + local_rank

    # form numactrl binding command
    cpu_ranges = [
        local_rank * NCORES_PER_GPU, (local_rank + 1) * NCORES_PER_GPU - 1,
        local_rank * NCORES_PER_GPU +
        (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
        (local_rank + 1) * NCORES_PER_GPU +
        (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1
    ]

    numactlargs = []
    if args.no_hyperthreads:
        numactlargs += ["--physcpubind={}-{}".format(*cpu_ranges[0:2])]
    else:
        numactlargs += ["--physcpubind={}-{},{}-{}".format(*cpu_ranges)]

    if not args.no_membind:
        memnode = local_rank // NGPUS_PER_SOCKET
        numactlargs += ["--membind={}".format(memnode)]

    cmd = ["/usr/bin/numactl"] + numactlargs
    print(" ".join(cmd))


if __name__ == "__main__":
    main()