Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm. See merge request dcutoolkit/deeplearing/deepspeed!2

Merge branch 'ds-v0.9.2-rocm' into 'main'
Ds v0.9.2 rocm. See merge request dcutoolkit/deeplearing/deepspeed!2
c25a91b6 · aiss · d1596c94 · af82b300 · c25a91b6 · c25a91b6
Commit c25a91b6 authored May 30, 2023 by aiss
20 changed files
--- a/csrc/aio/common/deepspeed_aio_common.h
+++ b/csrc/aio/common/deepspeed_aio_common.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/common/deepspeed_aio_types.cpp
+++ b/csrc/aio/common/deepspeed_aio_types.cpp
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/common/deepspeed_aio_types.h
+++ b/csrc/aio/common/deepspeed_aio_types.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/common/deepspeed_aio_utils.cpp
+++ b/csrc/aio/common/deepspeed_aio_utils.cpp
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/common/deepspeed_aio_utils.h
+++ b/csrc/aio/common/deepspeed_aio_utils.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/py_lib/deepspeed_aio_thread.h
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp
+++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2023 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for managing CPU tensors occupying page-locked memory.
 */

--- a/csrc/aio/py_lib/deepspeed_pin_tensor.h
+++ b/csrc/aio/py_lib/deepspeed_pin_tensor.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2023 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for managing CPU tensors occupying page-locked memory.
 TODO: Implement a full-featured manager that
- 1. Avoid page-locked memory leaks
+1. Avoid page-locked memory leaks
- 2. Minimize page-locked memory usage by reducing internal fragmentation
+2. Minimize page-locked memory usage by reducing internal fragmentation
+Functionality for managing CPU tensors occupying page-locked memory.
 */
 #include <map>

--- a/csrc/aio/py_lib/deepspeed_py_aio.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+// DeepSpeed Team
 /*
 Copyright 2020 The Microsoft DeepSpeed Team

--- a/csrc/aio/py_lib/deepspeed_py_aio.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio.h
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+// DeepSpeed Team
 /*
 Copyright 2020 The Microsoft DeepSpeed Team

--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+// DeepSpeed Team
 /*
 Copyright 2020 The Microsoft DeepSpeed Team

--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/py_lib/deepspeed_py_copy.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/py_lib/deepspeed_py_copy.h
+++ b/csrc/aio/py_lib/deepspeed_py_copy.h
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+// DeepSpeed Team
 /*
 Copyright 2020 The Microsoft DeepSpeed Team

--- a/csrc/aio/py_lib/py_ds_aio.cpp
+++ b/csrc/aio/py_lib/py_ds_aio.cpp
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+// DeepSpeed Team
+/*
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

--- a/csrc/aio/py_test/aio_bench_generate_param.py
+++ b/csrc/aio/py_test/aio_bench_generate_param.py
-"""
+# Copyright (c) Microsoft Corporation.
-Copyright 2021 The Microsoft DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+# DeepSpeed Team
+"""
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
 import os
@@ -14,13 +15,10 @@ from perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR
 def parse_arguments():
    parser = argparse.ArgumentParser()
-    parser.add_argument(
+    parser.add_argument('--log_dir',
-        '--log_dir',
+                        type=str,
-        type=str,
+                        default=BENCH_LOG_DIR,
-        default=BENCH_LOG_DIR,
+                        help=f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}')
-        help=
-        f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}'
-    )
    args = parser.parse_args()
    print(f'args = {args}')
@@ -75,9 +73,7 @@ def generate_aio_param(read_log_dir, write_log_dir):
    optimal_config_read = read_results.get(read_perf_keys[optimal_key], None)
    optimal_config_write = write_results.get(write_perf_keys[optimal_key], None)
-    print(
+    print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}')
-        f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}'
-    )
    print(json.dumps(aio_param, indent=3))

--- a/csrc/aio/py_test/aio_bench_perf_sweep.py
+++ b/csrc/aio/py_test/aio_bench_perf_sweep.py
-"""
+# Copyright (c) Microsoft Corporation.
-Copyright 2021 The Microsoft DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+# DeepSpeed Team
+"""
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
 import os
@@ -20,20 +21,16 @@ from deepspeed.ops.op_builder import AsyncIOBuilder
 OTHER_OPTIONS = '--handle'
 PERF_SCRIPT = 'test_ds_aio.py'
 DEFAULT_SWEEP_CONFIG = {
-    "block_size": ["128K",
+    "block_size": ["128K", "256K"],
-                   "256K"],
+    "queue_depth": [4, 16, 32],
-    "queue_depth": [4,
+    "overlap_events": [True, False],
-                    16,
+    "io_parallel": [2, 8],
-                    32],
-    "overlap_events": [True,
-                       False],
-    "io_parallel": [2,
-                    8],
    "single_submit": [False]
 }
 class Job(object):
    def __init__(self, cmd_line, output_file=None, work_dir=None):
        self.cmd_line = cmd_line
        self.output_file = output_file
@@ -63,6 +60,7 @@ class Job(object):
 class SweepConfig(object):
    def __init__(self, args):
        self.nvme_dir = args.nvme_dir
        self.io_size = args.io_size
@@ -78,52 +76,35 @@ class SweepConfig(object):
 def parse_arguments():
    parser = argparse.ArgumentParser()
-    parser.add_argument(
+    parser.add_argument('--nvme_dir',
-        '--nvme_dir',
+                        required=True,
-        required=True,
-        type=str,
-        help=
-        'Directory in which to perform I/O tests. A writeable directory on a NVMe device.'
-    )
-    parser.add_argument('--sweep_config',
                        type=str,
-                        default=None,
+                        help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.')
-                        help='Performance sweep configuration json file.')
-    parser.add_argument('--no_read',
+    parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.')
-                        action='store_true',
-                        help='Disable read performance measurements.')
-    parser.add_argument('--no_write',
+    parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.')
-                        action='store_true',
-                        help='Disable write performance measurements.')
-    parser.add_argument(
+    parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.')
-        '--io_size',
-        type=str,
+    parser.add_argument('--io_size',
-        default="400M",
+                        type=str,
-        help='Number of I/O bytes to read/write for performance measurements.')
+                        default="400M",
+                        help='Number of I/O bytes to read/write for performance measurements.')
    parser.add_argument(
        '--no_sudo',
        action='store_true',
        help=
-        'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.'
+        'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.')
-    )
    parser.add_argument(
        '--log_dir',
        type=str,
        default=BENCH_LOG_DIR,
-        help=
+        help=f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}')
-        f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}'
-    )
-    parser.add_argument('--loops',
+    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
-                        type=int,
-                        default=1,
-                        help='Count of operation repetitions')
    args = parser.parse_args()
    print(f'args = {args}')
@@ -147,6 +128,7 @@ def get_sweep_config_dict(sweep_config_json):
 def get_sweep_cmd_lines(sweep_config_dict):
    def flatten_options(key, value_list):
        flat_list = []
        for v in value_list:
@@ -170,11 +152,7 @@ def run_job(job):
    args = ' '.join(job.cmd())
    print(f'args = {args}')
    job.open_output_file()
-    proc = subprocess.run(args=args,
+    proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
-                          shell=True,
-                          stdout=job.get_stdout(),
-                          stderr=job.get_stderr(),
-                          cwd=job.get_cwd())
    job.close_output_file()
    assert proc.returncode == 0, \
    f"This command failed: {job.cmd()}"
@@ -240,14 +218,7 @@ def get_log_file(io_op_desc, cmd_line):
            return tag_key
        return f'{tag_key}{value}'
-    tag_list = [
+    tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE]
-        SINGLE_SUBMIT,
-        OVERLAP_EVENTS,
-        THREAD_COUNT,
-        IO_PARALLEL,
-        QUEUE_DEPTH,
-        BLOCK_SIZE
-    ]
    log_tags = [io_op_desc]
    cmd_tags = create_cmd_tags(cmd_line)
    for tag in tag_list:
@@ -298,16 +269,10 @@ def create_read_file(sweep_config):
    os.makedirs(read_folder, exist_ok=True)
    read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
    block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
-    dd_job = Job(cmd_line=[
+    dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'])
-        f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'
+    print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
-    ])
-    print(
-        f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....'
-    )
    run_job(dd_job)
-    print(
+    print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
-        f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....'
-    )
    return read_folder, read_file_name
@@ -319,20 +284,15 @@ def remove_folder(folder):
 def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    read_folder, read_file_name = create_read_file(sweep_config)
    read_option = f'--read_file {read_file_name}'
-    read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd
+    read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
-                      for cmd in cmd_lines]
    #dump_cmd_lines(read_cmd_lines)
    log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)
-    perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC,
+    perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines)
-                                 log_dir=log_folder,
-                                 cmd_lines=read_cmd_lines)
-    launch_sweep(sweep_jobs=perf_jobs,
+    launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)
-                 sync_job=sync_job,
-                 flush_cache_job=flush_cache_job)
    remove_folder(read_folder)
@@ -342,20 +302,15 @@ def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    os.makedirs(write_folder, exist_ok=True)
    write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
    write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
-    write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd
+    write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
-                       for cmd in cmd_lines]
    #dump_cmd_lines(write_cmd_lines)
    log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)
-    perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC,
+    perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines)
-                                 log_dir=log_folder,
-                                 cmd_lines=write_cmd_lines)
-    launch_sweep(sweep_jobs=perf_jobs,
+    launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)
-                 sync_job=sync_job,
-                 flush_cache_job=flush_cache_job)
    remove_folder(write_folder)
@@ -376,10 +331,7 @@ def main():
    cmd_lines = get_sweep_cmd_lines(sweep_config.search_space)
    if sweep_config.flush_cache:
-        flush_cache_job = Job(
+        flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"])
-            cmd_line=['sudo',
-                      'bash -c',
-                      "'echo 1 > /proc/sys/vm/drop_caches'"])
    else:
        flush_cache_job = None

--- a/csrc/aio/py_test/ds_aio_basic.py
+++ b/csrc/aio/py_test/ds_aio_basic.py
-"""
+# Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+# DeepSpeed Team
+"""
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
@@ -20,14 +21,8 @@ def pre_basic(args, tid, read_op):
    file = args.read_file if read_op else f'{args.write_file}.{tid}'
    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
-    buffer = get_accelerator().pin_memory(
+    buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))
-        torch.empty(num_bytes,
+    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
-                    dtype=torch.uint8,
-                    device='cpu'))
-    task_log(
-        tid,
-        f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
-    )
    ctxt = {}
    ctxt['file'] = file
@@ -60,13 +55,8 @@ def post_basic(pool_params):
 def main_basic_read(pool_params):
    args, tid, ctxt = pool_params
    start_time = time.time()
-    AsyncIOBuilder().load().aio_read(ctxt['buffer'],
+    AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
-                                     ctxt['file'],
+                                     args.single_submit, args.overlap_events, args.validate)
-                                     args.block_size,
-                                     args.queue_depth,
-                                     args.single_submit,
-                                     args.overlap_events,
-                                     args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time
@@ -76,13 +66,8 @@ def main_basic_read(pool_params):
 def main_basic_write(pool_params):
    args, tid, ctxt = pool_params
    start_time = time.time()
-    AsyncIOBuilder().load().aio_write(ctxt['buffer'],
+    AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
-                                      ctxt['file'],
+                                      args.single_submit, args.overlap_events, args.validate)
-                                      args.block_size,
-                                      args.queue_depth,
-                                      args.single_submit,
-                                      args.overlap_events,
-                                      args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
-"""
+# Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+# DeepSpeed Team
+"""
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
@@ -20,27 +21,17 @@ def pre_handle(args, tid, read_op):
    file = args.read_file if read_op else f'{args.write_file}.{tid}'
    io_parallel = args.io_parallel if args.io_parallel else 1
-    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
+    handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
-                                                args.queue_depth,
+                                                args.overlap_events, io_parallel)
-                                                args.single_submit,
-                                                args.overlap_events,
-                                                io_parallel)
    task_log(tid, f'Created deepspeed aio handle')
    if args.gpu:
-        buffer = torch.empty(num_bytes,
+        buffer = torch.empty(num_bytes, dtype=torch.uint8, device=get_accelerator().device_name())
-                             dtype=torch.uint8,
-                             device=get_accelerator().device_name())
    else:
        if args.use_accelerator_pin_memory:
-            buffer = get_accelerator().pin_memory(
+            buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))
-                torch.empty(num_bytes,
-                            dtype=torch.uint8,
-                            device='cpu'))
        else:
-            buffer = handle.new_cpu_locked_tensor(num_bytes,
+            buffer = handle.new_cpu_locked_tensor(num_bytes, torch.empty(0, dtype=torch.uint8))
-                                                  torch.empty(0,
-                                                              dtype=torch.uint8))
    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
@@ -51,10 +42,7 @@ def pre_handle(args, tid, read_op):
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0
-    task_log(
+    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
-        tid,
-        f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
-    )
    return ctxt