Commit c25a91b6 authored by aiss
Browse files

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
/*
Copyright 2023 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for managing CPU tensors occupying page-locked memory.
*/
......
/*
Copyright 2023 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for managing CPU tensors occupying page-locked memory.
TODO: Implement a full-featured manager that
1. Avoid page-locked memory leaks
2. Minimize page-locked memory usage by reducing internal fragmentation
1. Avoid page-locked memory leaks
2. Minimize page-locked memory usage by reducing internal fragmentation
Functionality for managing CPU tensors occupying page-locked memory.
*/
#include <map>
......
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Copyright 2020 The Microsoft DeepSpeed Team
......
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Copyright 2020 The Microsoft DeepSpeed Team
......
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Copyright 2020 The Microsoft DeepSpeed Team
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Copyright 2020 The Microsoft DeepSpeed Team
......
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
......
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
......@@ -14,13 +15,10 @@ from perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
'--log_dir',
type=str,
default=BENCH_LOG_DIR,
help=
f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}'
)
parser.add_argument('--log_dir',
type=str,
default=BENCH_LOG_DIR,
help=f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}')
args = parser.parse_args()
print(f'args = {args}')
......@@ -75,9 +73,7 @@ def generate_aio_param(read_log_dir, write_log_dir):
optimal_config_read = read_results.get(read_perf_keys[optimal_key], None)
optimal_config_write = write_results.get(write_perf_keys[optimal_key], None)
print(
f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}'
)
print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}')
print(json.dumps(aio_param, indent=3))
......
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
......@@ -20,20 +21,16 @@ from deepspeed.ops.op_builder import AsyncIOBuilder
OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG = {
"block_size": ["128K",
"256K"],
"queue_depth": [4,
16,
32],
"overlap_events": [True,
False],
"io_parallel": [2,
8],
"block_size": ["128K", "256K"],
"queue_depth": [4, 16, 32],
"overlap_events": [True, False],
"io_parallel": [2, 8],
"single_submit": [False]
}
class Job(object):
def __init__(self, cmd_line, output_file=None, work_dir=None):
self.cmd_line = cmd_line
self.output_file = output_file
......@@ -63,6 +60,7 @@ class Job(object):
class SweepConfig(object):
def __init__(self, args):
self.nvme_dir = args.nvme_dir
self.io_size = args.io_size
......@@ -78,52 +76,35 @@ class SweepConfig(object):
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
'--nvme_dir',
required=True,
type=str,
help=
'Directory in which to perform I/O tests. A writeable directory on a NVMe device.'
)
parser.add_argument('--sweep_config',
parser.add_argument('--nvme_dir',
required=True,
type=str,
default=None,
help='Performance sweep configuration json file.')
help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.')
parser.add_argument('--no_read',
action='store_true',
help='Disable read performance measurements.')
parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.')
parser.add_argument('--no_write',
action='store_true',
help='Disable write performance measurements.')
parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.')
parser.add_argument(
'--io_size',
type=str,
default="400M",
help='Number of I/O bytes to read/write for performance measurements.')
parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.')
parser.add_argument('--io_size',
type=str,
default="400M",
help='Number of I/O bytes to read/write for performance measurements.')
parser.add_argument(
'--no_sudo',
action='store_true',
help=
'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.'
)
'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.')
parser.add_argument(
'--log_dir',
type=str,
default=BENCH_LOG_DIR,
help=
f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}'
)
help=f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}')
parser.add_argument('--loops',
type=int,
default=1,
help='Count of operation repetitions')
parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
args = parser.parse_args()
print(f'args = {args}')
......@@ -147,6 +128,7 @@ def get_sweep_config_dict(sweep_config_json):
def get_sweep_cmd_lines(sweep_config_dict):
def flatten_options(key, value_list):
flat_list = []
for v in value_list:
......@@ -170,11 +152,7 @@ def run_job(job):
args = ' '.join(job.cmd())
print(f'args = {args}')
job.open_output_file()
proc = subprocess.run(args=args,
shell=True,
stdout=job.get_stdout(),
stderr=job.get_stderr(),
cwd=job.get_cwd())
proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
job.close_output_file()
assert proc.returncode == 0, \
f"This command failed: {job.cmd()}"
......@@ -240,14 +218,7 @@ def get_log_file(io_op_desc, cmd_line):
return tag_key
return f'{tag_key}{value}'
tag_list = [
SINGLE_SUBMIT,
OVERLAP_EVENTS,
THREAD_COUNT,
IO_PARALLEL,
QUEUE_DEPTH,
BLOCK_SIZE
]
tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE]
log_tags = [io_op_desc]
cmd_tags = create_cmd_tags(cmd_line)
for tag in tag_list:
......@@ -298,16 +269,10 @@ def create_read_file(sweep_config):
os.makedirs(read_folder, exist_ok=True)
read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
dd_job = Job(cmd_line=[
f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'
])
print(
f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....'
)
dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'])
print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
run_job(dd_job)
print(
f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....'
)
print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
return read_folder, read_file_name
......@@ -319,20 +284,15 @@ def remove_folder(folder):
def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
read_folder, read_file_name = create_read_file(sweep_config)
read_option = f'--read_file {read_file_name}'
read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd
for cmd in cmd_lines]
read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
#dump_cmd_lines(read_cmd_lines)
log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
os.makedirs(log_folder, exist_ok=True)
perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC,
log_dir=log_folder,
cmd_lines=read_cmd_lines)
perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines)
launch_sweep(sweep_jobs=perf_jobs,
sync_job=sync_job,
flush_cache_job=flush_cache_job)
launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)
remove_folder(read_folder)
......@@ -342,20 +302,15 @@ def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
os.makedirs(write_folder, exist_ok=True)
write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd
for cmd in cmd_lines]
write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
#dump_cmd_lines(write_cmd_lines)
log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
os.makedirs(log_folder, exist_ok=True)
perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC,
log_dir=log_folder,
cmd_lines=write_cmd_lines)
perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines)
launch_sweep(sweep_jobs=perf_jobs,
sync_job=sync_job,
flush_cache_job=flush_cache_job)
launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)
remove_folder(write_folder)
......@@ -376,10 +331,7 @@ def main():
cmd_lines = get_sweep_cmd_lines(sweep_config.search_space)
if sweep_config.flush_cache:
flush_cache_job = Job(
cmd_line=['sudo',
'bash -c',
"'echo 1 > /proc/sys/vm/drop_caches'"])
flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"])
else:
flush_cache_job = None
......
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
......@@ -20,14 +21,8 @@ def pre_basic(args, tid, read_op):
file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
buffer = get_accelerator().pin_memory(
torch.empty(num_bytes,
dtype=torch.uint8,
device='cpu'))
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))
task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
ctxt = {}
ctxt['file'] = file
......@@ -60,13 +55,8 @@ def post_basic(pool_params):
def main_basic_read(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_read(ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
args.single_submit, args.overlap_events, args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
......@@ -76,13 +66,8 @@ def main_basic_read(pool_params):
def main_basic_write(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_write(ctxt['buffer'],
ctxt['file'],
args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
args.validate)
AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
args.single_submit, args.overlap_events, args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
......
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
......@@ -20,27 +21,17 @@ def pre_handle(args, tid, read_op):
file = args.read_file if read_op else f'{args.write_file}.{tid}'
io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
io_parallel)
handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
args.overlap_events, io_parallel)
task_log(tid, f'Created deepspeed aio handle')
if args.gpu:
buffer = torch.empty(num_bytes,
dtype=torch.uint8,
device=get_accelerator().device_name())
buffer = torch.empty(num_bytes, dtype=torch.uint8, device=get_accelerator().device_name())
else:
if args.use_accelerator_pin_memory:
buffer = get_accelerator().pin_memory(
torch.empty(num_bytes,
dtype=torch.uint8,
device='cpu'))
buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))
else:
buffer = handle.new_cpu_locked_tensor(num_bytes,
torch.empty(0,
dtype=torch.uint8))
buffer = handle.new_cpu_locked_tensor(num_bytes, torch.empty(0, dtype=torch.uint8))
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
......@@ -51,10 +42,7 @@ def pre_handle(args, tid, read_op):
ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
return ctxt
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment