Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm. See merge request dcutoolkit/deeplearing/deepspeed!2

Merge branch 'ds-v0.9.2-rocm' into 'main'
Ds v0.9.2 rocm. See merge request dcutoolkit/deeplearing/deepspeed!2
c25a91b6 · aiss · d1596c94 · af82b300 · c25a91b6 · c25a91b6
Commit c25a91b6 authored May 30, 2023 by aiss
20 changed files
--- a/csrc/aio/py_test/parse_aio_stats.py
+++ b/csrc/aio/py_test/parse_aio_stats.py
-"""
+# Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+# DeepSpeed Team
+"""
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
@@ -19,10 +20,7 @@ METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'}
 def parse_arguments():
    parser = argparse.ArgumentParser()
-    parser.add_argument('--log_dir',
+    parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs')
-                        type=str,
-                        required=True,
-                        help='Folder of statistics logs')
    parser.add_argument('--metric',
                        type=str,
@@ -125,10 +123,7 @@ def get_results(log_files, metric):
 def get_sorted_results(log_dir, metric):
-    log_files = [
+    log_files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))]
-        f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir,
-                                                                      f))
-    ]
    log_files_path = [os.path.join(log_dir, f) for f in log_files]
    results = get_results(log_files_path, metric)

--- a/csrc/aio/py_test/perf_sweep_utils.py
+++ b/csrc/aio/py_test/perf_sweep_utils.py
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
 SCRIPT_PREFIX = '_aio_bench'
 WRITE_OP_DESC = 'write'

--- a/csrc/aio/py_test/test_ds_aio.py
+++ b/csrc/aio/py_test/test_ds_aio.py
-"""
+# Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+# DeepSpeed Team
+"""
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
@@ -20,46 +21,29 @@ def parse_arguments():
    parser.add_argument('--write_file', type=str, default=None, help='Write file.')
-    parser.add_argument('--write_size',
+    parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.')
-                        type=str,
-                        default=None,
-                        help='Number of bytes to write.')
    parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.')
    parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')
-    parser.add_argument('--threads',
+    parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.')
-                        type=int,
-                        default=1,
-                        help='Thread parallelism count.')
-    parser.add_argument(
+    parser.add_argument('--single_submit',
-        '--single_submit',
+                        action='store_true',
-        action='store_true',
+                        help='Submit I/O requests in singles (default is submit queue_depth amount at once.).')
-        help=
-        'Submit I/O requests in singles (default is submit queue_depth amount at once.).'
-    )
    parser.add_argument('--overlap_events',
                        action='store_true',
                        help='Overlap I/O submission and completion requests.')
-    parser.add_argument('--validate',
+    parser.add_argument('--validate', action='store_true', help='Perform validation in library.')
-                        action='store_true',
-                        help='Perform validation in library.')
    parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
-    parser.add_argument('--loops',
+    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
-                        type=int,
-                        default=1,
-                        help='Count of operation repetitions')
-    parser.add_argument('--io_parallel',
+    parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
-                        type=int,
-                        default=None,
-                        help='Per iop parallelism')
    parser.add_argument('--gpu', action='store_true', help='Use GPU memory')

--- a/csrc/aio/py_test/test_ds_aio_utils.py
+++ b/csrc/aio/py_test/test_ds_aio_utils.py
-"""
+# Copyright (c) Microsoft Corporation.
-Copyright 2020 The Microsoft DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+# DeepSpeed Team
+"""
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """

--- a/csrc/aio/py_test/validate_async_io.py
+++ b/csrc/aio/py_test/validate_async_io.py
-"""
+# Copyright (c) Microsoft Corporation.
-Copyright 2021 The Microsoft DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
-Licensed under the MIT license.
+# DeepSpeed Team
+"""
 Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
 """
 from deepspeed.ops.op_builder import AsyncIOBuilder

--- a/csrc/common/custom_cuda_kernel.cu
+++ b/csrc/common/custom_cuda_kernel.cu
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #ifdef __HIPCC__
 #include "custom_hip_layers.h"
 #else
 #include "custom_cuda_layers.h"
 #endif
 __global__ void param_update_kernel(const float* input, __half* output, int size)
 {
    int id = blockIdx.x * blockDim.x + threadIdx.x;

--- a/csrc/includes/StopWatch.h
+++ b/csrc/includes/StopWatch.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once
 #ifdef _WIN32

--- a/csrc/includes/Timer.h
+++ b/csrc/includes/Timer.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #ifndef __TIMER_H__
 #define __TIMER_H__

--- a/csrc/includes/compat.h
+++ b/csrc/includes/compat.h
-/* Copyright 2020 The Microsoft DeepSpeed Team
+// Copyright (c) Microsoft Corporation.
-   Copyright NVIDIA/apex
+// SPDX-License-Identifier: Apache-2.0
-   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+// DeepSpeed Team
+/*
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
 */
 #ifndef TORCH_CHECK

--- a/csrc/includes/context.h
+++ b/csrc/includes/context.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once
@@ -43,9 +44,9 @@ inline int DS_GET_BLOCKS(const int N)
        1);
 }
-class Context {
+class TrainingContext {
 public:
-    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
+    TrainingContext() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(_gen, 123);
@@ -56,15 +57,15 @@ public:
        }
    }
-    virtual ~Context()
+    virtual ~TrainingContext()
    {
        cublasDestroy(_cublasHandle);
        cudaFree(_workspace);
    }
-    static Context& Instance()
+    static TrainingContext& Instance()
    {
-        static Context _ctx;
+        static TrainingContext _ctx;
        return _ctx;
    }

--- a/csrc/includes/conversion_utils.h
+++ b/csrc/includes/conversion_utils.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2022 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once
@@ -262,12 +263,16 @@ DS_D_INLINE float2 to(__nv_bfloat162 val)
 #endif
 /*********************  To Half Conversions *********************/
-//aiss
+template <>
-//template <>
+DS_D_INLINE __half to(double val)
-//DS_D_INLINE __half to(double val)
+{
-//{
+#ifdef __HIP_PLATFORM_HCC__
-//    return __double2half(val);
+    float val_f = __double2float_rn(val);
-//}
+    return __float2half(val_f);
+#else
+    return __double2half(val);
+#endif
+}
 template <>
 DS_D_INLINE __half to(float val)
 {
@@ -329,6 +334,11 @@ DS_D_INLINE __half2 to(float2 val)
 {
    return __float22half2_rn(val);
 }
+template <>
+DS_D_INLINE __half2 to(float val)
+{
+    return __float2half2_rn(val);
+}
 #ifdef BF16_AVAILABLE
 // No direct conversion
@@ -401,6 +411,11 @@ DS_D_INLINE __nv_bfloat162 to(float2 val)
    return __float22bfloat162_rn(val);
 }
 template <>
+DS_D_INLINE __nv_bfloat162 to(float val)
+{
+    return __float2bfloat162_rn(val);
+}
+template <>
 DS_D_INLINE __nv_bfloat162 to(__half2 val)
 {
    return to<__nv_bfloat162>(to<float2>(val));

--- a/csrc/includes/cpu_adagrad.h
+++ b/csrc/includes/cpu_adagrad.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once
@@ -38,8 +39,8 @@ public:
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
-        _streams[0] = Context::Instance().GetCurrentStream();
+        _streams[0] = TrainingContext::Instance().GetCurrentStream();
-        _streams[1] = Context::Instance().GetNewStream();
+        _streams[1] = TrainingContext::Instance().GetNewStream();
        _buf_index = false;
 #endif
    }

--- a/csrc/includes/cpu_adam.h
+++ b/csrc/includes/cpu_adam.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once
@@ -53,8 +54,8 @@ public:
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
-        _streams[0] = Context::Instance().GetCurrentStream();
+        _streams[0] = TrainingContext::Instance().GetCurrentStream();
-        _streams[1] = Context::Instance().GetNewStream();
+        _streams[1] = TrainingContext::Instance().GetNewStream();
        _buf_index = false;
 #endif
    }

--- a/csrc/includes/cublas_wrappers.h
+++ b/csrc/includes/cublas_wrappers.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once

--- a/csrc/includes/custom_cuda_layers.h
+++ b/csrc/includes/custom_cuda_layers.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2022 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once

--- a/csrc/includes/dequantization_utils.h
+++ b/csrc/includes/dequantization_utils.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2022 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #include "conversion_utils.h"
 #include "ds_kernel_utils.h"

--- a/csrc/includes/dropout.h
+++ b/csrc/includes/dropout.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once

--- a/csrc/includes/ds_kernel_utils.h
+++ b/csrc/includes/ds_kernel_utils.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright 2022 The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
+// DeepSpeed Team
+/*
 Centralized header file for preprocessor macros and constants
 used throughout the codebase.
 */

--- a/csrc/includes/ds_transformer_cuda.h
+++ b/csrc/includes/ds_transformer_cuda.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #pragma once

--- a/csrc/includes/feed_forward.h
+++ b/csrc/includes/feed_forward.h
-/*
+// Copyright (c) Microsoft Corporation.
-Copyright The Microsoft DeepSpeed Team
+// SPDX-License-Identifier: Apache-2.0
-*/
+// DeepSpeed Team
 #ifndef __FEEDFORWARD_H__
 #define __FEEDFORWARD_H__