Unverified Commit 96f5ccea authored by Yifan Xiong's avatar Yifan Xiong Committed by GitHub
Browse files

CI/CD - Upgrade dependency versions in pipeline (#671)



Upgrade dependency versions in Azure pipeline:

* Remove Python 3.6 and add Python 3.10 for cpu-unit-test
* Upgrade CUDA from 11.1 to 12.4 for cuda-unit-test
* Update labels accordingly

---------
Co-authored-by: Dilip Patlolla <dilipreddi@gmail.com>
parent 7cef624e
......@@ -7,6 +7,7 @@ trigger:
pool:
name: SuperBench CI
demands: ansible-agent
vmImage: ubuntu-latest
container:
......
......@@ -7,12 +7,12 @@ trigger:
strategy:
matrix:
python-3.6:
imageTag: '3.6'
python-3.7:
imageTag: '3.7'
python-3.8:
imageTag: '3.8'
python-3.10:
imageTag: '3.10'
# TODO
#python-latest:
# imageTag: '3'
......
......@@ -7,22 +7,26 @@ trigger:
pool:
name: SuperBench CI
demands: cuda-agent
vmImage: ubuntu-latest
container:
image: nvcr.io/nvidia/pytorch:20.12-py3
options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/'
image: nvcr.io/nvidia/pytorch:24.03-py3
options: '--name cuda-ci -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker:ro'
steps:
- script: |
echo "##vso[task.prependpath]$HOME/.local/bin"
displayName: Export path
- script: |
docker exec -t -u root -e DEBIAN_FRONTEND=noninteractive cuda-ci bash -c \
"apt-get update -y -q && \
yes '' | apt-get install -y -q sudo && \
apt-get install -y -q \
ffmpeg libavcodec-dev libavformat-dev libavutil-dev libboost-program-options-dev libswresample-dev"
python3 -m pip install --upgrade pip setuptools==65.7
python3 -m pip install .[test,nvworker]
make postinstall
sudo DEBIAN_FRONTEND=noninteractive apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev
displayName: Install dependencies
- script: |
python3 setup.py lint
......
......@@ -14,8 +14,9 @@ coverage:
target: 80%
threshold: 1%
flags:
- cpu-python3.6-unit-test
- cpu-python3.7-unit-test
- cpu-python3.8-unit-test
- cpu-python3.10-unit-test
- cuda-unit-test
- directx-unit-test
patch:
......@@ -23,7 +24,8 @@ coverage:
target: 80%
threshold: 1%
flags:
- cpu-python3.6-unit-test
- cpu-python3.7-unit-test
- cpu-python3.8-unit-test
- cpu-python3.10-unit-test
- cuda-unit-test
- directx-unit-test
......@@ -26,7 +26,7 @@ Here're the system requirements for control node.
### Requirements
* A recent version of Linux; you are highly encouraged to use Ubuntu 18.04 or later.
* [Python](https://www.python.org/) version 3.6 or later (which can be checked by running `python3 --version`).
* [Python](https://www.python.org/) version 3.7 or later (which can be checked by running `python3 --version`).
* [Pip](https://pip.pypa.io/en/stable/installing/) version 18.0 or later (which can be checked by running `python3 -m pip --version`).
:::note
......
......@@ -131,17 +131,17 @@ def run(self):
'Operating System :: POSIX',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Topic :: System :: Benchmark',
'Topic :: System :: Clustering',
'Topic :: System :: Hardware',
],
keywords='benchmark, AI systems',
packages=find_packages(exclude=['tests']),
python_requires='>=3.6, <4',
python_requires='>=3.7, <4',
use_scm_version={
'local_scheme': 'node-and-date',
'version_scheme': lambda _: superbench.__version__,
......
......@@ -48,6 +48,8 @@ def __init__(self, name, parameters=''):
allow_abbrev=False,
formatter_class=SortedMetavarTypeHelpFormatter,
)
# Fix optionals title in Python 3.10
self._parser._optionals.title = 'optional arguments'
self._args = None
self._curr_run_index = 0
self._result = None
......
......@@ -4,7 +4,7 @@
"""Tests for SummaryOp module."""
import unittest
from numpy import NaN, float64
from numpy import nan, float64
import pandas as pd
......@@ -55,7 +55,7 @@ def test_rule_op(self):
# Test - std
result = SummaryOp.std(raw_data_df)
print(result)
expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, NaN], index=['a', 'b', 'c', 'd'], dtype=float64)
expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, nan], index=['a', 'b', 'c', 'd'], dtype=float64)
pd.testing.assert_series_equal(result, expectedResult)
# Test - count
result = SummaryOp.count(raw_data_df)
......
......@@ -250,16 +250,35 @@ def test_pytorch_empty_cache():
# Register mnist benchmark.
BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)
# Get initial memory reserved
init_res_memory = torch.cuda.memory_reserved()
# Test cache empty by manually calling torch.cuda.empty_cache().
parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train'
benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
assert (benchmark)
assert (benchmark._preprocess())
assert (benchmark._benchmark())
del benchmark
assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] > 0)
# Get current reserved memory after benchmark
post_bm_res_memory = torch.cuda.memory_reserved()
# Assert that memory is increased after benchmark
assert (post_bm_res_memory >= init_res_memory)
# Manually empty cache and get reserved memory
# Calling empty_cache() releases all unused cached memory from PyTorch so that those can be used by
# other GPU applications. However, the occupied GPU memory by tensors will not be freed so it can not
# increase the amount of GPU memory available for PyTorch.
# https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management
torch.cuda.empty_cache()
assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)
post_empty_cache_res_memory = torch.cuda.memory_reserved()
# Assert that some memory is released after manually empty cache. The cache is not guaranteed to be reset
# back to the init_res_memory due to some tensors not being released.
assert (post_empty_cache_res_memory <= post_bm_res_memory)
# Test automatic cache empty.
context = BenchmarkRegistry.create_benchmark_context(
......@@ -268,4 +287,4 @@ def test_pytorch_empty_cache():
benchmark = BenchmarkRegistry.launch_benchmark(context)
assert (benchmark)
assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)
assert (torch.cuda.memory_reserved() == post_empty_cache_res_memory)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment