Fix AdaptDL Integration (#3153)

dbb2434f · hao-howard-zhang · GitHub · 765206cb · dbb2434f · dbb2434f
Unverified Commit dbb2434f authored Dec 08, 2020 by hao-howard-zhang Committed by GitHub Dec 09, 2020
9 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -81,6 +81,7 @@ typings/
 __pycache__
 build
 *.egg-info
+.eggs/
 setup.pye
 **/__init__.pye
 **/.ipynb_checkpoints

--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
 **NNI (Neural Network Intelligence)** is a lightweight but powerful toolkit to help users **automate** <a href="docs/en_US/FeatureEngineering/Overview.md">Feature Engineering</a>, <a href="docs/en_US/NAS/Overview.md">Neural Architecture Search</a>, <a href="docs/en_US/Tuner/BuiltinTuner.md">Hyperparameter Tuning</a> and <a href="docs/en_US/Compression/Overview.md">Model Compression</a>.
-The tool manages automated machine learning (AutoML) experiments, **dispatches and runs** experiments' trial jobs generated by tuning algorithms to search the best neural architecture and/or hyper-parameters in **different training environments** like <a href="docs/en_US/TrainingService/LocalMode.md">Local Machine</a>, <a href="docs/en_US/TrainingService/RemoteMachineMode.md">Remote Servers</a>, <a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a>, <a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a>, <a href="docs/en_US/TrainingService/FrameworkControllerMode.md">FrameworkController on K8S (AKS etc.)</a>, <a href="docs/en_US/TrainingService/DLTSMode.md">DLWorkspace (aka. DLTS)</a>, <a href="docs/en_US/TrainingService/AMLMode.md">AML (Azure Machine Learning)</a> and other cloud options.
+The tool manages automated machine learning (AutoML) experiments, **dispatches and runs** experiments' trial jobs generated by tuning algorithms to search the best neural architecture and/or hyper-parameters in **different training environments** like <a href="docs/en_US/TrainingService/LocalMode.md">Local Machine</a>, <a href="docs/en_US/TrainingService/RemoteMachineMode.md">Remote Servers</a>, <a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a>, <a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a>, <a href="docs/en_US/TrainingService/FrameworkControllerMode.md">FrameworkController on K8S (AKS etc.)</a>, <a href="docs/en_US/TrainingService/DLTSMode.md">DLWorkspace (aka. DLTS)</a>, <a href="docs/en_US/TrainingService/AMLMode.md">AML (Azure Machine Learning)</a>, <a href="docs/en_US/TrainingService/AdaptDLMode.md">AdaptDL (aka. ADL)</a> and other cloud options.
 ## **Who should consider using NNI**
@@ -173,11 +173,13 @@ Within the following table, we summarized the current NNI capabilities, we are g
        <li><a href="docs/en_US/TrainingService/RemoteMachineMode.md">Remote Servers</a></li>
        <li><a href="docs/en_US/TrainingService/AMLMode.md">AML(Azure Machine Learning)</a></li>
        <li><b>Kubernetes based services</b></li>
-            <ul><li><a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a></li>
+        <ul>
-            <li><a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a></li>
+          <li><a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a></li>
-            <li><a href="docs/en_US/TrainingService/FrameworkControllerMode.md">FrameworkController on K8S (AKS etc.)</a></li>
+          <li><a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a></li>
-            </ul>
+          <li><a href="docs/en_US/TrainingService/FrameworkControllerMode.md">FrameworkController on K8S (AKS etc.)</a></li>
-            <ul><li><a href="docs/en_US/TrainingService/DLTSMode.md">DLWorkspace (aka. DLTS)</a></li>
+          <li><a href="docs/en_US/TrainingService/DLTSMode.md">DLWorkspace (aka. DLTS)</a></li>
+          <li><a href="docs/en_US/TrainingService/AdaptDLMode.md">AdaptDL (aka. ADL)</a></li>
+        </ul>
      </ul>
      </td>
    </tr>

--- a/docs/en_US/TrainingService/AdaptDLMode.md
+++ b/docs/en_US/TrainingService/AdaptDLMode.md
@@ -66,7 +66,7 @@ trial:
    path: /
    containerMountPath: /nfs
  checkpoint: # optional
-    storageClass: microk8s-hostpath
+    storageClass: dfs
    storageSize: 1Gi
 ```
@@ -79,18 +79,21 @@ IP address of the machine with NNI manager (NNICTL) that launches NNI experiment
 * **logCollection**: *Recommended* to set as `http`. It will collect the trial logs on cluster back to your machine via http.
 * **tuner**: It supports the Tuun tuner and all NNI built-in tuners (only except for the checkpoint feature of the NNI PBT tuners).
 * **trial**: It defines the specs of an `adl` trial.
-    * **adaptive**: (*Optional*) Boolean for AdaptDL trainer. While `true`, it the job is preemptible and adaptive.
+  * **adaptive**: (*Optional*) Boolean for AdaptDL trainer. When `true`, the job is preemptible and adaptive.
-    * **image**: Docker image for the trial
+  * **image**: Docker image for the trial
-    * **imagePullSecret**: (*Optional*) If you are using a private registry,
+  * **imagePullSecret**: (*Optional*) If you are using a private registry,
-    you need to provide the secret to successfully pull the image.
+  you need to provide the secret to successfully pull the image.
-    * **codeDir**: the working directory of the container. `.` means the default working directory defined by the image.
+  * **codeDir**: the working directory of the container. `.` means the default working directory defined by the image.
-    * **command**: the bash command to start the trial
+  * **command**: the bash command to start the trial
-    * **gpuNum**: the number of GPUs requested for this trial. It must be non-negative integer.
+  * **gpuNum**: the number of GPUs requested for this trial. It must be a non-negative integer.
-    * **cpuNum**: (*Optional*) the number of CPUs requested for this trial.  It must be non-negative integer.
+  * **cpuNum**: (*Optional*) the number of CPUs requested for this trial. It must be a non-negative integer.
-    * **memorySize**: (*Optional*) the size of memory requested for this trial. It must follow the Kubernetes
+  * **memorySize**: (*Optional*) the size of memory requested for this trial. It must follow the Kubernetes
-    [default format](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory).
+  [default format](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory).
-    * **nfs**: (*Optional*) mounting external storage. For more information about using NFS please check the below paragraph.
+  * **nfs**: (*Optional*) mounting external storage. For more information about using NFS, please see the section below.
-    * **checkpoint** (*Optional*) [storage settings](https://kubernetes.io/docs/concepts/storage/storage-classes/) for AdaptDL internal checkpoints. You can keep it optional if you are not dev users.
+  * **checkpoint**: (*Optional*) storage settings for model checkpoints.
+    * **storageClass**: check [Kubernetes storage documentation](https://kubernetes.io/docs/concepts/storage/storage-classes/) for how to use the appropriate `storageClass`.
+    * **storageSize**: this value should be large enough to fit your model's checkpoints; otherwise, it could cause a "disk quota exceeded" error.
 ### NFS Storage

--- a/examples/trials/cifar10_pytorch/config_adl.yml
+++ b/examples/trials/cifar10_pytorch/config_adl.yml
@@ -17,10 +17,13 @@ tuner:
    #choice: maximize, minimize
    optimize_mode: maximize
 trial:
-  command: python3 main_adl.py
+  command: python3 /cifar10/main_adl.py
-  codeDir: .
+  codeDir: /cifar10
  gpuNum: 1
  image: {replace_with_the_image_that_has_adaptdl_installed}
+  # optional
+  imagePullSecrets:
+    - name: {secret}
  adaptive: true
  checkpoint:
    storageClass: dfs

--- a/examples/trials/cifar10_pytorch/main_adl.py
+++ b/examples/trials/cifar10_pytorch/main_adl.py
@@ -146,7 +146,7 @@ def valid(epoch):
        writer.add_scalar("Accuracy/Valid", stats["accuracy"], epoch)
        if adaptdl.env.replica_rank() == 0:
-            nni.report_intermediate_result(stats["accuracy"], accum=stats)
+            nni.report_intermediate_result(stats["accuracy"])
        print("Valid:", stats)
        return stats["accuracy"]

--- a/nni/tools/nnictl/launcher_utils.py
+++ b/nni/tools/nnictl/launcher_utils.py
@@ -63,14 +63,16 @@ def parse_path(experiment_config, config_path):
    if experiment_config['trial'].get('paiConfigPath'):
        expand_path(experiment_config['trial'], 'paiConfigPath')
-    #if users use relative path, convert it to absolute path
+    # If users use relative path, convert it to absolute path.
    root_path = os.path.dirname(config_path)
    if experiment_config.get('searchSpacePath'):
        parse_relative_path(root_path, experiment_config, 'searchSpacePath')
    if experiment_config.get('logDir'):
        parse_relative_path(root_path, experiment_config, 'logDir')
    if experiment_config.get('trial'):
-        parse_relative_path(root_path, experiment_config['trial'], 'codeDir')
+        # In AdaptDL mode, 'codeDir' shouldn't be parsed because it points to the path in the container.
+        if experiment_config.get('trainingServicePlatform') != 'adl':
+            parse_relative_path(root_path, experiment_config['trial'], 'codeDir')
        if experiment_config['trial'].get('authFile'):
            parse_relative_path(root_path, experiment_config['trial'], 'authFile')
        if experiment_config['trial'].get('ps'):

--- a/nni/tools/nnictl/tensorboard_utils.py
+++ b/nni/tools/nnictl/tensorboard_utils.py
@@ -134,7 +134,6 @@ def start_tensorboard(args):
    if experiment_dict[args.id]["status"] == "STOPPED":
        print_error("Experiment {} is stopped...".format(args.id))
        return
-    config_file_name = experiment_dict[experiment_id]['fileName']
    nni_config = Config(args.id)
    if nni_config.get_config('experimentConfig').get('trainingServicePlatform') == 'adl':
        adl_tensorboard_helper(args)

--- a/ts/nni_manager/training_service/kubernetes/adl/adlTrainingService.ts
+++ b/ts/nni_manager/training_service/kubernetes/adl/adlTrainingService.ts
@@ -214,10 +214,10 @@ class AdlTrainingService extends KubernetesTrainingService implements Kubernetes
            trialJobId, form, codeDir, outputDir)
        const cleanupScriptTemplate: string =
 `#!/bin/bash
-ps aux | grep "python3 -m nni_trial_tool.trial_keeper" | awk '{print $2}' | xargs kill -2
+ps aux | grep "python3 -m nni.tools.trial_tool.trial_keeper" | awk '{print $2}' | xargs kill -2
 while true;
 do
-    proc=\`ps aux | grep "python3 -m nni_trial_tool.trial_keeper" | awk '{print $2}' | grep "" -c\`
+    proc=\`ps aux | grep "python3 -m nni.tools.trial_tool.trial_keeper" | awk '{print $2}' | grep "" -c\`
    if (( $proc == 1  )); then
        exit 0
    else
@@ -281,7 +281,7 @@ export NNI_TRIAL_SEQ_ID={4}
 mkdir -p $NNI_OUTPUT_DIR
 {5}
 echo '{6}' > $NNI_CODE_DIR/{7}
-python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' \
+python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{8}' \
 --nnimanager_ip {9} --nnimanager_port {10} \
 --nni_manager_version '{11}' --log_collection '{12}'
 `;

--- a/ts/nni_manager/training_service/pai/paiYarn/paiYarnData.ts
+++ b/ts/nni_manager/training_service/pai/paiYarn/paiYarnData.ts
@@ -16,7 +16,7 @@ fi`;
 export const PAI_TRIAL_COMMAND_FORMAT: string =
 `export NNI_PLATFORM=paiYarn NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
 && cd $NNI_SYS_DIR && sh install_nni.sh \
-&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \
+&& python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \
 --pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \
 --nni_manager_version '{13}' --log_collection '{14}'`;