"src/include/base.hpp" did not exist on "8a4b59785b4f5ba48468d53618ca270c5da599a7"
Commit eaf42120 authored by suiguoxin's avatar suiguoxin
Browse files

squash commits in v1.0 first round bug bash

parent f721b431
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
* * * * * *
[![MIT 许可证](https://img.shields.io/badge/license-MIT-brightgreen.svg)](LICENSE) [![生成状态](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/Microsoft.nni)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=6) [![问题](https://img.shields.io/github/issues-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen) [![Bug](https://img.shields.io/github/issues/Microsoft/nni/bug.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen+label%3Abug) [![拉取请求](https://img.shields.io/github/issues-pr-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/pulls?q=is%3Apr+is%3Aopen) [![版本](https://img.shields.io/github/release/Microsoft/nni.svg)](https://github.com/Microsoft/nni/releases) [![进入 https://gitter.im/Microsoft/nni 聊天室提问](https://badges.gitter.im/Microsoft/nni.svg)](https://gitter.im/Microsoft/nni?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![文档状态](https://readthedocs.org/projects/nni/badge/?version=latest)](https://nni.readthedocs.io/en/latest/?badge=latest) [![MIT 许可证](https://img.shields.io/badge/license-MIT-brightgreen.svg)](LICENSE) [![生成状态](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/Microsoft.nni)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=6) [![问题](https://img.shields.io/github/issues-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen) [![Bug](https://img.shields.io/github/issues/Microsoft/nni/bug.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen+label%3Abug) [![拉取请求](https://img.shields.io/github/issues-pr-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/pulls?q=is%3Apr+is%3Aopen) [![版本](https://img.shields.io/github/release/Microsoft/nni.svg)](https://github.com/Microsoft/nni/releases) [![进入 https://gitter.im/Microsoft/nni 
聊天室提问](https://badges.gitter.im/Microsoft/nni.svg)](https://gitter.im/Microsoft/nni?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![文档状态](https://readthedocs.org/projects/nni/badge/?version=latest)](https://nni.readthedocs.io/zh/latest/?badge=latest)
[English](README.md) [English](README.md)
...@@ -47,40 +47,40 @@ NNI (Neural Network Intelligence) 是自动机器学习(AutoML)的工具包 ...@@ -47,40 +47,40 @@ NNI (Neural Network Intelligence) 是自动机器学习(AutoML)的工具包
</ul> </ul>
</td> </td>
<td align="left"> <td align="left">
<a href="docs/en_US/Tuner/BuiltinTuner.md">Tuner(调参器)</a> <a href="docs/zh_CN/Tuner/BuiltinTuner.md">Tuner(调参器)</a>
<br /> <br />
<ul> <ul>
<b style="margin-left:-20px">通用 Tuner</b> <b style="margin-left:-20px">通用 Tuner</b>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Random">Random Search(随机搜索)</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Random">Random Search(随机搜索)</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Evolution">Naïve Evolution(进化算法)</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Evolution">Naïve Evolution(进化算法)</a></li>
<b style="margin-left:-20px">超参 Tuner</b> <b style="margin-left:-20px">超参 Tuner</b>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#TPE">TPE</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#TPE">TPE</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Anneal">Anneal(退火算法)</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Anneal">Anneal(退火算法)</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#SMAC">SMAC</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#SMAC">SMAC</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Batch">Batch(批处理)</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Batch">Batch(批处理)</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#GridSearch">Grid Search(遍历搜索)</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#GridSearch">Grid Search(遍历搜索)</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Hyperband">Hyperband</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Hyperband">Hyperband</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#MetisTuner">Metis Tuner</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#MetisTuner">Metis Tuner</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#BOHB">BOHB</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#BOHB">BOHB</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#GPTuner">GP Tuner</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#GPTuner">GP Tuner</a></li>
<b style="margin-left:-20px">网络结构 Tuner</b> <b style="margin-left:-20px">网络结构 Tuner</b>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#NetworkMorphism">Network Morphism</a></li> <li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#NetworkMorphism">Network Morphism</a></li>
<li><a href="examples/tuners/enas_nni/README.md">ENAS</a></li> <li><a href="examples/tuners/enas_nni/README.md">ENAS</a></li>
</ul> </ul>
<a href="docs/en_US/Assessor/BuiltinAssessor.md">Assessor(评估器)</a> <a href="docs/zh_CN/Assessor/BuiltinAssessor.md">Assessor(评估器)</a>
<ul> <ul>
<li><a href="docs/en_US/Assessor/BuiltinAssessor.md#Medianstop">Median Stop(中位数终止)</a></li> <li><a href="docs/zh_CN/Assessor/BuiltinAssessor.md#Medianstop">Median Stop(中位数终止)</a></li>
<li><a href="docs/en_US/Assessor/BuiltinAssessor.md#Curvefitting">Curve Fitting(曲线拟合)</a></li> <li><a href="docs/zh_CN/Assessor/BuiltinAssessor.md#Curvefitting">Curve Fitting(曲线拟合)</a></li>
</ul> </ul>
</td> </td>
<td> <td>
<ul> <ul>
<li><a href="docs/en_US/TrainingService/LocalMode.md">本机</a></li> <li><a href="docs/zh_CN/TrainingService/LocalMode.md">本机</a></li>
<li><a href="docs/en_US/TrainingService/RemoteMachineMode.md">远程计算机</a></li> <li><a href="docs/zh_CN/TrainingService/RemoteMachineMode.md">远程计算机</a></li>
<li><b>基于 Kubernetes 的平台</b></li> <li><b>基于 Kubernetes 的平台</b></li>
<ul><li><a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a></li> <ul><li><a href="docs/zh_CN/TrainingService/PaiMode.md">OpenPAI</a></li>
<li><a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a></li> <li><a href="docs/zh_CN/TrainingService/KubeflowMode.md">Kubeflow</a></li>
<li><a href="docs/en_US/TrainingService/FrameworkControllerMode.md">基于 Kubernetes(AKS 等)的 FrameworkController</a></li> <li><a href="docs/zh_CN/TrainingService/FrameworkControllerMode.md">基于 Kubernetes(AKS 等)的 FrameworkController</a></li>
</ul> </ul>
</ul> </ul>
</td> </td>
...@@ -226,34 +226,34 @@ You can use these commands to get more information about the experiment ...@@ -226,34 +226,34 @@ You can use these commands to get more information about the experiment
点击阅读: 点击阅读:
* [NNI 概述](docs/zh_CN/Overview.md) * [NNI 概述](docs/zh_CN/Overview.md)
* [快速入门](docs/en_US/Tutorial/QuickStart.md) * [快速入门](docs/zh_CN/Tutorial/QuickStart.md)
* [贡献](docs/en_US/Tutorial/Contributing.md) * [贡献](docs/zh_CN/Tutorial/Contributing.md)
* [示例](docs/en_US/examples.rst) * [示例](docs/zh_CN/examples.rst)
* [参考](docs/en_US/reference.rst) * [参考](docs/zh_CN/reference.rst)
* [Web 界面教程](docs/en_US/Tutorial/WebUI.md) * [Web 界面教程](docs/zh_CN/Tutorial/WebUI.md)
## **入门** ## **入门**
* [安装 NNI](docs/en_US/Tutorial/Installation.md) * [安装 NNI](docs/zh_CN/Tutorial/Installation.md)
* [使用命令行工具 nnictl](docs/en_US/Tutorial/Nnictl.md) * [使用命令行工具 nnictl](docs/zh_CN/Tutorial/Nnictl.md)
* [使用 NNIBoard](docs/en_US/Tutorial/WebUI.md) * [使用 NNIBoard](docs/zh_CN/Tutorial/WebUI.md)
* [如何定义搜索空间](docs/en_US/Tutorial/SearchSpaceSpec.md) * [如何定义搜索空间](docs/zh_CN/Tutorial/SearchSpaceSpec.md)
* [如何实现 Trial 代码](docs/en_US/TrialExample/Trials.md) * [如何实现 Trial 代码](docs/zh_CN/TrialExample/Trials.md)
* [如何选择 Tuner、搜索算法](docs/en_US/Tuner/BuiltinTuner.md) * [如何选择 Tuner、搜索算法](docs/zh_CN/Tuner/BuiltinTuner.md)
* [配置 Experiment](docs/en_US/Tutorial/ExperimentConfig.md) * [配置 Experiment](docs/zh_CN/Tutorial/ExperimentConfig.md)
* [如何使用 Annotation](docs/en_US/TrialExample/Trials.md#nni-python-annotation) * [如何使用 Annotation](docs/zh_CN/TrialExample/Trials.md#nni-python-annotation)
## **教程** ## **教程**
* [在 OpenPAI 上运行 Experiment](docs/en_US/TrainingService/PaiMode.md) * [在 OpenPAI 上运行 Experiment](docs/zh_CN/TrainingService/PaiMode.md)
* [在 Kubeflow 上运行 Experiment](docs/en_US/TrainingService/KubeflowMode.md) * [在 Kubeflow 上运行 Experiment](docs/zh_CN/TrainingService/KubeflowMode.md)
* [在本机运行 Experiment (支持多 GPU 卡)](docs/en_US/TrainingService/LocalMode.md) * [在本机运行 Experiment (支持多 GPU 卡)](docs/zh_CN/TrainingService/LocalMode.md)
* [在多机上运行 Experiment](docs/en_US/TrainingService/RemoteMachineMode.md) * [在多机上运行 Experiment](docs/zh_CN/TrainingService/RemoteMachineMode.md)
* [尝试不同的 Tuner](docs/en_US/Tuner/BuiltinTuner.md) * [尝试不同的 Tuner](docs/zh_CN/Tuner/BuiltinTuner.md)
* [尝试不同的 Assessor](docs/en_US/Assessor/BuiltinAssessor.md) * [尝试不同的 Assessor](docs/zh_CN/Assessor/BuiltinAssessor.md)
* [实现自定义 Tuner](docs/en_US/Tuner/CustomizeTuner.md) * [实现自定义 Tuner](docs/zh_CN/Tuner/CustomizeTuner.md)
* [实现自定义 Assessor](docs/en_US/Assessor/CustomizeAssessor.md) * [实现自定义 Assessor](docs/zh_CN/Assessor/CustomizeAssessor.md)
* [使用进化算法为阅读理解任务找到好模型](docs/en_US/TrialExample/SquadEvolutionExamples.md) * [使用进化算法为阅读理解任务找到好模型](docs/zh_CN/TrialExample/SquadEvolutionExamples.md)
## **贡献** ## **贡献**
...@@ -265,10 +265,10 @@ You can use these commands to get more information about the experiment ...@@ -265,10 +265,10 @@ You can use these commands to get more information about the experiment
在提交代码前,需要遵循以下的简单准则: 在提交代码前,需要遵循以下的简单准则:
* [如何调试](docs/en_US/Tutorial/HowToDebug.md) * [如何调试](docs/zh_CN/Tutorial/HowToDebug.md)
* [代码风格和命名约定](docs/en_US/Tutorial/Contributing.md) * [代码风格和命名约定](docs/zh_CN/Tutorial/Contributing.md)
* 如何设置 [NNI 开发环境](docs/zh_CN/Tutorial/SetupNniDeveloperEnvironment.md) * 如何设置 [NNI 开发环境](docs/zh_CN/Tutorial/SetupNniDeveloperEnvironment.md)
* 查看[贡献说明](docs/en_US/Tutorial/Contributing.md)并熟悉 NNI 的代码贡献指南 * 查看[贡献说明](docs/zh_CN/Tutorial/Contributing.md)并熟悉 NNI 的代码贡献指南
## **外部代码库** ## **外部代码库**
......
...@@ -49,7 +49,7 @@ RUN DEBIAN_FRONTEND=noninteractive && \ ...@@ -49,7 +49,7 @@ RUN DEBIAN_FRONTEND=noninteractive && \
# #
# update pip # update pip
# #
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip setuptools==39.1.0
# numpy 1.14.3 scipy 1.1.0 # numpy 1.14.3 scipy 1.1.0
RUN python3 -m pip --no-cache-dir install \ RUN python3 -m pip --no-cache-dir install \
......
# TrainingService
NNI TrainingService provides the training platform for running NNI trial jobs. NNI supports [local](./LocalMode.md), [remote](./RemoteMachineMode.md), [pai](./PaiMode.md), [kubeflow](./KubeflowMode.md) and [frameworkcontroller](./FrameworkControllerMode.md) built-in training services.
NNI not only provides a few built-in training service options, but also provides a way for users to build their own training service easily.
## Built-in TrainingService
|TrainingService|Brief Introduction|
|---|---|
|[__Local__](./LocalMode.md)|NNI supports running an experiment on the local machine, called local mode. Local mode means that NNI will run the trial jobs and the nniManager process on the same machine, and supports GPU scheduling for trial jobs.|
|[__Remote__](./RemoteMachineMode.md)|NNI supports running an experiment on multiple machines through an SSH channel, called remote mode. NNI assumes that you have access to those machines and have already set up the environment for running deep learning training code. NNI will submit the trial jobs to the remote machines, and schedule a suitable machine with enough GPU resources if specified.|
|[__Pai__](./PaiMode.md)|NNI supports running an experiment on [OpenPAI](https://github.com/Microsoft/pai) (aka pai), called pai mode. Before starting to use NNI pai mode, you should have an account to access an [OpenPAI](https://github.com/Microsoft/pai) cluster. See [here](https://github.com/Microsoft/pai#how-to-deploy) if you don't have any OpenPAI account and want to deploy an OpenPAI cluster. In pai mode, your trial program will run in pai's container created by Docker.|
|[__Kubeflow__](./KubeflowMode.md)|NNI supports running an experiment on [Kubeflow](https://github.com/kubeflow/kubeflow), called kubeflow mode. Before starting to use NNI kubeflow mode, you should have a Kubernetes cluster, either on-premises or [Azure Kubernetes Service(AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/), and an Ubuntu machine on which [kubeconfig](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is set up to connect to your Kubernetes cluster. If you are not familiar with Kubernetes, [here](https://kubernetes.io/docs/tutorials/kubernetes-basics/) is a good start. In kubeflow mode, your trial program will run as a Kubeflow job in the Kubernetes cluster.|
|[__FrameworkController__](./FrameworkControllerMode.md)|NNI supports running an experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, so you don't need to install Kubeflow for specific deep learning frameworks, like tf-operator or pytorch-operator. Now you can use FrameworkController as the training service to run an NNI experiment.|
## TrainingService Implementation
TrainingService is designed to be easily implemented. We define an abstract class TrainingService as the parent class of all kinds of TrainingService; users just need to inherit the parent class and complete their own child class if they want to implement a customized TrainingService.
The abstract functions in TrainingService are shown below:
```
// Abstract parent class of all kinds of TrainingService. A customized
// TrainingService inherits from this class and must implement every
// abstract member declared below.
abstract class TrainingService {
public abstract listTrialJobs(): Promise<TrialJobDetail[]>;
public abstract getTrialJob(trialJobId: string): Promise<TrialJobDetail>;
public abstract addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void;
public abstract removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void;
public abstract submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail>;
public abstract updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail>;
public abstract get isMultiPhaseJobSupported(): boolean;
public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise<void>;
public abstract setClusterMetadata(key: string, value: string): Promise<void>;
public abstract getClusterMetadata(key: string): Promise<string>;
public abstract cleanUp(): Promise<void>;
public abstract run(): Promise<void>;
}
```
The parent class of TrainingService has a few abstract functions; users need to inherit the parent class and implement all of these abstract functions.
For more information about how to write your own TrainingService, please refer to [How to Implement TrainingService](https://github.com/SparkSnail/nni/blob/dev-trainingServiceDoc/docs/en_US/TrainingService/HowToImplementTrainingService.md).
...@@ -123,6 +123,7 @@ Debug mode will disable version check function in Trialkeeper. ...@@ -123,6 +123,7 @@ Debug mode will disable version check function in Trialkeeper.
|------|------|------ |------| |------|------|------ |------|
|id| False| |The id of the experiment you want to stop| |id| False| |The id of the experiment you want to stop|
|--port, -p| False| |Rest port of the experiment you want to stop| |--port, -p| False| |Rest port of the experiment you want to stop|
|--all, -a| False| |Stop all of experiments|
* Details & Examples * Details & Examples
...@@ -144,10 +145,10 @@ Debug mode will disable version check function in Trialkeeper. ...@@ -144,10 +145,10 @@ Debug mode will disable version check function in Trialkeeper.
nnictl stop --port 8080 nnictl stop --port 8080
``` ```
4. Users could use 'nnictl stop all' to stop all experiments. 4. Users could use 'nnictl stop --all' to stop all experiments.
```bash ```bash
nnictl stop all nnictl stop --all
``` ```
5. If the id ends with *, nnictl will stop all experiments whose ids match the regular expression. 5. If the id ends with *, nnictl will stop all experiments whose ids match the regular expression.
......
...@@ -50,13 +50,11 @@ Click the tab "Intermediate Result" to see the lines graph. ...@@ -50,13 +50,11 @@ Click the tab "Intermediate Result" to see the lines graph.
![](../../img/webui-img/trials_intermeidate.png) ![](../../img/webui-img/trials_intermeidate.png)
We set a filter function for the intermediate result graph because that the trials may have many intermediate results in the training progress. You need to provide data if you want to use the filter button to see the trend of some trial. The trial may have many intermediate results in the training progress. In order to see the trend of some trials more clearly, we set a filtering function for the intermediate result graph.
What data should be written in the first input? Maybe you find an intermediate count those trials became better or worse. In other word, it's an important and concerned intermediate count. Just input it into the first input. You may find that these trials will get better or worse at one of intermediate results. In other words, this is an important and relevant intermediate result. To take a closer look at the point here, you need to enter its corresponding abscissa value at #Intermediate.
After selecting the intermeidate count, you should input your focus metric's range on this intermediate count. Yes, it's the min and max value. Like this picture, I choose the intermeidate count is 9 and the metric's range is 60-80. And then input the range of metrics on this intermedia result. Like below picture, it chooses No. 4 intermediate result and set the range of metrics to 0.8-1.
As a result, I filter these trials that the metric's range is 20-60 on the 13 intermediate count.
![](../../img/webui-img/filter-intermediate.png) ![](../../img/webui-img/filter-intermediate.png)
## View trials status ## View trials status
......
docs/img/webui-img/compare.png

48.8 KB | W: | H:

docs/img/webui-img/compare.png

43 KB | W: | H:

docs/img/webui-img/compare.png
docs/img/webui-img/compare.png
docs/img/webui-img/compare.png
docs/img/webui-img/compare.png
  • 2-up
  • Swipe
  • Onion skin
...@@ -46,7 +46,7 @@ export async function validateCodeDir(codeDir: string) : Promise<number> { ...@@ -46,7 +46,7 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
} }
try { try {
fileNameValid = await validateFileNameRecursively(codeDir); fileNameValid = await validateFileNameRecursively(codeDir);
} catch(error) { } catch (error) {
throw new Error(`Validate file name error: ${error}`); throw new Error(`Validate file name error: ${error}`);
} }
...@@ -55,23 +55,24 @@ export async function validateCodeDir(codeDir: string) : Promise<number> { ...@@ -55,23 +55,24 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
+ ` please check if it's a valid code dir`; + ` please check if it's a valid code dir`;
throw new Error(errMessage); throw new Error(errMessage);
} }
if(!fileNameValid) { if (!fileNameValid) {
const errMessage: string = `File name in ${codeDir} is not valid, please check file names, only support digit number、alphabet and (.-_) in file name.`; const errMessage: string = `File name in ${codeDir} is not valid, please check file names, only support digit number、alphabet and (.-_) in file name.`;
throw new Error(errMessage); throw new Error(errMessage);
} }
return fileCount; return fileCount;
} }
/** /**
* crete a new directory * crete a new directory
* @param directory * @param directory
*/ */
export async function execMkdir(directory: string): Promise<void> { export async function execMkdir(directory: string, share: boolean = false): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
await cpp.exec(`powershell.exe New-Item -Path ${directory} -ItemType "directory" -Force`); await cpp.exec(`powershell.exe New-Item -Path ${directory} -ItemType "directory" -Force`);
} else if (share) {
await cpp.exec(`(umask 0; mkdir -p ${directory})`);
} else { } else {
await cpp.exec(`mkdir -p ${directory}`); await cpp.exec(`mkdir -p ${directory}`);
} }
......
...@@ -54,6 +54,9 @@ class GPUScheduler { ...@@ -54,6 +54,9 @@ class GPUScheduler {
} catch (error) { } catch (error) {
this.log.error('Read GPU summary failed with error: ', error); this.log.error('Read GPU summary failed with error: ', error);
} }
if (this.gpuSummary !== undefined && this.gpuSummary.gpuCount === 0) {
throw new Error('GPU not available. Please check your CUDA configuration');
}
await delay(5000); await delay(5000);
} }
} }
...@@ -97,7 +100,7 @@ class GPUScheduler { ...@@ -97,7 +100,7 @@ class GPUScheduler {
* used to run in remote machine, and will be deleted after uploaded from local. * used to run in remote machine, and will be deleted after uploaded from local.
*/ */
private async runGpuMetricsCollectorScript(): Promise<void> { private async runGpuMetricsCollectorScript(): Promise<void> {
await execMkdir(this.gpuMetricCollectorScriptFolder); await execMkdir(this.gpuMetricCollectorScriptFolder, true);
//generate gpu_metrics_collector script //generate gpu_metrics_collector script
const gpuMetricsCollectorScriptPath: string = const gpuMetricsCollectorScriptPath: string =
path.join(this.gpuMetricCollectorScriptFolder, getScriptName('gpu_metrics_collector')); path.join(this.gpuMetricCollectorScriptFolder, getScriptName('gpu_metrics_collector'));
......
...@@ -131,7 +131,7 @@ class LocalTrainingService implements TrainingService { ...@@ -131,7 +131,7 @@ class LocalTrainingService implements TrainingService {
private readonly occupiedGpuIndexNumMap: Map<number, number>; private readonly occupiedGpuIndexNumMap: Map<number, number>;
private designatedGpuIndices!: Set<number>; private designatedGpuIndices!: Set<number>;
private readonly log: Logger; private readonly log: Logger;
private localTrailConfig?: TrialConfig; private localTrialConfig?: TrialConfig;
private localConfig?: LocalConfig; private localConfig?: LocalConfig;
private isMultiPhase: boolean; private isMultiPhase: boolean;
private readonly jobStreamMap: Map<string, ts.Stream>; private readonly jobStreamMap: Map<string, ts.Stream>;
...@@ -204,7 +204,7 @@ class LocalTrainingService implements TrainingService { ...@@ -204,7 +204,7 @@ class LocalTrainingService implements TrainingService {
} catch (error) { } catch (error) {
//ignore //ignore
} }
this.log.debug(`trailJob status update: ${trialJobId}, ${trialJob.status}`); this.log.debug(`trialJob status update: ${trialJobId}, ${trialJob.status}`);
} }
} }
...@@ -302,14 +302,14 @@ class LocalTrainingService implements TrainingService { ...@@ -302,14 +302,14 @@ class LocalTrainingService implements TrainingService {
} }
switch (key) { switch (key) {
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG:
this.localTrailConfig = <TrialConfig>JSON.parse(value); this.localTrialConfig = <TrialConfig>JSON.parse(value);
// Parse trial config failed, throw Error // Parse trial config failed, throw Error
if (this.localTrailConfig === undefined) { if (this.localTrialConfig === undefined) {
throw new Error('trial config parsed failed'); throw new Error('trial config parsed failed');
} }
if (this.localTrailConfig.gpuNum !== undefined) { if (this.localTrialConfig.gpuNum !== undefined) {
this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`); this.log.info(`required GPU number is ${this.localTrialConfig.gpuNum}`);
if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) { if (this.gpuScheduler === undefined && this.localTrialConfig.gpuNum > 0) {
this.gpuScheduler = new GPUScheduler(); this.gpuScheduler = new GPUScheduler();
} }
} }
...@@ -343,10 +343,10 @@ class LocalTrainingService implements TrainingService { ...@@ -343,10 +343,10 @@ class LocalTrainingService implements TrainingService {
switch (key) { switch (key) {
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG:
let getResult: Promise<string>; let getResult: Promise<string>;
if (this.localTrailConfig === undefined) { if (this.localTrialConfig === undefined) {
getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`)); getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
} else { } else {
getResult = Promise.resolve(JSON.stringify(this.localTrailConfig)); getResult = Promise.resolve(JSON.stringify(this.localTrialConfig));
} }
return getResult; return getResult;
...@@ -359,8 +359,8 @@ class LocalTrainingService implements TrainingService { ...@@ -359,8 +359,8 @@ class LocalTrainingService implements TrainingService {
this.log.info('Stopping local machine training service...'); this.log.info('Stopping local machine training service...');
this.stopping = true; this.stopping = true;
for (const stream of this.jobStreamMap.values()) { for (const stream of this.jobStreamMap.values()) {
stream.end(0) stream.end(0);
stream.emit('end') stream.emit('end');
} }
if (this.gpuScheduler !== undefined) { if (this.gpuScheduler !== undefined) {
await this.gpuScheduler.stop(); await this.gpuScheduler.stop();
...@@ -378,8 +378,8 @@ class LocalTrainingService implements TrainingService { ...@@ -378,8 +378,8 @@ class LocalTrainingService implements TrainingService {
throw new Error(`Could not find stream in trial ${trialJob.id}`); throw new Error(`Could not find stream in trial ${trialJob.id}`);
} }
//Refer https://github.com/Juul/tail-stream/issues/20 //Refer https://github.com/Juul/tail-stream/issues/20
stream.end(0) stream.end(0);
stream.emit('end') stream.emit('end');
this.jobStreamMap.delete(trialJob.id); this.jobStreamMap.delete(trialJob.id);
} }
} }
...@@ -427,8 +427,8 @@ class LocalTrainingService implements TrainingService { ...@@ -427,8 +427,8 @@ class LocalTrainingService implements TrainingService {
} }
private tryGetAvailableResource(): [boolean, { gpuIndices: number[]}] { private tryGetAvailableResource(): [boolean, { gpuIndices: number[]}] {
if (this.localTrailConfig === undefined) { if (this.localTrialConfig === undefined) {
throw new Error('localTrailConfig is not initialized!'); throw new Error('localTrialConfig is not initialized!');
} }
const resource: { gpuIndices: number[] } = { gpuIndices: [] }; const resource: { gpuIndices: number[] } = { gpuIndices: [] };
...@@ -450,11 +450,11 @@ class LocalTrainingService implements TrainingService { ...@@ -450,11 +450,11 @@ class LocalTrainingService implements TrainingService {
selectedGPUIndices = selectedGPUIndices.filter((index: number) => this.designatedGpuIndices.has(index)); selectedGPUIndices = selectedGPUIndices.filter((index: number) => this.designatedGpuIndices.has(index));
} }
if (selectedGPUIndices.length < this.localTrailConfig.gpuNum) { if (selectedGPUIndices.length < this.localTrialConfig.gpuNum) {
return [false, resource]; return [false, resource];
} }
selectedGPUIndices.splice(this.localTrailConfig.gpuNum); selectedGPUIndices.splice(this.localTrialConfig.gpuNum);
Object.assign(resource, { gpuIndices: selectedGPUIndices }); Object.assign(resource, { gpuIndices: selectedGPUIndices });
return [true, resource]; return [true, resource];
...@@ -494,7 +494,7 @@ class LocalTrainingService implements TrainingService { ...@@ -494,7 +494,7 @@ class LocalTrainingService implements TrainingService {
if (!success) { if (!success) {
break; break;
} }
this.occupyResource(resource); this.occupyResource(resource);
await this.runTrialJob(trialJobId, resource); await this.runTrialJob(trialJobId, resource);
} }
...@@ -512,17 +512,17 @@ class LocalTrainingService implements TrainingService { ...@@ -512,17 +512,17 @@ class LocalTrainingService implements TrainingService {
} }
} }
private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] { private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
const script: string[] = []; const script: string[] = [];
if (process.platform === 'win32') { if (process.platform === 'win32') {
script.push( script.push(
`cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`, `cmd /c ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`, `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`, `$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`); `Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
} else { } else {
script.push( script.push(
`eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`, `eval ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`echo $? \`date +%s%3N\` >${path.join(workingDirectory, '.nni', 'state')}`); `echo $? \`date +%s%3N\` >${path.join(workingDirectory, '.nni', 'state')}`);
} }
...@@ -531,23 +531,23 @@ class LocalTrainingService implements TrainingService { ...@@ -531,23 +531,23 @@ class LocalTrainingService implements TrainingService {
private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> { private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId); const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
if (this.localTrailConfig === undefined) { if (this.localTrialConfig === undefined) {
throw new Error(`localTrialConfig not initialized!`); throw new Error(`localTrialConfig not initialized!`);
} }
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrailConfig.gpuNum); const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrialConfig.gpuNum);
if (this.localTrailConfig === undefined) { if (this.localTrialConfig === undefined) {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
} }
const runScriptContent: string[] = []; const runScriptContent: string[] = [];
if (process.platform !== 'win32') { if (process.platform !== 'win32') {
runScriptContent.push('#!/bin/bash'); runScriptContent.push('#!/bin/bash');
} }
runScriptContent.push(`cd ${this.localTrailConfig.codeDir}`); runScriptContent.push(`cd ${this.localTrialConfig.codeDir}`);
for (const variable of variables) { for (const variable of variables) {
runScriptContent.push(setEnvironmentVariable(variable)); runScriptContent.push(setEnvironmentVariable(variable));
} }
const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory); const scripts: string[] = this.getScript(this.localTrialConfig, trialJobDetail.workingDirectory);
scripts.forEach((script: string) => { scripts.forEach((script: string) => {
runScriptContent.push(script); runScriptContent.push(script);
}); });
......
...@@ -511,12 +511,16 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -511,12 +511,16 @@ class RemoteMachineTrainingService implements TrainingService {
// tslint:disable-next-line: no-floating-promises // tslint:disable-next-line: no-floating-promises
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn); SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);
this.timer.subscribe( const disposable: Rx.IDisposable = this.timer.subscribe(
async (tick: number) => { async (tick: number) => {
const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand( const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
`tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn); `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
if (cmdresult !== undefined && cmdresult.stdout !== undefined) { if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout); rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
if (rmMeta.gpuSummary.gpuCount === 0) {
this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
this.timer.unsubscribe(disposable);
}
} }
} }
); );
......
...@@ -83,7 +83,7 @@ class Compare extends React.Component<CompareProps, {}> { ...@@ -83,7 +83,7 @@ class Compare extends React.Component<CompareProps, {}> {
}, },
xAxis: { xAxis: {
type: 'category', type: 'category',
// name: '# Intermeidate', // name: '# Intermediate',
boundaryGap: false, boundaryGap: false,
data: xAxis data: xAxis
}, },
...@@ -194,9 +194,9 @@ class Compare extends React.Component<CompareProps, {}> { ...@@ -194,9 +194,9 @@ class Compare extends React.Component<CompareProps, {}> {
maskClosable={false} maskClosable={false}
width="90%" width="90%"
> >
<Row className="compare-intermeidate"> <Row className="compare-intermediate">
{this.intermediate()} {this.intermediate()}
<Row className="compare-yAxis"># Intermeidate</Row> <Row className="compare-yAxis"># Intermediate</Row>
</Row> </Row>
<Row>{this.initColumn()}</Row> <Row>{this.initColumn()}</Row>
</Modal> </Modal>
......
...@@ -9,7 +9,7 @@ import DefaultPoint from './trial-detail/DefaultMetricPoint'; ...@@ -9,7 +9,7 @@ import DefaultPoint from './trial-detail/DefaultMetricPoint';
import Duration from './trial-detail/Duration'; import Duration from './trial-detail/Duration';
import Title1 from './overview/Title1'; import Title1 from './overview/Title1';
import Para from './trial-detail/Para'; import Para from './trial-detail/Para';
import Intermediate from './trial-detail/Intermeidate'; import Intermediate from './trial-detail/Intermediate';
import TableList from './trial-detail/TableList'; import TableList from './trial-detail/TableList';
const TabPane = Tabs.TabPane; const TabPane = Tabs.TabPane;
import '../static/style/trialsDetail.scss'; import '../static/style/trialsDetail.scss';
......
...@@ -292,7 +292,7 @@ class Intermediate extends React.Component<IntermediateProps, IntermediateState> ...@@ -292,7 +292,7 @@ class Intermediate extends React.Component<IntermediateProps, IntermediateState>
isFilter isFilter
? ?
<span> <span>
<span className="filter-x"># Intermeidate</span> <span className="filter-x"># Intermediate</span>
<input <input
// placeholder="point" // placeholder="point"
ref={input => this.pointInput = input} ref={input => this.pointInput = input}
...@@ -321,7 +321,7 @@ class Intermediate extends React.Component<IntermediateProps, IntermediateState> ...@@ -321,7 +321,7 @@ class Intermediate extends React.Component<IntermediateProps, IntermediateState>
null null
} }
</Row> </Row>
<Row className="intermeidate-graph"> <Row className="intermediate-graph">
<ReactEcharts <ReactEcharts
option={interSource} option={interSource}
style={{ width: '100%', height: 418, margin: '0 auto' }} style={{ width: '100%', height: 418, margin: '0 auto' }}
......
...@@ -193,7 +193,7 @@ class TableList extends React.Component<TableListProps, TableListState> { ...@@ -193,7 +193,7 @@ class TableList extends React.Component<TableListProps, TableListState> {
case 'Status': case 'Status':
case 'Operation': case 'Operation':
case 'Default': case 'Default':
case 'Intermeidate count': case 'Intermediate count':
break; break;
default: default:
finalKeys.push(checkedValues[m]); finalKeys.push(checkedValues[m]);
...@@ -392,7 +392,7 @@ class TableList extends React.Component<TableListProps, TableListState> { ...@@ -392,7 +392,7 @@ class TableList extends React.Component<TableListProps, TableListState> {
sorter: (a: TableObj, b: TableObj): number => a.status.localeCompare(b.status) sorter: (a: TableObj, b: TableObj): number => a.status.localeCompare(b.status)
}); });
break; break;
case 'Intermeidate count': case 'Intermediate count':
showColumn.push({ showColumn.push({
title: 'Intermediate count', title: 'Intermediate count',
dataIndex: 'progress', dataIndex: 'progress',
......
...@@ -42,7 +42,7 @@ const COLUMN_INDEX = [ ...@@ -42,7 +42,7 @@ const COLUMN_INDEX = [
index: 4 index: 4
}, },
{ {
name: 'Intermeidate count', name: 'Intermediate count',
index: 5 index: 5
}, },
{ {
...@@ -57,7 +57,7 @@ const COLUMN_INDEX = [ ...@@ -57,7 +57,7 @@ const COLUMN_INDEX = [
// defatult selected column // defatult selected column
const COLUMN = ['Trial No.', 'ID', 'Duration', 'Status', 'Default', 'Operation']; const COLUMN = ['Trial No.', 'ID', 'Duration', 'Status', 'Default', 'Operation'];
// all choice column !dictory final // all choice column !dictory final
const COLUMNPro = ['Trial No.', 'ID', 'Duration', 'Status', 'Intermeidate count', 'Default', 'Operation']; const COLUMNPro = ['Trial No.', 'ID', 'Duration', 'Status', 'Intermediate count', 'Default', 'Operation'];
export { export {
MANAGER_IP, DOWNLOAD_IP, trialJobStatus, COLUMNPro, MANAGER_IP, DOWNLOAD_IP, trialJobStatus, COLUMNPro,
CONTROLTYPE, MONACO, COLUMN, COLUMN_INDEX, DRAWEROPTION CONTROLTYPE, MONACO, COLUMN, COLUMN_INDEX, DRAWEROPTION
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
} }
} }
.compare-intermeidate{ .compare-intermediate{
position: relative; position: relative;
.compare-yAxis{ .compare-yAxis{
color: #333; color: #333;
......
...@@ -84,8 +84,8 @@ ...@@ -84,8 +84,8 @@
} }
} }
/* for # intermediate in intermeidate graph*/ /* for # intermediate in intermediate graph*/
.intermeidate-graph{ .intermediate-graph{
position: relative; position: relative;
.yAxis{ .yAxis{
color: #333; color: #333;
......
...@@ -131,5 +131,3 @@ if __name__ == '__main__': ...@@ -131,5 +131,3 @@ if __name__ == '__main__':
setup_experiment(args.preinstall) setup_experiment(args.preinstall)
run(args) run(args)
#
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment