Commit eaf42120 authored by suiguoxin's avatar suiguoxin
Browse files

squash commits in v1.0 first round bug bash

parent f721b431
......@@ -4,7 +4,7 @@
* * *
[![MIT 许可证](https://img.shields.io/badge/license-MIT-brightgreen.svg)](LICENSE) [![生成状态](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/Microsoft.nni)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=6) [![问题](https://img.shields.io/github/issues-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen) [![Bug](https://img.shields.io/github/issues/Microsoft/nni/bug.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen+label%3Abug) [![拉取请求](https://img.shields.io/github/issues-pr-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/pulls?q=is%3Apr+is%3Aopen) [![版本](https://img.shields.io/github/release/Microsoft/nni.svg)](https://github.com/Microsoft/nni/releases) [![进入 https://gitter.im/Microsoft/nni 聊天室提问](https://badges.gitter.im/Microsoft/nni.svg)](https://gitter.im/Microsoft/nni?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![文档状态](https://readthedocs.org/projects/nni/badge/?version=latest)](https://nni.readthedocs.io/en/latest/?badge=latest)
[![MIT 许可证](https://img.shields.io/badge/license-MIT-brightgreen.svg)](LICENSE) [![生成状态](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/Microsoft.nni)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=6) [![问题](https://img.shields.io/github/issues-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen) [![Bug](https://img.shields.io/github/issues/Microsoft/nni/bug.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen+label%3Abug) [![拉取请求](https://img.shields.io/github/issues-pr-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/pulls?q=is%3Apr+is%3Aopen) [![版本](https://img.shields.io/github/release/Microsoft/nni.svg)](https://github.com/Microsoft/nni/releases) [![进入 https://gitter.im/Microsoft/nni 聊天室提问](https://badges.gitter.im/Microsoft/nni.svg)](https://gitter.im/Microsoft/nni?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![文档状态](https://readthedocs.org/projects/nni/badge/?version=latest)](https://nni.readthedocs.io/zh/latest/?badge=latest)
[English](README.md)
......@@ -47,40 +47,40 @@ NNI (Neural Network Intelligence) 是自动机器学习(AutoML)的工具包
</ul>
</td>
<td align="left">
<a href="docs/en_US/Tuner/BuiltinTuner.md">Tuner(调参器)</a>
<a href="docs/zh_CN/Tuner/BuiltinTuner.md">Tuner(调参器)</a>
<br />
<ul>
<b style="margin-left:-20px">通用 Tuner</b>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Random">Random Search(随机搜索)</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Evolution">Naïve Evolution(进化算法)</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Random">Random Search(随机搜索)</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Evolution">Naïve Evolution(进化算法)</a></li>
<b style="margin-left:-20px">超参 Tuner</b>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#TPE">TPE</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Anneal">Anneal(退火算法)</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#SMAC">SMAC</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Batch">Batch(批处理)</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#GridSearch">Grid Search(遍历搜索)</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Hyperband">Hyperband</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#MetisTuner">Metis Tuner</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#BOHB">BOHB</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#GPTuner">GP Tuner</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#TPE">TPE</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Anneal">Anneal(退火算法)</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#SMAC">SMAC</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Batch">Batch(批处理)</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#GridSearch">Grid Search(遍历搜索)</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#Hyperband">Hyperband</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#MetisTuner">Metis Tuner</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#BOHB">BOHB</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#GPTuner">GP Tuner</a></li>
<b style="margin-left:-20px">网络结构 Tuner</b>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#NetworkMorphism">Network Morphism</a></li>
<li><a href="docs/zh_CN/Tuner/BuiltinTuner.md#NetworkMorphism">Network Morphism</a></li>
<li><a href="examples/tuners/enas_nni/README.md">ENAS</a></li>
</ul>
<a href="docs/en_US/Assessor/BuiltinAssessor.md">Assessor(评估器)</a>
<a href="docs/zh_CN/Assessor/BuiltinAssessor.md">Assessor(评估器)</a>
<ul>
<li><a href="docs/en_US/Assessor/BuiltinAssessor.md#Medianstop">Median Stop(中位数终止)</a></li>
<li><a href="docs/en_US/Assessor/BuiltinAssessor.md#Curvefitting">Curve Fitting(曲线拟合)</a></li>
<li><a href="docs/zh_CN/Assessor/BuiltinAssessor.md#Medianstop">Median Stop(中位数终止)</a></li>
<li><a href="docs/zh_CN/Assessor/BuiltinAssessor.md#Curvefitting">Curve Fitting(曲线拟合)</a></li>
</ul>
</td>
<td>
<ul>
<li><a href="docs/en_US/TrainingService/LocalMode.md">本机</a></li>
<li><a href="docs/en_US/TrainingService/RemoteMachineMode.md">远程计算机</a></li>
<li><a href="docs/zh_CN/TrainingService/LocalMode.md">本机</a></li>
<li><a href="docs/zh_CN/TrainingService/RemoteMachineMode.md">远程计算机</a></li>
<li><b>基于 Kubernetes 的平台</b></li>
<ul><li><a href="docs/en_US/TrainingService/PaiMode.md">OpenPAI</a></li>
<li><a href="docs/en_US/TrainingService/KubeflowMode.md">Kubeflow</a></li>
<li><a href="docs/en_US/TrainingService/FrameworkControllerMode.md">基于 Kubernetes(AKS 等)的 FrameworkController</a></li>
<ul><li><a href="docs/zh_CN/TrainingService/PaiMode.md">OpenPAI</a></li>
<li><a href="docs/zh_CN/TrainingService/KubeflowMode.md">Kubeflow</a></li>
<li><a href="docs/zh_CN/TrainingService/FrameworkControllerMode.md">基于 Kubernetes(AKS 等)的 FrameworkController</a></li>
</ul>
</ul>
</td>
......@@ -226,34 +226,34 @@ You can use these commands to get more information about the experiment
点击阅读:
* [NNI 概述](docs/zh_CN/Overview.md)
* [快速入门](docs/en_US/Tutorial/QuickStart.md)
* [贡献](docs/en_US/Tutorial/Contributing.md)
* [示例](docs/en_US/examples.rst)
* [参考](docs/en_US/reference.rst)
* [Web 界面教程](docs/en_US/Tutorial/WebUI.md)
* [快速入门](docs/zh_CN/Tutorial/QuickStart.md)
* [贡献](docs/zh_CN/Tutorial/Contributing.md)
* [示例](docs/zh_CN/examples.rst)
* [参考](docs/zh_CN/reference.rst)
* [Web 界面教程](docs/zh_CN/Tutorial/WebUI.md)
## **入门**
* [安装 NNI](docs/en_US/Tutorial/Installation.md)
* [使用命令行工具 nnictl](docs/en_US/Tutorial/Nnictl.md)
* [使用 NNIBoard](docs/en_US/Tutorial/WebUI.md)
* [如何定义搜索空间](docs/en_US/Tutorial/SearchSpaceSpec.md)
* [如何实现 Trial 代码](docs/en_US/TrialExample/Trials.md)
* [如何选择 Tuner、搜索算法](docs/en_US/Tuner/BuiltinTuner.md)
* [配置 Experiment](docs/en_US/Tutorial/ExperimentConfig.md)
* [如何使用 Annotation](docs/en_US/TrialExample/Trials.md#nni-python-annotation)
* [安装 NNI](docs/zh_CN/Tutorial/Installation.md)
* [使用命令行工具 nnictl](docs/zh_CN/Tutorial/Nnictl.md)
* [使用 NNIBoard](docs/zh_CN/Tutorial/WebUI.md)
* [如何定义搜索空间](docs/zh_CN/Tutorial/SearchSpaceSpec.md)
* [如何实现 Trial 代码](docs/zh_CN/TrialExample/Trials.md)
* [如何选择 Tuner、搜索算法](docs/zh_CN/Tuner/BuiltinTuner.md)
* [配置 Experiment](docs/zh_CN/Tutorial/ExperimentConfig.md)
* [如何使用 Annotation](docs/zh_CN/TrialExample/Trials.md#nni-python-annotation)
## **教程**
* [在 OpenPAI 上运行 Experiment](docs/en_US/TrainingService/PaiMode.md)
* [在 Kubeflow 上运行 Experiment](docs/en_US/TrainingService/KubeflowMode.md)
* [在本机运行 Experiment (支持多 GPU 卡)](docs/en_US/TrainingService/LocalMode.md)
* [在多机上运行 Experiment](docs/en_US/TrainingService/RemoteMachineMode.md)
* [尝试不同的 Tuner](docs/en_US/Tuner/BuiltinTuner.md)
* [尝试不同的 Assessor](docs/en_US/Assessor/BuiltinAssessor.md)
* [实现自定义 Tuner](docs/en_US/Tuner/CustomizeTuner.md)
* [实现自定义 Assessor](docs/en_US/Assessor/CustomizeAssessor.md)
* [使用进化算法为阅读理解任务找到好模型](docs/en_US/TrialExample/SquadEvolutionExamples.md)
* [在 OpenPAI 上运行 Experiment](docs/zh_CN/TrainingService/PaiMode.md)
* [在 Kubeflow 上运行 Experiment](docs/zh_CN/TrainingService/KubeflowMode.md)
* [在本机运行 Experiment (支持多 GPU 卡)](docs/zh_CN/TrainingService/LocalMode.md)
* [在多机上运行 Experiment](docs/zh_CN/TrainingService/RemoteMachineMode.md)
* [尝试不同的 Tuner](docs/zh_CN/Tuner/BuiltinTuner.md)
* [尝试不同的 Assessor](docs/zh_CN/Assessor/BuiltinAssessor.md)
* [实现自定义 Tuner](docs/zh_CN/Tuner/CustomizeTuner.md)
* [实现自定义 Assessor](docs/zh_CN/Assessor/CustomizeAssessor.md)
* [使用进化算法为阅读理解任务找到好模型](docs/zh_CN/TrialExample/SquadEvolutionExamples.md)
## **贡献**
......@@ -265,10 +265,10 @@ You can use these commands to get more information about the experiment
在提交代码前,需要遵循以下的简单准则:
* [如何调试](docs/en_US/Tutorial/HowToDebug.md)
* [代码风格和命名约定](docs/en_US/Tutorial/Contributing.md)
* [如何调试](docs/zh_CN/Tutorial/HowToDebug.md)
* [代码风格和命名约定](docs/zh_CN/Tutorial/Contributing.md)
* 如何设置 [NNI 开发环境](docs/zh_CN/Tutorial/SetupNniDeveloperEnvironment.md)
* 查看[贡献说明](docs/en_US/Tutorial/Contributing.md)并熟悉 NNI 的代码贡献指南
* 查看[贡献说明](docs/zh_CN/Tutorial/Contributing.md)并熟悉 NNI 的代码贡献指南
## **外部代码库**
......
......@@ -49,7 +49,7 @@ RUN DEBIAN_FRONTEND=noninteractive && \
#
# update pip
#
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --upgrade pip setuptools==39.1.0
# numpy 1.14.3 scipy 1.1.0
RUN python3 -m pip --no-cache-dir install \
......
# TrainingService
NNI TrainingService provides the training platform for running NNI trial jobs. NNI supports [local](./LocalMode.md), [remote](./RemoteMachineMode.md), [pai](./PaiMode.md), [kubeflow](./KubeflowMode.md) and [frameworkcontroller](./FrameworkControllerMode.md) built-in training services.
NNI not only provides a few built-in training service options, but also provides a way for users to build their own training service easily.
## Built-in TrainingService
|TrainingService|Brief Introduction|
|---|---|
|[__Local__](./LocalMode.md)|NNI supports running an experiment on the local machine, called local mode. Local mode means that NNI will run the trial jobs and the nniManager process on the same machine, and supports GPU scheduling for trial jobs.|
|[__Remote__](./RemoteMachineMode.md)|NNI supports running an experiment on multiple machines through an SSH channel, called remote mode. NNI assumes that you have access to those machines and have already set up the environment for running deep learning training code. NNI will submit the trial jobs to the remote machines, and schedule a suitable machine with enough GPU resources if specified.|
|[__Pai__](./PaiMode.md)|NNI supports running an experiment on [OpenPAI](https://github.com/Microsoft/pai) (aka pai), called pai mode. Before starting to use NNI pai mode, you should have an account to access an [OpenPAI](https://github.com/Microsoft/pai) cluster. See [here](https://github.com/Microsoft/pai#how-to-deploy) if you don't have any OpenPAI account and want to deploy an OpenPAI cluster. In pai mode, your trial program will run in pai's container created by Docker.|
|[__Kubeflow__](./KubeflowMode.md)|NNI supports running an experiment on [Kubeflow](https://github.com/kubeflow/kubeflow), called kubeflow mode. Before starting to use NNI kubeflow mode, you should have a Kubernetes cluster, either on-premises or [Azure Kubernetes Service(AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/), and an Ubuntu machine on which [kubeconfig](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is set up to connect to your Kubernetes cluster. If you are not familiar with Kubernetes, [here](https://kubernetes.io/docs/tutorials/kubernetes-basics/) is a good start. In kubeflow mode, your trial program will run as a Kubeflow job in the Kubernetes cluster.|
|[__FrameworkController__](./FrameworkControllerMode.md)|NNI supports running an experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, so you don't need to install Kubeflow for a specific deep learning framework, such as tf-operator or pytorch-operator. Now you can use FrameworkController as the training service to run NNI experiments.|
## TrainingService Implementation
TrainingService is designed to be easy to implement. We define an abstract class, TrainingService, as the parent class of all kinds of TrainingService; users who want to implement a customized TrainingService just need to inherit from the parent class and complete their own child class.
The abstract function in TrainingService is shown below:
```
// Abstract parent class of every training service. All members below are
// abstract, so a concrete subclass has to implement each one of them.
abstract class TrainingService {
// Resolves to an array of TrialJobDetail (presumably every trial job known to the service).
public abstract listTrialJobs(): Promise<TrialJobDetail[]>;
// Resolves to the TrialJobDetail associated with the given trial job id.
public abstract getTrialJob(trialJobId: string): Promise<TrialJobDetail>;
// Registers a callback that receives TrialJobMetric values.
public abstract addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void;
// Counterpart of addTrialJobMetricListener; takes the same kind of callback.
public abstract removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void;
// Takes a JobApplicationForm and resolves to the resulting TrialJobDetail.
public abstract submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail>;
// Takes a trial job id plus a JobApplicationForm and resolves to a TrialJobDetail.
public abstract updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail>;
// Read-only boolean accessor; presumably reports multi-phase job support — confirm against implementations.
public abstract get isMultiPhaseJobSupported(): boolean;
// Takes a trial job id; isEarlyStopped is optional (NOTE(review): its exact effect is not shown here).
public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise<void>;
// Stores a string value under a string key.
public abstract setClusterMetadata(key: string, value: string): Promise<void>;
// Resolves to the string value for the given key.
public abstract getClusterMetadata(key: string): Promise<string>;
// Asynchronous clean-up hook; resolves with no value.
public abstract cleanUp(): Promise<void>;
// Asynchronous entry point of the service; resolves with no value.
public abstract run(): Promise<void>;
}
```
The TrainingService parent class has a few abstract functions; users need to inherit from the parent class and implement all of them.
For more information about how to write your own TrainingService, please refer to [How to implement TrainingService](https://github.com/SparkSnail/nni/blob/dev-trainingServiceDoc/docs/en_US/TrainingService/HowToImplementTrainingService.md).
......@@ -123,6 +123,7 @@ Debug mode will disable version check function in Trialkeeper.
|------|------|------ |------|
|id| False| |The id of the experiment you want to stop|
|--port, -p| False| |Rest port of the experiment you want to stop|
|--all, -a| False| |Stop all experiments|
* Details & Examples
......@@ -144,10 +145,10 @@ Debug mode will disable version check function in Trialkeeper.
nnictl stop --port 8080
```
4. Users could use 'nnictl stop all' to stop all experiments.
4. Users could use 'nnictl stop --all' to stop all experiments.
```bash
nnictl stop all
nnictl stop --all
```
5. If the id ends with *, nnictl will stop all experiments whose ids match the pattern.
......
......@@ -50,13 +50,11 @@ Click the tab "Intermediate Result" to see the lines graph.
![](../../img/webui-img/trials_intermeidate.png)
We set a filter function for the intermediate result graph because that the trials may have many intermediate results in the training progress. You need to provide data if you want to use the filter button to see the trend of some trial.
A trial may have many intermediate results during training. In order to see the trend of some trials more clearly, we provide a filtering function for the intermediate result graph.
What data should be written in the first input? Maybe you find an intermediate count those trials became better or worse. In other word, it's an important and concerned intermediate count. Just input it into the first input.
You may find that these trials get better or worse at one of the intermediate results. In other words, this is an important and relevant intermediate result. To take a closer look at this point, enter its corresponding x-axis value in the "# Intermediate" input.
After selecting the intermeidate count, you should input your focus metric's range on this intermediate count. Yes, it's the min and max value. Like this picture, I choose the intermeidate count is 9 and the metric's range is 60-80.
As a result, I filter these trials that the metric's range is 20-60 on the 13 intermediate count.
Then enter the range of metric values at this intermediate result. In the picture below, the No. 4 intermediate result is chosen and the metric range is set to 0.8-1.
![](../../img/webui-img/filter-intermediate.png)
## View trials status
......
docs/img/webui-img/compare.png

48.8 KB | W: | H:

docs/img/webui-img/compare.png

43 KB | W: | H:

docs/img/webui-img/compare.png
docs/img/webui-img/compare.png
docs/img/webui-img/compare.png
docs/img/webui-img/compare.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -46,7 +46,7 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
}
try {
fileNameValid = await validateFileNameRecursively(codeDir);
} catch(error) {
} catch (error) {
throw new Error(`Validate file name error: ${error}`);
}
......@@ -55,23 +55,24 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
+ ` please check if it's a valid code dir`;
throw new Error(errMessage);
}
if(!fileNameValid) {
const errMessage: string = `File name in ${codeDir} is not valid, please check file names, only support digit number、alphabet and (.-_) in file name.`;
throw new Error(errMessage);
if (!fileNameValid) {
const errMessage: string = `File name in ${codeDir} is not valid, please check file names, only support digit number、alphabet and (.-_) in file name.`;
throw new Error(errMessage);
}
return fileCount;
}
/**
* create a new directory
* @param directory
*/
export async function execMkdir(directory: string): Promise<void> {
export async function execMkdir(directory: string, share: boolean = false): Promise<void> {
if (process.platform === 'win32') {
await cpp.exec(`powershell.exe New-Item -Path ${directory} -ItemType "directory" -Force`);
} else if (share) {
await cpp.exec(`(umask 0; mkdir -p ${directory})`);
} else {
await cpp.exec(`mkdir -p ${directory}`);
}
......
......@@ -54,6 +54,9 @@ class GPUScheduler {
} catch (error) {
this.log.error('Read GPU summary failed with error: ', error);
}
if (this.gpuSummary !== undefined && this.gpuSummary.gpuCount === 0) {
throw new Error('GPU not available. Please check your CUDA configuration');
}
await delay(5000);
}
}
......@@ -97,7 +100,7 @@ class GPUScheduler {
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private async runGpuMetricsCollectorScript(): Promise<void> {
await execMkdir(this.gpuMetricCollectorScriptFolder);
await execMkdir(this.gpuMetricCollectorScriptFolder, true);
//generate gpu_metrics_collector script
const gpuMetricsCollectorScriptPath: string =
path.join(this.gpuMetricCollectorScriptFolder, getScriptName('gpu_metrics_collector'));
......
......@@ -131,7 +131,7 @@ class LocalTrainingService implements TrainingService {
private readonly occupiedGpuIndexNumMap: Map<number, number>;
private designatedGpuIndices!: Set<number>;
private readonly log: Logger;
private localTrailConfig?: TrialConfig;
private localTrialConfig?: TrialConfig;
private localConfig?: LocalConfig;
private isMultiPhase: boolean;
private readonly jobStreamMap: Map<string, ts.Stream>;
......@@ -204,7 +204,7 @@ class LocalTrainingService implements TrainingService {
} catch (error) {
//ignore
}
this.log.debug(`trailJob status update: ${trialJobId}, ${trialJob.status}`);
this.log.debug(`trialJob status update: ${trialJobId}, ${trialJob.status}`);
}
}
......@@ -302,14 +302,14 @@ class LocalTrainingService implements TrainingService {
}
switch (key) {
case TrialConfigMetadataKey.TRIAL_CONFIG:
this.localTrailConfig = <TrialConfig>JSON.parse(value);
this.localTrialConfig = <TrialConfig>JSON.parse(value);
// Parse trial config failed, throw Error
if (this.localTrailConfig === undefined) {
if (this.localTrialConfig === undefined) {
throw new Error('trial config parsed failed');
}
if (this.localTrailConfig.gpuNum !== undefined) {
this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`);
if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) {
if (this.localTrialConfig.gpuNum !== undefined) {
this.log.info(`required GPU number is ${this.localTrialConfig.gpuNum}`);
if (this.gpuScheduler === undefined && this.localTrialConfig.gpuNum > 0) {
this.gpuScheduler = new GPUScheduler();
}
}
......@@ -343,10 +343,10 @@ class LocalTrainingService implements TrainingService {
switch (key) {
case TrialConfigMetadataKey.TRIAL_CONFIG:
let getResult: Promise<string>;
if (this.localTrailConfig === undefined) {
if (this.localTrialConfig === undefined) {
getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
} else {
getResult = Promise.resolve(JSON.stringify(this.localTrailConfig));
getResult = Promise.resolve(JSON.stringify(this.localTrialConfig));
}
return getResult;
......@@ -359,8 +359,8 @@ class LocalTrainingService implements TrainingService {
this.log.info('Stopping local machine training service...');
this.stopping = true;
for (const stream of this.jobStreamMap.values()) {
stream.end(0)
stream.emit('end')
stream.end(0);
stream.emit('end');
}
if (this.gpuScheduler !== undefined) {
await this.gpuScheduler.stop();
......@@ -378,8 +378,8 @@ class LocalTrainingService implements TrainingService {
throw new Error(`Could not find stream in trial ${trialJob.id}`);
}
//Refer https://github.com/Juul/tail-stream/issues/20
stream.end(0)
stream.emit('end')
stream.end(0);
stream.emit('end');
this.jobStreamMap.delete(trialJob.id);
}
}
......@@ -427,8 +427,8 @@ class LocalTrainingService implements TrainingService {
}
private tryGetAvailableResource(): [boolean, { gpuIndices: number[]}] {
if (this.localTrailConfig === undefined) {
throw new Error('localTrailConfig is not initialized!');
if (this.localTrialConfig === undefined) {
throw new Error('localTrialConfig is not initialized!');
}
const resource: { gpuIndices: number[] } = { gpuIndices: [] };
......@@ -450,11 +450,11 @@ class LocalTrainingService implements TrainingService {
selectedGPUIndices = selectedGPUIndices.filter((index: number) => this.designatedGpuIndices.has(index));
}
if (selectedGPUIndices.length < this.localTrailConfig.gpuNum) {
if (selectedGPUIndices.length < this.localTrialConfig.gpuNum) {
return [false, resource];
}
selectedGPUIndices.splice(this.localTrailConfig.gpuNum);
selectedGPUIndices.splice(this.localTrialConfig.gpuNum);
Object.assign(resource, { gpuIndices: selectedGPUIndices });
return [true, resource];
......@@ -494,7 +494,7 @@ class LocalTrainingService implements TrainingService {
if (!success) {
break;
}
this.occupyResource(resource);
await this.runTrialJob(trialJobId, resource);
}
......@@ -512,17 +512,17 @@ class LocalTrainingService implements TrainingService {
}
}
private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] {
private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
const script: string[] = [];
if (process.platform === 'win32') {
script.push(
`cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`cmd /c ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
} else {
script.push(
`eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`eval ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`echo $? \`date +%s%3N\` >${path.join(workingDirectory, '.nni', 'state')}`);
}
......@@ -531,23 +531,23 @@ class LocalTrainingService implements TrainingService {
private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
if (this.localTrailConfig === undefined) {
if (this.localTrialConfig === undefined) {
throw new Error(`localTrialConfig not initialized!`);
}
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrailConfig.gpuNum);
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrialConfig.gpuNum);
if (this.localTrailConfig === undefined) {
if (this.localTrialConfig === undefined) {
throw new Error('trial config is not initialized');
}
const runScriptContent: string[] = [];
if (process.platform !== 'win32') {
runScriptContent.push('#!/bin/bash');
}
runScriptContent.push(`cd ${this.localTrailConfig.codeDir}`);
runScriptContent.push(`cd ${this.localTrialConfig.codeDir}`);
for (const variable of variables) {
runScriptContent.push(setEnvironmentVariable(variable));
}
const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory);
const scripts: string[] = this.getScript(this.localTrialConfig, trialJobDetail.workingDirectory);
scripts.forEach((script: string) => {
runScriptContent.push(script);
});
......
......@@ -511,12 +511,16 @@ class RemoteMachineTrainingService implements TrainingService {
// tslint:disable-next-line: no-floating-promises
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);
this.timer.subscribe(
const disposable: Rx.IDisposable = this.timer.subscribe(
async (tick: number) => {
const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
`tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
if (rmMeta.gpuSummary.gpuCount === 0) {
this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
this.timer.unsubscribe(disposable);
}
}
}
);
......
......@@ -83,7 +83,7 @@ class Compare extends React.Component<CompareProps, {}> {
},
xAxis: {
type: 'category',
// name: '# Intermeidate',
// name: '# Intermediate',
boundaryGap: false,
data: xAxis
},
......@@ -194,9 +194,9 @@ class Compare extends React.Component<CompareProps, {}> {
maskClosable={false}
width="90%"
>
<Row className="compare-intermeidate">
<Row className="compare-intermediate">
{this.intermediate()}
<Row className="compare-yAxis"># Intermeidate</Row>
<Row className="compare-yAxis"># Intermediate</Row>
</Row>
<Row>{this.initColumn()}</Row>
</Modal>
......
......@@ -9,7 +9,7 @@ import DefaultPoint from './trial-detail/DefaultMetricPoint';
import Duration from './trial-detail/Duration';
import Title1 from './overview/Title1';
import Para from './trial-detail/Para';
import Intermediate from './trial-detail/Intermeidate';
import Intermediate from './trial-detail/Intermediate';
import TableList from './trial-detail/TableList';
const TabPane = Tabs.TabPane;
import '../static/style/trialsDetail.scss';
......
......@@ -292,7 +292,7 @@ class Intermediate extends React.Component<IntermediateProps, IntermediateState>
isFilter
?
<span>
<span className="filter-x"># Intermeidate</span>
<span className="filter-x"># Intermediate</span>
<input
// placeholder="point"
ref={input => this.pointInput = input}
......@@ -321,7 +321,7 @@ class Intermediate extends React.Component<IntermediateProps, IntermediateState>
null
}
</Row>
<Row className="intermeidate-graph">
<Row className="intermediate-graph">
<ReactEcharts
option={interSource}
style={{ width: '100%', height: 418, margin: '0 auto' }}
......
......@@ -193,7 +193,7 @@ class TableList extends React.Component<TableListProps, TableListState> {
case 'Status':
case 'Operation':
case 'Default':
case 'Intermeidate count':
case 'Intermediate count':
break;
default:
finalKeys.push(checkedValues[m]);
......@@ -392,7 +392,7 @@ class TableList extends React.Component<TableListProps, TableListState> {
sorter: (a: TableObj, b: TableObj): number => a.status.localeCompare(b.status)
});
break;
case 'Intermeidate count':
case 'Intermediate count':
showColumn.push({
title: 'Intermediate count',
dataIndex: 'progress',
......
......@@ -42,7 +42,7 @@ const COLUMN_INDEX = [
index: 4
},
{
name: 'Intermeidate count',
name: 'Intermediate count',
index: 5
},
{
......@@ -57,7 +57,7 @@ const COLUMN_INDEX = [
// default selected column
const COLUMN = ['Trial No.', 'ID', 'Duration', 'Status', 'Default', 'Operation'];
// all choice column !dictory final
const COLUMNPro = ['Trial No.', 'ID', 'Duration', 'Status', 'Intermeidate count', 'Default', 'Operation'];
const COLUMNPro = ['Trial No.', 'ID', 'Duration', 'Status', 'Intermediate count', 'Default', 'Operation'];
export {
MANAGER_IP, DOWNLOAD_IP, trialJobStatus, COLUMNPro,
CONTROLTYPE, MONACO, COLUMN, COLUMN_INDEX, DRAWEROPTION
......
......@@ -24,7 +24,7 @@
}
}
.compare-intermeidate{
.compare-intermediate{
position: relative;
.compare-yAxis{
color: #333;
......
......@@ -84,8 +84,8 @@
}
}
/* for # intermediate in intermeidate graph*/
.intermeidate-graph{
/* for # intermediate in intermediate graph*/
.intermediate-graph{
position: relative;
.yAxis{
color: #333;
......
......@@ -131,5 +131,3 @@ if __name__ == '__main__':
setup_experiment(args.preinstall)
run(args)
#
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment