Unverified Commit 841d4677 authored by chicm-ms, committed by GitHub

Merge pull request #18 from microsoft/master

pull code
parents ccb2211e feb6f3b8
@@ -100,6 +100,8 @@ Targeting at openness and advancing state-of-art technology, [Microsoft Research
* [OpenPAI](https://github.com/Microsoft/pai) : an open source platform that provides complete AI model training and resource management capabilities; it is easy to extend and supports on-premise, cloud and hybrid environments at various scales.
* [FrameworkController](https://github.com/Microsoft/frameworkcontroller) : an open source general-purpose Kubernetes Pod Controller that orchestrates all kinds of applications on Kubernetes with a single controller.
* [MMdnn](https://github.com/Microsoft/MMdnn) : a comprehensive, cross-framework solution to convert, visualize and diagnose deep neural network models. The "MM" in MMdnn stands for model management and "dnn" is an acronym for deep neural network.
* [SPTAG](https://github.com/Microsoft/SPTAG) : Space Partition Tree And Graph (SPTAG) is an open source library for large-scale vector approximate nearest neighbor search scenarios.

We encourage researchers and students to leverage these projects to accelerate AI development and research.

## **Install & Verify**
...
@@ -357,13 +357,18 @@ function countFilesRecursively(directory: string, timeoutMilliSeconds?: number):
    });

    let fileCount: number = -1;
    let cmd: string;
    if (process.platform === "win32") {
        cmd = `powershell "Get-ChildItem -Path ${directory} -Recurse -File | Measure-Object | %{$_.Count}"`;
    } else {
        cmd = `find ${directory} -type f | wc -l`;
    }
    cpp.exec(cmd).then((result) => {
        if (result.stdout && parseInt(result.stdout)) {
            fileCount = parseInt(result.stdout);
        }
        deferred.resolve(fileCount);
    });

    return Promise.race([deferred.promise, delayTimeout]).finally(() => {
        clearTimeout(timeoutId);
    });
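For context, a minimal usage sketch of the reworked helper; the import path, directory value and file-count threshold are illustrative, not taken from this diff:

```ts
import { countFilesRecursively } from '../common/utils';

// countFilesRecursively now shells out to PowerShell on Windows and to
// `find | wc -l` elsewhere, racing the command against the timeout.
async function checkCodeDir(codeDir: string): Promise<void> {
    const fileCount: number = await countFilesRecursively(codeDir, 5000);
    if (fileCount > 1000) {
        throw new Error(`${codeDir} contains too many files (${fileCount})`);
    }
}
```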
@@ -459,6 +464,16 @@ function getNewLine(): string{
    }
}

/**
 * Use '/' to join paths instead of '\' on all platforms
 * @param paths path segments to join
 */
function unixPathJoin(...paths: any[]): string {
    const dir: string = paths.filter((path: any) => path !== '').join('/');
    if (dir === '') return '.';
    return dir;
}

export { countFilesRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir,
    getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, unixPathJoin,
    mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine };
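A quick behavioral note on unixPathJoin, with expected values inferred from the implementation above:

```ts
import { unixPathJoin } from '../common/utils';

// Always joins with '/', regardless of platform, and drops empty segments;
// path.join would use '\' on win32, which breaks HDFS-style paths.
unixPathJoin('alice', 'nni', 'experiments');  // -> 'alice/nni/experiments'
unixPathJoin('alice', '', 'trials');          // -> 'alice/trials'
unixPathJoin();                               // -> '.'
```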
{
    "kind": "CustomResourceDefinition",
    "spec": {
        "scope": "Namespaced",
        "version": "v1beta2",
        "group": "kubeflow.org",
        "names": {
            "kind": "PyTorchJob",
            "plural": "pytorchjobs",
            "singular": "pytorchjob"
        }
    },
    "apiVersion": "apiextensions.k8s.io/v1beta1",
    "metadata": {
        "name": "pytorchjobs.kubeflow.org"
    }
}
{
    "kind": "CustomResourceDefinition",
    "spec": {
        "scope": "Namespaced",
        "version": "v1beta2",
        "group": "kubeflow.org",
        "names": {
            "kind": "TFJob",
            "plural": "tfjobs",
            "singular": "tfjob"
        }
    },
    "apiVersion": "apiextensions.k8s.io/v1beta1",
    "metadata": {
        "name": "tfjobs.kubeflow.org"
    }
}
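These two manifests are not applied with kubectl; they are read from disk and registered with the NodeJS kubernetes-client so its fluent API exposes the new kinds. A rough sketch of that pattern (the helper name is ours; the real registration happens in the client constructors further down this diff):

```ts
import * as fs from 'fs';

// Illustrative only: load a CRD schema and register it so the client can
// address e.g. client.apis['kubeflow.org'].v1beta2.namespaces('default').tfjobs.
function registerCrd(client: any, schemaPath: string): void {
    const crdSchema: any = JSON.parse(fs.readFileSync(schemaPath, 'utf8'));
    client.addCustomResourceDefinition(crdSchema);
}
```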
@@ -25,6 +25,7 @@ import { EventEmitter } from 'events';
import { Readable, Writable } from 'stream';
import { NNIError } from '../common/errors';
import { getLogger, Logger } from '../common/log';
import { getLogDir } from '../common/utils';
import * as CommandType from './commands';

const ipcOutgoingFd: number = 3;
@@ -106,7 +107,10 @@ class IpcInterface {
                this.logger.warning('Commands jammed in buffer!');
            }
        } catch (err) {
            throw NNIError.FromError(
                err,
                `Dispatcher Error, please check the dispatcher log file for more detailed information: ${getLogDir()}/dispatcher.log . `
            );
        }
    }
...
@@ -152,7 +152,16 @@ mkDirP(getLogDir())
        console.error(`Failed to create log dir: ${err.stack}`);
    });

function getStopSignal(): any {
    if (process.platform === "win32") {
        return 'SIGBREAK';
    } else {
        return 'SIGTERM';
    }
}

process.on(getStopSignal(), async () => {
    const log: Logger = getLogger();
    let hasError: boolean = false;
    try {
...
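Windows consoles do not deliver SIGTERM to Node processes, hence the SIGBREAK substitution here. As a side note, the helper could also be typed more tightly than `any`; a sketch for reference:

```ts
// Same logic as getStopSignal() above, with a precise return type;
// NodeJS.Signals covers both 'SIGBREAK' and 'SIGTERM' literals.
function getStopSignalTyped(): NodeJS.Signals {
    return process.platform === 'win32' ? 'SIGBREAK' : 'SIGTERM';
}
```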
@@ -29,20 +29,35 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
     */
    public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
                                         operatorApiVersion: string): KubernetesCRDClient {
        switch (kubeflowOperator) {
            case 'tf-operator': {
                switch (operatorApiVersion) {
                    case 'v1alpha2': {
                        return new TFOperatorClientV1Alpha2();
                    }
                    case 'v1beta1': {
                        return new TFOperatorClientV1Beta1();
                    }
                    case 'v1beta2': {
                        return new TFOperatorClientV1Beta2();
                    }
                }
                break;
            }
            case 'pytorch-operator': {
                switch (operatorApiVersion) {
                    case 'v1alpha2': {
                        return new PyTorchOperatorClientV1Alpha2();
                    }
                    case 'v1beta1': {
                        return new PyTorchOperatorClientV1Beta1();
                    }
                    case 'v1beta2': {
                        return new PyTorchOperatorClientV1Beta2();
                    }
                }
            }
        }
        throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
    }
}
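A usage sketch of the extended factory; the operator and version values are hypothetical and would normally come from the experiment's Kubeflow cluster config:

```ts
// Any operator/version pair outside the two switches falls through and throws.
const crdClient: KubernetesCRDClient = KubeflowOperatorClient.generateOperatorClient(
    'pytorch-operator',  // KubeflowOperator
    'v1beta2'            // OperatorApiVersion, newly supported in this PR
);
```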
@@ -85,7 +100,26 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
    }
}

class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
    /**
     * constructor, to initialize tfjob CRD definition
     */
    public constructor() {
        super();
        this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/tfjob-crd-v1beta2.json', 'utf8'));
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    protected get operator(): any {
        return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').tfjobs;
    }

    public get containerName(): string {
        return 'tensorflow';
    }
}

class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    /**
     * constructor, to initialize pytorchjob CRD definition
     */
@@ -104,7 +138,7 @@ class PytorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    }
}

class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
    /**
     * constructor, to initialize pytorchjob CRD definition
     */
@@ -123,5 +157,24 @@ class PytorchOperatorClientV1Beta1 extends KubernetesCRDClient {
    }
}
class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
    /**
     * constructor, to initialize pytorchjob CRD definition
     */
    public constructor() {
        super();
        this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1beta2.json', 'utf8'));
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    protected get operator(): any {
        return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').pytorchjobs;
    }

    public get containerName(): string {
        return 'pytorch';
    }
}

export { KubeflowOperatorClient, GeneralK8sClient };
@@ -28,7 +28,7 @@ import { MethodNotImplementedError } from '../../../common/errors';
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator';
export type DistTrainRole = 'worker' | 'ps' | 'master';
export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2';

export class KubeflowClusterConfig extends KubernetesClusterConfig {
    public readonly operator: KubeflowOperator;
...
@@ -22,6 +22,7 @@ import * as fs from 'fs';
import { Deferred } from 'ts-deferred';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger } from '../../common/log';
import { unixPathJoin } from '../../common/utils';

/**
 * HDFS client utility, including copy file/directory
@@ -32,7 +33,7 @@ export namespace HDFSClientUtility {
     * @param hdfsUserName HDFS user name
     */
    function hdfsExpRootDir(hdfsUserName: string): string {
        return '/' + unixPathJoin(hdfsUserName, 'nni', 'experiments', getExperimentId());
    }

    /**
@@ -40,7 +41,7 @@
     * @param hdfsUserName HDFS user name
     */
    export function getHdfsExpCodeDir(hdfsUserName: string): string {
        return unixPathJoin(hdfsExpRootDir(hdfsUserName), 'codeDir');
    }

    /**
@@ -49,7 +50,9 @@
     * @param trialId NNI trial ID
     */
    export function getHdfsTrialWorkDir(hdfsUserName: string, trialId: string): string {
        const root: string = hdfsExpRootDir(hdfsUserName);
        return unixPathJoin(root, 'trials', trialId);
    }
...
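The practical effect of swapping path.join for unixPathJoin here: HDFS paths stay '/'-separated even when the NNI manager itself runs on Windows. With hypothetical values:

```ts
import { HDFSClientUtility } from './hdfsClientUtility';

// Assuming user 'alice', experiment id 'GkOu2' and trial id 'Hb1Wf':
const workDir: string = HDFSClientUtility.getHdfsTrialWorkDir('alice', 'Hb1Wf');
// -> '/alice/nni/experiments/GkOu2/trials/Hb1Wf' on every platform;
// with path.join on win32 this would have contained backslashes.
```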
@@ -40,7 +40,8 @@ import { delay, generateParamFileName,
    getExperimentRootDir, getIPV4Address, getVersion, uniqueString } from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { validateCodeDir, execMkdir } from '../common/util';
import { unixPathJoin } from '../../common/utils';
import { HDFSClientUtility } from './hdfsClientUtility';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
import { PAI_LOG_PATH_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiData';
@@ -406,12 +407,12 @@ class PAITrainingService implements TrainingService {
        }

        // Step 1. Prepare PAI job configuration
        const hdfsOutputDir: string = unixPathJoin(this.hdfsBaseDir, this.experimentId, trialJobId);
        const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);
        const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);

        // create tmp trial working folder locally.
        await execMkdir(trialLocalTempFolder);

        const runScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
        // Write NNI installation file to local tmp files
...
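execMkdir is imported from ../common/util but its body is not part of this diff; presumably it mirrors the countFilesRecursively change above and picks a platform-appropriate recursive mkdir. A hedged sketch of that idea:

```ts
import * as cpp from 'child-process-promise';

// Sketch only; the real execMkdir in ../common/util may differ in detail.
async function execMkdirSketch(directory: string): Promise<void> {
    if (process.platform === 'win32') {
        await cpp.exec(`powershell.exe New-Item -Path "${directory}" -ItemType "directory" -Force`);
    } else {
        await cpp.exec(`mkdir -p ${directory}`);
    }
}
```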
@@ -86,7 +86,7 @@ def convert_command():
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local'], default='pai')
    parser.add_argument("--nni_docker_image", type=str)
    parser.add_argument("--nni_manager_ip", type=str)
    # args for PAI
@@ -111,4 +111,5 @@ if __name__ == '__main__':
    args = parser.parse_args()
    update_training_service_config(args)
    if args.ts == 'local':
        convert_command()
@@ -14,7 +14,7 @@ jobs:
    displayName: 'Install dependencies for integration tests'
  - script: |
      cd test
      python generate_ts_config.py --ts local
    displayName: 'generate config files'
  - script: |
      cd test
...
jobs:
- job: 'build_docker_image'
  timeoutInMinutes: 0
  pool:
    vmImage: 'Ubuntu 16.04'
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'
  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
        cd deployment/pypi
        docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
        echo 'updating docker file for installing nni from local...'
        # update Dockerfile to install NNI in docker image from whl file built in last step
        sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
        cat ../docker/Dockerfile
        export IMG_TAG=`date -u +%y%m%d%H%M`
        echo 'build and upload docker image'
        docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
        docker push $(test_docker_img_name):$IMG_TAG
        export TEST_IMG=$(test_docker_img_name):$IMG_TAG
        cd ../../
      else
        export TEST_IMG=$(existing_docker_img)
      fi
      echo "##vso[task.setvariable variable=TEST_IMG]$TEST_IMG"
    displayName: 'build docker image'
  - script: |
      echo $TEST_IMG
      echo "##vso[task.setvariable variable=docker_image;isOutput=true]$TEST_IMG"
    name: setvariableStep
    displayName: 'set image variable'
- job: 'integration_test_pai'
  timeoutInMinutes: 0
  dependsOn: build_docker_image
  variables:
    docker_image: $[ dependencies.build_docker_image.outputs['setvariableStep.docker_image'] ]
  steps:
  - script: |
      set PATH=$(ENV_PATH)
      python --version
      powershell.exe -file install.ps1
    displayName: 'Install nni toolkit via source code'
  - script: |
      cd test
      set PATH=$(ENV_PATH)
      python --version
      python generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) --vc $(pai_virtual_cluster) --nni_docker_image $(docker_image) --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip)
      python config_test.py --ts pai --exclude multi_phase,smac,bohb
    displayName: 'Examples and advanced features tests on pai'
\ No newline at end of file
@@ -24,9 +24,9 @@ import os
import sys
import shutil
import string
from subprocess import Popen, PIPE, call, check_output, check_call, CalledProcessError
import tempfile
from nni.constants import ModuleName, AdvisorModuleName
from nni_annotation import *
from .launcher_utils import validate_all_content
from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response
@@ -344,13 +344,18 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
    '''follow steps to start rest server and start experiment'''
    nni_config = Config(config_file_name)
    # check packages for tuner
    package_name, module_name = None, None
    if experiment_config.get('tuner') and experiment_config['tuner'].get('builtinTunerName'):
        package_name = experiment_config['tuner']['builtinTunerName']
        module_name = ModuleName.get(package_name)
    elif experiment_config.get('advisor') and experiment_config['advisor'].get('builtinAdvisorName'):
        package_name = experiment_config['advisor']['builtinAdvisorName']
        module_name = AdvisorModuleName.get(package_name)
    if package_name and module_name:
        try:
            check_call([sys.executable, '-c', 'import %s'%(module_name)], stdout=PIPE, stderr=PIPE)
        except CalledProcessError as e:
            print_error('%s should be installed through \'nnictl package install --name %s\''%(package_name, package_name))
            exit(1)
    log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None
    log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None
...