Unverified Commit c5acd8c2 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #173 from microsoft/master

merge master
parents 40bae6e2 d135d184
......@@ -15,5 +15,5 @@ Assessor 从 Trial 中接收中间结果,并通过指定的算法决定此 Tri
.. toctree::
:maxdepth: 2
内置 Assessor<builtinAssessor>
自定义 Assessor<Customize_Assessor>
内置 Assessor<BuiltinAssessor>
自定义 Assessor<CustomizeAssessor>
#################
自动机器学习的经验分享
#################
.. toctree::
:maxdepth: 2
神经网络架构搜索的对比<CommunitySharings/AutomlPracticeSharing/NasComparison>
内置 Tuner
==================
.. toctree::
:maxdepth: 1
介绍<Builtin_Tuner>
TPE<hyperoptTuner>
Random Search<hyperoptTuner>
Anneal<hyperoptTuner>
Naive Evolution<evolutionTuner>
SMAC<smacTuner>
Batch Tuner<batchTuner>
Grid Search<gridsearchTuner>
Hyperband<hyperbandAdvisor>
Network Morphism<networkmorphismTuner>
Metis Tuner<metisTuner>
BOHB<bohbAdvisor>
\ No newline at end of file
......@@ -4,6 +4,6 @@
.. toctree::
:maxdepth: 1
介绍<Builtin_Assessors>
Medianstop<medianstopAssessor>
Curvefitting<curvefittingAssessor>
\ No newline at end of file
介绍<BuiltinAssessors>
Medianstop<MedianstopAssessor>
Curvefitting<CurvefittingAssessor>
\ No newline at end of file
内置 Tuner
==================
.. toctree::
:maxdepth: 1
介绍<BuiltinTuner>
TPE<HyperoptTuner>
Random Search<HyperoptTuner>
Anneal<HyperoptTuner>
Naive Evolution<EvolutionTuner>
SMAC<SmacTuner>
Batch Tuner<BatchTuner>
Grid Search<GridsearchTuner>
Hyperband<HyperbandAdvisor>
Network Morphism<NetworkmorphismTuner>
Metis Tuner<MetisTuner>
BOHB<BohbAdvisor>
\ No newline at end of file
######################
社区分享
######################
除了官方的教程和示例之外,也支持社区贡献者分享自己的自动机器学习实践经验,特别是使用 NNI 的实践经验。
.. toctree::
:maxdepth: 2
NNI 经验分享<nni_practice_sharing>
神经网络结构搜索的对比<CommunitySharings/NasComparison>
超参调优算法的对比<CommunitySharings/HpoComparison>
......@@ -3,5 +3,5 @@
###############################
.. toctree::
设置开发环境<SetupNNIDeveloperEnvironment>
贡献指南<CONTRIBUTING>
\ No newline at end of file
设置开发环境<SetupNniDeveloperEnvironment>
贡献指南<Contributing>
\ No newline at end of file
......@@ -5,8 +5,8 @@
.. toctree::
:maxdepth: 2
MNIST<mnist_examples>
Cifar10<cifar10_examples>
Scikit-learn<sklearn_examples>
EvolutionSQuAD<SQuAD_evolution_examples>
GBDT<gbdt_example>
MNIST<MnistExamples>
Cifar10<Cifar10Examples>
Scikit-learn<SklearnExamples>
EvolutionSQuAD<SquadEvolutionExamples>
GBDT<GbdtExample>
......@@ -13,10 +13,10 @@ Neural Network Intelligence(NNI)文档
概述<Overview>
入门<QuickStart>
教程<Tutorials>
例<Examples>
参考<Reference>
教程<tutorials>
例<examples>
参考<reference>
常见问答<FAQ>
贡献<Contribution>
版本日志<RELEASE>
博客<Blog/index>
贡献<contribution>
更改日志<Release>
社区经验分享<community_sharings>
#################
教程
#################
分享使用 NNI 来调优模型和系统的经验
.. toctree::
:maxdepth: 2
在 NNI 上调优 Recommenders 的 SVD<CommunitySharings/NniPracticeSharing/RecommendersSvd>
\ No newline at end of file
......@@ -4,7 +4,7 @@
.. toctree::
:maxdepth: 3
命令行<NNICTLDOC>
命令行<Nnictl>
Python API<sdk_reference>
Annotation<AnnotationSpec>
配置<ExperimentConfig>
......
......@@ -4,6 +4,6 @@ NNI 支持的训练平台介绍
.. toctree::
本机<LocalMode>
远程<RemoteMachineMode>
OpenPAI<PAIMode>
OpenPAI<PaiMode>
Kubeflow<KubeflowMode>
FrameworkController<FrameworkControllerMode>
\ No newline at end of file
......@@ -13,6 +13,6 @@ Tuner 从 Trial 接收指标结果,来评估一组超参或网络结构的性
.. toctree::
:maxdepth: 2
内置 Tuner<builtinTuner>
自定义 Tuner<Customize_Tuner>
自定义 Advisor<Customize_Advisor>
\ No newline at end of file
内置 Tuner<BuiltinTuner>
自定义 Tuner<CustomizeTuner>
自定义 Advisor<CustomizeAdvisor>
\ No newline at end of file
######################
教程
######################
.. toctree::
:maxdepth: 2
安装<Installation>
实现 Trial<Trials>
Tuner<tuners>
Assessor<assessors>
Web 界面<WebUI>
训练平台<training_services>
如何使用 Docker <HowToUseDocker>
高级功能<advanced>
如何调试<HowToDebug>
\ No newline at end of file
......@@ -15,7 +15,7 @@ $yarnUrl = "https://yarnpkg.com/latest.tar.gz"
$unzipNodeDir = "node-v*"
$unzipYarnDir = "yarn-v*"
$NNI_DEPENDENCY_FOLDER = "C:\tmp\$env:USERNAME"
$NNI_DEPENDENCY_FOLDER = [System.IO.Path]::GetTempPath()+$env:USERNAME
$WHICH_PYTHON = where.exe python
if($WHICH_PYTHON -eq $null){
......
......@@ -43,11 +43,11 @@ function getExperimentRootDir(): string {
.getLogDir();
}
function getLogDir(): string{
function getLogDir(): string {
return path.join(getExperimentRootDir(), 'log');
}
function getLogLevel(): string{
function getLogLevel(): string {
return getExperimentStartupInfo()
.getLogLevel();
}
......@@ -149,7 +149,7 @@ function parseArg(names: string[]): string {
return '';
}
function encodeCmdLineArgs(args:any):any{
function encodeCmdLineArgs(args: any): any {
if(process.platform === 'win32'){
return JSON.stringify(args);
}
......@@ -158,7 +158,7 @@ function encodeCmdLineArgs(args:any):any{
}
}
function getCmdPy():string{
function getCmdPy(): string {
let cmd = 'python3';
if(process.platform === 'win32'){
cmd = 'python';
......@@ -390,7 +390,7 @@ async function getVersion(): Promise<string> {
/**
* run command as ChildProcess
*/
function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess{
function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess {
let cmd: string = command;
let arg: string[] = [];
let newShell: boolean = true;
......@@ -411,7 +411,7 @@ function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newE
/**
* judge whether the process is alive
*/
async function isAlive(pid:any): Promise<boolean>{
async function isAlive(pid:any): Promise<boolean> {
let deferred : Deferred<boolean> = new Deferred<boolean>();
let alive: boolean = false;
if(process.platform ==='win32'){
......@@ -439,7 +439,7 @@ async function isAlive(pid:any): Promise<boolean>{
/**
* kill process
*/
async function killPid(pid:any): Promise<void>{
async function killPid(pid:any): Promise<void> {
let deferred : Deferred<void> = new Deferred<void>();
try {
if (process.platform === "win32") {
......@@ -455,7 +455,7 @@ async function killPid(pid:any): Promise<void>{
return deferred.promise;
}
function getNewLine(): string{
function getNewLine(): string {
if (process.platform === "win32") {
return "\r\n";
}
......
......@@ -58,6 +58,7 @@ class NNIManager implements Manager {
private status: NNIManagerStatus;
private waitingTrials: string[];
private trialJobs: Map<string, TrialJobDetail>;
private trialJobMetricListener: (metric: TrialJobMetric) => void;
constructor() {
this.currSubmittedTrialNum = 0;
......@@ -76,6 +77,11 @@ class NNIManager implements Manager {
status: 'INITIALIZED',
errors: []
};
this.trialJobMetricListener = (metric: TrialJobMetric) => {
this.onTrialJobMetrics(metric).catch((err: Error) => {
this.criticalError(NNIError.FromError(err, 'Job metrics error: '));
});
};
}
public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
......@@ -342,6 +348,7 @@ class NNIManager implements Manager {
if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup');
}
this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener);
this.dispatcher.sendCommand(TERMINATE);
let tunerAlive: boolean = true;
// gracefully terminate tuner and assessor here, wait at most 30 seconds.
......@@ -589,11 +596,7 @@ class NNIManager implements Manager {
if (this.dispatcher === undefined) {
throw new Error('Error: tuner or job maintainer have not been setup');
}
this.trainingService.addTrialJobMetricListener((metric: TrialJobMetric) => {
this.onTrialJobMetrics(metric).catch((err: Error) => {
this.criticalError(NNIError.FromError(err, 'Job metrics error: '));
});
});
this.trainingService.addTrialJobMetricListener(this.trialJobMetricListener);
this.dispatcher.onCommand((commandType: string, content: string) => {
this.onTunerCommand(commandType, content).catch((err: Error) => {
......
......@@ -24,7 +24,10 @@ import { getLogger } from "common/log";
import { countFilesRecursively } from '../../common/utils'
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData'
import * as os from 'os';
import * as fs from 'fs';
import { getNewLine } from '../../common/utils';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
import * as path from 'path';
import { String } from 'typescript-string-operations';
import { file } from "../../node_modules/@types/tmp";
......@@ -66,6 +69,20 @@ export async function execMkdir(directory: string): Promise<void> {
return Promise.resolve();
}
/**
 * Recursively copy `source` to `destination` with the platform's native copy command
 * (`Copy-Item -Recurse` through PowerShell on Windows, `cp -r` elsewhere).
 * Both paths are interpolated into the command line unquoted.
 * @param source path to copy from
 * @param destination path to copy to
 */
export async function execCopydir(source: string, destination: string): Promise<void> {
    const copyCommand: string = process.platform === 'win32'
        ? `powershell.exe Copy-Item ${source} -Destination ${destination} -Recurse`
        : `cp -r ${source} ${destination}`;
    await cpp.exec(copyCommand);
}
/**
* create a new file
* @param filename
......@@ -91,8 +108,6 @@ export function execScript(filePath: string): cp.ChildProcess {
}
}
/**
* output the last line of a file
* @param filePath
......@@ -111,9 +126,9 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
* delete a directory
* @param directory
*/
export async function execRemove(directory: string): Promise<void>{
export async function execRemove(directory: string): Promise<void> {
if (process.platform === 'win32') {
await cpp.exec(`powershell.exe Remove-Item ${directory}`);
await cpp.exec(`powershell.exe Remove-Item ${directory} -Recurse -Force`);
} else {
await cpp.exec(`rm -rf ${directory}`);
}
......@@ -124,7 +139,7 @@ export async function execRemove(directory: string): Promise<void>{
* kill a process
* @param pid
*/
export async function execKill(pid: string): Promise<void>{
export async function execKill(pid: string): Promise<void> {
if (process.platform === 'win32') {
await cpp.exec(`cmd /c taskkill /PID ${pid} /T /F`);
} else {
......@@ -138,7 +153,7 @@ export async function execKill(pid: string): Promise<void>{
* @param variable
* @returns command string
*/
export function setEnvironmentVariable(variable: { key: string; value: string }): string{
export function setEnvironmentVariable(variable: { key: string; value: string }): string {
if (process.platform === 'win32') {
return `$env:${variable.key}="${variable.value}"`;
}
......@@ -147,6 +162,32 @@ export function setEnvironmentVariable(variable: { key: string; value: string })
}
}
/**
 * Compress the files under a directory into a gzipped tar file.
 *
 * On Windows a small Python script based on `tarfile` is generated and run with `python`;
 * on other platforms the system `tar` command is invoked. On both platforms archive
 * entries are stored relative to `source_path`, so nested directories are preserved.
 * @param tar_path path of the tar file to create
 * @param source_path directory whose contents are added to the archive
 */
export async function tarAdd(tar_path: string, source_path: string): Promise<void> {
    if (process.platform !== 'win32') {
        await cpp.exec(`tar -czf ${tar_path} -C ${source_path} .`);
        return;
    }

    // Double the backslashes so the Windows paths survive being embedded in Python string literals.
    const pyTarPath: string = tar_path.split('\\').join('\\\\');
    const pySourcePath: string = source_path.split('\\').join('\\\\');
    const script: string[] = [
        `import os`,
        `import tarfile`,
        `root = "${pySourcePath}"`,
        `tar = tarfile.open("${pyTarPath}", "w:gz")`,
        `for dirpath, dirnames, filenames in os.walk(root):`,
        `    for filename in filenames:`,
        `        fullpath = os.path.join(dirpath, filename)`,
        // Keep the path relative to the source root; using the bare file name would
        // flatten sub-directories and let same-named files overwrite each other.
        `        tar.add(fullpath, arcname=os.path.relpath(fullpath, root))`,
        `tar.close()`
    ];

    // Write the script into a freshly created directory so concurrent calls cannot clobber each other's script.
    const scriptDir: string = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'nni-tar-'));
    const tarScript: string = path.join(scriptDir, 'tar.py');
    try {
        await fs.promises.writeFile(tarScript, script.join(getNewLine()), { encoding: 'utf8' });
        // Quote the script path: the temp directory may contain spaces.
        await cpp.exec(`python "${tarScript}"`);
    } finally {
        // Best-effort cleanup of the temporary script; a failure here must not mask the tar result.
        await fs.promises.unlink(tarScript).catch(() => undefined);
        await fs.promises.rmdir(scriptDir).catch(() => undefined);
    }
}
/**
* generate script file name
......
......@@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer';
import {
HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, NNIManagerIpConfig
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir,getIPV4Address } from '../../common/utils';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir,getIPV4Address, getVersion, unixPathJoin } from '../../common/utils';
import { GPUSummary } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
......@@ -48,10 +48,9 @@ import {
} from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util';
import { validateCodeDir, execRemove, execMkdir, execCopydir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { mkDirP, getVersion } from '../../common/utils';
/**
* Training Service implementation for Remote Machine (Linux)
......@@ -234,7 +233,7 @@ class RemoteMachineTrainingService implements TrainingService {
} else if (form.jobType === 'TRIAL') {
// Generate trial job id(random)
const trialJobId: string = uniqueString(5);
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId);
const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId);
const trialJobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(
trialJobId,
......@@ -354,7 +353,7 @@ class RemoteMachineTrainingService implements TrainingService {
case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value);
//remove local temp files
await cpp.exec(`rm -rf ${this.getLocalGpuMetricCollectorDir()}`);
await execRemove(this.getLocalGpuMetricCollectorDir());
break;
case TrialConfigMetadataKey.TRIAL_CONFIG:
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
......@@ -417,7 +416,7 @@ class RemoteMachineTrainingService implements TrainingService {
private async cleanupConnections(): Promise<void> {
try{
for (const [rmMeta, sshClientManager] of this.machineSSHClientMap.entries()) {
let jobpidPath: string = path.join(this.getRemoteScriptsPath(rmMeta.username), 'pid');
let jobpidPath: string = unixPathJoin(this.getRemoteScriptsPath(rmMeta.username), 'pid');
let client: Client | undefined = sshClientManager.getFirstSSHClient();
if(client) {
await SSHClientUtility.remoteExeCommand(`pkill -P \`cat ${jobpidPath}\``, client);
......@@ -438,7 +437,7 @@ class RemoteMachineTrainingService implements TrainingService {
*/
private getLocalGpuMetricCollectorDir(): string {
let userName: string = path.basename(os.homedir()); //get current user name of os
return `${os.tmpdir()}/${userName}/nni/scripts/`;
return path.join(os.tmpdir(), userName, 'nni', 'scripts');
}
/**
......@@ -447,14 +446,14 @@ class RemoteMachineTrainingService implements TrainingService {
*/
private async generateGpuMetricsCollectorScript(userName: string): Promise<void> {
let gpuMetricCollectorScriptFolder : string = this.getLocalGpuMetricCollectorDir();
await cpp.exec(`mkdir -p ${path.join(gpuMetricCollectorScriptFolder, userName)}`);
await execMkdir(path.join(gpuMetricCollectorScriptFolder, userName));
//generate gpu_metrics_collector.sh
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT_LINUX,
remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'),
unixPathJoin(remoteGPUScriptsDir, 'pid'),
);
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
}
......@@ -481,7 +480,7 @@ class RemoteMachineTrainingService implements TrainingService {
private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, conn: Client): Promise<void> {
// Create root working directory after ssh connection is ready
await this.generateGpuMetricsCollectorScript(rmMeta.username); //generate gpu script in local machine first, will copy to remote machine later
const nniRootDir: string = `${os.tmpdir()}/nni`;
const nniRootDir: string = unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni');
await SSHClientUtility.remoteExeCommand(`mkdir -p ${this.remoteExpRootDir}`, conn);
// Copy NNI scripts to remote experiment working directory
......@@ -490,15 +489,15 @@ class RemoteMachineTrainingService implements TrainingService {
await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteGpuScriptCollectorDir}`, conn);
await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn);
//copy gpu_metrics_collector.sh to remote
await SSHClientUtility.copyFileToRemote(path.join(localGpuScriptCollectorDir, rmMeta.username, 'gpu_metrics_collector.sh'), path.join(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh'), conn);
await SSHClientUtility.copyFileToRemote(path.join(localGpuScriptCollectorDir, rmMeta.username, 'gpu_metrics_collector.sh'), unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh'), conn);
//Begin to execute gpu_metrics_collection scripts
SSHClientUtility.remoteExeCommand(`bash ${path.join(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);
this.timer.subscribe(
async (tick: number) => {
const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
`tail -n 1 ${path.join(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
`tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
if (cmdresult && cmdresult.stdout) {
rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
}
......@@ -531,7 +530,7 @@ class RemoteMachineTrainingService implements TrainingService {
} else if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED
&& rmScheduleResult.scheduleInfo !== undefined) {
const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo;
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId);
const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId);
trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
......@@ -575,7 +574,7 @@ class RemoteMachineTrainingService implements TrainingService {
const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${trialWorkingFolder}`, sshClient);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${path.join(trialWorkingFolder, '.nni')}`, sshClient);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${unixPathJoin(trialWorkingFolder, '.nni')}`, sshClient);
// RemoteMachineRunShellFormat is the run shell format string,
// See definition in remoteMachineData.ts
......@@ -603,20 +602,20 @@ class RemoteMachineTrainingService implements TrainingService {
getExperimentId(),
trialJobDetail.sequenceId.toString(),
this.isMultiPhase,
path.join(trialWorkingFolder, '.nni', 'jobpid'),
unixPathJoin(trialWorkingFolder, '.nni', 'jobpid'),
command,
nniManagerIp,
this.remoteRestServerPort,
version,
this.logCollection,
path.join(trialWorkingFolder, '.nni', 'code')
unixPathJoin(trialWorkingFolder, '.nni', 'code')
)
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`);
await execMkdir(path.join(trialLocalTempFolder, '.nni'));
//copy files in codeDir to the local tmp trial working folder.
await cpp.exec(`cp -r ${this.trialConfig.codeDir}/* ${trialLocalTempFolder}`);
await execCopydir(path.join(this.trialConfig.codeDir, '*'), trialLocalTempFolder);
const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' });
......@@ -626,7 +625,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS);
// Execute command in remote machine
SSHClientUtility.remoteExeCommand(`bash ${path.join(trialWorkingFolder, 'run.sh')}`, sshClient);
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient);
}
private async runHostJob(form: HostJobApplicationForm): Promise<TrialJobDetail> {
......@@ -646,8 +645,8 @@ class RemoteMachineTrainingService implements TrainingService {
);
await fs.promises.writeFile(path.join(localDir, 'run.sh'), runScriptContent, { encoding: 'utf8' });
await SSHClientUtility.copyFileToRemote(
path.join(localDir, 'run.sh'), path.join(remoteDir, 'run.sh'), sshClient);
SSHClientUtility.remoteExeCommand(`bash ${path.join(remoteDir, 'run.sh')}`, sshClient);
path.join(localDir, 'run.sh'), unixPathJoin(remoteDir, 'run.sh'), sshClient);
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteDir, 'run.sh')}`, sshClient);
const jobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(
jobId, 'RUNNING', Date.now(), remoteDir, form, this.generateSequenceId()
......@@ -672,7 +671,7 @@ class RemoteMachineTrainingService implements TrainingService {
private async updateTrialJobStatus(trialJob: RemoteMachineTrialJobDetail, sshClient: Client): Promise<TrialJobDetail> {
const deferred: Deferred<TrialJobDetail> = new Deferred<TrialJobDetail>();
const jobpidPath: string = this.getJobPidPath(trialJob.id);
const trialReturnCodeFilePath: string = path.join(this.remoteExpRootDir, 'trials', trialJob.id, '.nni', 'code');
const trialReturnCodeFilePath: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJob.id, '.nni', 'code');
try {
const killResult: number = (await SSHClientUtility.remoteExeCommand(`kill -0 \`cat ${jobpidPath}\``, sshClient)).exitCode;
// if the process of jobpid is not alive any more
......@@ -712,15 +711,15 @@ class RemoteMachineTrainingService implements TrainingService {
}
private getRemoteScriptsPath(userName: string): string {
return path.join(getRemoteTmpDir(this.remoteOS), userName, 'nni', 'scripts');
return unixPathJoin(getRemoteTmpDir(this.remoteOS), userName, 'nni', 'scripts');
}
private getHostJobRemoteDir(jobId: string): string {
return path.join(this.remoteExpRootDir, 'hostjobs', jobId);
return unixPathJoin(this.remoteExpRootDir, 'hostjobs', jobId);
}
private getRemoteExperimentRootDir(): string{
return path.join(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId());
return unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId());
}
public get MetricsEmitter() : EventEmitter {
......@@ -735,9 +734,9 @@ class RemoteMachineTrainingService implements TrainingService {
let jobpidPath: string;
if (trialJobDetail.form.jobType === 'TRIAL') {
jobpidPath = path.join(trialJobDetail.workingDirectory, '.nni', 'jobpid');
jobpidPath = unixPathJoin(trialJobDetail.workingDirectory, '.nni', 'jobpid');
} else if (trialJobDetail.form.jobType === 'HOST') {
jobpidPath = path.join(this.getHostJobRemoteDir(jobId), 'jobpid');
jobpidPath = unixPathJoin(this.getHostJobRemoteDir(jobId), 'jobpid');
} else {
throw new Error(`Job type not supported: ${trialJobDetail.form.jobType}`);
}
......@@ -751,14 +750,14 @@ class RemoteMachineTrainingService implements TrainingService {
throw new Error('sshClient is undefined.');
}
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId);
const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId);
const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId);
const fileName: string = generateParamFileName(hyperParameters);
const localFilepath: string = path.join(trialLocalTempFolder, fileName);
await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' });
await SSHClientUtility.copyFileToRemote(localFilepath, path.join(trialWorkingFolder, fileName), sshClient);
await SSHClientUtility.copyFileToRemote(localFilepath, unixPathJoin(trialWorkingFolder, fileName), sshClient);
}
private generateSequenceId(): number {
......
......@@ -28,8 +28,9 @@ import * as stream from 'stream';
import { Deferred } from 'ts-deferred';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger, Logger } from '../../common/log';
import { uniqueString, getRemoteTmpDir } from '../../common/utils';
import { uniqueString, getRemoteTmpDir, unixPathJoin } from '../../common/utils';
import { RemoteCommandResult } from './remoteMachineData';
import { execRemove, tarAdd } from '../common/util';
/**
*
......@@ -47,13 +48,13 @@ export namespace SSHClientUtility {
const deferred: Deferred<void> = new Deferred<void>();
const tmpTarName: string = `${uniqueString(10)}.tar.gz`;
const localTarPath: string = path.join(os.tmpdir(), tmpTarName);
const remoteTarPath: string = path.join(getRemoteTmpDir(remoteOS), tmpTarName);
const remoteTarPath: string = unixPathJoin(getRemoteTmpDir(remoteOS), tmpTarName);
// Compress files in local directory to experiment root directory
await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`);
await tarAdd(localTarPath, localDirectory);
// Copy the compressed file to remoteDirectory and delete it
await copyFileToRemote(localTarPath, remoteTarPath, sshClient);
await cpp.exec(`rm ${localTarPath}`);
await execRemove(localTarPath);
// Decompress the remote compressed file in and delete it
await remoteExeCommand(`tar -oxzf ${remoteTarPath} -C ${remoteDirectory}`, sshClient);
await remoteExeCommand(`rm ${remoteTarPath}`, sshClient);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment