"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "f61869d90d0819d867fb3eaeb23cd28ebedf5741"
Unverified Commit c5acd8c2 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #173 from microsoft/master

merge master
parents 40bae6e2 d135d184
...@@ -15,5 +15,5 @@ Assessor 从 Trial 中接收中间结果,并通过指定的算法决定此 Tri ...@@ -15,5 +15,5 @@ Assessor 从 Trial 中接收中间结果,并通过指定的算法决定此 Tri
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
内置 Assessor<builtinAssessor> 内置 Assessor<BuiltinAssessor>
自定义 Assessor<Customize_Assessor> 自定义 Assessor<CustomizeAssessor>
#################
自动机器学习的经验分享
#################
.. toctree::
:maxdepth: 2
神经网络架构搜索的对比<CommunitySharings/AutomlPracticeSharing/NasComparison>
内置 Tuner
==================
.. toctree::
:maxdepth: 1
介绍<Builtin_Tuner>
TPE<hyperoptTuner>
Random Search<hyperoptTuner>
Anneal<hyperoptTuner>
Naive Evolution<evolutionTuner>
SMAC<smacTuner>
Batch Tuner<batchTuner>
Grid Search<gridsearchTuner>
Hyperband<hyperbandAdvisor>
Network Morphism<networkmorphismTuner>
Metis Tuner<metisTuner>
BOHB<bohbAdvisor>
\ No newline at end of file
...@@ -4,6 +4,6 @@ ...@@ -4,6 +4,6 @@
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
介绍<Builtin_Assessors> 介绍<BuiltinAssessors>
Medianstop<medianstopAssessor> Medianstop<MedianstopAssessor>
Curvefitting<curvefittingAssessor> Curvefitting<CurvefittingAssessor>
\ No newline at end of file \ No newline at end of file
内置 Tuner
==================
.. toctree::
:maxdepth: 1
介绍<BuiltinTuner>
TPE<HyperoptTuner>
Random Search<HyperoptTuner>
Anneal<HyperoptTuner>
Naive Evolution<EvolutionTuner>
SMAC<SmacTuner>
Batch Tuner<BatchTuner>
Grid Search<GridsearchTuner>
Hyperband<HyperbandAdvisor>
Network Morphism<NetworkmorphismTuner>
Metis Tuner<MetisTuner>
BOHB<BohbAdvisor>
\ No newline at end of file
######################
社区分享
######################
除了官方的教程和示例之外,也支持社区贡献者分享自己的自动机器学习实践经验,特别是使用 NNI 的实践经验。
.. toctree::
:maxdepth: 2
NNI 经验分享<nni_practice_sharing>
神经网络结构搜索的对比<CommunitySharings/NasComparison>
超参调优算法的对比<CommunitySharings/HpoComparison>
...@@ -3,5 +3,5 @@ ...@@ -3,5 +3,5 @@
############################### ###############################
.. toctree:: .. toctree::
设置开发环境<SetupNNIDeveloperEnvironment> 设置开发环境<SetupNniDeveloperEnvironment>
贡献指南<CONTRIBUTING> 贡献指南<Contributing>
\ No newline at end of file \ No newline at end of file
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
MNIST<mnist_examples> MNIST<MnistExamples>
Cifar10<cifar10_examples> Cifar10<Cifar10Examples>
Scikit-learn<sklearn_examples> Scikit-learn<SklearnExamples>
EvolutionSQuAD<SQuAD_evolution_examples> EvolutionSQuAD<SquadEvolutionExamples>
GBDT<gbdt_example> GBDT<GbdtExample>
...@@ -13,10 +13,10 @@ Neural Network Intelligence(NNI)文档 ...@@ -13,10 +13,10 @@ Neural Network Intelligence(NNI)文档
概述<Overview> 概述<Overview>
入门<QuickStart> 入门<QuickStart>
教程<Tutorials> 教程<tutorials>
例<Examples> 例<examples>
参考<Reference> 参考<reference>
常见问答<FAQ> 常见问答<FAQ>
贡献<Contribution> 贡献<contribution>
版本日志<RELEASE> 更改日志<Release>
博客<Blog/index> 社区经验分享<community_sharings>
#################
教程
#################
分享使用 NNI 来调优模型和系统的经验
.. toctree::
:maxdepth: 2
在 NNI 上调优 Recommenders 的 SVD<CommunitySharings/NniPracticeSharing/RecommendersSvd>
\ No newline at end of file
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
.. toctree:: .. toctree::
:maxdepth: 3 :maxdepth: 3
命令行<NNICTLDOC> 命令行<Nnictl>
Python API<sdk_reference> Python API<sdk_reference>
Annotation<AnnotationSpec> Annotation<AnnotationSpec>
配置<ExperimentConfig> 配置<ExperimentConfig>
......
...@@ -4,6 +4,6 @@ NNI 支持的训练平台介绍 ...@@ -4,6 +4,6 @@ NNI 支持的训练平台介绍
.. toctree:: .. toctree::
本机<LocalMode> 本机<LocalMode>
远程<RemoteMachineMode> 远程<RemoteMachineMode>
OpenPAI<PAIMode> OpenPAI<PaiMode>
Kubeflow<KubeflowMode> Kubeflow<KubeflowMode>
FrameworkController<FrameworkControllerMode> FrameworkController<FrameworkControllerMode>
\ No newline at end of file
...@@ -13,6 +13,6 @@ Tuner 从 Trial 接收指标结果,来评估一组超参或网络结构的性 ...@@ -13,6 +13,6 @@ Tuner 从 Trial 接收指标结果,来评估一组超参或网络结构的性
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
内置 Tuner<builtinTuner> 内置 Tuner<BuiltinTuner>
自定义 Tuner<Customize_Tuner> 自定义 Tuner<CustomizeTuner>
自定义 Advisor<Customize_Advisor> 自定义 Advisor<CustomizeAdvisor>
\ No newline at end of file \ No newline at end of file
######################
教程
######################
.. toctree::
:maxdepth: 2
安装<Installation>
实现 Trial<Trials>
Tuner<tuners>
Assessor<assessors>
Web 界面<WebUI>
训练平台<training_services>
如何使用 Docker <HowToUseDocker>
高级功能<advanced>
如何调试<HowToDebug>
\ No newline at end of file
...@@ -15,7 +15,7 @@ $yarnUrl = "https://yarnpkg.com/latest.tar.gz" ...@@ -15,7 +15,7 @@ $yarnUrl = "https://yarnpkg.com/latest.tar.gz"
$unzipNodeDir = "node-v*" $unzipNodeDir = "node-v*"
$unzipYarnDir = "yarn-v*" $unzipYarnDir = "yarn-v*"
$NNI_DEPENDENCY_FOLDER = "C:\tmp\$env:USERNAME" $NNI_DEPENDENCY_FOLDER = [System.IO.Path]::GetTempPath()+$env:USERNAME
$WHICH_PYTHON = where.exe python $WHICH_PYTHON = where.exe python
if($WHICH_PYTHON -eq $null){ if($WHICH_PYTHON -eq $null){
......
...@@ -43,11 +43,11 @@ function getExperimentRootDir(): string { ...@@ -43,11 +43,11 @@ function getExperimentRootDir(): string {
.getLogDir(); .getLogDir();
} }
function getLogDir(): string{ function getLogDir(): string {
return path.join(getExperimentRootDir(), 'log'); return path.join(getExperimentRootDir(), 'log');
} }
function getLogLevel(): string{ function getLogLevel(): string {
return getExperimentStartupInfo() return getExperimentStartupInfo()
.getLogLevel(); .getLogLevel();
} }
...@@ -149,7 +149,7 @@ function parseArg(names: string[]): string { ...@@ -149,7 +149,7 @@ function parseArg(names: string[]): string {
return ''; return '';
} }
function encodeCmdLineArgs(args:any):any{ function encodeCmdLineArgs(args: any): any {
if(process.platform === 'win32'){ if(process.platform === 'win32'){
return JSON.stringify(args); return JSON.stringify(args);
} }
...@@ -158,7 +158,7 @@ function encodeCmdLineArgs(args:any):any{ ...@@ -158,7 +158,7 @@ function encodeCmdLineArgs(args:any):any{
} }
} }
function getCmdPy():string{ function getCmdPy(): string {
let cmd = 'python3'; let cmd = 'python3';
if(process.platform === 'win32'){ if(process.platform === 'win32'){
cmd = 'python'; cmd = 'python';
...@@ -390,7 +390,7 @@ async function getVersion(): Promise<string> { ...@@ -390,7 +390,7 @@ async function getVersion(): Promise<string> {
/** /**
* run command as ChildProcess * run command as ChildProcess
*/ */
function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess{ function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess {
let cmd: string = command; let cmd: string = command;
let arg: string[] = []; let arg: string[] = [];
let newShell: boolean = true; let newShell: boolean = true;
...@@ -411,7 +411,7 @@ function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newE ...@@ -411,7 +411,7 @@ function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newE
/** /**
* judge whether the process is alive * judge whether the process is alive
*/ */
async function isAlive(pid:any): Promise<boolean>{ async function isAlive(pid:any): Promise<boolean> {
let deferred : Deferred<boolean> = new Deferred<boolean>(); let deferred : Deferred<boolean> = new Deferred<boolean>();
let alive: boolean = false; let alive: boolean = false;
if(process.platform ==='win32'){ if(process.platform ==='win32'){
...@@ -439,7 +439,7 @@ async function isAlive(pid:any): Promise<boolean>{ ...@@ -439,7 +439,7 @@ async function isAlive(pid:any): Promise<boolean>{
/** /**
* kill process * kill process
*/ */
async function killPid(pid:any): Promise<void>{ async function killPid(pid:any): Promise<void> {
let deferred : Deferred<void> = new Deferred<void>(); let deferred : Deferred<void> = new Deferred<void>();
try { try {
if (process.platform === "win32") { if (process.platform === "win32") {
...@@ -455,7 +455,7 @@ async function killPid(pid:any): Promise<void>{ ...@@ -455,7 +455,7 @@ async function killPid(pid:any): Promise<void>{
return deferred.promise; return deferred.promise;
} }
function getNewLine(): string{ function getNewLine(): string {
if (process.platform === "win32") { if (process.platform === "win32") {
return "\r\n"; return "\r\n";
} }
......
...@@ -58,7 +58,8 @@ class NNIManager implements Manager { ...@@ -58,7 +58,8 @@ class NNIManager implements Manager {
private status: NNIManagerStatus; private status: NNIManagerStatus;
private waitingTrials: string[]; private waitingTrials: string[];
private trialJobs: Map<string, TrialJobDetail>; private trialJobs: Map<string, TrialJobDetail>;
private trialJobMetricListener: (metric: TrialJobMetric) => void;
constructor() { constructor() {
this.currSubmittedTrialNum = 0; this.currSubmittedTrialNum = 0;
this.trialConcurrencyChange = 0; this.trialConcurrencyChange = 0;
...@@ -76,6 +77,11 @@ class NNIManager implements Manager { ...@@ -76,6 +77,11 @@ class NNIManager implements Manager {
status: 'INITIALIZED', status: 'INITIALIZED',
errors: [] errors: []
}; };
this.trialJobMetricListener = (metric: TrialJobMetric) => {
this.onTrialJobMetrics(metric).catch((err: Error) => {
this.criticalError(NNIError.FromError(err, 'Job metrics error: '));
});
};
} }
public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> { public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
...@@ -342,6 +348,7 @@ class NNIManager implements Manager { ...@@ -342,6 +348,7 @@ class NNIManager implements Manager {
if (this.dispatcher === undefined) { if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup'); throw new Error('Error: tuner has not been setup');
} }
this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener);
this.dispatcher.sendCommand(TERMINATE); this.dispatcher.sendCommand(TERMINATE);
let tunerAlive: boolean = true; let tunerAlive: boolean = true;
// gracefully terminate tuner and assessor here, wait at most 30 seconds. // gracefully terminate tuner and assessor here, wait at most 30 seconds.
...@@ -589,11 +596,7 @@ class NNIManager implements Manager { ...@@ -589,11 +596,7 @@ class NNIManager implements Manager {
if (this.dispatcher === undefined) { if (this.dispatcher === undefined) {
throw new Error('Error: tuner or job maintainer have not been setup'); throw new Error('Error: tuner or job maintainer have not been setup');
} }
this.trainingService.addTrialJobMetricListener((metric: TrialJobMetric) => { this.trainingService.addTrialJobMetricListener(this.trialJobMetricListener);
this.onTrialJobMetrics(metric).catch((err: Error) => {
this.criticalError(NNIError.FromError(err, 'Job metrics error: '));
});
});
this.dispatcher.onCommand((commandType: string, content: string) => { this.dispatcher.onCommand((commandType: string, content: string) => {
this.onTunerCommand(commandType, content).catch((err: Error) => { this.onTunerCommand(commandType, content).catch((err: Error) => {
......
...@@ -24,7 +24,10 @@ import { getLogger } from "common/log"; ...@@ -24,7 +24,10 @@ import { getLogger } from "common/log";
import { countFilesRecursively } from '../../common/utils' import { countFilesRecursively } from '../../common/utils'
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as cp from 'child_process'; import * as cp from 'child_process';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData' import * as os from 'os';
import * as fs from 'fs';
import { getNewLine } from '../../common/utils';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
import * as path from 'path'; import * as path from 'path';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { file } from "../../node_modules/@types/tmp"; import { file } from "../../node_modules/@types/tmp";
...@@ -66,6 +69,20 @@ export async function execMkdir(directory: string): Promise<void> { ...@@ -66,6 +69,20 @@ export async function execMkdir(directory: string): Promise<void> {
return Promise.resolve(); return Promise.resolve();
} }
/**
 * Recursively copy `source` to `destination` using the platform's native copy command.
 *
 * Both arguments are interpolated into the command line exactly as given (no quoting),
 * so callers are responsible for passing values that are safe on a command line.
 * @param source path (or path pattern) to copy from
 * @param destination path to copy to
 */
export async function execCopydir(source: string, destination: string): Promise<void> {
    // PowerShell's Copy-Item on Windows, plain `cp -r` everywhere else.
    const copyCommand: string = process.platform === 'win32'
        ? `powershell.exe Copy-Item ${source} -Destination ${destination} -Recurse`
        : `cp -r ${source} ${destination}`;
    await cpp.exec(copyCommand);
}
/** /**
* create a new file * create a new file
* @param filename * @param filename
...@@ -91,8 +108,6 @@ export function execScript(filePath: string): cp.ChildProcess { ...@@ -91,8 +108,6 @@ export function execScript(filePath: string): cp.ChildProcess {
} }
} }
/** /**
* output the last line of a file * output the last line of a file
* @param filePath * @param filePath
...@@ -111,9 +126,9 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis ...@@ -111,9 +126,9 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
* delete a directory * delete a directory
* @param directory * @param directory
*/ */
export async function execRemove(directory: string): Promise<void>{ export async function execRemove(directory: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
await cpp.exec(`powershell.exe Remove-Item ${directory}`); await cpp.exec(`powershell.exe Remove-Item ${directory} -Recurse -Force`);
} else { } else {
await cpp.exec(`rm -rf ${directory}`); await cpp.exec(`rm -rf ${directory}`);
} }
...@@ -124,7 +139,7 @@ export async function execRemove(directory: string): Promise<void>{ ...@@ -124,7 +139,7 @@ export async function execRemove(directory: string): Promise<void>{
* kill a process * kill a process
* @param directory * @param directory
*/ */
export async function execKill(pid: string): Promise<void>{ export async function execKill(pid: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
await cpp.exec(`cmd /c taskkill /PID ${pid} /T /F`); await cpp.exec(`cmd /c taskkill /PID ${pid} /T /F`);
} else { } else {
...@@ -138,7 +153,7 @@ export async function execKill(pid: string): Promise<void>{ ...@@ -138,7 +153,7 @@ export async function execKill(pid: string): Promise<void>{
* @param variable * @param variable
* @returns command string * @returns command string
*/ */
export function setEnvironmentVariable(variable: { key: string; value: string }): string{ export function setEnvironmentVariable(variable: { key: string; value: string }): string {
if (process.platform === 'win32') { if (process.platform === 'win32') {
return `$env:${variable.key}="${variable.value}"`; return `$env:${variable.key}="${variable.value}"`;
} }
...@@ -147,6 +162,32 @@ export function setEnvironmentVariable(variable: { key: string; value: string }) ...@@ -147,6 +162,32 @@ export function setEnvironmentVariable(variable: { key: string; value: string })
} }
} }
/**
 * Compress all files under a directory into a gzip-compressed tar archive.
 *
 * On Windows a small Python script is generated and executed with `python`
 * (the `tarfile` module does the work); on other platforms the `tar` command is used.
 * In both cases archive members are stored relative to `source_path`, so the
 * directory layout is preserved inside the archive.
 * @param tar_path path of the archive file to create
 * @param source_path directory whose contents are added to the archive
 */
export async function tarAdd(tar_path: string, source_path: string): Promise<void> {
    if (process.platform === 'win32') {
        // Double every backslash so the paths survive being embedded in Python string literals.
        const escapedTarPath: string = tar_path.split('\\').join('\\\\');
        const escapedSourcePath: string = source_path.split('\\').join('\\\\');
        const script: string[] = [
            `import os`,
            `import tarfile`,
            `source = "${escapedSourcePath}"`,
            `tar = tarfile.open("${escapedTarPath}","w:gz")`,
            `for root,dirs,files in os.walk(source):`,
            `    for name in files:`,
            `        fullpath = os.path.join(root,name)`,
            // Store the path relative to the source directory; using only the file name
            // would flatten the tree and make same-named files overwrite each other.
            `        tar.add(fullpath, arcname=os.path.relpath(fullpath, source))`,
            `tar.close()`
        ];
        // Use a private temporary directory so concurrent calls never share a script file.
        const scriptDir: string = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'nni-tar-'));
        const tarScript: string = path.join(scriptDir, 'tar.py');
        try {
            await fs.promises.writeFile(tarScript, script.join(getNewLine()), { encoding: 'utf8', mode: 0o777 });
            // Quote the script path: the temp directory may contain spaces (e.g. in the user name).
            await cpp.exec(`python "${tarScript}"`);
        } finally {
            // Best-effort cleanup; a failure to delete must not mask the real result or error.
            await fs.promises.unlink(tarScript).catch(() => undefined);
            await fs.promises.rmdir(scriptDir).catch(() => undefined);
        }
    } else {
        await cpp.exec(`tar -czf ${tar_path} -C ${source_path} .`);
    }
}
/** /**
* generate script file name * generate script file name
......
...@@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer'; ...@@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer';
import { import {
HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, NNIManagerIpConfig HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, NNIManagerIpConfig
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir,getIPV4Address } from '../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir,getIPV4Address, getVersion, unixPathJoin } from '../../common/utils';
import { GPUSummary } from '../common/gpuData'; import { GPUSummary } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig'; import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
...@@ -48,10 +48,9 @@ import { ...@@ -48,10 +48,9 @@ import {
} from './remoteMachineData'; } from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData'; import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility'; import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util'; import { validateCodeDir, execRemove, execMkdir, execCopydir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer'; import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { mkDirP, getVersion } from '../../common/utils';
/** /**
* Training Service implementation for Remote Machine (Linux) * Training Service implementation for Remote Machine (Linux)
...@@ -234,7 +233,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -234,7 +233,7 @@ class RemoteMachineTrainingService implements TrainingService {
} else if (form.jobType === 'TRIAL') { } else if (form.jobType === 'TRIAL') {
// Generate trial job id(random) // Generate trial job id(random)
const trialJobId: string = uniqueString(5); const trialJobId: string = uniqueString(5);
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId);
const trialJobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail( const trialJobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(
trialJobId, trialJobId,
...@@ -354,7 +353,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -354,7 +353,7 @@ class RemoteMachineTrainingService implements TrainingService {
case TrialConfigMetadataKey.MACHINE_LIST: case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value); await this.setupConnections(value);
//remove local temp files //remove local temp files
await cpp.exec(`rm -rf ${this.getLocalGpuMetricCollectorDir()}`); await execRemove(this.getLocalGpuMetricCollectorDir());
break; break;
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG:
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value); const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
...@@ -417,7 +416,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -417,7 +416,7 @@ class RemoteMachineTrainingService implements TrainingService {
private async cleanupConnections(): Promise<void> { private async cleanupConnections(): Promise<void> {
try{ try{
for (const [rmMeta, sshClientManager] of this.machineSSHClientMap.entries()) { for (const [rmMeta, sshClientManager] of this.machineSSHClientMap.entries()) {
let jobpidPath: string = path.join(this.getRemoteScriptsPath(rmMeta.username), 'pid'); let jobpidPath: string = unixPathJoin(this.getRemoteScriptsPath(rmMeta.username), 'pid');
let client: Client | undefined = sshClientManager.getFirstSSHClient(); let client: Client | undefined = sshClientManager.getFirstSSHClient();
if(client) { if(client) {
await SSHClientUtility.remoteExeCommand(`pkill -P \`cat ${jobpidPath}\``, client); await SSHClientUtility.remoteExeCommand(`pkill -P \`cat ${jobpidPath}\``, client);
...@@ -438,7 +437,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -438,7 +437,7 @@ class RemoteMachineTrainingService implements TrainingService {
*/ */
private getLocalGpuMetricCollectorDir(): string { private getLocalGpuMetricCollectorDir(): string {
let userName: string = path.basename(os.homedir()); //get current user name of os let userName: string = path.basename(os.homedir()); //get current user name of os
return `${os.tmpdir()}/${userName}/nni/scripts/`; return path.join(os.tmpdir(), userName, 'nni', 'scripts');
} }
/** /**
...@@ -447,14 +446,14 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -447,14 +446,14 @@ class RemoteMachineTrainingService implements TrainingService {
*/ */
private async generateGpuMetricsCollectorScript(userName: string): Promise<void> { private async generateGpuMetricsCollectorScript(userName: string): Promise<void> {
let gpuMetricCollectorScriptFolder : string = this.getLocalGpuMetricCollectorDir(); let gpuMetricCollectorScriptFolder : string = this.getLocalGpuMetricCollectorDir();
await cpp.exec(`mkdir -p ${path.join(gpuMetricCollectorScriptFolder, userName)}`); await execMkdir(path.join(gpuMetricCollectorScriptFolder, userName));
//generate gpu_metrics_collector.sh //generate gpu_metrics_collector.sh
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh'); let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format( const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_LINUX,
remoteGPUScriptsDir, remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'), unixPathJoin(remoteGPUScriptsDir, 'pid'),
); );
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
} }
...@@ -481,7 +480,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -481,7 +480,7 @@ class RemoteMachineTrainingService implements TrainingService {
private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, conn: Client): Promise<void> { private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, conn: Client): Promise<void> {
// Create root working directory after ssh connection is ready // Create root working directory after ssh connection is ready
await this.generateGpuMetricsCollectorScript(rmMeta.username); //generate gpu script in local machine first, will copy to remote machine later await this.generateGpuMetricsCollectorScript(rmMeta.username); //generate gpu script in local machine first, will copy to remote machine later
const nniRootDir: string = `${os.tmpdir()}/nni`; const nniRootDir: string = unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni');
await SSHClientUtility.remoteExeCommand(`mkdir -p ${this.remoteExpRootDir}`, conn); await SSHClientUtility.remoteExeCommand(`mkdir -p ${this.remoteExpRootDir}`, conn);
// Copy NNI scripts to remote experiment working directory // Copy NNI scripts to remote experiment working directory
...@@ -490,15 +489,15 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -490,15 +489,15 @@ class RemoteMachineTrainingService implements TrainingService {
await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteGpuScriptCollectorDir}`, conn); await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteGpuScriptCollectorDir}`, conn);
await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn); await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn);
//copy gpu_metrics_collector.sh to remote //copy gpu_metrics_collector.sh to remote
await SSHClientUtility.copyFileToRemote(path.join(localGpuScriptCollectorDir, rmMeta.username, 'gpu_metrics_collector.sh'), path.join(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh'), conn); await SSHClientUtility.copyFileToRemote(path.join(localGpuScriptCollectorDir, rmMeta.username, 'gpu_metrics_collector.sh'), unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh'), conn);
//Begin to execute gpu_metrics_collection scripts //Begin to execute gpu_metrics_collection scripts
SSHClientUtility.remoteExeCommand(`bash ${path.join(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn); SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);
this.timer.subscribe( this.timer.subscribe(
async (tick: number) => { async (tick: number) => {
const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand( const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
`tail -n 1 ${path.join(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn); `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
if (cmdresult && cmdresult.stdout) { if (cmdresult && cmdresult.stdout) {
rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout); rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
} }
...@@ -531,7 +530,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -531,7 +530,7 @@ class RemoteMachineTrainingService implements TrainingService {
} else if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED } else if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED
&& rmScheduleResult.scheduleInfo !== undefined) { && rmScheduleResult.scheduleInfo !== undefined) {
const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo; const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo;
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId);
trialJobDetail.rmMeta = rmScheduleInfo.rmMeta; trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
...@@ -575,7 +574,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -575,7 +574,7 @@ class RemoteMachineTrainingService implements TrainingService {
const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${trialWorkingFolder}`, sshClient); await SSHClientUtility.remoteExeCommand(`mkdir -p ${trialWorkingFolder}`, sshClient);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${path.join(trialWorkingFolder, '.nni')}`, sshClient); await SSHClientUtility.remoteExeCommand(`mkdir -p ${unixPathJoin(trialWorkingFolder, '.nni')}`, sshClient);
// RemoteMachineRunShellFormat is the run shell format string, // RemoteMachineRunShellFormat is the run shell format string,
// See definition in remoteMachineData.ts // See definition in remoteMachineData.ts
...@@ -603,20 +602,20 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -603,20 +602,20 @@ class RemoteMachineTrainingService implements TrainingService {
getExperimentId(), getExperimentId(),
trialJobDetail.sequenceId.toString(), trialJobDetail.sequenceId.toString(),
this.isMultiPhase, this.isMultiPhase,
path.join(trialWorkingFolder, '.nni', 'jobpid'), unixPathJoin(trialWorkingFolder, '.nni', 'jobpid'),
command, command,
nniManagerIp, nniManagerIp,
this.remoteRestServerPort, this.remoteRestServerPort,
version, version,
this.logCollection, this.logCollection,
path.join(trialWorkingFolder, '.nni', 'code') unixPathJoin(trialWorkingFolder, '.nni', 'code')
) )
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`); await execMkdir(path.join(trialLocalTempFolder, '.nni'));
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`cp -r ${this.trialConfig.codeDir}/* ${trialLocalTempFolder}`); await execCopydir(path.join(this.trialConfig.codeDir, '*'), trialLocalTempFolder);
const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files // Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' });
...@@ -626,7 +625,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -626,7 +625,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy files in codeDir to remote working directory // Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS); await SSHClientUtility.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, sshClient, this.remoteOS);
// Execute command in remote machine // Execute command in remote machine
SSHClientUtility.remoteExeCommand(`bash ${path.join(trialWorkingFolder, 'run.sh')}`, sshClient); SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(trialWorkingFolder, 'run.sh')}`, sshClient);
} }
private async runHostJob(form: HostJobApplicationForm): Promise<TrialJobDetail> { private async runHostJob(form: HostJobApplicationForm): Promise<TrialJobDetail> {
...@@ -646,8 +645,8 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -646,8 +645,8 @@ class RemoteMachineTrainingService implements TrainingService {
); );
await fs.promises.writeFile(path.join(localDir, 'run.sh'), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(localDir, 'run.sh'), runScriptContent, { encoding: 'utf8' });
await SSHClientUtility.copyFileToRemote( await SSHClientUtility.copyFileToRemote(
path.join(localDir, 'run.sh'), path.join(remoteDir, 'run.sh'), sshClient); path.join(localDir, 'run.sh'), unixPathJoin(remoteDir, 'run.sh'), sshClient);
SSHClientUtility.remoteExeCommand(`bash ${path.join(remoteDir, 'run.sh')}`, sshClient); SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteDir, 'run.sh')}`, sshClient);
const jobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail( const jobDetail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(
jobId, 'RUNNING', Date.now(), remoteDir, form, this.generateSequenceId() jobId, 'RUNNING', Date.now(), remoteDir, form, this.generateSequenceId()
...@@ -672,7 +671,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -672,7 +671,7 @@ class RemoteMachineTrainingService implements TrainingService {
private async updateTrialJobStatus(trialJob: RemoteMachineTrialJobDetail, sshClient: Client): Promise<TrialJobDetail> { private async updateTrialJobStatus(trialJob: RemoteMachineTrialJobDetail, sshClient: Client): Promise<TrialJobDetail> {
const deferred: Deferred<TrialJobDetail> = new Deferred<TrialJobDetail>(); const deferred: Deferred<TrialJobDetail> = new Deferred<TrialJobDetail>();
const jobpidPath: string = this.getJobPidPath(trialJob.id); const jobpidPath: string = this.getJobPidPath(trialJob.id);
const trialReturnCodeFilePath: string = path.join(this.remoteExpRootDir, 'trials', trialJob.id, '.nni', 'code'); const trialReturnCodeFilePath: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJob.id, '.nni', 'code');
try { try {
const killResult: number = (await SSHClientUtility.remoteExeCommand(`kill -0 \`cat ${jobpidPath}\``, sshClient)).exitCode; const killResult: number = (await SSHClientUtility.remoteExeCommand(`kill -0 \`cat ${jobpidPath}\``, sshClient)).exitCode;
// if the process of jobpid is not alive any more // if the process of jobpid is not alive any more
...@@ -712,15 +711,15 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -712,15 +711,15 @@ class RemoteMachineTrainingService implements TrainingService {
} }
private getRemoteScriptsPath(userName: string): string { private getRemoteScriptsPath(userName: string): string {
return path.join(getRemoteTmpDir(this.remoteOS), userName, 'nni', 'scripts'); return unixPathJoin(getRemoteTmpDir(this.remoteOS), userName, 'nni', 'scripts');
} }
private getHostJobRemoteDir(jobId: string): string { private getHostJobRemoteDir(jobId: string): string {
return path.join(this.remoteExpRootDir, 'hostjobs', jobId); return unixPathJoin(this.remoteExpRootDir, 'hostjobs', jobId);
} }
private getRemoteExperimentRootDir(): string{ private getRemoteExperimentRootDir(): string{
return path.join(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId()); return unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId());
} }
public get MetricsEmitter() : EventEmitter { public get MetricsEmitter() : EventEmitter {
...@@ -735,9 +734,9 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -735,9 +734,9 @@ class RemoteMachineTrainingService implements TrainingService {
let jobpidPath: string; let jobpidPath: string;
if (trialJobDetail.form.jobType === 'TRIAL') { if (trialJobDetail.form.jobType === 'TRIAL') {
jobpidPath = path.join(trialJobDetail.workingDirectory, '.nni', 'jobpid'); jobpidPath = unixPathJoin(trialJobDetail.workingDirectory, '.nni', 'jobpid');
} else if (trialJobDetail.form.jobType === 'HOST') { } else if (trialJobDetail.form.jobType === 'HOST') {
jobpidPath = path.join(this.getHostJobRemoteDir(jobId), 'jobpid'); jobpidPath = unixPathJoin(this.getHostJobRemoteDir(jobId), 'jobpid');
} else { } else {
throw new Error(`Job type not supported: ${trialJobDetail.form.jobType}`); throw new Error(`Job type not supported: ${trialJobDetail.form.jobType}`);
} }
...@@ -751,14 +750,14 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -751,14 +750,14 @@ class RemoteMachineTrainingService implements TrainingService {
throw new Error('sshClient is undefined.'); throw new Error('sshClient is undefined.');
} }
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId);
const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId);
const fileName: string = generateParamFileName(hyperParameters); const fileName: string = generateParamFileName(hyperParameters);
const localFilepath: string = path.join(trialLocalTempFolder, fileName); const localFilepath: string = path.join(trialLocalTempFolder, fileName);
await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' }); await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' });
await SSHClientUtility.copyFileToRemote(localFilepath, path.join(trialWorkingFolder, fileName), sshClient); await SSHClientUtility.copyFileToRemote(localFilepath, unixPathJoin(trialWorkingFolder, fileName), sshClient);
} }
private generateSequenceId(): number { private generateSequenceId(): number {
......
...@@ -28,8 +28,9 @@ import * as stream from 'stream'; ...@@ -28,8 +28,9 @@ import * as stream from 'stream';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { uniqueString, getRemoteTmpDir } from '../../common/utils'; import { uniqueString, getRemoteTmpDir, unixPathJoin } from '../../common/utils';
import { RemoteCommandResult } from './remoteMachineData'; import { RemoteCommandResult } from './remoteMachineData';
import { execRemove, tarAdd } from '../common/util';
/** /**
* *
...@@ -47,13 +48,13 @@ export namespace SSHClientUtility { ...@@ -47,13 +48,13 @@ export namespace SSHClientUtility {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
const tmpTarName: string = `${uniqueString(10)}.tar.gz`; const tmpTarName: string = `${uniqueString(10)}.tar.gz`;
const localTarPath: string = path.join(os.tmpdir(), tmpTarName); const localTarPath: string = path.join(os.tmpdir(), tmpTarName);
const remoteTarPath: string = path.join(getRemoteTmpDir(remoteOS), tmpTarName); const remoteTarPath: string = unixPathJoin(getRemoteTmpDir(remoteOS), tmpTarName);
// Compress files in local directory to experiment root directory // Compress files in local directory to experiment root directory
await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`); await tarAdd(localTarPath, localDirectory);
// Copy the compressed file to remoteDirectory and delete it // Copy the compressed file to remoteDirectory and delete it
await copyFileToRemote(localTarPath, remoteTarPath, sshClient); await copyFileToRemote(localTarPath, remoteTarPath, sshClient);
await cpp.exec(`rm ${localTarPath}`); await execRemove(localTarPath);
// Decompress the remote compressed file in and delete it // Decompress the remote compressed file in and delete it
await remoteExeCommand(`tar -oxzf ${remoteTarPath} -C ${remoteDirectory}`, sshClient); await remoteExeCommand(`tar -oxzf ${remoteTarPath} -C ${remoteDirectory}`, sshClient);
await remoteExeCommand(`rm ${remoteTarPath}`, sshClient); await remoteExeCommand(`rm ${remoteTarPath}`, sshClient);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment