Unverified Commit 6a5864cd authored by liuzhe-lz's avatar liuzhe-lz Committed by GitHub
Browse files

fix gpu script permission issue (#1707)

* fix gpu script permission issue

* make gpu tool local to user
parent cb52d441
......@@ -59,14 +59,6 @@ export class GPUSummary {
}
}
export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`;
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
......
......@@ -27,7 +27,7 @@ import * as path from 'path';
import { String } from 'typescript-string-operations';
import { countFilesRecursively, getNewLine, validateFileNameRecursively } from '../../common/utils';
import { file } from '../../node_modules/@types/tmp';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
import { GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
......@@ -219,22 +219,16 @@ export function getScriptName(fileNamePrefix: string): string {
}
}
/**
* generate script file
* @param gpuMetricCollectorScriptFolder
*/
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
export function getGpuMetricsCollectorBashScriptContent(scriptFolder: string): string {
return `echo $$ > ${scriptFolder}/pid ; METRIC_OUTPUT_DIR=${scriptFolder} python3 -m nni_gpu_tool.gpu_metrics_collector`;
}
export function runGpuMetricsCollector(scriptFolder: string): void {
if (process.platform === 'win32') {
return String.Format(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS,
gpuMetricCollectorScriptFolder,
path.join(gpuMetricCollectorScriptFolder, 'pid')
);
const scriptPath = path.join(scriptFolder, 'gpu_metrics_collector.ps1');
const content = String.Format(GPU_INFO_COLLECTOR_FORMAT_WINDOWS, scriptFolder, path.join(scriptFolder, 'pid'));
fs.writeFile(scriptPath, content, { encoding: 'utf8' }, () => { runScript(scriptPath); });
} else {
return String.Format(
GPU_INFO_COLLECTOR_FORMAT_LINUX,
gpuMetricCollectorScriptFolder,
path.join(gpuMetricCollectorScriptFolder, 'pid')
);
cp.exec(getGpuMetricsCollectorBashScriptContent(scriptFolder), { shell: '/bin/bash' });
}
}
......@@ -28,7 +28,7 @@ import { String } from 'typescript-string-operations';
import { getLogger, Logger } from '../../common/log';
import { delay } from '../../common/utils';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { execKill, execMkdir, execRemove, execTail, getgpuMetricsCollectorScriptContent, getScriptName, runScript } from '../common/util';
import { execKill, execMkdir, execRemove, execTail, runGpuMetricsCollector } from '../common/util';
/**
* GPUScheduler for local training service
......@@ -43,7 +43,7 @@ class GPUScheduler {
constructor() {
this.stopping = false;
this.log = getLogger();
this.gpuMetricCollectorScriptFolder = `${os.tmpdir()}/nni/script`;
this.gpuMetricCollectorScriptFolder = `${os.tmpdir()}/${os.userInfo().username}/nni/script`;
}
public async run(): Promise<void> {
......@@ -101,12 +101,7 @@ class GPUScheduler {
*/
private async runGpuMetricsCollectorScript(): Promise<void> {
await execMkdir(this.gpuMetricCollectorScriptFolder, true);
//generate gpu_metrics_collector script
const gpuMetricsCollectorScriptPath: string =
path.join(this.gpuMetricCollectorScriptFolder, getScriptName('gpu_metrics_collector'));
const gpuMetricsCollectorScriptContent: string = getgpuMetricsCollectorScriptContent(this.gpuMetricCollectorScriptFolder);
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
runScript(gpuMetricsCollectorScriptPath);
runGpuMetricsCollector(this.gpuMetricCollectorScriptFolder);
}
// tslint:disable:non-literal-fs-path
......
......@@ -42,10 +42,10 @@ import {
getVersion, uniqueString, unixPathJoin
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPUSummary } from '../common/gpuData';
import { GPUSummary } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execCopydir, execMkdir, execRemove, validateCodeDir } from '../common/util';
import { execCopydir, execMkdir, execRemove, validateCodeDir, getGpuMetricsCollectorBashScriptContent } from '../common/util';
import { GPUScheduler } from './gpuScheduler';
import {
HOST_JOB_SHELL_FORMAT, RemoteCommandResult, REMOTEMACHINE_TRIAL_COMMAND_FORMAT, RemoteMachineMeta,
......@@ -334,8 +334,6 @@ class RemoteMachineTrainingService implements TrainingService {
break;
case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value);
//remove local temp files
await execRemove(this.getLocalGpuMetricCollectorDir());
break;
case TrialConfigMetadataKey.TRIAL_CONFIG:
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
......@@ -428,34 +426,6 @@ class RemoteMachineTrainingService implements TrainingService {
return Promise.resolve();
}
/**
* Generate gpu metric collector directory to store temp gpu metric collector script files
*/
private getLocalGpuMetricCollectorDir(): string {
const userName: string = path.basename(os.homedir()); //get current user name of os
return path.join(os.tmpdir(), userName, 'nni', 'scripts');
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private async generateGpuMetricsCollectorScript(userName: string): Promise<void> {
const gpuMetricCollectorScriptFolder : string = this.getLocalGpuMetricCollectorDir();
await execMkdir(path.join(gpuMetricCollectorScriptFolder, userName));
//generate gpu_metrics_collector.sh
const gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
// This directory is used to store gpu_metrics and pid created by script
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName);
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT_LINUX,
remoteGPUScriptsDir,
unixPathJoin(remoteGPUScriptsDir, 'pid')
);
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
}
private async setupConnections(machineList: string): Promise<void> {
this.log.debug(`Connecting to remote machines: ${machineList}`);
const deferred: Deferred<void> = new Deferred<void>();
......@@ -479,24 +449,18 @@ class RemoteMachineTrainingService implements TrainingService {
private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, conn: Client): Promise<void> {
// Create root working directory after ssh connection is ready
// generate gpu script in local machine first, will copy to remote machine later
await this.generateGpuMetricsCollectorScript(rmMeta.username);
const nniRootDir: string = unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni');
await SSHClientUtility.remoteExeCommand(`mkdir -p ${this.remoteExpRootDir}`, conn);
// Copy NNI scripts to remote expeirment working directory
const localGpuScriptCollectorDir: string = this.getLocalGpuMetricCollectorDir();
// the directory to store temp scripts in remote machine
const remoteGpuScriptCollectorDir: string = this.getRemoteScriptsPath(rmMeta.username);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteGpuScriptCollectorDir}`, conn);
await SSHClientUtility.remoteExeCommand(`(umask 0 ; mkdir -p ${remoteGpuScriptCollectorDir})`, conn);
await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn);
//copy gpu_metrics_collector.sh to remote
await SSHClientUtility.copyFileToRemote(path.join(localGpuScriptCollectorDir, rmMeta.username, 'gpu_metrics_collector.sh'),
unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh'), conn);
//Begin to execute gpu_metrics_collection scripts
// tslint:disable-next-line: no-floating-promises
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);
const script = getGpuMetricsCollectorBashScriptContent(remoteGpuScriptCollectorDir);
SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn);
const disposable: Rx.IDisposable = this.timer.subscribe(
async (tick: number) => {
......
......@@ -35,7 +35,7 @@ def check_ready_to_run():
pidList.remove(os.getpid())
return not pidList
else:
pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pgrep_output = subprocess.check_output('pgrep -fxu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pidList.append(int(pid))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment