Unverified Commit e29b58a1 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #244 from microsoft/master

merge master
parents e0c2c0eb 4f88be1f
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as assert from 'assert';
import * as os from 'os';
import * as path from 'path';
import { Client, ClientChannel, SFTPWrapper } from 'ssh2';
import * as stream from 'stream';
import { Deferred } from 'ts-deferred';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger, Logger } from '../../common/log';
import { getRemoteTmpDir, uniqueString, unixPathJoin } from '../../common/utils';
import { execRemove, tarAdd } from '../common/util';
import { RemoteCommandResult } from './remoteMachineData';
/**
*
* Utility for frequent operations towards SSH client
*
*/
export namespace SSHClientUtility {
/**
* Copy local file to remote path
* @param localFilePath the path of local file
* @param remoteFilePath the target path in remote machine
* @param sshClient SSH Client
*/
export function copyFileToRemote(localFilePath: string, remoteFilePath: string, sshClient: Client): Promise<boolean> {
const log: Logger = getLogger();
log.debug(`copyFileToRemote: localFilePath: ${localFilePath}, remoteFilePath: ${remoteFilePath}`);
assert(sshClient !== undefined);
const deferred: Deferred<boolean> = new Deferred<boolean>();
sshClient.sftp((err: Error, sftp: SFTPWrapper) => {
if (err !== undefined && err !== null) {
log.error(`copyFileToRemote: ${err.message}, ${localFilePath}, ${remoteFilePath}`);
deferred.reject(err);
return;
}
assert(sftp !== undefined);
sftp.fastPut(localFilePath, remoteFilePath, (fastPutErr: Error) => {
sftp.end();
if (fastPutErr !== undefined && fastPutErr !== null) {
deferred.reject(fastPutErr);
} else {
deferred.resolve(true);
}
});
});
return deferred.promise;
}
/**
* Execute command on remote machine
* @param command the command to execute remotely
* @param client SSH Client
*/
export function remoteExeCommand(command: string, client: Client): Promise<RemoteCommandResult> {
const log: Logger = getLogger();
log.debug(`remoteExeCommand: command: [${command}]`);
const deferred: Deferred<RemoteCommandResult> = new Deferred<RemoteCommandResult>();
let stdout: string = '';
let stderr: string = '';
let exitCode: number;
client.exec(command, (err: Error, channel: ClientChannel) => {
if (err !== undefined && err !== null) {
log.error(`remoteExeCommand: ${err.message}`);
deferred.reject(err);
return;
}
channel.on('data', (data: any, dataStderr: any) => {
if (dataStderr !== undefined && dataStderr !== null) {
stderr += data.toString();
} else {
stdout += data.toString();
}
})
.on('exit', (code: any, signal: any) => {
exitCode = <number>code;
deferred.resolve({
stdout : stdout,
stderr : stderr,
exitCode : exitCode
});
});
});
return deferred.promise;
}
/**
 * Copy files and directories in a local directory recursively to a remote directory.
 * The directory is packed into a local tarball, uploaded as one file, then unpacked
 * remotely; both temporary archives are removed afterwards.
 * @param localDirectory local directory
 * @param remoteDirectory remote directory
 * @param sshClient SSH client
 * @param remoteOS operating system of the remote machine (selects the remote tmp dir)
 */
export async function copyDirectoryToRemote(localDirectory: string, remoteDirectory: string, sshClient: Client, remoteOS: string): Promise<void> {
    const suffix: string = uniqueString(5);
    const localTarPath: string = path.join(os.tmpdir(), `nni_tmp_local_${suffix}.tar.gz`);
    const remoteTarPath: string = unixPathJoin(getRemoteTmpDir(remoteOS), `nni_tmp_remote_${suffix}.tar.gz`);
    // Pack the local directory, ship a single archive, then drop the local copy.
    await tarAdd(localTarPath, localDirectory);
    await copyFileToRemote(localTarPath, remoteTarPath, sshClient);
    await execRemove(localTarPath);
    // Unpack on the remote side and clean up the remote archive.
    await remoteExeCommand(`tar -oxzf ${remoteTarPath} -C ${remoteDirectory}`, sshClient);
    await remoteExeCommand(`rm ${remoteTarPath}`, sshClient);
}
/**
 * Read the full content of a remote file through an SFTP read stream.
 * @param filePath path of the file on the remote machine
 * @param sshClient SSH Client
 * @returns promise resolving to the file content as a string
 */
export function getRemoteFileContent(filePath: string, sshClient: Client): Promise<string> {
    const result: Deferred<string> = new Deferred<string>();
    sshClient.sftp((err: Error, sftp: SFTPWrapper) => {
        if (err !== undefined && err !== null) {
            getLogger()
                .error(`getRemoteFileContent: ${err.message}`);
            result.reject(new Error(`SFTP error: ${err.message}`));

            return;
        }
        try {
            const remoteStream: stream.Readable = sftp.createReadStream(filePath);
            let content: string = '';
            remoteStream
                .on('data', (chunk: Buffer | string) => {
                    content += chunk;
                })
                .on('error', (streamErr: Error) => {
                    // sftp connection need to be released manually once operation is done
                    sftp.end();
                    result.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message));
                })
                .on('end', () => {
                    sftp.end();
                    result.resolve(content);
                });
        } catch (error) {
            getLogger()
                .error(`getRemoteFileContent: ${error.message}`);
            sftp.end();
            result.reject(new Error(`SFTP error: ${error.message}`));
        }
    });

    return result.promise;
}
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as component from '../../../common/component';
import { cleanupUnitTest, prepareUnitTest } from '../../../common/utils';
import { LinuxCommands } from '../extends/linuxCommands';
// import { TrialConfigMetadataKey } from '../trialConfigMetadataKey';
// Unit tests for the shell-command strings generated by LinuxCommands.
// Each case asserts the exact command text, so quoting/escaping matters.
describe('Unit Test for linuxCommands', () => {
    let linuxCommands: LinuxCommands;
    before(() => {
        // Wire up chai-as-promised and the shared unit-test environment once.
        chai.should();
        chai.use(chaiAsPromised);
        prepareUnitTest();
    });
    after(() => {
        cleanupUnitTest();
    });
    beforeEach(() => {
        // A fresh instance per test keeps the cases independent.
        linuxCommands = component.get(LinuxCommands);
    });
    afterEach(() => {
    });
    it('joinPath', async () => {
        chai.expect(linuxCommands.joinPath("/root/", "/first")).to.equal("/root/first");
        chai.expect(linuxCommands.joinPath("/root", "first")).to.equal("/root/first");
        chai.expect(linuxCommands.joinPath("/root/", "first")).to.equal("/root/first");
        chai.expect(linuxCommands.joinPath("root/", "first")).to.equal("root/first");
        chai.expect(linuxCommands.joinPath("root/")).to.equal("root/");
        chai.expect(linuxCommands.joinPath("root")).to.equal("root");
        chai.expect(linuxCommands.joinPath("./root")).to.equal("./root");
        chai.expect(linuxCommands.joinPath("")).to.equal(".");
        chai.expect(linuxCommands.joinPath("..")).to.equal("..");
    });
    it('createFolder', async () => {
        chai.expect(linuxCommands.createFolder("test")).to.equal("mkdir -p 'test'");
        chai.expect(linuxCommands.createFolder("test", true)).to.equal("umask 0; mkdir -p 'test'");
    });
    it('allowPermission', async () => {
        chai.expect(linuxCommands.allowPermission(true, "test", "test1")).to.equal("chmod 777 -R 'test' 'test1'");
        chai.expect(linuxCommands.allowPermission(false, "test")).to.equal("chmod 777 'test'");
    });
    it('removeFolder', async () => {
        chai.expect(linuxCommands.removeFolder("test")).to.equal("rm -df 'test'");
        chai.expect(linuxCommands.removeFolder("test", true)).to.equal("rm -rf 'test'");
        chai.expect(linuxCommands.removeFolder("test", true, false)).to.equal("rm -r 'test'");
        chai.expect(linuxCommands.removeFolder("test", false, false)).to.equal("rm 'test'");
    });
    it('removeFiles', async () => {
        chai.expect(linuxCommands.removeFiles("test", "*.sh")).to.equal("rm 'test/*.sh'");
        chai.expect(linuxCommands.removeFiles("test", "")).to.equal("rm 'test'");
    });
    it('readLastLines', async () => {
        chai.expect(linuxCommands.readLastLines("test", 3)).to.equal("tail -n 3 'test'");
    });
    it('isProcessAlive', async () => {
        chai.expect(linuxCommands.isProcessAliveCommand("test")).to.equal("kill -0 `cat 'test'`");
        // Exit code 0 from `kill -0` means the process exists.
        chai.expect(linuxCommands.isProcessAliveProcessOutput(
            {
                exitCode: 0,
                stdout: "",
                stderr: ""
            }
        )).to.equal(true);
        chai.expect(linuxCommands.isProcessAliveProcessOutput(
            {
                exitCode: 10,
                stdout: "",
                stderr: ""
            }
        )).to.equal(false);
    });
    it('killChildProcesses', async () => {
        chai.expect(linuxCommands.killChildProcesses("test")).to.equal("pkill -P `cat 'test'`");
    });
    it('extractFile', async () => {
        chai.expect(linuxCommands.extractFile("test.tar", "testfolder")).to.equal("tar -oxzf 'test.tar' -C 'testfolder'");
    });
    it('executeScript', async () => {
        chai.expect(linuxCommands.executeScript("test.sh", true)).to.equal("bash 'test.sh'");
        chai.expect(linuxCommands.executeScript("test script'\"", false)).to.equal(`bash -c \"test script'\\""`);
    });
});
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import { Client } from 'ssh2';
import { ShellExecutor } from '../shellExecutor';
import { prepareUnitTest, cleanupUnitTest } from '../../../common/utils';
const LOCALFILE: string = '/tmp/localSshclientUTData';
const REMOTEFILE: string = '/tmp/remoteSshclientUTData';
const REMOTEFOLDER: string = '/tmp/remoteSshclientUTFolder';
/**
 * Upload the local fixture file to the remote test path once.
 * @param executor shell executor connected to the remote machine
 */
async function copyFile(executor: ShellExecutor): Promise<void> {
    const source: string = LOCALFILE;
    const target: string = REMOTEFILE;
    await executor.copyFileToRemote(source, target);
}
/**
 * Upload the fixture file ten times in a row to exercise repeated SFTP puts.
 * @param executor shell executor connected to the remote machine
 */
async function copyFileToRemoteLoop(executor: ShellExecutor): Promise<void> {
    let round: number = 0;
    while (round < 10) {
        // console.log(round);
        await executor.copyFileToRemote(LOCALFILE, REMOTEFILE);
        round += 1;
    }
}
/**
 * Read the remote fixture file ten times in a row to exercise repeated SFTP gets.
 * @param executor shell executor connected to the remote machine
 */
async function getRemoteFileContentLoop(executor: ShellExecutor): Promise<void> {
    let round: number = 0;
    while (round < 10) {
        // console.log(round);
        await executor.getRemoteFileContent(REMOTEFILE);
        round += 1;
    }
}
// Integration tests for ShellExecutor against a real remote machine.
// The suite short-circuits unless ../../.vscode/rminfo.json provides connection info.
describe('ShellExecutor test', () => {
// Set to true when no remote-machine configuration is available; each test then returns early.
let skip: boolean = false;
// Connection metadata parsed from rminfo.json; shape is whatever ShellExecutor.initialize expects.
let rmMeta: any;
try {
rmMeta = JSON.parse(fs.readFileSync('../../.vscode/rminfo.json', 'utf8'));
console.log(rmMeta);
} catch (err) {
console.log(`Please configure rminfo.json to enable remote machine test.${err}`);
skip = true;
}
before(async () => {
chai.should();
chai.use(chaiAsPromised);
// Create the local fixture file that the copy tests upload.
await cpp.exec(`echo '1234' > ${LOCALFILE}`);
prepareUnitTest();
});
after(() => {
cleanupUnitTest();
fs.unlinkSync(LOCALFILE);
});
it('Test mkdir', async () => {
if (skip) {
return;
}
const shellExecutor: ShellExecutor = new ShellExecutor();
await shellExecutor.initialize(rmMeta);
// createFolder/removeFolder report success as a boolean result.
let result = await shellExecutor.createFolder(REMOTEFOLDER, false);
chai.expect(result).eq(true);
result = await shellExecutor.removeFolder(REMOTEFOLDER);
chai.expect(result).eq(true);
});
it('Test ShellExecutor', async () => {
if (skip) {
return;
}
const shellExecutor: ShellExecutor = new ShellExecutor();
await shellExecutor.initialize(rmMeta);
await copyFile(shellExecutor);
// Stress concurrent use of one executor: three upload loops plus one download loop in parallel.
await Promise.all([
copyFileToRemoteLoop(shellExecutor),
copyFileToRemoteLoop(shellExecutor),
copyFileToRemoteLoop(shellExecutor),
getRemoteFileContentLoop(shellExecutor)
]);
});
});
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import { Client } from 'ssh2';
import { Deferred } from 'ts-deferred';
import { SSHClientUtility } from '../remote_machine/sshClientUtility';
const LOCALFILE: string = '/tmp/sshclientUTData';
const REMOTEFILE: string = '/tmp/sshclientUTData';
/**
 * Upload the local fixture file once using a raw ssh2 SFTP session.
 * @param conn connected SSH client
 */
async function copyFile(conn: Client): Promise<void> {
    const done: Deferred<void> = new Deferred<void>();
    conn.sftp((err, sftp) => {
        if (err) {
            done.reject(err);
            return;
        }
        const onPutFinished = (fastPutErr: Error): void => {
            // Always close the SFTP session before settling the promise.
            sftp.end();
            if (fastPutErr) {
                done.reject(fastPutErr);
            } else {
                done.resolve();
            }
        };
        sftp.fastPut(LOCALFILE, REMOTEFILE, onPutFinished);
    });

    return done.promise;
}
/**
 * Upload the fixture file 500 times to stress SSHClientUtility.copyFileToRemote.
 * @param conn connected SSH client
 */
async function copyFileToRemoteLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.copyFileToRemote(LOCALFILE, REMOTEFILE, conn);
        round += 1;
    }
}
/**
 * Run a trivial remote command 500 times to stress SSHClientUtility.remoteExeCommand.
 * @param conn connected SSH client
 */
async function remoteExeCommandLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.remoteExeCommand('ls', conn);
        round += 1;
    }
}
/**
 * Read the remote fixture file 500 times to stress SSHClientUtility.getRemoteFileContent.
 * @param conn connected SSH client
 */
async function getRemoteFileContentLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.getRemoteFileContent(REMOTEFILE, conn);
        round += 1;
    }
}
// Integration test for SSHClientUtility against a real remote machine.
// The suite short-circuits unless ../../.vscode/rminfo.json provides connection info.
describe('sshClientUtility test', () => {
    // NOTE(review): `skip` was initialized to `true`, which disabled the test even
    // when a valid rminfo.json was present. Initialize to `false` so the config
    // file actually controls skipping, matching the ShellExecutor suite.
    let skip: boolean = false;
    // Connection metadata parsed from rminfo.json, passed straight to Client#connect.
    let rmMeta: any;
    try {
        rmMeta = JSON.parse(fs.readFileSync('../../.vscode/rminfo.json', 'utf8'));
    } catch (err) {
        skip = true;
    }
    before(async () => {
        // Create the local fixture file that the copy loops upload.
        await cpp.exec(`echo '1234' > ${LOCALFILE}`);
    });
    after(() => {
        fs.unlinkSync(LOCALFILE);
    });
    it('Test SSHClientUtility', (done) => {
        if (skip) {
            done();
            return;
        }
        const conn: Client = new Client();
        conn.on('ready', async () => {
            await copyFile(conn);
            // Stress concurrent use of one connection: three upload loops,
            // one exec loop and one download loop in parallel.
            await Promise.all([
                copyFileToRemoteLoop(conn),
                copyFileToRemoteLoop(conn),
                copyFileToRemoteLoop(conn),
                remoteExeCommandLoop(conn),
                getRemoteFileContentLoop(conn)
            ]);
            done();
        }).connect(rmMeta);
    });
});
......@@ -104,7 +104,7 @@ class BaseMutator(nn.Module):
"""
pass
def on_forward_layer_choice(self, mutable, *inputs):
def on_forward_layer_choice(self, mutable, *args, **kwargs):
"""
Callbacks of forward in LayerChoice.
......@@ -112,8 +112,10 @@ class BaseMutator(nn.Module):
----------
mutable : LayerChoice
Module whose forward is called.
inputs : list of torch.Tensor
args : list of torch.Tensor
The arguments of its forward function.
kwargs : dict
The keyword arguments of its forward function.
Returns
-------
......
......@@ -203,7 +203,7 @@ class ClassicMutator(Mutator):
# for now we only generate flattened search space
if isinstance(mutable, LayerChoice):
key = mutable.key
val = [repr(choice) for choice in mutable.choices]
val = mutable.names
search_space[key] = {"_type": LAYER_CHOICE, "_value": val}
elif isinstance(mutable, InputChoice):
key = mutable.key
......
......@@ -2,6 +2,7 @@
# Licensed under the MIT license.
import logging
from collections import OrderedDict
import torch.nn as nn
......@@ -58,9 +59,6 @@ class Mutable(nn.Module):
"Or did you apply multiple fixed architectures?")
self.__dict__["mutator"] = mutator
def forward(self, *inputs):
raise NotImplementedError
@property
def key(self):
"""
......@@ -86,9 +84,6 @@ class Mutable(nn.Module):
"Or did you initialize a mutable on the fly in forward pass? Move to `__init__` "
"so that trainer can locate all your mutables. See NNI docs for more details.".format(self))
def __repr__(self):
return "{} ({})".format(self.name, self.key)
class MutableScope(Mutable):
"""
......@@ -131,7 +126,7 @@ class LayerChoice(Mutable):
Parameters
----------
op_candidates : list of nn.Module
op_candidates : list of nn.Module or OrderedDict
A module list to be selected from.
reduction : str
``mean``, ``concat``, ``sum`` or ``none``. Policy if multiples are selected.
......@@ -146,23 +141,53 @@ class LayerChoice(Mutable):
----------
length : int
Number of ops to choose from.
names: list of str
Names of candidates.
Notes
-----
``op_candidates`` can be a list of modules or a ordered dict of named modules, for example,
.. code-block:: python
self.op_choice = LayerChoice(OrderedDict([
("conv3x3", nn.Conv2d(3, 16, 128)),
("conv5x5", nn.Conv2d(5, 16, 128)),
("conv7x7", nn.Conv2d(7, 16, 128))
]))
"""
def __init__(self, op_candidates, reduction="sum", return_mask=False, key=None):
super().__init__(key=key)
self.length = len(op_candidates)
self.choices = nn.ModuleList(op_candidates)
self.choices = []
self.names = []
if isinstance(op_candidates, OrderedDict):
for name, module in op_candidates.items():
assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \
"Please don't use a reserved name '{}' for your module.".format(name)
self.add_module(name, module)
self.choices.append(module)
self.names.append(name)
elif isinstance(op_candidates, list):
for i, module in enumerate(op_candidates):
self.add_module(str(i), module)
self.choices.append(module)
self.names.append(str(i))
else:
raise TypeError("Unsupported op_candidates type: {}".format(type(op_candidates)))
self.reduction = reduction
self.return_mask = return_mask
def forward(self, *inputs):
def forward(self, *args, **kwargs):
"""
Returns
-------
tuple of tensors
Output and selection mask. If ``return_mask`` is ``False``, only output is returned.
"""
out, mask = self.mutator.on_forward_layer_choice(self, *inputs)
out, mask = self.mutator.on_forward_layer_choice(self, *args, **kwargs)
if self.return_mask:
return out, mask
return out
......
......@@ -128,7 +128,7 @@ class Mutator(BaseMutator):
result["mutable"][mutable.key].append(path)
return result
def on_forward_layer_choice(self, mutable, *inputs):
def on_forward_layer_choice(self, mutable, *args, **kwargs):
"""
On default, this method retrieves the decision obtained previously, and select certain operations.
Only operations with non-zero weight will be executed. The results will be added to a list.
......@@ -138,7 +138,9 @@ class Mutator(BaseMutator):
----------
mutable : LayerChoice
Layer choice module.
inputs : list of torch.Tensor
args : list of torch.Tensor
Inputs
kwargs : dict
Inputs
Returns
......@@ -148,16 +150,16 @@ class Mutator(BaseMutator):
"""
if self._connect_all:
return self._all_connect_tensor_reduction(mutable.reduction,
[op(*inputs) for op in mutable.choices]), \
[op(*args, **kwargs) for op in mutable.choices]), \
torch.ones(mutable.length)
def _map_fn(op, *inputs):
return op(*inputs)
def _map_fn(op, args, kwargs):
return op(*args, **kwargs)
mask = self._get_decision(mutable)
assert len(mask) == len(mutable.choices), \
"Invalid mask, expected {} to be of length {}.".format(mask, len(mutable.choices))
out = self._select_with_mask(_map_fn, [(choice, *inputs) for choice in mutable.choices], mask)
out = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable.choices], mask)
return self._tensor_reduction(mutable.reduction, out), mask
def on_forward_input_choice(self, mutable, tensor_list):
......
......@@ -317,7 +317,7 @@ class ProxylessNasMutator(BaseMutator):
self.mutable_list.append(mutable)
mutable.registered_module = MixedOp(mutable)
def on_forward_layer_choice(self, mutable, *inputs):
def on_forward_layer_choice(self, mutable, *args, **kwargs):
"""
Callback of layer choice forward. This function defines the forward
logic of the input mutable. So mutable is only interface, its real
......@@ -327,7 +327,9 @@ class ProxylessNasMutator(BaseMutator):
----------
mutable: LayerChoice
forward logic of this input mutable
inputs: list of torch.Tensor
args: list of torch.Tensor
inputs of this mutable
kwargs: dict
inputs of this mutable
Returns
......@@ -339,7 +341,7 @@ class ProxylessNasMutator(BaseMutator):
"""
# FIXME: return mask, to be consistent with other algorithms
idx = mutable.registered_module.active_op_index
return mutable.registered_module(mutable, *inputs), idx
return mutable.registered_module(mutable, *args, **kwargs), idx
def reset_binary_gates(self):
"""
......
......@@ -5593,7 +5593,7 @@ load-json-file@^4.0.0:
pify "^3.0.0"
strip-bom "^3.0.0"
loader-fs-cache@>=1.0.3, loader-fs-cache@^1.0.0:
loader-fs-cache@^1.0.0:
version "1.0.3"
resolved "https://registry.yarnpkg.com/loader-fs-cache/-/loader-fs-cache-1.0.3.tgz#f08657646d607078be2f0a032f8bd69dd6f277d9"
integrity sha512-ldcgZpjNJj71n+2Mf6yetz+c9bM4xpKtNds4LbqXzU/PTdeAX0g3ytnU1AJMEcTk2Lex4Smpe3Q/eCTsvUBxbA==
......
......@@ -77,6 +77,14 @@ testCases:
kwargs:
expected_result_file: expected_metrics.json
- name: export-float
configFile: test/config/metrics_test/config.yml
config:
maxTrialNum: 1
trialConcurrency: 1
validator:
class: ExportValidator
- name: metrics-dict
configFile: test/config/metrics_test/config_dict_metrics.yml
config:
......@@ -87,6 +95,14 @@ testCases:
kwargs:
expected_result_file: expected_metrics_dict.json
- name: export-dict
configFile: test/config/metrics_test/config_dict_metrics.yml
config:
maxTrialNum: 1
trialConcurrency: 1
validator:
class: ExportValidator
- name: nnicli
configFile: test/config/examples/sklearn-regression.yml
config:
......
......@@ -2,6 +2,8 @@
# Licensed under the MIT license.
import os.path as osp
from os import remove
import subprocess
import json
import requests
import nnicli as nc
......@@ -12,6 +14,24 @@ class ITValidator:
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
pass
class ExportValidator(ITValidator):
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
exp_id = osp.split(experiment_dir)[-1]
proc1 = subprocess.run(["nnictl", "experiment", "export", exp_id, "-t", "csv", "-f", "report.csv"])
assert proc1.returncode == 0, '`nnictl experiment export -t csv` failed with code %d' % proc1.returncode
with open("report.csv", 'r') as f:
print('Exported CSV file: \n')
print(''.join(f.readlines()))
print('\n\n')
remove('report.csv')
proc2 = subprocess.run(["nnictl", "experiment", "export", exp_id, "-t", "json", "-f", "report.json"])
assert proc2.returncode == 0, '`nnictl experiment export -t json` failed with code %d' % proc2.returncode
with open("report.json", 'r') as f:
print('Exported JSON file: \n')
print('\n'.join(f.readlines()))
print('\n\n')
remove('report.json')
class MetricsValidator(ITValidator):
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
......
......@@ -70,5 +70,5 @@ jobs:
python --version
mount -o anon $(pai_nfs_uri) $(local_nfs_uri)
python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(docker_image) --pai_storage_plugin $(pai_storage_plugin) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip)
python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase
python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
displayName: 'Examples and advanced features tests on pai'
\ No newline at end of file
......@@ -57,5 +57,5 @@ jobs:
cd test
python3 nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_plugin $(pai_storage_plugin)\
--pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip)
PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai --exclude multi-phase
PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
displayName: 'integration test'
......@@ -699,12 +699,13 @@ def export_trials_data(args):
content = json.loads(response.text)
trial_records = []
for record in content:
if not isinstance(record['value'], (float, int)):
formated_record = {**record['parameter'], **record['value'], **{'id': record['id']}}
record_value = json.loads(record['value'])
if not isinstance(record_value, (float, int)):
formated_record = {**record['parameter'], **record_value, **{'id': record['id']}}
else:
formated_record = {**record['parameter'], **{'reward': record['value'], 'id': record['id']}}
formated_record = {**record['parameter'], **{'reward': record_value, 'id': record['id']}}
trial_records.append(formated_record)
with open(args.path, 'w') as file:
with open(args.path, 'w', newline='') as file:
writer = csv.DictWriter(file, set.union(*[set(r.keys()) for r in trial_records]))
writer.writeheader()
writer.writerows(trial_records)
......
......@@ -10,27 +10,31 @@ import traceback
from xml.dom import minidom
def check_ready_to_run():
if sys.platform == 'win32':
pgrep_output = subprocess.check_output(
'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
pidList = pgrep_output.decode("utf-8").strip().split()
pidList.pop(0) # remove the key word 'ProcessId'
pidList.pop(0) # remove the key word 'ProcessId'
pidList = list(map(int, pidList))
pidList.remove(os.getpid())
return not pidList
else:
pgrep_output = subprocess.check_output('pgrep -fxu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pidList.append(int(pid))
pidList.remove(os.getpid())
pid = pid.decode()
if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()):
continue
pidList.append(pid)
return not pidList
def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
# GPU metrics collector is already running. Exit
print("GPU metrics collector is already running. exiting...")
exit(2)
cmd = 'nvidia-smi -q -x'.split()
while(True):
......@@ -44,6 +48,7 @@ def main(argv):
# TODO: change to sleep time configurable via arguments
time.sleep(5)
def parse_nvidia_smi_result(smi, outputDir):
try:
old_umask = os.umask(0)
......@@ -70,13 +75,14 @@ def parse_nvidia_smi_result(smi, outputDir):
outPut["gpuInfos"].append(gpuInfo)
print(outPut)
outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
outputFile.flush();
except:
outputFile.flush()
except Exception as error:
# e_info = sys.exc_info()
print('xmldoc paring error')
print('gpu_metrics_collector error: %s' % error)
finally:
os.umask(old_umask)
def gen_empty_gpu_metric(outputDir):
try:
old_umask = os.umask(0)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment