"examples/vscode:/vscode.git/clone" did not exist on "aa2cc9223e8c7223ac1da49492332e7ec32298d9"
Unverified Commit e9f137f0 authored by QuanluZhang, committed by GitHub

merge from master (#2019)

parent f7cf3ea5
......@@ -38,6 +38,7 @@ export namespace ValidationSchemas {
authFile: joi.string(),
nniManagerNFSMountPath: joi.string().min(1),
containerNFSMountPath: joi.string().min(1),
paiConfigPath: joi.string(),
paiStoragePlugin: joi.string().min(1),
nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
portList: joi.array().items(joi.object({
......
......@@ -31,10 +31,11 @@ export class NNIPAIK8STrialConfig extends TrialConfig {
public readonly nniManagerNFSMountPath: string;
public readonly containerNFSMountPath: string;
public readonly paiStoragePlugin: string;
public readonly paiConfigPath?: string;
constructor(command: string, codeDir: string, gpuNum: number, cpuNum: number, memoryMB: number,
image: string, nniManagerNFSMountPath: string, containerNFSMountPath: string,
paiStoragePlugin: string, virtualCluster?: string) {
paiStoragePlugin: string, virtualCluster?: string, paiConfigPath?: string) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
......@@ -43,5 +44,6 @@ export class NNIPAIK8STrialConfig extends TrialConfig {
this.nniManagerNFSMountPath = nniManagerNFSMountPath;
this.containerNFSMountPath = containerNFSMountPath;
this.paiStoragePlugin = paiStoragePlugin;
this.paiConfigPath = paiConfigPath;
}
}
......@@ -44,6 +44,7 @@ import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig';
import { PAIJobRestServer } from '../paiJobRestServer';
const yaml = require('js-yaml');
const deepmerge = require('deepmerge');
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
......@@ -59,6 +60,10 @@ class PAIK8STrainingService extends PAITrainingService {
public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) {
case TrialConfigMetadataKey.NNI_MANAGER_IP:
this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
break;
case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG:
this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService));
this.paiClusterConfig = <PAIClusterConfig>JSON.parse(value);
......@@ -185,7 +190,19 @@ class PAIK8STrainingService extends PAITrainingService {
}
}
return yaml.safeDump(paiJobConfig);
if (this.paiTrialConfig.paiConfigPath) {
try {
const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8'));
//deepmerge(x, y): if an element at the same key is present in both x and y, the value from y appears in the merged result.
//See: https://github.com/TehShrike/deepmerge
const overwriteMerge = (destinationArray: any, sourceArray: any, options: any) => sourceArray;
return yaml.safeDump(deepmerge(additionalPAIConfig, paiJobConfig, { arrayMerge: overwriteMerge }));
} catch (error) {
this.log.error(`Error occurred while loading and merging ${this.paiTrialConfig.paiConfigPath}: ${error}`);
}
} else {
return yaml.safeDump(paiJobConfig);
}
}
protected async submitTrialJobToPAI(trialJobId: string): Promise<boolean> {
......@@ -254,7 +271,7 @@ class PAIK8STrainingService extends PAITrainingService {
this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`);
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobId, nniPaiTrialCommand);
this.log.debug(paiJobConfig);
// Step 3. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const submitJobRequest: request.Options = {
......
......@@ -1112,6 +1112,11 @@ deepmerge@^2.1.1:
version "2.2.1"
resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-2.2.1.tgz#5d3ff22a01c00f645405a2fbc17d0778a1801170"
deepmerge@^4.2.2:
version "4.2.2"
resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-4.2.2.tgz#44d2ea3679b8f4d4ffba33f03d865fc1e7bf4955"
integrity sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg==
default-require-extensions@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/default-require-extensions/-/default-require-extensions-2.0.0.tgz#f5f8fbb18a7d6d50b21f641f649ebb522cfe24f7"
......
......@@ -113,7 +113,7 @@ class AGP_Pruner(Pruner):
if k == 0 or target_sparsity >= 1 or target_sparsity <= 0:
return mask
# if we want to generate a new mask, we should update the weight first
w_abs = weight.abs() * mask
w_abs = weight.abs() * mask['weight']
threshold = torch.topk(w_abs.view(-1), k, largest=False)[0].max()
new_mask = {'weight': torch.gt(w_abs, threshold).type_as(weight)}
self.mask_dict.update({op_name: new_mask})
......
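Below is a minimal sketch (an editor's illustration, not part of this commit) of the mask update shown above, assuming a toy weight tensor and a mask dict keyed by 'weight' in the format the pruner now expects; the tensor size and sparsity value are made up.

import torch

weight = torch.randn(8, 8)
mask = {'weight': torch.ones_like(weight)}    # current binary mask, dict-keyed by parameter name
target_sparsity = 0.5
k = int(weight.numel() * target_sparsity)

w_abs = weight.abs() * mask['weight']         # already-pruned entries stay at 0, so they remain pruned
threshold = torch.topk(w_abs.view(-1), k, largest=False)[0].max()
new_mask = {'weight': torch.gt(w_abs, threshold).type_as(weight)}
print(new_mask['weight'].mean())              # roughly 1 - target_sparsity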
......@@ -31,11 +31,11 @@ def test():
# [1,1,1,1,1,1,1,1,1,1],
# [1,1,1,1,1,1,1,1,1,1]]
assessor = MedianstopAssessor(FLAGS.start_step, FLAGS.optimize_mode)
for i in range(4):
assessor = MedianstopAssessor(FLAGS.optimize_mode, FLAGS.start_step)
for i in range(len(lcs)):
#lc = []
to_complete = True
for k in range(10):
for k in range(len(lcs[0])):
#d = random.randint(i*100+0, i*100+100)
#lc.append(d)
ret = assessor.assess_trial(i, lcs[i][:k+1])
......
......@@ -68,6 +68,13 @@ class ClassicMutator(Mutator):
else:
# get chosen arch from tuner
self._chosen_arch = nni.get_next_parameter()
if self._chosen_arch is None:
if trial_env_vars.NNI_PLATFORM == "unittest":
# happens if NNI_PLATFORM is intentionally set, e.g., in UT
logger.warning("`NNI_PLATFORM` is set but `param` is None. Falling back to standalone mode.")
self._chosen_arch = self._standalone_generate_chosen()
else:
raise RuntimeError("Chosen architecture is None. This may be a platform error.")
self.reset()
def _sample_layer_choice(self, mutable, idx, value, search_space_item):
......@@ -169,6 +176,8 @@ class ClassicMutator(Mutator):
elif val["_type"] == INPUT_CHOICE:
choices = val["_value"]["candidates"]
n_chosen = val["_value"]["n_chosen"]
if n_chosen is None:
n_chosen = len(choices)
chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))}
else:
raise ValueError("Unknown key '%s' and value '%s'." % (key, val))
......
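The two hunks above add a standalone fallback and make n_chosen default to "all candidates". A small sketch (editor's illustration) of that default-architecture logic applied to a toy search-space dict; the literal type strings, the layer_choice branch, and the helper name generate_default_arch are assumptions for illustration and may not match the module's constants exactly.

def generate_default_arch(search_space):
    chosen_arch = {}
    for key, val in search_space.items():
        if val["_type"] == "layer_choice":
            # pick the first operator candidate by default
            chosen_arch[key] = {"_value": val["_value"][0], "_idx": 0}
        elif val["_type"] == "input_choice":
            choices = val["_value"]["candidates"]
            n_chosen = val["_value"]["n_chosen"]
            if n_chosen is None:              # None now means "choose every candidate"
                n_chosen = len(choices)
            chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))}
        else:
            raise ValueError("Unknown key '%s' and value '%s'." % (key, val))
    return chosen_arch

print(generate_default_arch({
    "cell_op": {"_type": "layer_choice", "_value": ["conv3x3", "conv5x5"]},
    "cell_input": {"_type": "input_choice",
                   "_value": {"candidates": ["prev", "pprev"], "n_chosen": None}},
}))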
......@@ -63,18 +63,23 @@ class DartsMutator(Mutator):
edges_max[mutable.key] = max_val
result[mutable.key] = F.one_hot(index, num_classes=mutable.length).view(-1).bool()
for mutable in self.mutables:
if isinstance(mutable, InputChoice) and mutable.n_chosen is not None:
weights = []
for src_key in mutable.choose_from:
if src_key not in edges_max:
_logger.warning("InputChoice.NO_KEY in '%s' is weighted 0 when selecting inputs.", mutable.key)
weights.append(edges_max.get(src_key, 0.))
weights = torch.tensor(weights) # pylint: disable=not-callable
_, topk_edge_indices = torch.topk(weights, mutable.n_chosen)
selected_multihot = []
for i, src_key in enumerate(mutable.choose_from):
if i not in topk_edge_indices and src_key in result:
result[src_key] = torch.zeros_like(result[src_key]) # clear this choice to optimize calc graph
selected_multihot.append(i in topk_edge_indices)
result[mutable.key] = torch.tensor(selected_multihot, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable
if isinstance(mutable, InputChoice):
if mutable.n_chosen is not None:
weights = []
for src_key in mutable.choose_from:
if src_key not in edges_max:
_logger.warning("InputChoice.NO_KEY in '%s' is weighted 0 when selecting inputs.", mutable.key)
weights.append(edges_max.get(src_key, 0.))
weights = torch.tensor(weights) # pylint: disable=not-callable
_, topk_edge_indices = torch.topk(weights, mutable.n_chosen)
selected_multihot = []
for i, src_key in enumerate(mutable.choose_from):
if i not in topk_edge_indices and src_key in result:
# If an edge is never selected, there is no need to calculate any op on this edge.
# This is to eliminate redundant calculation.
result[src_key] = torch.zeros_like(result[src_key])
selected_multihot.append(i in topk_edge_indices)
result[mutable.key] = torch.tensor(selected_multihot, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable
else:
result[mutable.key] = torch.ones(mutable.n_candidates, dtype=torch.bool, device=self.device()) # pylint: disable=not-callable
return result
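A compact sketch (editor's illustration) of the input-edge selection above: given the strongest op weight seen on each incoming edge, keep only the top n_chosen edges as a multi-hot mask. The edge names and weights below are made up.

import torch

edges_max = {"node_0": 0.9, "node_1": 0.2, "node_2": 0.7}   # best op weight per incoming edge
choose_from = ["node_0", "node_1", "node_2"]
n_chosen = 2

weights = torch.tensor([edges_max.get(k, 0.) for k in choose_from])
_, topk_edge_indices = torch.topk(weights, n_chosen)
selected_multihot = [i in topk_edge_indices for i in range(len(choose_from))]
print(selected_multihot)                                     # [True, False, True]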
......@@ -58,16 +58,16 @@ def _encode_tensor(data):
return data
def apply_fixed_architecture(model, fixed_arc_path):
def apply_fixed_architecture(model, fixed_arc):
"""
Load architecture from `fixed_arc_path` and apply to model.
Load architecture from `fixed_arc` and apply to model.
Parameters
----------
model : torch.nn.Module
Model with mutables.
fixed_arc_path : str
Path to the JSON that stores the architecture.
fixed_arc : str or dict
Path to the JSON that stores the architecture, or dict that stores the exported architecture.
Returns
-------
......@@ -75,8 +75,8 @@ def apply_fixed_architecture(model, fixed_arc_path):
Mutator that is responsible for fixing the graph.
"""
if isinstance(fixed_arc_path, str):
with open(fixed_arc_path, "r") as f:
if isinstance(fixed_arc, str):
with open(fixed_arc) as f:
fixed_arc = json.load(f)
fixed_arc = _encode_tensor(fixed_arc)
architecture = FixedArchitecture(model, fixed_arc)
......
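With this change the entry point accepts either a JSON path or an exported dict. A self-contained usage sketch (editor's illustration; TinySpace is a placeholder model, and the commented path form assumes a previously saved JSON file):

import torch.nn as nn
from nni.nas.pytorch.mutables import LayerChoice
from nni.nas.pytorch.random import RandomMutator
from nni.nas.pytorch.fixed import apply_fixed_architecture

class TinySpace(nn.Module):
    def __init__(self):
        super().__init__()
        self.op = LayerChoice([nn.Conv2d(3, 8, 3, padding=1),
                               nn.Conv2d(3, 8, 5, padding=2)], key="op")
    def forward(self, x):
        return self.op(x)

mutator = RandomMutator(TinySpace())
mutator.reset()
arc = mutator.export()                          # exported architecture as a dict
apply_fixed_architecture(TinySpace(), arc)      # dict form: no JSON round-trip needed
# apply_fixed_architecture(TinySpace(), "checkpoints/arc.json")   # path form still works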
......@@ -20,6 +20,14 @@ def global_mutable_counting():
return _counter
def _reset_global_mutable_counting():
"""
Reset the global mutable counting to count from 1. Useful when defining multiple models with default keys.
"""
global _counter
_counter = 0
def to_device(obj, device):
"""
Move a tensor, tuple, list, or dict onto device.
......
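A short sketch (editor's illustration) of why the reset helper matters, mirroring how the new unit test uses it: resetting between model definitions keeps the auto-generated mutable keys aligned, so an architecture exported from one instance applies cleanly to another.

import torch.nn as nn
from nni.nas.pytorch.mutables import LayerChoice
from nni.nas.pytorch.utils import _reset_global_mutable_counting

class Space(nn.Module):
    def __init__(self):
        super().__init__()
        # key is auto-generated from the global counter
        self.op = LayerChoice([nn.Conv2d(3, 8, 3, padding=1), nn.Conv2d(3, 8, 5, padding=2)])

_reset_global_mutable_counting()
first = Space()
_reset_global_mutable_counting()
second = Space()
assert first.op.key == second.op.key     # identical default keys across both models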
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .mutable_scope import SpaceWithMutableScope
from .naive import NaiveSearchSpace
from .nested import NestedSpace
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
import torch.nn.functional as F
from nni.nas.pytorch.mutables import LayerChoice, InputChoice, MutableScope
class Cell(MutableScope):
def __init__(self, cell_name, prev_labels, channels):
super().__init__(cell_name)
self.input_choice = InputChoice(choose_from=prev_labels, n_chosen=1, return_mask=True,
key=cell_name + "_input")
self.op_choice = LayerChoice([
nn.Conv2d(channels, channels, 3, padding=1),
nn.Conv2d(channels, channels, 5, padding=2),
nn.MaxPool2d(3, stride=1, padding=1),
nn.AvgPool2d(3, stride=1, padding=1),
nn.Identity()
], key=cell_name + "_op")
def forward(self, prev_layers):
chosen_input, chosen_mask = self.input_choice(prev_layers)
cell_out = self.op_choice(chosen_input)
return cell_out, chosen_mask
class Node(MutableScope):
def __init__(self, node_name, prev_node_names, channels):
super().__init__(node_name)
self.cell_x = Cell(node_name + "_x", prev_node_names, channels)
self.cell_y = Cell(node_name + "_y", prev_node_names, channels)
def forward(self, prev_layers):
out_x, mask_x = self.cell_x(prev_layers)
out_y, mask_y = self.cell_y(prev_layers)
return out_x + out_y, mask_x | mask_y
class Layer(nn.Module):
def __init__(self, num_nodes, channels):
super().__init__()
self.num_nodes = num_nodes
self.nodes = nn.ModuleList()
node_labels = [InputChoice.NO_KEY, InputChoice.NO_KEY]
for i in range(num_nodes):
node_labels.append("node_{}".format(i))
self.nodes.append(Node(node_labels[-1], node_labels[:-1], channels))
self.final_conv_w = nn.Parameter(torch.zeros(channels, self.num_nodes + 2, channels, 1, 1),
requires_grad=True)
self.bn = nn.BatchNorm2d(channels, affine=False)
def forward(self, pprev, prev):
prev_nodes_out = [pprev, prev]
nodes_used_mask = torch.zeros(self.num_nodes + 2, dtype=torch.bool, device=prev.device)
for i in range(self.num_nodes):
node_out, mask = self.nodes[i](prev_nodes_out)
nodes_used_mask[:mask.size(0)] |= mask.to(prev.device)
# NOTE: which device should we put mask on?
prev_nodes_out.append(node_out)
unused_nodes = torch.cat([out for used, out in zip(nodes_used_mask, prev_nodes_out) if not used], 1)
unused_nodes = F.relu(unused_nodes)
conv_weight = self.final_conv_w[:, ~nodes_used_mask, :, :, :]
conv_weight = conv_weight.view(conv_weight.size(0), -1, 1, 1)
out = F.conv2d(unused_nodes, conv_weight)
return prev, self.bn(out)
class SpaceWithMutableScope(nn.Module):
def __init__(self, test_case, num_layers=4, num_nodes=5, channels=16, in_channels=3, num_classes=10):
super().__init__()
self.test_case = test_case
self.num_layers = num_layers
self.stem = nn.Sequential(
nn.Conv2d(in_channels, channels, 3, 1, 1, bias=False),
nn.BatchNorm2d(channels)
)
self.layers = nn.ModuleList()
for _ in range(self.num_layers + 2):
self.layers.append(Layer(num_nodes, channels))
self.gap = nn.AdaptiveAvgPool2d(1)
self.dense = nn.Linear(channels, num_classes)
def forward(self, x):
prev = cur = self.stem(x)
for layer in self.layers:
prev, cur = layer(prev, cur)
cur = self.gap(F.relu(cur)).view(x.size(0), -1)
return self.dense(cur)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
import torch.nn.functional as F
from nni.nas.pytorch.mutables import LayerChoice, InputChoice
class NaiveSearchSpace(nn.Module):
def __init__(self, test_case):
super().__init__()
self.test_case = test_case
self.conv1 = LayerChoice([nn.Conv2d(3, 6, 3, padding=1), nn.Conv2d(3, 6, 5, padding=2)])
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = LayerChoice([nn.Conv2d(6, 16, 3, padding=1), nn.Conv2d(6, 16, 5, padding=2)],
return_mask=True)
self.conv3 = nn.Conv2d(16, 16, 1)
self.skipconnect = InputChoice(n_candidates=1)
self.skipconnect2 = InputChoice(n_candidates=2, return_mask=True)
self.bn = nn.BatchNorm2d(16)
self.gap = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(16, 10)
def forward(self, x):
bs = x.size(0)
x = self.pool(F.relu(self.conv1(x)))
x0, mask = self.conv2(x)
self.test_case.assertEqual(mask.size(), torch.Size([2]))
x1 = F.relu(self.conv3(x0))
_, mask = self.skipconnect2([x0, x1])
x0 = self.skipconnect([x0])
if x0 is not None:
x1 += x0
x = self.pool(self.bn(x1))
self.test_case.assertEqual(mask.size(), torch.Size([2]))
x = self.gap(x).view(bs, -1)
x = self.fc(x)
return x
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch.nn as nn
import torch.nn.functional as F
from nni.nas.pytorch.mutables import LayerChoice, InputChoice
class MutableOp(nn.Module):
def __init__(self, kernel_size):
super().__init__()
self.conv = nn.Conv2d(3, 120, kernel_size, padding=kernel_size // 2)
self.nested_mutable = InputChoice(n_candidates=10)
def forward(self, x):
return self.conv(x)
class NestedSpace(nn.Module):
# this search space is expected to fail: nested mutables are not supported (see test_nested_space)
def __init__(self, test_case):
super().__init__()
self.test_case = test_case
self.conv1 = LayerChoice([MutableOp(3), MutableOp(5)])
self.gap = nn.AdaptiveAvgPool2d(1)
self.fc1 = nn.Linear(120, 10)
def forward(self, x):
bs = x.size(0)
x = F.relu(self.conv1(x))
x = self.gap(x).view(bs, -1)
x = self.fc1(x)
return x
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import importlib
import os
import sys
from unittest import TestCase, main
import torch
import torch.nn as nn
from nni.nas.pytorch.classic_nas import get_and_apply_next_architecture
from nni.nas.pytorch.darts import DartsMutator
from nni.nas.pytorch.enas import EnasMutator
from nni.nas.pytorch.fixed import apply_fixed_architecture
from nni.nas.pytorch.random import RandomMutator
from nni.nas.pytorch.utils import _reset_global_mutable_counting
class NasTestCase(TestCase):
def setUp(self):
self.default_input_size = [3, 32, 32]
self.model_path = os.path.join(os.path.dirname(__file__), "models")
sys.path.append(self.model_path)
self.model_module = importlib.import_module("pytorch_models")
self.default_cls = [self.model_module.NaiveSearchSpace, self.model_module.SpaceWithMutableScope]
self.cuda_test = [0]
if torch.cuda.is_available():
self.cuda_test.append(1)
if torch.cuda.device_count() > 1:
self.cuda_test.append(torch.cuda.device_count())
def tearDown(self):
sys.path.remove(self.model_path)
def iterative_sample_and_forward(self, model, mutator=None, input_size=None, n_iters=20, test_backward=True,
use_cuda=False):
if input_size is None:
input_size = self.default_input_size
# support pytorch only
input_size = [8 if use_cuda else 2] + input_size # at least 2 samples to enable batch norm
for _ in range(n_iters):
for param in model.parameters():
param.grad = None
if mutator is not None:
mutator.reset()
x = torch.randn(input_size)
if use_cuda:
x = x.cuda()
y = torch.sum(model(x))
if test_backward:
y.backward()
def default_mutator_test_pipeline(self, mutator_cls):
for model_cls in self.default_cls:
for cuda_test in self.cuda_test:
_reset_global_mutable_counting()
model = model_cls(self)
mutator = mutator_cls(model)
if cuda_test:
model.cuda()
mutator.cuda()
if cuda_test > 1:
model = nn.DataParallel(model)
self.iterative_sample_and_forward(model, mutator, use_cuda=cuda_test)
_reset_global_mutable_counting()
model_fixed = model_cls(self)
if cuda_test:
model_fixed.cuda()
if cuda_test > 1:
model_fixed = nn.DataParallel(model_fixed)
with torch.no_grad():
arc = mutator.export()
apply_fixed_architecture(model_fixed, arc)
self.iterative_sample_and_forward(model_fixed, n_iters=1, use_cuda=cuda_test)
def test_random_mutator(self):
self.default_mutator_test_pipeline(RandomMutator)
def test_enas_mutator(self):
self.default_mutator_test_pipeline(EnasMutator)
def test_darts_mutator(self):
# DARTS doesn't support DataParallel. To be fixed.
self.cuda_test = [t for t in self.cuda_test if t <= 1]
self.default_mutator_test_pipeline(DartsMutator)
def test_apply_twice(self):
model = self.model_module.NaiveSearchSpace(self)
with self.assertRaises(RuntimeError):
for _ in range(2):
RandomMutator(model)
def test_nested_space(self):
model = self.model_module.NestedSpace(self)
with self.assertRaises(RuntimeError):
RandomMutator(model)
def test_classic_nas(self):
for model_cls in self.default_cls:
model = model_cls(self)
get_and_apply_next_architecture(model)
self.iterative_sample_and_forward(model)
if __name__ == '__main__':
main()
......@@ -29,6 +29,12 @@ def gen_new_config(config_file, training_service='local'):
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
if training_service == 'frameworkcontroller':
it_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
deep_update(config, it_config['all'])
deep_update(config, it_config[training_service])
......@@ -106,7 +112,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow'], default='local')
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'], default='local')
parser.add_argument("--local_gpu", action='store_true')
parser.add_argument("--preinstall", action='store_true')
args = parser.parse_args()
......
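The frameworkcontroller branch above relies on the same deep_update merge the other training services use. A minimal sketch of that recursive merge (editor's illustration; the real deep_update lives in the test utilities and may differ in detail):

def deep_update(base, override):
    """Recursively merge `override` into `base`, descending into nested dicts."""
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value

config = {'trial': {'command': 'python3 mnist.py', 'gpuNum': 1}}
deep_update(config, {'trial': {'gpuNum': 0}, 'maxExecDuration': '15m'})
print(config)   # {'trial': {'command': 'python3 mnist.py', 'gpuNum': 0}, 'maxExecDuration': '15m'}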
......@@ -42,6 +42,21 @@ def update_training_service_config(args):
config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None:
config[args.ts]['trial']['worker']['image'] = args.nni_docker_image
elif args.ts == 'frameworkcontroller':
if args.nfs_server is not None:
config[args.ts]['frameworkcontrollerConfig']['nfs']['server'] = args.nfs_server
if args.nfs_path is not None:
config[args.ts]['frameworkcontrollerConfig']['nfs']['path'] = args.nfs_path
if args.keyvault_vaultname is not None:
config[args.ts]['frameworkcontrollerConfig']['keyVault']['vaultName'] = args.keyvault_vaultname
if args.keyvault_name is not None:
config[args.ts]['frameworkcontrollerConfig']['keyVault']['name'] = args.keyvault_name
if args.azs_account is not None:
config[args.ts]['frameworkcontrollerConfig']['azureStorage']['accountName'] = args.azs_account
if args.azs_share is not None:
config[args.ts]['frameworkcontrollerConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None:
config[args.ts]['trial']['taskRoles'][0]['image'] = args.nni_docker_image
elif args.ts == 'remote':
if args.remote_user is not None:
config[args.ts]['machineList'][0]['username'] = args.remote_user
......@@ -69,7 +84,7 @@ def convert_command():
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local'], default='pai')
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller'], default='pai')
parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str)
# args for PAI
......@@ -79,7 +94,7 @@ if __name__ == '__main__':
parser.add_argument("--data_dir", type=str)
parser.add_argument("--output_dir", type=str)
parser.add_argument("--vc", type=str)
# args for kubeflow
# args for kubeflow and frameworkController
parser.add_argument("--nfs_server", type=str)
parser.add_argument("--nfs_path", type=str)
parser.add_argument("--keyvault_vaultname", type=str)
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
jobs:
- job: 'integration_test_frameworkController'
timeoutInMinutes: 0
steps:
- script: python3 -m pip install --upgrade pip setuptools --user
displayName: 'Install python tools'
- script: |
cd deployment/pypi
echo 'building prerelease package...'
make build
ls $(Build.SourcesDirectory)/deployment/pypi/dist/
condition: eq( variables['build_docker_img'], 'true' )
displayName: 'build nni bdist_wheel'
- script: |
source install.sh
displayName: 'Install nni toolkit via source code'
- script: |
sudo apt-get install swig -y
PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC
PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB
displayName: 'Install dependencies for integration tests in frameworkcontroller mode'
- script: |
if [ $(build_docker_img) = 'true' ]
then
cd deployment/pypi
docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
echo 'updating docker file for installing nni from local...'
# update Dockerfile to install NNI in docker image from whl file built in last step
sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
cat ../docker/Dockerfile
export IMG_TAG=`date -u +%y%m%d%H%M`
docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
docker push $(test_docker_img_name):$IMG_TAG
export TEST_IMG=$(test_docker_img_name):$IMG_TAG
cd ../../
else
export TEST_IMG=$(existing_docker_img)
fi
echo "TEST_IMG:$TEST_IMG"
cd test
python3 generate_ts_config.py --ts frameworkcontroller --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
--azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
cat training_service.yml
PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts frameworkcontroller --exclude multi_phase
displayName: 'integration test'
......@@ -8,7 +8,7 @@ jobs:
- script: |
python -m pip install scikit-learn==0.20.0 --user
python -m pip install keras==2.1.6 --user
python -m pip install https://download.pytorch.org/whl/cu90/torch-0.4.1-cp36-cp36m-win_amd64.whl --user
python -m pip install torch===1.2.0 torchvision===0.4.1 -f https://download.pytorch.org/whl/torch_stable.html --user
python -m pip install torchvision --user
python -m pip install tensorflow-gpu==1.11.0 --user
displayName: 'Install dependencies for integration tests'
......
......@@ -24,6 +24,32 @@ kubeflow:
image:
trainingServicePlatform: kubeflow
frameworkcontroller:
maxExecDuration: 15m
nniManagerIp:
frameworkcontrollerConfig:
serviceAccountName: frameworkbarrier
storage: azureStorage
keyVault:
vaultName:
name:
azureStorage:
accountName:
azureShare:
trial:
taskRoles:
- name: worker
taskNum: 1
command:
gpuNum: 1
cpuNum: 1
memoryMB: 8192
image:
frameworkAttemptCompletionPolicy:
minFailedTaskCount: 1
minSucceededTaskCount: 1
trainingServicePlatform: frameworkcontroller
local:
trainingServicePlatform: local
pai:
......