"...composable_kernel_onnxruntime.git" did not exist on "bf7e7d62a8a6753ac661879cdad061478498eea3"
Unverified Commit 125ec21f authored by Shudong Yang's avatar Shudong Yang Committed by GitHub
Browse files

Fix reusable k8s training service bug (#5045)

parent 9e8a0bf0
......@@ -3,6 +3,7 @@
'use strict';
import cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
import * as component from '../../../../common/component';
......@@ -81,7 +82,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
const frameworkcontrollerJobName: string = `nniexp${this.experimentId}env${environment.id}`.toLowerCase();
const command = this.generateCommandScript(this.config.taskRoles, environment.command);
await fs.promises.writeFile(path.join(this.environmentLocalTempFolder, "run.sh"), command, { encoding: 'utf8' });
await fs.promises.writeFile(path.join(this.environmentLocalTempFolder, `${environment.id}_run.sh`), command, { encoding: 'utf8' });
// upload script files to storage
const trialJobOutputUrl: string = await this.uploadFolder(this.environmentLocalTempFolder, `nni/${this.experimentId}`);
......@@ -106,7 +107,13 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
}
return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, 2);
} else {
// do not need to upload files to nfs server, temp folder already mounted to nfs
try {
// copy envs and run.sh from environment-temp to nfs-root (mounted)
await cpp.exec(`mkdir -p ${this.nfsRootDir}/${destDirectory}`);
await cpp.exec(`cp -r ${srcDirectory}/* ${this.nfsRootDir}/${destDirectory}`);
} catch (uploadError) {
return Promise.reject(uploadError);
}
return `nfs://${this.config.storage.server}:${destDirectory}`;
}
}
......@@ -174,7 +181,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
const taskRole: any = this.generateTaskRoleConfig(
trialWorkingFolder,
this.config.taskRoles[index].dockerImage,
`run.sh`,
`${envId}_run.sh`,
podResources[index],
containerPort,
await this.createRegistrySecret(this.config.taskRoles[index].privateRegistryAuthPath)
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import cpp from 'child-process-promise';
import fs from 'fs';
import path from 'path';
import * as component from 'common/component';
......@@ -85,7 +86,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
const kubeflowJobName: string = `nniexp${this.experimentId}env${environment.id}`.toLowerCase();
await fs.promises.writeFile(path.join(this.environmentLocalTempFolder, "run.sh"), environment.command, { encoding: 'utf8' });
await fs.promises.writeFile(path.join(this.environmentLocalTempFolder, `${environment.id}_run.sh`), environment.command, { encoding: 'utf8' });
// upload script files to storage
const trialJobOutputUrl: string = await this.uploadFolder(this.environmentLocalTempFolder, `nni/${this.experimentId}`);
......@@ -106,7 +107,13 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
}
return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, 2);
} else {
// do not need to upload files to nfs server, temp folder already mounted to nfs
try {
// copy envs and run.sh from environment-temp to nfs-root (mounted)
await cpp.exec(`mkdir -p ${this.nfsRootDir}/${destDirectory}`);
await cpp.exec(`cp -r ${srcDirectory}/* ${this.nfsRootDir}/${destDirectory}`);
} catch (uploadError) {
return Promise.reject(uploadError);
}
return `nfs://${this.config.storage.server}:${destDirectory}`;
}
}
......@@ -159,24 +166,26 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
if (this.config.worker) {
const privateRegistrySecretName = await this.createRegistrySecret(this.config.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(this.config.worker.replicas,
this.config.worker.dockerImage, 'run.sh', workerPodResources, privateRegistrySecretName);
this.config.worker.dockerImage,
`${envId}_run.sh`, workerPodResources, privateRegistrySecretName);
}
if (this.config.ps !== undefined) {
const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.ps.privateRegistryAuthPath);
replicaSpecsObj.Ps = this.generateReplicaConfig(this.config.ps.replicas,
this.config.ps.dockerImage, 'run.sh', nonWorkerPodResources, privateRegistrySecretName);
this.config.ps.dockerImage,
`${envId}_run.sh`, nonWorkerPodResources, privateRegistrySecretName);
}
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj});
} else if (this.config.operator === 'pytorch-operator') {
if (this.config.worker !== undefined) {
const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(this.config.worker.replicas,
this.config.worker.dockerImage, 'run.sh', workerPodResources, privateRegistrySecretName);
this.config.worker.dockerImage, `${envId}_run.sh`, workerPodResources, privateRegistrySecretName);
}
if (this.config.master !== undefined) {
const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.master.privateRegistryAuthPath);
replicaSpecsObj.Master = this.generateReplicaConfig(this.config.master.replicas,
this.config.master.dockerImage, 'run.sh', nonWorkerPodResources, privateRegistrySecretName);
this.config.master.dockerImage, `${envId}_run.sh`, nonWorkerPodResources, privateRegistrySecretName);
}
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj});
......
......@@ -32,6 +32,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
protected CONTAINER_MOUNT_PATH: string;
protected log: Logger = getLogger('KubernetesEnvironmentService');
protected environmentWorkingFolder: string;
protected nfsRootDir: string;
constructor(_config: any, info: ExperimentStartupInfo) {
super();
......@@ -39,6 +40,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
this.genericK8sClient = new GeneralK8sClient();
this.experimentRootDir = info.logDir;
this.environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp");
this.nfsRootDir = path.join(this.experimentRootDir, "nfs-root");
this.experimentId = info.experimentId;
this.environmentWorkingFolder = path.join(this.CONTAINER_MOUNT_PATH, 'nni', this.experimentId);
}
......@@ -147,11 +149,11 @@ export class KubernetesEnvironmentService extends EnvironmentService {
}
protected async createNFSStorage(nfsServer: string, nfsPath: string): Promise<void> {
await cpp.exec(`mkdir -p ${this.environmentLocalTempFolder}`);
await cpp.exec(`mkdir -p ${this.nfsRootDir}`);
try {
await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.environmentLocalTempFolder}`);
await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.nfsRootDir}`);
} catch (error) {
const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.environmentLocalTempFolder} failed, error is ${error}`;
const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.nfsRootDir} failed, error is ${error}`;
this.log.error(mountError);
return Promise.reject(mountError);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment