Unverified commit 125ec21f, authored by Shudong Yang and committed by GitHub
Browse files

Fix reusable k8s training service bug (#5045)

parent 9e8a0bf0
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
'use strict'; 'use strict';
import cpp from 'child-process-promise';
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import * as component from '../../../../common/component'; import * as component from '../../../../common/component';
...@@ -81,7 +82,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment ...@@ -81,7 +82,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
const frameworkcontrollerJobName: string = `nniexp${this.experimentId}env${environment.id}`.toLowerCase(); const frameworkcontrollerJobName: string = `nniexp${this.experimentId}env${environment.id}`.toLowerCase();
const command = this.generateCommandScript(this.config.taskRoles, environment.command); const command = this.generateCommandScript(this.config.taskRoles, environment.command);
await fs.promises.writeFile(path.join(this.environmentLocalTempFolder, "run.sh"), command, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(this.environmentLocalTempFolder, `${environment.id}_run.sh`), command, { encoding: 'utf8' });
//upload script files to storage //upload script files to storage
const trialJobOutputUrl: string = await this.uploadFolder(this.environmentLocalTempFolder, `nni/${this.experimentId}`); const trialJobOutputUrl: string = await this.uploadFolder(this.environmentLocalTempFolder, `nni/${this.experimentId}`);
...@@ -106,7 +107,13 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment ...@@ -106,7 +107,13 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
} }
return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, 2); return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, 2);
} else { } else {
// do not need to upload files to nfs server, temp folder already mounted to nfs try {
// copy envs and run.sh from environments-temp to nfs-root(mounted)
await cpp.exec(`mkdir -p ${this.nfsRootDir}/${destDirectory}`);
await cpp.exec(`cp -r ${srcDirectory}/* ${this.nfsRootDir}/${destDirectory}`);
} catch (uploadError) {
return Promise.reject(uploadError);
}
return `nfs://${this.config.storage.server}:${destDirectory}`; return `nfs://${this.config.storage.server}:${destDirectory}`;
} }
} }
...@@ -174,7 +181,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment ...@@ -174,7 +181,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
const taskRole: any = this.generateTaskRoleConfig( const taskRole: any = this.generateTaskRoleConfig(
trialWorkingFolder, trialWorkingFolder,
this.config.taskRoles[index].dockerImage, this.config.taskRoles[index].dockerImage,
`run.sh`, `${envId}_run.sh`,
podResources[index], podResources[index],
containerPort, containerPort,
await this.createRegistrySecret(this.config.taskRoles[index].privateRegistryAuthPath) await this.createRegistrySecret(this.config.taskRoles[index].privateRegistryAuthPath)
......
// Copyright (c) Microsoft Corporation. // Copyright (c) Microsoft Corporation.
// Licensed under the MIT license. // Licensed under the MIT license.
import cpp from 'child-process-promise';
import fs from 'fs'; import fs from 'fs';
import path from 'path'; import path from 'path';
import * as component from 'common/component'; import * as component from 'common/component';
...@@ -85,7 +86,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService { ...@@ -85,7 +86,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
const kubeflowJobName: string = `nniexp${this.experimentId}env${environment.id}`.toLowerCase(); const kubeflowJobName: string = `nniexp${this.experimentId}env${environment.id}`.toLowerCase();
await fs.promises.writeFile(path.join(this.environmentLocalTempFolder, "run.sh"), environment.command, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(this.environmentLocalTempFolder, `${environment.id}_run.sh`), environment.command, { encoding: 'utf8' });
//upload script files to storage //upload script files to storage
const trialJobOutputUrl: string = await this.uploadFolder(this.environmentLocalTempFolder, `nni/${this.experimentId}`); const trialJobOutputUrl: string = await this.uploadFolder(this.environmentLocalTempFolder, `nni/${this.experimentId}`);
...@@ -106,7 +107,13 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService { ...@@ -106,7 +107,13 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
} }
return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, 2); return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, 2);
} else { } else {
// do not need to upload files to nfs server, temp folder already mounted to nfs try {
// copy envs and run.sh from environments-temp to nfs-root(mounted)
await cpp.exec(`mkdir -p ${this.nfsRootDir}/${destDirectory}`);
await cpp.exec(`cp -r ${srcDirectory}/* ${this.nfsRootDir}/${destDirectory}`);
} catch (uploadError) {
return Promise.reject(uploadError);
}
return `nfs://${this.config.storage.server}:${destDirectory}`; return `nfs://${this.config.storage.server}:${destDirectory}`;
} }
} }
...@@ -159,24 +166,26 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService { ...@@ -159,24 +166,26 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
if (this.config.worker) { if (this.config.worker) {
const privateRegistrySecretName = await this.createRegistrySecret(this.config.worker.privateRegistryAuthPath); const privateRegistrySecretName = await this.createRegistrySecret(this.config.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(this.config.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(this.config.worker.replicas,
this.config.worker.dockerImage, 'run.sh', workerPodResources, privateRegistrySecretName); this.config.worker.dockerImage,
`${envId}_run.sh`, workerPodResources, privateRegistrySecretName);
} }
if (this.config.ps !== undefined) { if (this.config.ps !== undefined) {
const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.ps.privateRegistryAuthPath); const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.ps.privateRegistryAuthPath);
replicaSpecsObj.Ps = this.generateReplicaConfig(this.config.ps.replicas, replicaSpecsObj.Ps = this.generateReplicaConfig(this.config.ps.replicas,
this.config.ps.dockerImage, 'run.sh', nonWorkerPodResources, privateRegistrySecretName); this.config.ps.dockerImage,
`${envId}_run.sh`, nonWorkerPodResources, privateRegistrySecretName);
} }
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj}); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj});
} else if (this.config.operator === 'pytorch-operator') { } else if (this.config.operator === 'pytorch-operator') {
if (this.config.worker !== undefined) { if (this.config.worker !== undefined) {
const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.worker.privateRegistryAuthPath); const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(this.config.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(this.config.worker.replicas,
this.config.worker.dockerImage, 'run.sh', workerPodResources, privateRegistrySecretName); this.config.worker.dockerImage, `${envId}_run.sh`, workerPodResources, privateRegistrySecretName);
} }
if (this.config.master !== undefined) { if (this.config.master !== undefined) {
const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.master.privateRegistryAuthPath); const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(this.config.master.privateRegistryAuthPath);
replicaSpecsObj.Master = this.generateReplicaConfig(this.config.master.replicas, replicaSpecsObj.Master = this.generateReplicaConfig(this.config.master.replicas,
this.config.master.dockerImage, 'run.sh', nonWorkerPodResources, privateRegistrySecretName); this.config.master.dockerImage, `${envId}_run.sh`, nonWorkerPodResources, privateRegistrySecretName);
} }
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj}); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj});
......
...@@ -32,6 +32,7 @@ export class KubernetesEnvironmentService extends EnvironmentService { ...@@ -32,6 +32,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
protected CONTAINER_MOUNT_PATH: string; protected CONTAINER_MOUNT_PATH: string;
protected log: Logger = getLogger('KubernetesEnvironmentService'); protected log: Logger = getLogger('KubernetesEnvironmentService');
protected environmentWorkingFolder: string; protected environmentWorkingFolder: string;
protected nfsRootDir: string;
constructor(_config: any, info: ExperimentStartupInfo) { constructor(_config: any, info: ExperimentStartupInfo) {
super(); super();
...@@ -39,6 +40,7 @@ export class KubernetesEnvironmentService extends EnvironmentService { ...@@ -39,6 +40,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
this.genericK8sClient = new GeneralK8sClient(); this.genericK8sClient = new GeneralK8sClient();
this.experimentRootDir = info.logDir; this.experimentRootDir = info.logDir;
this.environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp"); this.environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp");
this.nfsRootDir = path.join(this.experimentRootDir, "nfs-root");
this.experimentId = info.experimentId; this.experimentId = info.experimentId;
this.environmentWorkingFolder = path.join(this.CONTAINER_MOUNT_PATH, 'nni', this.experimentId); this.environmentWorkingFolder = path.join(this.CONTAINER_MOUNT_PATH, 'nni', this.experimentId);
} }
...@@ -147,11 +149,11 @@ export class KubernetesEnvironmentService extends EnvironmentService { ...@@ -147,11 +149,11 @@ export class KubernetesEnvironmentService extends EnvironmentService {
} }
protected async createNFSStorage(nfsServer: string, nfsPath: string): Promise<void> { protected async createNFSStorage(nfsServer: string, nfsPath: string): Promise<void> {
await cpp.exec(`mkdir -p ${this.environmentLocalTempFolder}`); await cpp.exec(`mkdir -p ${this.nfsRootDir}`);
try { try {
await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.environmentLocalTempFolder}`); await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.nfsRootDir}`);
} catch (error) { } catch (error) {
const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.environmentLocalTempFolder} failed, error is ${error}`; const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.nfsRootDir} failed, error is ${error}`;
this.log.error(mountError); this.log.error(mountError);
return Promise.reject(mountError); return Promise.reject(mountError);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment