Reusable environment support GPU scheduler, add test cases and refactoring. (#2627)

143c6615 · Chi Song · GitHub · 8a20c348 · 143c6615 · 143c6615
Unverified Commit 143c6615 authored Jul 30, 2020 by Chi Song Committed by GitHub Jul 30, 2020
20 changed files
--- a/src/nni_manager/common/utils.ts
+++ b/src/nni_manager/common/utils.ts
@@ -222,15 +222,16 @@ function getIPV4Address(): string {
        return cachedipv4Address;
    }

-    if (os.networkInterfaces().eth0) {
-        for (const item of os.networkInterfaces().eth0) {
+    const networkInterfaces = os.networkInterfaces();
+    if (networkInterfaces.eth0) {
+        for (const item of networkInterfaces.eth0) {
            if (item.family === 'IPv4') {
                cachedipv4Address = item.address;
                return cachedipv4Address;
            }
        }
    } else {
-        throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.');
+        throw Error(`getIPV4Address() failed because os.networkInterfaces().eth0 is undefined. Please specify NNI manager IP in config.`);
    }

    throw Error('getIPV4Address() failed because no valid IPv4 address found.')

--- a/src/nni_manager/package.json
+++ b/src/nni_manager/package.json
@@ -39,6 +39,7 @@
    "@types/express": "^4.16.0",
    "@types/glob": "^7.1.1",
    "@types/js-base64": "^2.3.1",
+    "@types/js-yaml": "^3.12.5",
    "@types/mocha": "^5.2.5",
    "@types/node": "10.12.18",
    "@types/request": "^2.47.1",

--- a/src/nni_manager/rest_server/restValidationSchemas.ts
+++ b/src/nni_manager/rest_server/restValidationSchemas.ts
@@ -107,6 +107,11 @@ export namespace ValidationSchemas {
                token: joi.string().min(1),
                host: joi.string().min(1).required(),
                reuse: joi.boolean(),
+                cpuNum: joi.number().min(1),
+                memoryMB: joi.number().min(100),
+                gpuNum: joi.number().min(1),
+                maxTrialNumPerGpu: joi.number(),
+                useActiveGpu: joi.boolean(),
            }),
            kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
                operator: joi.string().min(1).required(),

--- a/src/nni_manager/training_service/common/gpuData.ts
+++ b/src/nni_manager/training_service/common/gpuData.ts
@@ -3,6 +3,17 @@

 'use strict';

+export enum ScheduleResultType {
+    // Schedule succeeded
+    SUCCEED,
+
+    // Temporarily, not enough GPUs are available right now
+    TMP_NO_AVAILABLE_GPU,
+
+    // Cannot match requirement even if all GPUs are available
+    REQUIRE_EXCEED_TOTAL
+}
+
 /**
 * GPU Infromation class
 * Representing the dynamic and static information retrieved from Nvidia-smi
@@ -52,6 +63,19 @@ export class GPUSummary {
    }
 }

+
+export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
+    if (gpuIndices !== undefined) {
+        const indices: number[] = gpuIndices.split(',')
+            .map((x: string) => parseInt(x, 10));
+        if (indices.length > 0) {
+            return new Set(indices);
+        } else {
+            throw new Error('gpuIndices can not be empty if specified.');
+        }
+    }
+}
+
 export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
    `
 $env:METRIC_OUTPUT_DIR="{0}"

--- a/src/nni_manager/training_service/common/trialConfig.ts
+++ b/src/nni_manager/training_service/common/trialConfig.ts
@@ -17,6 +17,10 @@ export class TrialConfig {
    // Required GPU number for trial job. The number should be in [0,100]
    public readonly gpuNum: number;

+    // this flag is only used for UT now.
+    // in the future, all environments should be reusable, and this can be made configurable by the user.
+    public reuseEnvironment: boolean | undefined = true;
+
    /**
     * Constructor
     * @param command Trail command

--- a/src/nni_manager/training_service/pai/paiConfig.ts
+++ b/src/nni_manager/training_service/pai/paiConfig.ts
@@ -3,7 +3,7 @@

 'use strict';

-import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus  } from '../../common/trainingService';
+import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';

 export class PAIClusterConfig {
    public readonly userName: string;
@@ -12,6 +12,13 @@ export class PAIClusterConfig {
    public readonly token?: string;
    public readonly reuse?: boolean;

+    public cpuNum?: number;
+    public memoryMB?: number;
+    public gpuNum?: number;
+
+    public useActiveGpu?: boolean;
+    public maxTrialNumPerGpu?: number;
+
    /**
     * Constructor
     * @param userName User name of PAI Cluster
@@ -20,12 +27,16 @@ export class PAIClusterConfig {
     * @param token PAI token of PAI Cluster
     * @param reuse If job is reusable for multiple trials
     */
-    constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) {
+    constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean,
+        cpuNum?: number, memoryMB?: number, gpuNum?: number) {
        this.userName = userName;
        this.passWord = passWord;
        this.host = host;
        this.token = token;
        this.reuse = reuse;
+        this.cpuNum = cpuNum;
+        this.memoryMB = memoryMB;
+        this.gpuNum = gpuNum;
    }
 }


--- a/src/nni_manager/training_service/remote_machine/gpuScheduler.ts
+++ b/src/nni_manager/training_service/remote_machine/gpuScheduler.ts
@@ -6,10 +6,8 @@
 import * as assert from 'assert';
 import { getLogger, Logger } from '../../common/log';
 import { randomSelect } from '../../common/utils';
-import { GPUInfo } from '../common/gpuData';
-import {
-    parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager
-} from './remoteMachineData';
+import { GPUInfo, parseGpuIndices, ScheduleResultType } from '../common/gpuData';
+import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';

 type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';

@@ -39,7 +37,7 @@ export class GPUScheduler {
     * @param requiredGPUNum required GPU number
     */
    public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
-        if(requiredGPUNum === undefined) {
+        if (requiredGPUNum === undefined) {
            requiredGPUNum = 0;
        }
        assert(requiredGPUNum >= 0);
@@ -48,7 +46,7 @@ export class GPUScheduler {

        // Step 1: Check if required GPU number not exceeds the total GPU number in all machines
        const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
-                 rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
+            rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
        if (eligibleRM.length === 0) {
            // If the required gpu number exceeds the upper limit of all machine's GPU number
            // Return REQUIRE_EXCEED_TOTAL directly
@@ -75,8 +73,8 @@ export class GPUScheduler {
        this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);

        return {
-            resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
-            scheduleInfo : undefined
+            resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
+            scheduleInfo: undefined
        };
    }

@@ -159,7 +157,7 @@ export class GPUScheduler {
                            const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
                            const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
                            if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
-                               (num !== undefined && num < maxTrialNumPerGpu)) {
+                                (num !== undefined && num < maxTrialNumPerGpu)) {
                                availableGPUs.push(gpuInfo);
                            }
                        } else {
@@ -200,7 +198,7 @@ export class GPUScheduler {
    }

    private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
-                         gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
+        gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
        assert(gpuInfos.length >= requiredGPUNum);
        const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
        allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
@@ -222,10 +220,10 @@ export class GPUScheduler {
            scheduleInfo: {
                rmMeta: rmMeta,
                cudaVisibleDevice: allocatedGPUs
-                                    .map((gpuInfo: GPUInfo) => {
-                                        return gpuInfo.index;
-                                    })
-                                    .join(',')
+                    .map((gpuInfo: GPUInfo) => {
+                        return gpuInfo.index;
+                    })
+                    .join(',')
            }
        };
    }

--- a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts
@@ -4,7 +4,7 @@
 'use strict';

 import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
-import { GPUInfo, GPUSummary } from '../common/gpuData';
+import { GPUInfo, GPUSummary, ScheduleResultType } from '../common/gpuData';
 import { ShellExecutor } from './shellExecutor';

 /**
@@ -25,18 +25,6 @@ export class RemoteMachineMeta {
    public readonly useActiveGpu?: boolean = false;
 }

-export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
-    if (gpuIndices !== undefined) {
-        const indices: number[] = gpuIndices.split(',')
-            .map((x: string) => parseInt(x, 10));
-        if (indices.length > 0) {
-            return new Set(indices);
-        } else {
-            throw new Error('gpuIndices can not be empty if specified.');
-        }
-    }
-}
-
 /**
 * The execution result for command executed on remote machine
 */
@@ -168,14 +156,3 @@ export class ExecutorManager {
 export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType };

 export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string };
-
-export enum ScheduleResultType {
-    // Schedule succeeded
-    SUCCEED,
-
-    // Temporarily, no enough available GPU right now
-    TMP_NO_AVAILABLE_GPU,
-
-    // Cannot match requirement even if all GPU are a
-    REQUIRE_EXCEED_TOTAL
-}
--- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -7,6 +7,7 @@ import * as assert from 'assert';
 import { EventEmitter } from 'events';
 import * as fs from 'fs';
 import * as path from 'path';
+import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
 import { Deferred } from 'ts-deferred';
 import * as component from '../../common/component';
 import { NNIError, NNIErrorNames } from '../../common/errors';
@@ -22,18 +23,16 @@ import {
    getVersion, uniqueString
 } from '../../common/utils';
 import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
-import { GPUSummary } from '../common/gpuData';
+import { GPUSummary, ScheduleResultType } from '../common/gpuData';
 import { TrialConfig } from '../common/trialConfig';
 import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
 import { execMkdir, validateCodeDir } from '../common/util';
 import { GPUScheduler } from './gpuScheduler';
 import {
-    RemoteMachineMeta,
-    RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail,
-    ScheduleResultType, ExecutorManager
+    ExecutorManager, RemoteMachineMeta,
+    RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
 } from './remoteMachineData';
 import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
-import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';

 /**
 * Training Service implementation for Remote Machine (Linux)

--- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts
+++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts
@@ -3,7 +3,6 @@

 'use strict';

-import { EventEmitter } from 'events';
 import { delay } from "../../../common/utils";
 import { AMLEnvironmentInformation } from '../aml/amlConfig';
 import { CommandChannel, RunnerConnection } from "../commandChannel";
@@ -15,11 +14,7 @@ class AMLRunnerConnection extends RunnerConnection {
 export class AMLCommandChannel extends CommandChannel {
    private stopping: boolean = false;
    private sendQueues: [EnvironmentInformation, string][] = [];
-    private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?<metrics>.*?)'`;
-    
-    public constructor(commandEmitter: EventEmitter) {
-        super(commandEmitter);
-    }
+
    public get channelName(): Channel {
        return "aml";
    }
@@ -99,11 +94,11 @@ export class AMLCommandChannel extends CommandChannel {
                    const messages = command['trial_runner'];
                    if (messages) {
                        if (messages instanceof Object && currentMessageIndex < messages.length - 1) {
-                            for (let index = currentMessageIndex + 1; index < messages.length; index ++) {
+                            for (let index = currentMessageIndex + 1; index < messages.length; index++) {
                                this.handleCommand(runnerConnection.environment, messages[index]);
                            }
                            currentMessageIndex = messages.length - 1;
-                        } else if (currentMessageIndex === -1){
+                        } else if (currentMessageIndex === -1) {
                            this.handleCommand(runnerConnection.environment, messages);
                            currentMessageIndex += 1;
                        }

--- a/src/nni_manager/training_service/reusable/environment.ts
+++ b/src/nni_manager/training_service/reusable/environment.ts
@@ -3,10 +3,10 @@

 'use strict';

-import { GPUSummary } from "training_service/common/gpuData";
+import { EventEmitter } from "events";
 import { getLogger, Logger } from "../../common/log";
 import { TrialJobStatus } from "../../common/trainingService";
-import { EventEmitter } from "events";
+import { GPUInfo } from "../../training_service/common/gpuData";
 import { WebCommandChannel } from "./channels/webCommandChannel";
 import { CommandChannel } from "./commandChannel";

@@ -14,24 +14,50 @@ import { CommandChannel } from "./commandChannel";
 export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED';
 export type Channel = "web" | "file" | "aml" | "ut";

+
+export class TrialGpuSummary {
+    // GPU count on the machine
+    public gpuCount: number;
+    // The timestamp when GPU summary data queried
+    public timestamp: string;
+    // The array of GPU information for each GPU card
+    public gpuInfos: GPUInfo[];
+    // GPU assigned status
+    public assignedGpuIndexMap: Map<number, number> = new Map<number, number>();
+
+    constructor(gpuCount: number, timestamp: string, gpuInfos: GPUInfo[]) {
+        this.gpuCount = gpuCount;
+        this.timestamp = timestamp;
+        this.gpuInfos = gpuInfos;
+    }
+}
+
 export class EnvironmentInformation {
+    // node id is 5 chars, so won't conflict.
+    private readonly defaultNodeId = "default";
    private log: Logger;
-
-    // NNI environment ID
-    public id: string;
-    // training platform unique job ID.
-    public jobId: string;
-    // training platform job friendly name, in case it's different with job ID.
-    public jobName: string;
+    private isNoGpuWarned: boolean = false;

    // key states
-    // true: environment is ready to run trial.
-    public isIdle: boolean = false;
    // true: environment is running, waiting, or unknown.
    public isAlive: boolean = true;
+    // true: Runner is initialized, and can receive trials.
+    public isRunnerReady: boolean = false;
    // don't set status in environment directly, use setFinalState function to set a final state.
    public status: EnvironmentStatus = "UNKNOWN";

+    // how many trials are running on this environment right now.
+    public runningTrialCount: number = 0;
+    // used to count how many trials run on this environment.
+    // it can be used in many scenarios, but for now, it is used for reusable environments.
+    public assignedTrialCount: number = 0;
+
+    // NNI environment ID
+    public id: string;
+    // training platform unique job ID.
+    public envId: string;
+    // training platform job friendly name, in case it's different with job ID.
+    public name: string;
    public trackingUrl: string = "";
    public workingFolder: string = "";
    public runnerWorkingFolder: string = "";
@@ -40,41 +66,82 @@ export class EnvironmentInformation {

    // it's used to aggregate node status for multiple node trial
    public nodes: Map<string, NodeInfomation>;
-    public gpuSummary: Map<string, GPUSummary> = new Map<string, GPUSummary>();
+    public gpuSummaries: Map<string, TrialGpuSummary> = new Map<string, TrialGpuSummary>();

-    constructor(id: string, jobName: string, jobId?: string) {
+    // user can specify which GPUs can be used by NNI.
+    // it's useful for shareable environments like remote machines.
+    public usableGpus?: number[];
+    // user can specify how to use GPU resource for an environment, like local and remote.
+    public maxTrialNumberPerGpu?: number;
+    public useActiveGpu?: boolean;
+
+    constructor(id: string, name: string, envId?: string) {
        this.log = getLogger();
        this.id = id;
-        this.jobName = jobName;
-        this.jobId = jobId ? jobId : jobName;
+        this.name = name;
+        this.envId = envId ? envId : name;
        this.nodes = new Map<string, NodeInfomation>();
    }

-    public setFinalStatus(status: EnvironmentStatus): void {
-        switch (status) {
-            case 'WAITING':
-            case 'SUCCEEDED':
-            case 'FAILED':
-            case 'USER_CANCELED':
-                this.status = status;
-                break;
-            default:
-                this.log.error(`Environment: job ${this.jobId} set an invalid final state ${status}.`);
-                break;
+    public setStatus(status: EnvironmentStatus): void {
+        if (this.status !== status) {
+            this.log.info(`EnvironmentInformation: ${this.envId} change status from ${this.status} to ${status}.`)
+            this.status = status;
+        }
+    }
+
+    public setGpuSummary(nodeId: string, newGpuSummary: TrialGpuSummary): void {
+        if (nodeId === null || nodeId === undefined) {
+            nodeId = this.defaultNodeId;
+        }
+
+        const originalGpuSummary = this.gpuSummaries.get(nodeId);
+        if (undefined === originalGpuSummary) {
+            newGpuSummary.assignedGpuIndexMap = new Map<number, number>();
+            this.gpuSummaries.set(nodeId, newGpuSummary);
+        } else {
+            originalGpuSummary.gpuCount = newGpuSummary.gpuCount;
+            originalGpuSummary.timestamp = newGpuSummary.timestamp;
+            originalGpuSummary.gpuInfos = newGpuSummary.gpuInfos;
+        }
+    }
+
+    public get defaultGpuSummary(): TrialGpuSummary | undefined {
+        const gpuSummary = this.gpuSummaries.get(this.defaultNodeId);
+        if (gpuSummary === undefined) {
+            if (false === this.isNoGpuWarned) {
+                this.log.warning(`EnvironmentInformation: ${this.envId} no default gpu found. current gpu info ${JSON.stringify(this.gpuSummaries)}`);
+                this.isNoGpuWarned = true;
+            }
+        } else {
+            this.isNoGpuWarned = false;
        }
+        return gpuSummary;
    }
 }

 export abstract class EnvironmentService {

    public abstract get hasStorageService(): boolean;
-
    public abstract config(key: string, value: string): Promise<void>;
    public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void>;
    public abstract startEnvironment(environment: EnvironmentInformation): Promise<void>;
    public abstract stopEnvironment(environment: EnvironmentInformation): Promise<void>;

-    public getCommandChannel(commandEmitter: EventEmitter): CommandChannel {
+    // It depends on environment pressure and settings.
+    // for example, OpenPAI relies on API calls whose frequency is limited, so the interval needs to be bigger.
+    public get environmentMaintenceLoopInterval(): number {
+        return 5000;
+    }
+
+    // it's needed in two scenarios:
+    // 1. remote machines have a fixed number, so it can return false when all environments are assigned.
+    // 2. if there are consistent errors on requested environments, for example, authentication failure on the platform.
+    public get hasMoreEnvironments(): boolean {
+        return true;
+    }
+
+    public createCommandChannel(commandEmitter: EventEmitter): CommandChannel {
        return new WebCommandChannel(commandEmitter);
    }

@@ -101,7 +168,7 @@ export class RunnerSettings {
    public nniManagerVersion: string = "";
    public logCollection: string = "none";
    public command: string = "";
-    public enableGpuCollector: boolean = false;
+    public enableGpuCollector: boolean = true;

    // specify which communication channel is used by runner.
    // supported channel includes: rest, storage, aml

--- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
+++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
@@ -3,24 +3,20 @@

 'use strict';

+import { EventEmitter } from "events";
 import * as fs from 'fs';
 import * as path from 'path';
 import * as component from '../../../common/component';
 import { getExperimentId } from '../../../common/experimentStartupInfo';
 import { getLogger, Logger } from '../../../common/log';
+import { getExperimentRootDir } from '../../../common/utils';
 import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
-import { AMLClusterConfig, AMLTrialConfig } from '../aml/amlConfig';
-import { EnvironmentInformation, EnvironmentService } from '../environment';
-import { AMLEnvironmentInformation } from '../aml/amlConfig';
-import { AMLClient } from '../aml/amlClient';
-import {
-    NNIManagerIpConfig,
-} from '../../../common/trainingService';
 import { validateCodeDir } from '../../common/util';
-import { getExperimentRootDir } from '../../../common/utils';
+import { AMLClient } from '../aml/amlClient';
+import { AMLClusterConfig, AMLEnvironmentInformation, AMLTrialConfig } from '../aml/amlConfig';
 import { AMLCommandChannel } from '../channels/amlCommandChannel';
 import { CommandChannel } from "../commandChannel";
-import { EventEmitter } from "events";
+import { EnvironmentInformation, EnvironmentService, EnvironmentStatus } from '../environment';


 /**
@@ -28,17 +24,11 @@ import { EventEmitter } from "events";
 */
 @component.Singleton
 export class AMLEnvironmentService extends EnvironmentService {
-    
+
    private readonly log: Logger = getLogger();
    public amlClusterConfig: AMLClusterConfig | undefined;
    public amlTrialConfig: AMLTrialConfig | undefined;
-    private amlJobConfig: any;
-    private stopping: boolean = false;
-    private versionCheck: boolean = true;
-    private isMultiPhase: boolean = false;
-    private nniVersion?: string;
    private experimentId: string;
-    private nniManagerIpConfig?: NNIManagerIpConfig;
    private experimentRootDir: string;

    constructor() {
@@ -51,7 +41,7 @@ export class AMLEnvironmentService extends EnvironmentService {
        return false;
    }

-    public getCommandChannel(commandEmitter: EventEmitter): CommandChannel {
+    public createCommandChannel(commandEmitter: EventEmitter): CommandChannel {
        return new AMLCommandChannel(commandEmitter);
    }

@@ -83,29 +73,31 @@ export class AMLEnvironmentService extends EnvironmentService {
    public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
        environments.forEach(async (environment) => {
            const amlClient = (environment as AMLEnvironmentInformation).amlClient;
-                    if (!amlClient) {
-            throw new Error('AML client not initialized!');
+            if (!amlClient) {
+                throw new Error('AML client not initialized!');
            }
-            const status = await amlClient.updateStatus(environment.status);
-            switch (status.toUpperCase()) {
+            const newStatus = await amlClient.updateStatus(environment.status);
+            switch (newStatus.toUpperCase()) {
                case 'WAITING':
-                case 'RUNNING':
                case 'QUEUED':
-                    // RUNNING status is set by runner, and ignore waiting status
+                    environment.setStatus('WAITING');
+                    break;
+                case 'RUNNING':
+                    environment.setStatus('RUNNING');
                    break;
                case 'COMPLETED':
                case 'SUCCEEDED':
-                    environment.setFinalStatus('SUCCEEDED');
+                    environment.setStatus('SUCCEEDED');
                    break;
                case 'FAILED':
-                    environment.setFinalStatus('FAILED');
+                    environment.setStatus(newStatus.toUpperCase() as EnvironmentStatus);
                    break;
                case 'STOPPED':
                case 'STOPPING':
-                    environment.setFinalStatus('USER_CANCELED');
+                    environment.setStatus('USER_CANCELED');
                    break;
                default:
-                    environment.setFinalStatus('UNKNOWN');
+                    environment.setStatus('UNKNOWN');
            }
        });
    }
@@ -120,7 +112,7 @@ export class AMLEnvironmentService extends EnvironmentService {
        const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation;
        const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp");
        environment.command = `import os\nos.system('${amlEnvironment.command}')`;
-        await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command ,{ encoding: 'utf8' });
+        await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' });
        const amlClient = new AMLClient(
            this.amlClusterConfig.subscriptionId,
            this.amlClusterConfig.resourceGroup,

--- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
+++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
@@ -4,6 +4,7 @@
 'use strict';

 import * as fs from 'fs';
+import * as yaml from 'js-yaml';
 import * as request from 'request';
 import { Deferred } from 'ts-deferred';
 import * as component from '../../../common/component';
@@ -15,7 +16,6 @@ import { NNIPAIK8STrialConfig } from '../../pai/paiK8S/paiK8SConfig';
 import { EnvironmentInformation, EnvironmentService } from '../environment';
 import { StorageService } from '../storageService';

-const yaml = require('js-yaml');

 /**
 * Collector PAI jobs info from PAI cluster, and update pai job status locally
@@ -40,6 +40,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
        this.experimentId = getExperimentId();
    }

+    public get environmentMaintenceLoopInterval(): number {
+        return 5000;
+    }
+
    public get hasStorageService(): boolean {
        return true;
    }
@@ -72,6 +76,16 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
                if (this.paiTrialConfig.paiConfigPath) {
                    this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8'));
                }
+
+                if (this.paiClusterConfig.gpuNum === undefined) {
+                    this.paiClusterConfig.gpuNum = this.paiTrialConfig.gpuNum;
+                }
+                if (this.paiClusterConfig.cpuNum === undefined) {
+                    this.paiClusterConfig.cpuNum = this.paiTrialConfig.cpuNum;
+                }
+                if (this.paiClusterConfig.memoryMB === undefined) {
+                    this.paiClusterConfig.memoryMB = this.paiTrialConfig.memoryMB;
+                }
                break;
            }
            default:
@@ -111,37 +125,35 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
                });

                environments.forEach((environment) => {
-                    if (jobInfos.has(environment.jobId)) {
-                        const jobResponse = jobInfos.get(environment.jobId);
+                    if (jobInfos.has(environment.envId)) {
+                        const jobResponse = jobInfos.get(environment.envId);
                        if (jobResponse && jobResponse.state) {
                            const oldEnvironmentStatus = environment.status;
                            switch (jobResponse.state) {
                                case 'RUNNING':
                                case 'WAITING':
-                                    // RUNNING status is set by runner, and ignore waiting status
-                                    break;
                                case 'SUCCEEDED':
                                case 'FAILED':
-                                    environment.setFinalStatus(jobResponse.state);
+                                    environment.setStatus(jobResponse.state);
                                    break;
                                case 'STOPPED':
                                case 'STOPPING':
-                                    environment.setFinalStatus('USER_CANCELED');
+                                    environment.setStatus('USER_CANCELED');
                                    break;
                                default:
-                                    this.log.error(`OpenPAI: job ${environment.jobId} returns unknown state ${jobResponse.state}.`);
-                                    environment.setFinalStatus('UNKNOWN');
+                                    this.log.error(`OpenPAI: job ${environment.envId} returns unknown state ${jobResponse.state}.`);
+                                    environment.setStatus('UNKNOWN');
                            }
                            if (oldEnvironmentStatus !== environment.status) {
-                                this.log.debug(`OpenPAI: job ${environment.jobId} change status ${oldEnvironmentStatus} to ${environment.status} due to job is ${jobResponse.state}.`)
+                                this.log.debug(`OpenPAI: job ${environment.envId} change status ${oldEnvironmentStatus} to ${environment.status} due to job is ${jobResponse.state}.`)
                            }
                        } else {
-                            this.log.error(`OpenPAI: job ${environment.jobId} has no state returned. body:${JSON.stringify(jobResponse)}`);
+                            this.log.error(`OpenPAI: job ${environment.envId} has no state returned. body:${JSON.stringify(jobResponse)}`);
                            // some error happens, and mark this environment
                            environment.status = 'FAILED';
                        }
                    } else {
-                        this.log.error(`OpenPAI job ${environment.jobId} is not found in job list.`);
+                        this.log.error(`OpenPAI job ${environment.envId} is not found in job list.`);
                        environment.status = 'UNKNOWN';
                    }
                });
@@ -169,8 +181,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
        // Step 1. Prepare PAI job configuration
        const environmentRoot = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}`;
        environment.runnerWorkingFolder = `${environmentRoot}/envs/${environment.id}`;
-        environment.command = `cd ${environmentRoot} && ${environment.command}`
-        environment.trackingUrl = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${environment.jobId}`
+        environment.command = `cd ${environmentRoot} && ${environment.command}`;
+        environment.trackingUrl = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${environment.envId}`;
+        environment.useActiveGpu = this.paiClusterConfig.useActiveGpu;
+        environment.maxTrialNumberPerGpu = this.paiClusterConfig.maxTrialNumPerGpu;

        // Step 2. Generate Job Configuration in yaml format
        const paiJobConfig = this.generateJobConfigInYamlFormat(environment);
@@ -189,7 +203,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
        request(submitJobRequest, (error, response, body) => {
            if ((error !== undefined && error !== null) || response.statusCode >= 400) {
                const errorMessage: string = (error !== undefined && error !== null) ? error.message :
-                    `start environment ${environment.jobId} failed, http code:${response.statusCode}, http body: ${body}`;
+                    `start environment ${environment.envId} failed, http code:${response.statusCode}, http body: ${body}`;

                this.log.error(errorMessage);
                environment.status = 'FAILED';
@@ -211,7 +225,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
        }

        const stopJobRequest: request.Options = {
-            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs/${this.paiClusterConfig.userName}~${environment.jobId}/executionType`,
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs/${this.paiClusterConfig.userName}~${environment.envId}/executionType`,
            method: 'PUT',
            json: true,
            body: { value: 'STOP' },
@@ -222,17 +236,17 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
            }
        };

-        this.log.debug(`stopping OpenPAI environment ${environment.jobId}, ${stopJobRequest.uri}`);
+        this.log.debug(`stopping OpenPAI environment ${environment.envId}, ${stopJobRequest.uri}`);

        try {
            request(stopJobRequest, (error, response, _body) => {
                try {
                    if ((error !== undefined && error !== null) || (response && response.statusCode >= 400)) {
-                        this.log.error(`OpenPAI: stop job ${environment.jobId} failed with ${response.statusCode}\n${error}`);
+                        this.log.error(`OpenPAI: stop job ${environment.envId} failed with ${response.statusCode}\n${error}`);
                        deferred.reject((error !== undefined && error !== null) ? error :
                            `Stop trial failed, http code: ${response.statusCode}`);
                    } else {
-                        this.log.info(`OpenPAI job ${environment.jobId} stopped.`);
+                        this.log.info(`OpenPAI job ${environment.envId} stopped.`);
                    }
                    deferred.resolve();
                } catch (error) {
@@ -265,7 +279,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
        if (this.paiTrialConfig === undefined) {
            throw new Error('trial config is not initialized');
        }
-        const jobName = environment.jobId;
+        const jobName = environment.envId;

        let nniJobConfig: any = undefined;
        if (this.paiTrialConfig.paiConfigPath) {
@@ -284,7 +298,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
                    environment.nodeCount += instanceCount;
                }

-
                // Each taskRole will generate new command in NNI's command format
                // Each command will be formatted to NNI style
                for (const taskRoleName in nniJobConfig.taskRoles) {
@@ -298,6 +311,19 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
            }

        } else {
+            if (this.paiClusterConfig === undefined) {
+                throw new Error('PAI Cluster config is not initialized');
+            }
+            if (this.paiClusterConfig.gpuNum === undefined) {
+                throw new Error('PAI Cluster gpuNum is not initialized');
+            }
+            if (this.paiClusterConfig.cpuNum === undefined) {
+                throw new Error('PAI Cluster cpuNum is not initialized');
+            }
+            if (this.paiClusterConfig.memoryMB === undefined) {
+                throw new Error('PAI Cluster memoryMB is not initialized');
+            }
+
            nniJobConfig = {
                protocolVersion: 2,
                name: jobName,
@@ -320,9 +346,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
                        taskRetryCount: 0,
                        dockerImage: 'docker_image_0',
                        resourcePerInstance: {
-                            gpu: this.paiTrialConfig.gpuNum,
-                            cpu: this.paiTrialConfig.cpuNum,
-                            memoryMB: this.paiTrialConfig.memoryMB
+                            gpu: this.paiClusterConfig.gpuNum,
+                            cpu: this.paiClusterConfig.cpuNum,
+                            memoryMB: this.paiClusterConfig.memoryMB
                        },
                        commands: [
                            environment.command

--- a/src/nni_manager/training_service/reusable/gpuScheduler.ts
+++ b/src/nni_manager/training_service/reusable/gpuScheduler.ts
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+'use strict';
+
+import * as assert from 'assert';
+import { getLogger, Logger } from '../../common/log';
+import { randomSelect } from '../../common/utils';
+import { GPUInfo, ScheduleResultType } from '../common/gpuData';
+import { EnvironmentInformation } from './environment';
+import { TrialDetail } from './trial';
+
+type SCHEDULE_POLICY_NAME = 'random' | 'round-robin'; // machine selection policies understood by GpuScheduler.selectMachine
+
+export class GpuSchedulerSetting { // scheduler-wide defaults; GpuScheduler falls back to them when an environment does not provide its own values
+    public useActiveGpu: boolean = false; // when true, a GPU that already has active processes may still be selected
+    public maxTrialNumberPerGpu: number = 1; // upper bound on the number of trials assigned to one GPU
+}
+
+export type GpuScheduleResult = { // outcome returned by GpuScheduler.scheduleMachine
+    resultType: ScheduleResultType; // this file produces SUCCEED, TMP_NO_AVAILABLE_GPU and REQUIRE_EXCEED_TOTAL
+    environment: EnvironmentInformation | undefined; // chosen environment; undefined when scheduling did not succeed
+    gpuIndices: GPUInfo[] | undefined; // GPUs reserved for the trial (GPUInfo objects, despite the name); undefined when scheduling did not succeed
+};
+
+/**
+ * A simple GPU scheduler implementation.
+ *
+ * It picks an environment for a trial (round-robin by default) and reserves GPUs on it.
+ * Reservations are tracked per environment in `defaultGpuSummary.assignedGpuIndexMap`,
+ * which maps a GPU index to the number of trials currently assigned to that GPU.
+ */
+export class GpuScheduler {
+
+    private readonly log: Logger = getLogger();
+    private readonly policyName: SCHEDULE_POLICY_NAME = 'round-robin';
+    private defaultSetting: GpuSchedulerSetting;
+    private roundRobinIndex: number = 0;
+
+    /**
+     * Constructor
+     * @param gpuSchedulerSetting fallback settings used when an environment does not carry its own values;
+     *                            a default GpuSchedulerSetting is created when omitted
+     */
+    constructor(gpuSchedulerSetting: GpuSchedulerSetting | undefined = undefined) {
+        if (undefined === gpuSchedulerSetting) {
+            gpuSchedulerSetting = new GpuSchedulerSetting();
+        }
+        this.defaultSetting = gpuSchedulerSetting;
+    }
+
+    /**
+     * Replace the fallback settings.
+     * @param gpuSchedulerSetting new fallback settings
+     */
+    public setSettings(gpuSchedulerSetting: GpuSchedulerSetting): void {
+        this.defaultSetting = gpuSchedulerSetting;
+    }
+
+    /**
+     * Schedule a machine according to the constraints (requiredGPUNum)
+     * @param environments candidate environments
+     * @param requiredGPUNum required GPU number; undefined is treated as 0
+     * @param trialDetail trial to schedule; on success its assignedGpus is set to the reserved GPUs
+     * @returns SUCCEED with the chosen environment and GPUs, REQUIRE_EXCEED_TOTAL when no environment
+     *          has enough GPUs in total, or TMP_NO_AVAILABLE_GPU when none has enough free GPUs right now
+     */
+    public scheduleMachine(environments: EnvironmentInformation[], requiredGPUNum: number | undefined, trialDetail: TrialDetail): GpuScheduleResult {
+        const gpuNum: number = requiredGPUNum === undefined ? 0 : requiredGPUNum;
+        assert(gpuNum >= 0);
+        // Step 1: Check that the required GPU number does not exceed the total GPU number of every machine.
+        // Environments without a GPU summary are not ruled out here.
+        const eligibleEnvironments: EnvironmentInformation[] = environments.filter((environment: EnvironmentInformation) =>
+            environment.defaultGpuSummary === undefined || gpuNum === 0 || environment.defaultGpuSummary.gpuCount >= gpuNum);
+        if (eligibleEnvironments.length === 0) {
+            // If the required gpu number exceeds the upper limit of all machine's GPU number
+            // Return REQUIRE_EXCEED_TOTAL directly
+            return ({
+                resultType: ScheduleResultType.REQUIRE_EXCEED_TOTAL,
+                gpuIndices: undefined,
+                environment: undefined,
+            });
+        }
+
+        // Step 2: Allocate Host/GPU for specified trial job
+        // Currently the requiredGPUNum parameter for all trial jobs is identical.
+        if (gpuNum > 0) {
+            // Trial job requires GPU
+            const result: GpuScheduleResult | undefined = this.scheduleGPUHost(environments, gpuNum, trialDetail);
+            if (result !== undefined) {
+                return result;
+            }
+        } else {
+            // Trial job does not need GPU
+            const allocatedRm: EnvironmentInformation = this.selectMachine(environments, environments);
+
+            return this.allocateHost(gpuNum, allocatedRm, [], trialDetail);
+        }
+
+        return {
+            resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
+            gpuIndices: undefined,
+            environment: undefined,
+        };
+    }
+
+    /**
+     * Remove the trial's GPU reservation: decrease the per-GPU trial count of every GPU
+     * assigned to the trial, and drop the entry when the count reaches zero.
+     * @param trial trial whose assigned GPUs are released
+     */
+    public removeGpuReservation(trial: TrialDetail): void {
+        if (trial.environment !== undefined &&
+            trial.environment.defaultGpuSummary !== undefined &&
+            trial.assignedGpus !== undefined &&
+            trial.assignedGpus.length > 0) {
+            const defaultGpuSummary = trial.environment.defaultGpuSummary;
+            for (const gpuInfo of trial.assignedGpus) {
+                const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
+                if (num !== undefined) {
+                    if (num === 1) {
+                        defaultGpuSummary.assignedGpuIndexMap.delete(gpuInfo.index);
+                    } else {
+                        defaultGpuSummary.assignedGpuIndexMap.set(gpuInfo.index, num - 1);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Try to place a trial that needs GPUs.
+     * @returns the allocation result, or undefined when no environment currently has enough available GPUs
+     */
+    private scheduleGPUHost(environments: EnvironmentInformation[], requiredGPUNumber: number, trial: TrialDetail): GpuScheduleResult | undefined {
+        const totalResourceMap: Map<EnvironmentInformation, GPUInfo[]> = this.gpuResourceDetection(environments);
+        const qualifiedEnvironments: EnvironmentInformation[] = [];
+        totalResourceMap.forEach((gpuInfos: GPUInfo[], environment: EnvironmentInformation) => {
+            if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNumber) {
+                qualifiedEnvironments.push(environment);
+            }
+        });
+        if (qualifiedEnvironments.length > 0) {
+            const allocatedEnvironment: EnvironmentInformation = this.selectMachine(qualifiedEnvironments, environments);
+            const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedEnvironment);
+            if (gpuInfos !== undefined) { // should always be true
+                return this.allocateHost(requiredGPUNumber, allocatedEnvironment, gpuInfos, trial);
+            } else {
+                assert(false, 'gpuInfos is undefined');
+            }
+        }
+
+        return undefined;
+    }
+
+    /**
+     * Detect available GPU resource for each environment that has a GPU summary.
+     * @returns Available GPUs on environments; environments without a GPU summary are not in the map
+     * @throws Error when a designated GPU index is out of range, or the assigned GPU map is missing
+     */
+    private gpuResourceDetection(environments: EnvironmentInformation[]): Map<EnvironmentInformation, GPUInfo[]> {
+        const totalResourceMap: Map<EnvironmentInformation, GPUInfo[]> = new Map<EnvironmentInformation, GPUInfo[]>();
+        environments.forEach((environment: EnvironmentInformation) => {
+            const defaultGpuSummary = environment.defaultGpuSummary;
+            if (defaultGpuSummary === undefined) {
+                return;
+            }
+
+            const availableGPUs: GPUInfo[] = [];
+            // An empty set means that every GPU of the environment may be used.
+            const designatedGpuIndices: Set<number> = new Set<number>(environment.usableGpus);
+            for (const gpuIndex of designatedGpuIndices) {
+                if (gpuIndex >= defaultGpuSummary.gpuCount) {
+                    throw new Error(`Specified GPU index not found: ${gpuIndex}`);
+                }
+            }
+
+            // Per-environment values win over the scheduler defaults.
+            const maxTrialNumberPerGpu: number = environment.maxTrialNumberPerGpu ? environment.maxTrialNumberPerGpu : this.defaultSetting.maxTrialNumberPerGpu;
+            // Compare against undefined so that an explicit `false` on the environment is honored
+            // even when the scheduler default is `true`.
+            const useActiveGpu: boolean = environment.useActiveGpu !== undefined ? environment.useActiveGpu : this.defaultSetting.useActiveGpu;
+
+            if (undefined !== defaultGpuSummary.gpuInfos) {
+                defaultGpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => {
+                    // if the GPU has active process, OR be reserved by a job,
+                    // or index not in gpuIndices configuration in machineList,
+                    // or trial number on a GPU reach max number,
+                    // We should NOT allocate this GPU
+                    // if users set useActiveGpu, use the gpu whether there is another activeProcess
+                    if (designatedGpuIndices.size === 0 || designatedGpuIndices.has(gpuInfo.index)) {
+                        if (defaultGpuSummary.assignedGpuIndexMap !== undefined) {
+                            const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
+                            if ((num === undefined && (!useActiveGpu && gpuInfo.activeProcessNum === 0 || useActiveGpu)) ||
+                                (num !== undefined && num < maxTrialNumberPerGpu)) {
+                                availableGPUs.push(gpuInfo);
+                            }
+                        } else {
+                            throw new Error(`occupiedGpuIndexMap is undefined!`);
+                        }
+                    }
+                });
+            }
+            totalResourceMap.set(environment, availableGPUs);
+        });
+
+        return totalResourceMap;
+    }
+
+    /**
+     * Select one of the qualified environments according to the schedule policy.
+     * @param qualifiedEnvironments non-empty list of environments that can host the trial
+     * @param allEnvironments all environments; round-robin walks this list
+     */
+    private selectMachine(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
+        assert(qualifiedEnvironments !== undefined && qualifiedEnvironments.length > 0);
+
+        if (this.policyName === 'random') {
+            return randomSelect(qualifiedEnvironments);
+        } else if (this.policyName === 'round-robin') {
+            return this.roundRobinSelect(qualifiedEnvironments, allEnvironments);
+        } else {
+            throw new Error(`Unsupported schedule policy: ${this.policyName}`);
+        }
+    }
+
+    /**
+     * Walk allEnvironments starting at the round-robin cursor and return the first qualified one.
+     * The cursor is left just after the returned environment.
+     * @throws Error when none of allEnvironments is qualified (instead of looping forever)
+     */
+    private roundRobinSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
+        // One full lap is enough: if no qualified environment was seen, there is none.
+        for (let attempt = 0; attempt < allEnvironments.length; attempt++) {
+            const candidate: EnvironmentInformation = allEnvironments[this.roundRobinIndex++ % allEnvironments.length];
+            if (qualifiedEnvironments.includes(candidate)) {
+                return candidate;
+            }
+        }
+
+        throw new Error('roundRobinSelect: no qualified environment found in all environments');
+    }
+
+    /**
+     * Pick the GPUs for a trial from the available ones.
+     */
+    private selectGPUsForTrial(gpuInfos: GPUInfo[], requiredGPUNum: number): GPUInfo[] {
+        // Sequentially allocate GPUs
+        return gpuInfos.slice(0, requiredGPUNum);
+    }
+
+    /**
+     * Reserve GPUs on the environment for the trial and build the SUCCEED result.
+     * @param requiredGPUNum number of GPUs to reserve
+     * @param environment environment chosen for the trial
+     * @param gpuInfos available GPUs on the environment; must contain at least requiredGPUNum entries
+     * @param trialDetails trial that receives the reserved GPUs in assignedGpus
+     * @throws Error when the environment has no defaultGpuSummary
+     */
+    private allocateHost(requiredGPUNum: number, environment: EnvironmentInformation,
+        gpuInfos: GPUInfo[], trialDetails: TrialDetail): GpuScheduleResult {
+        assert(gpuInfos.length >= requiredGPUNum);
+        const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
+        const defaultGpuSummary = environment.defaultGpuSummary;
+        if (undefined === defaultGpuSummary) {
+            throw new Error(`Environment ${environment.id} defaultGpuSummary shouldn't be undefined!`);
+        }
+
+        // Increase the per-GPU trial count of every reserved GPU.
+        allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
+            let num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
+            if (num === undefined) {
+                num = 0;
+            }
+            defaultGpuSummary.assignedGpuIndexMap.set(gpuInfo.index, num + 1);
+        });
+        trialDetails.assignedGpus = allocatedGPUs;
+
+        return {
+            resultType: ScheduleResultType.SUCCEED,
+            environment: environment,
+            gpuIndices: allocatedGPUs,
+        };
+    }
+}
--- a/src/nni_manager/training_service/reusable/storageService.ts
+++ b/src/nni_manager/training_service/reusable/storageService.ts
@@ -83,7 +83,7 @@ export abstract class StorageService {
        localPath = this.expandPath(false, localPath);
        remotePath = this.expandPath(true, remotePath);
        this.logger.debug(`copy remotePath: ${remotePath} to localPath: ${localPath}`);
-        return await this.internalCopy(localPath, remotePath, true, true, false);
+        return await this.internalCopy(remotePath, localPath, true, true, false);
    }

    public async removeDirectory(remotePath: string, isRecursive: boolean): Promise<void> {
@@ -151,7 +151,7 @@ export abstract class StorageService {
        localPath = this.expandPath(false, localPath);
        remotePath = this.expandPath(true, remotePath);
        this.logger.debug(`copy file remotePath: ${remotePath} to localPath: ${localPath}`);
-        await this.internalCopy(localPath, remotePath, false, true, false);
+        await this.internalCopy(remotePath, localPath, false, true, false);
    }

    public async removeFile(remotePath: string): Promise<void> {

--- a/src/nni_manager/training_service/reusable/storages/mountedStorageService.ts
+++ b/src/nni_manager/training_service/reusable/storages/mountedStorageService.ts
@@ -17,12 +17,12 @@ export class MountedStorageService extends StorageService {
            if (isRecursive) {
                const children = await fs.promises.readdir(path);
                for (const file of children) {
-                    const stat = await fs.promises.lstat(file);
-                    this.internalRemove(file, stat.isDirectory(), isRecursive);
+                    const filePath = this.internalJoin(path, file);
+                    const stat = await fs.promises.lstat(filePath);
+                    await this.internalRemove(filePath, stat.isDirectory(), isRecursive);
                }
-            } else {
-                await fs.promises.rmdir(path);
            }
+            await fs.promises.rmdir(path);
        } else {
            await fs.promises.unlink(path);
        }
@@ -98,7 +98,7 @@ export class MountedStorageService extends StorageService {
            {
                encoding: "utf8",
                start: current,
-                end: readLength + current,
+                end: readLength + current - 1,
            }).on("data", (data) => {
                result += data;
            }).on("end", () => {

--- a/src/nni_manager/training_service/reusable/test/mountedStorageService.test.ts
+++ b/src/nni_manager/training_service/reusable/test/mountedStorageService.test.ts
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+'use strict';
+
+import * as chai from 'chai';
+import * as fs from 'fs';
+import * as path from 'path';
+import { getLogger, Logger } from "../../../common/log";
+import { cleanupUnitTest, prepareUnitTest } from '../../../common/utils';
+import { MountedStorageService } from "../storages/mountedStorageService";
+import chaiAsPromised = require("chai-as-promised");
+
+
+/**
+ * Test helper that deletes a file or a directory.
+ * @param removedPath path to delete
+ * @param isDirectory true to remove removedPath as a directory, false to unlink it as a file
+ * @param isRecursive for a directory, delete its content first
+ */
+async function remove(removedPath: string, isDirectory: boolean, isRecursive: boolean): Promise<void> {
+    if (!isDirectory) {
+        await fs.promises.unlink(removedPath);
+        return;
+    }
+
+    if (isRecursive) {
+        const entryNames = await fs.promises.readdir(removedPath);
+        for (const entryName of entryNames) {
+            const entryPath = path.join(removedPath, entryName);
+            const entryStat = await fs.promises.lstat(entryPath);
+            await remove(entryPath, entryStat.isDirectory(), isRecursive);
+        }
+    }
+    // rmdir only works on an empty directory, so it runs after the content is gone.
+    await fs.promises.rmdir(removedPath);
+}
+
+// Exercises MountedStorageService against two temporary folders created next to this test file:
+// "reusableut/local" acts as the local root and "reusableut/mounted" as the mounted (remote) root.
+describe('Unit Test for MountedStorageService', () => {
+
+    let service: MountedStorageService;
+    let log: Logger;
+    let localPath = "reusableut/local";
+    let mountedPath = "reusableut/mounted";
+
+    const testPath = "testpath";
+    const testFileName = "testfile.txt";
+    let localCopiedPath: string;
+    let localFileName: string;
+    let mountedFileName: string;
+
+    before(() => {
+        chai.should();
+        chai.use(chaiAsPromised);
+        prepareUnitTest();
+        log = getLogger();
+
+        const testRoot = path.dirname(__filename);
+        localPath = path.join(testRoot, localPath);
+        mountedPath = path.join(testRoot, mountedPath);
+        service = new MountedStorageService();
+        service.initialize(localPath, mountedPath);
+
+        localCopiedPath = path.join(localPath, testPath);
+        localFileName = path.join(localCopiedPath, testFileName);
+        // relative path, it is passed to the service as a remote path
+        mountedFileName = path.join(testPath, testFileName);
+    });
+
+    after(() => {
+        cleanupUnitTest();
+    });
+
+    beforeEach(async () => {
+        if (!fs.existsSync(localPath)) {
+            await fs.promises.mkdir(localPath, { recursive: true });
+        }
+        if (!fs.existsSync(mountedPath)) {
+            await fs.promises.mkdir(mountedPath, { recursive: true });
+        }
+        log.info(`localFileName: ${localFileName}`);
+
+        await fs.promises.mkdir(localCopiedPath, { recursive: true });
+        await fs.promises.writeFile(localFileName, "hello world");
+    });
+
+    afterEach(async () => {
+        // remove both local and mounted folders, so each test starts from a clean state
+        const testRootPath = path.normalize(`${localPath}/../../reusableut`);
+        await remove(testRootPath, true, true);
+    });
+
+    it('copyAndRename', async () => {
+        await service.copyDirectory(localCopiedPath, ".");
+        // chai.expect(...) without a matcher asserts nothing, so assert explicitly.
+        chai.assert.isTrue(fs.existsSync(mountedPath));
+
+        const newName = `${testFileName}new`;
+        await service.rename(mountedFileName, newName);
+        // NOTE(review): testPath is relative to the working directory here, not to the mounted root — confirm the intent.
+        chai.assert.isFalse(fs.existsSync(testPath));
+        const newTestPath = `${mountedFileName}new`;
+        chai.assert.isTrue(await service.exists(newTestPath));
+
+        await service.copyFileBack(newTestPath, ".");
+        const localNewFileName = `${localPath}/${newName}`;
+        chai.assert.isTrue(fs.existsSync(localNewFileName));
+
+        fs.unlinkSync(`${localFileName}`);
+        fs.rmdirSync(`${localPath}/${testPath}`);
+        await service.copyDirectoryBack(`${mountedPath}/${testPath}`, `.`);
+        const localNewName = `${localFileName}new`;
+        chai.assert.isTrue(fs.existsSync(localNewName));
+    });
+
+    it('FileContentTest', async () => {
+        const savedFileName = "savedfile.txt";
+        await service.save("01234", savedFileName);
+        // savedFileName is a remote path, so check it through the service instead of the working directory.
+        chai.assert.isTrue(await service.exists(savedFileName));
+
+        let content = await service.readFileContent(savedFileName, 0, -1);
+        chai.assert.equal(content, "01234");
+
+        await service.save("56789", savedFileName, true);
+        content = await service.readFileContent(savedFileName, 0, -1);
+        chai.assert.equal(content, "0123456789");
+
+        content = await service.readFileContent(savedFileName, -1, 1);
+        chai.assert.equal(content, "0");
+
+        content = await service.readFileContent(savedFileName, 5, 1);
+        chai.assert.equal(content, "5");
+
+        content = await service.readFileContent(savedFileName, 5, -1);
+        chai.assert.equal(content, "56789");
+    });
+});
--- a/src/nni_manager/training_service/reusable/test/trialDispatcher.test.ts
+++ b/src/nni_manager/training_service/reusable/test/trialDispatcher.test.ts
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+import * as chai from 'chai';
+import * as path from 'path';
+import { Scope } from "typescript-ioc";
+import * as component from '../../../common/component';
+import { getLogger, Logger } from "../../../common/log";
+import { TrialJobApplicationForm, TrialJobStatus } from '../../../common/trainingService';
+import { cleanupUnitTest, delay, prepareUnitTest, uniqueString } from '../../../common/utils';
+import { INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, SEND_TRIAL_JOB_PARAMETER, TRIAL_END, GPU_INFO } from '../../../core/commands';
+import { TrialConfigMetadataKey } from '../../../training_service/common/trialConfigMetadataKey';
+import { Command } from '../commandChannel';
+import { EnvironmentInformation, EnvironmentService } from "../environment";
+import { TrialDetail } from '../trial';
+import { TrialDispatcher } from "../trialDispatcher";
+import { UtCommandChannel } from './utCommandChannel';
+import { UtEnvironmentService } from "./utEnvironmentService";
+import chaiAsPromised = require("chai-as-promised");
+import { promises } from 'fs';
+import { Deferred } from 'ts-deferred';
+import { NNIErrorNames, NNIError, MethodNotImplementedError } from '../../../common/errors';
+
+/**
+ * Build a trial application form for the tests.
+ * @param content hyper parameter payload; `{ "test": 1 }` is used when omitted
+ * @returns form with sequenceId 0 whose hyper parameter value is the JSON text of the payload
+ */
+function createTrialForm(content: any = undefined): TrialJobApplicationForm {
+    const payload = content === undefined ? { "test": 1 } : content;
+    const form = {
+        sequenceId: 0,
+        hyperParameters: {
+            value: JSON.stringify(payload),
+            index: 0
+        }
+    };
+
+    return form;
+}
+
+/**
+ * Poll callback until it yields a value other than undefined, or the time budget is used up.
+ * @param callback produces the result, or undefined when it is not available yet
+ * @param waitMs time budget; it is decreased by interval after every unsuccessful poll
+ * @param interval delay between two polls
+ * @param throwError throw on timeout instead of returning undefined
+ * @returns the first defined result, or undefined on timeout when throwError is false
+ */
+async function waitResult<TResult>(callback: () => Promise<TResult | undefined>, waitMs: number = 1000, interval: number = 1, throwError: boolean = false): Promise<TResult | undefined> {
+    for (let remainingMs = waitMs; remainingMs > 0; remainingMs -= interval) {
+        const outcome = await callback();
+        if (outcome !== undefined) {
+            return outcome;
+        }
+        await delay(interval);
+    }
+
+    if (!throwError) {
+        return undefined;
+    }
+
+    throw new Error(`wait result timeout!\n${callback.toString()}`);
+}
+
+/**
+ * Same as waitResult, but a timeout always ends in an error, so the result is never undefined.
+ */
+async function waitResultMust<TResult>(callback: () => Promise<TResult | undefined>, waitMs: number = 1000, interval: number = 1): Promise<TResult> {
+    const outcome = await waitResult(callback, waitMs, interval, true);
+    if (outcome !== undefined) {
+        return outcome;
+    }
+
+    // waitResult throws on timeout already; this throw narrows the type for the compiler.
+    throw new Error(`wait result timeout!`);
+}
+
+/**
+ * Submit a trial built from the default form to the dispatcher.
+ * @returns the trial detail returned by submitTrialJob
+ */
+async function newTrial(trialDispatcher: TrialDispatcher): Promise<TrialDetail> {
+    const form = createTrialForm();
+
+    return await trialDispatcher.submitTrialJob(form);
+}
+
+/**
+ * Build the payload of a GPU_INFO command for the tests.
+ * @param gpuCount number of GPUs to report; every GPU has no active process
+ * @param nodeId id of the reporting node, undefined for a single node environment
+ * @returns object with gpuInfos (index 0..gpuCount-1), gpuCount and node
+ */
+function newGpuInfo(gpuCount: number = 2, nodeId: string | undefined = undefined): any {
+    const gpuInfos: { index: number; activeProcessNum: number }[] = [];
+    for (let index = 0; index < gpuCount; index++) {
+        gpuInfos.push({
+            index: index,
+            activeProcessNum: 0,
+        });
+    }
+
+    return {
+        gpuInfos: gpuInfos,
+        gpuCount: gpuInfos.length,
+        node: nodeId
+    };
+}
+
+/**
+ * Wait for the next command sent by the trial dispatcher and verify that it is a
+ * NEW_TRIAL_JOB command for the given trial.
+ * @returns the received command
+ */
+async function verifyTrialRunning(commandChannel: UtCommandChannel, trialDetail: TrialDetail): Promise<Command> {
+    const receivedCommand = await waitResultMust<Command>(
+        async () => await commandChannel.testReceiveCommandFromTrialDispatcher());
+
+    chai.assert.equal(receivedCommand.command, NEW_TRIAL_JOB, "verifyTrialRunning command type");
+    chai.assert.equal(receivedCommand.data["trialId"], trialDetail.id, "verifyTrialRunning trialDetail.id should be equal.");
+
+    return receivedCommand;
+}
+
+/**
+ * Report the end of a trial to the dispatcher and verify the final trial status.
+ * @param commandChannel channel used to send TRIAL_END to the dispatcher
+ * @param trialDetail trial to finish; it must have an environment
+ * @param returnCode exit code to report; 0 expects SUCCEEDED, anything else expects FAILED
+ */
+async function verifyTrialResult(commandChannel: UtCommandChannel, trialDetail: TrialDetail, returnCode: number = 0): Promise<void> {
+    const trialResult = {
+        trial: trialDetail.id,
+        code: returnCode,
+        timestamp: Date.now(),
+    };
+    const environment = trialDetail.environment;
+    if (environment === undefined) {
+        throw new Error(`environment shouldn't be undefined.`)
+    }
+
+    await commandChannel.testSendCommandToTrialDispatcher(environment, TRIAL_END, trialResult);
+    // wait until the dispatcher has processed the command and moved the trial out of RUNNING
+    await waitResultMust<boolean>(async () => (trialDetail.status === 'RUNNING' ? undefined : true));
+
+    const shouldSucceed = returnCode === 0;
+    chai.assert.equal<TrialJobStatus>(
+        trialDetail.status,
+        shouldSucceed ? 'SUCCEEDED' : 'FAILED',
+        shouldSucceed ? "trial should be succeeded" : "trial should be failed");
+}
+
+async function waitEnvironment(waitCount: number, // waits for a newly requested environment, reports its runner(s) as initialized and marks it RUNNING
+    previousEnvironments: Map<string, EnvironmentInformation>, // environments already seen by earlier calls; the new one is added to it
+    environmentService: UtEnvironmentService, commandChannel: UtCommandChannel,
+    gpuCount: number = 2, nodeCount: number = 1, // GPUs reported per node (0 sends no GPU_INFO) and number of nodes of the environment
+    callback: ((environment: EnvironmentInformation) => Promise<void>) | undefined = undefined): Promise<EnvironmentInformation> { // callback runs after the commands are sent and before the status is set to RUNNING
+    const waitRequestEnvironment = await waitResultMust<EnvironmentInformation>(async () => {
+        const environments = environmentService.testGetEnvironments();
+        if (environments.size === waitCount) { // only look once the service holds exactly waitCount environments
+            for (const [id, environment] of environments) {
+                if (!previousEnvironments.has(id)) { // first environment not seen before is the new one
+                    previousEnvironments.set(id, environment);
+                    return environment;
+                }
+            }
+        }
+        return undefined; // not there yet, waitResultMust polls again
+    });
+
+    if (waitRequestEnvironment === undefined) {
+        throw new Error(`waitRequestEnvironment is not defined.`);
+    }
+
+    const nodeIds = [];
+    waitRequestEnvironment.nodeCount = nodeCount;
+    if (nodeCount > 1) {
+        for (let index = 0; index < nodeCount; index++) {
+            nodeIds.push(uniqueString(5)); // one generated id per node
+        }
+    } else {
+        nodeIds.push(undefined); // single node environment reports without a node id
+    }
+    for (const nodeId of nodeIds) {
+        // tell the dispatcher that the runner on this node is initialized.
+        await commandChannel.testSendCommandToTrialDispatcher(waitRequestEnvironment, INITIALIZED, { node: nodeId });
+
+        if (gpuCount > 0) {
+            await commandChannel.testSendCommandToTrialDispatcher(waitRequestEnvironment, GPU_INFO, newGpuInfo(gpuCount, nodeId));
+        }
+    }
+
+    if (callback) {
+        await callback(waitRequestEnvironment);
+    }
+
+    // set env to running
+    environmentService.testSetEnvironmentStatus(waitRequestEnvironment, 'RUNNING');
+
+    await waitResultMust<boolean>(async () => { // block until isRunnerReady is set on the environment
+        return waitRequestEnvironment.isRunnerReady ? true : undefined;
+    });
+
+    return waitRequestEnvironment;
+}
+
+describe('Unit Test for TrialDispatcher', () => {
+
+    let trialRunPromise: Promise<void>;
+    let trialDispatcher: TrialDispatcher;
+    let commandChannel: UtCommandChannel;
+    let environmentService: UtEnvironmentService;
+    let log: Logger;
+    let previousEnvironments: Map<string, EnvironmentInformation> = new Map<string, EnvironmentInformation>();
+    const currentDir = path.dirname(__filename);
+
+    before(() => {
+        chai.should();
+        chai.use(chaiAsPromised);
+        prepareUnitTest();
+        log = getLogger();
+    });
+
+    after(() => {
+        cleanupUnitTest();
+    });
+
+    // Per-test setup: create a new dispatcher, bind the unit-test environment service,
+    // send the baseline cluster metadata, and start the dispatcher's run loop.
+    beforeEach(async () => {
+        const trialConfigJson = JSON.stringify({
+            codeDir: currentDir,
+            command: "echo",
+        });
+        const nniManagerIpJson = JSON.stringify({
+            nniManagerIp: "127.0.0.1",
+        });
+
+        trialDispatcher = new TrialDispatcher();
+        component.Container.bind(EnvironmentService)
+            .to(UtEnvironmentService)
+            .scope(Scope.Singleton);
+
+        await trialDispatcher.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfigJson);
+        await trialDispatcher.setClusterMetadata(TrialConfigMetadataKey.NNI_MANAGER_IP, nniManagerIpJson);
+
+        // run() is deliberately not awaited here; its promise is awaited in afterEach.
+        trialRunPromise = trialDispatcher.run();
+
+        // Fetch the test doubles so that individual tests can drive them directly.
+        environmentService = component.get(EnvironmentService) as UtEnvironmentService;
+        commandChannel = environmentService.testGetCommandChannel();
+    });
+
+    // Per-test teardown: forget the environments seen by this test, clean up the dispatcher,
+    // reset the fake environment service, then await the run() promise started in beforeEach
+    // (a rejection of that promise will surface here).
+    afterEach(async () => {
+        previousEnvironments.clear();
+        await trialDispatcher.cleanUp();
+        environmentService.testReset();
+        await trialRunPromise;
+    });
+
+    // Two trials submitted one after another are expected to share a single environment.
+    it('reuse env', async () => {
+        const firstTrial = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, firstTrial);
+        await verifyTrialResult(commandChannel, firstTrial, 0);
+
+        // No second waitEnvironment() call: the second trial must run in the existing environment.
+        const secondTrial = await newTrial(trialDispatcher);
+        await verifyTrialRunning(commandChannel, secondTrial);
+        await verifyTrialResult(commandChannel, secondTrial, -1);
+
+        const environmentCount = environmentService.testGetEnvironments().size;
+        chai.assert.equal(environmentCount, 1, "as env reused, so only 1 env should be here.");
+        const allTrials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(allTrials.length, 2, "there should be 2 trials");
+    });
+
+    // With reuseEnvironment disabled, each trial is expected to get its own environment, and
+    // each environment is expected to be stopped (USER_CANCELED) once its trial has completed.
+    it('not reusable env', async () => {
+        // Await the config change so it is applied before the first trial is submitted and
+        // so that a rejection fails this test instead of becoming an unhandled rejection.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+            }));
+
+        let trialDetail = await newTrial(trialDispatcher);
+
+        const firstEnvironment = await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, trialDetail);
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+        await waitResultMust<true>(async () => {
+            return firstEnvironment.status === 'USER_CANCELED' ? true : undefined;
+        });
+
+        trialDetail = await newTrial(trialDispatcher);
+
+        // Keep a handle on the second environment: the original re-checked the first one here,
+        // which was already USER_CANCELED and therefore verified nothing new.
+        const secondEnvironment = await waitEnvironment(2, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, trialDetail);
+        await verifyTrialResult(commandChannel, trialDetail, -1);
+        await waitResultMust<true>(async () => {
+            return secondEnvironment.status === 'USER_CANCELED' ? true : undefined;
+        });
+
+        chai.assert.equal(environmentService.testGetEnvironments().size, 2, "as env not reused, so 2 envs should be here.");
+        const trials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(trials.length, 2, "there should be 2 trials");
+    });
+
+    // Once the environment service reports that no further environments are available,
+    // a second trial is expected to run in the single environment that already exists.
+    it('no more env', async () => {
+
+        const trialDetail1 = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+
+        // Passing false makes UtEnvironmentService.hasMoreEnvironments return false.
+        environmentService.testSetNoMoreEnvironment(false);
+
+        const trialDetail2 = await newTrial(trialDispatcher);
+
+        await verifyTrialRunning(commandChannel, trialDetail1);
+        await verifyTrialResult(commandChannel, trialDetail1, 0);
+
+        await verifyTrialRunning(commandChannel, trialDetail2);
+        await verifyTrialResult(commandChannel, trialDetail2, -1);
+
+        // The message previously talked about "env not reused", which is not what this test exercises.
+        chai.assert.equal(environmentService.testGetEnvironments().size, 1, "no more env can be requested, so only 1 env should be here.");
+        const trials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(trials.length, 2, "there should be 2 trials");
+    });
+
+
+    // Two trials submitted before any environment is ready are expected to use two environments.
+    it('2trial2env', async () => {
+        const firstTrial = await newTrial(trialDispatcher);
+        const secondTrial = await newTrial(trialDispatcher);
+
+        await waitEnvironment(2, previousEnvironments, environmentService, commandChannel);
+
+        await verifyTrialRunning(commandChannel, firstTrial);
+        await verifyTrialResult(commandChannel, firstTrial, 0);
+
+        await verifyTrialRunning(commandChannel, secondTrial);
+        await verifyTrialResult(commandChannel, secondTrial, 0);
+
+        const environmentCount = environmentService.testGetEnvironments().size;
+        chai.assert.equal(environmentCount, 2, "2 envs should be here.");
+        const allTrials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(allTrials.length, 2, "there should be 2 trials");
+    });
+
+    // Two trials occupy two environments; a third trial submitted afterwards is expected
+    // to run without a third environment being created.
+    it('3trial2env', async () => {
+
+        const trialDetail1 = await newTrial(trialDispatcher);
+        const trialDetail2 = await newTrial(trialDispatcher);
+
+        await waitEnvironment(2, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, trialDetail1);
+        await verifyTrialResult(commandChannel, trialDetail1, 0);
+        await verifyTrialRunning(commandChannel, trialDetail2);
+        await verifyTrialResult(commandChannel, trialDetail2, 0);
+
+        chai.assert.equal(environmentService.testGetEnvironments().size, 2, "2 envs should be here.");
+        let trials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(trials.length, 2, "there should be 2 trials");
+
+
+        const trialDetail3 = await newTrial(trialDispatcher);
+        await verifyTrialRunning(commandChannel, trialDetail3);
+        await verifyTrialResult(commandChannel, trialDetail3, 0);
+
+        chai.assert.equal(environmentService.testGetEnvironments().size, 2, "2 envs should be here.");
+        trials = await trialDispatcher.listTrialJobs();
+        // The failure message previously said "2 trials" although 3 are expected.
+        chai.assert.equal(trials.length, 3, "there should be 3 trials");
+    });
+
+    // Cancelling a running trial must send KILL_TRIAL_JOB for that trial. The second argument
+    // of cancelTrialJob selects the final status asserted below: false -> USER_CANCELED,
+    // true -> EARLY_STOPPED.
+    it('stop trial', async () => {
+
+        const trialDetail1 = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, trialDetail1);
+        await trialDispatcher.cancelTrialJob(trialDetail1.id, false);
+
+        let command = await waitResultMust<Command>(async () => {
+            return await commandChannel.testReceiveCommandFromTrialDispatcher();
+        });
+        chai.assert.equal(command.command, KILL_TRIAL_JOB);
+        log.info(`command: ${JSON.stringify(command)}`);
+        chai.assert.equal(command.data, trialDetail1.id);
+
+        // Wait until the dispatcher has moved the trial out of RUNNING.
+        await waitResultMust<boolean>(async () => {
+            return trialDetail1.status !== 'RUNNING' ? true : undefined;
+        });
+
+        const trialDetail2 = await newTrial(trialDispatcher);
+        await verifyTrialRunning(commandChannel, trialDetail2);
+        await trialDispatcher.cancelTrialJob(trialDetail2.id, true);
+        command = await waitResultMust<Command>(async () => {
+            return await commandChannel.testReceiveCommandFromTrialDispatcher();
+        });
+        chai.assert.equal(command.command, KILL_TRIAL_JOB);
+        log.info(`command: ${JSON.stringify(command)}`);
+        chai.assert.equal(command.data, trialDetail2.id);
+        await waitResultMust<boolean>(async () => {
+            return trialDetail2.status !== 'RUNNING' ? true : undefined;
+        });
+
+        // Messages below previously claimed a single trial although two are created and asserted.
+        chai.assert.equal(environmentService.testGetEnvironments().size, 1, "trials run one after another, so one env");
+        const trials = await trialDispatcher.listTrialJobs();
+
+        chai.assert.equal(trials.length, 2, "there should be 2 stopped trials");
+        let trial = await trialDispatcher.getTrialJob(trialDetail1.id);
+        chai.assert.equal<TrialJobStatus>(trial.status, 'USER_CANCELED', `trial is canceled.`);
+        trial = await trialDispatcher.getTrialJob(trialDetail2.id);
+        chai.assert.equal<TrialJobStatus>(trial.status, 'EARLY_STOPPED', `trial is earlier stopped.`);
+    });
+
+    // Updating a running trial twice must emit one SEND_TRIAL_JOB_PARAMETER command per update,
+    // each carrying the trial id and the JSON-encoded parameters.
+    it('multi phase', async () => {
+        const trialDetail = await newTrial(trialDispatcher);
+
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, trialDetail);
+
+        // Sends one parameter update and checks the command the dispatcher emits for it.
+        const updateAndVerify = async (content: { test: number }): Promise<void> => {
+            await trialDispatcher.updateTrialJob(trialDetail.id, createTrialForm(content));
+
+            const received = await waitResultMust<Command>(async () => {
+                return await commandChannel.testReceiveCommandFromTrialDispatcher();
+            });
+
+            chai.assert.equal(received.command, SEND_TRIAL_JOB_PARAMETER);
+            chai.assert.equal(received.data["trialId"], trialDetail.id);
+            chai.assert.equal(received.data.parameters.index, 0);
+            chai.assert.equal(received.data.parameters.value, JSON.stringify(content));
+        };
+
+        await updateAndVerify({ test: 2 });
+        await updateAndVerify({ test: 3 });
+
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+
+        const environmentCount = environmentService.testGetEnvironments().size;
+        chai.assert.equal(environmentCount, 1, "only one trial, so one env");
+        const allTrials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(allTrials.length, 1, "there should be 1 stopped trial only");
+    });
+
+    // Runs one trial in an environment requested with two nodes (last two waitEnvironment
+    // arguments) and expects a KILL_TRIAL_JOB command after the trial result is verified.
+    it('multi node', async () => {
+        const trialDetail = await newTrial(trialDispatcher);
+
+        const multiNodeEnvironment = await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 2, 2);
+        log.debug(`environment ${JSON.stringify(multiNodeEnvironment)}`);
+        await verifyTrialRunning(commandChannel, trialDetail);
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+
+        chai.assert.equal(multiNodeEnvironment.nodes.size, 2);
+
+        const killCommand = await waitResultMust<Command>(async () => {
+            return await commandChannel.testReceiveCommandFromTrialDispatcher();
+        });
+        chai.assert.equal(killCommand.command, KILL_TRIAL_JOB);
+
+        const environmentCount = environmentService.testGetEnvironments().size;
+        chai.assert.equal(environmentCount, 1, "only one trial, so one env");
+        const allTrials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(allTrials.length, 1, "there should be 1 stopped trial only");
+    });
+
+    // After the first environment is forced to SUCCEEDED, the next trial is expected
+    // to run in a second environment.
+    it('env timeout', async () => {
+        const firstTrial = await newTrial(trialDispatcher);
+        const firstEnvironment = await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, firstTrial);
+        await verifyTrialResult(commandChannel, firstTrial, 0);
+
+        // Simulate the environment finishing on its own.
+        environmentService.testSetEnvironmentStatus(firstEnvironment, 'SUCCEEDED');
+        await waitResultMust<boolean>(async () => {
+            return firstEnvironment.status === 'SUCCEEDED' ? true : undefined;
+        });
+
+        const secondTrial = await newTrial(trialDispatcher);
+        await waitEnvironment(2, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, secondTrial);
+        await verifyTrialResult(commandChannel, secondTrial, 0);
+
+        chai.assert.equal(previousEnvironments.size, 2, "as an env timeout, so 2 envs should be here.");
+        const allTrials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(allTrials.length, 2, "there should be 2 trials");
+    });
+
+    // When an environment fails while its trial is running, the trial must end up FAILED too.
+    it('env failed with trial', async () => {
+        const runningTrial = await newTrial(trialDispatcher);
+        const failingEnvironment = await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        await verifyTrialRunning(commandChannel, runningTrial);
+
+        environmentService.testSetEnvironmentStatus(failingEnvironment, 'FAILED');
+        await waitResultMust<boolean>(async () => {
+            return failingEnvironment.status === 'FAILED' ? true : undefined;
+        });
+
+        // Poll until the dispatcher has propagated the failure to the trial.
+        await waitResultMust<boolean>(async () => {
+            return runningTrial.status === 'FAILED' ? true : undefined;
+        });
+
+        chai.assert.equal<TrialJobStatus>(runningTrial.status, 'FAILED', "env failed, so trial also failed.");
+    });
+
+    // With the baseline trial config from beforeEach (no gpuNum), the command that starts
+    // the trial must not carry any gpuIndices.
+    it('GPUScheduler disabled gpuNum === undefined', async () => {
+        const trialDetail = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+
+        const startCommand = await verifyTrialRunning(commandChannel, trialDetail);
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+
+        chai.assert.equal(startCommand.data["gpuIndices"], undefined);
+    });
+
+    // With gpuNum set to 0, the command that starts the trial must carry an empty gpuIndices string.
+    it('GPUScheduler disabled gpuNum === 0', async () => {
+        // Await the config change so it is applied before the trial is submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 0,
+            }));
+
+        const trialDetail = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        const command = await verifyTrialRunning(commandChannel, trialDetail);
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+
+        chai.assert.equal(command.data["gpuIndices"], "");
+    });
+
+    // With gpuNum set to 1 and waitEnvironment's default GPU arguments, the trial is
+    // expected to be started with gpuIndices "0".
+    it('GPUScheduler enable no cluster gpu config', async () => {
+        // Await the config change so it is applied before the trial is submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 1,
+            }));
+
+        const trialDetail = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        const command = await verifyTrialRunning(commandChannel, trialDetail);
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+
+        chai.assert.equal(command.data["gpuIndices"], "0");
+    });
+
+    // Without gpuNum in the trial config, the command that starts the trial must not
+    // carry any gpuIndices.
+    it('GPUScheduler skipped no GPU info', async () => {
+        // Await the config change so it is applied before the trial is submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+            }));
+
+        const trialDetail = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        const command = await verifyTrialRunning(commandChannel, trialDetail);
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+
+        chai.assert.equal(command.data["gpuIndices"], undefined);
+    });
+
+    // With gpuNum set to 0, the command that starts the trial must carry an empty gpuIndices string.
+    // NOTE(review): despite the title, no node count is passed to waitEnvironment, so this
+    // currently exercises the same path as 'GPUScheduler disabled gpuNum === 0' - confirm
+    // whether a multi-node environment was intended.
+    it('GPUScheduler disabled multi-node', async () => {
+        // Await the config change so it is applied before the trial is submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 0,
+            }));
+
+        const trialDetail = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        const command = await verifyTrialRunning(commandChannel, trialDetail);
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+
+        chai.assert.equal(command.data["gpuIndices"], "");
+    });
+
+    // Two single-GPU trials sharing one environment are expected to be placed on GPU 0 and GPU 1.
+    it('GPUScheduler enabled 2 gpus 2 trial', async () => {
+        // Await the config change so it is applied before the trials are submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 1,
+            }));
+
+        const trialDetail1 = await newTrial(trialDispatcher);
+        const trialDetail2 = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        let command = await verifyTrialRunning(commandChannel, trialDetail1);
+        chai.assert.equal(command.data["gpuIndices"], "0");
+        command = await verifyTrialRunning(commandChannel, trialDetail2);
+        chai.assert.equal(command.data["gpuIndices"], "1");
+
+        await verifyTrialResult(commandChannel, trialDetail1, 0);
+        await verifyTrialResult(commandChannel, trialDetail2, 0);
+
+        chai.assert.equal(environmentService.testGetEnvironments().size, 1);
+        const trials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(trials.length, 2, "there should be 2 trials");
+    });
+
+    // Two trials that each need 2 GPUs, in one environment requested with 4 GPUs, are
+    // expected to be placed on GPUs "0,1" and "2,3".
+    it('GPUScheduler enabled 4 gpus 2 trial(need 2 gpus)', async () => {
+        // Await the config change so it is applied before the trials are submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 2,
+            }));
+
+        const trialDetail1 = await newTrial(trialDispatcher);
+        const trialDetail2 = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 4);
+        let command = await verifyTrialRunning(commandChannel, trialDetail1);
+        chai.assert.equal(command.data["gpuIndices"], "0,1");
+        command = await verifyTrialRunning(commandChannel, trialDetail2);
+        chai.assert.equal(command.data["gpuIndices"], "2,3");
+
+        await verifyTrialResult(commandChannel, trialDetail1, 0);
+        await verifyTrialResult(commandChannel, trialDetail2, 0);
+
+        chai.assert.equal(environmentService.testGetEnvironments().size, 1);
+        const trials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(trials.length, 2, "there should be 2 trials");
+    });
+
+    // The environment is requested with 4 GPUs but its usableGpus list is restricted to [3],
+    // so the trial is expected to be placed on GPU 3.
+    it('GPUScheduler enabled use 4 gpus but only 1 usable(4)', async () => {
+        // Await the config change so it is applied before the trial is submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 1,
+            }));
+
+        const trialDetail = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 4, 1, async (environment) => {
+            environment.usableGpus = [3];
+        });
+        const command = await verifyTrialRunning(commandChannel, trialDetail);
+        chai.assert.equal(command.data["gpuIndices"], "3");
+
+        await verifyTrialResult(commandChannel, trialDetail, 0);
+
+        chai.assert.equal(environmentService.testGetEnvironments().size, 1);
+        const trials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(trials.length, 1);
+    });
+
+    // The first environment's single GPU is taken by the first trial, so a second trial
+    // submitted while the first is still running is expected to get a new environment
+    // and be placed on that environment's GPU 0.
+    it('GPUScheduler enabled TMP_NO_AVAILABLE_GPU, request new env', async () => {
+        // Await the config change so it is applied before the trials are submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 1,
+            }));
+
+        const trialDetail1 = await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 1);
+        let command = await verifyTrialRunning(commandChannel, trialDetail1);
+        chai.assert.equal(command.data["gpuIndices"], "0");
+
+        // The first trial has not reported a result yet when the second one is submitted.
+        const trialDetail2 = await newTrial(trialDispatcher);
+        await waitEnvironment(2, previousEnvironments, environmentService, commandChannel, 1);
+
+        await verifyTrialResult(commandChannel, trialDetail1, 0);
+
+        command = await verifyTrialRunning(commandChannel, trialDetail2);
+        await verifyTrialResult(commandChannel, trialDetail2, 0);
+        chai.assert.equal(command.data["gpuIndices"], "0");
+
+        chai.assert.equal(environmentService.testGetEnvironments().size, 2, 'environments');
+        const trials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(trials.length, 2, 'trials');
+    });
+
+    // Requesting 8 GPUs per trial is expected to make the dispatcher's run() promise reject
+    // with an NNIError mentioning REQUIRE_EXCEED_TOTAL.
+    it('GPUScheduler enabled REQUIRE_EXCEED_TOTAL, need fail', async () => {
+        // Await the config change so it is applied before the trial is submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 8,
+            }));
+
+        await newTrial(trialDispatcher);
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
+        await chai.expect(trialRunPromise).rejectedWith(NNIError, "REQUIRE_EXCEED_TOTAL");
+
+        // Swap in an already-resolved promise so that afterEach, which awaits trialRunPromise,
+        // does not rethrow the rejection that was just asserted.
+        const deferred = new Deferred<void>();
+        trialRunPromise = deferred.promise;
+        deferred.resolve();
+    });
+
+    // Six single-GPU trials are submitted against two environments, each requested with 2 GPUs
+    // and maxTrialNumberPerGpu = 2. The first four trials are expected to be spread two per GPU
+    // over GPUs "0" and "1".
+    // NOTE(review): the title says "4 trials" but six are created; only the first four are
+    // counted in gpuIndexMap.
+    it('GPUScheduler enabled maxTrialNumberPerGpu=2, 4 trials, 2 gpus', async () => {
+        // Await the config change so it is applied before the trials are submitted and
+        // so that a rejection fails this test instead of being lost.
+        await trialDispatcher.setClusterMetadata(
+            TrialConfigMetadataKey.TRIAL_CONFIG,
+            JSON.stringify({
+                reuseEnvironment: false,
+                codeDir: currentDir,
+                gpuNum: 1,
+            }));
+        const trials = [];
+
+        // last two trials shouldn't be in first environment.
+        for (let index = 0; index < 6; index++) {
+            const trial = await newTrial(trialDispatcher);
+            trials.push(trial);
+        }
+        await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 2, 1, async (environment) => {
+            environment.maxTrialNumberPerGpu = 2;
+        });
+        await waitEnvironment(2, previousEnvironments, environmentService, commandChannel, 2, 1, async (environment) => {
+            environment.maxTrialNumberPerGpu = 2;
+        });
+
+        // Count how many of the first four trials were assigned to each gpuIndices value.
+        const gpuIndexMap = new Map<string, number>();
+        for (let index = 0; index < 6; index++) {
+            const trial = trials[index];
+            const command = await verifyTrialRunning(commandChannel, trial);
+            const gpuIndex = command.data["gpuIndices"];
+            const trialNumbers = gpuIndexMap.get(gpuIndex);
+            if (index < 4) {
+                if (undefined === trialNumbers) {
+                    gpuIndexMap.set(gpuIndex, 1);
+                } else {
+                    gpuIndexMap.set(gpuIndex, trialNumbers + 1);
+                }
+            }
+        }
+        chai.assert.equal(gpuIndexMap.size, 2);
+        chai.assert.equal(gpuIndexMap.get("0"), 2);
+        chai.assert.equal(gpuIndexMap.get("1"), 2);
+
+        for (let index = 0; index < 6; index++) {
+            const trial = trials[index];
+            await verifyTrialResult(commandChannel, trial, 0);
+        }
+
+        chai.assert.equal(environmentService.testGetEnvironments().size, 2);
+        const listedTrials = await trialDispatcher.listTrialJobs();
+        chai.assert.equal(listedTrials.length, 6);
+    });
+});
--- a/src/nni_manager/training_service/reusable/test/utCommandChannel.ts
+++ b/src/nni_manager/training_service/reusable/test/utCommandChannel.ts
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+import { encodeCommand } from "../../../core/ipcInterface";
+import { Command, CommandChannel, RunnerConnection } from "../commandChannel";
+import { Channel, EnvironmentInformation } from "../environment";
+
+/** Runner connection created by UtCommandChannel; it adds no members to RunnerConnection. */
+class UtRunnerConnection extends RunnerConnection {}
+
+/**
+ * Command channel used by the unit tests.
+ *
+ * Commands sent through the channel are parsed and queued in memory so that a test can
+ * pop them with `testReceiveCommandFromTrialDispatcher`. Commands "from a runner" are
+ * injected with `testSendCommandToTrialDispatcher`.
+ */
+export class UtCommandChannel extends CommandChannel {
+    // Commands queued by sendCommandInternal, oldest first.
+    private readonly receivedCommands: Command[] = [];
+
+    public get channelName(): Channel {
+        return "ut";
+    }
+
+    /**
+     * Encodes the given command and passes it to `handleCommand` for `environment`.
+     *
+     * @param environment environment the command is attributed to
+     * @param commandType command type passed to `encodeCommand`
+     * @param commandData payload; it is serialized with `JSON.stringify`
+     */
+    public async testSendCommandToTrialDispatcher(environment: EnvironmentInformation, commandType: string, commandData: unknown): Promise<void> {
+        const content = encodeCommand(commandType, JSON.stringify(commandData));
+        this.log.debug(`UtCommandChannel: env ${environment.id} send test command ${content}`);
+        this.handleCommand(environment, content.toString("utf8"));
+    }
+
+    /**
+     * Removes and returns the oldest queued command.
+     *
+     * @returns the command, or `undefined` when the queue is empty
+     */
+    public async testReceiveCommandFromTrialDispatcher(): Promise<Command | undefined> {
+        return this.receivedCommands.shift();
+    }
+
+    public async config(_key: string, _value: any): Promise<void> {
+        // do nothing
+    }
+
+    public async start(): Promise<void> {
+        // do nothing
+    }
+
+    public async stop(): Promise<void> {
+        // do nothing
+    }
+
+    public async run(): Promise<void> {
+        // do nothing
+    }
+
+    /**
+     * Parses `message` into commands and queues each of them instead of sending it anywhere.
+     */
+    protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise<void> {
+        const parsedCommands = this.parseCommands(message);
+        for (const parsedCommand of parsedCommands) {
+            const command = new Command(environment, parsedCommand[0], parsedCommand[1]);
+            this.receivedCommands.push(command);
+        }
+    }
+
+    protected createRunnerConnection(environment: EnvironmentInformation): RunnerConnection {
+        return new UtRunnerConnection(environment);
+    }
+}
--- a/src/nni_manager/training_service/reusable/test/utEnvironmentService.ts
+++ b/src/nni_manager/training_service/reusable/test/utEnvironmentService.ts
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+import { EnvironmentInformation, EnvironmentService, EnvironmentStatus } from "../environment";
+import { EventEmitter } from "events";
+import { CommandChannel } from "../commandChannel";
+import { UtCommandChannel } from "./utCommandChannel";
+
+/**
+ * Environment service used by the unit tests.
+ *
+ * It keeps every started environment in an in-memory map and lets a test change
+ * environment status and the `hasMoreEnvironments` flag directly through the `test*` methods.
+ */
+export class UtEnvironmentService extends EnvironmentService {
+    private commandChannel: UtCommandChannel | undefined;
+    private allEnvironments = new Map<string, EnvironmentInformation>();
+    private hasMoreEnvironmentsInternal = true;
+
+    constructor() {
+        super();
+    }
+
+    public get hasStorageService(): boolean {
+        // storage service is tested by integration testing.
+        return false;
+    }
+
+    public get environmentMaintenceLoopInterval(): number {
+        // presumably milliseconds, kept tiny so tests do not wait - confirm against the base class.
+        return 1;
+    }
+
+    /** Overwrites the status of `environment` with `newStatus`. */
+    public testSetEnvironmentStatus(environment: EnvironmentInformation, newStatus: EnvironmentStatus): void {
+        environment.status = newStatus;
+    }
+
+    /**
+     * Restores the service to its initial state: forgets all tracked environments and
+     * re-enables `hasMoreEnvironments`, so a flag changed by one test cannot leak into
+     * another test that reuses this instance.
+     */
+    public testReset(): void {
+        this.allEnvironments.clear();
+        this.hasMoreEnvironmentsInternal = true;
+    }
+
+    /** Returns the live map of environments passed to `startEnvironment`, keyed by id. */
+    public testGetEnvironments(): Map<string, EnvironmentInformation> {
+        return this.allEnvironments;
+    }
+
+    /**
+     * Returns the channel created by `createCommandChannel`.
+     *
+     * @throws Error when `createCommandChannel` has not been called yet
+     */
+    public testGetCommandChannel(): UtCommandChannel {
+        if (this.commandChannel === undefined) {
+            throw new Error(`command channel shouldn't be undefined.`);
+        }
+        return this.commandChannel;
+    }
+
+    /**
+     * Sets the value returned by `hasMoreEnvironments`.
+     *
+     * Note the naming: despite "NoMore" in the method name, the argument is the new
+     * `hasMoreEnvironments` value, so pass `false` to signal that no more environments exist.
+     */
+    public testSetNoMoreEnvironment(hasMore: boolean): void {
+        this.hasMoreEnvironmentsInternal = hasMore;
+    }
+
+    public get hasMoreEnvironments(): boolean {
+        return this.hasMoreEnvironmentsInternal;
+    }
+
+    /** Creates the unit-test command channel and remembers it for `testGetCommandChannel`. */
+    public createCommandChannel(commandEmitter: EventEmitter): CommandChannel {
+        this.commandChannel = new UtCommandChannel(commandEmitter);
+        return this.commandChannel;
+    }
+
+    public async config(_key: string, _value: string): Promise<void> {
+        // do nothing
+    }
+
+    public async refreshEnvironmentsStatus(_environments: EnvironmentInformation[]): Promise<void> {
+        // do nothing
+    }
+
+    /**
+     * Registers `environment` and marks it WAITING. An environment whose id is already
+     * tracked is left untouched.
+     */
+    public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
+        if (!this.allEnvironments.has(environment.id)) {
+            this.allEnvironments.set(environment.id, environment);
+            environment.status = "WAITING";
+        }
+    }
+
+    /** Marks `environment` as USER_CANCELED; it stays in the tracked map. */
+    public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
+        environment.status = "USER_CANCELED";
+    }
+}