kubernetesJobInfoCollector.ts 2.09 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
3
4
5
6

'use strict';

import * as assert from 'assert';
7
import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common/errors';
8
9
10
11
12
13
14
15
16
import { getLogger, Logger } from '../../common/log';
import { TrialJobStatus } from '../../common/trainingService';
import { KubernetesCRDClient } from './kubernetesApiClient';
import { KubernetesTrialJobDetail } from './kubernetesData';

/**
 * Collects Kubeflow job info from the Kubernetes cluster and updates the
 * locally tracked Kubeflow job status.
 *
 * This base class only drives the iteration over tracked trial jobs; the
 * per-job query in `retrieveSingleTrialJobInfo` throws
 * `MethodNotImplementedError` here (presumably subclasses override it).
 */
export class KubernetesJobInfoCollector {
    /** Trial jobs keyed by trial job id; this is the same Map instance handed to the constructor (not a copy). */
    protected readonly trialJobsMap: Map<string, KubernetesTrialJobDetail>;
    protected readonly log: Logger = getLogger();
    /**
     * Initialised to ['RUNNING', 'WAITING'].
     * NOTE(review): not read anywhere in this class — presumably consumed by subclasses; confirm.
     */
    protected readonly statusesNeedToCheck: TrialJobStatus[];

    /**
     * @param jobMap map of trial job id to trial job detail that this collector iterates over
     */
    constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
        this.trialJobsMap = jobMap;
        this.statusesNeedToCheck = ['RUNNING', 'WAITING'];
    }

    /**
     * Starts a status retrieval for every tracked trial job that was submitted
     * at least 20 seconds ago, then waits for all of those retrievals to settle.
     *
     * @param kubernetesCRDClient client forwarded to `retrieveSingleTrialJobInfo`; must not be undefined
     * @returns a promise that resolves once every started retrieval has completed
     * @throws AssertionError if `kubernetesCRDClient` is undefined
     * @throws NNIError (NOT_FOUND) if the map holds an undefined entry for a trial job id
     */
    public async retrieveTrialStatus(kubernetesCRDClient: KubernetesCRDClient | undefined): Promise<void> {
        assert(kubernetesCRDClient !== undefined);
        // Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time
        // before checking a kubeflow job's status.
        const scheduleBufferMs: number = 20 * 1000;
        const updateKubernetesTrialJobs: Promise<void>[] = [];
        for (const [trialJobId, kubernetesTrialJob] of this.trialJobsMap) {
            if (kubernetesTrialJob === undefined) {
                throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
            }
            if (Date.now() - kubernetesTrialJob.submitTime < scheduleBufferMs) {
                // Skip only this job. Returning here would leave every later job unchecked
                // and the retrievals already started above un-awaited.
                continue;
            }
            updateKubernetesTrialJobs.push(this.retrieveSingleTrialJobInfo(kubernetesCRDClient, kubernetesTrialJob));
        }

        await Promise.all(updateKubernetesTrialJobs);
    }

    /**
     * Retrieves and updates the status of a single trial job.
     * The base implementation always throws.
     *
     * @param kubernetesCRDClient client passed through from `retrieveTrialStatus`
     * @param kubernetesTrialJob the trial job whose status should be refreshed
     * @throws MethodNotImplementedError always, in this base class
     */
    protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
                                               kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
        throw new MethodNotImplementedError();
    }
}