adaptdljob-template.json 4.06 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
{
  "apiVersion": "adaptdl.petuum.com/v1",
  "kind": "AdaptDLJob",
  "metadata": {
    "name": "<name>",
    "labels": {
      "app": "<app_name>",
      "expId": "<exp_id>",
      "trialId": "<trial_id>"
    }
  },
  "spec": {
    "preemptible": false,
    "template": {
      "spec": {
        "containers": [
          {
            "name": "main",
            "image": "<image>",
            "imagePullPolicy": "Always",
            "command": ["/run.sh"],
            "lifecycle": {
              "preStop": {
                "exec": {
                  "command": ["/cleanup.sh"]
                }
              }
            },
            "env": [
              {
                "name": "ADAPTDL_CHECKPOINT_PATH",
                "value": "/adaptdl/checkpoint"
              },
              {
                "name": "ADAPTDL_TENSORBOARD_LOGDIR",
                "value": "/adaptdl/tensorboard"
              },
              {
                "name": "ADAPTDL_SHARE_PATH",
                "value": "/adaptdl/share"
              }
            ],
            "resources": {
              "requests": {
                "memory": "<memorySize>",
                "cpu": "<cpuNum>"
              },
              "limits": {
                "nvidia.com/gpu": 1
              }
            },
            "volumeMounts": [
              {
                "name": "adaptdl-pvc",
                "mountPath": "/adaptdl/checkpoint",
                "subPath": "adaptdl/checkpoint"
              },
              {
                "name": "adaptdl-pvc",
                "mountPath": "/adaptdl/share",
                "subPath": "adaptdl/share"
              },
              {
                "name": "adaptdl-tensorboard-pvc",
                "mountPath": "/adaptdl/tensorboard",
                "subPath": "adaptdl/tensorboard"
              },
              {
                "name": "adaptdl-nni-configmap",
                "mountPath": "/cleanup.sh",
                "subPath": "cleanup.sh"
              },
              {
                "name": "adaptdl-nni-configmap",
                "mountPath": "/run.sh",
                "subPath": "run.sh"
              }
            ]
          }
        ],
        "imagePullSecrets": [],
        "volumes": [
          {
            "name": "adaptdl-pvc",
            "persistentVolumeClaim": {
              "claimName": "<adaptdl_pvc_name>"
            }
          },
          {
            "name": "adaptdl-tensorboard-pvc",
            "persistentVolumeClaim": {
              "claimName": "<adaptdl_tensorflow_pvc_name>"
            }
          },
          {
            "name": "adaptdl-nni-configmap",
            "configMap": {
              "name": "<adaptdl_nni_configmap_name>",
              "defaultMode": 511
            }
          }
        ]
      }
    }
  }
}