Unverified Commit 70be7d0f authored by fishyds's avatar fishyds Committed by GitHub
Browse files

PAI training service bug fixes and enhancements (#136)

* Add NNI installation scripts

* Update PAI script, update NNI_OUTPUT_DIR

* Update NNI dir in nni sdk local.py

* Create .nni folder in nni sdk local.py

* Add check before creating .nni folder

* Fix typo for PAI_INSTALL_NNI_SHELL_FORMAT
parent 4c12435d
......@@ -47,12 +47,21 @@ export class PAITrialJobDetail implements TrialJobDetail {
}
}
/**
 * Contents of the `install_nni.sh` helper script that is shipped with each PAI trial.
 *
 * The script probes for the `nni` Python package and only runs `pip3 install`
 * when the import fails, so images that already bundle NNI skip the install.
 *
 * The script is executed as a standalone program (`sh install_nni.sh`) inside an
 * `&&` chain, not sourced. It therefore has to terminate with `exit 0` on the
 * "already installed" path: a top-level `return` is only valid inside a function
 * or a sourced script, and can make the script exit non-zero, which would abort
 * the rest of the trial command.
 */
export const PAI_INSTALL_NNI_SHELL_FORMAT: string =
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
  # nni module is already installed, skip
  exit 0
else
  # Install nni
  pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2
fi`;
export const PAI_TRIAL_COMMAND_FORMAT: string =
`pip3 install -v --user git+https://github.com/Microsoft/nni.git@master
&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2}
&& cd $NNI_SYS_DIR && mkdir .nni
&& python3 -m trial_tool.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}'
--pai_hdfs_host '{6}' --pai_user_name {7}`;
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m trial_tool.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}' --pai_hdfs_output_dir '{6}'
--pai_hdfs_host '{7}' --pai_user_name {8}`;
export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
......
......@@ -39,7 +39,7 @@ import {
} from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAITrialJobDetail, PAI_INSTALL_NNI_SHELL_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
......@@ -142,6 +142,10 @@ class PAITrainingService implements TrainingService {
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`);
const runScriptContent : string = PAI_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
// Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form)
......@@ -188,6 +192,7 @@ class PAITrainingService implements TrainingService {
PAI_TRIAL_COMMAND_FORMAT,
// PAI will copy job's codeDir into /root directory
`/root/${trialJobId}`,
`/root/${trialJobId}/nnioutput`,
trialJobId,
this.experimentId,
this.paiTrialConfig.command,
......
......@@ -24,16 +24,18 @@ import os
from ..common import init_logger
_sysdir = os.environ['NNI_SYS_DIR']
if not os.path.exists(os.path.join(_sysdir, '.nni')):
os.makedirs(os.path.join(_sysdir, '.nni'))
_metric_file = open(os.path.join(_sysdir, '.nni', 'metrics'), 'wb')
_dir = os.environ['NNI_SYS_DIR']
_metric_file = open(os.path.join(_dir, '.nni', 'metrics'), 'wb')
_log_file_path = os.path.join(_dir, 'trial.log')
_outputdir = os.environ['NNI_OUTPUT_DIR']
_log_file_path = os.path.join(_outputdir, 'trial.log')
init_logger(_log_file_path)
def get_parameters():
params_file = open(os.path.join(_dir, 'parameter.cfg'), 'r')
params_file = open(os.path.join(_sysdir, 'parameter.cfg'), 'r')
return json.load(params_file)
def send_metric(string):
......
......@@ -28,7 +28,7 @@ DEFAULT_REST_PORT = 51189
HOME_DIR = os.path.join(os.environ['HOME'], 'nni')
LOG_DIR = os.path.join(HOME_DIR, 'trial-keeper', 'log')
LOG_DIR = os.environ['NNI_OUTPUT_DIR']
STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')
......
......@@ -41,9 +41,12 @@ class TrialMetricsReader():
Read metrics data from a trial job
'''
def __init__(self, rest_port = DEFAULT_REST_PORT):
self.offset_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics_offset')
self.metrics_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics')
metrics_base_dir = os.path.join(NNI_SYS_DIR, '.nni')
self.offset_filename = os.path.join(metrics_base_dir, 'metrics_offset')
self.metrics_filename = os.path.join(metrics_base_dir, 'metrics')
self.rest_port = rest_port
if not os.path.exists(metrics_base_dir):
os.makedirs(metrics_base_dir)
def _metrics_file_is_empty(self):
if not os.path.isfile(self.metrics_filename):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment