Unverified Commit 70be7d0f authored by fishyds's avatar fishyds Committed by GitHub
Browse files

PAI training service bug fix and enhancement (#136)

* Add NNI installation scripts

* Update PAI script, update NNI_OUTPUT_DIR

* Update NNI dir in nni sdk local.py

* Create .nni folder in nni sdk local.py

* Add check before creating .nni folder

* Fix typo for PAI_INSTALL_NNI_SHELL_FORMAT
parent 4c12435d
...@@ -47,12 +47,21 @@ export class PAITrialJobDetail implements TrialJobDetail { ...@@ -47,12 +47,21 @@ export class PAITrialJobDetail implements TrialJobDetail {
} }
} }
/**
 * Contents of the `install_nni.sh` helper script that is written into the
 * trial's working folder and run before the trial keeper starts.
 *
 * The script installs the `nni` Python package with pip only when
 * `python3 -c 'import nni'` fails, i.e. when the module is not importable yet.
 *
 * The check is deliberately written as a negated `if` with no early exit:
 * the script is executed (`sh install_nni.sh`), not sourced, and a top-level
 * `return` is an error in bash outside a function or sourced script. That
 * error would give the script a non-zero exit status and break the `&&`
 * chain of the trial command whenever nni is already installed.
 */
export const PAI_INSTALL_NNI_SHELL_FORMAT: string =
`#!/bin/bash
# Install nni only when the module cannot be imported yet.
if ! python3 -c 'import nni' > /dev/null 2>&1; then
    pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2
fi`;
export const PAI_TRIAL_COMMAND_FORMAT: string = export const PAI_TRIAL_COMMAND_FORMAT: string =
`pip3 install -v --user git+https://github.com/Microsoft/nni.git@master `export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3}
&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} && cd $NNI_SYS_DIR && sh install_nni.sh
&& cd $NNI_SYS_DIR && mkdir .nni && python3 -m trial_tool.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}' --pai_hdfs_output_dir '{6}'
&& python3 -m trial_tool.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}' --pai_hdfs_host '{7}' --pai_user_name {8}`;
--pai_hdfs_host '{6}' --pai_user_name {7}`;
export const PAI_OUTPUT_DIR_FORMAT: string = export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`; `hdfs://{0}:9000/`;
......
...@@ -39,7 +39,7 @@ import { ...@@ -39,7 +39,7 @@ import {
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer' import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData'; import { PAITrialJobDetail, PAI_INSTALL_NNI_SHELL_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
...@@ -142,6 +142,10 @@ class PAITrainingService implements TrainingService { ...@@ -142,6 +142,10 @@ class PAITrainingService implements TrainingService {
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`); await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`);
const runScriptContent : string = PAI_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
// Write file content ( parameter.cfg ) to local tmp folders // Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form) const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form)
...@@ -188,6 +192,7 @@ class PAITrainingService implements TrainingService { ...@@ -188,6 +192,7 @@ class PAITrainingService implements TrainingService {
PAI_TRIAL_COMMAND_FORMAT, PAI_TRIAL_COMMAND_FORMAT,
// PAI will copy job's codeDir into /root directory // PAI will copy job's codeDir into /root directory
`/root/${trialJobId}`, `/root/${trialJobId}`,
`/root/${trialJobId}/nnioutput`,
trialJobId, trialJobId,
this.experimentId, this.experimentId,
this.paiTrialConfig.command, this.paiTrialConfig.command,
......
...@@ -24,16 +24,18 @@ import os ...@@ -24,16 +24,18 @@ import os
from ..common import init_logger from ..common import init_logger
_sysdir = os.environ['NNI_SYS_DIR']
if not os.path.exists(os.path.join(_sysdir, '.nni')):
os.makedirs(os.path.join(_sysdir, '.nni'))
_metric_file = open(os.path.join(_sysdir, '.nni', 'metrics'), 'wb')
_dir = os.environ['NNI_SYS_DIR'] _outputdir = os.environ['NNI_OUTPUT_DIR']
_metric_file = open(os.path.join(_dir, '.nni', 'metrics'), 'wb') _log_file_path = os.path.join(_outputdir, 'trial.log')
_log_file_path = os.path.join(_dir, 'trial.log')
init_logger(_log_file_path) init_logger(_log_file_path)
def get_parameters(): def get_parameters():
params_file = open(os.path.join(_dir, 'parameter.cfg'), 'r') params_file = open(os.path.join(_sysdir, 'parameter.cfg'), 'r')
return json.load(params_file) return json.load(params_file)
def send_metric(string): def send_metric(string):
......
...@@ -28,7 +28,7 @@ DEFAULT_REST_PORT = 51189 ...@@ -28,7 +28,7 @@ DEFAULT_REST_PORT = 51189
HOME_DIR = os.path.join(os.environ['HOME'], 'nni') HOME_DIR = os.path.join(os.environ['HOME'], 'nni')
LOG_DIR = os.path.join(HOME_DIR, 'trial-keeper', 'log') LOG_DIR = os.environ['NNI_OUTPUT_DIR']
STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout') STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')
......
...@@ -41,9 +41,12 @@ class TrialMetricsReader(): ...@@ -41,9 +41,12 @@ class TrialMetricsReader():
Read metrics data from a trial job Read metrics data from a trial job
''' '''
def __init__(self, rest_port = DEFAULT_REST_PORT): def __init__(self, rest_port = DEFAULT_REST_PORT):
self.offset_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics_offset') metrics_base_dir = os.path.join(NNI_SYS_DIR, '.nni')
self.metrics_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics') self.offset_filename = os.path.join(metrics_base_dir, 'metrics_offset')
self.metrics_filename = os.path.join(metrics_base_dir, 'metrics')
self.rest_port = rest_port self.rest_port = rest_port
if not os.path.exists(metrics_base_dir):
os.makedirs(metrics_base_dir)
def _metrics_file_is_empty(self): def _metrics_file_is_empty(self):
if not os.path.isfile(self.metrics_filename): if not os.path.isfile(self.metrics_filename):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment