Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
add7ca66
"examples/deepseek_mla/torch_refs.py" did not exist on "159af5dfb322df80356130e30130db20a0ebdf21"
Unverified
Commit
add7ca66
authored
Oct 20, 2020
by
SparkSnail
Committed by
GitHub
Oct 20, 2020
Browse files
Fix remote reuse bugs (#2981)
parent
058b58a6
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
75 additions
and
44 deletions
+75
-44
src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
.../training_service/remote_machine/extends/linuxCommands.ts
+4
-0
src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
...raining_service/remote_machine/extends/windowsCommands.ts
+4
-0
src/nni_manager/training_service/remote_machine/osCommands.ts
...nni_manager/training_service/remote_machine/osCommands.ts
+1
-0
src/nni_manager/training_service/remote_machine/shellExecutor.ts
..._manager/training_service/remote_machine/shellExecutor.ts
+6
-0
src/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
...service/reusable/environments/remoteEnvironmentService.ts
+34
-31
src/nni_manager/training_service/reusable/trialDispatcher.ts
src/nni_manager/training_service/reusable/trialDispatcher.ts
+11
-8
test/config/training_service.yml
test/config/training_service.yml
+2
-0
test/nni_test/nnitest/generate_ts_config.py
test/nni_test/nnitest/generate_ts_config.py
+3
-0
test/pipelines/pipelines-it-remote-linux-to-linux.yml
test/pipelines/pipelines-it-remote-linux-to-linux.yml
+1
-1
tools/nni_trial_tool/trial.py
tools/nni_trial_tool/trial.py
+9
-4
No files found.
src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
View file @
add7ca66
...
...
@@ -136,6 +136,10 @@ class LinuxCommands extends OsCommands {
return
`
${
preCommand
}
&&
${
command
}
`
;
}
}
/**
 * Builds the shell command that probes whether `filePath` exists on the
 * remote Linux machine.
 *
 * The command prints `True` when `test -e` succeeds and `False` otherwise,
 * so the caller decides by reading stdout rather than the exit code.
 *
 * @param filePath path to probe on the remote machine
 * @returns the command text to execute
 */
public fileExistCommand(filePath: string): string {
    // Single-quote the path so whitespace and shell metacharacters are taken
    // literally; an embedded single quote is written as '\'' (close, escaped
    // quote, reopen).
    const quotedPath = `'${filePath.replace(/'/g, `'\\''`)}'`;
    return `test -e ${quotedPath} && echo True || echo False`;
}
}
export
{
LinuxCommands
};
src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
View file @
add7ca66
...
...
@@ -130,6 +130,10 @@ class WindowsCommands extends OsCommands {
return
`
${
preCommand
}
&& set prePath=%path% &&
${
command
}
`
;
}
}
/**
 * Builds the command that probes whether `filePath` is an existing file on
 * the remote Windows machine.
 *
 * `Test-Path ... -PathType Leaf` prints `True` or `False` on stdout.
 *
 * @param filePath path to probe on the remote machine
 * @returns the command text to execute
 */
public fileExistCommand(filePath: string): string {
    // Pass the path as a PowerShell single-quoted literal so a path containing
    // spaces stays one argument; inside such a literal a single quote is
    // escaped by doubling it.
    const quotedPath = `'${filePath.replace(/'/g, "''")}'`;
    return `powershell Test-Path ${quotedPath} -PathType Leaf`;
}
}
export
{
WindowsCommands
};
src/nni_manager/training_service/remote_machine/osCommands.ts
View file @
add7ca66
...
...
@@ -29,6 +29,7 @@ abstract class OsCommands {
/**
 * Returns the command text for extracting an archive.
 * NOTE(review): presumably unpacks `tarFileName` into `targetFolder` - confirm
 * against the concrete subclasses.
 */
public
abstract
extractFile
(
tarFileName
:
string
,
targetFolder
:
string
):
string
;
/**
 * Returns the command text for running a script.
 * NOTE(review): `isFile` looks like it distinguishes a script file path from
 * inline script text - confirm against the concrete subclasses.
 */
public
abstract
executeScript
(
script
:
string
,
isFile
:
boolean
):
string
;
/**
 * Combines an optional pre-command with a command into one command text.
 * Both inputs and the result may be undefined.
 * NOTE(review): the handling of undefined inputs is defined by the subclasses - confirm there.
 */
public
abstract
addPreCommand
(
preCommand
:
string
|
undefined
,
command
:
string
|
undefined
):
string
|
undefined
;
/**
 * Returns the command text that probes whether `filePath` exists on the
 * remote machine, or undefined when no command is available.
 * NOTE(review): implementations are presumably expected to print `True` on
 * stdout when the path exists - confirm against the caller.
 */
public
abstract
fileExistCommand
(
filePath
:
string
):
string
|
undefined
;
public
joinPath
(...
paths
:
string
[]):
string
{
let
dir
:
string
=
paths
.
filter
((
path
:
any
)
=>
path
!==
''
).
join
(
this
.
pathSpliter
);
...
...
src/nni_manager/training_service/remote_machine/shellExecutor.ts
View file @
add7ca66
...
...
@@ -238,6 +238,12 @@ class ShellExecutor {
return
commandResult
.
exitCode
==
0
;
}
/**
 * Checks whether `filePath` exists on the remote machine.
 *
 * Runs the OS-specific existence probe and reports true only when the
 * command's stdout, once trimmed, is exactly `True`.
 *
 * @param filePath path to probe on the remote machine
 * @returns true when the probe printed `True`, false otherwise
 */
public async fileExist(filePath: string): Promise<boolean> {
    const probeCommand = this.osCommands && this.osCommands.fileExistCommand(filePath);
    const probeResult = await this.execute(probeCommand);
    const output = probeResult.stdout;
    if (output === undefined) {
        return false;
    }
    return output.trim() === 'True';
}
public
async
extractFile
(
tarFileName
:
string
,
targetFolder
:
string
):
Promise
<
boolean
>
{
const
commandText
=
this
.
osCommands
&&
this
.
osCommands
.
extractFile
(
tarFileName
,
targetFolder
);
const
commandResult
=
await
this
.
execute
(
commandText
);
...
...
src/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
View file @
add7ca66
...
...
@@ -139,10 +139,15 @@ export class RemoteEnvironmentService extends EnvironmentService {
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
const
jobpidPath
:
string
=
`
${
environment
.
runnerWorkingFolder
}
/pid`
;
const
runnerReturnCodeFilePath
:
string
=
`
${
environment
.
runnerWorkingFolder
}
/code`
;
if
(
fs
.
existsSync
(
jobpidPath
))
{
/* eslint-disable require-atomic-updates */
try
{
// check if pid file exist
const
pidExist
=
await
executor
.
fileExist
(
jobpidPath
);
if
(
!
pidExist
)
{
return
;
}
const
isAlive
=
await
executor
.
isProcessAlive
(
jobpidPath
);
environment
.
status
=
'
RUNNING
'
;
// if the process of jobpid is not alive any more
if
(
!
isAlive
)
{
const
remoteEnvironment
:
RemoteMachineEnvironmentInformation
=
environment
as
RemoteMachineEnvironmentInformation
;
...
...
@@ -167,11 +172,9 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
}
}
catch
(
error
)
{
this
.
releaseEnvironmentResource
(
environment
);
this
.
log
.
error
(
`Update job status exception, error is
${
error
.
message
}
`
);
}
}
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
const
tasks
:
Promise
<
void
>
[]
=
[];
...
...
@@ -245,6 +248,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
'
envs
'
,
environment
.
id
)
environment
.
command
=
`cd
${
environment
.
runnerWorkingFolder
}
&& \
${
environment
.
command
}
--job_pid_file
${
environment
.
runnerWorkingFolder
}
/pid \
1>
${
environment
.
runnerWorkingFolder
}
/trialrunner_stdout 2>
${
environment
.
runnerWorkingFolder
}
/trialrunner_stderr \
&& echo $?
\`
date +%s%3N
\`
>
${
environment
.
runnerWorkingFolder
}
/code`
;
return
Promise
.
resolve
(
true
);
}
...
...
@@ -266,7 +270,6 @@ ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
// Execute command in remote machine
executor
.
executeScript
(
executor
.
joinPath
(
environment
.
runnerWorkingFolder
,
executor
.
getScriptName
(
"
run
"
)),
true
,
false
);
environment
.
status
=
'
RUNNING
'
;
if
(
environment
.
rmMachineMeta
===
undefined
)
{
throw
new
Error
(
`
${
environment
.
id
}
rmMachineMeta not initialized!`
);
}
...
...
src/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
add7ca66
...
...
@@ -663,18 +663,21 @@ class TrialDispatcher implements TrainingService {
await
this
.
commandChannel
.
sendCommand
(
trial
.
environment
,
NEW_TRIAL_JOB
,
trial
.
settings
);
}
/**
 * Release the environment resources assigned to a trial.
 *
 * If the trial still holds an environment, that environment's running-trial
 * counter is decremented once and the trial is detached from it. A trial
 * with no environment assigned is left untouched, so releasing the same
 * trial twice is harmless. The GPU reservation is removed afterwards when
 * the GPU scheduler is enabled.
 *
 * @param trial the trial whose environment slot should be released
 * @throws Error when the assigned environment has no counted running trial
 */
private releaseEnvironment(trial: TrialDetail): void {
    if (trial.environment !== undefined) {
        if (trial.environment.runningTrialCount <= 0) {
            throw new Error(`TrialDispatcher: environment ${trial.environment.id} has no counted running trial!`);
        }
        // Decrement exactly once, then detach; nothing below may dereference
        // trial.environment because it is undefined from here on.
        trial.environment.runningTrialCount--;
        trial.environment = undefined;
    }
    if (true === this.enableGpuScheduler) {
        this.gpuScheduler.removeGpuReservation(trial);
    }
}
private
async
handleMetricData
(
trialId
:
string
,
data
:
any
):
Promise
<
void
>
{
...
...
test/config/training_service.yml
View file @
add7ca66
...
...
@@ -95,6 +95,8 @@ pai:
containerNFSMountPath
:
paiStorageConfigName
:
remote
:
remoteConfig
:
reuse
:
false
machineList
:
-
ip
:
passwd
:
...
...
test/nni_test/nnitest/generate_ts_config.py
View file @
add7ca66
...
...
@@ -86,6 +86,8 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'machineList'
][
0
][
'port'
]
=
args
.
remote_port
if
args
.
remote_pwd
is
not
None
:
config
[
args
.
ts
][
'machineList'
][
0
][
'passwd'
]
=
args
.
remote_pwd
if
args
.
remote_reuse
is
not
None
:
config
[
args
.
ts
][
'remoteConfig'
][
'reuse'
]
=
args
.
remote_reuse
.
lower
()
==
'true'
dump_yml_content
(
TRAINING_SERVICE_FILE
,
config
)
...
...
@@ -119,6 +121,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--remote_pwd"
,
type
=
str
)
parser
.
add_argument
(
"--remote_host"
,
type
=
str
)
parser
.
add_argument
(
"--remote_port"
,
type
=
int
)
parser
.
add_argument
(
"--remote_reuse"
,
type
=
str
)
args
=
parser
.
parse_args
()
update_training_service_config
(
args
)
test/pipelines/pipelines-it-remote-linux-to-linux.yml
View file @
add7ca66
...
...
@@ -62,7 +62,7 @@ jobs:
-
script
:
|
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_user $(docker_user) --remote_host $(remote_host) \
python3 nni_test/nnitest/generate_ts_config.py --ts remote
--remote_reuse $(remote_reuse)
--remote_user $(docker_user) --remote_host $(remote_host) \
--remote_port $(cat port) --remote_pwd $(docker_pwd) --nni_manager_ip $(nni_manager_ip)
cat config/training_service.yml
PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
...
...
tools/nni_trial_tool/trial.py
View file @
add7ca66
...
...
@@ -137,10 +137,15 @@ class Trial:
def kill(self, trial_id=None):
    """Kill this trial's process tree and clean up.

    Acts only when ``trial_id`` matches this trial's id or is ``None``
    (no specific trial requested). Failures while killing are logged and
    do not prevent the cleanup step.

    :param trial_id: id of the trial to kill, or ``None`` for any trial.
    """
    if trial_id == self.id or trial_id is None:
        if self.process is not None:
            try:
                nni_log(LogType.Info, "%s: killing trial" % self.name)
                # Kill every descendant first (recursive=True), then the
                # trial process itself.
                for child in psutil.Process(self.process.pid).children(True):
                    child.kill()
                self.process.kill()
            except psutil.NoSuchProcess:
                # Log self.id rather than trial_id: trial_id is None when the
                # caller did not name a trial, which made the message read
                # "kill trial None failed".
                nni_log(LogType.Info, "kill trial %s failed: %s does not exist!" % (self.id, self.process.pid))
            except Exception as ex:
                nni_log(LogType.Error, "kill trial %s failed: %s " % (self.id, str(ex)))
        self.cleanup()
def
cleanup
(
self
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment