ModelZoo / ResNet50_tensorflow · Commits · d5e826e3

Commit d5e826e3 (unverified), authored Jan 29, 2018 by Steven Hickson; committed by GitHub on Jan 29, 2018

    Merge branch 'master' into master

Parents: e1ac09e1, fc37f117
Changes: 153 (showing 20 changed files with 204 additions and 106 deletions on this page, +204 −106)
research/object_detection/utils/visualization_utils_test.py (+1, −1)
research/pcl_rl/README.md (+18, −11)
research/pcl_rl/baseline.py (+1, −0)
research/pcl_rl/controller.py (+20, −10)
research/pcl_rl/env_spec.py (+1, −0)
research/pcl_rl/expert_paths.py (+1, −0)
research/pcl_rl/full_episode_objective.py (+2, −1)
research/pcl_rl/gym_wrapper.py (+3, −2)
research/pcl_rl/model.py (+58, −44)
research/pcl_rl/objective.py (+49, −13)
research/pcl_rl/optimizers.py (+1, −0)
research/pcl_rl/replay_buffer.py (+2, −1)
research/pcl_rl/trainer.py (+36, −16)
research/pcl_rl/trust_region.py (+1, −0)
research/ptn/metrics.py (+1, −2)
research/ptn/model_ptn.py (+1, −0)
research/ptn/model_rotator.py (+5, −4)
research/ptn/model_voxel_generation.py (+1, −0)
research/ptn/pretrain_rotator.py (+1, −0)
research/ptn/utils.py (+1, −1)
research/object_detection/utils/visualization_utils_test.py
@@ -145,7 +145,7 @@ class VisualizationUtilsTest(tf.test.TestCase):
     for i in range(images_with_boxes_np.shape[0]):
       img_name = 'image_' + str(i) + '.png'
       output_file = os.path.join(self.get_temp_dir(), img_name)
-      print 'Writing output image %d to %s' % (i, output_file)
+      print('Writing output image %d to %s' % (i, output_file))
       image_pil = Image.fromarray(images_with_boxes_np[i, ...])
       image_pil.save(output_file)
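This hunk only converts a Python 2 print statement into the call form that also works under Python 3. A minimal standalone illustration of the compatible pattern (toy values, not code from the repository):
```
# Python 2/3-compatible printing; the __future__ import makes the
# parenthesized form behave identically under Python 2.
from __future__ import print_function

i, output_file = 0, '/tmp/image_0.png'   # illustrative values only
print('Writing output image %d to %s' % (i, output_file))
```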
research/pcl_rl/README.md
@@ -67,20 +67,27 @@ python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
   --max_divergence=0.05 --value_opt=best_fit --critic_weight=0.0 \
 ```
-Run Mujoco task with Trust-PCL:
+To run Mujoco task using Trust-PCL (off-policy) use the below command.
+It should work well across all environments, given that you
+search sufficiently among
+(1) max_divergence (0.001, 0.0005, 0.002 are good values),
+(2) rollout (1, 5, 10 are good values),
+(3) tf_seed (need to average over enough random seeds).
 ```
 python trainer.py --logtostderr --batch_size=1 --env=HalfCheetah-v1 \
-  --validation_frequency=50 --rollout=10 --critic_weight=0.0 \
-  --gamma=0.995 --clip_norm=40 --learning_rate=0.002 \
-  --replay_buffer_freq=1 --replay_buffer_size=20000 \
-  --replay_buffer_alpha=0.1 --norecurrent --objective=pcl \
-  --max_step=100 --tau=0.0 --eviction=fifo --max_divergence=0.001 \
-  --internal_dim=64 --cutoff_agent=1000 \
-  --replay_batch_size=25 --nouse_online_batch --batch_by_steps \
-  --sample_from=target --value_opt=grad --value_hidden_layers=2 \
-  --update_eps_lambda --unify_episodes --clip_adv=1.0 \
-  --target_network_lag=0.99 --prioritize_by=step
+  --validation_frequency=250 --rollout=1 --critic_weight=1.0 --gamma=0.995 \
+  --clip_norm=40 --learning_rate=0.0001 --replay_buffer_freq=1 \
+  --replay_buffer_size=5000 --replay_buffer_alpha=0.001 --norecurrent \
+  --objective=pcl --max_step=10 --cutoff_agent=1000 --tau=0.0 --eviction=fifo \
+  --max_divergence=0.001 --internal_dim=256 --replay_batch_size=64 \
+  --nouse_online_batch --batch_by_steps --value_hidden_layers=2 \
+  --update_eps_lambda --nounify_episodes --target_network_lag=0.99 \
+  --sample_from=online --clip_adv=1 --prioritize_by=step --num_steps=1000000 \
+  --noinput_prev_actions --use_target_values --tf_seed=57
 ```
 Run Mujoco task with PCL constraint trust region:
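The new README text asks readers to search over max_divergence, rollout, and tf_seed. A hypothetical sweep driver for that search might look like the following; the script name and the flag values come from the README text above, everything else (including the reduced flag set) is illustrative, not part of this commit:
```
# Hypothetical sweep over the three knobs the README singles out.
import itertools
import subprocess

for max_div, rollout, seed in itertools.product(
    [0.001, 0.0005, 0.002],   # (1) max_divergence
    [1, 5, 10],               # (2) rollout
    range(3)):                # (3) tf_seed; average results over seeds
  subprocess.check_call([
      'python', 'trainer.py', '--logtostderr', '--batch_size=1',
      '--env=HalfCheetah-v1', '--objective=pcl', '--use_target_values',
      '--max_divergence=%g' % max_div,
      '--rollout=%d' % rollout,
      '--tf_seed=%d' % seed,
  ])
```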
research/pcl_rl/baseline.py
@@ -20,6 +20,7 @@ In some cases this is just an additional linear layer on the policy.
 In other cases, it is a completely separate neural network.
 """
+from six.moves import xrange
 import tensorflow as tf
 import numpy as np
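Most of the one-line additions in this commit are this same import. The affected modules iterate with xrange, which is not a builtin in Python 3; six.moves.xrange resolves to xrange on Python 2 and to range on Python 3, so the loops keep working unchanged. A quick self-contained check (assumes the six package is installed):
```
# six.moves.xrange gives a lazy integer range under both Python 2 and 3.
from six.moves import xrange

total = 0
for i in xrange(5):
  total += i
print(total)  # 10
```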
research/pcl_rl/controller.py
@@ -20,6 +20,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from six.moves import xrange
 import tensorflow as tf
 import numpy as np
 import pickle
@@ -109,13 +110,14 @@ class Controller(object):
     self.episode_running_rewards = np.zeros(len(self.env))
     self.episode_running_lengths = np.zeros(len(self.env))
     self.episode_rewards = []
+    self.greedy_episode_rewards = []
     self.episode_lengths = []
     self.total_rewards = []
     self.best_batch_rewards = None

-  def setup(self):
-    self.model.setup()
+  def setup(self, train=True):
+    self.model.setup(train=train)

   def initial_internal_state(self):
     return np.zeros(self.model.policy.rnn_state_dim)
@@ -187,7 +189,7 @@ class Controller(object):
     return initial_state, all_obs, all_act, rewards, all_pad

-  def sample_episodes(self, sess):
+  def sample_episodes(self, sess, greedy=False):
     """Sample steps from the environment until we have enough for a batch."""
     # check if last batch ended with episode that was not terminated
@@ -200,7 +202,7 @@ class Controller(object):
     while total_steps < self.max_step * len(self.env):
       (initial_state, observations, actions, rewards,
-       pads) = self._sample_episodes(sess)
+       pads) = self._sample_episodes(sess, greedy=greedy)
       observations = zip(*observations)
       actions = zip(*actions)
@@ -249,19 +251,26 @@ class Controller(object):
            observations, initial_state, actions,
            rewards, terminated, pads):
     """Train model using batch."""
+    avg_episode_reward = np.mean(self.episode_rewards)
+    greedy_episode_reward = (np.mean(self.greedy_episode_rewards)
+                             if self.greedy_episode_rewards
+                             else avg_episode_reward)
     loss, summary = None, None
     if self.use_trust_region:
       # use trust region to optimize policy
       loss, _, summary = self.model.trust_region_step(
           sess, observations, initial_state, actions,
           rewards, terminated, pads,
-          avg_episode_reward=np.mean(self.episode_rewards))
+          avg_episode_reward=avg_episode_reward,
+          greedy_episode_reward=greedy_episode_reward)
     else:
       # otherwise use simple gradient descent on policy
       loss, _, summary = self.model.train_step(
           sess, observations, initial_state, actions,
           rewards, terminated, pads,
-          avg_episode_reward=np.mean(self.episode_rewards))
+          avg_episode_reward=avg_episode_reward,
+          greedy_episode_reward=greedy_episode_reward)

     if self.use_value_opt:
       # optionally perform specific value optimization
       self.model.fit_values(
@@ -305,7 +314,8 @@ class Controller(object):
     if self.update_eps_lambda:
       episode_rewards = np.array(self.episode_rewards)
       episode_lengths = np.array(self.episode_lengths)
-      eps_lambda = find_best_eps_lambda(episode_rewards, episode_lengths)
+      eps_lambda = find_best_eps_lambda(
+          episode_rewards[-20:], episode_lengths[-20:])
       sess.run(self.model.objective.assign_eps_lambda,
                feed_dict={self.model.objective.new_eps_lambda: eps_lambda})
@@ -328,10 +338,10 @@ class Controller(object):
     """Use greedy sampling."""
     (initial_state, observations, actions, rewards,
-     pads) = self._sample_episodes(sess, greedy=True)
+     pads, terminated) = self.sample_episodes(sess, greedy=True)
     total_rewards = np.sum(np.array(rewards) * (1 - np.array(pads)), axis=0)

-    return np.mean(total_rewards)
+    return total_rewards, self.episode_rewards

   def convert_from_batched_episodes(
       self, initial_state, observations, actions, rewards,
@@ -351,7 +361,7 @@ class Controller(object):
     for i in xrange(num_episodes):
       length = total_length[i]
       ep_initial = initial_state[i]
-      ep_obs = [obs[:length, i, ...] for obs in observations]
+      ep_obs = [obs[:length + 1, i, ...] for obs in observations]
       ep_act = [act[:length + 1, i, ...] for act in actions]
       ep_rewards = rewards[:length, i]
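Together with the trainer.py changes further down, the new greedy and train arguments let a second Controller be built purely for greedy evaluation while sharing weights with the training Controller. A rough sketch of the intended call pattern, pieced together from this hunk and the trainer.py hunks below; it assumes an already-constructed Trainer instance `trainer` and an active tf.Session `sess`, and is not verbatim repository code:
```
# Sketch: training controller plus a weight-sharing greedy eval controller.
import tensorflow as tf

train_controller = trainer.get_controller(trainer.env)
train_controller.setup()                     # train=True: builds gradient ops

with tf.variable_scope(tf.get_variable_scope(), reuse=True):
  eval_controller = trainer.get_controller(trainer.eval_env)
  eval_controller.setup(train=False)         # reuse weights, no training ops

loss, summary, total_rewards, episode_rewards = train_controller.train(sess)
_, greedy_episode_rewards = eval_controller.eval(sess)  # greedy sampling inside
train_controller.greedy_episode_rewards = greedy_episode_rewards
```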
research/pcl_rl/env_spec.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 import numpy as np
+from six.moves import xrange

 class spaces(object):
research/pcl_rl/expert_paths.py
@@ -22,6 +22,7 @@ import tensorflow as tf
 import random
 import os
 import numpy as np
+from six.moves import xrange
 import pickle

 gfile = tf.gfile
research/pcl_rl/full_episode_objective.py
@@ -42,7 +42,8 @@ class Reinforce(objective.Objective):
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     seq_length = tf.shape(rewards)[0]
     not_pad = tf.reshape(1 - pads, [seq_length, -1, self.num_samples])
research/pcl_rl/gym_wrapper.py
@@ -22,6 +22,7 @@ import gym
 import numpy as np
 import random
+from six.moves import xrange

 import env_spec
@@ -92,14 +93,14 @@ class GymWrapper(object):
   def step(self, actions):

-    def env_step(action):
+    def env_step(env, action):
       action = self.env_spec.convert_action_to_gym(action)
       obs, reward, done, tt = env.step(action)
       obs = self.env_spec.convert_obs_to_list(obs)
       return obs, reward, done, tt

     actions = zip(*actions)
-    outputs = [env_step(action)
+    outputs = [env_step(env, action)
                if not done else
                (self.env_spec.initial_obs(None), 0, True, None)
                for action, env, done in zip(actions, self.envs, self.dones)]

     for i, (_, _, done, _) in enumerate(outputs):
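Passing env into env_step is more than a style fix. In Python 2 a list comprehension's loop variable leaks into the enclosing function, so the nested env_step could close over env; in Python 3 the comprehension has its own scope, and the old code would fail with a NameError when env_step runs. A toy illustration of the difference (names are illustrative, not from the repo):
```
# Toy demonstration of why env must be passed explicitly under Python 3.
def broken_step(actions, envs):
  def env_step(action):
    # 'env' is only visible here under Python 2, where comprehension
    # variables leak into the enclosing function's scope.
    return (env, action)
  return [env_step(a) for a, env in zip(actions, envs)]  # NameError if called on Python 3

def fixed_step(actions, envs):
  def env_step(env, action):  # explicit argument works on both versions
    return (env, action)
  return [env_step(env, a) for a, env in zip(actions, envs)]

print(fixed_step([0, 1], ['env_a', 'env_b']))  # [('env_a', 0), ('env_b', 1)]
```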
research/pcl_rl/model.py
@@ -57,6 +57,8 @@ class Model(object):
     # summary placeholder
     self.avg_episode_reward = tf.placeholder(tf.float32, [], 'avg_episode_reward')
+    self.greedy_episode_reward = tf.placeholder(
+        tf.float32, [], 'greedy_episode_reward')

     # sampling placeholders
     self.internal_state = tf.placeholder(tf.float32,
@@ -118,12 +120,13 @@ class Model(object):
     self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
                                          'prev_log_probs')

-  def setup(self):
+  def setup(self, train=True):
     """Setup Tensorflow Graph."""
     self.setup_placeholders()
     tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
+    tf.summary.scalar('greedy_episode_reward', self.greedy_episode_reward)

     with tf.variable_scope('model', reuse=None):
       # policy network
@@ -174,45 +177,46 @@ class Model(object):
             target_p.assign(aa * target_p + (1 - aa) * online_p)
             for online_p, target_p in zip(online_vars, target_vars)])

-    # evaluate objective
-    (self.loss, self.raw_loss, self.regression_target,
-     self.gradient_ops, self.summary) = self.objective.get(
-        self.rewards, self.pads,
-        self.values[:-1, :],
-        self.values[-1, :] * (1 - self.terminated),
-        self.log_probs, self.prev_log_probs, self.target_log_probs,
-        self.entropies, self.logits)
-
-    self.regression_target = tf.reshape(self.regression_target, [-1])
-
-    self.policy_vars = [v for v in tf.trainable_variables()
-                        if '/policy_net' in v.name]
-    self.value_vars = [v for v in tf.trainable_variables()
-                       if '/value_net' in v.name]
-
-    # trust region optimizer
-    if self.trust_region_policy_opt is not None:
-      with tf.variable_scope('trust_region_policy', reuse=None):
-        avg_self_kl = (tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
-                       tf.reduce_sum(1 - self.pads))
-        self.trust_region_policy_opt.setup(
-            self.policy_vars, self.raw_loss, avg_self_kl, self.avg_kl)
-
-    # value optimizer
-    if self.value_opt is not None:
-      with tf.variable_scope('trust_region_value', reuse=None):
-        self.value_opt.setup(
-            self.value_vars,
-            tf.reshape(self.values[:-1, :], [-1]),
-            self.regression_target,
-            tf.reshape(self.pads, [-1]),
-            self.regression_input, self.regression_weight)
+    if train:
+      # evaluate objective
+      (self.loss, self.raw_loss, self.regression_target,
+       self.gradient_ops, self.summary) = self.objective.get(
+          self.rewards, self.pads,
+          self.values[:-1, :],
+          self.values[-1, :] * (1 - self.terminated),
+          self.log_probs, self.prev_log_probs, self.target_log_probs,
+          self.entropies, self.logits,
+          self.target_values[:-1, :],
+          self.target_values[-1, :] * (1 - self.terminated))
+
+      self.regression_target = tf.reshape(self.regression_target, [-1])
+
+      self.policy_vars = [v for v in tf.trainable_variables()
+                          if '/policy_net' in v.name]
+      self.value_vars = [v for v in tf.trainable_variables()
+                         if '/value_net' in v.name]
+
+      # trust region optimizer
+      if self.trust_region_policy_opt is not None:
+        with tf.variable_scope('trust_region_policy', reuse=None):
+          avg_self_kl = (tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
+                         tf.reduce_sum(1 - self.pads))
+          self.trust_region_policy_opt.setup(
+              self.policy_vars, self.raw_loss, avg_self_kl, self.avg_kl)
+
+      # value optimizer
+      if self.value_opt is not None:
+        with tf.variable_scope('trust_region_value', reuse=None):
+          self.value_opt.setup(
+              self.value_vars,
+              tf.reshape(self.values[:-1, :], [-1]),
+              self.regression_target,
+              tf.reshape(self.pads, [-1]),
+              self.regression_input, self.regression_weight)

     # we re-use variables for the sampling operations
     with tf.variable_scope('model', reuse=True):
@@ -249,32 +253,42 @@ class Model(object):
   def train_step(self, sess,
                  observations, internal_state, actions,
                  rewards, terminated, pads,
-                 avg_episode_reward=0):
+                 avg_episode_reward=0, greedy_episode_reward=0):
     """Train network using standard gradient descent."""
     outputs = [self.raw_loss, self.gradient_ops, self.summary]
     feed_dict = {self.internal_state: internal_state,
                  self.rewards: rewards,
                  self.terminated: terminated,
                  self.pads: pads,
-                 self.avg_episode_reward: avg_episode_reward}
+                 self.avg_episode_reward: avg_episode_reward,
+                 self.greedy_episode_reward: greedy_episode_reward}

     time_len = None
     for action_place, action in zip(self.actions, actions):
       if time_len is None:
         time_len = len(action)
       assert time_len == len(action)
       feed_dict[action_place] = action
     for obs_place, obs in zip(self.observations, observations):
       assert time_len == len(obs)
       feed_dict[obs_place] = obs
     assert len(rewards) == time_len - 1

     return sess.run(outputs, feed_dict=feed_dict)

   def trust_region_step(self, sess,
                         observations, internal_state, actions,
                         rewards, terminated, pads,
-                        avg_episode_reward=0):
+                        avg_episode_reward=0, greedy_episode_reward=0):
     """Train policy using trust region step."""
     feed_dict = {self.internal_state: internal_state,
                  self.rewards: rewards,
                  self.terminated: terminated,
                  self.pads: pads,
-                 self.avg_episode_reward: avg_episode_reward}
+                 self.avg_episode_reward: avg_episode_reward,
+                 self.greedy_episode_reward: greedy_episode_reward}

     for action_place, action in zip(self.actions, actions):
       feed_dict[action_place] = action
     for obs_place, obs in zip(self.observations, observations):
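The bulk of this model.py hunk re-indents the objective, trust-region, and value-optimizer construction under `if train:`, so that an evaluation copy of the model can be built under variable reuse without creating any gradient ops. A stripped-down sketch of that pattern with a toy network (TF1-style, not the repository's classes):
```
# Minimal sketch: build training ops only when train=True, then build a
# weight-sharing eval copy with reuse=True and no gradients.
import tensorflow as tf

def build(train=True):
  x = tf.placeholder(tf.float32, [None, 4], 'x')
  logits = tf.layers.dense(x, 2, name='policy_net')
  if train:
    labels = tf.placeholder(tf.int32, [None], 'labels')
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits))
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
    return x, logits, labels, loss, train_op
  return x, logits

with tf.variable_scope('model'):
  train_graph = build(train=True)
with tf.variable_scope('model', reuse=True):   # same weights, no training ops
  eval_graph = build(train=False)
```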
research/pcl_rl/objective.py
@@ -46,7 +46,8 @@ class Objective(object):
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     """Get objective calculations."""
     raise NotImplementedError()
@@ -101,7 +102,8 @@ class ActorCritic(Objective):
   def __init__(self, learning_rate, clip_norm=5,
                policy_weight=1.0, critic_weight=0.1,
                tau=0.1, gamma=1.0, rollout=10,
-               eps_lambda=0.0, clip_adv=None):
+               eps_lambda=0.0, clip_adv=None,
+               use_target_values=False):
     super(ActorCritic, self).__init__(learning_rate, clip_norm=clip_norm)
     self.policy_weight = policy_weight
     self.critic_weight = critic_weight
@@ -111,14 +113,17 @@ class ActorCritic(Objective):
     self.clip_adv = clip_adv
     self.eps_lambda = tf.get_variable(  # TODO: need a better way
-        'eps_lambda', [], initializer=tf.constant_initializer(eps_lambda))
+        'eps_lambda', [], initializer=tf.constant_initializer(eps_lambda),
+        trainable=False)
     self.new_eps_lambda = tf.placeholder(tf.float32, [])
     self.assign_eps_lambda = self.eps_lambda.assign(
-        0.95 * self.eps_lambda + 0.05 * self.new_eps_lambda)
+        0.99 * self.eps_lambda + 0.01 * self.new_eps_lambda)
+    self.use_target_values = use_target_values

   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     not_pad = 1 - pads
     batch_size = tf.shape(rewards)[1]
@@ -126,10 +131,17 @@ class ActorCritic(Objective):
     rewards = not_pad * rewards
     value_estimates = not_pad * values
     log_probs = not_pad * sum(log_probs)
+    target_values = not_pad * tf.stop_gradient(target_values)
+    final_target_values = tf.stop_gradient(final_target_values)

     sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
-    last_values = shift_values(value_estimates, self.gamma,
-                               self.rollout, final_values)
+    if self.use_target_values:
+      last_values = shift_values(target_values, self.gamma,
+                                 self.rollout, final_target_values)
+    else:
+      last_values = shift_values(value_estimates, self.gamma,
+                                 self.rollout, final_values)

     future_values = sum_rewards + last_values
     baseline_values = value_estimates
@@ -183,7 +195,8 @@ class PCL(ActorCritic):
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     not_pad = 1 - pads
     batch_size = tf.shape(rewards)[1]
@@ -192,6 +205,8 @@ class PCL(ActorCritic):
     log_probs = not_pad * sum(log_probs)
     target_log_probs = not_pad * tf.stop_gradient(sum(target_log_probs))
     relative_log_probs = not_pad * (log_probs - target_log_probs)
+    target_values = not_pad * tf.stop_gradient(target_values)
+    final_target_values = tf.stop_gradient(final_target_values)

     # Prepend.
     not_pad = tf.concat([tf.ones([self.rollout - 1, batch_size]),
@@ -210,14 +225,26 @@ class PCL(ActorCritic):
                                 prev_log_probs], 0)
     relative_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
                                     relative_log_probs], 0)
+    target_values = tf.concat(
+        [self.gamma ** tf.expand_dims(
+            tf.range(float(self.rollout - 1), 0, -1), 1) *
+         tf.ones([self.rollout - 1, batch_size]) * target_values[0:1, :],
+         target_values], 0)

     sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
     sum_log_probs = discounted_future_sum(log_probs, self.gamma, self.rollout)
     sum_prev_log_probs = discounted_future_sum(prev_log_probs, self.gamma, self.rollout)
     sum_relative_log_probs = discounted_future_sum(
         relative_log_probs, self.gamma, self.rollout)
-    last_values = shift_values(value_estimates, self.gamma,
-                               self.rollout, final_values)
+    if self.use_target_values:
+      last_values = shift_values(target_values, self.gamma,
+                                 self.rollout, final_target_values)
+    else:
+      last_values = shift_values(value_estimates, self.gamma,
+                                 self.rollout, final_values)

     future_values = (- self.tau * sum_log_probs
@@ -272,7 +299,8 @@ class TRPO(ActorCritic):
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     not_pad = 1 - pads
     batch_size = tf.shape(rewards)[1]
@@ -280,10 +308,18 @@ class TRPO(ActorCritic):
     value_estimates = not_pad * values
     log_probs = not_pad * sum(log_probs)
     prev_log_probs = not_pad * prev_log_probs
+    target_values = not_pad * tf.stop_gradient(target_values)
+    final_target_values = tf.stop_gradient(final_target_values)

     sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
-    last_values = shift_values(value_estimates, self.gamma,
-                               self.rollout, final_values)
+    if self.use_target_values:
+      last_values = shift_values(target_values, self.gamma,
+                                 self.rollout, final_target_values)
+    else:
+      last_values = shift_values(value_estimates, self.gamma,
+                                 self.rollout, final_values)

     future_values = sum_rewards + last_values
     baseline_values = value_estimates
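Two small but easy-to-miss parts of the eps_lambda change: the variable is now created with trainable=False so the optimizer cannot update it, and the moving average toward new_eps_lambda is slowed from a 0.95/0.05 mix to 0.99/0.01. A standalone version of just that update (illustrative TF1 snippet; the initial value of 0.0 is assumed):
```
# Isolated version of the eps_lambda running-average update after this commit.
import tensorflow as tf

eps_lambda = tf.get_variable('eps_lambda', [], trainable=False,
                             initializer=tf.constant_initializer(0.0))
new_eps_lambda = tf.placeholder(tf.float32, [])
assign_eps_lambda = eps_lambda.assign(
    0.99 * eps_lambda + 0.01 * new_eps_lambda)  # was 0.95 / 0.05 before

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  # Each call moves the estimate only 1% toward the proposed value.
  print(sess.run(assign_eps_lambda, feed_dict={new_eps_lambda: 1.0}))  # ~0.01
```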
research/pcl_rl/optimizers.py
@@ -25,6 +25,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from six.moves import xrange
 import tensorflow as tf
 import numpy as np
 import scipy.optimize
research/pcl_rl/replay_buffer.py
@@ -20,6 +20,7 @@ Implements replay buffer in Python.
 import random
 import numpy as np
+from six.moves import xrange

 class ReplayBuffer(object):
@@ -150,7 +151,7 @@ class PrioritizedReplayBuffer(ReplayBuffer):
   def get_batch(self, n):
     """Get batch of episodes to train on."""
     p = self.sampling_distribution()
-    idxs = np.random.choice(self.cur_size, size=n, replace=False, p=p)
+    idxs = np.random.choice(self.cur_size, size=int(n), replace=False, p=p)
     self.last_batch = idxs
     return [self.buffer[idx] for idx in idxs], p[idxs]
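The only functional change in replay_buffer.py is casting n to int before np.random.choice. When the requested batch size comes out of a division it can be a float under Python 3's true division, and NumPy rejects a non-integer size argument. A quick illustration (toy numbers):
```
# Why the int(n) cast matters once true division can produce floats.
import numpy as np

cur_size, n = 10, 25 / 5          # n == 5.0 under Python 3 division
p = np.ones(cur_size) / cur_size
# np.random.choice(cur_size, size=n, replace=False, p=p)   # rejected: float size
idxs = np.random.choice(cur_size, size=int(n), replace=False, p=p)
print(sorted(idxs))
```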
research/pcl_rl/trainer.py
@@ -25,6 +25,7 @@ import random
 import os
 import pickle
+from six.moves import xrange
 import controller
 import model
 import policy
@@ -92,6 +93,8 @@ flags.DEFINE_bool('update_eps_lambda', False,
                   'Update lambda automatically based on last 100 episodes.')
 flags.DEFINE_float('gamma', 1.0, 'discount')
 flags.DEFINE_integer('rollout', 10, 'rollout')
+flags.DEFINE_bool('use_target_values', False,
+                  'use target network for value estimates')
 flags.DEFINE_bool('fixed_std', True,
                   'fix the std in Gaussian distributions')
 flags.DEFINE_bool('input_prev_actions', True,
@@ -152,6 +155,10 @@ class Trainer(object):
     self.env = gym_wrapper.GymWrapper(
         self.env_str, distinct=FLAGS.batch_size // self.num_samples,
         count=self.num_samples)
+    self.eval_env = gym_wrapper.GymWrapper(
+        self.env_str, distinct=FLAGS.batch_size // self.num_samples,
+        count=self.num_samples)
     self.env_spec = env_spec.EnvSpec(self.env.get_one())
     self.max_step = FLAGS.max_step
@@ -169,7 +176,8 @@ class Trainer(object):
     self.value_opt = FLAGS.value_opt
     assert not self.trust_region_p or self.objective in ['pcl', 'trpo']
     assert self.objective != 'trpo' or self.trust_region_p
-    assert self.value_opt is None or self.critic_weight == 0.0
+    assert self.value_opt is None or self.value_opt == 'None' or \
+        self.critic_weight == 0.0
     self.max_divergence = FLAGS.max_divergence
     self.learning_rate = FLAGS.learning_rate
@@ -182,6 +190,7 @@ class Trainer(object):
     self.update_eps_lambda = FLAGS.update_eps_lambda
     self.gamma = FLAGS.gamma
     self.rollout = FLAGS.rollout
+    self.use_target_values = FLAGS.use_target_values
     self.fixed_std = FLAGS.fixed_std
     self.input_prev_actions = FLAGS.input_prev_actions
     self.recurrent = FLAGS.recurrent
@@ -208,8 +217,7 @@ class Trainer(object):
     self.value_hidden_layers = FLAGS.value_hidden_layers
     self.tf_seed = FLAGS.tf_seed

-    self.save_trajectories_dir = (FLAGS.save_trajectories_dir or
-                                  FLAGS.save_dir)
+    self.save_trajectories_dir = FLAGS.save_trajectories_dir
     self.save_trajectories_file = (
         os.path.join(self.save_trajectories_dir,
                      self.env_str.replace('-', '_'))
@@ -244,7 +252,8 @@ class Trainer(object):
           policy_weight=policy_weight,
           critic_weight=self.critic_weight,
           tau=tau, gamma=self.gamma, rollout=self.rollout,
-          eps_lambda=self.eps_lambda, clip_adv=self.clip_adv)
+          eps_lambda=self.eps_lambda, clip_adv=self.clip_adv,
+          use_target_values=self.use_target_values)
     elif self.objective in ['reinforce', 'urex']:
       cls = (full_episode_objective.Reinforce
              if self.objective == 'reinforce' else
@@ -322,10 +331,10 @@ class Trainer(object):
         self.num_expert_paths, self.env_str, self.env_spec,
         load_trajectories_file=self.load_trajectories_file)

-  def get_controller(self):
+  def get_controller(self, env):
     """Get controller."""
     cls = controller.Controller
-    return cls(self.env, self.env_spec, self.internal_dim,
+    return cls(env, self.env_spec, self.internal_dim,
                use_online_batch=self.use_online_batch,
                batch_by_steps=self.batch_by_steps,
                unify_episodes=self.unify_episodes,
@@ -334,7 +343,7 @@ class Trainer(object):
                cutoff_agent=self.cutoff_agent,
                save_trajectories_file=self.save_trajectories_file,
                use_trust_region=self.trust_region_p,
-               use_value_opt=self.value_opt is not None,
+               use_value_opt=self.value_opt not in [None, 'None'],
                update_eps_lambda=self.update_eps_lambda,
                prioritize_by=self.prioritize_by,
                get_model=self.get_model,
@@ -359,16 +368,19 @@ class Trainer(object):
         saver.restore(sess, ckpt.model_checkpoint_path)
       elif FLAGS.load_path:
         logging.info('restoring from %s', FLAGS.load_path)
-        with gfile.AsUser('distbelief-brain-gpu'):
-          saver.restore(sess, FLAGS.load_path)
+        saver.restore(sess, FLAGS.load_path)

     if FLAGS.supervisor:
       with tf.device(tf.ReplicaDeviceSetter(FLAGS.ps_tasks, merge_devices=True)):
-        self.global_step = tf.train.get_or_create_global_step()
+        self.global_step = tf.contrib.framework.get_or_create_global_step()
         tf.set_random_seed(FLAGS.tf_seed)
-        self.controller = self.get_controller()
+        self.controller = self.get_controller(self.env)
         self.model = self.controller.model
         self.controller.setup()
+        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
+          self.eval_controller = self.get_controller(self.eval_env)
+          self.eval_controller.setup(train=False)
         saver = tf.train.Saver(max_to_keep=10)
         step = self.model.global_step
         sv = tf.Supervisor(logdir=FLAGS.save_dir,
@@ -382,10 +394,14 @@ class Trainer(object):
       sess = sv.PrepareSession(FLAGS.master)
     else:
       tf.set_random_seed(FLAGS.tf_seed)
-      self.global_step = tf.train.get_or_create_global_step()
-      self.controller = self.get_controller()
+      self.global_step = tf.contrib.framework.get_or_create_global_step()
+      self.controller = self.get_controller(self.env)
       self.model = self.controller.model
       self.controller.setup()
+      with tf.variable_scope(tf.get_variable_scope(), reuse=True):
+        self.eval_controller = self.get_controller(self.eval_env)
+        self.eval_controller.setup(train=False)
       saver = tf.train.Saver(max_to_keep=10)
       sess = tf.Session()
       sess.run(tf.initialize_all_variables())
@@ -414,21 +430,25 @@ class Trainer(object):
         (loss, summary, total_rewards,
          episode_rewards) = self.controller.train(sess)
+        _, greedy_episode_rewards = self.eval_controller.eval(sess)
+        self.controller.greedy_episode_rewards = greedy_episode_rewards
         losses.append(loss)
         rewards.append(total_rewards)
         all_ep_rewards.extend(episode_rewards)

-        if random.random() < 1 and is_chief and sv and sv._summary_writer:
+        if (random.random() < 0.1 and summary and episode_rewards
+            and is_chief and sv and sv._summary_writer):
           sv.summary_computed(sess, summary)

         model_step = sess.run(self.model.global_step)
         if is_chief and step % self.validation_frequency == 0:
           logging.info('at training step %d, model step %d: '
                        'avg loss %f, avg reward %f, '
-                       'episode rewards: %f',
+                       'episode rewards: %f, greedy rewards: %f',
                        step, model_step,
                        np.mean(losses), np.mean(rewards),
-                       np.mean(all_ep_rewards))
+                       np.mean(all_ep_rewards),
+                       np.mean(greedy_episode_rewards))

           losses = []
           rewards = []
research/pcl_rl/trust_region.py
@@ -24,6 +24,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from six.moves import xrange
 import tensorflow as tf
 import numpy as np
research/ptn/metrics.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from six.moves import xrange
 import tensorflow as tf

 slim = tf.contrib.slim
@@ -108,5 +109,3 @@ def add_volume_iou_metrics(inputs, outputs):
   names_to_values['volume_iou'] = tmp_values * 3.0
   names_to_updates['volume_iou'] = tmp_updates
   return names_to_values, names_to_updates
-
-
research/ptn/model_ptn.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import losses
research/ptn/model_rotator.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import input_generator
@@ -191,19 +192,19 @@ def get_train_op_for_scope(loss, optimizer, scopes, params):

 def get_metrics(inputs, outputs, params):
   """Aggregate the metrics for rotator model.

   Args:
     inputs: Input dictionary of the rotator model.
     outputs: Output dictionary returned by the rotator model.
     params: Hyperparameters of the rotator model.

   Returns:
     names_to_values: metrics->values (dict).
     names_to_updates: metrics->ops (dict).
   """
   names_to_values = dict()
   names_to_updates = dict()

   tmp_values, tmp_updates = metrics.add_image_pred_metrics(
       inputs, outputs, params.num_views, 3 * params.image_size**2)
   names_to_values.update(tmp_values)
@@ -217,7 +218,7 @@ def get_metrics(inputs, outputs, params):
   for name, value in names_to_values.iteritems():
     slim.summaries.add_scalar_summary(
         value, name, prefix='eval', print_summary=True)

   return names_to_values, names_to_updates
research/ptn/model_voxel_generation.py
@@ -22,6 +22,7 @@ import abc
 import os
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import input_generator
research/ptn/pretrain_rotator.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 from tensorflow import app
research/ptn/utils.py
@@ -28,6 +28,7 @@ from mpl_toolkits.mplot3d import axes3d as p3  # pylint:disable=unused-import
 import numpy as np
 from PIL import Image
 from skimage import measure
+from six.moves import xrange
 import tensorflow as tf
@@ -116,4 +117,3 @@ def visualize_voxel_scatter(points, vis_size=128):
       vis_size, vis_size, 3)
   p.close('all')
   return data
-