ModelZoo / ResNet50_tensorflow / Commits / 78ddf6eb

Commit 78ddf6eb (unverified), authored Jan 26, 2018 by cclauss, committed by GitHub on Jan 26, 2018.

    Merge branch 'master' into patch-6

Parents: 50cb0365, 1f34fcaf
Changes: 328. Showing 20 changed files with 187 additions and 96 deletions (+187 -96).
research/pcl_rl/baseline.py                 +1   -0
research/pcl_rl/controller.py               +20  -10
research/pcl_rl/env_spec.py                 +1   -0
research/pcl_rl/expert_paths.py             +1   -0
research/pcl_rl/full_episode_objective.py   +2   -1
research/pcl_rl/gym_wrapper.py              +3   -2
research/pcl_rl/model.py                    +58  -44
research/pcl_rl/objective.py                +49  -13
research/pcl_rl/optimizers.py               +1   -0
research/pcl_rl/replay_buffer.py            +2   -1
research/pcl_rl/trainer.py                  +36  -16
research/pcl_rl/trust_region.py             +1   -0
research/ptn/eval_rotator.py                +1   -1
research/ptn/metrics.py                     +1   -2
research/ptn/model_ptn.py                   +1   -0
research/ptn/model_rotator.py               +5   -4
research/ptn/model_voxel_generation.py      +1   -0
research/ptn/pretrain_rotator.py            +1   -0
research/ptn/train_ptn.py                   +1   -1
research/ptn/utils.py                       +1   -1
research/pcl_rl/baseline.py

@@ -20,6 +20,7 @@ In some cases this is just an additional linear layer on the policy.
 In other cases, it is a completely separate neural network.
 """

+from six.moves import xrange
 import tensorflow as tf
 import numpy as np
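Most of the one-line additions in this merge are the same Python 3 compatibility import: xrange was removed in Python 3, and six.moves.xrange resolves to the built-in range there (and to xrange on Python 2), so existing loops keep working under both interpreters. A minimal sketch of the idea, independent of the repo:

# six.moves.xrange is xrange on Python 2 and the built-in range on Python 3.
from six.moves import xrange  # pylint: disable=redefined-builtin

total = 0
for i in xrange(5):  # lazy integer sequence on both interpreters
  total += i
print(total)  # 10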
research/pcl_rl/controller.py

@@ -20,6 +20,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from six.moves import xrange
 import tensorflow as tf
 import numpy as np
 import pickle

@@ -109,13 +110,14 @@ class Controller(object):
     self.episode_running_rewards = np.zeros(len(self.env))
     self.episode_running_lengths = np.zeros(len(self.env))

     self.episode_rewards = []
+    self.greedy_episode_rewards = []
     self.episode_lengths = []
     self.total_rewards = []
     self.best_batch_rewards = None

-  def setup(self):
-    self.model.setup()
+  def setup(self, train=True):
+    self.model.setup(train=train)

   def initial_internal_state(self):
     return np.zeros(self.model.policy.rnn_state_dim)

@@ -187,7 +189,7 @@ class Controller(object):
     return initial_state, all_obs, all_act, rewards, all_pad

-  def sample_episodes(self, sess):
+  def sample_episodes(self, sess, greedy=False):
     """Sample steps from the environment until we have enough for a batch."""
     # check if last batch ended with episode that was not terminated

@@ -200,7 +202,7 @@ class Controller(object):
     while total_steps < self.max_step * len(self.env):
       (initial_state, observations, actions, rewards,
-       pads) = self._sample_episodes(sess)
+       pads) = self._sample_episodes(sess, greedy=greedy)
       observations = zip(*observations)
       actions = zip(*actions)

@@ -249,19 +251,26 @@ class Controller(object):
               observations, initial_state, actions, rewards, terminated, pads):
     """Train model using batch."""
+    avg_episode_reward = np.mean(self.episode_rewards)
+    greedy_episode_reward = (np.mean(self.greedy_episode_rewards)
+                             if self.greedy_episode_rewards
+                             else avg_episode_reward)
     loss, summary = None, None
     if self.use_trust_region:
       # use trust region to optimize policy
       loss, _, summary = self.model.trust_region_step(
           sess, observations, initial_state, actions, rewards, terminated, pads,
-          avg_episode_reward=np.mean(self.episode_rewards))
+          avg_episode_reward=avg_episode_reward,
+          greedy_episode_reward=greedy_episode_reward)
     else:
       # otherwise use simple gradient descent on policy
       loss, _, summary = self.model.train_step(
           sess, observations, initial_state, actions, rewards, terminated, pads,
-          avg_episode_reward=np.mean(self.episode_rewards))
+          avg_episode_reward=avg_episode_reward,
+          greedy_episode_reward=greedy_episode_reward)
     if self.use_value_opt:
       # optionally perform specific value optimization
       self.model.fit_values(

@@ -305,7 +314,8 @@ class Controller(object):
     if self.update_eps_lambda:
       episode_rewards = np.array(self.episode_rewards)
       episode_lengths = np.array(self.episode_lengths)
-      eps_lambda = find_best_eps_lambda(episode_rewards, episode_lengths)
+      eps_lambda = find_best_eps_lambda(
+          episode_rewards[-20:], episode_lengths[-20:])
       sess.run(self.model.objective.assign_eps_lambda,
                feed_dict={self.model.objective.new_eps_lambda: eps_lambda})

@@ -328,10 +338,10 @@ class Controller(object):
     """Use greedy sampling."""
     (initial_state, observations, actions, rewards,
-     pads) = self._sample_episodes(sess, greedy=True)
+     pads, terminated) = self.sample_episodes(sess, greedy=True)
     total_rewards = np.sum(np.array(rewards) * (1 - np.array(pads)), axis=0)
-    return np.mean(total_rewards)
+    return total_rewards, self.episode_rewards

   def convert_from_batched_episodes(
       self, initial_state, observations, actions, rewards,

@@ -351,7 +361,7 @@ class Controller(object):
     for i in xrange(num_episodes):
       length = total_length[i]
       ep_initial = initial_state[i]
-      ep_obs = [obs[:length, i, ...] for obs in observations]
+      ep_obs = [obs[:length + 1, i, ...] for obs in observations]
       ep_act = [act[:length + 1, i, ...] for act in actions]
       ep_rewards = rewards[:length, i]
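The controller now threads a greedy flag through episode sampling and keeps a separate greedy_episode_rewards buffer, so an evaluation pass can report how the policy does without exploration noise. The policy's own sampling code is not part of this diff, so the following is only a generic, self-contained illustration of the sampled-versus-greedy distinction (all numbers made up):

import numpy as np

np.random.seed(0)
logits = np.array([1.0, 2.0, 0.5])               # one state's action scores
probs = np.exp(logits) / np.sum(np.exp(logits))  # softmax policy

sampled_action = np.random.choice(len(probs), p=probs)  # training: explore
greedy_action = int(np.argmax(probs))                   # evaluation: exploit
print(sampled_action, greedy_action)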
research/pcl_rl/env_spec.py

@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function

 import numpy as np
+from six.moves import xrange


 class spaces(object):
research/pcl_rl/expert_paths.py

@@ -22,6 +22,7 @@ import tensorflow as tf
 import random
 import os
 import numpy as np
+from six.moves import xrange
 import pickle

 gfile = tf.gfile
research/pcl_rl/full_episode_objective.py

@@ -42,7 +42,8 @@ class Reinforce(objective.Objective):
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     seq_length = tf.shape(rewards)[0]
     not_pad = tf.reshape(1 - pads, [seq_length, -1, self.num_samples])
research/pcl_rl/gym_wrapper.py

@@ -22,6 +22,7 @@ import gym
 import numpy as np
 import random
+from six.moves import xrange

 import env_spec

@@ -92,14 +93,14 @@ class GymWrapper(object):
   def step(self, actions):

-    def env_step(action):
+    def env_step(env, action):
       action = self.env_spec.convert_action_to_gym(action)
       obs, reward, done, tt = env.step(action)
       obs = self.env_spec.convert_obs_to_list(obs)
       return obs, reward, done, tt

     actions = zip(*actions)
-    outputs = [env_step(action)
+    outputs = [env_step(env, action)
                if not done else (self.env_spec.initial_obs(None), 0, True, None)
                for action, env, done in zip(actions, self.envs, self.dones)]

     for i, (_, _, done, _) in enumerate(outputs):
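The env_step(action) to env_step(env, action) change reads like a Python 3 scoping fix: in Python 2 the list-comprehension loop variable env leaks into the enclosing step scope, so the old closure happened to find it, while Python 3 keeps comprehension variables local and the lookup would fail at call time. Passing env explicitly avoids relying on that quirk. A small, self-contained illustration (not the repo's code; env.step is replaced by simple arithmetic):

# Python 2 leaked list-comprehension variables into the enclosing scope;
# Python 3 does not, so a closure that silently relied on the leaked name
# breaks. Passing the value explicitly behaves the same everywhere.
def step_all(actions, envs):
  def env_step(env, action):   # explicit parameters, no hidden name lookup
    return env + action        # stand-in for env.step(action)

  return [env_step(env, action) for action, env in zip(actions, envs)]

print(step_all([1, 2, 3], [10, 20, 30]))  # [11, 22, 33]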
research/pcl_rl/model.py

@@ -57,6 +57,8 @@ class Model(object):
     # summary placeholder
     self.avg_episode_reward = tf.placeholder(tf.float32, [], 'avg_episode_reward')
+    self.greedy_episode_reward = tf.placeholder(
+        tf.float32, [], 'greedy_episode_reward')

     # sampling placeholders
     self.internal_state = tf.placeholder(tf.float32,

@@ -118,12 +120,13 @@ class Model(object):
     self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
                                          'prev_log_probs')

-  def setup(self):
+  def setup(self, train=True):
     """Setup Tensorflow Graph."""
     self.setup_placeholders()
     tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
+    tf.summary.scalar('greedy_episode_reward', self.greedy_episode_reward)

     with tf.variable_scope('model', reuse=None):
       # policy network

@@ -174,45 +177,46 @@ class Model(object):
           target_p.assign(aa * target_p + (1 - aa) * online_p)
           for online_p, target_p in zip(online_vars, target_vars)])

-      # evaluate objective
-      (self.loss, self.raw_loss, self.regression_target,
-       self.gradient_ops, self.summary) = self.objective.get(
-          self.rewards, self.pads,
-          self.values[:-1, :],
-          self.values[-1, :] * (1 - self.terminated),
-          self.log_probs, self.prev_log_probs, self.target_log_probs,
-          self.entropies, self.logits)
-
-      self.regression_target = tf.reshape(self.regression_target, [-1])
-
-      self.policy_vars = [v for v in tf.trainable_variables()
-                          if '/policy_net' in v.name]
-      self.value_vars = [v for v in tf.trainable_variables()
-                         if '/value_net' in v.name]
-
-      # trust region optimizer
-      if self.trust_region_policy_opt is not None:
-        with tf.variable_scope('trust_region_policy', reuse=None):
-          avg_self_kl = (tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
-                         tf.reduce_sum(1 - self.pads))
-          self.trust_region_policy_opt.setup(
-              self.policy_vars, self.raw_loss, avg_self_kl, self.avg_kl)
-
-      # value optimizer
-      if self.value_opt is not None:
-        with tf.variable_scope('trust_region_value', reuse=None):
-          self.value_opt.setup(
-              self.value_vars,
-              tf.reshape(self.values[:-1, :], [-1]),
-              self.regression_target,
-              tf.reshape(self.pads, [-1]),
-              self.regression_input, self.regression_weight)
+      if train:
+        # evaluate objective
+        (self.loss, self.raw_loss, self.regression_target,
+         self.gradient_ops, self.summary) = self.objective.get(
+            self.rewards, self.pads,
+            self.values[:-1, :],
+            self.values[-1, :] * (1 - self.terminated),
+            self.log_probs, self.prev_log_probs, self.target_log_probs,
+            self.entropies, self.logits,
+            self.target_values[:-1, :],
+            self.target_values[-1, :] * (1 - self.terminated))
+
+        self.regression_target = tf.reshape(self.regression_target, [-1])
+
+        self.policy_vars = [v for v in tf.trainable_variables()
+                            if '/policy_net' in v.name]
+        self.value_vars = [v for v in tf.trainable_variables()
+                           if '/value_net' in v.name]
+
+        # trust region optimizer
+        if self.trust_region_policy_opt is not None:
+          with tf.variable_scope('trust_region_policy', reuse=None):
+            avg_self_kl = (tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
+                           tf.reduce_sum(1 - self.pads))
+            self.trust_region_policy_opt.setup(
+                self.policy_vars, self.raw_loss, avg_self_kl, self.avg_kl)
+
+        # value optimizer
+        if self.value_opt is not None:
+          with tf.variable_scope('trust_region_value', reuse=None):
+            self.value_opt.setup(
+                self.value_vars,
+                tf.reshape(self.values[:-1, :], [-1]),
+                self.regression_target,
+                tf.reshape(self.pads, [-1]),
+                self.regression_input, self.regression_weight)

     # we re-use variables for the sampling operations
     with tf.variable_scope('model', reuse=True):

@@ -249,32 +253,42 @@ class Model(object):
   def train_step(self, sess, observations, internal_state, actions,
                  rewards, terminated, pads,
-                 avg_episode_reward=0):
+                 avg_episode_reward=0, greedy_episode_reward=0):
     """Train network using standard gradient descent."""
     outputs = [self.raw_loss, self.gradient_ops, self.summary]
     feed_dict = {self.internal_state: internal_state,
                  self.rewards: rewards,
                  self.terminated: terminated,
                  self.pads: pads,
-                 self.avg_episode_reward: avg_episode_reward}
+                 self.avg_episode_reward: avg_episode_reward,
+                 self.greedy_episode_reward: greedy_episode_reward}

     time_len = None
     for action_place, action in zip(self.actions, actions):
       if time_len is None:
         time_len = len(action)
       assert time_len == len(action)
       feed_dict[action_place] = action
     for obs_place, obs in zip(self.observations, observations):
       assert time_len == len(obs)
       feed_dict[obs_place] = obs

     assert len(rewards) == time_len - 1

     return sess.run(outputs, feed_dict=feed_dict)

   def trust_region_step(self, sess, observations, internal_state, actions,
                         rewards, terminated, pads,
-                        avg_episode_reward=0):
+                        avg_episode_reward=0, greedy_episode_reward=0):
     """Train policy using trust region step."""
     feed_dict = {self.internal_state: internal_state,
                  self.rewards: rewards,
                  self.terminated: terminated,
                  self.pads: pads,
-                 self.avg_episode_reward: avg_episode_reward}
+                 self.avg_episode_reward: avg_episode_reward,
+                 self.greedy_episode_reward: greedy_episode_reward}

     for action_place, action in zip(self.actions, actions):
       feed_dict[action_place] = action
     for obs_place, obs in zip(self.observations, observations):
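The net effect of setup(train=True) is that losses, gradient ops, and the trust-region and value optimizers are only built for the training copy of the model; an evaluation copy can then be constructed with train=False while sharing weights through variable reuse, which is how trainer.py builds its eval_controller later in this diff. A minimal sketch of that sharing pattern, assuming the TF 1.x graph-mode API (the names below are illustrative, not the repo's):

import tensorflow as tf  # TF 1.x graph-mode API, as used by this code base

def build(train=True):
  # Training-only pieces (the loss here) are skipped when train=False.
  w = tf.get_variable('w', shape=[], initializer=tf.zeros_initializer())
  out = {'prediction': 2.0 * w}
  if train:
    out['loss'] = tf.square(w - 1.0)
  return out

with tf.variable_scope('model'):
  train_graph = build(train=True)    # creates model/w plus the loss
with tf.variable_scope('model', reuse=True):
  eval_graph = build(train=False)    # reuses model/w, builds no loss ops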
research/pcl_rl/objective.py

@@ -46,7 +46,8 @@ class Objective(object):
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     """Get objective calculations."""
     raise NotImplementedError()

@@ -101,7 +102,8 @@ class ActorCritic(Objective):
   def __init__(self, learning_rate, clip_norm=5,
                policy_weight=1.0, critic_weight=0.1,
                tau=0.1, gamma=1.0, rollout=10,
-               eps_lambda=0.0, clip_adv=None):
+               eps_lambda=0.0, clip_adv=None,
+               use_target_values=False):
     super(ActorCritic, self).__init__(learning_rate, clip_norm=clip_norm)
     self.policy_weight = policy_weight
     self.critic_weight = critic_weight

@@ -111,14 +113,17 @@ class ActorCritic(Objective):
     self.clip_adv = clip_adv
     self.eps_lambda = tf.get_variable(  # TODO: need a better way
-        'eps_lambda', [], initializer=tf.constant_initializer(eps_lambda))
+        'eps_lambda', [], initializer=tf.constant_initializer(eps_lambda),
+        trainable=False)
     self.new_eps_lambda = tf.placeholder(tf.float32, [])
     self.assign_eps_lambda = self.eps_lambda.assign(
-        0.95 * self.eps_lambda + 0.05 * self.new_eps_lambda)
+        0.99 * self.eps_lambda + 0.01 * self.new_eps_lambda)
+    self.use_target_values = use_target_values

   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     not_pad = 1 - pads
     batch_size = tf.shape(rewards)[1]

@@ -126,10 +131,17 @@ class ActorCritic(Objective):
     rewards = not_pad * rewards
     value_estimates = not_pad * values
     log_probs = not_pad * sum(log_probs)
+    target_values = not_pad * tf.stop_gradient(target_values)
+    final_target_values = tf.stop_gradient(final_target_values)

     sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
-    last_values = shift_values(value_estimates, self.gamma, self.rollout,
-                               final_values)
+    if self.use_target_values:
+      last_values = shift_values(target_values, self.gamma, self.rollout,
+                                 final_target_values)
+    else:
+      last_values = shift_values(value_estimates, self.gamma, self.rollout,
+                                 final_values)
     future_values = sum_rewards + last_values
     baseline_values = value_estimates

@@ -183,7 +195,8 @@ class PCL(ActorCritic):
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     not_pad = 1 - pads
     batch_size = tf.shape(rewards)[1]

@@ -192,6 +205,8 @@ class PCL(ActorCritic):
     log_probs = not_pad * sum(log_probs)
     target_log_probs = not_pad * tf.stop_gradient(sum(target_log_probs))
     relative_log_probs = not_pad * (log_probs - target_log_probs)
+    target_values = not_pad * tf.stop_gradient(target_values)
+    final_target_values = tf.stop_gradient(final_target_values)

     # Prepend.
     not_pad = tf.concat([tf.ones([self.rollout - 1, batch_size]),

@@ -210,14 +225,26 @@ class PCL(ActorCritic):
                                  prev_log_probs], 0)
     relative_log_probs = tf.concat([tf.zeros([self.rollout - 1, batch_size]),
                                     relative_log_probs], 0)
+    target_values = tf.concat(
+        [self.gamma ** tf.expand_dims(
+            tf.range(float(self.rollout - 1), 0, -1), 1) *
+         tf.ones([self.rollout - 1, batch_size]) * target_values[0:1, :],
+         target_values], 0)

     sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
     sum_log_probs = discounted_future_sum(log_probs, self.gamma, self.rollout)
     sum_prev_log_probs = discounted_future_sum(
         prev_log_probs, self.gamma, self.rollout)
     sum_relative_log_probs = discounted_future_sum(
         relative_log_probs, self.gamma, self.rollout)
-    last_values = shift_values(value_estimates, self.gamma, self.rollout,
-                               final_values)
+    if self.use_target_values:
+      last_values = shift_values(target_values, self.gamma, self.rollout,
+                                 final_target_values)
+    else:
+      last_values = shift_values(value_estimates, self.gamma, self.rollout,
+                                 final_values)

     future_values = (- self.tau * sum_log_probs

@@ -272,7 +299,8 @@ class TRPO(ActorCritic):
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     not_pad = 1 - pads
     batch_size = tf.shape(rewards)[1]

@@ -280,10 +308,18 @@ class TRPO(ActorCritic):
     value_estimates = not_pad * values
     log_probs = not_pad * sum(log_probs)
     prev_log_probs = not_pad * prev_log_probs
+    target_values = not_pad * tf.stop_gradient(target_values)
+    final_target_values = tf.stop_gradient(final_target_values)

     sum_rewards = discounted_future_sum(rewards, self.gamma, self.rollout)
-    last_values = shift_values(value_estimates, self.gamma, self.rollout,
-                               final_values)
+    if self.use_target_values:
+      last_values = shift_values(target_values, self.gamma, self.rollout,
+                                 final_target_values)
+    else:
+      last_values = shift_values(value_estimates, self.gamma, self.rollout,
+                                 final_values)
     future_values = sum_rewards + last_values
     baseline_values = value_estimates
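Two themes run through the objective changes: every get() now also receives target-network value estimates (target_values, final_target_values) and, when use_target_values is set, uses them for the n-step bootstrap instead of the online critic; and the eps_lambda update becomes a slower exponential moving average (0.99/0.01 instead of 0.95/0.05). A small NumPy sketch of both ideas with made-up numbers; the repo's discounted_future_sum and shift_values helpers are not reimplemented here:

import numpy as np

# 1) Exponential moving averages: both the target-network weights and
#    eps_lambda follow the update  x <- a * x + (1 - a) * new.
#    A larger `a` (0.99 vs 0.95) tracks more slowly and smoothly.
def ema(values, a):
  x = values[0]
  for v in values[1:]:
    x = a * x + (1 - a) * v
  return x

noisy = np.random.RandomState(0).normal(loc=1.0, scale=0.5, size=200)
print(ema(noisy, 0.95), ema(noisy, 0.99))  # the 0.99 average moves less

# 2) n-step bootstrapping: discount rewards for `rollout` steps, then add a
#    discounted value estimate of the state reached afterwards -- either the
#    online critic's estimate or, with use_target_values, the target net's.
gamma, rollout = 0.99, 3
rewards = np.array([1.0, 0.0, 0.5, 1.0])
bootstrap_value = 2.0  # value estimate after `rollout` steps (made up)
n_step_return = (sum(gamma ** k * rewards[k] for k in range(rollout)) +
                 gamma ** rollout * bootstrap_value)
print(n_step_return)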
research/pcl_rl/optimizers.py

@@ -25,6 +25,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from six.moves import xrange
 import tensorflow as tf
 import numpy as np
 import scipy.optimize
research/pcl_rl/replay_buffer.py

@@ -20,6 +20,7 @@ Implements replay buffer in Python.
 import random
 import numpy as np
+from six.moves import xrange


 class ReplayBuffer(object):

@@ -150,7 +151,7 @@ class PrioritizedReplayBuffer(ReplayBuffer):
   def get_batch(self, n):
     """Get batch of episodes to train on."""
     p = self.sampling_distribution()
-    idxs = np.random.choice(self.cur_size, size=n, replace=False, p=p)
+    idxs = np.random.choice(self.cur_size, size=int(n), replace=False, p=p)
     self.last_batch = idxs
     return [self.buffer[idx] for idx in idxs], p[idxs]
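The only functional change to the prioritized buffer is size=int(n): NumPy expects an integer size, and under Python 3 the batch size can easily arrive as a float (true division), so the cast keeps the sampling call working. A tiny illustration with hypothetical numbers:

import numpy as np

cur_size, batch = 100, 64 / 4      # true division makes `batch` a float (16.0)
p = np.ones(cur_size) / cur_size   # uniform sampling distribution

# A non-integer size is rejected by np.random.choice; casting restores the
# old Python 2 behaviour.
idxs = np.random.choice(cur_size, size=int(batch), replace=False, p=p)
print(idxs.shape)  # (16,)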
research/pcl_rl/trainer.py

@@ -25,6 +25,7 @@ import random
 import os
 import pickle
+from six.moves import xrange

 import controller
 import model
 import policy

@@ -92,6 +93,8 @@ flags.DEFINE_bool('update_eps_lambda', False,
                   'Update lambda automatically based on last 100 episodes.')
 flags.DEFINE_float('gamma', 1.0, 'discount')
 flags.DEFINE_integer('rollout', 10, 'rollout')
+flags.DEFINE_bool('use_target_values', False,
+                  'use target network for value estimates')
 flags.DEFINE_bool('fixed_std', True,
                   'fix the std in Gaussian distributions')
 flags.DEFINE_bool('input_prev_actions', True,

@@ -152,6 +155,10 @@ class Trainer(object):
     self.env = gym_wrapper.GymWrapper(self.env_str,
                                       distinct=FLAGS.batch_size // self.num_samples,
                                       count=self.num_samples)
+    self.eval_env = gym_wrapper.GymWrapper(self.env_str,
+                                           distinct=FLAGS.batch_size // self.num_samples,
+                                           count=self.num_samples)
     self.env_spec = env_spec.EnvSpec(self.env.get_one())

     self.max_step = FLAGS.max_step

@@ -169,7 +176,8 @@ class Trainer(object):
     self.value_opt = FLAGS.value_opt
     assert not self.trust_region_p or self.objective in ['pcl', 'trpo']
     assert self.objective != 'trpo' or self.trust_region_p
-    assert self.value_opt is None or self.critic_weight == 0.0
+    assert self.value_opt is None or self.value_opt == 'None' or \
+        self.critic_weight == 0.0
     self.max_divergence = FLAGS.max_divergence

     self.learning_rate = FLAGS.learning_rate

@@ -182,6 +190,7 @@ class Trainer(object):
     self.update_eps_lambda = FLAGS.update_eps_lambda
     self.gamma = FLAGS.gamma
     self.rollout = FLAGS.rollout
+    self.use_target_values = FLAGS.use_target_values
     self.fixed_std = FLAGS.fixed_std
     self.input_prev_actions = FLAGS.input_prev_actions
     self.recurrent = FLAGS.recurrent

@@ -208,8 +217,7 @@ class Trainer(object):
     self.value_hidden_layers = FLAGS.value_hidden_layers
     self.tf_seed = FLAGS.tf_seed

-    self.save_trajectories_dir = (FLAGS.save_trajectories_dir or
-                                  FLAGS.save_dir)
+    self.save_trajectories_dir = FLAGS.save_trajectories_dir
     self.save_trajectories_file = (
         os.path.join(
             self.save_trajectories_dir, self.env_str.replace('-', '_'))

@@ -244,7 +252,8 @@ class Trainer(object):
           policy_weight=policy_weight,
           critic_weight=self.critic_weight,
           tau=tau, gamma=self.gamma, rollout=self.rollout,
-          eps_lambda=self.eps_lambda, clip_adv=self.clip_adv)
+          eps_lambda=self.eps_lambda, clip_adv=self.clip_adv,
+          use_target_values=self.use_target_values)
     elif self.objective in ['reinforce', 'urex']:
       cls = (full_episode_objective.Reinforce
              if self.objective == 'reinforce' else

@@ -322,10 +331,10 @@ class Trainer(object):
         self.num_expert_paths, self.env_str, self.env_spec,
         load_trajectories_file=self.load_trajectories_file)

-  def get_controller(self):
+  def get_controller(self, env):
     """Get controller."""
     cls = controller.Controller
-    return cls(self.env, self.env_spec, self.internal_dim,
+    return cls(env, self.env_spec, self.internal_dim,
                use_online_batch=self.use_online_batch,
                batch_by_steps=self.batch_by_steps,
                unify_episodes=self.unify_episodes,

@@ -334,7 +343,7 @@ class Trainer(object):
                cutoff_agent=self.cutoff_agent,
                save_trajectories_file=self.save_trajectories_file,
                use_trust_region=self.trust_region_p,
-               use_value_opt=self.value_opt is not None,
+               use_value_opt=self.value_opt not in [None, 'None'],
                update_eps_lambda=self.update_eps_lambda,
                prioritize_by=self.prioritize_by,
                get_model=self.get_model,

@@ -359,16 +368,19 @@ class Trainer(object):
         saver.restore(sess, ckpt.model_checkpoint_path)
       elif FLAGS.load_path:
         logging.info('restoring from %s', FLAGS.load_path)
-        with gfile.AsUser('distbelief-brain-gpu'):
-          saver.restore(sess, FLAGS.load_path)
+        saver.restore(sess, FLAGS.load_path)

     if FLAGS.supervisor:
       with tf.device(tf.ReplicaDeviceSetter(FLAGS.ps_tasks, merge_devices=True)):
-        self.global_step = tf.train.get_or_create_global_step()
+        self.global_step = tf.contrib.framework.get_or_create_global_step()
         tf.set_random_seed(FLAGS.tf_seed)
-        self.controller = self.get_controller()
+        self.controller = self.get_controller(self.env)
         self.model = self.controller.model
         self.controller.setup()
+        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
+          self.eval_controller = self.get_controller(self.eval_env)
+          self.eval_controller.setup(train=False)
         saver = tf.train.Saver(max_to_keep=10)
         step = self.model.global_step
         sv = tf.Supervisor(logdir=FLAGS.save_dir,

@@ -382,10 +394,14 @@ class Trainer(object):
       sess = sv.PrepareSession(FLAGS.master)
     else:
       tf.set_random_seed(FLAGS.tf_seed)
-      self.global_step = tf.train.get_or_create_global_step()
-      self.controller = self.get_controller()
+      self.global_step = tf.contrib.framework.get_or_create_global_step()
+      self.controller = self.get_controller(self.env)
       self.model = self.controller.model
       self.controller.setup()
+      with tf.variable_scope(tf.get_variable_scope(), reuse=True):
+        self.eval_controller = self.get_controller(self.eval_env)
+        self.eval_controller.setup(train=False)
       saver = tf.train.Saver(max_to_keep=10)
       sess = tf.Session()
       sess.run(tf.initialize_all_variables())

@@ -414,21 +430,25 @@ class Trainer(object):
         (loss, summary, total_rewards,
          episode_rewards) = self.controller.train(sess)
+        _, greedy_episode_rewards = self.eval_controller.eval(sess)
+        self.controller.greedy_episode_rewards = greedy_episode_rewards
         losses.append(loss)
         rewards.append(total_rewards)
         all_ep_rewards.extend(episode_rewards)

-        if random.random() < 1 and is_chief and sv and sv._summary_writer:
+        if (random.random() < 0.1 and summary and episode_rewards and
+            is_chief and sv and sv._summary_writer):
           sv.summary_computed(sess, summary)

         model_step = sess.run(self.model.global_step)
         if is_chief and step % self.validation_frequency == 0:
           logging.info('at training step %d, model step %d: '
                        'avg loss %f, avg reward %f, '
-                       'episode rewards: %f',
+                       'episode rewards: %f, greedy rewards: %f',
                        step, model_step,
                        np.mean(losses), np.mean(rewards),
-                       np.mean(all_ep_rewards))
+                       np.mean(all_ep_rewards),
+                       np.mean(greedy_episode_rewards))

           losses = []
           rewards = []
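The new use_target_values flag is plumbed from the command line into the objectives (ActorCritic, and through it PCL and TRPO). A minimal, hypothetical sketch of how such a TF 1.x boolean flag is defined and read; the flag name and help string come from the diff, while the file layout and main() are illustrative and the repo's own wiring may differ:

import tensorflow as tf  # TF 1.x, matching the rest of this code base

flags = tf.app.flags
flags.DEFINE_bool('use_target_values', False,
                  'use target network for value estimates')
FLAGS = flags.FLAGS


def main(_):
  # trainer.py stores this as self.use_target_values and forwards it to the
  # objective as use_target_values=self.use_target_values.
  print('use_target_values =', FLAGS.use_target_values)


if __name__ == '__main__':
  tf.app.run()

Run as, for example, python sketch.py --use_target_values=True; with the flag omitted it keeps its default of False.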
research/pcl_rl/trust_region.py

@@ -24,6 +24,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from six.moves import xrange
 import tensorflow as tf
 import numpy as np
research/ptn/eval_rotator.py

@@ -64,7 +64,7 @@ flags.DEFINE_float('clip_gradient_norm', 0, '')
 flags.DEFINE_integer('save_summaries_secs', 15, '')
 flags.DEFINE_integer('eval_interval_secs', 60 * 5, '')
 # Scheduling
-flags.DEFINE_string('master', 'local', '')
+flags.DEFINE_string('master', '', '')

 FLAGS = flags.FLAGS
research/ptn/metrics.py

@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from six.moves import xrange
 import tensorflow as tf

 slim = tf.contrib.slim

@@ -108,5 +109,3 @@ def add_volume_iou_metrics(inputs, outputs):
   names_to_values['volume_iou'] = tmp_values * 3.0
   names_to_updates['volume_iou'] = tmp_updates
   return names_to_values, names_to_updates
research/ptn/model_ptn.py

@@ -21,6 +21,7 @@ from __future__ import print_function
 import os

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import losses
research/ptn/model_rotator.py

@@ -21,6 +21,7 @@ from __future__ import print_function
 import os

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import input_generator

@@ -191,19 +192,19 @@ def get_train_op_for_scope(loss, optimizer, scopes, params):
 def get_metrics(inputs, outputs, params):
   """Aggregate the metrics for rotator model.

   Args:
     inputs: Input dictionary of the rotator model.
     outputs: Output dictionary returned by the rotator model.
     params: Hyperparameters of the rotator model.

   Returns:
     names_to_values: metrics->values (dict).
     names_to_updates: metrics->ops (dict).
   """
   names_to_values = dict()
   names_to_updates = dict()

   tmp_values, tmp_updates = metrics.add_image_pred_metrics(
       inputs, outputs, params.num_views, 3 * params.image_size**2)
   names_to_values.update(tmp_values)

@@ -217,7 +218,7 @@ def get_metrics(inputs, outputs, params):
   for name, value in names_to_values.iteritems():
     slim.summaries.add_scalar_summary(
         value, name, prefix='eval', print_summary=True)

   return names_to_values, names_to_updates
research/ptn/model_voxel_generation.py

@@ -22,6 +22,7 @@ import abc
 import os

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf

 import input_generator
research/ptn/pretrain_rotator.py

@@ -21,6 +21,7 @@ from __future__ import print_function
 import os

 import numpy as np
+from six.moves import xrange
 import tensorflow as tf
 from tensorflow import app
research/ptn/train_ptn.py

@@ -44,7 +44,7 @@ flags.DEFINE_integer('image_size', 64,
                     'Input images dimension (pixels) - width & height.')
 flags.DEFINE_integer('vox_size', 32, 'Voxel prediction dimension.')
 flags.DEFINE_integer('step_size', 24,
                      'Steps to take in rotation to fetch viewpoints.')
-flags.DEFINE_integer('batch_size', 1, 'Batch size while training.')
+flags.DEFINE_integer('batch_size', 6, 'Batch size while training.')
 flags.DEFINE_float('focal_length', 0.866,
                    'Focal length parameter used in perspective projection.')
 flags.DEFINE_float('focal_range', 1.732,
                    'Focal length parameter used in perspective projection.')
 flags.DEFINE_string('encoder_name', 'ptn_encoder',
research/ptn/utils.py

@@ -28,6 +28,7 @@ from mpl_toolkits.mplot3d import axes3d as p3  # pylint:disable=unused-import
 import numpy as np
 from PIL import Image
 from skimage import measure
+from six.moves import xrange
 import tensorflow as tf

@@ -116,4 +117,3 @@ def visualize_voxel_scatter(points, vis_size=128):
       vis_size, vis_size, 3)
   p.close('all')
   return data