ModelZoo / ResNet50_tensorflow · Commits

Commit 439a7edc, authored Jul 20, 2018 by Raymond Yuan
Commit message: a3c for cartpole blogpost
Parent: 76d3d72e

1 changed file with 390 additions and 0 deletions:
research/a3c_blogpost/a3c_cartpole.py (new file, mode 100644, +390 −0)
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import threading
import gym
import multiprocessing
import numpy as np
from queue import Queue
import argparse
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.python import keras
from tensorflow.python.keras import layers

tf.enable_eager_execution()
print("Eager execution: {}".format(tf.executing_eagerly()))


def str2bool(v):
  if v.lower() in ('yes', 'true', 't', 'y', '1'):
    return True
  elif v.lower() in ('no', 'false', 'f', 'n', '0'):
    return False
  else:
    raise argparse.ArgumentTypeError('Boolean value expected.')


parser = argparse.ArgumentParser(description='Run A3C algorithm on the game '
                                             'Cartpole.')
parser.add_argument('--algorithm', default='a3c', type=str,
                    help='Choose between \'a3c\' and \'random\'.')
parser.add_argument("--train", type=str2bool, default=True,
                    help='Train our model or run an existing model and '
                         'watch it play.')
parser.add_argument('--lr', default=0.0005,
                    help='Learning rate for the shared optimizer.')
parser.add_argument('--update-freq', default=20, type=int,
                    help='How often to update the global model.')
parser.add_argument('--max-eps', default=2000, type=int,
                    help='Global maximum number of episodes to run.')
parser.add_argument('--gamma', default=0.99,
                    help='Discount factor of rewards.')
parser.add_argument('--save-dir', default='/tmp/', type=str,
                    help='Directory in which you desire to save the model.')
args = parser.parse_args()
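# Example invocations (illustrative only, not part of the original commit; all
# flags correspond to the argparse options defined above):
#   python a3c_cartpole.py                      # train A3C with the defaults
#   python a3c_cartpole.py --algorithm=random   # random-agent baseline
#   python a3c_cartpole.py --train=False        # load a saved model and watch it play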
class ActorCriticModel(keras.Model):
  def __init__(self, state_size, action_size):
    super(ActorCriticModel, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    self.dense1 = layers.Dense(100, activation='relu')
    self.policy_logits = layers.Dense(action_size)
    self.dense2 = layers.Dense(100, activation='relu')
    self.values = layers.Dense(1)

  def call(self, inputs):
    # Forward pass
    x = self.dense1(inputs)
    logits = self.policy_logits(x)
    v1 = self.dense2(inputs)
    values = self.values(v1)
    return logits, values
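# Note: the policy and value heads do not share hidden layers -- dense1 feeds the
# policy logits and dense2 feeds the value estimate, each directly from the raw
# observation. call() returns the (logits, value) pair used throughout the script.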
def tf_wrap(np_array, dtype=np.float32):
  """Converts an np array to a tf constant.
  Arguments:
    np_array: Input array.
    dtype: The desired data type of the array.
  Returns:
    A tensor of the np array of type dtype.
  """
  if np_array.dtype != dtype:
    np_array = np_array.astype(dtype)
  return tf.constant(np_array)


def record(episode,
           episode_reward,
           worker_idx,
           global_ep_reward,
           result_queue,
           total_loss,
           num_steps):
  """Helper function to store score and print statistics.
  Arguments:
    episode: Current episode
    episode_reward: Reward accumulated over the current episode
    worker_idx: Which thread (worker)
    global_ep_reward: The moving average of the global reward
    result_queue: Queue storing the moving average of the scores
    total_loss: The total loss accumulated over the current episode
    num_steps: The number of steps the episode took to complete
  """
  if global_ep_reward == 0:
    global_ep_reward = episode_reward
  else:
    global_ep_reward = global_ep_reward * 0.99 + episode_reward * 0.01
  print(
      f"Episode: {episode} | "
      f"Moving Average Reward: {int(global_ep_reward)} | "
      f"Episode Reward: {int(episode_reward)} | "
      f"Loss: {int(total_loss / float(num_steps) * 1000) / 1000} | "
      f"Steps: {num_steps} | "
      f"Worker: {worker_idx}"
  )
  result_queue.put(global_ep_reward)
  return global_ep_reward
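# The moving average above is exponential:
#   global_ep_reward <- 0.99 * global_ep_reward + 0.01 * episode_reward
# so the reported curve lags individual episode scores but is much smoother.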
class RandomAgent:
  """Random Agent that will play the specified game
  Arguments:
    env_name: Name of the environment to be played
    max_eps: Maximum number of episodes to run agent for.
  """
  def __init__(self, env_name, max_eps):
    self.env = gym.make(env_name)
    self.max_episodes = max_eps
    self.global_moving_average_reward = 0
    self.res_queue = Queue()

  def run(self):
    reward_avg = 0
    for episode in range(self.max_episodes):
      done = False
      self.env.reset()
      reward_sum = 0.0
      steps = 0
      while not done:
        # Sample randomly from the action space and step
        _, reward, done, _ = self.env.step(self.env.action_space.sample())
        steps += 1
        reward_sum += reward
      # Record statistics
      self.global_moving_average_reward = record(episode,
                                                 reward_sum,
                                                 0,
                                                 self.global_moving_average_reward,
                                                 self.res_queue, 0, steps)
      reward_avg += reward_sum
    final_avg = reward_avg / float(self.max_episodes)
    print("Average score across {} episodes: {}".format(self.max_episodes, final_avg))
    return final_avg
class MasterAgent():
  def __init__(self):
    self.game_name = 'CartPole-v0'
    save_dir = args.save_dir
    self.save_dir = save_dir
    if not os.path.exists(save_dir):
      os.makedirs(save_dir)

    env = gym.make(self.game_name)
    self.state_size = env.observation_space.shape[0]
    self.action_size = env.action_space.n
    self.opt = tf.train.AdamOptimizer(args.lr, use_locking=True)
    print(self.state_size, self.action_size)

    self.global_model = ActorCriticModel(self.state_size, self.action_size)  # global network
    self.global_model(tf.constant(np.random.random((1, self.state_size)), dtype=tf.float32))
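  # Calling the global model once on a random (1, state_size) input builds its
  # variables eagerly, so the shared weights exist before any worker copies or
  # updates them.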
  def train(self):
    if args.algorithm == 'random':
      random_agent = RandomAgent(self.game_name, args.max_eps)
      random_agent.run()
      return

    res_queue = Queue()

    # We run the algorithm on cpu only!
    with tf.device('/cpu:0'):
      workers = [Worker(self.state_size,
                        self.action_size,
                        self.global_model,
                        self.opt, res_queue,
                        i, save_dir=self.save_dir) for i in range(multiprocessing.cpu_count())]

    for i, worker in enumerate(workers):
      print("Starting worker {}".format(i))
      worker.start()

    moving_average_rewards = []  # record episode reward to plot
    while True:
      reward = res_queue.get()
      if reward is not None:
        moving_average_rewards.append(reward)
      else:
        break
    [w.join() for w in workers]

    plt.plot(moving_average_rewards)
    plt.ylabel('Moving average ep reward')
    plt.xlabel('Step')
    plt.savefig(os.path.join(self.save_dir,
                             '{} Moving Average.png'.format(self.game_name)))
    plt.show()
  def play(self):
    env = gym.make(self.game_name).unwrapped
    state = env.reset()
    model = self.global_model
    model_path = os.path.join(self.save_dir, 'model_{}.h5'.format(self.game_name))
    print('Loading model from: {}'.format(model_path))
    model.load_weights(model_path)
    done = False
    step_counter = 0
    reward_sum = 0

    try:
      while not done:
        env.render(mode='rgb_array')
        policy, value = model(tf_wrap(state[None, :]))
        policy = tf.nn.softmax(policy)
        action = np.argmax(policy)
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        print("{}. Reward: {}, action: {}".format(step_counter, reward_sum, action))
        step_counter += 1
    except KeyboardInterrupt:
      print("Received Keyboard Interrupt. Shutting down.")
    finally:
      env.close()
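  # At play time the action is chosen greedily (np.argmax over the softmax policy)
  # rather than sampled, so a trained model behaves deterministically here.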
class Memory:
  def __init__(self):
    self.states = []
    self.actions = []
    self.rewards = []

  def store(self, state, action, reward):
    self.states.append(state)
    self.actions.append(action)
    self.rewards.append(reward)

  def clear(self):
    self.states = []
    self.actions = []
    self.rewards = []
class Worker(threading.Thread):
  # Set up global variables across different threads
  global_episode = 0
  # Moving average reward
  global_moving_average_reward = 0
  best_score = 0
  save_lock = threading.Lock()

  def __init__(self,
               state_size,
               action_size,
               global_model,
               opt,
               result_queue,
               idx,
               game_name='CartPole-v0',
               save_dir='/tmp'):
    super(Worker, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    self.result_queue = result_queue
    self.global_model = global_model
    self.opt = opt
    self.local_model = ActorCriticModel(self.state_size, self.action_size)
    self.worker_idx = idx
    self.game_name = game_name
    self.env = gym.make(self.game_name).unwrapped
    self.save_dir = save_dir
    self.ep_loss = 0.0
  def run(self):
    total_step = 1
    mem = Memory()
    while Worker.global_episode < args.max_eps:
      current_state = self.env.reset()
      mem.clear()
      ep_reward = 0.
      ep_steps = 0
      self.ep_loss = 0

      time_count = 0
      done = False
      while not done:
        logits, _ = self.local_model(tf_wrap(current_state[None, :]))
        probs = tf.nn.softmax(logits)

        action = np.random.choice(self.action_size, p=probs.numpy()[0])
        new_state, reward, done, _ = self.env.step(action)
        if done:
          reward = -1
        ep_reward += reward
        mem.store(current_state, action, reward)

        if time_count == args.update_freq or done:
          # Calculate gradient wrt to local model. We do so by tracking the
          # variables involved in computing the loss by using tf.GradientTape
          with tf.GradientTape() as tape:
            total_loss = self.compute_loss(done, new_state, mem, args.gamma)
          self.ep_loss += total_loss
          # Calculate local gradients
          grads = tape.gradient(total_loss, self.local_model.trainable_weights)
          # Push local gradients to global model
          self.opt.apply_gradients(zip(grads, self.global_model.trainable_weights))
          # Update local model with new weights
          self.local_model.set_weights(self.global_model.get_weights())

          mem.clear()
          time_count = 0

          if done:  # done and print information
            Worker.global_moving_average_reward = \
              record(Worker.global_episode, ep_reward, self.worker_idx,
                     Worker.global_moving_average_reward, self.result_queue,
                     self.ep_loss, ep_steps)
            # We must use a lock to save our model and to print to prevent data races.
            if ep_reward > Worker.best_score:
              with Worker.save_lock:
                print("Saving best model to {}, "
                      "episode score: {}".format(self.save_dir, ep_reward))
                self.global_model.save_weights(
                    os.path.join(self.save_dir,
                                 'model_{}.h5'.format(self.game_name))
                )
                Worker.best_score = ep_reward
            Worker.global_episode += 1
        ep_steps += 1

        time_count += 1
        current_state = new_state
        total_step += 1
    self.result_queue.put(None)
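  # The None sentinel placed on the result queue tells MasterAgent.train() to stop
  # collecting rewards and join the worker threads.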
  def compute_loss(self,
                   done,
                   new_state,
                   memory,
                   gamma=0.99):
    if done:
      reward_sum = 0.  # terminal
    else:
      reward_sum = self.local_model(tf_wrap(new_state[None, :]))[-1].numpy()[0]

    # Get discounted rewards
    discounted_rewards = []
    for reward in memory.rewards[::-1]:  # reverse buffer r
      reward_sum = reward + gamma * reward_sum
      discounted_rewards.append(reward_sum)
    discounted_rewards.reverse()

    logits, values = self.local_model(tf_wrap(np.vstack(memory.states)))
    # Get our advantages
    advantage = tf_wrap(np.array(discounted_rewards)[:, None]) - values
    # Value loss
    value_loss = advantage ** 2

    # Calculate our policy loss
    actions_one_hot = tf.one_hot(memory.actions, self.action_size, dtype=tf.float32)

    policy_loss = -tf.nn.softmax_cross_entropy_with_logits_v2(labels=actions_one_hot,
                                                              logits=logits)

    total_loss = tf.reduce_mean((value_loss + policy_loss))
    return total_loss
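  # compute_loss() in brief: the return is bootstrapped from the local critic's
  # value of new_state when the episode is not done (0 at a terminal state) and
  # discounted backwards with R_t = r_t + gamma * R_{t+1}. The value loss is the
  # squared advantage (R - V(s))^2, the policy term is the negated softmax
  # cross-entropy between the one-hot actions and the policy logits, and the
  # total loss is the mean of their sum.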
if __name__ == '__main__':
  print(args)
  master = MasterAgent()
  if args.train:
    master.train()
  else:
    master.play()