Changes from honors thesis

Brandon Rozek 2020-03-23 20:02:06 -04:00
parent a44b981e55
commit 66496fe0d8
4 changed files with 40 additions and 54 deletions


@@ -4,24 +4,32 @@ config = {}
 config['seed'] = 901
 config['zoom'] = 4
 config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-5
+config['learning_rate'] = 1e-4
 config['target_sync_tau'] = 1e-3
 config['discount_rate'] = 0.99
+config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
+config['replay_skip'] = 4
+config['batch_size'] = 32 * (config['replay_skip'] + 1)
+config['num_sneaky_episodes'] = 10 # per loop
 config['disable_cuda'] = False
 config['seconds_play_per_state'] = 120
+config['seconds_play_per_state'] = 5
 # 30 transitions per second for 120 seconds = 3600 transitions per turn
-config['memory_size'] = 21600 # To hold 6 demonstrations
-config['batch_size'] = 64
-config['num_train_per_demo'] = 115 # 4 looks * transitions per turn / (2 * batch_size)
+config['memory_size'] = 86400
+config['dqfd_demo_loss_weight'] = 0.01
+config['dqfd_td_loss_weight'] = 1.
+config['demo_prio_bonus'] = 0.
+config['observed_prio_bonus'] = 0.
 
 # Prioritized vs Random Sampling
 # 0 - Random sampling
 # 1 - Only the highest prioirities
 config['prioritized_replay_sampling_priority'] = 0.6
+config['prioritized_replay_sampling_priority'] = 0.
 
 # How important are the weights for the loss?
 # 0 - Treat all losses equally
 # 1 - Lower the importance of high losses
 # Should ideally start from 0 and move your way to 1 to prevent overfitting
 config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
+config['prioritized_replay_weight_importance'] = 0.
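
As a quick check of the sizing comments above, the arithmetic works out as follows. This is an illustration only; it assumes the 30 transitions-per-second figure stated in the comment and is not part of the commit itself.

# Back-of-the-envelope buffer/batch sizing (assumption: ~30 transitions per second, as in the comment).
transitions_per_second = 30
transitions_per_turn = transitions_per_second * 120         # 3600 transitions per 120-second turn
demonstrations_held = 86400 // transitions_per_turn         # memory_size of 86400 holds ~24 such turns
batch_size = 32 * (4 + 1)                                    # replay_skip = 4 gives a batch of 160
print(transitions_per_turn, demonstrations_held, batch_size) # 3600 24 160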

play.py

@@ -1,15 +1,16 @@
 from gym.spaces.box import Box
 import pygame
 from pygame.locals import VIDEORESIZE
+import rltorch
 from rltorch.memory import ReplayMemory
 
 class Play:
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config):
         self.env = env
         self.action_selector = action_selector
         self.record_lock = record_lock
         self.record_locked = False
-        self.sneaky_agent = sneaky_agent
+        #self.sneaky_agent = sneaky_agent
         self.agent = agent
         self.sneaky_env = sneaky_env
         self.sneaky_actor = sneaky_actor
@@ -19,8 +20,8 @@ class Play:
         self.zoom = config['zoom'] if 'zoom' in config else 1
         self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None
         self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30
-        self.num_sneaky_episodes = sneaky_config['num_sneaky_episodes'] if 'num_sneaky_episodes' in sneaky_config else 10
-        self.replay_skip = sneaky_config['replay_skip'] if 'replay_skip' in sneaky_config else 0
+        self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
+        self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
         self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
         # Initial values...
         self.video_size = (0, 0)
@@ -32,6 +33,7 @@ class Play:
         self.clock = pygame.time.Clock()
         self.sneaky_iteration = 0
         self.paused = False
+        self.space_pressed = False
 
     def _display_arr(self, obs, screen, arr, video_size):
         if obs is not None:
@@ -135,42 +137,39 @@ class Play:
         for event in pygame.event.get():
             if self._process_common_pygame_events(event):
                 continue
-            elif event.type == pygame.KEYDOWN:
-                if event.key == pygame.K_SPACE:
-                    self.pressed_keys.append(event.key)
-            elif event.type == pygame.KEYUP and event.key == pygame.K_SPACE:
-                self.pressed_keys.remove(event.key)
+            elif event.type == pygame.KEYDOWN and event.key == pygame.K_SPACE:
+                self.space_pressed = True
+            elif event.type == pygame.KEYUP and event.key == pygame.K_SPACE and self.space_pressed:
+                self.space_pressed = False
                 self._increment_state()
 
         pygame.display.flip()
         self.clock.tick(self.fps)
 
     def sneaky_train(self):
-        # self.record_lock.acquire()
         # Do a standard RL algorithm process for a certain number of episodes
+        step = 0
        for i in range(self.num_sneaky_episodes):
             print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
 
             # Reset all episode related variables
             prev_obs = self.sneaky_env.reset()
             done = False
-            step = 0
             total_reward = 0
 
             while not done:
                 action = self.sneaky_actor.act(prev_obs)
                 obs, reward, done, _ = self.sneaky_env.step(action)
                 total_reward += reward
-                self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done)
+                self.agent.memory.append(prev_obs, action, reward, obs, done)
                 prev_obs = obs
                 step += 1
                 if step % self.replay_skip == 0:
-                    self.sneaky_agent.learn()
+                    self.agent.learn()
 
             # Finish the previous print with the total reward obtained during the episode
-            print(total_reward, flush = True)
+            print(total_reward, "Epsilon:", next(self.sneaky_actor.epsilon), flush = True)
         self.sneaky_iteration += 1
-        # self.record_lock.release()
 
     def display_text(self, text):
         myfont = pygame.font.SysFont('Comic Sans MS', 50)
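
The "Epsilon:" value printed after each sneaky episode comes from the exploration_rate schedule configured above (an ExponentialScheduler decaying from 1 to 0.02 over 10**5 iterations). A generic exponential-decay sketch, purely for illustration; rltorch's actual ExponentialScheduler internals are not shown in this diff.

import math

def exponential_epsilon(step, initial=1.0, end=0.02, iterations=10**5):
    # Choose the decay rate so that epsilon reaches `end` exactly at `iterations`.
    rate = math.log(end / initial) / iterations
    return max(end, initial * math.exp(rate * step))

# exponential_epsilon(0) == 1.0; exponential_epsilon(10**5) is roughly 0.02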
@@ -247,7 +246,9 @@ class Play:
 
         # The computer will train for a few episodes without showing to the user.
         # Mainly to speed up the learning process a bit
-        elif self.state is SNEAKY_COMPUTER_PLAY:
+        elif self.state == SNEAKY_COMPUTER_PLAY:
+            # Clear pressed keys in case a key is left inside (the bug where you can't control it since it just holds a button)
+            self.pressed_keys.clear()
             if not self.record_locked:
                 self.record_lock.acquire()
                 self.record_locked = True
@@ -277,25 +278,18 @@ class Play:
                 self.record_lock.acquire()
                 self.record_locked = True
                 self.transition("Your Turn! Press <Space> to Start")
 
         # Increment the timer if it's the human or shown computer's turn
         if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
-            if self.state == HUMAN_PLAY and isinstance(self.agent.memory, 'DQfDMemory'):
+            if self.state == HUMAN_PLAY and (isinstance(self.agent.memory, rltorch.memory.DQfDMemory) or isinstance(self.agent.memory, rltorch.memory.iDQfDMemory)):
                 self.agent.memory.append_demonstration(prev_obs, action, reward, obs, env_done)
             else:
                 self.agent.memory.append(prev_obs, action, reward, obs, env_done)
             i += 1
 
             # Perform a quick learning process and increment the state after a certain time period has passed
             if i % (self.fps * self.seconds_play_per_state) == 0:
-                self.record_lock.acquire()
-                self.display_text("Demo Training...")
-                print("Begin Demonstration Training")
                 print("Number of transitions in buffer: ", len(self.agent.memory), flush = True)
-                for j in range(self.num_train_per_demo):
-                    print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
-                    self.agent.learn()
-                self.clear_text(obs)
-                self.record_lock.release()
                 self._increment_state()
                 i = 0
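
The HUMAN_PLAY branch above now routes human transitions through append_demonstration whenever the agent's memory is a DQfD-style buffer, keeping them separate from the agent's own experience. A minimal sketch of that interface, based only on the calls visible in this diff; the real DQfDMemory/iDQfDMemory classes live in rltorch and this stand-in is hypothetical.

from collections import deque

class DemoReplaySketch:
    """Hypothetical stand-in for a DQfD-style memory: demonstrations live in a
    separately capped region so ordinary agent experience never evicts them."""
    def __init__(self, capacity, max_demo):
        self.demos = deque(maxlen=max_demo)                  # human demonstrations
        self.experience = deque(maxlen=capacity - max_demo)  # agent's own transitions

    def append(self, state, action, reward, next_state, done):
        self.experience.append((state, action, reward, next_state, done))

    def append_demonstration(self, state, action, reward, next_state, done):
        self.demos.append((state, action, reward, next_state, done))

    def __len__(self):
        return len(self.demos) + len(self.experience)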


@@ -16,7 +16,7 @@ from torch.optim import Adam
 
 # Import my custom RL library
 import rltorch
-from rltorch.memory import PrioritizedReplayMemory, ReplayMemory, DQfDMemory
+from rltorch.memory import PrioritizedReplayMemory, ReplayMemory, iDQfDMemory
 from rltorch.action_selector import EpsilonGreedySelector, ArgMaxSelector
 import rltorch.env as E
 import rltorch.network as rn
@@ -37,9 +37,9 @@ from networks import Value
 ## Play Related Classes
 #
 class PlayClass(Thread):
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config):
         super(PlayClass, self).__init__()
-        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
+        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config)
 
     def run(self):
         self.play.start()
@@ -93,7 +93,6 @@ args = vars(parser.parse_args())
 
 ## Main configuration for script
 from config import config
-from sneaky_config import sneaky_config
 
 # Environment name and log directory is vital so show help message and exit if not provided
 if args['environment_name'] is None or args['logdir'] is None:
@@ -152,20 +151,18 @@ net = rn.Network(Value(state_size, action_size),
 target_net = rn.TargetNetwork(net, device = device)
 
 # Relevant components from RLTorch
-memory = DQfDMemory(capacity= config['memory_size'], alpha = config['prioritized_replay_sampling_priority'], max_demo = config['memory_size'] // 2)
+memory = iDQfDMemory(capacity= config['memory_size'], max_demo = config['memory_size'] // 10)
 actor = ArgMaxSelector(net, action_size, device = device)
 agent = rltorch.agents.DQfDAgent(net, memory, config, target_net = target_net)
 
 # Use a different environment for when the computer trains on the side so that the current game state isn't manipuated
 # Also use MaxEnvSkip to speed up processing
 sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
-sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
-sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
-sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
+sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
 
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, record_lock, config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
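
The new dqfd_td_loss_weight and dqfd_demo_loss_weight config keys suggest the DQfDAgent blends an ordinary TD loss with a supervised large-margin term on demonstration transitions, in the spirit of DQfD (Hester et al.). A rough PyTorch sketch of that combination, for illustration only; rltorch's DQfDAgent may define or weight these terms differently.

import torch

def large_margin_loss(q_values, demo_actions, margin=0.8):
    # max_a [Q(s, a) + l(a_demo, a)] - Q(s, a_demo), with l = margin for a != a_demo, else 0.
    margins = torch.full_like(q_values, margin)
    margins.scatter_(1, demo_actions.unsqueeze(1), 0.0)
    best = (q_values + margins).max(dim=1).values
    taken = q_values.gather(1, demo_actions.unsqueeze(1)).squeeze(1)
    return (best - taken).mean()

def combined_loss(td_loss, demo_loss, td_weight=1.0, demo_weight=0.01):
    # Weights mirror config['dqfd_td_loss_weight'] and config['dqfd_demo_loss_weight'].
    return td_weight * td_loss + demo_weight * demo_loss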
@@ -179,4 +176,4 @@ while playThread.is_alive():
 # Save what's remaining after process died
 record_lock.acquire()
 record_env.log_transitions()
 record_lock.release()


@@ -1,13 +0,0 @@
-import rltorch
-
-sneaky_config = {}
-sneaky_config['learning_rate'] = 1e-5
-sneaky_config['target_sync_tau'] = 1e-3
-sneaky_config['discount_rate'] = 0.99
-sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-sneaky_config['replay_skip'] = 29 # Gradient descent every second
-sneaky_config['batch_size'] = 16 * (sneaky_config['replay_skip'] + 1) # Calculated based on memory constraints
-sneaky_config['memory_size'] = 2000 # batch_size * 2 looks = 66 seconds of gameplay
-# Number of episodes for the computer to train the agent without the human seeing
-sneaky_config['num_sneaky_episodes'] = 10