From 66496fe0d8ddf781f6dd155a306ba9d11212925c Mon Sep 17 00:00:00 2001
From: Brandon Rozek
Date: Mon, 23 Mar 2020 20:02:06 -0400
Subject: [PATCH] Changes from honors thesis

---
 config.py        | 18 +++++++++++++-----
 play.py          | 46 ++++++++++++++++++++--------------------------
 play_env.py      | 17 +++++++----------
 sneaky_config.py | 13 -------------
 4 files changed, 40 insertions(+), 54 deletions(-)
 delete mode 100644 sneaky_config.py

diff --git a/config.py b/config.py
index 30970ff..725888d 100644
--- a/config.py
+++ b/config.py
@@ -4,24 +4,32 @@ config = {}
 config['seed'] = 901
 config['zoom'] = 4
 config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-5
+config['learning_rate'] = 1e-4
 config['target_sync_tau'] = 1e-3
 config['discount_rate'] = 0.99
+config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
+config['replay_skip'] = 4
+config['batch_size'] = 32 * (config['replay_skip'] + 1)
+config['num_sneaky_episodes'] = 10 # per loop
 config['disable_cuda'] = False
 config['seconds_play_per_state'] = 120
+config['seconds_play_per_state'] = 5
 # 30 transitions per second for 120 seconds = 3600 transitions per turn
-config['memory_size'] = 21600 # To hold 6 demonstrations
-config['batch_size'] = 64
-config['num_train_per_demo'] = 115 # 4 looks * transitions per turn / (2 * batch_size)
-
+config['memory_size'] = 86400
+config['dqfd_demo_loss_weight'] = 0.01
+config['dqfd_td_loss_weight'] = 1.
+config['demo_prio_bonus'] = 0.
+config['observed_prio_bonus'] = 0.
 # Prioritized vs Random Sampling
 # 0 - Random sampling
 # 1 - Only the highest prioirities
 config['prioritized_replay_sampling_priority'] = 0.6
+config['prioritized_replay_sampling_priority'] = 0.
 # How important are the weights for the loss?
 # 0 - Treat all losses equally
 # 1 - Lower the importance of high losses
 # Should ideally start from 0 and move your way to 1 to prevent overfitting
 config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
+config['prioritized_replay_weight_importance'] = 0.
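The exploration_rate entry above is later consumed in play.py via next(self.sneaky_actor.epsilon), so the scheduler behaves like an iterator that decays epsilon from 1 toward 0.02 over 10**5 draws. The snippet below is a minimal stand-in sketch of that behaviour under those assumed semantics; it is not rltorch's actual ExponentialScheduler implementation.

def exponential_schedule(initial_value=1.0, end_value=0.02, iterations=10**5):
    # Illustrative stand-in for rltorch.scheduler.ExponentialScheduler (assumed semantics).
    # Yields a value that decays geometrically from initial_value toward end_value.
    rate = (end_value / initial_value) ** (1.0 / iterations)
    value = initial_value
    while True:
        yield value
        value = max(value * rate, end_value)

epsilon = exponential_schedule()
print(next(epsilon))  # ~1.0 on the first draw, approaching 0.02 after 10**5 draws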
diff --git a/play.py b/play.py
index 61a102f..5f680b9 100644
--- a/play.py
+++ b/play.py
@@ -1,15 +1,16 @@
 from gym.spaces.box import Box
 import pygame
 from pygame.locals import VIDEORESIZE
+import rltorch
 from rltorch.memory import ReplayMemory
 
 class Play:
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config):
         self.env = env
         self.action_selector = action_selector
         self.record_lock = record_lock
         self.record_locked = False
-        self.sneaky_agent = sneaky_agent
+        #self.sneaky_agent = sneaky_agent
         self.agent = agent
         self.sneaky_env = sneaky_env
         self.sneaky_actor = sneaky_actor
@@ -19,8 +20,8 @@ class Play:
         self.zoom = config['zoom'] if 'zoom' in config else 1
         self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None
         self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30
-        self.num_sneaky_episodes = sneaky_config['num_sneaky_episodes'] if 'num_sneaky_episodes' in sneaky_config else 10
-        self.replay_skip = sneaky_config['replay_skip'] if 'replay_skip' in sneaky_config else 0
+        self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
+        self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
         self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
         # Initial values...
         self.video_size = (0, 0)
@@ -32,6 +33,7 @@ class Play:
         self.clock = pygame.time.Clock()
         self.sneaky_iteration = 0
         self.paused = False
+        self.space_pressed = False
 
     def _display_arr(self, obs, screen, arr, video_size):
         if obs is not None:
@@ -135,42 +137,39 @@ class Play:
         for event in pygame.event.get():
             if self._process_common_pygame_events(event):
                 continue
-            elif event.type == pygame.KEYDOWN:
-                if event.key == pygame.K_SPACE:
-                    self.pressed_keys.append(event.key)
-            elif event.type == pygame.KEYUP and event.key == pygame.K_SPACE:
-                self.pressed_keys.remove(event.key)
+            elif event.type == pygame.KEYDOWN and event.key == pygame.K_SPACE:
+                self.space_pressed = True
+            elif event.type == pygame.KEYUP and event.key == pygame.K_SPACE and self.space_pressed:
+                self.space_pressed = False
                 self._increment_state()
 
         pygame.display.flip()
         self.clock.tick(self.fps)
 
     def sneaky_train(self):
-        # self.record_lock.acquire()
         # Do a standard RL algorithm process for a certain number of episodes
+        step = 0
         for i in range(self.num_sneaky_episodes):
             print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
 
             # Reset all episode related variables
             prev_obs = self.sneaky_env.reset()
             done = False
-            step = 0
             total_reward = 0
 
             while not done:
                 action = self.sneaky_actor.act(prev_obs)
                 obs, reward, done, _ = self.sneaky_env.step(action)
                 total_reward += reward
-                self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done)
+                self.agent.memory.append(prev_obs, action, reward, obs, done)
                 prev_obs = obs
                 step += 1
                 if step % self.replay_skip == 0:
-                    self.sneaky_agent.learn()
+                    self.agent.learn()
 
             # Finish the previous print with the total reward obtained during the episode
-            print(total_reward, flush = True)
+            print(total_reward, "Epsilon:", next(self.sneaky_actor.epsilon), flush = True)
         self.sneaky_iteration += 1
-        # self.record_lock.release()
 
     def display_text(self, text):
         myfont = pygame.font.SysFont('Comic Sans MS', 50)
@@ -247,7 +246,9 @@ class Play:
 
             # The computer will train for a few episodes without showing to the user.
             # Mainly to speed up the learning process a bit
-            elif self.state is SNEAKY_COMPUTER_PLAY:
+            elif self.state == SNEAKY_COMPUTER_PLAY:
+                # Clear pressed keys in case a key is left inside (the bug where you can't control it since it just holds a button)
+                self.pressed_keys.clear()
                 if not self.record_locked:
                     self.record_lock.acquire()
                     self.record_locked = True
@@ -277,25 +278,18 @@ class Play:
                     self.record_lock.acquire()
                     self.record_locked = True
                     self.transition("Your Turn! Press to Start")
-            
+
+            # Increment the timer if it's the human or shown computer's turn
             if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
-                if self.state == HUMAN_PLAY and isinstance(self.agent.memory, 'DQfDMemory'):
+                if self.state == HUMAN_PLAY and (isinstance(self.agent.memory, rltorch.memory.DQfDMemory) or isinstance(self.agent.memory, rltorch.memory.iDQfDMemory)):
                     self.agent.memory.append_demonstration(prev_obs, action, reward, obs, env_done)
                 else:
                     self.agent.memory.append(prev_obs, action, reward, obs, env_done)
                 i += 1
 
             # Perform a quick learning process and increment the state after a certain time period has passed
             if i % (self.fps * self.seconds_play_per_state) == 0:
-                self.record_lock.acquire()
-                self.display_text("Demo Training...")
-                print("Begin Demonstration Training")
                 print("Number of transitions in buffer: ", len(self.agent.memory), flush = True)
-                for j in range(self.num_train_per_demo):
-                    print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
-                    self.agent.learn()
-                self.clear_text(obs)
-                self.record_lock.release()
                 self._increment_state()
                 i = 0
 
diff --git a/play_env.py b/play_env.py
index 952d663..fe9534c 100644
--- a/play_env.py
+++ b/play_env.py
@@ -16,7 +16,7 @@ from torch.optim import Adam
 
 # Import my custom RL library
 import rltorch
-from rltorch.memory import PrioritizedReplayMemory, ReplayMemory, DQfDMemory
+from rltorch.memory import PrioritizedReplayMemory, ReplayMemory, iDQfDMemory
 from rltorch.action_selector import EpsilonGreedySelector, ArgMaxSelector
 import rltorch.env as E
 import rltorch.network as rn
@@ -37,9 +37,9 @@ from networks import Value
 ## Play Related Classes
 #
 class PlayClass(Thread):
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config):
         super(PlayClass, self).__init__()
-        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
+        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config)
 
     def run(self):
         self.play.start()
@@ -93,7 +93,6 @@ args = vars(parser.parse_args())
 
 ## Main configuration for script
 from config import config
-from sneaky_config import sneaky_config
 
 # Environment name and log directory is vital so show help message and exit if not provided
 if args['environment_name'] is None or args['logdir'] is None:
@@ -152,20 +151,18 @@ net = rn.Network(Value(state_size, action_size),
 target_net = rn.TargetNetwork(net, device = device)
 
 # Relevant components from RLTorch
-memory = DQfDMemory(capacity= config['memory_size'], alpha = config['prioritized_replay_sampling_priority'], max_demo = config['memory_size'] // 2)
+memory = iDQfDMemory(capacity= config['memory_size'], max_demo = config['memory_size'] // 10)
 actor = ArgMaxSelector(net, action_size, device = device)
 agent = rltorch.agents.DQfDAgent(net, memory, config, target_net = target_net)
 
 # Use a different environment for when the computer trains on the side so that the current game state isn't manipuated
 # Also use MaxEnvSkip to speed up processing
 sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
-sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
-sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
+sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
 
-sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
 
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, record_lock, config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
@@ -179,4 +176,4 @@ while playThread.is_alive():
 # Save what's remaining after process died
 record_lock.acquire()
 record_env.log_transitions()
-record_lock.release()
\ No newline at end of file
+record_lock.release()
diff --git a/sneaky_config.py b/sneaky_config.py
deleted file mode 100644
index a83a3f0..0000000
--- a/sneaky_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import rltorch
-
-sneaky_config = {}
-sneaky_config['learning_rate'] = 1e-5
-sneaky_config['target_sync_tau'] = 1e-3
-sneaky_config['discount_rate'] = 0.99
-sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-sneaky_config['replay_skip'] = 29 # Gradient descent every second
-sneaky_config['batch_size'] = 16 * (sneaky_config['replay_skip'] + 1) # Calculated based on memory constraints
-sneaky_config['memory_size'] = 2000 # batch_size * 2 looks = 66 seconds of gameplay
-# Number of episodes for the computer to train the agent without the human seeing
-sneaky_config['num_sneaky_episodes'] = 10
\ No newline at end of file
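For reference, the play loop relies on the memory exposing both append(...) for self-play transitions and append_demonstration(...) for human turns, with max_demo = capacity // 10 reserved for demonstrations. The class below is a minimal illustrative stand-in for that interface under those assumptions; it is not rltorch's iDQfDMemory, which additionally handles prioritized sampling.

from collections import deque

class DemoAwareReplayMemory:
    # Minimal sketch: demonstrations and self-play transitions live in separate
    # bounded buffers, so human demonstrations are never evicted by ordinary experience.
    def __init__(self, capacity, max_demo):
        self.demo = deque(maxlen=max_demo)
        self.observed = deque(maxlen=capacity - max_demo)

    def append(self, state, action, reward, next_state, done):
        self.observed.append((state, action, reward, next_state, done))

    def append_demonstration(self, state, action, reward, next_state, done):
        self.demo.append((state, action, reward, next_state, done))

    def __len__(self):
        return len(self.demo) + len(self.observed)

# Mirrors the sizing used in play_env.py: capacity 86400 with a tenth reserved for demonstrations.
memory = DemoAwareReplayMemory(capacity=86400, max_demo=86400 // 10)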