From 66496fe0d8ddf781f6dd155a306ba9d11212925c Mon Sep 17 00:00:00 2001
From: Brandon Rozek
Date: Mon, 23 Mar 2020 20:02:06 -0400
Subject: [PATCH] Changes from honors thesis

---
 config.py        | 18 +++++++++++++-----
 play.py          | 46 ++++++++++++++++++++--------------------------
 play_env.py      | 17 +++++++----------
 sneaky_config.py | 13 -------------
 4 files changed, 40 insertions(+), 54 deletions(-)
 delete mode 100644 sneaky_config.py

diff --git a/config.py b/config.py
index 30970ff..725888d 100644
--- a/config.py
+++ b/config.py
@@ -4,24 +4,32 @@ config = {}
 config['seed'] = 901
 config['zoom'] = 4
 config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-5
+config['learning_rate'] = 1e-4
 config['target_sync_tau'] = 1e-3
 config['discount_rate'] = 0.99
+config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
+config['replay_skip'] = 4
+config['batch_size'] = 32 * (config['replay_skip'] + 1)
+config['num_sneaky_episodes'] = 10 # per loop
 config['disable_cuda'] = False
 config['seconds_play_per_state'] = 120
+config['seconds_play_per_state'] = 5
 # 30 transitions per second for 120 seconds = 3600 transitions per turn
-config['memory_size'] = 21600 # To hold 6 demonstrations
-config['batch_size'] = 64
-config['num_train_per_demo'] = 115 # 4 looks * transitions per turn / (2 * batch_size)
-
+config['memory_size'] = 86400
+config['dqfd_demo_loss_weight'] = 0.01
+config['dqfd_td_loss_weight'] = 1.
+config['demo_prio_bonus'] = 0.
+config['observed_prio_bonus'] = 0.
 # Prioritized vs Random Sampling
 # 0 - Random sampling
 # 1 - Only the highest prioirities
 config['prioritized_replay_sampling_priority'] = 0.6
+config['prioritized_replay_sampling_priority'] = 0.
 # How important are the weights for the loss?
 # 0 - Treat all losses equally
 # 1 - Lower the importance of high losses
 # Should ideally start from 0 and move your way to 1 to prevent overfitting
 config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
+config['prioritized_replay_weight_importance'] = 0.
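The exploration_rate entry above is later consumed in play.py via next(self.sneaky_actor.epsilon), so the scheduler behaves like an iterator that decays epsilon from 1 toward 0.02 over 10**5 draws. The snippet below is a minimal stand-in sketch of that behaviour under those assumed semantics; it is not rltorch's actual ExponentialScheduler implementation.

def exponential_schedule(initial_value=1.0, end_value=0.02, iterations=10**5):
    # Illustrative stand-in for rltorch.scheduler.ExponentialScheduler (assumed semantics).
    # Yields a value that decays geometrically from initial_value toward end_value.
    rate = (end_value / initial_value) ** (1.0 / iterations)
    value = initial_value
    while True:
        yield value
        value = max(value * rate, end_value)

epsilon = exponential_schedule()
print(next(epsilon))  # ~1.0 on the first draw, approaching 0.02 after 10**5 draws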
diff --git a/play.py b/play.py
index 61a102f..5f680b9 100644
--- a/play.py
+++ b/play.py
@@ -1,15 +1,16 @@
 from gym.spaces.box import Box
 import pygame
 from pygame.locals import VIDEORESIZE
+import rltorch
 from rltorch.memory import ReplayMemory
 
 class Play:
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config):
         self.env = env
         self.action_selector = action_selector
         self.record_lock = record_lock
         self.record_locked = False
-        self.sneaky_agent = sneaky_agent
+        #self.sneaky_agent = sneaky_agent
         self.agent = agent
         self.sneaky_env = sneaky_env
         self.sneaky_actor = sneaky_actor
@@ -19,8 +20,8 @@ class Play:
         self.zoom = config['zoom'] if 'zoom' in config else 1
         self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None
         self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30
-        self.num_sneaky_episodes = sneaky_config['num_sneaky_episodes'] if 'num_sneaky_episodes' in sneaky_config else 10
-        self.replay_skip = sneaky_config['replay_skip'] if 'replay_skip' in sneaky_config else 0
+        self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
+        self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
         self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
         # Initial values...
         self.video_size = (0, 0)
@@ -32,6 +33,7 @@ class Play:
         self.clock = pygame.time.Clock()
         self.sneaky_iteration = 0
         self.paused = False
+        self.space_pressed = False
 
     def _display_arr(self, obs, screen, arr, video_size):
         if obs is not None:
@@ -135,42 +137,39 @@ class Play:
         for event in pygame.event.get():
             if self._process_common_pygame_events(event):
                 continue
-            elif event.type == pygame.KEYDOWN:
-                if event.key == pygame.K_SPACE:
-                    self.pressed_keys.append(event.key)
-            elif event.type == pygame.KEYUP and event.key == pygame.K_SPACE:
-                self.pressed_keys.remove(event.key)
+            elif event.type == pygame.KEYDOWN and event.key == pygame.K_SPACE:
+                self.space_pressed = True
+            elif event.type == pygame.KEYUP and event.key == pygame.K_SPACE and self.space_pressed:
+                self.space_pressed = False
                 self._increment_state()
 
         pygame.display.flip()
         self.clock.tick(self.fps)
 
     def sneaky_train(self):
-        # self.record_lock.acquire()
         # Do a standard RL algorithm process for a certain number of episodes
+        step = 0
         for i in range(self.num_sneaky_episodes):
             print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
 
             # Reset all episode related variables
             prev_obs = self.sneaky_env.reset()
             done = False
-            step = 0
             total_reward = 0
 
             while not done:
                 action = self.sneaky_actor.act(prev_obs)
                 obs, reward, done, _ = self.sneaky_env.step(action)
                 total_reward += reward
-                self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done)
+                self.agent.memory.append(prev_obs, action, reward, obs, done)
                 prev_obs = obs
                 step += 1
                 if step % self.replay_skip == 0:
-                    self.sneaky_agent.learn()
+                    self.agent.learn()
 
             # Finish the previous print with the total reward obtained during the episode
-            print(total_reward, flush = True)
+            print(total_reward, "Epsilon:", next(self.sneaky_actor.epsilon), flush = True)
         self.sneaky_iteration += 1
-        # self.record_lock.release()
 
     def display_text(self, text):
         myfont = pygame.font.SysFont('Comic Sans MS', 50)
@@ -247,7 +246,9 @@ class Play:
 
             # The computer will train for a few episodes without showing to the user.
             # Mainly to speed up the learning process a bit
-            elif self.state is SNEAKY_COMPUTER_PLAY:
+            elif self.state == SNEAKY_COMPUTER_PLAY:
+                # Clear pressed keys in case a key is left inside (the bug where you can't control it since it just holds a button)
+                self.pressed_keys.clear()
                 if not self.record_locked:
                     self.record_lock.acquire()
                     self.record_locked = True
@@ -277,25 +278,18 @@ class Play:
                     self.record_lock.acquire()
                     self.record_locked = True
                     self.transition("Your Turn! Press to Start")
-            
+
+            # Increment the timer if it's the human or shown computer's turn
             if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
-                if self.state == HUMAN_PLAY and isinstance(self.agent.memory, 'DQfDMemory'):
+                if self.state == HUMAN_PLAY and (isinstance(self.agent.memory, rltorch.memory.DQfDMemory) or isinstance(self.agent.memory, rltorch.memory.iDQfDMemory)):
                     self.agent.memory.append_demonstration(prev_obs, action, reward, obs, env_done)
                 else:
                     self.agent.memory.append(prev_obs, action, reward, obs, env_done)
                 i += 1
 
             # Perform a quick learning process and increment the state after a certain time period has passed
             if i % (self.fps * self.seconds_play_per_state) == 0:
-                self.record_lock.acquire()
-                self.display_text("Demo Training...")
-                print("Begin Demonstration Training")
                 print("Number of transitions in buffer: ", len(self.agent.memory), flush = True)
-                for j in range(self.num_train_per_demo):
-                    print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
-                    self.agent.learn()
-                self.clear_text(obs)
-                self.record_lock.release()
                 self._increment_state()
                 i = 0
 
diff --git a/play_env.py b/play_env.py
index 952d663..fe9534c 100644
--- a/play_env.py
+++ b/play_env.py
@@ -16,7 +16,7 @@ from torch.optim import Adam
 
 # Import my custom RL library
 import rltorch
-from rltorch.memory import PrioritizedReplayMemory, ReplayMemory, DQfDMemory
+from rltorch.memory import PrioritizedReplayMemory, ReplayMemory, iDQfDMemory
 from rltorch.action_selector import EpsilonGreedySelector, ArgMaxSelector
 import rltorch.env as E
 import rltorch.network as rn
@@ -37,9 +37,9 @@ from networks import Value
 ## Play Related Classes
 #
 class PlayClass(Thread):
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config):
         super(PlayClass, self).__init__()
-        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
+        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config)
 
     def run(self):
         self.play.start()
@@ -93,7 +93,6 @@ args = vars(parser.parse_args())
 
 ## Main configuration for script
 from config import config
-from sneaky_config import sneaky_config
 
 # Environment name and log directory is vital so show help message and exit if not provided
 if args['environment_name'] is None or args['logdir'] is None:
@@ -152,20 +151,18 @@ net = rn.Network(Value(state_size, action_size),
 target_net = rn.TargetNetwork(net, device = device)
 
 # Relevant components from RLTorch
-memory = DQfDMemory(capacity= config['memory_size'], alpha = config['prioritized_replay_sampling_priority'], max_demo = config['memory_size'] // 2)
+memory = iDQfDMemory(capacity= config['memory_size'], max_demo = config['memory_size'] // 10)
 actor = ArgMaxSelector(net, action_size, device = device)
 agent = rltorch.agents.DQfDAgent(net, memory, config, target_net = target_net)
 
 # Use a different environment for when the computer trains on the side so that the current game state isn't manipuated
 # Also use MaxEnvSkip to speed up processing
 sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
-sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
-sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
+sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
 
-sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
 
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, record_lock, config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
@@ -179,4 +176,4 @@ while playThread.is_alive():
 # Save what's remaining after process died
 record_lock.acquire()
 record_env.log_transitions()
-record_lock.release()
\ No newline at end of file
+record_lock.release()
diff --git a/sneaky_config.py b/sneaky_config.py
deleted file mode 100644
index a83a3f0..0000000
--- a/sneaky_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import rltorch
-
-sneaky_config = {}
-sneaky_config['learning_rate'] = 1e-5
-sneaky_config['target_sync_tau'] = 1e-3
-sneaky_config['discount_rate'] = 0.99
-sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-sneaky_config['replay_skip'] = 29 # Gradient descent every second
-sneaky_config['batch_size'] = 16 * (sneaky_config['replay_skip'] + 1) # Calculated based on memory constraints
-sneaky_config['memory_size'] = 2000 # batch_size * 2 looks = 66 seconds of gameplay
-# Number of episodes for the computer to train the agent without the human seeing
-sneaky_config['num_sneaky_episodes'] = 10
\ No newline at end of file
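For reference, the play loop relies on the memory exposing both append(...) for self-play transitions and append_demonstration(...) for human turns, with max_demo = capacity // 10 reserved for demonstrations. The class below is a minimal illustrative stand-in for that interface under those assumptions; it is not rltorch's iDQfDMemory, which additionally handles prioritized sampling.

from collections import deque

class DemoAwareReplayMemory:
    # Minimal sketch: demonstrations and self-play transitions live in separate
    # bounded buffers, so human demonstrations are never evicted by ordinary experience.
    def __init__(self, capacity, max_demo):
        self.demo = deque(maxlen=max_demo)
        self.observed = deque(maxlen=capacity - max_demo)

    def append(self, state, action, reward, next_state, done):
        self.observed.append((state, action, reward, next_state, done))

    def append_demonstration(self, state, action, reward, next_state, done):
        self.demo.append((state, action, reward, next_state, done))

    def __len__(self):
        return len(self.demo) + len(self.observed)

# Mirrors the sizing used in play_env.py: capacity 86400 with a tenth reserved for demonstrations.
memory = DemoAwareReplayMemory(capacity=86400, max_demo=86400 // 10)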