From 32862e4d798f30eaaab1d29862243cc59aa5e95a Mon Sep 17 00:00:00 2001
From: Brandon Rozek
Date: Sun, 27 Oct 2019 20:42:37 -0400
Subject: [PATCH] Began separating config & networks, F1 for pausing, text functions, and more sneaky agent stuff

---
 .gitignore       |   1 +
 config.py        |  27 ++++++++++
 networks.py      |  51 ++++++++++++++++++
 play.py          | 106 ++++++++++++++++++++++---------------
 play_env.py      | 134 +++++++++++++----------------------------------
 sneaky_config.py |  11 ++++
 6 files changed, 188 insertions(+), 142 deletions(-)
 create mode 100644 config.py
 create mode 100644 networks.py
 create mode 100644 sneaky_config.py

diff --git a/.gitignore b/.gitignore
index 5f46325..eb6de05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__/
 playlogs/
+.vscode/
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..6a1b865
--- /dev/null
+++ b/config.py
@@ -0,0 +1,27 @@
+import rltorch
+
+config = {}
+config['seed'] = 901
+config['seconds_play_per_state'] = 120
+config['zoom'] = 4
+config['environment_name'] = 'PongNoFrameskip-v4'
+config['learning_rate'] = 1e-4
+config['target_sync_tau'] = 1e-3
+config['discount_rate'] = 0.99
+config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+# Number of episodes for the computer to train the agent without the human watching
+config['num_sneaky_episodes'] = 10
+config['num_train_per_demo'] = 50 # 100 total since there are two demo training rounds per cycle
+config['replay_skip'] = 14
+config['batch_size'] = 32 * (config['replay_skip'] + 1)
+config['disable_cuda'] = False
+config['memory_size'] = 10**4
+# Prioritized vs Random Sampling
+# 0 - Random sampling
+# 1 - Only the highest priorities
+config['prioritized_replay_sampling_priority'] = 0.6
+# How important are the weights for the loss?
+# 0 - Treat all losses equally +# 1 - Lower the importance of high losses +# Should ideally start from 0 and move your way to 1 to prevent overfitting +config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5) diff --git a/networks.py b/networks.py new file mode 100644 index 0000000..e02f1b6 --- /dev/null +++ b/networks.py @@ -0,0 +1,51 @@ + +import torch +import torch.nn as nn +import torch.nn.functional as F +import rltorch.network as rn + +class Value(nn.Module): + def __init__(self, state_size, action_size): + super(Value, self).__init__() + self.state_size = state_size + self.action_size = action_size + + self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4)) + self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2)) + self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1)) + + self.fc1 = nn.Linear(3136, 512) + self.fc1_norm = nn.LayerNorm(512) + + self.value_fc = rn.NoisyLinear(512, 512) + self.value_fc_norm = nn.LayerNorm(512) + self.value = nn.Linear(512, 1) + + self.advantage_fc = rn.NoisyLinear(512, 512) + self.advantage_fc_norm = nn.LayerNorm(512) + self.advantage = nn.Linear(512, action_size) + + + def forward(self, x): + x = x.float() / 256 + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + + # Makes batch_size dimension again + x = x.view(-1, 3136) + x = F.relu(self.fc1_norm(self.fc1(x))) + + state_value = F.relu(self.value_fc_norm(self.value_fc(x))) + state_value = self.value(state_value) + + advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) + advantage = self.advantage(advantage) + + x = state_value + advantage - advantage.mean() + + # For debugging purposes... + if torch.isnan(x).any().item(): + print("WARNING NAN IN MODEL DETECTED") + + return x diff --git a/play.py b/play.py index 48b7d52..daea72e 100644 --- a/play.py +++ b/play.py @@ -4,13 +4,14 @@ from pygame.locals import VIDEORESIZE from rltorch.memory import ReplayMemory class Play: - def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config): + def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config): self.env = env self.action_selector = action_selector - self.memory = memory - self.memory_lock = memory_lock + self.record_lock = record_lock + self.sneaky_agent = sneaky_agent self.agent = agent self.sneaky_env = sneaky_env + self.sneaky_actor = sneaky_actor # Get relevant parameters from config or set sane defaults self.transpose = config['transpose'] if 'transpose' in config else True self.fps = config['fps'] if 'fps' in config else 30 @@ -20,6 +21,7 @@ class Play: self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10 self.memory_size = config['memory_size'] if 'memory_size' in config else 10**4 self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0 + self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1 # Initial values... 
self.video_size = (0, 0) self.pressed_keys = [] @@ -28,6 +30,8 @@ class Play: self.running = True self.state = 0 self.clock = pygame.time.Clock() + self.sneaky_iteration = 0 + self.paused = False def _display_arr(self, obs, screen, arr, video_size): if obs is not None: @@ -49,6 +53,9 @@ class Play: self.screen = pygame.display.set_mode(self.video_size) elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE: self.running = False + elif not self.paused and self.state in [0, 3] and event.type == pygame.KEYUP and event.key == pygame.K_F1: + self.paused = True + self.display_text("Paused... Press F1 to unpause.") else: # No event was matched here return False @@ -118,7 +125,7 @@ class Play: def _increment_state(self): self.state = (self.state + 1) % 5 - def pause(self, text = ""): + def transition(self, text = ""): myfont = pygame.font.SysFont('Comic Sans MS', 50) textsurface = myfont.render(text, False, (0, 0, 0)) self.screen.blit(textsurface,(0,0)) @@ -138,15 +145,10 @@ class Play: self.clock.tick(self.fps) def sneaky_train(self): - self.memory_lock.acquire() - - # Backup memory - backup_memory = self.memory - self.memory = ReplayMemory(capacity = self.memory_size) - + self.record_lock.acquire() # Do a standard RL algorithm process for a certain number of episodes for i in range(self.num_sneaky_episodes): - print("Episode: %d / %d, Reward: " % (i + 1, self.num_sneaky_episodes), end = "") + print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "") # Reset all episode releated variables prev_obs = self.sneaky_env.reset() @@ -155,28 +157,40 @@ class Play: total_reward = 0 while not done: - action = self.action_selector.act(prev_obs) + action = self.sneaky_actor.act(prev_obs) obs, reward, done, _ = self.sneaky_env.step(action) total_reward += reward - self.memory.append(prev_obs, action, reward, obs, done) + self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done) prev_obs = obs step += 1 if step % self.replay_skip == 0: - self.agent.learn() + self.sneaky_agent.learn() # Finish the previous print with the total reward obtained during the episode print(total_reward) - - # Reset the memory back to the human demonstration / shown computer data - self.memory = backup_memory - self.memory_lock.release() - - # Thoughts: - # It would be cool instead of throwing away all this new data, we keep just a sample of it - # Not sure if i want all of it because then it'll drown out the expert demonstration data - - + self.sneaky_iteration += 1 + self.record_lock.release() + def display_text(self, text): + myfont = pygame.font.SysFont('Comic Sans MS', 50) + textsurface = myfont.render(text, False, (0, 0, 0)) + self.screen.blit(textsurface,(0,0)) + pygame.display.flip() + + def clear_text(self, obs): + self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size) + pygame.display.flip() + + def process_pause_state(self, obs): + # Process game events + for event in pygame.event.get(): + # This rule needs to be before the common one otherwise unpausing is ignored + if event.type == pygame.KEYUP and event.key == pygame.K_F1: + self.paused = False + self.clear_text(obs) + else: + self._process_common_pygame_events(event) + def start(self): """Allows one to play the game using keyboard. 
To simply play the game use:
@@ -200,57 +214,63 @@ class Play:
         TRANSITION2 = 4
 
         env_done = True
+        prev_obs = None
+        action = None
+        reward = None
         obs = None
         i = 0
+        episode_num = 0
         while self.running:
             # If the environment is done after a turn, reset it so we can keep going
             if env_done:
+                episode_num += 1
+                print("Human/Computer Episode: ", episode_num)
                 obs = self.env.reset()
                 env_done = False
 
+            if self.paused:
+                self.process_pause_state(obs)
+                continue
             if self.state is HUMAN_PLAY:
-                _, _, _, obs, env_done = self._human_play(obs)
+                prev_obs, action, reward, obs, env_done = self._human_play(obs)
 
             # The computer will train for a few episodes without showing to the user.
             # Mainly to speed up the learning process a bit
             elif self.state is SNEAKY_COMPUTER_PLAY:
                 print("Sneaky Computer Time")
-
-                # Display "Training..." text to user
-                myfont = pygame.font.SysFont('Comic Sans MS', 50)
-                textsurface = myfont.render("Training....", False, (0, 0, 0))
-                self.screen.blit(textsurface,(0,0))
-                pygame.display.flip()
+                self.display_text("Training...")
 
                 # Have the agent play a few rounds without showing to the user
                 self.sneaky_train()
 
-                # To take away training text
-                self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size)
-                pygame.display.flip()
-
-                # Go to the next step immediately
+                self.clear_text(obs)
                 self._increment_state()
 
             elif self.state is TRANSITION:
-                self.pause("Computers Turn! Press to Start")
+                self.transition("Computer's Turn! Press to Start")
 
             elif self.state is COMPUTER_PLAY:
-                _, _, _, obs, env_done = self._computer_play(obs)
+                prev_obs, action, reward, obs, env_done = self._computer_play(obs)
 
             elif self.state is TRANSITION2:
-                self.pause("Your Turn! Press to Start")
+                self.transition("Your Turn! Press to Start")
 
             # Increment the timer if it's the human or shown computer's turn
             if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
+                self.agent.memory.append(prev_obs, action, reward, obs, env_done)
                 i += 1
 
                 # Perform a quick learning process and increment the state after a certain time period has passed
                 if i % (self.fps * self.seconds_play_per_state) == 0:
-                    self.memory_lock.acquire()
-                    print("Number of transitions in buffer: ", len(self.memory))
-                    self.agent.learn()
-                    self.memory_lock.release()
+                    self.record_lock.acquire()
+                    self.display_text("Demo Training...")
+                    print("Begin Demonstration Training")
+                    print("Number of transitions in buffer: ", len(self.agent.memory))
+                    for j in range(self.num_train_per_demo):
+                        print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
+                        self.agent.learn()
+                    self.clear_text(obs)
+                    self.record_lock.release()
                     self._increment_state()
                     i = 0
diff --git a/play_env.py b/play_env.py
index 815115f..f06f8ee 100644
--- a/play_env.py
+++ b/play_env.py
@@ -1,4 +1,7 @@
+# TODO: I'm kinda using this project to pilot the whole config/network/example separation
+# The motivation behind this is that the file sizes are getting large and it's increasing cognitive load :(
+
 # Import Python Standard Libraries
 from threading import Thread, Lock
 from argparse import ArgumentParser
@@ -10,13 +13,11 @@ from numpy import array as np_array
 from numpy import save as np_save
 import torch
 from torch.optim import Adam
-import torch.nn as nn
-import torch.nn.functional as F
 
 # Import my custom RL library
 import rltorch
-from rltorch.memory import PrioritizedReplayMemory
-from rltorch.action_selector import EpsilonGreedySelector
+from rltorch.memory import PrioritizedReplayMemory, ReplayMemory
+from rltorch.action_selector import EpsilonGreedySelector, 
ArgMaxSelector import rltorch.env as E import rltorch.network as rn @@ -28,73 +29,24 @@ import play # -## Networks +## Networks (Probably want to move this to config file) # -class Value(nn.Module): - def __init__(self, state_size, action_size): - super(Value, self).__init__() - self.state_size = state_size - self.action_size = action_size - - self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4)) - self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2)) - self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1)) - - self.fc1 = nn.Linear(3136, 512) - self.fc1_norm = nn.LayerNorm(512) - - self.value_fc = rn.NoisyLinear(512, 512) - self.value_fc_norm = nn.LayerNorm(512) - self.value = nn.Linear(512, 1) - - self.advantage_fc = rn.NoisyLinear(512, 512) - self.advantage_fc_norm = nn.LayerNorm(512) - self.advantage = nn.Linear(512, action_size) - - - def forward(self, x): - x = x.float() / 256 - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - - # Makes batch_size dimension again - x = x.view(-1, 3136) - x = F.relu(self.fc1_norm(self.fc1(x))) - - state_value = F.relu(self.value_fc_norm(self.value_fc(x))) - state_value = self.value(state_value) - - advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) - advantage = self.advantage(advantage) - - x = state_value + advantage - advantage.mean() - - # For debugging purposes... - if torch.isnan(x).any().item(): - print("WARNING NAN IN MODEL DETECTED") - - return x - +from networks import Value # ## Play Related Classes # -Transition = namedtuple('Transition', - ('state', 'action', 'reward', 'next_state', 'done')) - class PlayClass(Thread): - def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config): + def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config): super(PlayClass, self).__init__() - self.play = play.Play(env, action_selector, memory, memory_lock, agent, sneaky_env, config) + self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config) def run(self): self.play.start() class Record(GymWrapper): - def __init__(self, env, memory, memory_lock, args): + def __init__(self, env, memory, args): GymWrapper.__init__(self, env) - self.memory_lock = memory_lock self.memory = memory self.skipframes = args['skip'] self.environment_name = args['environment_name'] @@ -110,14 +62,11 @@ class Record(GymWrapper): self.current_i += 1 # Don't add to memory until a certain number of frames is reached if self.current_i % self.skipframes == 0: - self.memory_lock.acquire() - self.memory.append(state, action, reward, next_state, done) - self.memory_lock.release() + self.memory.append((state, action, reward, next_state, done)) self.current_i = 0 return next_state, reward, done, info def log_transitions(self): - self.memory_lock.acquire() if len(self.memory) > 0: basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s")) print("Base Filename: ", basename) @@ -128,7 +77,6 @@ class Record(GymWrapper): np_save(basename + "-nextstate.npy", np_array(next_state), allow_pickle = False) np_save(basename + "-done.npy", np_array(done), allow_pickle = False) self.memory.clear() - self.memory_lock.release() ## Parsing arguments @@ -141,31 +89,8 @@ parser.add_argument("--model", type=str, help = "The path location of the PyTorc args = vars(parser.parse_args()) ## Main configuration for script -config = {} 
-config['seed'] = 901
-config['seconds_play_per_state'] = 60
-config['zoom'] = 4
-config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-4
-config['target_sync_tau'] = 1e-3
-config['discount_rate'] = 0.99
-config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-config['num_sneaky_episodes'] = 20
-config['replay_skip'] = 14
-config['batch_size'] = 32 * (config['replay_skip'] + 1)
-config['disable_cuda'] = False
-config['memory_size'] = 10**4
-# Prioritized vs Random Sampling
-# 0 - Random sampling
-# 1 - Only the highest prioirities
-config['prioritized_replay_sampling_priority'] = 0.6
-# How important are the weights for the loss?
-# 0 - Treat all losses equally
-# 1 - Lower the importance of high losses
-# Should ideally start from 0 and move your way to 1 to prevent overfitting
-config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
-
+from config import config
+from sneaky_config import sneaky_config
 
 # Environment name and log directory is vital so show help message and exit if not provided
 if args['environment_name'] is None or args['logdir'] is None:
@@ -175,7 +100,7 @@ if args['environment_name'] is None or args['logdir'] is None:
 # Number of frames to skip when recording and fps can have sane defaults
 if args['skip'] is None:
     args['skip'] = 3
-if args['fps'] is None:
+if args.get('fps') is None:
     args['fps'] = 30
 
 
@@ -196,22 +121,20 @@ def wrap_preprocessing(env, MaxAndSkipEnv = False):
             , 4)
         )
 
-
 ## Set up environment to be recorded and preprocessed
-memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
-memory_lock = Lock()
-env = Record(makeEnv(args['environment_name']), memory, memory_lock, args)
+record_memory = []
+record_lock = Lock()
+env = Record(makeEnv(args['environment_name']), record_memory, args)
+
 # Bind record_env to current env so that we can reference log_transitions easier later
 record_env = env
+
 # Use native gym monitor to get video recording
 env = GymMonitor(env, args['logdir'], force=True)
+
 # Preprocess enviornment
 env = wrap_preprocessing(env)
 
-# Use a different environment for when the computer trains on the side so that the current game state isn't manipuated
-# Also use MaxEnvSkip to speed up processing
-sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
-
 # Set seeds
 rltorch.set_seed(config['seed'])
 env.seed(config['seed'])
@@ -226,18 +149,31 @@ net = rn.Network(Value(state_size, action_size),
 target_net = rn.TargetNetwork(net, device = device)
 
 # Relevant components from RLTorch
-actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
+memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+actor = ArgMaxSelector(net, action_size, device = device)
 agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net)
 
+# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
+# Also use MaxAndSkipEnv to speed up processing
+sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
+sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
+sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
+
+sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
+
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, memory, memory_lock, agent, sneaky_env, config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
 while playThread.is_alive():
     playThread.join(60)
+    record_lock.acquire()
     print("Logging....", end = " ")
     record_env.log_transitions()
+    record_lock.release()
 
 # Save what's remaining after process died
+record_lock.acquire()
 record_env.log_transitions()
+record_lock.release()
\ No newline at end of file
diff --git a/sneaky_config.py b/sneaky_config.py
new file mode 100644
index 0000000..f72b2af
--- /dev/null
+++ b/sneaky_config.py
@@ -0,0 +1,11 @@
+import rltorch
+
+sneaky_config = {}
+sneaky_config['learning_rate'] = 1e-4
+sneaky_config['target_sync_tau'] = 1e-3
+sneaky_config['discount_rate'] = 0.99
+sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+# Number of environment steps to take between each learning update during sneaky training
+sneaky_config['replay_skip'] = 14
+sneaky_config['batch_size'] = 32 * (sneaky_config['replay_skip'] + 1)
+sneaky_config['memory_size'] = 10**4
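A quick sanity check for the split-out modules (not part of the commit itself; the 4x84x84 frame-stack shape and Pong's 6-action space are assumptions based on the preprocessing and environment used above):

import torch
from config import config
from networks import Value

# wrap_preprocessing stacks 4 grayscale 84x84 frames; PongNoFrameskip-v4 exposes 6 discrete actions
state_size = (4, 84, 84)
action_size = 6

net = Value(state_size, action_size)
# Feed a dummy uint8 batch through the dueling network; forward() rescales it with x.float() / 256
fake_batch = torch.zeros((config['batch_size'], *state_size), dtype=torch.uint8)
q_values = net(fake_batch)
print(q_values.shape)  # torch.Size([480, 6]) with the default batch_size of 32 * (14 + 1)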