Began separating config & networks, F1 for pausing, text functions, and more sneaky agent stuff
parent d78892e62c, commit 32862e4d79
6 changed files with 188 additions and 142 deletions
.gitignore (vendored) | 1
@@ -1,2 +1,3 @@
 __pycache__/
 playlogs/
+.vscode/
config.py (new file) | 27
@@ -0,0 +1,27 @@
+import rltorch
+
+config = {}
+config['seed'] = 901
+config['seconds_play_per_state'] = 120
+config['zoom'] = 4
+config['environment_name'] = 'PongNoFrameskip-v4'
+config['learning_rate'] = 1e-4
+config['target_sync_tau'] = 1e-3
+config['discount_rate'] = 0.99
+config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+# Number of episodes for the computer to train the agent without the human seeing
+config['num_sneaky_episodes'] = 10
+config['num_train_per_demo'] = 50 # 100 total since there are two demo training passes per cycle
+config['replay_skip'] = 14
+config['batch_size'] = 32 * (config['replay_skip'] + 1)
+config['disable_cuda'] = False
+config['memory_size'] = 10**4
+# Prioritized vs Random Sampling
+# 0 - Random sampling
+# 1 - Only the highest priorities
+config['prioritized_replay_sampling_priority'] = 0.6
+# How important are the weights for the loss?
+# 0 - Treat all losses equally
+# 1 - Lower the importance of high losses
+# Should ideally start from 0 and work your way up to 1 to prevent overfitting
+config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
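config.py now keeps every hyperparameter in one plain dictionary, so the other scripts simply import it and can override entries before anything is built. A minimal usage sketch; the file name demo_run.py and the overridden values below are illustrative, not part of this commit:

# demo_run.py (hypothetical): import the shared settings and tweak a couple
# of them for a quick local experiment before wiring up the agent.
from config import config

config['seconds_play_per_state'] = 30   # shorter turns while debugging
config['disable_cuda'] = True           # force CPU on a machine without a GPU

for key, value in sorted(config.items()):
    print(key, "=", value)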
networks.py (new file) | 51
@@ -0,0 +1,51 @@
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import rltorch.network as rn
+
+class Value(nn.Module):
+    def __init__(self, state_size, action_size):
+        super(Value, self).__init__()
+        self.state_size = state_size
+        self.action_size = action_size
+
+        self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
+        self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
+        self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
+
+        self.fc1 = nn.Linear(3136, 512)
+        self.fc1_norm = nn.LayerNorm(512)
+
+        self.value_fc = rn.NoisyLinear(512, 512)
+        self.value_fc_norm = nn.LayerNorm(512)
+        self.value = nn.Linear(512, 1)
+
+        self.advantage_fc = rn.NoisyLinear(512, 512)
+        self.advantage_fc_norm = nn.LayerNorm(512)
+        self.advantage = nn.Linear(512, action_size)
+
+
+    def forward(self, x):
+        x = x.float() / 256
+        x = F.relu(self.conv1(x))
+        x = F.relu(self.conv2(x))
+        x = F.relu(self.conv3(x))
+
+        # Makes batch_size dimension again
+        x = x.view(-1, 3136)
+        x = F.relu(self.fc1_norm(self.fc1(x)))
+
+        state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
+        state_value = self.value(state_value)
+
+        advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
+        advantage = self.advantage(advantage)
+
+        x = state_value + advantage - advantage.mean()
+
+        # For debugging purposes...
+        if torch.isnan(x).any().item():
+            print("WARNING NAN IN MODEL DETECTED")
+
+        return x
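Value is a dueling DQN head on top of three conv layers: the 3136-dimensional conv output feeds a shared fc1, then separate noisy value and advantage streams that are recombined as Q(s, a) = V(s) + A(s, a) - mean(A). A quick shape check, assuming the repo's rltorch package is installed (it provides rn.NoisyLinear) and Pong-style observations of 4 stacked 84x84 frames:

import torch
from networks import Value

# 4 stacked 84x84 frames, batch of 2; PongNoFrameskip-v4 exposes 6 actions.
net = Value(state_size = (4, 84, 84), action_size = 6)
frames = torch.randint(0, 256, (2, 4, 84, 84), dtype = torch.uint8)
q_values = net(frames)
print(q_values.shape)  # expected: torch.Size([2, 6])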
play.py | 106
@@ -4,13 +4,14 @@ from pygame.locals import VIDEORESIZE
 from rltorch.memory import ReplayMemory
 
 class Play:
-    def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
         self.env = env
         self.action_selector = action_selector
-        self.memory = memory
-        self.memory_lock = memory_lock
+        self.record_lock = record_lock
+        self.sneaky_agent = sneaky_agent
         self.agent = agent
         self.sneaky_env = sneaky_env
+        self.sneaky_actor = sneaky_actor
         # Get relevant parameters from config or set sane defaults
         self.transpose = config['transpose'] if 'transpose' in config else True
         self.fps = config['fps'] if 'fps' in config else 30
@@ -20,6 +21,7 @@ class Play:
         self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
         self.memory_size = config['memory_size'] if 'memory_size' in config else 10**4
         self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
+        self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
         # Initial values...
         self.video_size = (0, 0)
         self.pressed_keys = []
@@ -28,6 +30,8 @@ class Play:
         self.running = True
         self.state = 0
         self.clock = pygame.time.Clock()
+        self.sneaky_iteration = 0
+        self.paused = False
 
     def _display_arr(self, obs, screen, arr, video_size):
         if obs is not None:
@@ -49,6 +53,9 @@
             self.screen = pygame.display.set_mode(self.video_size)
         elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
             self.running = False
+        elif not self.paused and self.state in [0, 3] and event.type == pygame.KEYUP and event.key == pygame.K_F1:
+            self.paused = True
+            self.display_text("Paused... Press F1 to unpause.")
         else:
             # No event was matched here
             return False
@@ -118,7 +125,7 @@ class Play:
     def _increment_state(self):
         self.state = (self.state + 1) % 5
 
-    def pause(self, text = ""):
+    def transition(self, text = ""):
         myfont = pygame.font.SysFont('Comic Sans MS', 50)
         textsurface = myfont.render(text, False, (0, 0, 0))
         self.screen.blit(textsurface,(0,0))
@@ -138,15 +145,10 @@ class Play:
         self.clock.tick(self.fps)
 
     def sneaky_train(self):
-        self.memory_lock.acquire()
-
-        # Backup memory
-        backup_memory = self.memory
-        self.memory = ReplayMemory(capacity = self.memory_size)
-
+        self.record_lock.acquire()
         # Do a standard RL algorithm process for a certain number of episodes
         for i in range(self.num_sneaky_episodes):
-            print("Episode: %d / %d, Reward: " % (i + 1, self.num_sneaky_episodes), end = "")
+            print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
 
             # Reset all episode related variables
             prev_obs = self.sneaky_env.reset()
@@ -155,28 +157,40 @@ class Play:
             total_reward = 0
 
             while not done:
-                action = self.action_selector.act(prev_obs)
+                action = self.sneaky_actor.act(prev_obs)
                 obs, reward, done, _ = self.sneaky_env.step(action)
                 total_reward += reward
-                self.memory.append(prev_obs, action, reward, obs, done)
+                self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done)
                 prev_obs = obs
                 step += 1
                 if step % self.replay_skip == 0:
-                    self.agent.learn()
+                    self.sneaky_agent.learn()
 
             # Finish the previous print with the total reward obtained during the episode
             print(total_reward)
-
-        # Reset the memory back to the human demonstration / shown computer data
-        self.memory = backup_memory
-        self.memory_lock.release()
-
-        # Thoughts:
-        # It would be cool if, instead of throwing away all this new data, we kept just a sample of it
-        # Not sure if I want all of it because then it'll drown out the expert demonstration data
-
-
+        self.sneaky_iteration += 1
+        self.record_lock.release()
+
+    def display_text(self, text):
+        myfont = pygame.font.SysFont('Comic Sans MS', 50)
+        textsurface = myfont.render(text, False, (0, 0, 0))
+        self.screen.blit(textsurface,(0,0))
+        pygame.display.flip()
+
+    def clear_text(self, obs):
+        self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size)
+        pygame.display.flip()
+
+    def process_pause_state(self, obs):
+        # Process game events
+        for event in pygame.event.get():
+            # This rule needs to be before the common one, otherwise unpausing is ignored
+            if event.type == pygame.KEYUP and event.key == pygame.K_F1:
+                self.paused = False
+                self.clear_text(obs)
+            else:
+                self._process_common_pygame_events(event)
 
     def start(self):
         """Allows one to play the game using keyboard.
         To simply play the game use:
@@ -200,57 +214,63 @@ class Play:
         TRANSITION2 = 4
 
         env_done = True
+        prev_obs = None
+        action = None
+        reward = None
         obs = None
         i = 0
+        episode_num = 0
         while self.running:
             # If the environment is done after a turn, reset it so we can keep going
            if env_done:
+                episode_num += 1
+                print("Human/Computer Episode: ", episode_num)
                 obs = self.env.reset()
                 env_done = False
 
+            if self.paused:
+                self.process_pause_state(obs)
+                continue
+
             if self.state is HUMAN_PLAY:
-                _, _, _, obs, env_done = self._human_play(obs)
+                prev_obs, action, reward, obs, env_done = self._human_play(obs)
 
             # The computer will train for a few episodes without showing to the user.
             # Mainly to speed up the learning process a bit
             elif self.state is SNEAKY_COMPUTER_PLAY:
                 print("Sneaky Computer Time")
-
-                # Display "Training..." text to user
-                myfont = pygame.font.SysFont('Comic Sans MS', 50)
-                textsurface = myfont.render("Training....", False, (0, 0, 0))
-                self.screen.blit(textsurface,(0,0))
-                pygame.display.flip()
+                self.display_text("Training...")
 
                 # Have the agent play a few rounds without showing to the user
                 self.sneaky_train()
 
-                # To take away training text
-                self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size)
-                pygame.display.flip()
-
                 # Go to the next step immediately
+                self.clear_text(obs)
                 self._increment_state()
 
             elif self.state is TRANSITION:
-                self.pause("Computers Turn! Press <Space> to Start")
+                self.transition("Computers Turn! Press <Space> to Start")
 
             elif self.state is COMPUTER_PLAY:
-                _, _, _, obs, env_done = self._computer_play(obs)
+                prev_obs, action, reward, obs, env_done = self._computer_play(obs)
 
             elif self.state is TRANSITION2:
-                self.pause("Your Turn! Press <Space> to Start")
+                self.transition("Your Turn! Press <Space> to Start")
 
             # Increment the timer if it's the human or shown computer's turn
             if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
+                self.agent.memory.append(prev_obs, action, reward, obs, env_done)
                 i += 1
                 # Perform a quick learning process and increment the state after a certain time period has passed
                 if i % (self.fps * self.seconds_play_per_state) == 0:
-                    self.memory_lock.acquire()
-                    print("Number of transitions in buffer: ", len(self.memory))
-                    self.agent.learn()
-                    self.memory_lock.release()
+                    self.record_lock.acquire()
+                    self.display_text("Demo Training...")
+                    print("Begin Demonstration Training")
+                    print("Number of transitions in buffer: ", len(self.agent.memory))
+                    for j in range(self.num_train_per_demo):
+                        print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
+                        self.agent.learn()
+                    self.clear_text(obs)
+                    self.record_lock.release()
                     self._increment_state()
                     i = 0
-
-
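The pause feature added above is ordinary pygame event handling: releasing F1 while the state is 0 or 3 sets self.paused and draws an overlay, and process_pause_state waits for the next F1 release to clear it. A standalone sketch of that toggle pattern, separate from the project's Play class:

import pygame

# Tiny event loop demonstrating the F1 pause toggle in isolation.
pygame.init()
screen = pygame.display.set_mode((320, 240))
clock = pygame.time.Clock()
paused = False
running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
        elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
            running = False
        elif event.type == pygame.KEYUP and event.key == pygame.K_F1:
            paused = not paused  # same trigger the diff uses: F1 released
            pygame.display.set_caption("Paused" if paused else "Running")
    if not paused:
        screen.fill((0, 0, 0))  # the game frame would be drawn here
        pygame.display.flip()
    clock.tick(30)
pygame.quit()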
play_env.py | 134
@@ -1,4 +1,7 @@
 
+# TODO: I'm kinda using this project to pilot the whole config/network/example separation
+# The motivation behind this is that the file sizes are getting large and it's increasing cognitive load :(
+
 # Import Python Standard Libraries
 from threading import Thread, Lock
 from argparse import ArgumentParser
@@ -10,13 +13,11 @@ from numpy import array as np_array
 from numpy import save as np_save
 import torch
 from torch.optim import Adam
-import torch.nn as nn
-import torch.nn.functional as F
 
 # Import my custom RL library
 import rltorch
-from rltorch.memory import PrioritizedReplayMemory
-from rltorch.action_selector import EpsilonGreedySelector
+from rltorch.memory import PrioritizedReplayMemory, ReplayMemory
+from rltorch.action_selector import EpsilonGreedySelector, ArgMaxSelector
 import rltorch.env as E
 import rltorch.network as rn
 
@@ -28,73 +29,24 @@ import play
 
 
 #
-## Networks
+## Networks (Probably want to move this to config file)
 #
-class Value(nn.Module):
-    def __init__(self, state_size, action_size):
-        super(Value, self).__init__()
-        self.state_size = state_size
-        self.action_size = action_size
-
-        self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
-        self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
-        self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
-
-        self.fc1 = nn.Linear(3136, 512)
-        self.fc1_norm = nn.LayerNorm(512)
-
-        self.value_fc = rn.NoisyLinear(512, 512)
-        self.value_fc_norm = nn.LayerNorm(512)
-        self.value = nn.Linear(512, 1)
-
-        self.advantage_fc = rn.NoisyLinear(512, 512)
-        self.advantage_fc_norm = nn.LayerNorm(512)
-        self.advantage = nn.Linear(512, action_size)
-
-
-    def forward(self, x):
-        x = x.float() / 256
-        x = F.relu(self.conv1(x))
-        x = F.relu(self.conv2(x))
-        x = F.relu(self.conv3(x))
-
-        # Makes batch_size dimension again
-        x = x.view(-1, 3136)
-        x = F.relu(self.fc1_norm(self.fc1(x)))
-
-        state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
-        state_value = self.value(state_value)
-
-        advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
-        advantage = self.advantage(advantage)
-
-        x = state_value + advantage - advantage.mean()
-
-        # For debugging purposes...
-        if torch.isnan(x).any().item():
-            print("WARNING NAN IN MODEL DETECTED")
-
-        return x
-
+from networks import Value
 
 #
 ## Play Related Classes
 #
 Transition = namedtuple('Transition',
         ('state', 'action', 'reward', 'next_state', 'done'))
 
 class PlayClass(Thread):
-    def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
         super(PlayClass, self).__init__()
-        self.play = play.Play(env, action_selector, memory, memory_lock, agent, sneaky_env, config)
+        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
 
     def run(self):
         self.play.start()
 
 class Record(GymWrapper):
-    def __init__(self, env, memory, memory_lock, args):
+    def __init__(self, env, memory, args):
         GymWrapper.__init__(self, env)
-        self.memory_lock = memory_lock
         self.memory = memory
         self.skipframes = args['skip']
         self.environment_name = args['environment_name']
@@ -110,14 +62,11 @@ class Record(GymWrapper):
         self.current_i += 1
         # Don't add to memory until a certain number of frames is reached
         if self.current_i % self.skipframes == 0:
-            self.memory_lock.acquire()
-            self.memory.append(state, action, reward, next_state, done)
-            self.memory_lock.release()
+            self.memory.append((state, action, reward, next_state, done))
             self.current_i = 0
         return next_state, reward, done, info
 
     def log_transitions(self):
-        self.memory_lock.acquire()
         if len(self.memory) > 0:
             basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s"))
             print("Base Filename: ", basename)
@@ -128,7 +77,6 @@ class Record(GymWrapper):
             np_save(basename + "-nextstate.npy", np_array(next_state), allow_pickle = False)
             np_save(basename + "-done.npy", np_array(done), allow_pickle = False)
             self.memory.clear()
-        self.memory_lock.release()
 
 
 ## Parsing arguments
 
@@ -141,31 +89,8 @@ parser.add_argument("--model", type=str, help = "The path location of the PyTorc
 args = vars(parser.parse_args())
 
-## Main configuration for script
-config = {}
-config['seed'] = 901
-config['seconds_play_per_state'] = 60
-config['zoom'] = 4
-config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-4
-config['target_sync_tau'] = 1e-3
-config['discount_rate'] = 0.99
-config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-config['num_sneaky_episodes'] = 20
-config['replay_skip'] = 14
-config['batch_size'] = 32 * (config['replay_skip'] + 1)
-config['disable_cuda'] = False
-config['memory_size'] = 10**4
-# Prioritized vs Random Sampling
-# 0 - Random sampling
-# 1 - Only the highest priorities
-config['prioritized_replay_sampling_priority'] = 0.6
-# How important are the weights for the loss?
-# 0 - Treat all losses equally
-# 1 - Lower the importance of high losses
-# Should ideally start from 0 and work your way up to 1 to prevent overfitting
-config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
 
+from config import config
+from sneaky_config import sneaky_config
 
 # Environment name and log directory is vital so show help message and exit if not provided
 if args['environment_name'] is None or args['logdir'] is None:
@@ -175,7 +100,7 @@ if args['environment_name'] is None or args['logdir'] is None:
 # Number of frames to skip when recording and fps can have sane defaults
 if args['skip'] is None:
     args['skip'] = 3
-if args['fps'] is None:
+if 'fps' not in args:
     args['fps'] = 30
 
 
@@ -196,22 +121,20 @@ def wrap_preprocessing(env, MaxAndSkipEnv = False):
         , 4)
     )
 
 
 ## Set up environment to be recorded and preprocessed
-memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
-memory_lock = Lock()
-env = Record(makeEnv(args['environment_name']), memory, memory_lock, args)
+record_memory = []
+record_lock = Lock()
+env = Record(makeEnv(args['environment_name']), record_memory, args)
 
 # Bind record_env to current env so that we can reference log_transitions easier later
 record_env = env
 
 # Use native gym monitor to get video recording
 env = GymMonitor(env, args['logdir'], force=True)
 
 # Preprocess environment
 env = wrap_preprocessing(env)
 
-# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
-# Also use MaxEnvSkip to speed up processing
-sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
-
 # Set seeds
 rltorch.set_seed(config['seed'])
 env.seed(config['seed'])
@@ -226,18 +149,31 @@ net = rn.Network(Value(state_size, action_size),
 target_net = rn.TargetNetwork(net, device = device)
 
 # Relevant components from RLTorch
-actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
+memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+actor = ArgMaxSelector(net, action_size, device = device)
 agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net)
 
+# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
+# Also use MaxEnvSkip to speed up processing
+sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
+sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
+sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
+
+sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
+
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, memory, memory_lock, agent, sneaky_env, config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
 while playThread.is_alive():
     playThread.join(60)
+    record_lock.acquire()
     print("Logging....", end = " ")
     record_env.log_transitions()
+    record_lock.release()
 
 # Save what's remaining after the process dies
+record_lock.acquire()
 record_env.log_transitions()
+record_lock.release()
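play_env.py now shares a plain list (record_memory) between the play thread and the logging loop, guarded by record_lock, instead of locking the replay buffer itself. A minimal sketch of that producer/consumer pattern with made-up names and timings:

import time
from threading import Thread, Lock

record_memory = []   # stand-in for the shared transition list
record_lock = Lock()

def play_worker(steps = 50):
    # Pretend to play: append a fake transition every 10 ms.
    for step in range(steps):
        with record_lock:
            record_memory.append(("state", "action", step))
        time.sleep(0.01)

play_thread = Thread(target = play_worker)
play_thread.start()

# Periodically drain and "log" whatever the play thread has produced so far.
while play_thread.is_alive():
    play_thread.join(0.2)
    with record_lock:
        if record_memory:
            print("Logging", len(record_memory), "transitions")
            record_memory.clear()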
sneaky_config.py (new file) | 11
@@ -0,0 +1,11 @@
+import rltorch
+
+sneaky_config = {}
+sneaky_config['learning_rate'] = 1e-4
+sneaky_config['target_sync_tau'] = 1e-3
+sneaky_config['discount_rate'] = 0.99
+sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+# Number of episodes for the computer to train the agent without the human seeing
+sneaky_config['replay_skip'] = 14
+sneaky_config['batch_size'] = 32 * (sneaky_config['replay_skip'] + 1)
+sneaky_config['memory_size'] = 10**4
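sneaky_config.py repeats several of the main hyperparameters for the hidden training phase. A hypothetical alternative (not what this commit does) would be to derive it from the main config so the shared values stay in sync:

# Hypothetical alternative: copy the shared settings and override only what
# differs for the hidden ("sneaky") training phase.
import rltorch
from config import config

sneaky_config = dict(config)          # shallow copy of the shared hyperparameters
sneaky_config['memory_size'] = 10**4  # plain (non-prioritized) replay buffer
sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(
    initial_value = 1, end_value = 0.1, iterations = 10**5)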