From 32862e4d798f30eaaab1d29862243cc59aa5e95a Mon Sep 17 00:00:00 2001
From: Brandon Rozek
Date: Sun, 27 Oct 2019 20:42:37 -0400
Subject: [PATCH] Began separating config & networks, F1 for pausing, text functions, and more sneaky agent stuff

---
 .gitignore       |   1 +
 config.py        |  27 ++++++++++
 networks.py      |  51 ++++++++++++++++++
 play.py          | 106 ++++++++++++++++++++++---------------
 play_env.py      | 134 +++++++++++++----------------------------------
 sneaky_config.py |  11 ++++
 6 files changed, 188 insertions(+), 142 deletions(-)
 create mode 100644 config.py
 create mode 100644 networks.py
 create mode 100644 sneaky_config.py

diff --git a/.gitignore b/.gitignore
index 5f46325..eb6de05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__/
 playlogs/
+.vscode/
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..6a1b865
--- /dev/null
+++ b/config.py
@@ -0,0 +1,27 @@
+import rltorch
+
+config = {}
+config['seed'] = 901
+config['seconds_play_per_state'] = 120
+config['zoom'] = 4
+config['environment_name'] = 'PongNoFrameskip-v4'
+config['learning_rate'] = 1e-4
+config['target_sync_tau'] = 1e-3
+config['discount_rate'] = 0.99
+config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+# Number of episodes for the computer to train the agent without the human watching
+config['num_sneaky_episodes'] = 10
+config['num_train_per_demo'] = 50 # 100 total since there are two demo training rounds per cycle
+config['replay_skip'] = 14
+config['batch_size'] = 32 * (config['replay_skip'] + 1)
+config['disable_cuda'] = False
+config['memory_size'] = 10**4
+# Prioritized vs Random Sampling
+# 0 - Random sampling
+# 1 - Only the highest priorities
+config['prioritized_replay_sampling_priority'] = 0.6
+# How important are the weights for the loss?
+# 0 - Treat all losses equally +# 1 - Lower the importance of high losses +# Should ideally start from 0 and move your way to 1 to prevent overfitting +config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5) diff --git a/networks.py b/networks.py new file mode 100644 index 0000000..e02f1b6 --- /dev/null +++ b/networks.py @@ -0,0 +1,51 @@ + +import torch +import torch.nn as nn +import torch.nn.functional as F +import rltorch.network as rn + +class Value(nn.Module): + def __init__(self, state_size, action_size): + super(Value, self).__init__() + self.state_size = state_size + self.action_size = action_size + + self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4)) + self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2)) + self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1)) + + self.fc1 = nn.Linear(3136, 512) + self.fc1_norm = nn.LayerNorm(512) + + self.value_fc = rn.NoisyLinear(512, 512) + self.value_fc_norm = nn.LayerNorm(512) + self.value = nn.Linear(512, 1) + + self.advantage_fc = rn.NoisyLinear(512, 512) + self.advantage_fc_norm = nn.LayerNorm(512) + self.advantage = nn.Linear(512, action_size) + + + def forward(self, x): + x = x.float() / 256 + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + + # Makes batch_size dimension again + x = x.view(-1, 3136) + x = F.relu(self.fc1_norm(self.fc1(x))) + + state_value = F.relu(self.value_fc_norm(self.value_fc(x))) + state_value = self.value(state_value) + + advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) + advantage = self.advantage(advantage) + + x = state_value + advantage - advantage.mean() + + # For debugging purposes... + if torch.isnan(x).any().item(): + print("WARNING NAN IN MODEL DETECTED") + + return x diff --git a/play.py b/play.py index 48b7d52..daea72e 100644 --- a/play.py +++ b/play.py @@ -4,13 +4,14 @@ from pygame.locals import VIDEORESIZE from rltorch.memory import ReplayMemory class Play: - def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config): + def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config): self.env = env self.action_selector = action_selector - self.memory = memory - self.memory_lock = memory_lock + self.record_lock = record_lock + self.sneaky_agent = sneaky_agent self.agent = agent self.sneaky_env = sneaky_env + self.sneaky_actor = sneaky_actor # Get relevant parameters from config or set sane defaults self.transpose = config['transpose'] if 'transpose' in config else True self.fps = config['fps'] if 'fps' in config else 30 @@ -20,6 +21,7 @@ class Play: self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10 self.memory_size = config['memory_size'] if 'memory_size' in config else 10**4 self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0 + self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1 # Initial values... 
self.video_size = (0, 0) self.pressed_keys = [] @@ -28,6 +30,8 @@ class Play: self.running = True self.state = 0 self.clock = pygame.time.Clock() + self.sneaky_iteration = 0 + self.paused = False def _display_arr(self, obs, screen, arr, video_size): if obs is not None: @@ -49,6 +53,9 @@ class Play: self.screen = pygame.display.set_mode(self.video_size) elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE: self.running = False + elif not self.paused and self.state in [0, 3] and event.type == pygame.KEYUP and event.key == pygame.K_F1: + self.paused = True + self.display_text("Paused... Press F1 to unpause.") else: # No event was matched here return False @@ -118,7 +125,7 @@ class Play: def _increment_state(self): self.state = (self.state + 1) % 5 - def pause(self, text = ""): + def transition(self, text = ""): myfont = pygame.font.SysFont('Comic Sans MS', 50) textsurface = myfont.render(text, False, (0, 0, 0)) self.screen.blit(textsurface,(0,0)) @@ -138,15 +145,10 @@ class Play: self.clock.tick(self.fps) def sneaky_train(self): - self.memory_lock.acquire() - - # Backup memory - backup_memory = self.memory - self.memory = ReplayMemory(capacity = self.memory_size) - + self.record_lock.acquire() # Do a standard RL algorithm process for a certain number of episodes for i in range(self.num_sneaky_episodes): - print("Episode: %d / %d, Reward: " % (i + 1, self.num_sneaky_episodes), end = "") + print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "") # Reset all episode releated variables prev_obs = self.sneaky_env.reset() @@ -155,28 +157,40 @@ class Play: total_reward = 0 while not done: - action = self.action_selector.act(prev_obs) + action = self.sneaky_actor.act(prev_obs) obs, reward, done, _ = self.sneaky_env.step(action) total_reward += reward - self.memory.append(prev_obs, action, reward, obs, done) + self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done) prev_obs = obs step += 1 if step % self.replay_skip == 0: - self.agent.learn() + self.sneaky_agent.learn() # Finish the previous print with the total reward obtained during the episode print(total_reward) - - # Reset the memory back to the human demonstration / shown computer data - self.memory = backup_memory - self.memory_lock.release() - - # Thoughts: - # It would be cool instead of throwing away all this new data, we keep just a sample of it - # Not sure if i want all of it because then it'll drown out the expert demonstration data - - + self.sneaky_iteration += 1 + self.record_lock.release() + def display_text(self, text): + myfont = pygame.font.SysFont('Comic Sans MS', 50) + textsurface = myfont.render(text, False, (0, 0, 0)) + self.screen.blit(textsurface,(0,0)) + pygame.display.flip() + + def clear_text(self, obs): + self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size) + pygame.display.flip() + + def process_pause_state(self, obs): + # Process game events + for event in pygame.event.get(): + # This rule needs to be before the common one otherwise unpausing is ignored + if event.type == pygame.KEYUP and event.key == pygame.K_F1: + self.paused = False + self.clear_text(obs) + else: + self._process_common_pygame_events(event) + def start(self): """Allows one to play the game using keyboard. 
To simply play the game use:
@@ -200,57 +214,63 @@ class Play:
         TRANSITION2 = 4
 
         env_done = True
+        prev_obs = None
+        action = None
+        reward = None
         obs = None
         i = 0
+        episode_num = 0
         while self.running:
             # If the environment is done after a turn, reset it so we can keep going
             if env_done:
+                episode_num += 1
+                print("Human/Computer Episode: ", episode_num)
                 obs = self.env.reset()
                 env_done = False
 
+            if self.paused:
+                self.process_pause_state(obs)
+                continue
             if self.state is HUMAN_PLAY:
-                _, _, _, obs, env_done = self._human_play(obs)
+                prev_obs, action, reward, obs, env_done = self._human_play(obs)
 
             # The computer will train for a few episodes without showing to the user.
             # Mainly to speed up the learning process a bit
             elif self.state is SNEAKY_COMPUTER_PLAY:
                 print("Sneaky Computer Time")
-
-                # Display "Training..." text to user
-                myfont = pygame.font.SysFont('Comic Sans MS', 50)
-                textsurface = myfont.render("Training....", False, (0, 0, 0))
-                self.screen.blit(textsurface,(0,0))
-                pygame.display.flip()
+                self.display_text("Training...")
 
                 # Have the agent play a few rounds without showing to the user
                 self.sneaky_train()
 
-                # To take away training text
-                self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size)
-                pygame.display.flip()
-
-                # Go to the next step immediately
+                self.clear_text(obs)
                 self._increment_state()
 
             elif self.state is TRANSITION:
-                self.pause("Computers Turn! Press to Start")
+                self.transition("Computer's Turn! Press to Start")
 
             elif self.state is COMPUTER_PLAY:
-                _, _, _, obs, env_done = self._computer_play(obs)
+                prev_obs, action, reward, obs, env_done = self._computer_play(obs)
 
             elif self.state is TRANSITION2:
-                self.pause("Your Turn! Press to Start")
+                self.transition("Your Turn! Press to Start")
 
             # Increment the timer if it's the human or shown computer's turn
             if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
+                self.agent.memory.append(prev_obs, action, reward, obs, env_done)
                 i += 1
 
                 # Perform a quick learning process and increment the state after a certain time period has passed
                 if i % (self.fps * self.seconds_play_per_state) == 0:
-                    self.memory_lock.acquire()
-                    print("Number of transitions in buffer: ", len(self.memory))
-                    self.agent.learn()
-                    self.memory_lock.release()
+                    self.record_lock.acquire()
+                    self.display_text("Demo Training...")
+                    print("Begin Demonstration Training")
+                    print("Number of transitions in buffer: ", len(self.agent.memory))
+                    for j in range(self.num_train_per_demo):
+                        print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
+                        self.agent.learn()
+                    self.clear_text(obs)
+                    self.record_lock.release()
                     self._increment_state()
                     i = 0
diff --git a/play_env.py b/play_env.py
index 815115f..f06f8ee 100644
--- a/play_env.py
+++ b/play_env.py
@@ -1,4 +1,7 @@
+# TODO: I'm kinda using this project to pilot the whole config/network/example separation
+# The motivation behind this is that the file sizes are getting large and it's increasing cognitive load :(
+
 # Import Python Standard Libraries
 from threading import Thread, Lock
 from argparse import ArgumentParser
@@ -10,13 +13,11 @@ from numpy import array as np_array
 from numpy import save as np_save
 import torch
 from torch.optim import Adam
-import torch.nn as nn
-import torch.nn.functional as F
 
 # Import my custom RL library
 import rltorch
-from rltorch.memory import PrioritizedReplayMemory
-from rltorch.action_selector import EpsilonGreedySelector
+from rltorch.memory import PrioritizedReplayMemory, ReplayMemory
+from rltorch.action_selector import EpsilonGreedySelector, 
ArgMaxSelector import rltorch.env as E import rltorch.network as rn @@ -28,73 +29,24 @@ import play # -## Networks +## Networks (Probably want to move this to config file) # -class Value(nn.Module): - def __init__(self, state_size, action_size): - super(Value, self).__init__() - self.state_size = state_size - self.action_size = action_size - - self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4)) - self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2)) - self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1)) - - self.fc1 = nn.Linear(3136, 512) - self.fc1_norm = nn.LayerNorm(512) - - self.value_fc = rn.NoisyLinear(512, 512) - self.value_fc_norm = nn.LayerNorm(512) - self.value = nn.Linear(512, 1) - - self.advantage_fc = rn.NoisyLinear(512, 512) - self.advantage_fc_norm = nn.LayerNorm(512) - self.advantage = nn.Linear(512, action_size) - - - def forward(self, x): - x = x.float() / 256 - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - - # Makes batch_size dimension again - x = x.view(-1, 3136) - x = F.relu(self.fc1_norm(self.fc1(x))) - - state_value = F.relu(self.value_fc_norm(self.value_fc(x))) - state_value = self.value(state_value) - - advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) - advantage = self.advantage(advantage) - - x = state_value + advantage - advantage.mean() - - # For debugging purposes... - if torch.isnan(x).any().item(): - print("WARNING NAN IN MODEL DETECTED") - - return x - +from networks import Value # ## Play Related Classes # -Transition = namedtuple('Transition', - ('state', 'action', 'reward', 'next_state', 'done')) - class PlayClass(Thread): - def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config): + def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config): super(PlayClass, self).__init__() - self.play = play.Play(env, action_selector, memory, memory_lock, agent, sneaky_env, config) + self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config) def run(self): self.play.start() class Record(GymWrapper): - def __init__(self, env, memory, memory_lock, args): + def __init__(self, env, memory, args): GymWrapper.__init__(self, env) - self.memory_lock = memory_lock self.memory = memory self.skipframes = args['skip'] self.environment_name = args['environment_name'] @@ -110,14 +62,11 @@ class Record(GymWrapper): self.current_i += 1 # Don't add to memory until a certain number of frames is reached if self.current_i % self.skipframes == 0: - self.memory_lock.acquire() - self.memory.append(state, action, reward, next_state, done) - self.memory_lock.release() + self.memory.append((state, action, reward, next_state, done)) self.current_i = 0 return next_state, reward, done, info def log_transitions(self): - self.memory_lock.acquire() if len(self.memory) > 0: basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s")) print("Base Filename: ", basename) @@ -128,7 +77,6 @@ class Record(GymWrapper): np_save(basename + "-nextstate.npy", np_array(next_state), allow_pickle = False) np_save(basename + "-done.npy", np_array(done), allow_pickle = False) self.memory.clear() - self.memory_lock.release() ## Parsing arguments @@ -141,31 +89,8 @@ parser.add_argument("--model", type=str, help = "The path location of the PyTorc args = vars(parser.parse_args()) ## Main configuration for script -config = {} 
-config['seed'] = 901
-config['seconds_play_per_state'] = 60
-config['zoom'] = 4
-config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-4
-config['target_sync_tau'] = 1e-3
-config['discount_rate'] = 0.99
-config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-config['num_sneaky_episodes'] = 20
-config['replay_skip'] = 14
-config['batch_size'] = 32 * (config['replay_skip'] + 1)
-config['disable_cuda'] = False
-config['memory_size'] = 10**4
-# Prioritized vs Random Sampling
-# 0 - Random sampling
-# 1 - Only the highest prioirities
-config['prioritized_replay_sampling_priority'] = 0.6
-# How important are the weights for the loss?
-# 0 - Treat all losses equally
-# 1 - Lower the importance of high losses
-# Should ideally start from 0 and move your way to 1 to prevent overfitting
-config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
-
+from config import config
+from sneaky_config import sneaky_config
 
 # Environment name and log directory is vital so show help message and exit if not provided
 if args['environment_name'] is None or args['logdir'] is None:
@@ -175,7 +100,7 @@ if args['environment_name'] is None or args['logdir'] is None:
 # Number of frames to skip when recording and fps can have sane defaults
 if args['skip'] is None:
     args['skip'] = 3
-if args['fps'] is None:
+if args.get('fps') is None:
     args['fps'] = 30
 
 
@@ -196,22 +121,20 @@ def wrap_preprocessing(env, MaxAndSkipEnv = False):
             , 4)
         )
 
-
 ## Set up environment to be recorded and preprocessed
-memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
-memory_lock = Lock()
-env = Record(makeEnv(args['environment_name']), memory, memory_lock, args)
+record_memory = []
+record_lock = Lock()
+env = Record(makeEnv(args['environment_name']), record_memory, args)
+
 # Bind record_env to current env so that we can reference log_transitions easier later
 record_env = env
+
 # Use native gym monitor to get video recording
 env = GymMonitor(env, args['logdir'], force=True)
+
 # Preprocess enviornment
 env = wrap_preprocessing(env)
 
-# Use a different environment for when the computer trains on the side so that the current game state isn't manipuated
-# Also use MaxEnvSkip to speed up processing
-sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
-
 # Set seeds
 rltorch.set_seed(config['seed'])
 env.seed(config['seed'])
@@ -226,18 +149,31 @@ net = rn.Network(Value(state_size, action_size),
 target_net = rn.TargetNetwork(net, device = device)
 
 # Relevant components from RLTorch
-actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
+memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+actor = ArgMaxSelector(net, action_size, device = device)
 agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net)
 
+# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
+# Also use MaxAndSkipEnv to speed up processing
+sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
+sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
+sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
+
+sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
+
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, memory, memory_lock, agent, sneaky_env, config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
 while playThread.is_alive():
     playThread.join(60)
+    record_lock.acquire()
     print("Logging....", end = " ")
     record_env.log_transitions()
+    record_lock.release()
 
 # Save what's remaining after process died
+record_lock.acquire()
 record_env.log_transitions()
+record_lock.release()
\ No newline at end of file
diff --git a/sneaky_config.py b/sneaky_config.py
new file mode 100644
index 0000000..f72b2af
--- /dev/null
+++ b/sneaky_config.py
@@ -0,0 +1,11 @@
+import rltorch
+
+sneaky_config = {}
+sneaky_config['learning_rate'] = 1e-4
+sneaky_config['target_sync_tau'] = 1e-3
+sneaky_config['discount_rate'] = 0.99
+sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+# Number of environment steps to take between each learning update during sneaky training
+sneaky_config['replay_skip'] = 14
+sneaky_config['batch_size'] = 32 * (sneaky_config['replay_skip'] + 1)
+sneaky_config['memory_size'] = 10**4
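A quick sanity check for the split-out modules (not part of the commit itself; the 4x84x84 frame-stack shape and Pong's 6-action space are assumptions based on the preprocessing and environment used above):

import torch
from config import config
from networks import Value

# wrap_preprocessing stacks 4 grayscale 84x84 frames; PongNoFrameskip-v4 exposes 6 discrete actions
state_size = (4, 84, 84)
action_size = 6

net = Value(state_size, action_size)
# Feed a dummy uint8 batch through the dueling network; forward() rescales it with x.float() / 256
fake_batch = torch.zeros((config['batch_size'], *state_size), dtype=torch.uint8)
q_values = net(fake_batch)
print(q_values.shape)  # torch.Size([480, 6]) with the default batch_size of 32 * (14 + 1)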