Began separating config & networks; added F1 for pausing, text helper functions, and more sneaky agent stuff

Brandon Rozek 2019-10-27 20:42:37 -04:00
parent d78892e62c
commit 32862e4d79
6 changed files with 188 additions and 142 deletions

.gitignore (vendored): 1 line changed

@@ -1,2 +1,3 @@
__pycache__/
playlogs/
.vscode/

config.py (new file): 27 lines added

@@ -0,0 +1,27 @@
import rltorch
config = {}
config['seed'] = 901
config['seconds_play_per_state'] = 120
config['zoom'] = 4
config['environment_name'] = 'PongNoFrameskip-v4'
config['learning_rate'] = 1e-4
config['target_sync_tau'] = 1e-3
config['discount_rate'] = 0.99
config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
# Number of episodes for the computer to train the agent without the human seeing
config['num_sneaky_episodes'] = 10
config['num_train_per_demo'] = 50 # 100 total since there are two demo-training phases per cycle
config['replay_skip'] = 14
config['batch_size'] = 32 * (config['replay_skip'] + 1)
config['disable_cuda'] = False
config['memory_size'] = 10**4
# Prioritized vs Random Sampling
# 0 - Random sampling
# 1 - Only the highest priorities
config['prioritized_replay_sampling_priority'] = 0.6
# How important are the weights for the loss?
# 0 - Treat all losses equally
# 1 - Lower the importance of high losses
# Should ideally start near 0 and anneal toward 1 to prevent overfitting
config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
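
For context, a minimal sketch of how these two exponents are usually applied in prioritized experience replay (the standard Schaul et al. formulation; rltorch's PrioritizedReplayMemory may differ in its details, and the numbers below are made up):

import numpy as np

# Hypothetical per-transition priorities (e.g. TD-error magnitudes)
priorities = np.array([0.5, 2.0, 0.1, 1.3])
alpha = 0.6   # config['prioritized_replay_sampling_priority']
beta = 0.4    # current value of config['prioritized_replay_weight_importance']

# alpha = 0 -> uniform sampling, alpha = 1 -> fully prioritized sampling
probs = priorities ** alpha
probs /= probs.sum()

# beta = 0 -> all losses weighted equally,
# beta = 1 -> fully corrects for the non-uniform sampling
weights = (len(priorities) * probs) ** (-beta)
weights /= weights.max()   # normalize so the weights only scale losses down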

networks.py (new file): 51 lines added

@@ -0,0 +1,51 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import rltorch.network as rn
class Value(nn.Module):
def __init__(self, state_size, action_size):
super(Value, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
self.fc1 = nn.Linear(3136, 512)
self.fc1_norm = nn.LayerNorm(512)
self.value_fc = rn.NoisyLinear(512, 512)
self.value_fc_norm = nn.LayerNorm(512)
self.value = nn.Linear(512, 1)
self.advantage_fc = rn.NoisyLinear(512, 512)
self.advantage_fc_norm = nn.LayerNorm(512)
self.advantage = nn.Linear(512, action_size)
def forward(self, x):
x = x.float() / 256
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
x = F.relu(self.conv3(x))
# Flatten to (batch_size, 3136) for the fully connected layers
x = x.view(-1, 3136)
x = F.relu(self.fc1_norm(self.fc1(x)))
state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
state_value = self.value(state_value)
advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
advantage = self.advantage(advantage)
x = state_value + advantage - advantage.mean()
# For debugging purposes...
if torch.isnan(x).any().item():
print("WARNING NAN IN MODEL DETECTED")
return x
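
As a quick sanity check of the layer sizes: the 3136 in fc1 matches the usual 84x84, 4-frame Atari preprocessing, since the three convolutions take 84 -> 20 -> 9 -> 7 and 64 * 7 * 7 = 3136. A minimal sketch, assuming rltorch is installed and observations are 84x84 uint8 frame stacks (the 6 actions correspond to Pong):

import torch
from networks import Value

net = Value(state_size=(4, 84, 84), action_size=6)    # state_size is only stored, not used in forward()
dummy = torch.zeros(1, 4, 84, 84, dtype=torch.uint8)  # one batch of stacked frames
q_values = net(dummy)
assert q_values.shape == (1, 6)                        # one Q-value per action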

play.py: 106 lines changed

@@ -4,13 +4,14 @@ from pygame.locals import VIDEORESIZE
from rltorch.memory import ReplayMemory
class Play:
def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config):
def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
self.env = env
self.action_selector = action_selector
self.memory = memory
self.memory_lock = memory_lock
self.record_lock = record_lock
self.sneaky_agent = sneaky_agent
self.agent = agent
self.sneaky_env = sneaky_env
self.sneaky_actor = sneaky_actor
# Get relevant parameters from config or set sane defaults
self.transpose = config['transpose'] if 'transpose' in config else True
self.fps = config['fps'] if 'fps' in config else 30
@@ -20,6 +21,7 @@ class Play:
self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
self.memory_size = config['memory_size'] if 'memory_size' in config else 10**4
self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
# Initial values...
self.video_size = (0, 0)
self.pressed_keys = []
@@ -28,6 +30,8 @@ class Play:
self.running = True
self.state = 0
self.clock = pygame.time.Clock()
self.sneaky_iteration = 0
self.paused = False
def _display_arr(self, obs, screen, arr, video_size):
if obs is not None:
@@ -49,6 +53,9 @@ class Play:
self.screen = pygame.display.set_mode(self.video_size)
elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
self.running = False
elif not self.paused and self.state in [0, 3] and event.type == pygame.KEYUP and event.key == pygame.K_F1:
self.paused = True
self.display_text("Paused... Press F1 to unpause.")
else:
# No event was matched here
return False
@@ -118,7 +125,7 @@ class Play:
def _increment_state(self):
self.state = (self.state + 1) % 5
def pause(self, text = ""):
def transition(self, text = ""):
myfont = pygame.font.SysFont('Comic Sans MS', 50)
textsurface = myfont.render(text, False, (0, 0, 0))
self.screen.blit(textsurface,(0,0))
@@ -138,15 +145,10 @@ class Play:
self.clock.tick(self.fps)
def sneaky_train(self):
self.memory_lock.acquire()
# Backup memory
backup_memory = self.memory
self.memory = ReplayMemory(capacity = self.memory_size)
self.record_lock.acquire()
# Run a standard RL training loop for a set number of episodes
for i in range(self.num_sneaky_episodes):
print("Episode: %d / %d, Reward: " % (i + 1, self.num_sneaky_episodes), end = "")
print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
# Reset all episode-related variables
prev_obs = self.sneaky_env.reset()
@@ -155,28 +157,40 @@ class Play:
total_reward = 0
while not done:
action = self.action_selector.act(prev_obs)
action = self.sneaky_actor.act(prev_obs)
obs, reward, done, _ = self.sneaky_env.step(action)
total_reward += reward
self.memory.append(prev_obs, action, reward, obs, done)
self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done)
prev_obs = obs
step += 1
if step % self.replay_skip == 0:
self.agent.learn()
self.sneaky_agent.learn()
# Finish the previous print with the total reward obtained during the episode
print(total_reward)
# Reset the memory back to the human demonstration / shown computer data
self.memory = backup_memory
self.memory_lock.release()
# Thoughts:
# It would be cool to keep just a sample of this new data instead of throwing it all away.
# Not sure I want all of it, though, since it would drown out the expert demonstration data.
self.sneaky_iteration += 1
self.record_lock.release()
def display_text(self, text):
myfont = pygame.font.SysFont('Comic Sans MS', 50)
textsurface = myfont.render(text, False, (0, 0, 0))
self.screen.blit(textsurface,(0,0))
pygame.display.flip()
def clear_text(self, obs):
self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size)
pygame.display.flip()
def process_pause_state(self, obs):
# Process game events
for event in pygame.event.get():
# This rule needs to come before the common handler, otherwise unpausing is ignored
if event.type == pygame.KEYUP and event.key == pygame.K_F1:
self.paused = False
self.clear_text(obs)
else:
self._process_common_pygame_events(event)
def start(self):
"""Allows one to play the game using keyboard.
To simply play the game use:
@@ -200,57 +214,63 @@ class Play:
TRANSITION2 = 4
env_done = True
prev_obs = None
action = None
reward = None
obs = None
i = 0
episode_num = 0
while self.running:
# If the environment is done after a turn, reset it so we can keep going
if env_done:
episode_num += 1
print("Human/Computer Episode: ", episode_num)
obs = self.env.reset()
env_done = False
if self.paused:
self.process_pause_state(obs)
continue
if self.state is HUMAN_PLAY:
_, _, _, obs, env_done = self._human_play(obs)
prev_obs, action, reward, obs, env_done = self._human_play(obs)
# The computer will train for a few episodes without showing them to the user.
# Mainly to speed up the learning process a bit
elif self.state is SNEAKY_COMPUTER_PLAY:
print("Sneaky Computer Time")
# Display "Training..." text to user
myfont = pygame.font.SysFont('Comic Sans MS', 50)
textsurface = myfont.render("Training....", False, (0, 0, 0))
self.screen.blit(textsurface,(0,0))
pygame.display.flip()
self.display_text("Training...")
# Have the agent play a few rounds without showing them to the user
self.sneaky_train()
# To take away training text
self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size)
pygame.display.flip()
# Go to the next step immediately
self.clear_text(obs)
self._increment_state()
elif self.state is TRANSITION:
self.pause("Computers Turn! Press <Space> to Start")
self.transition("Computers Turn! Press <Space> to Start")
elif self.state is COMPUTER_PLAY:
_, _, _, obs, env_done = self._computer_play(obs)
prev_obs, action, reward, obs, env_done = self._computer_play(obs)
elif self.state is TRANSITION2:
self.pause("Your Turn! Press <Space> to Start")
self.transition("Your Turn! Press <Space> to Start")
# Record the transition and increment the timer if it's the human's or the shown computer's turn
if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
self.agent.memory.append(prev_obs, action, reward, obs, env_done)
i += 1
# Perform a quick learning process and increment the state after a certain time period has passed
if i % (self.fps * self.seconds_play_per_state) == 0:
self.memory_lock.acquire()
print("Number of transitions in buffer: ", len(self.memory))
self.agent.learn()
self.memory_lock.release()
self.record_lock.acquire()
self.display_text("Demo Training...")
print("Begin Demonstration Training")
print("Number of transitions in buffer: ", len(self.agent.memory))
for j in range(self.num_train_per_demo):
print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
self.agent.learn()
self.clear_text(obs)
self.record_lock.release()
self._increment_state()
i = 0
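
To make the new control flow easier to follow, here is a stripped-down sketch of the turn cycle and the F1 pause added in this commit. The state names come from the constants in start(); their numbering (HUMAN_PLAY = 0, COMPUTER_PLAY = 3) is inferred from the self.state in [0, 3] check, and everything else the class does per state is omitted:

HUMAN_PLAY, SNEAKY_COMPUTER_PLAY, TRANSITION, COMPUTER_PLAY, TRANSITION2 = range(5)

class TurnCycle:
    def __init__(self):
        self.state = HUMAN_PLAY
        self.paused = False

    def increment_state(self):
        # mirrors Play._increment_state()
        self.state = (self.state + 1) % 5

    def handle_f1(self):
        # F1 only pauses while someone is actually playing (states 0 and 3);
        # while paused, the next F1 press unpauses and the text is cleared.
        if self.paused:
            self.paused = False
        elif self.state in (HUMAN_PLAY, COMPUTER_PLAY):
            self.paused = True

In start(), the loop checks self.paused before anything else and simply continues, which is why process_pause_state must handle the F1 key before falling back to the common event handler.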

(main script; filename and line count not shown in this view)

@@ -1,4 +1,7 @@
# TODO: I'm kinda using this project to pilot the whole config/network/example separation
# The motivation behind this is that the file sizes are getting large and it's increasing cognitive load :(
# Import Python Standard Libraries
from threading import Thread, Lock
from argparse import ArgumentParser
@@ -10,13 +13,11 @@ from numpy import array as np_array
from numpy import save as np_save
import torch
from torch.optim import Adam
import torch.nn as nn
import torch.nn.functional as F
# Import my custom RL library
import rltorch
from rltorch.memory import PrioritizedReplayMemory
from rltorch.action_selector import EpsilonGreedySelector
from rltorch.memory import PrioritizedReplayMemory, ReplayMemory
from rltorch.action_selector import EpsilonGreedySelector, ArgMaxSelector
import rltorch.env as E
import rltorch.network as rn
@@ -28,73 +29,24 @@ import play
#
## Networks
## Networks (Probably want to move this to config file)
#
class Value(nn.Module):
def __init__(self, state_size, action_size):
super(Value, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
self.fc1 = nn.Linear(3136, 512)
self.fc1_norm = nn.LayerNorm(512)
self.value_fc = rn.NoisyLinear(512, 512)
self.value_fc_norm = nn.LayerNorm(512)
self.value = nn.Linear(512, 1)
self.advantage_fc = rn.NoisyLinear(512, 512)
self.advantage_fc_norm = nn.LayerNorm(512)
self.advantage = nn.Linear(512, action_size)
def forward(self, x):
x = x.float() / 256
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
x = F.relu(self.conv3(x))
# Makes batch_size dimension again
x = x.view(-1, 3136)
x = F.relu(self.fc1_norm(self.fc1(x)))
state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
state_value = self.value(state_value)
advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
advantage = self.advantage(advantage)
x = state_value + advantage - advantage.mean()
# For debugging purposes...
if torch.isnan(x).any().item():
print("WARNING NAN IN MODEL DETECTED")
return x
from networks import Value
#
## Play Related Classes
#
Transition = namedtuple('Transition',
('state', 'action', 'reward', 'next_state', 'done'))
class PlayClass(Thread):
def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config):
def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
super(PlayClass, self).__init__()
self.play = play.Play(env, action_selector, memory, memory_lock, agent, sneaky_env, config)
self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
def run(self):
self.play.start()
class Record(GymWrapper):
def __init__(self, env, memory, memory_lock, args):
def __init__(self, env, memory, args):
GymWrapper.__init__(self, env)
self.memory_lock = memory_lock
self.memory = memory
self.skipframes = args['skip']
self.environment_name = args['environment_name']
@@ -110,14 +62,11 @@ class Record(GymWrapper):
self.current_i += 1
# Don't add to memory until a certain number of frames is reached
if self.current_i % self.skipframes == 0:
self.memory_lock.acquire()
self.memory.append(state, action, reward, next_state, done)
self.memory_lock.release()
self.memory.append((state, action, reward, next_state, done))
self.current_i = 0
return next_state, reward, done, info
def log_transitions(self):
self.memory_lock.acquire()
if len(self.memory) > 0:
basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s"))
print("Base Filename: ", basename)
@@ -128,7 +77,6 @@ class Record(GymWrapper):
np_save(basename + "-nextstate.npy", np_array(next_state), allow_pickle = False)
np_save(basename + "-done.npy", np_array(done), allow_pickle = False)
self.memory.clear()
self.memory_lock.release()
## Parsing arguments
@@ -141,31 +89,8 @@ parser.add_argument("--model", type=str, help = "The path location of the PyTorc
args = vars(parser.parse_args())
## Main configuration for script
config = {}
config['seed'] = 901
config['seconds_play_per_state'] = 60
config['zoom'] = 4
config['environment_name'] = 'PongNoFrameskip-v4'
config['learning_rate'] = 1e-4
config['target_sync_tau'] = 1e-3
config['discount_rate'] = 0.99
config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
# Number of episodes for the computer to train the agent without the human seeing
config['num_sneaky_episodes'] = 20
config['replay_skip'] = 14
config['batch_size'] = 32 * (config['replay_skip'] + 1)
config['disable_cuda'] = False
config['memory_size'] = 10**4
# Prioritized vs Random Sampling
# 0 - Random sampling
# 1 - Only the highest priorities
config['prioritized_replay_sampling_priority'] = 0.6
# How important are the weights for the loss?
# 0 - Treat all losses equally
# 1 - Lower the importance of high losses
# Should ideally start from 0 and move your way to 1 to prevent overfitting
config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
from config import config
from sneaky_config import sneaky_config
# Environment name and log directory are required, so show the help message and exit if they're not provided
if args['environment_name'] is None or args['logdir'] is None:
@@ -175,7 +100,7 @@ if args['environment_name'] is None or args['logdir'] is None:
# The number of frames to skip when recording and the fps can fall back to sane defaults
if args['skip'] is None:
args['skip'] = 3
if args['fps'] is None:
if 'fps' not in args:
args['fps'] = 30
@@ -196,22 +121,20 @@ def wrap_preprocessing(env, MaxAndSkipEnv = False):
, 4)
)
## Set up environment to be recorded and preprocessed
memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
memory_lock = Lock()
env = Record(makeEnv(args['environment_name']), memory, memory_lock, args)
record_memory = []
record_lock = Lock()
env = Record(makeEnv(args['environment_name']), record_memory, args)
# Bind record_env to the current env so that we can reference log_transitions more easily later
record_env = env
# Use the native gym monitor to get video recordings
env = GymMonitor(env, args['logdir'], force=True)
# Preprocess the environment
env = wrap_preprocessing(env)
# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
# Also use MaxAndSkipEnv to speed up processing
sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
# Set seeds
rltorch.set_seed(config['seed'])
env.seed(config['seed'])
@@ -226,18 +149,31 @@ net = rn.Network(Value(state_size, action_size),
target_net = rn.TargetNetwork(net, device = device)
# Relevant components from RLTorch
actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
actor = ArgMaxSelector(net, action_size, device = device)
agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net)
# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
# Also use MaxAndSkipEnv to speed up processing
sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
# Pass all this information into the thread that will handle the game play and start
playThread = PlayClass(env, actor, memory, memory_lock, agent, sneaky_env, config)
playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
playThread.start()
# While the play thread is running, we'll periodically log transitions we've encountered
while playThread.is_alive():
playThread.join(60)
record_lock.acquire()
print("Logging....", end = " ")
record_env.log_transitions()
record_lock.release()
# Save whatever remains after the play thread has finished
record_lock.acquire()
record_env.log_transitions()
record_lock.release()
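
The bottom of the script follows a common periodic-flush pattern: the play thread owns the game, while the main thread wakes up every 60 seconds, takes record_lock, and logs whatever the Record wrapper has buffered. A generic, self-contained sketch of that pattern (names and timings here are illustrative, not from the repo):

from threading import Thread, Lock
import time

record_lock = Lock()
record_memory = []                       # stands in for the Record wrapper's buffer

def play_thread():
    for episode in range(3):
        record_memory.append(("transition", episode))   # env.step() appends as it goes
        with record_lock:                # training phases hold the lock, like sneaky_train()
            time.sleep(1)

t = Thread(target=play_thread)
t.start()

while t.is_alive():
    t.join(timeout=2)                    # wake up periodically even if the thread is still running
    with record_lock:
        print("Logging", len(record_memory), "transitions")
        record_memory.clear()

# One final flush after the play thread has finished
with record_lock:
    print("Final flush:", len(record_memory), "transitions")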

sneaky_config.py (new file): 11 lines added

@@ -0,0 +1,11 @@
import rltorch
sneaky_config = {}
sneaky_config['learning_rate'] = 1e-4
sneaky_config['target_sync_tau'] = 1e-3
sneaky_config['discount_rate'] = 0.99
sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
# Number of episodes for the computer to train the agent without the human seeing
sneaky_config['replay_skip'] = 14
sneaky_config['batch_size'] = 32 * (sneaky_config['replay_skip'] + 1)
sneaky_config['memory_size'] = 10**4
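
One reading of the replay_skip / batch_size pairing used in both config files (my interpretation of the numbers, not something rltorch enforces): sneaky_train() only calls learn() every replay_skip steps, so the batch is scaled with replay_skip to keep the number of sampled transitions per collected environment step roughly constant.

replay_skip = 14
batch_size = 32 * (replay_skip + 1)   # 480 transitions sampled per learn() call
print(batch_size / replay_skip)       # ~34 sampled transitions per environment step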