diff --git a/play.py b/play.py
index 6441f93..a26e254 100644
--- a/play.py
+++ b/play.py
@@ -3,6 +3,7 @@ import pygame
 import sys
 import time
 import matplotlib
+import rltorch.memory as M
 try:
     matplotlib.use('GTK3Agg')
     import matplotlib.pyplot as plt
@@ -17,7 +18,7 @@ from pygame.locals import HWSURFACE, DOUBLEBUF, RESIZABLE, VIDEORESIZE
 from threading import Thread, Event, Timer
 
 class Play:
-    def __init__(self, env, action_selector, memory, agent, transpose = True, fps = 30, zoom = None, keys_to_action = None):
+    def __init__(self, env, action_selector, memory, agent, sneaky_env, transpose = True, fps = 30, zoom = None, keys_to_action = None):
         self.env = env
         self.action_selector = action_selector
         self.transpose = transpose
@@ -34,7 +35,7 @@ class Play:
         self.paused = False
         self.memory = memory
         self.agent = agent
-        print("FPS ", 30)
+        self.sneaky_env = sneaky_env
 
     def _display_arr(self, obs, screen, arr, video_size):
         if obs is not None:
@@ -120,7 +121,7 @@ class Play:
             self.relevant_keys = set(sum(map(list, self.keys_to_action.keys()),[]))
 
     def _increment_state(self):
-        self.state = (self.state + 1) % 4
+        self.state = (self.state + 1) % 5
 
     def pause(self, text = ""):
         self.paused = True
@@ -145,6 +146,31 @@ class Play:
             pygame.display.flip()
             self.clock.tick(self.fps)
 
+
+    def sneaky_train(self):
+        # Back up the current memory and train on a temporary buffer
+        backup_memory = self.memory
+        self.memory = M.ReplayMemory(capacity = 2000) # Another configurable parameter
+        EPISODES = 30 # Make this configurable
+        replay_skip = 4 # Make this configurable
+        for _ in range(EPISODES):
+            prev_obs = self.sneaky_env.reset()
+            done = False
+            step = 0
+            while not done:
+                action = self.action_selector.act(prev_obs)
+                obs, reward, done, _ = self.sneaky_env.step(action)
+                self.memory.append(prev_obs, action, reward, obs, done)
+                prev_obs = obs
+                step += 1
+                if step % replay_skip == 0:
+                    self.agent.learn()
+        self.memory = backup_memory
+        # It would be cool if, instead of throwing away all this new data, we kept just a sample of it
+        # Not sure if I want all of it, because then it'll drown out the expert demonstration data
+
+
+
     def start(self):
         """Allows one to play the game using keyboard.
         To simply play the game use:
@@ -202,8 +228,12 @@ class Play:
         self.clock = pygame.time.Clock()
 
         # States
-        COMPUTER_PLAY = 0
-        HUMAN_PLAY = 2
+        HUMAN_PLAY = 0
+        SNEAKY_COMPUTER_PLAY = 1
+        TRANSITION = 2
+        COMPUTER_PLAY = 3
+        TRANSITION2 = 4
+
         env_done = True
         prev_obs = None
 
@@ -214,28 +244,31 @@
             if env_done:
                 obs = self.env.reset()
                 env_done = False
-
-            if self.state == 0:
-                prev_obs, action, reward, obs, env_done = self._computer_play(obs)
-            elif self.state == 1:
-                self.pause("Your Turn! Press to Start")
-            elif self.state == 2:
+            if self.state is HUMAN_PLAY:
                 prev_obs, action, reward, obs, env_done = self._human_play(obs)
-            elif self.state == 3:
+            elif self.state is SNEAKY_COMPUTER_PLAY:
+                myfont = pygame.font.SysFont('Comic Sans MS', 50)
+                textsurface = myfont.render("Training....", False, (0, 0, 0))
+                self.screen.blit(textsurface,(0,0))
+                self.sneaky_train()
+                self._increment_state()
+            elif self.state is TRANSITION:
                 self.pause("Computers Turn! Press to Start")
+            elif self.state is COMPUTER_PLAY:
+                prev_obs, action, reward, obs, env_done = self._computer_play(obs)
+            elif self.state is TRANSITION2:
+                self.pause("Your Turn! Press to Start")
 
             if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
                 self.memory.append(prev_obs, action, reward, obs, env_done)
-
-
             if not self.paused:
                 i += 1
-                if i % (self.fps * 30) == 0: # Every 30 seconds...
-                    print("TRAINING...")
+                # Every 30 seconds...
+                if i % (self.fps * 30) == 0:
+                    print("Training...")
                     self.agent.learn()
                     print("PAUSING...")
                     self._increment_state()
                     i = 0
 
-
         pygame.quit()
diff --git a/play_env.py b/play_env.py
index da6391f..7e53911 100644
--- a/play_env.py
+++ b/play_env.py
@@ -17,11 +17,9 @@ import argparse
 import sys
 import numpy as np
 
-
-## CURRRENT ISSUE: MaxSkipEnv applies to the human player as well, which makes for an awkward gaming experience
-# What are your thoughts? Training is different if expert isn't forced with the same constraint
-# At some point I need to introduce learning
-
+#
+## Networks
+#
 class Value(nn.Module):
     def __init__(self, state_size, action_size):
         super(Value, self).__init__()
@@ -69,16 +67,18 @@ class Value(nn.Module):
 
         return x
 
-
+#
+## Play Related Classes
+#
 Transition = namedtuple('Transition',
     ('state', 'action', 'reward', 'next_state', 'done'))
 
 class PlayClass(threading.Thread):
-    def __init__(self, env, action_selector, memory, agent, fps = 60):
+    def __init__(self, env, action_selector, memory, agent, sneaky_env, fps = 60):
         super(PlayClass, self).__init__()
         self.env = env
         self.fps = fps
-        self.play = play.Play(self.env, action_selector, memory, agent, fps = fps, zoom = 4)
+        self.play = play.Play(self.env, action_selector, memory, agent, sneaky_env, fps = fps, zoom = 4)
 
     def run(self):
         self.play.start()
@@ -162,19 +162,15 @@ if args['skip'] is None:
 if args['fps'] is None:
     args['fps'] = 30
 
-## Starting the game
-memory = []
-env = Record(gym.make(args['environment_name']), memory, args, skipframes = args['skip'])
-record_env = env
-env = gym.wrappers.Monitor(env, args['logdir'], force=True)
-env = E.ClippedRewardsWrapper(
+def wrap_preprocessing(env):
+    return E.ClippedRewardsWrapper(
   E.FrameStack(
     E.TorchWrap(
       E.ProcessFrame84(
         E.FireResetEnv(
 #         E.MaxAndSkipEnv(
           E.NoopResetEnv(
-            E.EpisodicLifeEnv(gym.make(config['environment_name']))
+            E.EpisodicLifeEnv(env)
           , noop_max = 30)
 #         , skip=4)
         )
@@ -183,6 +179,15 @@ env = E.ClippedRewardsWrapper(
   4)
 )
 
+## Starting the game
+memory = []
+env = Record(gym.make(args['environment_name']), memory, args, skipframes = args['skip'])
+record_env = env
+env = gym.wrappers.Monitor(env, args['logdir'], force=True)
+env = wrap_preprocessing(env)
+
+sneaky_env = wrap_preprocessing(gym.make(args['environment_name']))
+
 rltorch.set_seed(config['seed'])
 
 device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
@@ -199,7 +204,7 @@ agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net)
 
 env.seed(config['seed'])
 
-playThread = PlayClass(env, actor, memory, agent, args['fps'])
+playThread = PlayClass(env, actor, memory, agent, sneaky_env, fps = args['fps'])
 playThread.start()
 
 ## Logging portion