diff --git a/config.py b/config.py
index 6a1b865..30970ff 100644
--- a/config.py
+++ b/config.py
@@ -2,20 +2,20 @@ import rltorch
 
 config = {}
 config['seed'] = 901
-config['seconds_play_per_state'] = 120
 config['zoom'] = 4
 config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-4
+config['learning_rate'] = 1e-5
 config['target_sync_tau'] = 1e-3
 config['discount_rate'] = 0.99
-config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-config['num_sneaky_episodes'] = 10
-config['num_train_per_demo'] = 50 # 100 total since you have two demo training per cycle
-config['replay_skip'] = 14
-config['batch_size'] = 32 * (config['replay_skip'] + 1)
 config['disable_cuda'] = False
-config['memory_size'] = 10**4
+
+config['seconds_play_per_state'] = 120
+# 30 transitions per second for 120 seconds = 3600 transitions per turn
+config['memory_size'] = 21600 # To hold 6 demonstrations
+config['batch_size'] = 64
+config['num_train_per_demo'] = 115 # 4 looks * transitions per turn / (2 * batch_size)
+
+
 # Prioritized vs Random Sampling
 # 0 - Random sampling
 # 1 - Only the highest prioirities
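As a sanity check on the buffer sizing above, the arithmetic from the hunk's own comments works out as follows. This is a minimal sketch; the 30 transitions per second rate and the 6-demonstration target come from the comments in the patch, not from measured data.

```python
# Sketch: reproduce the buffer-sizing arithmetic from the comments in config.py.
transitions_per_second = 30          # stated in the comment, not measured here
seconds_play_per_state = 120

transitions_per_turn = transitions_per_second * seconds_play_per_state  # 3600
memory_size = 6 * transitions_per_turn                                  # 21600, holds 6 demonstrations

# The num_train_per_demo comment ("4 looks * transitions per turn / (2 * batch_size)")
# evaluates to 4 * 3600 / (2 * 64) = 112.5; the config uses 115.
num_train_per_demo = 4 * transitions_per_turn / (2 * 64)

print(transitions_per_turn, memory_size, num_train_per_demo)  # 3600 21600 112.5
```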
diff --git a/play.py b/play.py
index daea72e..c530d9e 100644
--- a/play.py
+++ b/play.py
@@ -4,10 +4,11 @@ from pygame.locals import VIDEORESIZE
 from rltorch.memory import ReplayMemory
 
 class Play:
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
         self.env = env
         self.action_selector = action_selector
         self.record_lock = record_lock
+        self.record_locked = False
         self.sneaky_agent = sneaky_agent
         self.agent = agent
         self.sneaky_env = sneaky_env
@@ -18,9 +19,8 @@ class Play:
         self.zoom = config['zoom'] if 'zoom' in config else 1
         self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None
         self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30
-        self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
-        self.memory_size = config['memory_size'] if 'memory_size' in config else 10**4
-        self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
+        self.num_sneaky_episodes = sneaky_config['num_sneaky_episodes'] if 'num_sneaky_episodes' in sneaky_config else 10
+        self.replay_skip = sneaky_config['replay_skip'] if 'replay_skip' in sneaky_config else 0
         self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
         # Initial values...
         self.video_size = (0, 0)
@@ -78,7 +78,8 @@ class Play:
                 self.pressed_keys.append(event.key)
             elif event.type == pygame.KEYUP:
                 if event.key in self.relevant_keys:
-                    self.pressed_keys.remove(event.key)
+                    if event.key in self.pressed_keys: # To make sure that program doesn't crash
+                        self.pressed_keys.remove(event.key)
         pygame.display.flip()
         self.clock.tick(self.fps)
 
@@ -145,7 +146,7 @@ class Play:
         self.clock.tick(self.fps)
 
     def sneaky_train(self):
-        self.record_lock.acquire()
+        # self.record_lock.acquire()
         # Do a standard RL algorithm process for a certain number of episodes
         for i in range(self.num_sneaky_episodes):
            print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
@@ -167,9 +168,9 @@ class Play:
                 self.sneaky_agent.learn()
 
             # Finish the previous print with the total reward obtained during the episode
-            print(total_reward)
+            print(total_reward, flush = True)
         self.sneaky_iteration += 1
-        self.record_lock.release()
+        # self.record_lock.release()
 
     def display_text(self, text):
         myfont = pygame.font.SysFont('Comic Sans MS', 50)
@@ -188,6 +189,9 @@ class Play:
            if event.type == pygame.KEYUP and event.key == pygame.K_F1:
                 self.paused = False
                 self.clear_text(obs)
+                if self.record_locked:
+                    self.record_lock.release()
+                    self.record_locked = False
            else:
                 self._process_common_pygame_events(event)
 
@@ -224,21 +228,30 @@ class Play:
             # If the environment is done after a turn, reset it so we can keep going
             if env_done:
                 episode_num += 1
-                print("Human/Computer Episode: ", episode_num)
+                print("Human/Computer Episode:", episode_num, flush = True)
                 obs = self.env.reset()
                 env_done = False
 
             if self.paused:
+                if not self.record_locked:
+                    self.record_lock.acquire()
+                    self.record_locked = True
                 self.process_pause_state(obs)
                 continue
 
             if self.state is HUMAN_PLAY:
+                if self.record_locked:
+                    self.record_lock.release()
+                    self.record_locked = False
                 prev_obs, action, reward, obs, env_done = self._human_play(obs)
 
             # The computer will train for a few episodes without showing to the user.
             # Mainly to speed up the learning process a bit
             elif self.state is SNEAKY_COMPUTER_PLAY:
-                print("Sneaky Computer Time")
+                if not self.record_locked:
+                    self.record_lock.acquire()
+                    self.record_locked = True
+                print("Sneaky Computer Time", flush = True)
                 self.display_text("Training...")
 
                 # Have the agent play a few rounds without showing to the user
@@ -248,12 +261,21 @@ class Play:
                 self._increment_state()
 
             elif self.state is TRANSITION:
+                if not self.record_locked:
+                    self.record_lock.acquire()
+                    self.record_locked = True
                 self.transition("Computers Turn! Press to Start")
 
             elif self.state is COMPUTER_PLAY:
+                if self.record_locked:
+                    self.record_lock.release()
+                    self.record_locked = False
                 prev_obs, action, reward, obs, env_done = self._computer_play(obs)
 
             elif self.state is TRANSITION2:
+                if not self.record_locked:
+                    self.record_lock.acquire()
+                    self.record_locked = True
                 self.transition("Your Turn! Press to Start")
 
             # Increment the timer if it's the human or shown computer's turn
@@ -265,7 +287,7 @@ class Play:
             self.record_lock.acquire()
             self.display_text("Demo Training...")
             print("Begin Demonstration Training")
-            print("Number of transitions in buffer: ", len(self.agent.memory))
+            print("Number of transitions in buffer: ", len(self.agent.memory), flush = True)
             for j in range(self.num_train_per_demo):
                 print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
                 self.agent.learn()
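The `record_locked` flag introduced above exists because `threading.Lock` is not reentrant: acquiring it a second time from the play thread would deadlock, and releasing it when it is not held raises `RuntimeError`. A minimal sketch of the same acquire-once/release-once idiom, using a hypothetical `GuardedLock` helper rather than the project's `Play` class:

```python
# Sketch of the acquire-once / release-once idiom used in play.py.
# GuardedLock is a hypothetical helper, not part of the project; it mirrors
# how Play pairs record_lock with the record_locked boolean.
from threading import Lock

class GuardedLock:
    def __init__(self):
        self.lock = Lock()
        self.held = False  # plays the role of self.record_locked

    def acquire_once(self):
        # Only acquire if we do not already hold the lock; a second acquire()
        # on a non-reentrant Lock from the same thread would block forever.
        if not self.held:
            self.lock.acquire()
            self.held = True

    def release_once(self):
        # Only release if we actually hold the lock; releasing an unheld
        # Lock raises RuntimeError.
        if self.held:
            self.lock.release()
            self.held = False
```

A `threading.RLock` would also tolerate repeated acquires, but it requires one release per acquire; the flag means a single release at the HUMAN_PLAY or COMPUTER_PLAY hand-off is always enough, no matter how many pause/transition states were passed through while holding the lock.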
diff --git a/play_env.py b/play_env.py
index f06f8ee..fe1a3ce 100644
--- a/play_env.py
+++ b/play_env.py
@@ -37,17 +37,18 @@ from networks import Value
 ## Play Related Classes
 #
 class PlayClass(Thread):
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
         super(PlayClass, self).__init__()
-        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
+        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
 
     def run(self):
         self.play.start()
 
 class Record(GymWrapper):
-    def __init__(self, env, memory, args):
+    def __init__(self, env, memory, lock, args):
         GymWrapper.__init__(self, env)
         self.memory = memory
+        self.lock = lock # Lock for memory access
         self.skipframes = args['skip']
         self.environment_name = args['environment_name']
         self.logdir = args['logdir']
@@ -62,14 +63,16 @@ class Record(GymWrapper):
         self.current_i += 1
         # Don't add to memory until a certain number of frames is reached
         if self.current_i % self.skipframes == 0:
+            self.lock.acquire()
             self.memory.append((state, action, reward, next_state, done))
+            self.lock.release()
             self.current_i = 0
         return next_state, reward, done, info
 
     def log_transitions(self):
         if len(self.memory) > 0:
             basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s"))
-            print("Base Filename: ", basename)
+            print("Base Filename: ", basename, flush = True)
             state, action, reward, next_state, done = zip(*self.memory)
             np_save(basename + "-state.npy", np_array(state), allow_pickle = False)
             np_save(basename + "-action.npy", np_array(action), allow_pickle = False)
@@ -124,7 +127,7 @@ def wrap_preprocessing(env, MaxAndSkipEnv = False):
 ## Set up environment to be recorded and preprocessed
 record_memory = []
 record_lock = Lock()
-env = Record(makeEnv(args['environment_name']), record_memory, args)
+env = Record(makeEnv(args['environment_name']), record_memory, record_lock, args)
 
 # Bind record_env to current env so that we can reference log_transitions easier later
 record_env = env
@@ -162,7 +165,7 @@ sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon
 sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
 
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
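`Record.step()` now appends to the shared `record_memory` list only while holding the same `record_lock` the play thread uses, so appends cannot interleave with the periodic transition logging. A minimal sketch of that guarded-append pattern (standalone names, not the project's module layout), written with a `with` block instead of explicit `acquire()`/`release()`:

```python
# Sketch of the guarded append in Record.step(), using the lock as a context manager.
from threading import Lock

record_lock = Lock()
record_memory = []

def record_transition(state, action, reward, next_state, done):
    # Appending under the lock keeps another thread from reading or logging
    # the buffer while it is being modified.
    with record_lock:
        record_memory.append((state, action, reward, next_state, done))
```

Using a context manager also guarantees the lock is released if `append()` raises, which the bare `acquire()`/`release()` pair in the hunk does not.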
diff --git a/sneaky_config.py b/sneaky_config.py
index f72b2af..a83a3f0 100644
--- a/sneaky_config.py
+++ b/sneaky_config.py
@@ -1,11 +1,13 @@
 import rltorch
 
 sneaky_config = {}
-sneaky_config['learning_rate'] = 1e-4
+sneaky_config['learning_rate'] = 1e-5
 sneaky_config['target_sync_tau'] = 1e-3
 sneaky_config['discount_rate'] = 0.99
-sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
 # Number of episodes for the computer to train the agent without the human seeing
-sneaky_config['replay_skip'] = 14
-sneaky_config['batch_size'] = 32 * (sneaky_config['replay_skip'] + 1)
-sneaky_config['memory_size'] = 10**4
+sneaky_config['replay_skip'] = 29 # Gradient descent every second
+sneaky_config['batch_size'] = 16 * (sneaky_config['replay_skip'] + 1) # Calculated based on memory constraints
+sneaky_config['memory_size'] = 2000 # batch_size * 2 looks = 66 seconds of gameplay
+# Number of episodes for the computer to train the agent without the human seeing
+sneaky_config['num_sneaky_episodes'] = 10
\ No newline at end of file
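The sneaky-training hyperparameters encode their own arithmetic in the comments; a quick check of it below (assuming the same ~30 transitions per second rate referenced in config.py). The exploration floor also drops from 0.1 to 0.02, so the hidden-training agent acts almost greedily by the end of the schedule.

```python
# Sketch: the sneaky-training arithmetic implied by the comments above.
transitions_per_second = 30           # from the config.py comment, not measured here
replay_skip = 29                      # learn() every 30th transition, i.e. roughly once per second
batch_size = 16 * (replay_skip + 1)   # 480 transitions sampled per learn() call
memory_size = 2000                    # about 2000 / 30 ≈ 66 seconds of gameplay retained

print(batch_size, memory_size / transitions_per_second)  # 480 66.66...
```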