Updated configs and fixed threading issues

This commit is contained in:
Brandon Rozek 2019-11-05 07:09:49 -05:00
parent 32862e4d79
commit 744656aaa9
4 changed files with 58 additions and 31 deletions

View file

@ -2,20 +2,20 @@ import rltorch
config = {} config = {}
config['seed'] = 901 config['seed'] = 901
config['seconds_play_per_state'] = 120
config['zoom'] = 4 config['zoom'] = 4
config['environment_name'] = 'PongNoFrameskip-v4' config['environment_name'] = 'PongNoFrameskip-v4'
config['learning_rate'] = 1e-4 config['learning_rate'] = 1e-5
config['target_sync_tau'] = 1e-3 config['target_sync_tau'] = 1e-3
config['discount_rate'] = 0.99 config['discount_rate'] = 0.99
config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
# Number of episodes for the computer to train the agent without the human seeing
config['num_sneaky_episodes'] = 10
config['num_train_per_demo'] = 50 # 100 total since you have two demo training per cycle
config['replay_skip'] = 14
config['batch_size'] = 32 * (config['replay_skip'] + 1)
config['disable_cuda'] = False config['disable_cuda'] = False
config['memory_size'] = 10**4
config['seconds_play_per_state'] = 120
# 30 transitions per second for 120 seconds = 3600 transitions per turn
config['memory_size'] = 21600 # To hold 6 demonstrations
config['batch_size'] = 64
config['num_train_per_demo'] = 115 # 4 looks * transitions per turn / (2 * batch_size)
# Prioritized vs Random Sampling # Prioritized vs Random Sampling
# 0 - Random sampling # 0 - Random sampling
# 1 - Only the highest priorities # 1 - Only the highest priorities

42
play.py
View file

@ -4,10 +4,11 @@ from pygame.locals import VIDEORESIZE
from rltorch.memory import ReplayMemory from rltorch.memory import ReplayMemory
class Play: class Play:
def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config): def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
self.env = env self.env = env
self.action_selector = action_selector self.action_selector = action_selector
self.record_lock = record_lock self.record_lock = record_lock
self.record_locked = False
self.sneaky_agent = sneaky_agent self.sneaky_agent = sneaky_agent
self.agent = agent self.agent = agent
self.sneaky_env = sneaky_env self.sneaky_env = sneaky_env
@ -18,9 +19,8 @@ class Play:
self.zoom = config['zoom'] if 'zoom' in config else 1 self.zoom = config['zoom'] if 'zoom' in config else 1
self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None
self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30 self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30
self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10 self.num_sneaky_episodes = sneaky_config['num_sneaky_episodes'] if 'num_sneaky_episodes' in sneaky_config else 10
self.memory_size = config['memory_size'] if 'memory_size' in config else 10**4 self.replay_skip = sneaky_config['replay_skip'] if 'replay_skip' in sneaky_config else 0
self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1 self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
# Initial values... # Initial values...
self.video_size = (0, 0) self.video_size = (0, 0)
@ -78,6 +78,7 @@ class Play:
self.pressed_keys.append(event.key) self.pressed_keys.append(event.key)
elif event.type == pygame.KEYUP: elif event.type == pygame.KEYUP:
if event.key in self.relevant_keys: if event.key in self.relevant_keys:
if event.key in self.pressed_keys: # Guard: remove() raises if the key was never recorded as pressed
self.pressed_keys.remove(event.key) self.pressed_keys.remove(event.key)
pygame.display.flip() pygame.display.flip()
@ -145,7 +146,7 @@ class Play:
self.clock.tick(self.fps) self.clock.tick(self.fps)
def sneaky_train(self): def sneaky_train(self):
self.record_lock.acquire() # self.record_lock.acquire()
# Do a standard RL algorithm process for a certain number of episodes # Do a standard RL algorithm process for a certain number of episodes
for i in range(self.num_sneaky_episodes): for i in range(self.num_sneaky_episodes):
print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "") print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
@ -167,9 +168,9 @@ class Play:
self.sneaky_agent.learn() self.sneaky_agent.learn()
# Finish the previous print with the total reward obtained during the episode # Finish the previous print with the total reward obtained during the episode
print(total_reward) print(total_reward, flush = True)
self.sneaky_iteration += 1 self.sneaky_iteration += 1
self.record_lock.release() # self.record_lock.release()
def display_text(self, text): def display_text(self, text):
myfont = pygame.font.SysFont('Comic Sans MS', 50) myfont = pygame.font.SysFont('Comic Sans MS', 50)
@ -188,6 +189,9 @@ class Play:
if event.type == pygame.KEYUP and event.key == pygame.K_F1: if event.type == pygame.KEYUP and event.key == pygame.K_F1:
self.paused = False self.paused = False
self.clear_text(obs) self.clear_text(obs)
if self.record_locked:
self.record_lock.release()
self.record_locked = False
else: else:
self._process_common_pygame_events(event) self._process_common_pygame_events(event)
@ -224,21 +228,30 @@ class Play:
# If the environment is done after a turn, reset it so we can keep going # If the environment is done after a turn, reset it so we can keep going
if env_done: if env_done:
episode_num += 1 episode_num += 1
print("Human/Computer Episode: ", episode_num) print("Human/Computer Episode:", episode_num, flush = True)
obs = self.env.reset() obs = self.env.reset()
env_done = False env_done = False
if self.paused: if self.paused:
if not self.record_locked:
self.record_lock.acquire()
self.record_locked = True
self.process_pause_state(obs) self.process_pause_state(obs)
continue continue
if self.state is HUMAN_PLAY: if self.state is HUMAN_PLAY:
if self.record_locked:
self.record_lock.release()
self.record_locked = False
prev_obs, action, reward, obs, env_done = self._human_play(obs) prev_obs, action, reward, obs, env_done = self._human_play(obs)
# The computer will train for a few episodes without showing to the user. # The computer will train for a few episodes without showing to the user.
# Mainly to speed up the learning process a bit # Mainly to speed up the learning process a bit
elif self.state is SNEAKY_COMPUTER_PLAY: elif self.state is SNEAKY_COMPUTER_PLAY:
print("Sneaky Computer Time") if not self.record_locked:
self.record_lock.acquire()
self.record_locked = True
print("Sneaky Computer Time", flush = True)
self.display_text("Training...") self.display_text("Training...")
# Have the agent play a few rounds without showing to the user # Have the agent play a few rounds without showing to the user
@ -248,12 +261,21 @@ class Play:
self._increment_state() self._increment_state()
elif self.state is TRANSITION: elif self.state is TRANSITION:
if not self.record_locked:
self.record_lock.acquire()
self.record_locked = True
self.transition("Computers Turn! Press <Space> to Start") self.transition("Computers Turn! Press <Space> to Start")
elif self.state is COMPUTER_PLAY: elif self.state is COMPUTER_PLAY:
if self.record_locked:
self.record_lock.release()
self.record_locked = False
prev_obs, action, reward, obs, env_done = self._computer_play(obs) prev_obs, action, reward, obs, env_done = self._computer_play(obs)
elif self.state is TRANSITION2: elif self.state is TRANSITION2:
if not self.record_locked:
self.record_lock.acquire()
self.record_locked = True
self.transition("Your Turn! Press <Space> to Start") self.transition("Your Turn! Press <Space> to Start")
# Increment the timer if it's the human or shown computer's turn # Increment the timer if it's the human or shown computer's turn
@ -265,7 +287,7 @@ class Play:
self.record_lock.acquire() self.record_lock.acquire()
self.display_text("Demo Training...") self.display_text("Demo Training...")
print("Begin Demonstration Training") print("Begin Demonstration Training")
print("Number of transitions in buffer: ", len(self.agent.memory)) print("Number of transitions in buffer: ", len(self.agent.memory), flush = True)
for j in range(self.num_train_per_demo): for j in range(self.num_train_per_demo):
print("Iteration %d / %d" % (j + 1, self.num_train_per_demo)) print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
self.agent.learn() self.agent.learn()

View file

@ -37,17 +37,18 @@ from networks import Value
## Play Related Classes ## Play Related Classes
# #
class PlayClass(Thread): class PlayClass(Thread):
def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config): def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
super(PlayClass, self).__init__() super(PlayClass, self).__init__()
self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config) self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
def run(self): def run(self):
self.play.start() self.play.start()
class Record(GymWrapper): class Record(GymWrapper):
def __init__(self, env, memory, args): def __init__(self, env, memory, lock, args):
GymWrapper.__init__(self, env) GymWrapper.__init__(self, env)
self.memory = memory self.memory = memory
self.lock = lock # Lock for memory access
self.skipframes = args['skip'] self.skipframes = args['skip']
self.environment_name = args['environment_name'] self.environment_name = args['environment_name']
self.logdir = args['logdir'] self.logdir = args['logdir']
@ -62,14 +63,16 @@ class Record(GymWrapper):
self.current_i += 1 self.current_i += 1
# Don't add to memory until a certain number of frames is reached # Don't add to memory until a certain number of frames is reached
if self.current_i % self.skipframes == 0: if self.current_i % self.skipframes == 0:
self.lock.acquire()
self.memory.append((state, action, reward, next_state, done)) self.memory.append((state, action, reward, next_state, done))
self.lock.release()
self.current_i = 0 self.current_i = 0
return next_state, reward, done, info return next_state, reward, done, info
def log_transitions(self): def log_transitions(self):
if len(self.memory) > 0: if len(self.memory) > 0:
basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s")) basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s"))
print("Base Filename: ", basename) print("Base Filename: ", basename, flush = True)
state, action, reward, next_state, done = zip(*self.memory) state, action, reward, next_state, done = zip(*self.memory)
np_save(basename + "-state.npy", np_array(state), allow_pickle = False) np_save(basename + "-state.npy", np_array(state), allow_pickle = False)
np_save(basename + "-action.npy", np_array(action), allow_pickle = False) np_save(basename + "-action.npy", np_array(action), allow_pickle = False)
@ -124,7 +127,7 @@ def wrap_preprocessing(env, MaxAndSkipEnv = False):
## Set up environment to be recorded and preprocessed ## Set up environment to be recorded and preprocessed
record_memory = [] record_memory = []
record_lock = Lock() record_lock = Lock()
env = Record(makeEnv(args['environment_name']), record_memory, args) env = Record(makeEnv(args['environment_name']), record_memory, record_lock, args)
# Bind record_env to current env so that we can reference log_transitions easier later # Bind record_env to current env so that we can reference log_transitions easier later
record_env = env record_env = env
@ -162,7 +165,7 @@ sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon
sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net) sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
# Pass all this information into the thread that will handle the game play and start # Pass all this information into the thread that will handle the game play and start
playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config) playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
playThread.start() playThread.start()
# While the play thread is running, we'll periodically log transitions we've encountered # While the play thread is running, we'll periodically log transitions we've encountered

View file

@ -1,11 +1,13 @@
import rltorch import rltorch
sneaky_config = {} sneaky_config = {}
sneaky_config['learning_rate'] = 1e-4 sneaky_config['learning_rate'] = 1e-5
sneaky_config['target_sync_tau'] = 1e-3 sneaky_config['target_sync_tau'] = 1e-3
sneaky_config['discount_rate'] = 0.99 sneaky_config['discount_rate'] = 0.99
sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5) sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
# Number of episodes for the computer to train the agent without the human seeing # Number of episodes for the computer to train the agent without the human seeing
sneaky_config['replay_skip'] = 14 sneaky_config['replay_skip'] = 29 # Gradient descent every second
sneaky_config['batch_size'] = 32 * (sneaky_config['replay_skip'] + 1) sneaky_config['batch_size'] = 16 * (sneaky_config['replay_skip'] + 1) # Calculated based on memory constraints
sneaky_config['memory_size'] = 10**4 sneaky_config['memory_size'] = 2000 # batch_size * 2 looks = 66 seconds of gameplay
# Number of episodes for the computer to train the agent without the human seeing
sneaky_config['num_sneaky_episodes'] = 10