Updated configs and fixed threading issues
parent 32862e4d79
commit 744656aaa9

4 changed files with 58 additions and 31 deletions
config.py (18 changes)

@@ -2,20 +2,20 @@ import rltorch
 
 config = {}
 config['seed'] = 901
-config['seconds_play_per_state'] = 120
 config['zoom'] = 4
 config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-4
+config['learning_rate'] = 1e-5
 config['target_sync_tau'] = 1e-3
 config['discount_rate'] = 0.99
-config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-config['num_sneaky_episodes'] = 10
-config['num_train_per_demo'] = 50 # 100 total since you have two demo training per cycle
-config['replay_skip'] = 14
-config['batch_size'] = 32 * (config['replay_skip'] + 1)
 config['disable_cuda'] = False
-config['memory_size'] = 10**4
+config['seconds_play_per_state'] = 120
+# 30 transitions per second for 120 seconds = 3600 transitions per turn
+config['memory_size'] = 21600 # To hold 6 demonstrations
+config['batch_size'] = 64
+config['num_train_per_demo'] = 115 # 4 looks * transitions per turn / (2 * batch_size)
 
 # Prioritized vs Random Sampling
 # 0 - Random sampling
 # 1 - Only the highest prioirities
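The comments on the new config.py values carry some arithmetic. A minimal sketch that re-derives them is below; it assumes the 30 transitions-per-second rate stated in the comment, and the variable names are illustrative rather than part of the repository.

# Re-deriving the hard-coded values in config.py (illustrative only).
transitions_per_second = 30
seconds_play_per_state = 120
transitions_per_turn = transitions_per_second * seconds_play_per_state  # 3600, matches the comment

demos_to_hold = 6
memory_size = demos_to_hold * transitions_per_turn  # 21600, matches config['memory_size']

batch_size = 64
looks = 4
num_train_per_demo = looks * transitions_per_turn / (2 * batch_size)  # 112.5 by this formula; the config sets 115

print(transitions_per_turn, memory_size, num_train_per_demo)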
play.py (44 changes)

@@ -4,10 +4,11 @@ from pygame.locals import VIDEORESIZE
 from rltorch.memory import ReplayMemory
 
 class Play:
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
         self.env = env
         self.action_selector = action_selector
         self.record_lock = record_lock
+        self.record_locked = False
         self.sneaky_agent = sneaky_agent
         self.agent = agent
         self.sneaky_env = sneaky_env
@@ -18,9 +19,8 @@ class Play:
         self.zoom = config['zoom'] if 'zoom' in config else 1
         self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None
         self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30
-        self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
-        self.memory_size = config['memory_size'] if 'memory_size' in config else 10**4
-        self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
+        self.num_sneaky_episodes = sneaky_config['num_sneaky_episodes'] if 'num_sneaky_episodes' in sneaky_config else 10
+        self.replay_skip = sneaky_config['replay_skip'] if 'replay_skip' in sneaky_config else 0
         self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
         # Initial values...
         self.video_size = (0, 0)
@@ -78,7 +78,8 @@ class Play:
                 self.pressed_keys.append(event.key)
             elif event.type == pygame.KEYUP:
                 if event.key in self.relevant_keys:
-                    self.pressed_keys.remove(event.key)
+                    if event.key in self.pressed_keys: # To make sure that program doesn't crash
+                        self.pressed_keys.remove(event.key)
 
         pygame.display.flip()
         self.clock.tick(self.fps)
@@ -145,7 +146,7 @@ class Play:
         self.clock.tick(self.fps)
 
     def sneaky_train(self):
-        self.record_lock.acquire()
+        # self.record_lock.acquire()
        # Do a standard RL algorithm process for a certain number of episodes
        for i in range(self.num_sneaky_episodes):
            print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
@@ -167,9 +168,9 @@ class Play:
                self.sneaky_agent.learn()
 
            # Finish the previous print with the total reward obtained during the episode
-           print(total_reward)
+           print(total_reward, flush = True)
        self.sneaky_iteration += 1
-        self.record_lock.release()
+        # self.record_lock.release()
 
    def display_text(self, text):
        myfont = pygame.font.SysFont('Comic Sans MS', 50)
@@ -188,6 +189,9 @@ class Play:
            if event.type == pygame.KEYUP and event.key == pygame.K_F1:
                self.paused = False
                self.clear_text(obs)
+               if self.record_locked:
+                   self.record_lock.release()
+                   self.record_locked = False
            else:
                self._process_common_pygame_events(event)
 
@@ -224,21 +228,30 @@ class Play:
            # If the environment is done after a turn, reset it so we can keep going
            if env_done:
                episode_num += 1
-               print("Human/Computer Episode: ", episode_num)
+               print("Human/Computer Episode:", episode_num, flush = True)
                obs = self.env.reset()
                env_done = False
 
            if self.paused:
+               if not self.record_locked:
+                   self.record_lock.acquire()
+                   self.record_locked = True
                self.process_pause_state(obs)
                continue
 
            if self.state is HUMAN_PLAY:
+               if self.record_locked:
+                   self.record_lock.release()
+                   self.record_locked = False
                prev_obs, action, reward, obs, env_done = self._human_play(obs)
 
            # The computer will train for a few episodes without showing to the user.
            # Mainly to speed up the learning process a bit
            elif self.state is SNEAKY_COMPUTER_PLAY:
-               print("Sneaky Computer Time")
+               if not self.record_locked:
+                   self.record_lock.acquire()
+                   self.record_locked = True
+               print("Sneaky Computer Time", flush = True)
                self.display_text("Training...")
 
                # Have the agent play a few rounds without showing to the user
@@ -248,12 +261,21 @@ class Play:
                self._increment_state()
 
            elif self.state is TRANSITION:
+               if not self.record_locked:
+                   self.record_lock.acquire()
+                   self.record_locked = True
                self.transition("Computers Turn! Press <Space> to Start")
 
            elif self.state is COMPUTER_PLAY:
+               if self.record_locked:
+                   self.record_lock.release()
+                   self.record_locked = False
                prev_obs, action, reward, obs, env_done = self._computer_play(obs)
 
            elif self.state is TRANSITION2:
+               if not self.record_locked:
+                   self.record_lock.acquire()
+                   self.record_locked = True
                self.transition("Your Turn! Press <Space> to Start")
 
            # Increment the timer if it's the human or shown computer's turn
@@ -265,7 +287,7 @@ class Play:
                self.record_lock.acquire()
                self.display_text("Demo Training...")
                print("Begin Demonstration Training")
-               print("Number of transitions in buffer: ", len(self.agent.memory))
+               print("Number of transitions in buffer: ", len(self.agent.memory), flush = True)
                for j in range(self.num_train_per_demo):
                    print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
                    self.agent.learn()
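The threading fix above replaces unconditional acquire/release calls with a record_locked flag, so the play thread acquires record_lock at most once per pause or computer turn and never releases a lock it does not hold. A minimal sketch of that guard pattern is below; the class and method names are hypothetical and only illustrate the idea used in the diff.

import threading

class LockGuard:
    """Acquire a shared lock at most once and release it only if we actually hold it."""
    def __init__(self, lock):
        self.lock = lock
        self.locked = False  # mirrors Play.record_locked in the diff above

    def hold(self):
        # Safe to call every frame: only the first call actually acquires the lock.
        if not self.locked:
            self.lock.acquire()
            self.locked = True

    def drop(self):
        # Safe to call even if we never acquired: avoids "release unlocked lock" errors.
        if self.locked:
            self.lock.release()
            self.locked = False

record_lock = threading.Lock()
guard = LockGuard(record_lock)
guard.hold()   # e.g. entering a paused or transition state
guard.hold()   # no-op on later frames
guard.drop()   # e.g. handing control back to the human player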
play_env.py (15 changes)

@@ -37,17 +37,18 @@ from networks import Value
 ## Play Related Classes
 #
 class PlayClass(Thread):
-    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
         super(PlayClass, self).__init__()
-        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
+        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
 
     def run(self):
         self.play.start()
 
 class Record(GymWrapper):
-    def __init__(self, env, memory, args):
+    def __init__(self, env, memory, lock, args):
         GymWrapper.__init__(self, env)
         self.memory = memory
+        self.lock = lock # Lock for memory access
         self.skipframes = args['skip']
         self.environment_name = args['environment_name']
         self.logdir = args['logdir']
@@ -62,14 +63,16 @@ class Record(GymWrapper):
         self.current_i += 1
         # Don't add to memory until a certain number of frames is reached
         if self.current_i % self.skipframes == 0:
+            self.lock.acquire()
             self.memory.append((state, action, reward, next_state, done))
+            self.lock.release()
             self.current_i = 0
         return next_state, reward, done, info
 
     def log_transitions(self):
         if len(self.memory) > 0:
             basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s"))
-            print("Base Filename: ", basename)
+            print("Base Filename: ", basename, flush = True)
             state, action, reward, next_state, done = zip(*self.memory)
             np_save(basename + "-state.npy", np_array(state), allow_pickle = False)
             np_save(basename + "-action.npy", np_array(action), allow_pickle = False)
@@ -124,7 +127,7 @@ def wrap_preprocessing(env, MaxAndSkipEnv = False):
 ## Set up environment to be recorded and preprocessed
 record_memory = []
 record_lock = Lock()
-env = Record(makeEnv(args['environment_name']), record_memory, args)
+env = Record(makeEnv(args['environment_name']), record_memory, record_lock, args)
 
 # Bind record_env to current env so that we can reference log_transitions easier later
 record_env = env
@@ -162,7 +165,7 @@ sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon
 sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
 
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
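Record now receives the shared lock and brackets memory.append with acquire/release, so the recording wrapper and the training thread cannot mutate the buffer at the same time. Below is a small sketch of the same idea using a context manager; the SharedBuffer class is a hypothetical stand-in, not the project's class, and `with lock:` also guarantees the release if append raises.

import threading

class SharedBuffer:
    """Append-only transition buffer shared between an env wrapper and a trainer thread."""
    def __init__(self):
        self.lock = threading.Lock()
        self.items = []

    def append(self, transition):
        # Equivalent to the acquire()/release() pair in Record.step, but exception-safe.
        with self.lock:
            self.items.append(transition)

buffer = SharedBuffer()
buffer.append((None, 0, 0.0, None, False))  # (state, action, reward, next_state, done)
print(len(buffer.items))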
Fourth changed file: 12 changes (filename not shown above; it defines sneaky_config)

@@ -1,11 +1,13 @@
 import rltorch
 
 sneaky_config = {}
-sneaky_config['learning_rate'] = 1e-4
+sneaky_config['learning_rate'] = 1e-5
 sneaky_config['target_sync_tau'] = 1e-3
 sneaky_config['discount_rate'] = 0.99
-sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
 # Number of episodes for the computer to train the agent without the human seeing
-sneaky_config['replay_skip'] = 14
-sneaky_config['batch_size'] = 32 * (sneaky_config['replay_skip'] + 1)
-sneaky_config['memory_size'] = 10**4
+sneaky_config['replay_skip'] = 29 # Gradient descent every second
+sneaky_config['batch_size'] = 16 * (sneaky_config['replay_skip'] + 1) # Calculated based on memory constraints
+sneaky_config['memory_size'] = 2000 # batch_size * 2 looks = 66 seconds of gameplay
+# Number of episodes for the computer to train the agent without the human seeing
+sneaky_config['num_sneaky_episodes'] = 10
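As a quick check on the new sneaky-training numbers, the sketch below redoes the arithmetic implied by the comments; it assumes the 30 frames-per-second rate from the config.py comment and is illustrative only.

# Arithmetic behind the new sneaky_config values (illustrative only).
frames_per_second = 30
replay_skip = 29
batch_size = 16 * (replay_skip + 1)                         # 480 samples per learning step
steps_per_second = frames_per_second / (replay_skip + 1)    # 1.0, i.e. "Gradient descent every second"

memory_size = 2000
seconds_of_gameplay = memory_size / frames_per_second       # ~66.7 seconds held in the buffer
print(batch_size, steps_per_second, round(seconds_of_gameplay, 1))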