Changes from honors thesis
This commit is contained in:
parent
a44b981e55
commit
66496fe0d8
4 changed files with 40 additions and 54 deletions
18
config.py
18
config.py
|
@ -4,24 +4,32 @@ config = {}
|
||||||
config['seed'] = 901
|
config['seed'] = 901
|
||||||
config['zoom'] = 4
|
config['zoom'] = 4
|
||||||
config['environment_name'] = 'PongNoFrameskip-v4'
|
config['environment_name'] = 'PongNoFrameskip-v4'
|
||||||
config['learning_rate'] = 1e-5
|
config['learning_rate'] = 1e-4
|
||||||
config['target_sync_tau'] = 1e-3
|
config['target_sync_tau'] = 1e-3
|
||||||
config['discount_rate'] = 0.99
|
config['discount_rate'] = 0.99
|
||||||
|
config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
|
||||||
|
config['replay_skip'] = 4
|
||||||
|
config['batch_size'] = 32 * (config['replay_skip'] + 1)
|
||||||
|
config['num_sneaky_episodes'] = 10 # per loop
|
||||||
config['disable_cuda'] = False
|
config['disable_cuda'] = False
|
||||||
|
|
||||||
config['seconds_play_per_state'] = 120
|
config['seconds_play_per_state'] = 120
|
||||||
|
config['seconds_play_per_state'] = 5
|
||||||
# 30 transitions per second for 120 seconds = 3600 transitions per turn
|
# 30 transitions per second for 120 seconds = 3600 transitions per turn
|
||||||
config['memory_size'] = 21600 # To hold 6 demonstrations
|
config['memory_size'] = 86400
|
||||||
config['batch_size'] = 64
|
config['dqfd_demo_loss_weight'] = 0.01
|
||||||
config['num_train_per_demo'] = 115 # 4 looks * transitions per turn / (2 * batch_size)
|
config['dqfd_td_loss_weight'] = 1.
|
||||||
|
config['demo_prio_bonus'] = 0.
|
||||||
|
config['observed_prio_bonus'] = 0.
|
||||||
|
|
||||||
# Prioritized vs Random Sampling
|
# Prioritized vs Random Sampling
|
||||||
# 0 - Random sampling
|
# 0 - Random sampling
|
||||||
# 1 - Only the highest priorities
|
# 1 - Only the highest priorities
|
||||||
config['prioritized_replay_sampling_priority'] = 0.6
|
config['prioritized_replay_sampling_priority'] = 0.6
|
||||||
|
config['prioritized_replay_sampling_priority'] = 0.
|
||||||
# How important are the weights for the loss?
|
# How important are the weights for the loss?
|
||||||
# 0 - Treat all losses equally
|
# 0 - Treat all losses equally
|
||||||
# 1 - Lower the importance of high losses
|
# 1 - Lower the importance of high losses
|
||||||
# Should ideally start from 0 and work its way up to 1 to prevent overfitting
|
# Should ideally start from 0 and work its way up to 1 to prevent overfitting
|
||||||
config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
|
config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
|
||||||
|
config['prioritized_replay_weight_importance'] = 0.
|
||||||
|
|
46
play.py
46
play.py
|
@ -1,15 +1,16 @@
|
||||||
from gym.spaces.box import Box
|
from gym.spaces.box import Box
|
||||||
import pygame
|
import pygame
|
||||||
from pygame.locals import VIDEORESIZE
|
from pygame.locals import VIDEORESIZE
|
||||||
|
import rltorch
|
||||||
from rltorch.memory import ReplayMemory
|
from rltorch.memory import ReplayMemory
|
||||||
|
|
||||||
class Play:
|
class Play:
|
||||||
def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
|
def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config):
|
||||||
self.env = env
|
self.env = env
|
||||||
self.action_selector = action_selector
|
self.action_selector = action_selector
|
||||||
self.record_lock = record_lock
|
self.record_lock = record_lock
|
||||||
self.record_locked = False
|
self.record_locked = False
|
||||||
self.sneaky_agent = sneaky_agent
|
#self.sneaky_agent = sneaky_agent
|
||||||
self.agent = agent
|
self.agent = agent
|
||||||
self.sneaky_env = sneaky_env
|
self.sneaky_env = sneaky_env
|
||||||
self.sneaky_actor = sneaky_actor
|
self.sneaky_actor = sneaky_actor
|
||||||
|
@ -19,8 +20,8 @@ class Play:
|
||||||
self.zoom = config['zoom'] if 'zoom' in config else 1
|
self.zoom = config['zoom'] if 'zoom' in config else 1
|
||||||
self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None
|
self.keys_to_action = config['keys_to_action'] if 'keys_to_action' in config else None
|
||||||
self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30
|
self.seconds_play_per_state = config['seconds_play_per_state'] if 'seconds_play_per_state' in config else 30
|
||||||
self.num_sneaky_episodes = sneaky_config['num_sneaky_episodes'] if 'num_sneaky_episodes' in sneaky_config else 10
|
self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
|
||||||
self.replay_skip = sneaky_config['replay_skip'] if 'replay_skip' in sneaky_config else 0
|
self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
|
||||||
self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
|
self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
|
||||||
# Initial values...
|
# Initial values...
|
||||||
self.video_size = (0, 0)
|
self.video_size = (0, 0)
|
||||||
|
@ -32,6 +33,7 @@ class Play:
|
||||||
self.clock = pygame.time.Clock()
|
self.clock = pygame.time.Clock()
|
||||||
self.sneaky_iteration = 0
|
self.sneaky_iteration = 0
|
||||||
self.paused = False
|
self.paused = False
|
||||||
|
self.space_pressed = False
|
||||||
|
|
||||||
def _display_arr(self, obs, screen, arr, video_size):
|
def _display_arr(self, obs, screen, arr, video_size):
|
||||||
if obs is not None:
|
if obs is not None:
|
||||||
|
@ -135,42 +137,39 @@ class Play:
|
||||||
for event in pygame.event.get():
|
for event in pygame.event.get():
|
||||||
if self._process_common_pygame_events(event):
|
if self._process_common_pygame_events(event):
|
||||||
continue
|
continue
|
||||||
elif event.type == pygame.KEYDOWN:
|
elif event.type == pygame.KEYDOWN and event.key == pygame.K_SPACE:
|
||||||
if event.key == pygame.K_SPACE:
|
self.space_pressed = True
|
||||||
self.pressed_keys.append(event.key)
|
elif event.type == pygame.KEYUP and event.key == pygame.K_SPACE and self.space_pressed:
|
||||||
elif event.type == pygame.KEYUP and event.key == pygame.K_SPACE:
|
self.space_pressed = False
|
||||||
self.pressed_keys.remove(event.key)
|
|
||||||
self._increment_state()
|
self._increment_state()
|
||||||
|
|
||||||
pygame.display.flip()
|
pygame.display.flip()
|
||||||
self.clock.tick(self.fps)
|
self.clock.tick(self.fps)
|
||||||
|
|
||||||
def sneaky_train(self):
|
def sneaky_train(self):
|
||||||
# self.record_lock.acquire()
|
|
||||||
# Do a standard RL algorithm process for a certain number of episodes
|
# Do a standard RL algorithm process for a certain number of episodes
|
||||||
|
step = 0
|
||||||
for i in range(self.num_sneaky_episodes):
|
for i in range(self.num_sneaky_episodes):
|
||||||
print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
|
print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
|
||||||
|
|
||||||
# Reset all episode related variables
|
# Reset all episode related variables
|
||||||
prev_obs = self.sneaky_env.reset()
|
prev_obs = self.sneaky_env.reset()
|
||||||
done = False
|
done = False
|
||||||
step = 0
|
|
||||||
total_reward = 0
|
total_reward = 0
|
||||||
|
|
||||||
while not done:
|
while not done:
|
||||||
action = self.sneaky_actor.act(prev_obs)
|
action = self.sneaky_actor.act(prev_obs)
|
||||||
obs, reward, done, _ = self.sneaky_env.step(action)
|
obs, reward, done, _ = self.sneaky_env.step(action)
|
||||||
total_reward += reward
|
total_reward += reward
|
||||||
self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done)
|
self.agent.memory.append(prev_obs, action, reward, obs, done)
|
||||||
prev_obs = obs
|
prev_obs = obs
|
||||||
step += 1
|
step += 1
|
||||||
if step % self.replay_skip == 0:
|
if step % self.replay_skip == 0:
|
||||||
self.sneaky_agent.learn()
|
self.agent.learn()
|
||||||
|
|
||||||
# Finish the previous print with the total reward obtained during the episode
|
# Finish the previous print with the total reward obtained during the episode
|
||||||
print(total_reward, flush = True)
|
print(total_reward, "Epsilon:", next(self.sneaky_actor.epsilon), flush = True)
|
||||||
self.sneaky_iteration += 1
|
self.sneaky_iteration += 1
|
||||||
# self.record_lock.release()
|
|
||||||
|
|
||||||
def display_text(self, text):
|
def display_text(self, text):
|
||||||
myfont = pygame.font.SysFont('Comic Sans MS', 50)
|
myfont = pygame.font.SysFont('Comic Sans MS', 50)
|
||||||
|
@ -247,7 +246,9 @@ class Play:
|
||||||
|
|
||||||
# The computer will train for a few episodes without showing to the user.
|
# The computer will train for a few episodes without showing to the user.
|
||||||
# Mainly to speed up the learning process a bit
|
# Mainly to speed up the learning process a bit
|
||||||
elif self.state is SNEAKY_COMPUTER_PLAY:
|
elif self.state == SNEAKY_COMPUTER_PLAY:
|
||||||
|
# Clear pressed keys in case a stale key is left in the list (the bug where you can't control the game because a button appears to be held down)
|
||||||
|
self.pressed_keys.clear()
|
||||||
if not self.record_locked:
|
if not self.record_locked:
|
||||||
self.record_lock.acquire()
|
self.record_lock.acquire()
|
||||||
self.record_locked = True
|
self.record_locked = True
|
||||||
|
@ -277,25 +278,18 @@ class Play:
|
||||||
self.record_lock.acquire()
|
self.record_lock.acquire()
|
||||||
self.record_locked = True
|
self.record_locked = True
|
||||||
self.transition("Your Turn! Press <Space> to Start")
|
self.transition("Your Turn! Press <Space> to Start")
|
||||||
|
|
||||||
|
|
||||||
# Increment the timer if it's the human or shown computer's turn
|
# Increment the timer if it's the human or shown computer's turn
|
||||||
if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
|
if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
|
||||||
if self.state == HUMAN_PLAY and isinstance(self.agent.memory, 'DQfDMemory'):
|
if self.state == HUMAN_PLAY and (isinstance(self.agent.memory, rltorch.memory.DQfDMemory) or isinstance(self.agent.memory, rltorch.memory.iDQfDMemory)):
|
||||||
self.agent.memory.append_demonstration(prev_obs, action, reward, obs, env_done)
|
self.agent.memory.append_demonstration(prev_obs, action, reward, obs, env_done)
|
||||||
else:
|
else:
|
||||||
self.agent.memory.append(prev_obs, action, reward, obs, env_done)
|
self.agent.memory.append(prev_obs, action, reward, obs, env_done)
|
||||||
i += 1
|
i += 1
|
||||||
# Perform a quick learning process and increment the state after a certain time period has passed
|
# Perform a quick learning process and increment the state after a certain time period has passed
|
||||||
if i % (self.fps * self.seconds_play_per_state) == 0:
|
if i % (self.fps * self.seconds_play_per_state) == 0:
|
||||||
self.record_lock.acquire()
|
|
||||||
self.display_text("Demo Training...")
|
|
||||||
print("Begin Demonstration Training")
|
|
||||||
print("Number of transitions in buffer: ", len(self.agent.memory), flush = True)
|
print("Number of transitions in buffer: ", len(self.agent.memory), flush = True)
|
||||||
for j in range(self.num_train_per_demo):
|
|
||||||
print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
|
|
||||||
self.agent.learn()
|
|
||||||
self.clear_text(obs)
|
|
||||||
self.record_lock.release()
|
|
||||||
self._increment_state()
|
self._increment_state()
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
|
|
17
play_env.py
17
play_env.py
|
@ -16,7 +16,7 @@ from torch.optim import Adam
|
||||||
|
|
||||||
# Import my custom RL library
|
# Import my custom RL library
|
||||||
import rltorch
|
import rltorch
|
||||||
from rltorch.memory import PrioritizedReplayMemory, ReplayMemory, DQfDMemory
|
from rltorch.memory import PrioritizedReplayMemory, ReplayMemory, iDQfDMemory
|
||||||
from rltorch.action_selector import EpsilonGreedySelector, ArgMaxSelector
|
from rltorch.action_selector import EpsilonGreedySelector, ArgMaxSelector
|
||||||
import rltorch.env as E
|
import rltorch.env as E
|
||||||
import rltorch.network as rn
|
import rltorch.network as rn
|
||||||
|
@ -37,9 +37,9 @@ from networks import Value
|
||||||
## Play Related Classes
|
## Play Related Classes
|
||||||
#
|
#
|
||||||
class PlayClass(Thread):
|
class PlayClass(Thread):
|
||||||
def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config):
|
def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config):
|
||||||
super(PlayClass, self).__init__()
|
super(PlayClass, self).__init__()
|
||||||
self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
|
self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, record_lock, config)
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.play.start()
|
self.play.start()
|
||||||
|
@ -93,7 +93,6 @@ args = vars(parser.parse_args())
|
||||||
|
|
||||||
## Main configuration for script
|
## Main configuration for script
|
||||||
from config import config
|
from config import config
|
||||||
from sneaky_config import sneaky_config
|
|
||||||
|
|
||||||
# Environment name and log directory are vital, so show the help message and exit if they are not provided
|
# Environment name and log directory are vital, so show the help message and exit if they are not provided
|
||||||
if args['environment_name'] is None or args['logdir'] is None:
|
if args['environment_name'] is None or args['logdir'] is None:
|
||||||
|
@ -152,20 +151,18 @@ net = rn.Network(Value(state_size, action_size),
|
||||||
target_net = rn.TargetNetwork(net, device = device)
|
target_net = rn.TargetNetwork(net, device = device)
|
||||||
|
|
||||||
# Relevant components from RLTorch
|
# Relevant components from RLTorch
|
||||||
memory = DQfDMemory(capacity= config['memory_size'], alpha = config['prioritized_replay_sampling_priority'], max_demo = config['memory_size'] // 2)
|
memory = iDQfDMemory(capacity= config['memory_size'], max_demo = config['memory_size'] // 10)
|
||||||
actor = ArgMaxSelector(net, action_size, device = device)
|
actor = ArgMaxSelector(net, action_size, device = device)
|
||||||
agent = rltorch.agents.DQfDAgent(net, memory, config, target_net = target_net)
|
agent = rltorch.agents.DQfDAgent(net, memory, config, target_net = target_net)
|
||||||
|
|
||||||
# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
|
# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
|
||||||
# Also use MaxEnvSkip to speed up processing
|
# Also use MaxEnvSkip to speed up processing
|
||||||
sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
|
sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
|
||||||
sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
|
sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
|
||||||
sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
|
|
||||||
|
|
||||||
sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
|
|
||||||
|
|
||||||
# Pass all this information into the thread that will handle the game play and start
|
# Pass all this information into the thread that will handle the game play and start
|
||||||
playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config, sneaky_config)
|
playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, record_lock, config)
|
||||||
playThread.start()
|
playThread.start()
|
||||||
|
|
||||||
# While the play thread is running, we'll periodically log transitions we've encountered
|
# While the play thread is running, we'll periodically log transitions we've encountered
|
||||||
|
@ -179,4 +176,4 @@ while playThread.is_alive():
|
||||||
# Save what's remaining after process died
|
# Save what's remaining after process died
|
||||||
record_lock.acquire()
|
record_lock.acquire()
|
||||||
record_env.log_transitions()
|
record_env.log_transitions()
|
||||||
record_lock.release()
|
record_lock.release()
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
import rltorch
|
|
||||||
|
|
||||||
sneaky_config = {}
|
|
||||||
sneaky_config['learning_rate'] = 1e-5
|
|
||||||
sneaky_config['target_sync_tau'] = 1e-3
|
|
||||||
sneaky_config['discount_rate'] = 0.99
|
|
||||||
sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.02, iterations = 10**5)
|
|
||||||
# Number of episodes for the computer to train the agent without the human seeing
|
|
||||||
sneaky_config['replay_skip'] = 29 # Gradient descent every second
|
|
||||||
sneaky_config['batch_size'] = 16 * (sneaky_config['replay_skip'] + 1) # Calculated based on memory constraints
|
|
||||||
sneaky_config['memory_size'] = 2000 # batch_size * 2 looks = 66 seconds of gameplay
|
|
||||||
# Number of episodes for the computer to train the agent without the human seeing
|
|
||||||
sneaky_config['num_sneaky_episodes'] = 10
|
|
Loading…
Reference in a new issue