Began separating config & networks, F1 for pausing, text functions, and more sneaky agent stuff
parent d78892e62c, commit 32862e4d79
6 changed files with 188 additions and 142 deletions
.gitignore (vendored) | 1
@@ -1,2 +1,3 @@
 __pycache__/
 playlogs/
+.vscode/
config.py (new file) | 27
@@ -0,0 +1,27 @@
+import rltorch
+
+config = {}
+config['seed'] = 901
+config['seconds_play_per_state'] = 120
+config['zoom'] = 4
+config['environment_name'] = 'PongNoFrameskip-v4'
+config['learning_rate'] = 1e-4
+config['target_sync_tau'] = 1e-3
+config['discount_rate'] = 0.99
+config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+# Number of episodes for the computer to train the agent without the human seeing
+config['num_sneaky_episodes'] = 10
+config['num_train_per_demo'] = 50 # 100 total since there are two demo training passes per cycle
+config['replay_skip'] = 14
+config['batch_size'] = 32 * (config['replay_skip'] + 1)
+config['disable_cuda'] = False
+config['memory_size'] = 10**4
+# Prioritized vs Random Sampling
+# 0 - Random sampling
+# 1 - Only the highest priorities
+config['prioritized_replay_sampling_priority'] = 0.6
+# How important are the weights for the loss?
+# 0 - Treat all losses equally
+# 1 - Lower the importance of high losses
+# Should ideally start from 0 and work your way up to 1 to prevent overfitting
+config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
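config.py now keeps every hyperparameter in one plain dictionary, so the other scripts simply import it and can override entries before anything is built. A minimal usage sketch; the file name demo_run.py and the overridden values below are illustrative, not part of this commit:

# demo_run.py (hypothetical): import the shared settings and tweak a couple
# of them for a quick local experiment before wiring up the agent.
from config import config

config['seconds_play_per_state'] = 30   # shorter turns while debugging
config['disable_cuda'] = True           # force CPU on a machine without a GPU

for key, value in sorted(config.items()):
    print(key, "=", value)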
networks.py (new file) | 51
@@ -0,0 +1,51 @@
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import rltorch.network as rn
+
+class Value(nn.Module):
+    def __init__(self, state_size, action_size):
+        super(Value, self).__init__()
+        self.state_size = state_size
+        self.action_size = action_size
+
+        self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
+        self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
+        self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
+
+        self.fc1 = nn.Linear(3136, 512)
+        self.fc1_norm = nn.LayerNorm(512)
+
+        self.value_fc = rn.NoisyLinear(512, 512)
+        self.value_fc_norm = nn.LayerNorm(512)
+        self.value = nn.Linear(512, 1)
+
+        self.advantage_fc = rn.NoisyLinear(512, 512)
+        self.advantage_fc_norm = nn.LayerNorm(512)
+        self.advantage = nn.Linear(512, action_size)
+
+
+    def forward(self, x):
+        x = x.float() / 256
+        x = F.relu(self.conv1(x))
+        x = F.relu(self.conv2(x))
+        x = F.relu(self.conv3(x))
+
+        # Makes batch_size dimension again
+        x = x.view(-1, 3136)
+        x = F.relu(self.fc1_norm(self.fc1(x)))
+
+        state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
+        state_value = self.value(state_value)
+
+        advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
+        advantage = self.advantage(advantage)
+
+        x = state_value + advantage - advantage.mean()
+
+        # For debugging purposes...
+        if torch.isnan(x).any().item():
+            print("WARNING NAN IN MODEL DETECTED")
+
+        return x
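Value is a dueling DQN head on top of three conv layers: the 3136-dimensional conv output feeds a shared fc1, then separate noisy value and advantage streams that are recombined as Q(s, a) = V(s) + A(s, a) - mean(A). A quick shape check, assuming the repo's rltorch package is installed (it provides rn.NoisyLinear) and Pong-style observations of 4 stacked 84x84 frames:

import torch
from networks import Value

# 4 stacked 84x84 frames, batch of 2; PongNoFrameskip-v4 exposes 6 actions.
net = Value(state_size = (4, 84, 84), action_size = 6)
frames = torch.randint(0, 256, (2, 4, 84, 84), dtype = torch.uint8)
q_values = net(frames)
print(q_values.shape)  # expected: torch.Size([2, 6])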
play.py | 106
@@ -4,13 +4,14 @@ from pygame.locals import VIDEORESIZE
 from rltorch.memory import ReplayMemory
 
 class Play:
-    def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
         self.env = env
         self.action_selector = action_selector
-        self.memory = memory
-        self.memory_lock = memory_lock
+        self.record_lock = record_lock
+        self.sneaky_agent = sneaky_agent
         self.agent = agent
         self.sneaky_env = sneaky_env
+        self.sneaky_actor = sneaky_actor
         # Get relevant parameters from config or set sane defaults
         self.transpose = config['transpose'] if 'transpose' in config else True
         self.fps = config['fps'] if 'fps' in config else 30
@@ -20,6 +21,7 @@ class Play:
         self.num_sneaky_episodes = config['num_sneaky_episodes'] if 'num_sneaky_episodes' in config else 10
         self.memory_size = config['memory_size'] if 'memory_size' in config else 10**4
         self.replay_skip = config['replay_skip'] if 'replay_skip' in config else 0
+        self.num_train_per_demo = config['num_train_per_demo'] if 'num_train_per_demo' in config else 1
         # Initial values...
         self.video_size = (0, 0)
         self.pressed_keys = []
@@ -28,6 +30,8 @@ class Play:
         self.running = True
         self.state = 0
         self.clock = pygame.time.Clock()
+        self.sneaky_iteration = 0
+        self.paused = False
 
     def _display_arr(self, obs, screen, arr, video_size):
         if obs is not None:
@@ -49,6 +53,9 @@
             self.screen = pygame.display.set_mode(self.video_size)
         elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
             self.running = False
+        elif not self.paused and self.state in [0, 3] and event.type == pygame.KEYUP and event.key == pygame.K_F1:
+            self.paused = True
+            self.display_text("Paused... Press F1 to unpause.")
         else:
             # No event was matched here
             return False
@@ -118,7 +125,7 @@ class Play:
     def _increment_state(self):
         self.state = (self.state + 1) % 5
 
-    def pause(self, text = ""):
+    def transition(self, text = ""):
         myfont = pygame.font.SysFont('Comic Sans MS', 50)
         textsurface = myfont.render(text, False, (0, 0, 0))
         self.screen.blit(textsurface,(0,0))
@@ -138,15 +145,10 @@ class Play:
         self.clock.tick(self.fps)
 
     def sneaky_train(self):
-        self.memory_lock.acquire()
-
-        # Backup memory
-        backup_memory = self.memory
-        self.memory = ReplayMemory(capacity = self.memory_size)
-
+        self.record_lock.acquire()
         # Do a standard RL algorithm process for a certain number of episodes
         for i in range(self.num_sneaky_episodes):
-            print("Episode: %d / %d, Reward: " % (i + 1, self.num_sneaky_episodes), end = "")
+            print("Episode: %d / %d, Reward: " % ((self.num_sneaky_episodes * self.sneaky_iteration) + i + 1, (self.sneaky_iteration + 1) * self.num_sneaky_episodes), end = "")
 
             # Reset all episode related variables
             prev_obs = self.sneaky_env.reset()
@@ -155,28 +157,40 @@ class Play:
             total_reward = 0
 
             while not done:
-                action = self.action_selector.act(prev_obs)
+                action = self.sneaky_actor.act(prev_obs)
                 obs, reward, done, _ = self.sneaky_env.step(action)
                 total_reward += reward
-                self.memory.append(prev_obs, action, reward, obs, done)
+                self.sneaky_agent.memory.append(prev_obs, action, reward, obs, done)
                 prev_obs = obs
                 step += 1
                 if step % self.replay_skip == 0:
-                    self.agent.learn()
+                    self.sneaky_agent.learn()
 
             # Finish the previous print with the total reward obtained during the episode
             print(total_reward)
-
-        # Reset the memory back to the human demonstration / shown computer data
-        self.memory = backup_memory
-        self.memory_lock.release()
-
-        # Thoughts:
-        # It would be cool if, instead of throwing away all this new data, we kept just a sample of it
-        # Not sure if I want all of it because then it'll drown out the expert demonstration data
-
-
+        self.sneaky_iteration += 1
+        self.record_lock.release()
+
+    def display_text(self, text):
+        myfont = pygame.font.SysFont('Comic Sans MS', 50)
+        textsurface = myfont.render(text, False, (0, 0, 0))
+        self.screen.blit(textsurface,(0,0))
+        pygame.display.flip()
+
+    def clear_text(self, obs):
+        self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size)
+        pygame.display.flip()
+
+    def process_pause_state(self, obs):
+        # Process game events
+        for event in pygame.event.get():
+            # This rule needs to be before the common one, otherwise unpausing is ignored
+            if event.type == pygame.KEYUP and event.key == pygame.K_F1:
+                self.paused = False
+                self.clear_text(obs)
+            else:
+                self._process_common_pygame_events(event)
 
     def start(self):
         """Allows one to play the game using keyboard.
         To simply play the game use:
@@ -200,57 +214,63 @@ class Play:
         TRANSITION2 = 4
 
         env_done = True
+        prev_obs = None
+        action = None
+        reward = None
         obs = None
         i = 0
+        episode_num = 0
         while self.running:
             # If the environment is done after a turn, reset it so we can keep going
            if env_done:
+                episode_num += 1
+                print("Human/Computer Episode: ", episode_num)
                 obs = self.env.reset()
                 env_done = False
 
+            if self.paused:
+                self.process_pause_state(obs)
+                continue
+
             if self.state is HUMAN_PLAY:
-                _, _, _, obs, env_done = self._human_play(obs)
+                prev_obs, action, reward, obs, env_done = self._human_play(obs)
 
             # The computer will train for a few episodes without showing to the user.
             # Mainly to speed up the learning process a bit
             elif self.state is SNEAKY_COMPUTER_PLAY:
                 print("Sneaky Computer Time")
-
-                # Display "Training..." text to user
-                myfont = pygame.font.SysFont('Comic Sans MS', 50)
-                textsurface = myfont.render("Training....", False, (0, 0, 0))
-                self.screen.blit(textsurface,(0,0))
-                pygame.display.flip()
+                self.display_text("Training...")
 
                 # Have the agent play a few rounds without showing to the user
                 self.sneaky_train()
 
-                # To take away training text
-                self._display_arr(obs, self.screen, self.env.unwrapped._get_obs(), video_size=self.video_size)
-                pygame.display.flip()
-
                 # Go to the next step immediately
+                self.clear_text(obs)
                 self._increment_state()
 
             elif self.state is TRANSITION:
-                self.pause("Computers Turn! Press <Space> to Start")
+                self.transition("Computers Turn! Press <Space> to Start")
 
             elif self.state is COMPUTER_PLAY:
-                _, _, _, obs, env_done = self._computer_play(obs)
+                prev_obs, action, reward, obs, env_done = self._computer_play(obs)
 
             elif self.state is TRANSITION2:
-                self.pause("Your Turn! Press <Space> to Start")
+                self.transition("Your Turn! Press <Space> to Start")
 
             # Increment the timer if it's the human or shown computer's turn
             if self.state is COMPUTER_PLAY or self.state is HUMAN_PLAY:
+                self.agent.memory.append(prev_obs, action, reward, obs, env_done)
                 i += 1
                 # Perform a quick learning process and increment the state after a certain time period has passed
                 if i % (self.fps * self.seconds_play_per_state) == 0:
-                    self.memory_lock.acquire()
-                    print("Number of transitions in buffer: ", len(self.memory))
-                    self.agent.learn()
-                    self.memory_lock.release()
+                    self.record_lock.acquire()
+                    self.display_text("Demo Training...")
+                    print("Begin Demonstration Training")
+                    print("Number of transitions in buffer: ", len(self.agent.memory))
+                    for j in range(self.num_train_per_demo):
+                        print("Iteration %d / %d" % (j + 1, self.num_train_per_demo))
+                        self.agent.learn()
+                    self.clear_text(obs)
+                    self.record_lock.release()
                     self._increment_state()
                     i = 0
-
-
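The pause feature added above is ordinary pygame event handling: releasing F1 while the state is 0 or 3 sets self.paused and draws an overlay, and process_pause_state waits for the next F1 release to clear it. A standalone sketch of that toggle pattern, separate from the project's Play class:

import pygame

# Tiny event loop demonstrating the F1 pause toggle in isolation.
pygame.init()
screen = pygame.display.set_mode((320, 240))
clock = pygame.time.Clock()
paused = False
running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
        elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
            running = False
        elif event.type == pygame.KEYUP and event.key == pygame.K_F1:
            paused = not paused  # same trigger the diff uses: F1 released
            pygame.display.set_caption("Paused" if paused else "Running")
    if not paused:
        screen.fill((0, 0, 0))  # the game frame would be drawn here
        pygame.display.flip()
    clock.tick(30)
pygame.quit()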
play_env.py | 134
@@ -1,4 +1,7 @@
 
+# TODO: I'm kinda using this project to pilot the whole config/network/example separation
+# The motivation behind this is that the file sizes are getting large and it's increasing cognitive load :(
+
 # Import Python Standard Libraries
 from threading import Thread, Lock
 from argparse import ArgumentParser
@@ -10,13 +13,11 @@ from numpy import array as np_array
 from numpy import save as np_save
 import torch
 from torch.optim import Adam
-import torch.nn as nn
-import torch.nn.functional as F
 
 # Import my custom RL library
 import rltorch
-from rltorch.memory import PrioritizedReplayMemory
-from rltorch.action_selector import EpsilonGreedySelector
+from rltorch.memory import PrioritizedReplayMemory, ReplayMemory
+from rltorch.action_selector import EpsilonGreedySelector, ArgMaxSelector
 import rltorch.env as E
 import rltorch.network as rn
 
@@ -28,73 +29,24 @@ import play
 
 
 #
-## Networks
+## Networks (Probably want to move this to config file)
 #
-class Value(nn.Module):
-    def __init__(self, state_size, action_size):
-        super(Value, self).__init__()
-        self.state_size = state_size
-        self.action_size = action_size
-
-        self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
-        self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
-        self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
-
-        self.fc1 = nn.Linear(3136, 512)
-        self.fc1_norm = nn.LayerNorm(512)
-
-        self.value_fc = rn.NoisyLinear(512, 512)
-        self.value_fc_norm = nn.LayerNorm(512)
-        self.value = nn.Linear(512, 1)
-
-        self.advantage_fc = rn.NoisyLinear(512, 512)
-        self.advantage_fc_norm = nn.LayerNorm(512)
-        self.advantage = nn.Linear(512, action_size)
-
-
-    def forward(self, x):
-        x = x.float() / 256
-        x = F.relu(self.conv1(x))
-        x = F.relu(self.conv2(x))
-        x = F.relu(self.conv3(x))
-
-        # Makes batch_size dimension again
-        x = x.view(-1, 3136)
-        x = F.relu(self.fc1_norm(self.fc1(x)))
-
-        state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
-        state_value = self.value(state_value)
-
-        advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
-        advantage = self.advantage(advantage)
-
-        x = state_value + advantage - advantage.mean()
-
-        # For debugging purposes...
-        if torch.isnan(x).any().item():
-            print("WARNING NAN IN MODEL DETECTED")
-
-        return x
-
+from networks import Value
 
 #
 ## Play Related Classes
 #
 Transition = namedtuple('Transition',
         ('state', 'action', 'reward', 'next_state', 'done'))
 
 class PlayClass(Thread):
-    def __init__(self, env, action_selector, memory, memory_lock, agent, sneaky_env, config):
+    def __init__(self, env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config):
         super(PlayClass, self).__init__()
-        self.play = play.Play(env, action_selector, memory, memory_lock, agent, sneaky_env, config)
+        self.play = play.Play(env, action_selector, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
 
     def run(self):
         self.play.start()
 
 class Record(GymWrapper):
-    def __init__(self, env, memory, memory_lock, args):
+    def __init__(self, env, memory, args):
         GymWrapper.__init__(self, env)
-        self.memory_lock = memory_lock
         self.memory = memory
         self.skipframes = args['skip']
         self.environment_name = args['environment_name']
@@ -110,14 +62,11 @@ class Record(GymWrapper):
         self.current_i += 1
         # Don't add to memory until a certain number of frames is reached
         if self.current_i % self.skipframes == 0:
-            self.memory_lock.acquire()
-            self.memory.append(state, action, reward, next_state, done)
-            self.memory_lock.release()
+            self.memory.append((state, action, reward, next_state, done))
             self.current_i = 0
         return next_state, reward, done, info
 
     def log_transitions(self):
-        self.memory_lock.acquire()
         if len(self.memory) > 0:
             basename = self.logdir + "/{}.{}".format(self.environment_name, datetime.now().strftime("%Y-%m-%d-%H-%M-%s"))
             print("Base Filename: ", basename)
@@ -128,7 +77,6 @@ class Record(GymWrapper):
             np_save(basename + "-nextstate.npy", np_array(next_state), allow_pickle = False)
             np_save(basename + "-done.npy", np_array(done), allow_pickle = False)
             self.memory.clear()
-        self.memory_lock.release()
 
 
 ## Parsing arguments
 
@@ -141,31 +89,8 @@ parser.add_argument("--model", type=str, help = "The path location of the PyTorc
 args = vars(parser.parse_args())
 
-## Main configuration for script
-config = {}
-config['seed'] = 901
-config['seconds_play_per_state'] = 60
-config['zoom'] = 4
-config['environment_name'] = 'PongNoFrameskip-v4'
-config['learning_rate'] = 1e-4
-config['target_sync_tau'] = 1e-3
-config['discount_rate'] = 0.99
-config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
-# Number of episodes for the computer to train the agent without the human seeing
-config['num_sneaky_episodes'] = 20
-config['replay_skip'] = 14
-config['batch_size'] = 32 * (config['replay_skip'] + 1)
-config['disable_cuda'] = False
-config['memory_size'] = 10**4
-# Prioritized vs Random Sampling
-# 0 - Random sampling
-# 1 - Only the highest priorities
-config['prioritized_replay_sampling_priority'] = 0.6
-# How important are the weights for the loss?
-# 0 - Treat all losses equally
-# 1 - Lower the importance of high losses
-# Should ideally start from 0 and work your way up to 1 to prevent overfitting
-config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 10**5)
 
+from config import config
+from sneaky_config import sneaky_config
 
 # Environment name and log directory is vital so show help message and exit if not provided
 if args['environment_name'] is None or args['logdir'] is None:
@@ -175,7 +100,7 @@ if args['environment_name'] is None or args['logdir'] is None:
 # Number of frames to skip when recording and fps can have sane defaults
 if args['skip'] is None:
     args['skip'] = 3
-if args['fps'] is None:
+if 'fps' not in args:
     args['fps'] = 30
 
 
@@ -196,22 +121,20 @@ def wrap_preprocessing(env, MaxAndSkipEnv = False):
         , 4)
     )
 
 
 ## Set up environment to be recorded and preprocessed
-memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
-memory_lock = Lock()
-env = Record(makeEnv(args['environment_name']), memory, memory_lock, args)
+record_memory = []
+record_lock = Lock()
+env = Record(makeEnv(args['environment_name']), record_memory, args)
 
 # Bind record_env to current env so that we can reference log_transitions easier later
 record_env = env
 
 # Use native gym monitor to get video recording
 env = GymMonitor(env, args['logdir'], force=True)
 
 # Preprocess environment
 env = wrap_preprocessing(env)
 
-# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
-# Also use MaxEnvSkip to speed up processing
-sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
-
 # Set seeds
 rltorch.set_seed(config['seed'])
 env.seed(config['seed'])
@@ -226,18 +149,31 @@ net = rn.Network(Value(state_size, action_size),
 target_net = rn.TargetNetwork(net, device = device)
 
 # Relevant components from RLTorch
-actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = config['exploration_rate'])
+memory = PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+actor = ArgMaxSelector(net, action_size, device = device)
 agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net)
 
+# Use a different environment for when the computer trains on the side so that the current game state isn't manipulated
+# Also use MaxEnvSkip to speed up processing
+sneaky_env = wrap_preprocessing(makeEnv(args['environment_name']), MaxAndSkipEnv = True)
+sneaky_memory = ReplayMemory(capacity = sneaky_config['memory_size'])
+sneaky_actor = EpsilonGreedySelector(net, action_size, device = device, epsilon = sneaky_config['exploration_rate'])
+
+sneaky_agent = rltorch.agents.DQNAgent(net, sneaky_memory, sneaky_config, target_net = target_net)
+
 # Pass all this information into the thread that will handle the game play and start
-playThread = PlayClass(env, actor, memory, memory_lock, agent, sneaky_env, config)
+playThread = PlayClass(env, actor, agent, sneaky_env, sneaky_actor, sneaky_agent, record_lock, config)
 playThread.start()
 
 # While the play thread is running, we'll periodically log transitions we've encountered
 while playThread.is_alive():
     playThread.join(60)
+    record_lock.acquire()
     print("Logging....", end = " ")
     record_env.log_transitions()
+    record_lock.release()
 
 # Save what's remaining after the process dies
+record_lock.acquire()
 record_env.log_transitions()
+record_lock.release()
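play_env.py now shares a plain list (record_memory) between the play thread and the logging loop, guarded by record_lock, instead of locking the replay buffer itself. A minimal sketch of that producer/consumer pattern with made-up names and timings:

import time
from threading import Thread, Lock

record_memory = []   # stand-in for the shared transition list
record_lock = Lock()

def play_worker(steps = 50):
    # Pretend to play: append a fake transition every 10 ms.
    for step in range(steps):
        with record_lock:
            record_memory.append(("state", "action", step))
        time.sleep(0.01)

play_thread = Thread(target = play_worker)
play_thread.start()

# Periodically drain and "log" whatever the play thread has produced so far.
while play_thread.is_alive():
    play_thread.join(0.2)
    with record_lock:
        if record_memory:
            print("Logging", len(record_memory), "transitions")
            record_memory.clear()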
sneaky_config.py (new file) | 11
@@ -0,0 +1,11 @@
+import rltorch
+
+sneaky_config = {}
+sneaky_config['learning_rate'] = 1e-4
+sneaky_config['target_sync_tau'] = 1e-3
+sneaky_config['discount_rate'] = 0.99
+sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(initial_value = 1, end_value = 0.1, iterations = 10**5)
+# Number of episodes for the computer to train the agent without the human seeing
+sneaky_config['replay_skip'] = 14
+sneaky_config['batch_size'] = 32 * (sneaky_config['replay_skip'] + 1)
+sneaky_config['memory_size'] = 10**4
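sneaky_config.py repeats several of the main hyperparameters for the hidden training phase. A hypothetical alternative (not what this commit does) would be to derive it from the main config so the shared values stay in sync:

# Hypothetical alternative: copy the shared settings and override only what
# differs for the hidden ("sneaky") training phase.
import rltorch
from config import config

sneaky_config = dict(config)          # shallow copy of the shared hyperparameters
sneaky_config['memory_size'] = 10**4  # plain (non-prioritized) replay buffer
sneaky_config['exploration_rate'] = rltorch.scheduler.ExponentialScheduler(
    initial_value = 1, end_value = 0.1, iterations = 10**5)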