Made Logger global

Brandon Rozek 2020-04-14 15:24:48 -04:00
parent 1f7c6f10ab
commit c6172f309d
21 changed files with 513 additions and 527 deletions

View file

@@ -14,13 +14,8 @@ This is a dictionary that is shared around the different components. Contains hy
 ### Environment
 This component needs to support the standard openai functions reset and step.
-### Logger
-For Tensorboard to work, you need to define a logger that will (optionally) later go into the network, runner, and agent/trainer.
-Due to issues with multiprocessing, the Logger is a shared dictionary of lists that get appended to and the LogWriter writes on the main thread.
 ### Network
-A network takes a PyTorch nn.Module, PyTorch optimizer, configuration, and the optional logger.
+A network takes a PyTorch nn.Module, PyTorch optimizer, and configuration.
 ### Target Network
 Takes in a network and provides methods to sync a copy of the original network.
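Note: the new rltorch/log.py module itself is not among the hunks shown on this page. The sketch below is an assumption of roughly what it exposes after this commit, inferred only from how the changed files use it (from rltorch.log import Logger, log.enabled, log.Logger["Loss"].append(...), logwriter.write(Logger)): a module-level dictionary of lists plus an enabled flag.

# Hypothetical reconstruction of rltorch/log.py after "Made Logger global".
# Only the names `Logger` and `enabled` are confirmed by the diffs below; the
# defaultdict choice mirrors the "shared dictionary of lists" described above.
from collections import defaultdict

enabled = False              # agents guard their logging with `if log.enabled:`
Logger = defaultdict(list)   # global mapping: tag -> list of recorded values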

View file

@@ -8,6 +8,7 @@ import rltorch.memory as M
 import rltorch.env as E
 from rltorch.action_selector import StochasticSelector
 from tensorboardX import SummaryWriter
+from rltorch.log import Logger
 #
 ## Networks
@@ -68,65 +69,55 @@ config['disable_cuda'] = False
 #
 ## Training Loop
 #
-def train(runner, agent, config, logger = None, logwriter = None):
+def train(runner, agent, config, logwriter=None):
     finished = False
     while not finished:
         runner.run()
         agent.learn()
         if logwriter is not None:
             agent.value_net.log_named_parameters()
             agent.policy_net.log_named_parameters()
-            logwriter.write(logger)
+            logwriter.write(Logger)
         finished = runner.episode_num > config['total_training_episodes']
 if __name__ == "__main__":
     # Setting up the environment
     rltorch.set_seed(config['seed'])
-    print("Setting up environment...", end = " ")
+    print("Setting up environment...", end=" ")
     env = E.TorchWrap(gym.make(config['environment_name']))
     env.seed(config['seed'])
     print("Done.")
     state_size = env.observation_space.shape[0]
     action_size = env.action_space.n
     # Logging
-    logger = rltorch.log.Logger()
     logwriter = rltorch.log.LogWriter(SummaryWriter())
     # Setting up the networks
     device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
     policy_net = rn.Network(Policy(state_size, action_size),
-        torch.optim.Adam, config, device = device, name = "Policy")
+        torch.optim.Adam, config, device=device, name="Policy")
     value_net = rn.Network(Value(state_size),
-        torch.optim.Adam, config, device = device, name = "DQN")
+        torch.optim.Adam, config, device=device, name="DQN")
     # Memory stores experiences for later training
     memory = M.EpisodeMemory()
     # Actor takes a net and uses it to produce actions from given states
     actor = StochasticSelector(policy_net, action_size, memory, device = device)
     # Agent is what performs the training
-    agent = rltorch.agents.A2CSingleAgent(policy_net, value_net, memory, config, logger = logger)
+    agent = rltorch.agents.A2CSingleAgent(policy_net, value_net, memory, config)
     # Runner performs one episode in the environment
-    runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
+    runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter)
     print("Training...")
-    train(runner, agent, config, logger = logger, logwriter = logwriter)
+    train(runner, agent, config, logwriter=logwriter)
     # For profiling...
     # import cProfile
-    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # cProfile.run('train(runner, agent, config, logwriter = logwriter )')
     # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
     print("Training Finished.")
     print("Evaluating...")
-    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], name="Evaluation")
     print("Evaulations Done.")
     logwriter.close() # We don't need to write anything out to disk anymore
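With the per-script logger gone, every component appends into the shared rltorch.log.Logger and the LogWriter drains it inside train(). A small usage sketch for inspecting that shared state after training, assuming the dictionary-of-lists layout sketched above and the runner name "Training" used in this script; whether LogWriter.write() also clears the lists afterwards is not shown in this commit.

from rltorch.log import Logger

# Keys follow the "<name>/<metric>" pattern used by the runner and agents.
episode_rewards = Logger['Training/EpisodeReward']   # appended by EnvironmentEpisodeSync
policy_losses = Logger['Loss/Policy']                # appended by A2CSingleAgent.learn()
print(len(episode_rewards), "episode rewards,", len(policy_losses), "policy losses recorded")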

View file

@@ -9,29 +9,28 @@ import rltorch.memory as M
 import rltorch.env as E
 from rltorch.action_selector import StochasticSelector
 from tensorboardX import SummaryWriter
+from rltorch.log import Logger
 #
 ## Networks
 #
 class Policy(nn.Module):
     def __init__(self, state_size, action_size):
         super(Policy, self).__init__()
         self.state_size = state_size
         self.action_size = action_size
         self.fc1 = nn.Linear(state_size, 125)
         self.fc_norm = nn.LayerNorm(125)
         self.fc2 = nn.Linear(125, 125)
         self.fc2_norm = nn.LayerNorm(125)
         self.action_prob = nn.Linear(125, action_size)
     def forward(self, x):
         x = F.relu(self.fc_norm(self.fc1(x)))
         x = F.relu(self.fc2_norm(self.fc2(x)))
         x = F.softmax(self.action_prob(x), dim = 1)
         return x
 #
 ## Configuration
@@ -50,75 +49,67 @@ config['disable_cuda'] = False
 #
 ## Training Loop
 #
-def train(runner, net, config, logger = None, logwriter = None):
+def train(runner, net, config, logwriter=None):
     finished = False
     while not finished:
         runner.run()
         net.calc_gradients()
         net.step()
         if logwriter is not None:
             net.log_named_parameters()
-            logwriter.write(logger)
+            logwriter.write(Logger)
         finished = runner.episode_num > config['total_training_episodes']
 #
 ## Loss function
 #
 def fitness(model):
     env = gym.make("Acrobot-v1")
     state = torch.from_numpy(env.reset()).float().unsqueeze(0)
     total_reward = 0
     done = False
     while not done:
         action_probabilities = model(state)
         distribution = Categorical(action_probabilities)
         action = distribution.sample().item()
         next_state, reward, done, _ = env.step(action)
         total_reward += reward
         state = torch.from_numpy(next_state).float().unsqueeze(0)
     return -total_reward
 if __name__ == "__main__":
     # Hide internal gym warnings
     gym.logger.set_level(40)
     # Setting up the environment
     rltorch.set_seed(config['seed'])
-    print("Setting up environment...", end = " ")
+    print("Setting up environment...", end=" ")
     env = E.TorchWrap(gym.make(config['environment_name']))
     env.seed(config['seed'])
     print("Done.")
     state_size = env.observation_space.shape[0]
     action_size = env.action_space.n
     # Logging
-    logger = rltorch.log.Logger()
     logwriter = rltorch.log.LogWriter(SummaryWriter())
     # Setting up the networks
     device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
     net = rn.ESNetwork(Policy(state_size, action_size),
-        torch.optim.Adam, 100, fitness, config, device = device, name = "ES", logger = logger)
+        torch.optim.Adam, 100, fitness, config, device=device, name="ES")
     # Actor takes a net and uses it to produce actions from given states
-    actor = StochasticSelector(net, action_size, device = device)
+    actor = StochasticSelector(net, action_size, device=device)
     # Runner performs an episode of the environment
-    runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name = "Training", logwriter = logwriter)
+    runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name="Training", logwriter=logwriter)
     print("Training...")
-    train(runner, net, config, logger = logger, logwriter = logwriter)
+    train(runner, net, config, logwriter=logwriter)
     # For profiling...
     # import cProfile
-    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # cProfile.run('train(runner, agent, config, logwriter = logwriter )')
     # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
     print("Training Finished.")
     print("Evaluating...")
-    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes=config['total_evaluation_episodes'], name="Evaluation")
     print("Evaulations Done.")
     logwriter.close() # We don't need to write anything out to disk anymore

View file

@@ -8,48 +8,49 @@ import rltorch.memory as M
 import rltorch.env as E
 from rltorch.action_selector import StochasticSelector
 from tensorboardX import SummaryWriter
+from rltorch.log import Logger
 #
 ## Networks
 #
 class Value(nn.Module):
     def __init__(self, state_size):
         super(Value, self).__init__()
         self.state_size = state_size
         self.fc1 = rn.NoisyLinear(state_size, 64)
         self.fc_norm = nn.LayerNorm(64)
         self.fc2 = rn.NoisyLinear(64, 64)
         self.fc2_norm = nn.LayerNorm(64)
         self.fc3 = rn.NoisyLinear(64, 1)
     def forward(self, x):
         x = F.relu(self.fc_norm(self.fc1(x)))
         x = F.relu(self.fc2_norm(self.fc2(x)))
         x = self.fc3(x)
         return x
 class Policy(nn.Module):
     def __init__(self, state_size, action_size):
         super(Policy, self).__init__()
         self.state_size = state_size
         self.action_size = action_size
         self.fc1 = rn.NoisyLinear(state_size, 64)
         self.fc_norm = nn.LayerNorm(64)
         self.fc2 = rn.NoisyLinear(64, 64)
         self.fc2_norm = nn.LayerNorm(64)
         self.fc3 = rn.NoisyLinear(64, action_size)
     def forward(self, x):
         x = F.relu(self.fc_norm(self.fc1(x)))
         x = F.relu(self.fc2_norm(self.fc2(x)))
         x = F.softmax(self.fc3(x), dim = 1)
         return x
 #
 ## Configuration
@@ -68,64 +69,63 @@ config['disable_cuda'] = False
 #
 ## Training Loop
 #
-def train(runner, agent, config, logger = None, logwriter = None):
+def train(runner, agent, config, logwriter = None):
     finished = False
     while not finished:
         runner.run()
         agent.learn()
         if logwriter is not None:
             agent.value_net.log_named_parameters()
             agent.policy_net.log_named_parameters()
-            logwriter.write(logger)
+            logwriter.write(Logger)
         finished = runner.episode_num > config['total_training_episodes']
 if __name__ == "__main__":
     # Setting up the environment
     rltorch.set_seed(config['seed'])
-    print("Setting up environment...", end = " ")
+    print("Setting up environment...", end=" ")
     env = E.TorchWrap(gym.make(config['environment_name']))
     env.seed(config['seed'])
     print("Done.")
     state_size = env.observation_space.shape[0]
     action_size = env.action_space.n
     # Logging
-    logger = rltorch.log.Logger()
     logwriter = rltorch.log.LogWriter(SummaryWriter())
     # Setting up the networks
     device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
     policy_net = rn.Network(Policy(state_size, action_size),
-        torch.optim.Adam, config, device = device, name = "Policy")
+        torch.optim.Adam, config, device=device, name="Policy")
     value_net = rn.Network(Value(state_size),
-        torch.optim.Adam, config, device = device, name = "DQN")
+        torch.optim.Adam, config, device=device, name="DQN")
     # Memory stores experiences for later training
     memory = M.EpisodeMemory()
     # Actor takes a net and uses it to produce actions from given states
-    actor = StochasticSelector(policy_net, action_size, memory, device = device)
+    actor = StochasticSelector(policy_net, action_size, memory, device=device)
     # Agent is what performs the training
-    agent = rltorch.agents.PPOAgent(policy_net, value_net, memory, config, logger = logger)
+    agent = rltorch.agents.PPOAgent(policy_net, value_net, memory, config)
     # Runner performs a certain number of steps in the environment
-    runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
+    runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter)
     print("Training...")
-    train(runner, agent, config, logger = logger, logwriter = logwriter)
+    train(runner, agent, config, logwriter=logwriter)
     # For profiling...
     # import cProfile
-    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # cProfile.run('train(runner, agent, config, logwriter = logwriter )')
     # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
     print("Training Finished.")
     print("Evaluating...")
-    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes=config['total_evaluation_episodes'], name="Evaluation")
     print("Evaulations Done.")
     logwriter.close() # We don't need to write anything out to disk anymore

View file

@@ -7,61 +7,62 @@ import rltorch.network as rn
 import rltorch.memory as M
 import rltorch.env as E
 from rltorch.action_selector import StochasticSelector
-from tensorboardX import SummaryWriter
+# from tensorboardX import SummaryWriter
 from copy import deepcopy
+from rltorch.log import Logger
 #
 ## Networks
 #
 class Value(nn.Module):
     def __init__(self, state_size, action_size):
         super(Value, self).__init__()
         self.state_size = state_size
         self.action_size = action_size
         self.fc1 = rn.NoisyLinear(state_size, 255)
         self.fc_norm = nn.LayerNorm(255)
         self.value_fc = rn.NoisyLinear(255, 255)
         self.value_fc_norm = nn.LayerNorm(255)
         self.value = rn.NoisyLinear(255, 1)
         self.advantage_fc = rn.NoisyLinear(255, 255)
         self.advantage_fc_norm = nn.LayerNorm(255)
         self.advantage = rn.NoisyLinear(255, action_size)
     def forward(self, x):
         x = F.relu(self.fc_norm(self.fc1(x)))
         state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
         state_value = self.value(state_value)
         advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
         advantage = self.advantage(advantage)
         x = state_value + advantage - advantage.mean()
         return x
 class Policy(nn.Module):
     def __init__(self, state_size, action_size):
         super(Policy, self).__init__()
         self.state_size = state_size
         self.action_size = action_size
         self.fc1 = nn.Linear(state_size, 125)
         self.fc_norm = nn.LayerNorm(125)
         self.fc2 = nn.Linear(125, 125)
         self.fc2_norm = nn.LayerNorm(125)
         self.action_prob = nn.Linear(125, action_size)
     def forward(self, x):
         x = F.relu(self.fc_norm(self.fc1(x)))
         x = F.relu(self.fc2_norm(self.fc2(x)))
         x = F.softmax(self.action_prob(x), dim = 1)
         return x
 #
 ## Configuration
@@ -94,70 +95,70 @@ config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialSc
 #
 ## Training Loop
 #
-def train(runner, agent, config, logger = None, logwriter = None):
+def train(runner, agent, config, logwriter=None):
     finished = False
     last_episode_num = 1
     while not finished:
         runner.run(config['replay_skip'] + 1)
         agent.learn()
         if logwriter is not None:
             if last_episode_num < runner.episode_num:
                 last_episode_num = runner.episode_num
                 agent.value_net.log_named_parameters()
                 agent.policy_net.log_named_parameters()
-            logwriter.write(logger)
+            logwriter.write(Logger)
         finished = runner.episode_num > config['total_training_episodes']
 if __name__ == "__main__":
     # Setting up the environment
     rltorch.set_seed(config['seed'])
     print("Setting up environment...", end = " ")
     env = E.TorchWrap(gym.make(config['environment_name']))
     env.seed(config['seed'])
     print("Done.")
     state_size = env.observation_space.shape[0]
     action_size = env.action_space.n
     # Logging
-    logger = rltorch.log.Logger()
-    logwriter = rltorch.log.LogWriter(SummaryWriter())
+    logwriter = None
+    # logwriter = rltorch.log.LogWriter(SummaryWriter())
     # Setting up the networks
     device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
     config2 = deepcopy(config)
     config2['learning_rate'] = 0.01
     policy_net = rn.ESNetwork(Policy(state_size, action_size),
-        torch.optim.Adam, 500, None, config2, sigma = 0.1, device = device, name = "ES", logger = logger)
+        torch.optim.Adam, 500, None, config2, sigma=0.1, device=device, name="ES")
     value_net = rn.Network(Value(state_size, action_size),
-        torch.optim.Adam, config, device = device, name = "DQN", logger = logger)
+        torch.optim.Adam, config, device=device, name="DQN")
-    target_net = rn.TargetNetwork(value_net, device = device)
+    target_net = rn.TargetNetwork(value_net, device=device)
     # Actor takes a net and uses it to produce actions from given states
-    actor = StochasticSelector(policy_net, action_size, device = device)
+    actor = StochasticSelector(policy_net, action_size, device=device)
     # Memory stores experiences for later training
-    memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+    memory = M.PrioritizedReplayMemory(capacity=config['memory_size'], alpha=config['prioritized_replay_sampling_priority'])
     # Runner performs a certain number of steps in the environment
-    runner = rltorch.env.EnvironmentRunSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
+    runner = rltorch.env.EnvironmentRunSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter)
     # Agent is what performs the training
-    agent = rltorch.agents.QEPAgent(policy_net, value_net, memory, config, target_value_net = target_net, logger = logger)
+    agent = rltorch.agents.QEPAgent(policy_net, value_net, memory, config, target_value_net=target_net)
     print("Training...")
-    train(runner, agent, config, logger = logger, logwriter = logwriter)
+    train(runner, agent, config, logwriter=logwriter)
     # For profiling...
     # import cProfile
-    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # cProfile.run('train(runner, agent, config, logwriter = logwriter )')
     # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
     print("Training Finished.")
     print("Evaluating...")
-    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], name="Evaluation")
     print("Evaulations Done.")
-    logwriter.close() # We don't need to write anything out to disk anymore
+    # logwriter.close() # We don't need to write anything out to disk anymore

View file

@@ -7,30 +7,30 @@ import rltorch.network as rn
 import rltorch.memory as M
 import rltorch.env as E
 from rltorch.action_selector import StochasticSelector
-from tensorboardX import SummaryWriter
+from rltorch.log import Logger
 #
 ## Networks
 #
 class Policy(nn.Module):
     def __init__(self, state_size, action_size):
         super(Policy, self).__init__()
         self.state_size = state_size
         self.action_size = action_size
         self.fc1 = rn.NoisyLinear(state_size, 64)
         self.fc_norm = nn.LayerNorm(64)
         self.fc2 = rn.NoisyLinear(64, 64)
         self.fc2_norm = nn.LayerNorm(64)
         self.fc3 = rn.NoisyLinear(64, action_size)
     def forward(self, x):
         x = F.relu(self.fc_norm(self.fc1(x)))
         x = F.relu(self.fc2_norm(self.fc2(x)))
-        x = F.softmax(self.fc3(x), dim = 1)
+        x = F.softmax(self.fc3(x), dim=1)
         return x
 #
 ## Configuration
@@ -49,65 +49,65 @@ config['disable_cuda'] = False
 #
 ## Training Loop
 #
-def train(runner, agent, config, logger = None, logwriter = None):
+def train(runner, agent, config, logwriter=None):
     finished = False
     while not finished:
         runner.run()
         agent.learn()
         # When the episode number changes, log network paramters
         if logwriter is not None:
             agent.net.log_named_parameters()
-            logwriter.write(logger)
+            logwriter.write(Logger)
         finished = runner.episode_num > config['total_training_episodes']
 if __name__ == "__main__":
     torch.multiprocessing.set_sharing_strategy('file_system') # To not hit file descriptor memory limit
     # Setting up the environment
     rltorch.set_seed(config['seed'])
-    print("Setting up environment...", end = " ")
+    print("Setting up environment...", end=" ")
    env = E.TorchWrap(gym.make(config['environment_name']))
     env.seed(config['seed'])
     print("Done.")
     state_size = env.observation_space.shape[0]
     action_size = env.action_space.n
     # Logging
-    logger = rltorch.log.Logger()
-    logwriter = rltorch.log.LogWriter(SummaryWriter())
+    logwriter = None
+    # logwriter = rltorch.log.LogWriter(SummaryWriter())
     # Setting up the networks
     device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
     net = rn.Network(Policy(state_size, action_size),
-        torch.optim.Adam, config, device = device, name = "DQN")
+        torch.optim.Adam, config, device=device, name="DQN")
-    target_net = rn.TargetNetwork(net, device = device)
+    target_net = rn.TargetNetwork(net, device=device)
     # Memory stores experiences for later training
     memory = M.EpisodeMemory()
     # Actor takes a net and uses it to produce actions from given states
-    actor = StochasticSelector(net, action_size, memory, device = device)
+    actor = StochasticSelector(net, action_size, memory, device=device)
     # Agent is what performs the training
-    agent = rltorch.agents.REINFORCEAgent(net, memory, config, target_net = target_net, logger = logger)
+    agent = rltorch.agents.REINFORCEAgent(net, memory, config, target_net=target_net)
     # Runner performs one episode in the environment
-    runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
+    runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter)
     print("Training...")
-    train(runner, agent, config, logger = logger, logwriter = logwriter)
+    train(runner, agent, config, logwriter=logwriter)
     # For profiling...
     # import cProfile
-    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # cProfile.run('train(runner, agent, config, logwriter = logwriter )')
     # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
     print("Training Finished.")
     print("Evaluating...")
-    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes=config['total_evaluation_episodes'], name="Evaluation")
     print("Evaulations Done.")
-    logwriter.close() # We don't need to write anything out to disk anymore
+    # logwriter.close() # We don't need to write anything out to disk anymore

View file

@@ -7,39 +7,39 @@ import rltorch.network as rn
 import rltorch.memory as M
 import rltorch.env as E
 from rltorch.action_selector import ArgMaxSelector
-from tensorboardX import SummaryWriter
+from rltorch.log import Logger
 #
 ## Networks
 #
 class Value(nn.Module):
     def __init__(self, state_size, action_size):
         super(Value, self).__init__()
         self.state_size = state_size
         self.action_size = action_size
         self.fc1 = rn.NoisyLinear(state_size, 255)
         self.fc_norm = nn.LayerNorm(255)
         self.value_fc = rn.NoisyLinear(255, 255)
         self.value_fc_norm = nn.LayerNorm(255)
         self.value = rn.NoisyLinear(255, 1)
         self.advantage_fc = rn.NoisyLinear(255, 255)
         self.advantage_fc_norm = nn.LayerNorm(255)
         self.advantage = rn.NoisyLinear(255, action_size)
     def forward(self, x):
         x = F.relu(self.fc_norm(self.fc1(x)))
         state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
         state_value = self.value(state_value)
         advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
         advantage = self.advantage(advantage)
         x = state_value + advantage - advantage.mean()
         return x
 #
 ## Configuration
@@ -71,7 +71,7 @@ config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialSc
 #
 ## Training Loop
 #
-def train(runner, agent, config, logger = None, logwriter = None):
+def train(runner, agent, config, logwriter=None):
     finished = False
     last_episode_num = 1
     while not finished:
@@ -79,56 +79,56 @@ def train(runner, agent, config, logger = None, logwriter = None):
         agent.learn()
         if logwriter is not None:
             if last_episode_num < runner.episode_num:
                 last_episode_num = runner.episode_num
                 agent.net.log_named_parameters()
-            logwriter.write(logger)
+            logwriter.write(Logger)
         finished = runner.episode_num > config['total_training_episodes']
 if __name__ == "__main__":
     # Setting up the environment
     rltorch.set_seed(config['seed'])
-    print("Setting up environment...", end = " ")
+    print("Setting up environment...", end=" ")
     env = E.TorchWrap(gym.make(config['environment_name']))
     env.seed(config['seed'])
     print("Done.")
     state_size = env.observation_space.shape[0]
     action_size = env.action_space.n
     # Logging
-    logger = rltorch.log.Logger()
-    logwriter = rltorch.log.LogWriter(SummaryWriter())
+    logwriter = None
+    # logwriter = rltorch.log.LogWriter(SummaryWriter())
     # Setting up the networks
     device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
     net = rn.Network(Value(state_size, action_size),
-        torch.optim.Adam, config, device = device, name = "DQN", logger = logger)
+        torch.optim.Adam, config, device=device, name="DQN")
-    target_net = rn.TargetNetwork(net, device = device)
+    target_net = rn.TargetNetwork(net, device=device)
     # Actor takes a net and uses it to produce actions from given states
-    actor = ArgMaxSelector(net, action_size, device = device)
+    actor = ArgMaxSelector(net, action_size, device=device)
     # Memory stores experiences for later training
-    memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+    memory = M.PrioritizedReplayMemory(capacity=config['memory_size'], alpha=config['prioritized_replay_sampling_priority'])
     # Runner performs a certain number of steps in the environment
-    runner = rltorch.env.EnvironmentRunSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
+    runner = rltorch.env.EnvironmentRunSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter)
     # Agent is what performs the training
-    agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
+    agent = rltorch.agents.DQNAgent(net, memory, config, target_net=target_net)
     print("Training...")
-    train(runner, agent, config, logger = logger, logwriter = logwriter)
+    train(runner, agent, config, logwriter=logwriter)
     # For profiling...
     # import cProfile
-    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # cProfile.run('train(runner, agent, config, logwriter = logwriter )')
     # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
     print("Training Finished.")
     print("Evaluating...")
-    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], name = "Evaluation")
     print("Evaulations Done.")
     logwriter.close() # We don't need to write anything out to disk anymore

View file

@@ -9,58 +9,59 @@ import rltorch.env as E
 from rltorch.action_selector import ArgMaxSelector
 from tensorboardX import SummaryWriter
 import torch.multiprocessing as mp
+from rltorch.log import Logger
 #
 ## Networks
 #
 class Value(nn.Module):
     def __init__(self, state_size, action_size):
         super(Value, self).__init__()
         self.state_size = state_size
         self.action_size = action_size
-        self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
+        self.conv1 = nn.Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
         self.conv_norm1 = nn.LayerNorm([32, 19, 19])
-        self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
+        self.conv2 = nn.Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
         self.conv_norm2 = nn.LayerNorm([64, 8, 8])
-        self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
+        self.conv3 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
         self.conv_norm3 = nn.LayerNorm([64, 6, 6])
         self.fc1 = rn.NoisyLinear(64 * 6 * 6, 384)
         self.fc_norm = nn.LayerNorm(384)
         self.value_fc = rn.NoisyLinear(384, 384)
         self.value_fc_norm = nn.LayerNorm(384)
         self.value = rn.NoisyLinear(384, 1)
         self.advantage_fc = rn.NoisyLinear(384, 384)
         self.advantage_fc_norm = nn.LayerNorm(384)
         self.advantage = rn.NoisyLinear(384, action_size)
     def forward(self, x):
         x = F.relu(self.conv_norm1(self.conv1(x)))
         x = F.relu(self.conv_norm2(self.conv2(x)))
         x = F.relu(self.conv_norm3(self.conv3(x)))
         # Makes batch_size dimension again
         x = x.view(-1, 64 * 6 * 6)
         x = F.relu(self.fc_norm(self.fc1(x)))
         state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
         state_value = self.value(state_value)
         advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
         advantage = self.advantage(advantage)
         x = state_value + advantage - advantage.mean()
         # For debugging purposes...
         if torch.isnan(x).any().item():
             print("WARNING NAN IN MODEL DETECTED")
         return x
 #
 ## Configuration
 #
@@ -89,59 +90,73 @@ config['prioritized_replay_sampling_priority'] = 0.6
 # Should ideally start from 0 and move your way to 1 to prevent overfitting
 config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 5000)
+#
+## Training Loop
+#
+def train(runner, agent, config, logwriter = None):
+    finished = False
+    while not finished:
+        runner.run()
+        agent.learn()
+        if logwriter is not None:
+            agent.value_net.log_named_parameters()
+            agent.policy_net.log_named_parameters()
+            logwriter.write(Logger)
+        finished = runner.episode_num > config['total_training_episodes']
 if __name__ == "__main__":
     # To not hit file descriptor memory limit
     torch.multiprocessing.set_sharing_strategy('file_system')
     # Setting up the environment
     rltorch.set_seed(config['seed'])
     print("Setting up environment...", end = " ")
     env = E.FrameStack(E.TorchWrap(
         E.ProcessFrame(E.FireResetEnv(gym.make(config['environment_name'])),
-            resize_shape = (80, 80), crop_bounds = [34, 194, 15, 145], grayscale = True))
+            resize_shape=(80, 80), crop_bounds=[34, 194, 15, 145], grayscale=True))
         , 4)
     env.seed(config['seed'])
     print("Done.")
     state_size = env.observation_space.shape[0]
     action_size = env.action_space.n
     # Logging
-    logger = rltorch.log.Logger()
     logwriter = rltorch.log.LogWriter(SummaryWriter())
     # Setting up the networks
     device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
     net = rn.Network(Value(state_size, action_size),
-        torch.optim.Adam, config, device = device, name = "DQN")
+        torch.optim.Adam, config, device=device, name="DQN")
-    target_net = rn.TargetNetwork(net, device = device)
+    target_net = rn.TargetNetwork(net, device=device)
     net.model.share_memory()
     target_net.model.share_memory()
     # Actor takes a net and uses it to produce actions from given states
-    actor = ArgMaxSelector(net, action_size, device = device)
+    actor = ArgMaxSelector(net, action_size, device=device)
     # Memory stores experiences for later training
-    memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+    memory = M.PrioritizedReplayMemory(capacity=config['memory_size'], alpha=config['prioritized_replay_sampling_priority'])
     # Runner performs a certain number of steps in the environment
-    runner = rltorch.mp.EnvironmentRun(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
+    runner = rltorch.mp.EnvironmentRun(env, actor, config, name="Training", memory=memory, logwriter=logwriter)
     # Agent is what performs the training
-    agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
+    agent = rltorch.agents.DQNAgent(net, memory, config, target_net=target_net)
     print("Training...")
-    train(runner, agent, config, logger = logger, logwriter = logwriter)
+    train(runner, agent, config, logwriter=logwriter)
     # For profiling...
     # import cProfile
-    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # cProfile.run('train(runner, agent, config, logwriter = logwriter )')
     # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
     print("Training Finished.")
     runner.terminate() # We don't need the extra process anymore
     print("Evaluating...")
-    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes=config['total_evaluation_episodes'], name="Evaluation")
     print("Evaulations Done.")
     logwriter.close() # We don't need to write anything out to disk anymore

View file

@@ -2,14 +2,14 @@ from copy import deepcopy
 import numpy as np
 import torch
 import torch.nn.functional as F
+import rltorch.log as log
 class A2CSingleAgent:
-    def __init__(self, policy_net, value_net, memory, config, logger=None):
+    def __init__(self, policy_net, value_net, memory, config):
         self.policy_net = policy_net
         self.value_net = value_net
         self.memory = memory
         self.config = deepcopy(config)
-        self.logger = logger
     def _discount_rewards(self, rewards):
         gammas = torch.ones_like(rewards)
@@ -79,9 +79,9 @@ class A2CSingleAgent:
         policy_loss = (-log_prob_batch * advantages).sum()
-        if self.logger is not None:
-            self.logger.append("Loss/Policy", policy_loss.item())
-            self.logger.append("Loss/Value", value_loss.item())
+        if log.enabled:
+            log.Logger["Loss/Policy"].append(policy_loss.item())
+            log.Logger["Loss/Value"].append(value_loss.item())
         self.policy_net.zero_grad()
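The agents all switch to the same guard: no logger constructor argument, just a check of the module-level flag. A hedged sketch of that pattern in isolation follows; how enabled gets switched on (for example when a LogWriter is created) is not part of this diff, so it is treated here as an ordinary module attribute a script can set directly.

import rltorch.log as log

log.enabled = True                                       # assumption: toggled explicitly by the script
policy_loss_value = 0.42                                 # stand-in for policy_loss.item()
if log.enabled:
    log.Logger["Loss/Policy"].append(policy_loss_value)  # same call shape the agents use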

View file

@@ -3,14 +3,14 @@ from copy import deepcopy
 import rltorch.memory as M
 import torch
 import torch.nn.functional as F
+import rltorch.log as log
 class DQNAgent:
-    def __init__(self, net, memory, config, target_net=None, logger=None):
+    def __init__(self, net, memory, config, target_net=None):
         self.net = net
         self.target_net = target_net
         self.memory = memory
         self.config = deepcopy(config)
-        self.logger = logger
     def save(self, file_location):
         torch.save(self.net.model.state_dict(), file_location)
     def load(self, file_location):
@@ -18,7 +18,7 @@ class DQNAgent:
         self.net.model.to(self.net.device)
         self.target_net.sync()
-    def learn(self, logger=None):
+    def learn(self):
         if len(self.memory) < self.config['batch_size']:
             return
@@ -68,8 +68,8 @@ class DQNAgent:
         # loss = F.smooth_l1_loss(obtained_values, expected_values)
         loss = F.mse_loss(obtained_values, expected_values)
-        if self.logger is not None:
-            self.logger.append("Loss", loss.item())
+        if log.enabled:
+            log.Logger["Loss"].append(loss.item())
         self.net.zero_grad()
         loss.backward()

View file

@@ -3,15 +3,14 @@ from copy import deepcopy
 import rltorch.memory as M
 import torch
 import torch.nn.functional as F
+import rltorch.log as log
 class DQfDAgent:
-    def __init__(self, net, memory, config, target_net=None, logger=None):
+    def __init__(self, net, memory, config, target_net=None):
         self.net = net
         self.target_net = target_net
         self.memory = memory
         self.config = deepcopy(config)
-        self.logger = logger
     def save(self, file_location):
         torch.save(self.net.model.state_dict(), file_location)
     def load(self, file_location):
@@ -19,7 +18,7 @@ class DQfDAgent:
         self.net.model.to(self.net.device)
         self.target_net.sync()
-    def learn(self, logger=None):
+    def learn(self):
         if len(self.memory) < self.config['batch_size']:
             return
@@ -149,8 +148,8 @@ class DQfDAgent:
             demo_loss = 0
         loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss
-        if self.logger is not None:
-            self.logger.append("Loss", loss.item())
+        if log.enabled:
+            log.Logger["Loss"].append(loss.item())
         self.net.zero_grad()
         loss.backward()

View file

@@ -3,15 +3,15 @@ import torch
 import torch.nn.functional as F
 from torch.distributions import Categorical
 import rltorch
+import rltorch.log as log
 class PPOAgent:
-    def __init__(self, policy_net, value_net, memory, config, logger=None):
+    def __init__(self, policy_net, value_net, memory, config):
         self.policy_net = policy_net
         self.old_policy_net = rltorch.network.TargetNetwork(policy_net)
         self.value_net = value_net
         self.memory = memory
         self.config = deepcopy(config)
-        self.logger = logger
     def _discount_rewards(self, rewards):
         gammas = torch.ones_like(rewards)
@@ -59,9 +59,9 @@ class PPOAgent:
         policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages # From original paper
         policy_loss = -torch.min(policy_loss1, policy_loss2).sum()
-        if self.logger is not None:
-            self.logger.append("Loss/Policy", policy_loss.item())
-            self.logger.append("Loss/Value", value_loss.item())
+        if log.enabled:
+            log.Logger["Loss/Policy"].append(policy_loss.item())
+            log.Logger["Loss/Value"].append(value_loss.item())
         self.old_policy_net.sync()
         self.policy_net.zero_grad()

View file

@@ -6,13 +6,14 @@ import torch.nn.functional as F
 from torch.distributions import Categorical
 import rltorch
 import rltorch.memory as M
+import rltorch.log as log
 # Q-Evolutionary Policy Agent
 # Maximizes the policy with respect to the Q-Value function.
 # Since function is non-differentiabile, depends on the Evolutionary Strategy algorithm
 class QEPAgent:
-    def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4):
+    def __init__(self, policy_net, value_net, memory, config, target_value_net=None, entropy_importance=0, policy_skip=4):
         self.policy_net = policy_net
         assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP)
         self.policy_net.fitness = self.fitness
@@ -20,7 +21,6 @@ class QEPAgent:
         self.target_value_net = target_value_net
         self.memory = memory
         self.config = deepcopy(config)
-        self.logger = logger
         self.policy_skip = policy_skip
         self.entropy_importance = entropy_importance
@@ -67,7 +67,7 @@ class QEPAgent:
         return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item()
-    def learn(self, logger=None):
+    def learn(self):
         if len(self.memory) < self.config['batch_size']:
             return
@@ -114,8 +114,8 @@ class QEPAgent:
         else:
             value_loss = F.mse_loss(obtained_values, expected_values)
-        if self.logger is not None:
-            self.logger.append("Loss/Value", value_loss.item())
+        if log.enabled:
+            log.Logger["Loss/Value"].append(value_loss.item())
         self.value_net.zero_grad()
         value_loss.backward()

View file

@ -4,14 +4,13 @@ import torch
import rltorch import rltorch
class REINFORCEAgent: class REINFORCEAgent:
def __init__(self, net, memory, config, target_net=None, logger=None): def __init__(self, net, memory, config, target_net=None):
self.net = net self.net = net
if not isinstance(memory, rltorch.memory.EpisodeMemory): if not isinstance(memory, rltorch.memory.EpisodeMemory):
raise ValueError("Memory must be of instance EpisodeMemory") raise ValueError("Memory must be of instance EpisodeMemory")
self.memory = memory self.memory = memory
self.config = deepcopy(config) self.config = deepcopy(config)
self.target_net = target_net self.target_net = target_net
self.logger = logger
# Shaped rewards implements three improvements to REINFORCE # Shaped rewards implements three improvements to REINFORCE
# 1) Discounted rewards, future rewards matter less than current # 1) Discounted rewards, future rewards matter less than current
@@ -42,8 +41,8 @@ class REINFORCEAgent:
policy_loss = (-log_prob_batch * shaped_reward_batch).sum() policy_loss = (-log_prob_batch * shaped_reward_batch).sum()
if self.logger is not None: if rltorch.log.enabled:
self.logger.append("Loss", policy_loss.item()) rltorch.log.Logger["Loss"].append(policy_loss.item())
self.net.zero_grad() self.net.zero_grad()
policy_loss.backward() policy_loss.backward()
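REINFORCEAgent reaches the flag through the fully qualified `rltorch.log.enabled`, while other files use `import rltorch.log as log`; both name the same module object, which is what makes a single switch work across the codebase. A small check of that, assuming rltorch at this commit:

```python
import rltorch
import rltorch.log as log

rltorch.log.enabled = True            # flip the switch via one access path
print(log.enabled)                    # True -- same module, same attribute
log.Logger["Loss"].append(0.5)        # append via the aliased import
print(rltorch.log.Logger["Loss"])     # [0.5] -- visible via the other path
```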

View file

@@ -2,7 +2,7 @@ from copy import deepcopy
import time import time
import rltorch import rltorch
def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False): def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, name="", render=False):
for episode in range(total_episodes): for episode in range(total_episodes):
state = env.reset() state = env.reset()
done = False done = False
@@ -23,8 +23,8 @@ def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=Non
print("episode: {}/{}, score: {}" print("episode: {}/{}, score: {}"
.format(episode, total_episodes, episode_reward), flush=True) .format(episode, total_episodes, episode_reward), flush=True)
if logger is not None: if rltorch.log.enabled:
logger.append(name + '/EpisodeReward', episode_reward) rltorch.log.Logger[name + '/EpisodeReward'].append(episode_reward)
class EnvironmentRunSync: class EnvironmentRunSync:
@@ -42,7 +42,6 @@ class EnvironmentRunSync:
def run(self, iterations): def run(self, iterations):
state = self.last_state state = self.last_state
logger = rltorch.log.Logger() if self.logwriter is not None else None
for _ in range(iterations): for _ in range(iterations):
action = self.actor.act(state) action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action) next_state, reward, done, _ = self.env.step(action)
@@ -61,13 +60,13 @@ class EnvironmentRunSync:
.format(self.episode_num, self.config['total_training_episodes'], self.episode_reward), flush=True) .format(self.episode_num, self.config['total_training_episodes'], self.episode_reward), flush=True)
if self.logwriter is not None: if self.logwriter is not None:
logger.append(self.name + '/EpisodeReward', self.episode_reward) rltorch.log.Logger[self.name + '/EpisodeReward'].append(self.episode_reward)
self.episode_reward = 0 self.episode_reward = 0
state = self.env.reset() state = self.env.reset()
self.episode_num += 1 self.episode_num += 1
if self.logwriter is not None: if self.logwriter is not None:
self.logwriter.write(logger) self.logwriter.write(rltorch.log.Logger)
self.last_state = state self.last_state = state
@@ -86,15 +85,13 @@ class EnvironmentEpisodeSync:
state = self.env.reset() state = self.env.reset()
done = False done = False
episodeReward = 0 episodeReward = 0
logger = rltorch.log.Logger() if self.logwriter is not None else None
while not done: while not done:
action = self.actor.act(state) action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action) next_state, reward, done, _ = self.env.step(action)
episodeReward += reward episodeReward += reward
if self.memory is not None: if self.memory is not None:
self.memory.append(state, action, reward, next_state, done) self.memory.append(state, action, reward, next_state, done)
state = next_state state = next_state
if self.episode_num % self.config['print_stat_n_eps'] == 0: if self.episode_num % self.config['print_stat_n_eps'] == 0:
@@ -102,7 +99,7 @@ class EnvironmentEpisodeSync:
.format(self.episode_num, self.config['total_training_episodes'], episodeReward), flush=True) .format(self.episode_num, self.config['total_training_episodes'], episodeReward), flush=True)
if self.logwriter is not None: if self.logwriter is not None:
logger.append(self.name + '/EpisodeReward', episodeReward) rltorch.log.Logger[self.name + '/EpisodeReward'].append(episodeReward)
self.logwriter.write(logger) self.logwriter.write(rltorch.log.Logger)
self.episode_num += 1 self.episode_num += 1
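With the per-run `rltorch.log.Logger()` instances removed, the runners append episode rewards straight into the shared dict under a `"<name>/EpisodeReward"` key and then hand the whole dict to the writer. A small standalone illustration of that keying convention; the rewards and the runner name below are stand-ins, and no runner is constructed:

```python
import rltorch.log as log

log.enabled = True
name = "Train"                                    # stand-in runner name
for episode_reward in (10.0, 12.5, 9.0):          # stand-in episode rewards
    if log.enabled:
        log.Logger[name + '/EpisodeReward'].append(episode_reward)

print(log.Logger['Train/EpisodeReward'])          # [10.0, 12.5, 9.0]
```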

View file

@@ -3,6 +3,7 @@ from typing import Dict, List, Any
import numpy as np import numpy as np
import torch import torch
enabled = False
Logger: Dict[Any, List[Any]] = defaultdict(list) Logger: Dict[Any, List[Any]] = defaultdict(list)
class LogWriter: class LogWriter:
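The whole mechanism rests on the two module-level objects added here: an `enabled` flag and a `Logger` defaultdict that maps a tag to the list of values recorded under it. A self-contained sketch that mirrors that state; the `LogWriter` that flushes these entries is assumed and not reproduced:

```python
from collections import defaultdict
from typing import Any, Dict, List

# Mirrors the module-level state added to rltorch.log in this commit.
enabled = False
Logger: Dict[Any, List[Any]] = defaultdict(list)

if __name__ == "__main__":
    enabled = True                                # opt in to logging
    if enabled:
        Logger["Loss/Policy"].append(0.25)
        Logger["Loss/Policy"].append(0.19)
        Logger["Train/EpisodeReward"].append(100.0)
    # Each tag accumulates a list of values until a writer flushes them.
    print(dict(Logger))
```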

View file

@@ -2,4 +2,3 @@ from .EpisodeMemory import *
from .ReplayMemory import * from .ReplayMemory import *
from .PrioritizedReplayMemory import * from .PrioritizedReplayMemory import *
from .DQfDMemory import * from .DQfDMemory import *
from .iDQfDMemory import *

View file

@@ -3,14 +3,14 @@
from copy import deepcopy from copy import deepcopy
import torch.multiprocessing as mp import torch.multiprocessing as mp
import rltorch.log as log
class EnvironmentEpisode(mp.Process): class EnvironmentEpisode(mp.Process):
def __init__(self, env, actor, config, logger=None, name=""): def __init__(self, env, actor, config, name=""):
super(EnvironmentEpisode, self).__init__() super(EnvironmentEpisode, self).__init__()
self.env = env self.env = env
self.actor = actor self.actor = actor
self.config = deepcopy(config) self.config = deepcopy(config)
self.logger = logger
self.name = name self.name = name
self.episode_num = 1 self.episode_num = 1
@@ -30,7 +30,7 @@ class EnvironmentEpisode(mp.Process):
if printstat: if printstat:
print("episode: {}/{}, score: {}" print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], episode_reward)) .format(self.episode_num, self.config['total_training_episodes'], episode_reward))
if self.logger is not None: if log.enabled:
self.logger.append(self.name + '/EpisodeReward', episode_reward) log.Logger[self.name + '/EpisodeReward'].append(episode_reward)
self.episode_num += 1 self.episode_num += 1

View file

@@ -2,7 +2,7 @@ from copy import deepcopy
import numpy as np import numpy as np
import torch import torch
from .Network import Network from .Network import Network
import rltorch.log as log
# [TODO] Should we torch.no_grad the __call__? # [TODO] Should we torch.no_grad the __call__?
# What if we want to sometimes do gradient descent as well? # What if we want to sometimes do gradient descent as well?
@@ -34,13 +34,11 @@ class ESNetwork(Network):
A dictionary of configuration items. A dictionary of configuration items.
device device
A device to send the weights to. A device to send the weights to.
logger
Keeps track of historical weights
name name
For use in logger to differentiate in analysis. For use in logger to differentiate in analysis.
""" """
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""): def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, name=""):
super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name) super(ESNetwork, self).__init__(model, optimizer, config, device, name)
self.population_size = population_size self.population_size = population_size
self.fitness = fitness_fn self.fitness = fitness_fn
self.sigma = sigma self.sigma = sigma
@@ -105,8 +103,8 @@ class ESNetwork(Network):
[self.fitness(x, *args) for x in candidate_solutions], [self.fitness(x, *args) for x in candidate_solutions],
device=self.device device=self.device
) )
if self.logger is not None: if log.enabled:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item()) log.Logger[self.name + "/" + "fitness_value"].append(fitness_values.mean().item())
fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps) fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
## Insert adjustments into gradients slot ## Insert adjustments into gradients slot
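Each evolutionary-strategy update now records the population's mean fitness under `"<network name>/fitness_value"` when logging is enabled. A hedged way to inspect that history after some training; the `"Policy"` prefix is an assumption and stands for whatever `name` the network was constructed with:

```python
import rltorch.log as log

history = log.Logger["Policy/fitness_value"]   # list of per-step mean fitness values
if history:
    print("steps logged:     ", len(history))
    print("last mean fitness:", history[-1])
    print("best mean fitness:", max(history))
else:
    print("nothing logged yet -- was log.enabled set before training?")
```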

View file

@@ -3,6 +3,7 @@ import numpy as np
import torch import torch
import torch.multiprocessing as mp import torch.multiprocessing as mp
from .Network import Network from .Network import Network
import rltorch.log as log
class fn_copy: class fn_copy:
def __init__(self, fn, args): def __init__(self, fn, args):
@@ -19,8 +20,8 @@ class ESNetworkMP(Network):
fitness_fun := model, *args -> fitness_value (float) fitness_fun := model, *args -> fitness_value (float)
We wish to find a model that maximizes the fitness function We wish to find a model that maximizes the fitness function
""" """
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""): def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, name=""):
super(ESNetworkMP, self).__init__(model, optimizer, config, device, logger, name) super(ESNetworkMP, self).__init__(model, optimizer, config, device, name)
self.population_size = population_size self.population_size = population_size
self.fitness = fitness_fn self.fitness = fitness_fn
self.sigma = sigma self.sigma = sigma
@@ -76,8 +77,8 @@ class ESNetworkMP(Network):
device=self.device device=self.device
) )
if self.logger is not None: if log.enabled:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item()) log.Logger[self.name + "/" + "fitness_value"].append(fitness_values.mean().item())
fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps) fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
## Insert adjustments into gradients slot ## Insert adjustments into gradients slot

View file

@@ -1,3 +1,5 @@
import rltorch.log as log
class Network: class Network:
""" """
Wrapper around model and optimizer in PyTorch to abstract away common use cases. Wrapper around model and optimizer in PyTorch to abstract away common use cases.
@@ -12,12 +14,10 @@ class Network:
A dictionary of configuration items. A dictionary of configuration items.
device device
A device to send the weights to. A device to send the weights to.
logger
Keeps track of historical weights
name name
For use in logger to differentiate in analysis. For use in logger to differentiate in analysis.
""" """
def __init__(self, model, optimizer, config, device=None, logger=None, name=""): def __init__(self, model, optimizer, config, device=None, name=""):
self.model = model self.model = model
if 'weight_decay' in config: if 'weight_decay' in config:
self.optimizer = optimizer( self.optimizer = optimizer(
@@ -27,7 +27,6 @@ class Network:
) )
else: else:
self.optimizer = optimizer(model.parameters(), lr=config['learning_rate']) self.optimizer = optimizer(model.parameters(), lr=config['learning_rate'])
self.logger = logger
self.name = name self.name = name
self.device = device self.device = device
if self.device is not None: if self.device is not None:
@@ -63,8 +62,8 @@ class Network:
self.optimizer.step() self.optimizer.step()
def log_named_parameters(self): def log_named_parameters(self):
if self.logger is not None: if log.enabled:
for name, param in self.model.named_parameters(): for name, param in self.model.named_parameters():
self.logger.append(self.name + "/" + name, param.cpu().detach().numpy()) log.Logger[self.name + "/" + name].append(param.cpu().detach().numpy())
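`log_named_parameters()` now snapshots every parameter tensor into the shared dict under `"<network name>/<parameter name>"`. The layout can be checked standalone with a toy `nn.Module` in place of a full `rltorch.network.Network`; the `"Value"` name is an assumption:

```python
import torch.nn as nn
import rltorch.log as log

log.enabled = True
toy = nn.Linear(4, 2)            # stand-in for a wrapped model
net_name = "Value"               # assumed network name

# Same check-then-append pattern as Network.log_named_parameters() above.
if log.enabled:
    for pname, param in toy.named_parameters():
        log.Logger[net_name + "/" + pname].append(param.cpu().detach().numpy())

print(sorted(log.Logger.keys()))  # ['Value/bias', 'Value/weight']
```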