Added Evolutionary Strategies Network and more example scripts

Brandon Rozek 2019-02-27 09:52:28 -05:00
parent 26084d4c7c
commit 76a044ace9
14 changed files with 695 additions and 41 deletions

examples/acrobot_a2c.py (new file, 161 lines)

@@ -0,0 +1,161 @@
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import rltorch
import rltorch.network as rn
import rltorch.memory as M
import rltorch.env as E
from rltorch.action_selector import StochasticSelector
from tensorboardX import SummaryWriter
import torch.multiprocessing as mp
import signal
from copy import deepcopy
class Value(nn.Module):
def __init__(self, state_size):
super(Value, self).__init__()
self.state_size = state_size
self.fc1 = rn.NoisyLinear(state_size, 64)
self.fc_norm = nn.LayerNorm(64)
self.fc2 = rn.NoisyLinear(64, 64)
self.fc2_norm = nn.LayerNorm(64)
self.fc3 = rn.NoisyLinear(64, 1)
def forward(self, x):
x = F.relu(self.fc_norm(self.fc1(x)))
x = F.relu(self.fc2_norm(self.fc2(x)))
x = self.fc3(x)
return x
class Policy(nn.Module):
def __init__(self, state_size, action_size):
super(Policy, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.fc1 = rn.NoisyLinear(state_size, 64)
self.fc_norm = nn.LayerNorm(64)
self.fc2 = rn.NoisyLinear(64, 64)
self.fc2_norm = nn.LayerNorm(64)
self.fc3 = rn.NoisyLinear(64, action_size)
# self.fc3_norm = nn.LayerNorm(action_size)
# self.value_fc = rn.NoisyLinear(64, 64)
# self.value_fc_norm = nn.LayerNorm(64)
# self.value = rn.NoisyLinear(64, 1)
# self.advantage_fc = rn.NoisyLinear(64, 64)
# self.advantage_fc_norm = nn.LayerNorm(64)
# self.advantage = rn.NoisyLinear(64, action_size)
def forward(self, x):
x = F.relu(self.fc_norm(self.fc1(x)))
x = F.relu(self.fc2_norm(self.fc2(x)))
x = F.softmax(self.fc3(x), dim = 1)
# state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
# state_value = self.value(state_value)
# advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
# advantage = self.advantage(advantage)
# x = F.softmax(state_value + advantage - advantage.mean(), dim = 1)
return x
config = {}
config['seed'] = 901
config['environment_name'] = 'Acrobot-v1'
config['memory_size'] = 2000
config['total_training_episodes'] = 500
config['total_evaluation_episodes'] = 10
config['batch_size'] = 32
config['learning_rate'] = 1e-3
config['target_sync_tau'] = 1e-1
config['discount_rate'] = 0.99
config['replay_skip'] = 0
# How many episodes between printing out the episode stats
config['print_stat_n_eps'] = 1
config['disable_cuda'] = False
def train(runner, agent, config, logger = None, logwriter = None):
finished = False
last_episode_num = 1
while not finished:
runner.run(config['replay_skip'] + 1)
agent.learn()
if logwriter is not None:
if last_episode_num < runner.episode_num:
last_episode_num = runner.episode_num
agent.value_net.log_named_parameters()
agent.policy_net.log_named_parameters()
logwriter.write(logger)
finished = runner.episode_num > config['total_training_episodes']
if __name__ == "__main__":
torch.multiprocessing.set_sharing_strategy('file_system') # To avoid hitting the file descriptor limit
# Setting up the environment
rltorch.set_seed(config['seed'])
print("Setting up environment...", end = " ")
env = E.TorchWrap(gym.make(config['environment_name']))
env.seed(config['seed'])
print("Done.")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Logging
logger = rltorch.log.Logger()
logwriter = rltorch.log.LogWriter(SummaryWriter())
# Setting up the networks
device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
policy_net = rn.Network(Policy(state_size, action_size),
torch.optim.Adam, config, device = device, name = "Policy")
value_net = rn.Network(Value(state_size),
torch.optim.Adam, config, device = device, name = "DQN")
# Memory stores experiences for later training
memory = M.EpisodeMemory()
# Actor takes a net and uses it to produce actions from given states
actor = StochasticSelector(policy_net, action_size, memory, device = device)
# Agent is what performs the training
# agent = rltorch.agents.REINFORCEAgent(net, memory, config, target_net = target_net, logger = logger)
agent = rltorch.agents.A2CSingleAgent(policy_net, value_net, memory, config, logger = logger)
# Runner performs a certain number of steps in the environment
runner = rltorch.env.EnvironmentRunSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
print("Training...")
train(runner, agent, config, logger = logger, logwriter = logwriter)
# For profiling...
# import cProfile
# cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
print("Training Finished.")
print("Evaluating...")
rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
print("Evaulations Done.")
logwriter.close() # We don't need to write anything out to disk anymore
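
Note: the training update itself lives in rltorch.agents.A2CSingleAgent, which is not part of this commit. As a rough orientation, below is a minimal sketch of the standard advantage actor-critic update such an agent performs on a finished episode; the helper names and shapes are hypothetical, not rltorch's actual API.

import torch
import torch.nn.functional as F

def discounted_returns(rewards, gamma=0.99):
    # G_t = r_t + gamma * G_{t+1}, computed backwards over one episode
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.insert(0, g)
    return torch.tensor(returns)

def a2c_losses(log_probs, state_values, rewards, gamma=0.99):
    # log_probs: log pi(a_t | s_t) for the actions taken, shape [T]
    # state_values: V(s_t) from the value network, shape [T]
    returns = discounted_returns(rewards, gamma)
    advantages = returns - state_values.detach()
    policy_loss = -(log_probs * advantages).mean()
    value_loss = F.mse_loss(state_values, returns)
    return policy_loss, value_loss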

examples/acrobot_es.py (new file, 120 lines)

@@ -0,0 +1,120 @@
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.network as rn
import rltorch.memory as M
import rltorch.env as E
from rltorch.action_selector import StochasticSelector
from tensorboardX import SummaryWriter
import torch.multiprocessing as mp
class Policy(nn.Module):
def __init__(self, state_size, action_size):
super(Policy, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.fc1 = nn.Linear(state_size, 125)
self.fc_norm = nn.LayerNorm(125)
self.fc2 = nn.Linear(125, 125)
self.fc2_norm = nn.LayerNorm(125)
self.action_prob = nn.Linear(125, action_size)
def forward(self, x):
x = F.relu(self.fc_norm(self.fc1(x)))
x = F.relu(self.fc2_norm(self.fc2(x)))
x = F.softmax(self.action_prob(x), dim = 1)
return x
config = {}
config['seed'] = 901
config['environment_name'] = 'Acrobot-v1'
config['memory_size'] = 2000
config['total_training_episodes'] = 50
config['total_evaluation_episodes'] = 5
config['batch_size'] = 32
config['learning_rate'] = 1e-1
config['target_sync_tau'] = 1e-1
config['discount_rate'] = 0.99
config['replay_skip'] = 0
# How many episodes between printing out the episode stats
config['print_stat_n_eps'] = 1
config['disable_cuda'] = False
def train(env, net, actor, config, logger = None, logwriter = None):
finished = False
episode_num = 1
while not finished:
rltorch.env.simulateEnvEps(env, actor, config, logger = logger, name = "Training")
episode_num += 1
net.calc_gradients()
net.step()
# Log the network parameters after every episode
if logwriter is not None:
net.log_named_parameters()
logwriter.write(logger)
finished = episode_num > config['total_training_episodes']
def fitness(model):
env = gym.make("Acrobot-v1")
state = torch.from_numpy(env.reset()).float().unsqueeze(0)
total_reward = 0
done = False
while not done:
action_probabilities = model(state)
distribution = Categorical(action_probabilities)
action = distribution.sample().item()
next_state, reward, done, _ = env.step(action)
total_reward += reward
state = torch.from_numpy(next_state).float().unsqueeze(0)
return total_reward
if __name__ == "__main__":
# Setting up the environment
rltorch.set_seed(config['seed'])
print("Setting up environment...", end = " ")
env = E.TorchWrap(gym.make(config['environment_name']))
env.seed(config['seed'])
print("Done.")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Logging
logger = rltorch.log.Logger()
# logwriter = rltorch.log.LogWriter(logger, SummaryWriter())
logwriter = rltorch.log.LogWriter(SummaryWriter())
# Setting up the networks
device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
net = rn.ESNetwork(Policy(state_size, action_size),
torch.optim.Adam, 100, fitness, config, device = device, name = "ES", logger = logger)
net.model.share_memory()
# Actor takes a net and uses it to produce actions from given states
actor = StochasticSelector(net, action_size, device = device)
print("Training...")
train(env, net, actor, config, logger = logger, logwriter = logwriter)
# For profiling...
# import cProfile
# cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
print("Training Finished.")
print("Evaluating...")
rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
print("Evaulations Done.")
logwriter.close() # We don't need to write anything out to disk anymore
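
Note: this script never computes a loss; net.calc_gradients() fills the parameter gradients with the evolution-strategies estimator implemented by rn.ESNetwork (added later in this commit). A simplified standalone sketch of that estimator for a single flat parameter vector, with a hypothetical helper name:

import torch

def es_gradient_estimate(theta, fitness_fn, population_size=100, sigma=0.05):
    # theta: 1-D parameter tensor; fitness_fn: tensor -> float
    noise = torch.randn(population_size, theta.numel())
    fitness = torch.tensor([fitness_fn(theta + sigma * eps) for eps in noise])
    # Mean-shift and scale the fitness values, as ESNetwork.calc_gradients does
    fitness = (fitness - fitness.mean()) / (fitness.std() + 1e-8)
    # Fitness-weighted average of the noise, stored as the "gradient"
    return (noise * fitness.unsqueeze(1)).mean(0) / sigma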

examples/acrobot_ppo.py (new file, 161 lines)

@@ -0,0 +1,161 @@
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import rltorch
import rltorch.network as rn
import rltorch.memory as M
import rltorch.env as E
from rltorch.action_selector import StochasticSelector
from tensorboardX import SummaryWriter
import torch.multiprocessing as mp
import signal
from copy import deepcopy
class Value(nn.Module):
def __init__(self, state_size):
super(Value, self).__init__()
self.state_size = state_size
self.fc1 = rn.NoisyLinear(state_size, 64)
self.fc_norm = nn.LayerNorm(64)
self.fc2 = rn.NoisyLinear(64, 64)
self.fc2_norm = nn.LayerNorm(64)
self.fc3 = rn.NoisyLinear(64, 1)
def forward(self, x):
x = F.relu(self.fc_norm(self.fc1(x)))
x = F.relu(self.fc2_norm(self.fc2(x)))
x = self.fc3(x)
return x
class Policy(nn.Module):
def __init__(self, state_size, action_size):
super(Policy, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.fc1 = rn.NoisyLinear(state_size, 64)
self.fc_norm = nn.LayerNorm(64)
self.fc2 = rn.NoisyLinear(64, 64)
self.fc2_norm = nn.LayerNorm(64)
self.fc3 = rn.NoisyLinear(64, action_size)
# self.fc3_norm = nn.LayerNorm(action_size)
# self.value_fc = rn.NoisyLinear(64, 64)
# self.value_fc_norm = nn.LayerNorm(64)
# self.value = rn.NoisyLinear(64, 1)
# self.advantage_fc = rn.NoisyLinear(64, 64)
# self.advantage_fc_norm = nn.LayerNorm(64)
# self.advantage = rn.NoisyLinear(64, action_size)
def forward(self, x):
x = F.relu(self.fc_norm(self.fc1(x)))
x = F.relu(self.fc2_norm(self.fc2(x)))
x = F.softmax(self.fc3(x), dim = 1)
# state_value = F.relu(self.value_fc_norm(self.value_fc(x)))
# state_value = self.value(state_value)
# advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x)))
# advantage = self.advantage(advantage)
# x = F.softmax(state_value + advantage - advantage.mean(), dim = 1)
return x
config = {}
config['seed'] = 901
config['environment_name'] = 'Acrobot-v1'
config['memory_size'] = 2000
config['total_training_episodes'] = 500
config['total_evaluation_episodes'] = 10
config['batch_size'] = 32
config['learning_rate'] = 1e-3
config['target_sync_tau'] = 1e-1
config['discount_rate'] = 0.99
config['replay_skip'] = 0
# How many episodes between printing out the episode stats
config['print_stat_n_eps'] = 1
config['disable_cuda'] = False
def train(runner, agent, config, logger = None, logwriter = None):
finished = False
last_episode_num = 1
while not finished:
runner.run(config['replay_skip'] + 1)
agent.learn()
if logwriter is not None:
if last_episode_num < runner.episode_num:
last_episode_num = runner.episode_num
agent.value_net.log_named_parameters()
agent.policy_net.log_named_parameters()
logwriter.write(logger)
finished = runner.episode_num > config['total_training_episodes']
if __name__ == "__main__":
torch.multiprocessing.set_sharing_strategy('file_system') # To avoid hitting the file descriptor limit
# Setting up the environment
rltorch.set_seed(config['seed'])
print("Setting up environment...", end = " ")
env = E.TorchWrap(gym.make(config['environment_name']))
env.seed(config['seed'])
print("Done.")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Logging
logger = rltorch.log.Logger()
logwriter = rltorch.log.LogWriter(SummaryWriter())
# Setting up the networks
device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
policy_net = rn.Network(Policy(state_size, action_size),
torch.optim.Adam, config, device = device, name = "Policy")
value_net = rn.Network(Value(state_size),
torch.optim.Adam, config, device = device, name = "DQN")
# Memory stores experiences for later training
memory = M.EpisodeMemory()
# Actor takes a net and uses it to produce actions from given states
actor = StochasticSelector(policy_net, action_size, memory, device = device)
# Agent is what performs the training
# agent = rltorch.agents.REINFORCEAgent(net, memory, config, target_net = target_net, logger = logger)
agent = rltorch.agents.PPOAgent(policy_net, value_net, memory, config, logger = logger)
# Runner performs a certain number of steps in the environment
runner = rltorch.env.EnvironmentRunSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
print("Training...")
train(runner, agent, config, logger = logger, logwriter = logwriter)
# For profiling...
# import cProfile
# cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
print("Training Finished.")
print("Evaluating...")
rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
print("Evaulations Done.")
logwriter.close() # We don't need to write anything out to disk anymore
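
Note: this script differs from the A2C one only in the agent class. rltorch.agents.PPOAgent is not part of this commit, so as orientation, here is a minimal sketch of the standard PPO clipped surrogate policy loss (textbook formulation, not necessarily what PPOAgent implements):

import torch

def ppo_clipped_policy_loss(new_log_probs, old_log_probs, advantages, clip_eps=0.2):
    # Probability ratio pi_new(a|s) / pi_old(a|s)
    ratio = torch.exp(new_log_probs - old_log_probs.detach())
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    # Pessimistic (minimum) objective, negated for gradient descent
    return -torch.min(unclipped, clipped).mean()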

@@ -7,9 +7,10 @@ import rltorch
import rltorch.network as rn
import rltorch.memory as M
import rltorch.env as E
from rltorch.action_selector import ArgMaxSelector
from rltorch.action_selector import StochasticSelector
from tensorboardX import SummaryWriter
import torch.multiprocessing as mp
from copy import deepcopy
class Value(nn.Module):
def __init__(self, state_size, action_size):
@@ -17,16 +18,16 @@ class Value(nn.Module):
self.state_size = state_size
self.action_size = action_size
self.fc1 = rn.NoisyLinear(state_size, 64)
self.fc_norm = nn.LayerNorm(64)
self.fc1 = rn.NoisyLinear(state_size, 255)
self.fc_norm = nn.LayerNorm(255)
self.value_fc = rn.NoisyLinear(64, 64)
self.value_fc_norm = nn.LayerNorm(64)
self.value = rn.NoisyLinear(64, 1)
self.value_fc = rn.NoisyLinear(255, 255)
self.value_fc_norm = nn.LayerNorm(255)
self.value = rn.NoisyLinear(255, 1)
self.advantage_fc = rn.NoisyLinear(64, 64)
self.advantage_fc_norm = nn.LayerNorm(64)
self.advantage = rn.NoisyLinear(64, action_size)
self.advantage_fc = rn.NoisyLinear(255, 255)
self.advantage_fc_norm = nn.LayerNorm(255)
self.advantage = rn.NoisyLinear(255, action_size)
def forward(self, x):
x = F.relu(self.fc_norm(self.fc1(x)))
@@ -42,12 +43,32 @@ class Value(nn.Module):
return x
class Policy(nn.Module):
def __init__(self, state_size, action_size):
super(Policy, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.fc1 = nn.Linear(state_size, 125)
self.fc_norm = nn.LayerNorm(125)
self.fc2 = nn.Linear(125, 125)
self.fc2_norm = nn.LayerNorm(125)
self.action_prob = nn.Linear(125, action_size)
def forward(self, x):
x = F.relu(self.fc_norm(self.fc1(x)))
x = F.relu(self.fc2_norm(self.fc2(x)))
x = F.softmax(self.action_prob(x), dim = 1)
return x
config = {}
config['seed'] = 901
config['environment_name'] = 'Acrobot-v1'
config['memory_size'] = 2000
config['total_training_episodes'] = 50
config['total_evaluation_episodes'] = 10
config['total_evaluation_episodes'] = 5
config['batch_size'] = 32
config['learning_rate'] = 1e-3
config['target_sync_tau'] = 1e-1
@@ -65,28 +86,24 @@ config['prioritized_replay_sampling_priority'] = 0.6
# 1 - Lower the importance of high losses
# Should ideally start near 0 and anneal toward 1 to prevent overfitting
config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 5000)
def train(runner, agent, config, logger = None, logwriter = None):
finished = False
last_episode_num = 1
while not finished:
runner.run()
runner.run(config['replay_skip'] + 1)
agent.learn()
runner.join()
# When the episode number changes, log network parameters
with runner.episode_num.get_lock():
if logwriter is not None and last_episode_num < runner.episode_num.value:
last_episode_num = runner.episode_num.value
agent.net.log_named_parameters()
if logwriter is not None:
logwriter.write(logger)
finished = runner.episode_num.value > config['total_training_episodes']
if logwriter is not None:
if last_episode_num < runner.episode_num:
last_episode_num = runner.episode_num
agent.value_net.log_named_parameters()
agent.policy_net.log_named_parameters()
logwriter.write(logger)
finished = runner.episode_num > config['total_training_episodes']
if __name__ == "__main__":
torch.multiprocessing.set_sharing_strategy('file_system') # To avoid hitting the file descriptor limit
# Setting up the environment
rltorch.set_seed(config['seed'])
print("Setting up environment...", end = " ")
@ -104,24 +121,29 @@ if __name__ == "__main__":
# Setting up the networks
device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
net = rn.Network(Value(state_size, action_size),
torch.optim.Adam, config, device = device, name = "DQN")
target_net = rn.TargetNetwork(net, device = device)
net.model.share_memory()
config2 = deepcopy(config)
config2['learning_rate'] = 0.01
policy_net = rn.ESNetwork(Policy(state_size, action_size),
torch.optim.Adam, 500, None, config2, sigma = 0.1, device = device, name = "ES", logger = logger)
value_net = rn.Network(Value(state_size, action_size),
torch.optim.Adam, config, device = device, name = "DQN", logger = logger)
target_net = rn.TargetNetwork(value_net, device = device)
value_net.model.share_memory()
target_net.model.share_memory()
# Actor takes a net and uses it to produce actions from given states
actor = ArgMaxSelector(net, action_size, device = device)
actor = StochasticSelector(policy_net, action_size, device = device)
# Memory stores experiences for later training
memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
# memory = M.ReplayMemory(capacity = config['memory_size'])
# Runner performs a certain number of steps in the environment
runner = rltorch.mp.EnvironmentRun(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
runner = rltorch.env.EnvironmentRunSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
# Agent is what performs the training
agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
# agent = TestAgent(policy_net, value_net, memory, config, target_value_net = target_net, logger = logger)
agent = rltorch.agents.QEPAgent(policy_net, value_net, memory, config, target_value_net = target_net, logger = logger)
print("Training...")
train(runner, agent, config, logger = logger, logwriter = logwriter)
@ -132,7 +154,6 @@ if __name__ == "__main__":
# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
print("Training Finished.")
runner.terminate() # We don't need the extra process anymore
print("Evaluating...")
rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
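
Note: the Value network in this example uses a dueling layout (separate value and advantage streams), but its forward pass is cut off by the hunk boundary above. The standard combination, which also appears commented out in the policy classes earlier in this commit, looks roughly like the following illustrative sketch (plain nn.Linear layers and placeholder sizes):

import torch
import torch.nn as nn
import torch.nn.functional as F

class DuelingValue(nn.Module):
    # Illustrative dueling Q-network; layer sizes are placeholders
    def __init__(self, state_size, action_size, hidden=255):
        super(DuelingValue, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc_norm = nn.LayerNorm(hidden)
        self.value_fc = nn.Linear(hidden, hidden)
        self.value_fc_norm = nn.LayerNorm(hidden)
        self.value = nn.Linear(hidden, 1)
        self.advantage_fc = nn.Linear(hidden, hidden)
        self.advantage_fc_norm = nn.LayerNorm(hidden)
        self.advantage = nn.Linear(hidden, action_size)
    def forward(self, x):
        x = F.relu(self.fc_norm(self.fc1(x)))
        state_value = self.value(F.relu(self.value_fc_norm(self.value_fc(x))))
        advantage = self.advantage(F.relu(self.advantage_fc_norm(self.advantage_fc(x))))
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        return state_value + advantage - advantage.mean(dim=1, keepdim=True)

q_values = DuelingValue(6, 3)(torch.zeros(1, 6))  # shape [1, 3]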

@@ -48,7 +48,7 @@ config = {}
config['seed'] = 901
config['environment_name'] = 'Acrobot-v1'
config['memory_size'] = 2000
config['total_training_episodes'] = 100
config['total_training_episodes'] = 500
config['total_evaluation_episodes'] = 10
config['batch_size'] = 32
config['learning_rate'] = 1e-3

@@ -46,8 +46,8 @@ config = {}
config['seed'] = 901
config['environment_name'] = 'Acrobot-v1'
config['memory_size'] = 2000
config['total_training_episodes'] = 5
config['total_evaluation_episodes'] = 2
config['total_training_episodes'] = 50
config['total_evaluation_episodes'] = 5
config['batch_size'] = 32
config['learning_rate'] = 1e-3
config['target_sync_tau'] = 1e-1

@@ -0,0 +1,14 @@
from .ArgMaxSelector import ArgMaxSelector
import torch
class IdentitySelector(ArgMaxSelector):
def __init__(self, model, action_size, device = None):
super(IdentitySelector, self).__init__(model, action_size, device = device)
# random_act is already implemented in ArgMaxSelector
def best_act(self, state):
with torch.no_grad():
if self.device is not None:
state = state.to(self.device)
action = self.model(state).squeeze(0).item()
return action
def act(self, state):
return self.best_act(state)
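
Note: IdentitySelector is meant for models whose forward pass already returns the chosen action index rather than Q-values or probabilities. A hypothetical usage sketch (the toy model is illustrative; the repository's examples normally wrap models in rn.Network before handing them to a selector):

import torch
import torch.nn as nn
from rltorch.action_selector import IdentitySelector

class ArgmaxPolicy(nn.Module):
    # Toy model whose output is already an action index
    def __init__(self, state_size, action_size):
        super(ArgmaxPolicy, self).__init__()
        self.fc = nn.Linear(state_size, action_size)
    def forward(self, x):
        return self.fc(x).argmax(dim=1)

actor = IdentitySelector(ArgmaxPolicy(4, 2), 2)
action = actor.act(torch.zeros(1, 4))  # plain Python int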

@@ -5,7 +5,7 @@ import rltorch
from rltorch.action_selector import ArgMaxSelector
class StochasticSelector(ArgMaxSelector):
def __init__(self, model, action_size, memory, device = None):
def __init__(self, model, action_size, memory = None, device = None):
super(StochasticSelector, self).__init__(model, action_size, device = device)
self.model = model
self.action_size = action_size

@@ -1,4 +1,5 @@
from .ArgMaxSelector import *
from .EpsilonGreedySelector import *
from .IdentitySelector import *
from .RandomSelector import *
from .StochasticSelector import *

@@ -1,5 +1,3 @@
# Deprecated since the idea shouldn't work without having some sort of "mental model" of the environment
from copy import deepcopy
import numpy as np
import torch

rltorch/agents/QEPAgent.py (new file, 110 lines)

@@ -0,0 +1,110 @@
from copy import deepcopy
import collections
import torch
import torch.nn.functional as F  # needed for F.mse_loss below
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
# Q-Evolutionary Policy Agent
# Maximizes the policy with respect to the Q-Value function.
# Since that function is non-differentiable, it relies on the Evolutionary Strategies algorithm
class QEPAgent:
def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None):
self.policy_net = policy_net
assert isinstance(self.policy_net, rltorch.network.ESNetwork)
self.policy_net.fitness = self.fitness
self.value_net = value_net
self.target_value_net = target_value_net
self.memory = memory
self.config = deepcopy(config)
self.logger = logger
self.policy_skip = 10
def fitness(self, policy_net, value_net, state_batch):
action_probabilities = policy_net(state_batch)
distributions = list(map(Categorical, action_probabilities))
actions = torch.tensor([d.sample() for d in distributions])
with torch.no_grad():
state_values = value_net(state_batch)
obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1)
return -obtained_values.mean().item()
def learn(self, logger = None):
if len(self.memory) < self.config['batch_size']:
return
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler, get the next value by calling next(); otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
minibatch = self.memory.sample(self.config['batch_size'], beta = beta)
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
else:
minibatch = self.memory.sample(self.config['batch_size'])
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)
# Send to their appropriate devices
state_batch = state_batch.to(self.value_net.device)
action_batch = action_batch.to(self.value_net.device)
reward_batch = reward_batch.to(self.value_net.device)
next_state_batch = next_state_batch.to(self.value_net.device)
not_done_batch = not_done_batch.to(self.value_net.device)
state_values = self.value_net(state_batch)
obtained_values = state_values.gather(1, action_batch.view(self.config['batch_size'], 1))
with torch.no_grad():
# Use the target net to produce action values for the next state
# and the regular net to select the action
# That way we decouple the value and action selecting processes (DOUBLE DQN)
not_done_size = not_done_batch.sum()
next_state_values = torch.zeros_like(state_values, device = self.value_net.device)
if self.target_value_net is not None:
next_state_values[not_done_batch] = self.target_value_net(next_state_batch[not_done_batch])
next_best_action = self.value_net(next_state_batch[not_done_batch]).argmax(1)
else:
next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
next_best_action = next_state_values[not_done_batch].argmax(1)
best_next_state_value = torch.zeros(self.config['batch_size'], device = self.value_net.device)
best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
value_loss = (torch.as_tensor(importance_weights, device = self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
else:
value_loss = F.mse_loss(obtained_values, expected_values)
if self.logger is not None:
self.logger.append("Loss/Value", value_loss.item())
self.value_net.zero_grad()
value_loss.backward()
self.value_net.clamp_gradients()
self.value_net.step()
if self.target_value_net is not None:
if 'target_sync_tau' in self.config:
self.target_value_net.partial_sync(self.config['target_sync_tau'])
else:
self.target_value_net.sync()
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
td_error = (obtained_values - expected_values).detach().abs()
self.memory.update_priorities(batch_indexes, td_error)
## Policy Training
if self.policy_skip > 0:
self.policy_skip -= 1
return
self.policy_skip = 10
if self.target_value_net is not None:
self.policy_net.calc_gradients(self.target_value_net, state_batch)
else:
self.policy_net.calc_gradients(self.value_net, state_batch)
self.policy_net.clamp_gradients()
self.policy_net.step()
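
Note: the value update in learn() builds the standard Double DQN target: the online value_net picks the next action, target_value_net scores it, and terminal transitions contribute no bootstrap term. A compact standalone illustration of that target construction with made-up numbers:

import torch

gamma = 0.99
reward = torch.tensor([1.0, 0.0])
not_done = torch.tensor([True, False])

q_online_next = torch.tensor([[0.2, 0.7], [0.5, 0.1]])   # Q(s', .) from value_net
q_target_next = torch.tensor([[0.3, 0.6], [0.4, 0.2]])   # Q(s', .) from target_value_net

next_action = q_online_next.argmax(1)                     # action chosen by the online net
best_next_value = q_target_next.gather(1, next_action.unsqueeze(1)).squeeze(1)
best_next_value = best_next_value * not_done.float()      # zero out terminal states
target = reward + gamma * best_next_value                 # tensor([1.5940, 0.0000])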

@@ -1,4 +1,5 @@
from .A2CSingleAgent import *
from .DQNAgent import *
from .PPOAgent import *
from .QEPAgent import *
from .REINFORCEAgent import *

@@ -0,0 +1,66 @@
import numpy as np
import torch
from .Network import Network
from copy import deepcopy
class ESNetwork(Network):
"""
Network whose parameters are updated with Evolutionary Strategies, following the paper https://arxiv.org/abs/1703.03864
fitness_fn := model, *args -> fitness_value (float)
We wish to find a model that maximizes the fitness function
"""
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""):
super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name)
self.population_size = population_size
self.fitness = fitness_fn
self.sigma = sigma
# We're not going to be calculating gradients in the traditional way
# So there's no need to waste computation time keeping track
def __call__(self, *args):
with torch.no_grad():
result = self.model(*args)
return result
def _generate_noise_dicts(self):
model_dict = self.model.state_dict()
white_noise_dict = {}
noise_dict = {}
for key in model_dict.keys():
white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape)
noise_dict[key] = self.sigma * white_noise_dict[key]
return white_noise_dict, noise_dict
def _generate_candidate_solutions(self, noise_dict):
model_dict = self.model.state_dict()
candidate_solutions = []
for i in range(self.population_size):
candidate_statedict = {}
for key in model_dict.keys():
candidate_statedict[key] = model_dict[key] + noise_dict[key][i]
candidate = deepcopy(self.model)
candidate.load_state_dict(candidate_statedict)
candidate_solutions.append(candidate)
return candidate_solutions
def calc_gradients(self, *args):
## Generate Noise
white_noise_dict, noise_dict = self._generate_noise_dicts()
## Generate candidate solutions
candidate_solutions = self._generate_candidate_solutions(noise_dict)
## Calculate fitness then mean shift, scale
fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions])
if self.logger is not None:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
## Insert adjustments into gradients slot
self.zero_grad()
for name, param in self.model.named_parameters():
if param.requires_grad:
noise_dim_n = len(white_noise_dict[name].shape)
dim = np.repeat(1, noise_dim_n - 1).tolist() if noise_dim_n > 0 else []
param.grad = (white_noise_dict[name] * fitness_values.float().reshape(self.population_size, *dim)).mean(0) / self.sigma
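
Note: the gradient slot filled by calc_gradients is the usual evolution-strategies search-gradient estimate, roughly grad F(theta) ≈ (1 / (P * sigma)) * sum_i F(theta + sigma * eps_i) * eps_i. Below is a quick standalone sanity check against an analytic gradient; ESNetwork additionally standardizes the fitness values, while this check subtracts a simple baseline so the result can be compared to the analytic gradient directly.

import torch

torch.manual_seed(0)
theta = torch.tensor([1.0, -2.0, 0.5])
sigma, population = 0.05, 20000

def objective(x):
    return (x ** 2).sum()          # analytic gradient is 2 * x

eps = torch.randn(population, theta.numel())
fitness = torch.stack([objective(theta + sigma * e) for e in eps]) - objective(theta)
estimate = (fitness.unsqueeze(1) * eps).mean(0) / sigma
print(estimate)                    # approximately tensor([2., -4., 1.])
print(2 * theta)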

@@ -1,3 +1,4 @@
from .ESNetwork import *
from .Network import *
from .NoisyLinear import *
from .TargetNetwork import *