Added Evolutionary Strategies Network and more example scripts

Brandon Rozek 2019-02-27 09:52:28 -05:00
parent 26084d4c7c
commit 76a044ace9
14 changed files with 695 additions and 41 deletions

View file

@ -0,0 +1,14 @@
from .ArgMaxSelector import ArgMaxSelector
import torch
class IdentitySelector(ArgMaxSelector):
    def __init__(self, model, action_size, device = None):
        super(IdentitySelector, self).__init__(model, action_size, device = device)
    # random_act is already implemented in ArgMaxSelector
    def best_act(self, state):
        with torch.no_grad():
            if self.device is not None:
                state = state.to(self.device)
            action = self.model(state).squeeze(0).item()
        return action
    def act(self, state):
        return self.best_act(state)
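
A possible usage sketch for the new IdentitySelector (not part of the commit; the one-output model and the observation shape below are made up for illustration, and it assumes this commit's rltorch is importable):

import torch
import torch.nn as nn
from rltorch.action_selector import IdentitySelector

model = nn.Linear(4, 1)            # hypothetical network whose output is the action itself
selector = IdentitySelector(model, action_size = 1)
state = torch.randn(1, 4)          # a single observation with a batch dimension
action = selector.act(state)       # forwards the model output unchanged as a Python float
print(action)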

View file

@ -5,7 +5,7 @@ import rltorch
from rltorch.action_selector import ArgMaxSelector
class StochasticSelector(ArgMaxSelector):
-    def __init__(self, model, action_size, memory, device = None):
+    def __init__(self, model, action_size, memory = None, device = None):
        super(StochasticSelector, self).__init__(model, action_size, device = device)
        self.model = model
        self.action_size = action_size

View file

@ -1,4 +1,5 @@
from .ArgMaxSelector import *
from .EpsilonGreedySelector import *
from .IdentitySelector import *
from .RandomSelector import *
from .StochasticSelector import *

View file

@ -1,5 +1,3 @@
# Deprecated since the idea shouldn't work without having some sort of "mental model" of the environment
from copy import deepcopy
import numpy as np
import torch

rltorch/agents/QEPAgent.py Normal file
View file

@ -0,0 +1,110 @@
from copy import deepcopy
import collections
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M

# Q-Evolutionary Policy Agent
# Maximizes the policy with respect to the Q-Value function.
# Since the function is non-differentiable, it depends on the Evolutionary Strategies algorithm
class QEPAgent:
    def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None):
        self.policy_net = policy_net
        assert isinstance(self.policy_net, rltorch.network.ESNetwork)
        self.policy_net.fitness = self.fitness
        self.value_net = value_net
        self.target_value_net = target_value_net
        self.memory = memory
        self.config = deepcopy(config)
        self.logger = logger
        self.policy_skip = 10

    def fitness(self, policy_net, value_net, state_batch):
        action_probabilities = policy_net(state_batch)
        distributions = list(map(Categorical, action_probabilities))
        actions = torch.tensor([d.sample() for d in distributions])
        with torch.no_grad():
            state_values = value_net(state_batch)
        obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1)
        return -obtained_values.mean().item()

    def learn(self, logger = None):
        if len(self.memory) < self.config['batch_size']:
            return

        if isinstance(self.memory, M.PrioritizedReplayMemory):
            weight_importance = self.config['prioritized_replay_weight_importance']
            # If it's a scheduler then get the next value by calling next, otherwise just use its value
            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
            minibatch = self.memory.sample(self.config['batch_size'], beta = beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
        else:
            minibatch = self.memory.sample(self.config['batch_size'])
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)

        # Send to their appropriate devices
        state_batch = state_batch.to(self.value_net.device)
        action_batch = action_batch.to(self.value_net.device)
        reward_batch = reward_batch.to(self.value_net.device)
        next_state_batch = next_state_batch.to(self.value_net.device)
        not_done_batch = not_done_batch.to(self.value_net.device)

        state_values = self.value_net(state_batch)
        obtained_values = state_values.gather(1, action_batch.view(self.config['batch_size'], 1))

        with torch.no_grad():
            # Use the target net to produce action values for the next state
            # and the regular net to select the action.
            # That way we decouple the value and action selecting processes (DOUBLE DQN)
            not_done_size = not_done_batch.sum()
            next_state_values = torch.zeros_like(state_values, device = self.value_net.device)
            if self.target_value_net is not None:
                next_state_values[not_done_batch] = self.target_value_net(next_state_batch[not_done_batch])
                next_best_action = self.value_net(next_state_batch[not_done_batch]).argmax(1)
            else:
                next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
                next_best_action = next_state_values[not_done_batch].argmax(1)

            best_next_state_value = torch.zeros(self.config['batch_size'], device = self.value_net.device)
            best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)

        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)

        if isinstance(self.memory, M.PrioritizedReplayMemory):
            value_loss = (torch.as_tensor(importance_weights, device = self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
        else:
            value_loss = F.mse_loss(obtained_values, expected_values)

        if self.logger is not None:
            self.logger.append("Loss/Value", value_loss.item())

        self.value_net.zero_grad()
        value_loss.backward()
        self.value_net.clamp_gradients()
        self.value_net.step()

        if self.target_value_net is not None:
            if 'target_sync_tau' in self.config:
                self.target_value_net.partial_sync(self.config['target_sync_tau'])
            else:
                self.target_value_net.sync()

        if isinstance(self.memory, M.PrioritizedReplayMemory):
            td_error = (obtained_values - expected_values).detach().abs()
            self.memory.update_priorities(batch_indexes, td_error)

        ## Policy Training
        if self.policy_skip > 0:
            self.policy_skip -= 1
            return
        self.policy_skip = 10
        if self.target_value_net is not None:
            self.policy_net.calc_gradients(self.target_value_net, state_batch)
        else:
            self.policy_net.calc_gradients(self.value_net, state_batch)
        self.policy_net.clamp_gradients()
        self.policy_net.step()
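
To make the fitness hook concrete, here is a self-contained sketch (plain PyTorch; the toy policy, Q-network, and random state batch are made up and not part of the commit) of what QEPAgent.fitness computes: sample one action per state from the policy's output probabilities, look up their Q-values with the value network, and return the negated mean, so that a lower fitness corresponds to a higher expected Q-value.

import torch
import torch.nn as nn
from torch.distributions import Categorical

state_size, action_size, batch_size = 4, 2, 8
policy_net = nn.Sequential(nn.Linear(state_size, action_size), nn.Softmax(dim = 1))  # toy policy
value_net = nn.Linear(state_size, action_size)                                       # toy Q-network

def fitness(policy_net, value_net, state_batch):
    # Sample one action per state from the policy's categorical distribution
    action_probabilities = policy_net(state_batch)
    actions = Categorical(action_probabilities).sample()
    with torch.no_grad():
        state_values = value_net(state_batch)
        # Q-values of the sampled actions, negated so lower fitness means better actions
        obtained_values = state_values.gather(1, actions.view(-1, 1)).squeeze(1)
    return -obtained_values.mean().item()

state_batch = torch.randn(batch_size, state_size)
print(fitness(policy_net, value_net, state_batch))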

View file

@ -1,4 +1,5 @@
from .A2CSingleAgent import *
from .DQNAgent import *
from .PPOAgent import *
from .QEPAgent import *
from .REINFORCEAgent import *

View file

@ -0,0 +1,66 @@
import numpy as np
import torch
from .Network import Network
from copy import deepcopy

class ESNetwork(Network):
    """
    Network that implements the Evolutionary Strategies algorithm from the paper (https://arxiv.org/abs/1703.03864)
    fitness_fn := model, *args -> fitness_value (float)
    We wish to find a model that maximizes the fitness function
    """
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""):
        super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name)
        self.population_size = population_size
        self.fitness = fitness_fn
        self.sigma = sigma

    # We're not going to be calculating gradients in the traditional way,
    # so there's no need to waste computation time keeping track of them
    def __call__(self, *args):
        with torch.no_grad():
            result = self.model(*args)
        return result

    def _generate_noise_dicts(self):
        model_dict = self.model.state_dict()
        white_noise_dict = {}
        noise_dict = {}
        for key in model_dict.keys():
            white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape)
            noise_dict[key] = self.sigma * white_noise_dict[key]
        return white_noise_dict, noise_dict

    def _generate_candidate_solutions(self, noise_dict):
        model_dict = self.model.state_dict()
        candidate_solutions = []
        for i in range(self.population_size):
            candidate_statedict = {}
            for key in model_dict.keys():
                candidate_statedict[key] = model_dict[key] + noise_dict[key][i]
            candidate = deepcopy(self.model)
            candidate.load_state_dict(candidate_statedict)
            candidate_solutions.append(candidate)
        return candidate_solutions

    def calc_gradients(self, *args):
        ## Generate noise
        white_noise_dict, noise_dict = self._generate_noise_dicts()

        ## Generate candidate solutions
        candidate_solutions = self._generate_candidate_solutions(noise_dict)

        ## Calculate fitness, then mean-shift and scale
        fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions])
        if self.logger is not None:
            self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
        fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)

        ## Insert adjustments into gradients slot
        self.zero_grad()
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                noise_dim_n = len(white_noise_dict[name].shape)
                dim = np.repeat(1, noise_dim_n - 1).tolist() if noise_dim_n > 0 else []
                param.grad = (white_noise_dict[name] * fitness_values.float().reshape(self.population_size, *dim)).mean(0) / self.sigma
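
For reference, the estimator inside calc_gradients can be reproduced in standalone PyTorch. The sketch below is not from the commit: the toy linear model and quadratic fitness are made up, and it handles only a single weight tensor. It perturbs the weights with Gaussian noise, scores each candidate, normalizes the scores, and writes the noise-weighted average into .grad, which a standard optimizer then uses for its update (here descending the toy fitness).

import torch
import torch.nn as nn

torch.manual_seed(0)
model = nn.Linear(3, 1, bias = False)    # toy model with a single weight tensor
population_size, sigma = 50, 0.05

def fitness(weight):
    # Made-up objective: squared distance of the weights from an arbitrary target
    target = torch.tensor([[1.0, -2.0, 0.5]])
    return ((weight - target) ** 2).sum().item()

param = model.weight
white_noise = torch.randn(population_size, *param.shape)            # one perturbation per candidate
candidates = param.detach() + sigma * white_noise                   # population of perturbed weights
fitness_values = torch.tensor([fitness(c) for c in candidates])
fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + 1e-8)
# ES estimate of the fitness gradient: noise weighted by normalized fitness, averaged over the population
param.grad = (white_noise * fitness_values.reshape(population_size, 1, 1)).mean(0) / sigma

optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)
optimizer.step()                          # SGD steps against .grad, lowering the toy fitness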

View file

@ -1,3 +1,4 @@
from .ESNetwork import *
from .Network import *
from .NoisyLinear import *
from .TargetNetwork import *