PEP8 Conformance

Brandon Rozek 2020-04-14 14:16:14 -04:00
parent 9b81188a77
commit 8fa4691511
29 changed files with 652 additions and 755 deletions

View file

@ -1,7 +1,7 @@
from random import randrange from random import randrange
import torch import torch
class ArgMaxSelector: class ArgMaxSelector:
def __init__(self, model, action_size, device = None): def __init__(self, model, action_size, device=None):
self.model = model self.model = model
self.action_size = action_size self.action_size = action_size
self.device = device self.device = device
@ -12,7 +12,8 @@ class ArgMaxSelector:
if self.device is not None: if self.device is not None:
state = state.to(self.device) state = state.to(self.device)
action_values = self.model(state).squeeze(0) action_values = self.model(state).squeeze(0)
action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item() action = self.random_act() if (action_values[0] == action_values).all() \
else action_values.argmax().item()
return action return action
def act(self, state): def act(self, state):
return self.best_act(state) return self.best_act(state)

View file

@ -1,9 +1,10 @@
from .ArgMaxSelector import ArgMaxSelector
import numpy as np
import collections import collections
import numpy as np
from .ArgMaxSelector import ArgMaxSelector
class EpsilonGreedySelector(ArgMaxSelector): class EpsilonGreedySelector(ArgMaxSelector):
def __init__(self, model, action_size, device = None, epsilon = 0.1): def __init__(self, model, action_size, device=None, epsilon=0.1):
super(EpsilonGreedySelector, self).__init__(model, action_size, device = device) super(EpsilonGreedySelector, self).__init__(model, action_size, device=device)
self.epsilon = epsilon self.epsilon = epsilon
# random_act is already implemented in ArgMaxSelector # random_act is already implemented in ArgMaxSelector
# best_act is already implemented in ArgMaxSelector # best_act is already implemented in ArgMaxSelector

View file

@ -1,8 +1,9 @@
from .ArgMaxSelector import ArgMaxSelector
import torch import torch
from .ArgMaxSelector import ArgMaxSelector
class IdentitySelector(ArgMaxSelector): class IdentitySelector(ArgMaxSelector):
def __init__(self, model, action_size, device = None): def __init__(self, model, action_size, device=None):
super(IdentitySelector, self).__init__(model, action_size, device = device) super(IdentitySelector, self).__init__(model, action_size, device=device)
# random_act is already implemented in ArgMaxSelector # random_act is already implemented in ArgMaxSelector
def best_act(self, state): def best_act(self, state):
with torch.no_grad(): with torch.no_grad():

View file

@ -1,10 +1,10 @@
from random import randrange from random import randrange
class RandomSelector(): class RandomSelector:
def __init__(self, action_size): def __init__(self, action_size):
self.action_size = action_size self.action_size = action_size
def random_act(self): def random_act(self):
return randrange(action_size) return randrange(self.action_size)
def best_act(self, state): def best_act(self, _):
return self.random_act() return self.random_act()
def act(self, state): def act(self, _):
return self.random_act() return self.random_act()

View file

@ -1,22 +1,19 @@
from random import randrange
import torch
from torch.distributions import Categorical from torch.distributions import Categorical
import rltorch from .ArgMaxSelector import ArgMaxSelector
from rltorch.action_selector import ArgMaxSelector from ..memory.EpisodeMemory import EpisodeMemory
class StochasticSelector(ArgMaxSelector): class StochasticSelector(ArgMaxSelector):
def __init__(self, model, action_size, memory = None, device = None): def __init__(self, model, action_size, memory=None, device=None):
super(StochasticSelector, self).__init__(model, action_size, device = device) super(StochasticSelector, self).__init__(model, action_size, device=device)
self.model = model self.model = model
self.action_size = action_size self.action_size = action_size
self.device = device self.device = device
self.memory = memory self.memory = memory
def best_act(self, state, log_prob = True): def best_act(self, state, log_prob=True):
if self.device is not None: if self.device is not None:
state = state.to(self.device) state = state.to(self.device)
action_probabilities = self.model(state) action_probabilities = self.model(state)
distribution = Categorical(action_probabilities) distribution = Categorical(action_probabilities)
action = distribution.sample() action = distribution.sample()
if log_prob and isinstance(self.memory, rltorch.memory.EpisodeMemory): if log_prob and isinstance(self.memory, EpisodeMemory):
self.memory.append_log_probs(distribution.log_prob(action)) self.memory.append_log_probs(distribution.log_prob(action))
return action.item() return action.item()
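
For readers unfamiliar with the sampling above, here is a minimal, self-contained sketch of how Categorical turns policy outputs into an action and a log-probability. The probability values are invented for illustration; in the selector they come from the policy model.

    import torch
    from torch.distributions import Categorical

    # Pretend output of a policy network for a single state (batch of 1, 3 actions).
    action_probabilities = torch.tensor([[0.1, 0.6, 0.3]])
    distribution = Categorical(action_probabilities)
    action = distribution.sample()                # e.g. tensor([1])
    log_prob = distribution.log_prob(action)      # what gets appended to EpisodeMemory
    print(action.item(), log_prob.item())
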

View file

@ -1,5 +1,5 @@
from .ArgMaxSelector import * from .ArgMaxSelector import ArgMaxSelector
from .EpsilonGreedySelector import * from .EpsilonGreedySelector import EpsilonGreedySelector
from .IdentitySelector import * from .IdentitySelector import IdentitySelector
from .RandomSelector import * from .RandomSelector import RandomSelector
from .StochasticSelector import * from .StochasticSelector import StochasticSelector
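
Replacing the star imports with explicit names keeps the package's public surface the same while letting linters flag unused or undefined symbols. A hypothetical downstream usage, unchanged by this commit (the constructor arguments mirror the signatures shown above):

    from rltorch.action_selector import ArgMaxSelector, EpsilonGreedySelector

    # selector = EpsilonGreedySelector(model, action_size, device=None, epsilon=0.1)
    # action = selector.act(state)
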

View file

@ -2,11 +2,9 @@ from copy import deepcopy
import numpy as np import numpy as np
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
import rltorch
import rltorch.memory as M
class A2CSingleAgent: class A2CSingleAgent:
def __init__(self, policy_net, value_net, memory, config, logger = None): def __init__(self, policy_net, value_net, memory, config, logger=None):
self.policy_net = policy_net self.policy_net = policy_net
self.value_net = value_net self.value_net = value_net
self.memory = memory self.memory = memory
@ -16,7 +14,11 @@ class A2CSingleAgent:
def _discount_rewards(self, rewards): def _discount_rewards(self, rewards):
gammas = torch.ones_like(rewards) gammas = torch.ones_like(rewards)
if len(rewards) > 1: if len(rewards) > 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0) discount_tensor = torch.tensor(self.config['discount_rate'])
gammas[1:] = torch.cumprod(
discount_tensor.repeat(len(rewards) - 1),
dim=0
)
return gammas * rewards return gammas * rewards
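
As a standalone sketch of what _discount_rewards computes (the 0.99 discount rate and the reward values are assumptions, not values from this repository's config):

    import torch

    rewards = torch.tensor([1.0, 0.0, 2.0])
    gammas = torch.ones_like(rewards)
    gammas[1:] = torch.cumprod(torch.tensor(0.99).repeat(len(rewards) - 1), dim=0)
    # gammas is [1.0, 0.99, 0.9801]; summing gammas * rewards gives the discounted return
    print(gammas * rewards)    # tensor([1.0000, 0.0000, 1.9602])
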
# This function is currently not used since the performance gains haven't been shown # This function is currently not used since the performance gains haven't been shown
@ -29,18 +31,18 @@ class A2CSingleAgent:
values = self.value_net(states).squeeze(1) values = self.value_net(states).squeeze(1)
generalized_advantages = torch.zeros_like(rewards) generalized_advantages = torch.zeros_like(rewards)
for i in range(len(generalized_advantages)): discount_tensor = torch.tensor(self.config['discount_rate']) * tradeoff
for i, _ in enumerate(generalized_advantages):
weights = torch.ones_like(rewards[i:]) weights = torch.ones_like(rewards[i:])
if i != len(generalized_advantages) - 1: if i != len(generalized_advantages) - 1:
weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0) weights[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - i - 1), dim=0)
generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum() generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum()
return generalized_advantages return generalized_advantages
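
The loop above accumulates the generalized advantage estimate A_i = sum_j (gamma * tradeoff)^j * (r_{i+j} + gamma * V(s_{i+j+1}) - V(s_{i+j})). A compact backward-recursion sketch of the same quantity; gamma, tradeoff, and the value arrays here are assumed, not taken from the repository:

    import torch

    gamma, tradeoff = 0.99, 0.95                  # assumed discount rate and GAE trade-off
    rewards     = torch.tensor([1.0, 0.0, 2.0])
    values      = torch.tensor([0.5, 0.4, 0.3])   # V(s_t)
    next_values = torch.tensor([0.4, 0.3, 0.0])   # V(s_{t+1})

    deltas = rewards + gamma * next_values - values
    advantages = torch.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * tradeoff * running
        advantages[t] = running
    print(advantages)
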
def learn(self): def learn(self):
episode_batch = self.memory.recall() episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch) state_batch, _, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
# Send batches to the appropriate device # Send batches to the appropriate device
state_batch = torch.cat(state_batch).to(self.value_net.device) state_batch = torch.cat(state_batch).to(self.value_net.device)
@ -50,8 +52,10 @@ class A2CSingleAgent:
log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device) log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)
## Value Loss ## Value Loss
# In A2C, the value loss is the difference between the discounted reward and the value from the first state # In A2C, the value loss is the difference between the discounted reward
# The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode # and the value from the first state.
# The value of the first state is supposed to tell us
# the expected reward from the current policy of the whole episode
discounted_reward = self._discount_rewards(reward_batch) discounted_reward = self._discount_rewards(reward_batch)
observed_value = discounted_reward.sum() observed_value = discounted_reward.sum()
value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0])) value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0]))
@ -86,5 +90,3 @@ class A2CSingleAgent:
# Memory under the old policy is not needed for future training # Memory under the old policy is not needed for future training
self.memory.clear() self.memory.clear()

View file

@ -1,13 +1,11 @@
import collections import collections
from copy import deepcopy
import rltorch.memory as M import rltorch.memory as M
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path
class DQNAgent: class DQNAgent:
def __init__(self, net , memory, config, target_net = None, logger = None): def __init__(self, net, memory, config, target_net=None, logger=None):
self.net = net self.net = net
self.target_net = target_net self.target_net = target_net
self.memory = memory self.memory = memory
@ -20,16 +18,16 @@ class DQNAgent:
self.net.model.to(self.net.device) self.net.model.to(self.net.device)
self.target_net.sync() self.target_net.sync()
def learn(self, logger = None): def learn(self, logger=None):
if len(self.memory) < self.config['batch_size']: if len(self.memory) < self.config['batch_size']:
return return
if (isinstance(self.memory, M.PrioritizedReplayMemory)): if isinstance(self.memory, M.PrioritizedReplayMemory):
weight_importance = self.config['prioritized_replay_weight_importance'] weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value # If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
minibatch = self.memory.sample(self.config['batch_size'], beta = beta) minibatch = self.memory.sample(self.config['batch_size'], beta=beta)
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True) state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
else: else:
minibatch = self.memory.sample(self.config['batch_size']) minibatch = self.memory.sample(self.config['batch_size'])
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch) state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)
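
Note that collections.Iterable is a deprecated alias that was removed in Python 3.10; the scheduler-or-constant pattern used above can be written against collections.abc instead. The schedule values below are made up for illustration:

    from collections.abc import Iterable

    def resolve_beta(weight_importance):
        # Mirrors the check above: iterator-style schedulers are advanced, plain numbers pass through.
        return next(weight_importance) if isinstance(weight_importance, Iterable) else weight_importance

    beta_schedule = iter([0.4, 0.5, 0.6])   # hypothetical annealing schedule
    print(resolve_beta(beta_schedule))      # 0.4
    print(resolve_beta(0.4))                # 0.4
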
@ -49,7 +47,7 @@ class DQNAgent:
# and the regular net to select the action # and the regular net to select the action
# That way we decouple the value and action selecting processes (DOUBLE DQN) # That way we decouple the value and action selecting processes (DOUBLE DQN)
not_done_size = not_done_batch.sum() not_done_size = not_done_batch.sum()
next_state_values = torch.zeros_like(state_values, device = self.net.device) next_state_values = torch.zeros_like(state_values, device=self.net.device)
if self.target_net is not None: if self.target_net is not None:
next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch]) next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch])
next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1) next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1)
@ -57,15 +55,15 @@ class DQNAgent:
next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch]) next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch])
next_best_action = next_state_values[not_done_batch].argmax(1) next_best_action = next_state_values[not_done_batch].argmax(1)
best_next_state_value = torch.zeros(self.config['batch_size'], device = self.net.device) best_next_state_value = torch.zeros(self.config['batch_size'], device=self.net.device)
best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1) best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1) expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
# If we're sampling by TD error, multiply loss by an importance weight which helps decrease overfitting # If we're sampling by TD error, multiply loss by an importance weight which helps decrease overfitting
if (isinstance(self.memory, M.PrioritizedReplayMemory)): if isinstance(self.memory, M.PrioritizedReplayMemory):
# loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.smooth_l1_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean() # loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.smooth_l1_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
loss = (torch.as_tensor(importance_weights, device = self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean() loss = (torch.as_tensor(importance_weights, device=self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
else: else:
# loss = F.smooth_l1_loss(obtained_values, expected_values) # loss = F.smooth_l1_loss(obtained_values, expected_values)
loss = F.mse_loss(obtained_values, expected_values) loss = F.mse_loss(obtained_values, expected_values)
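
For orientation, a toy sketch of the Double DQN target assembled in this block: the online network selects the next action, the target network values it. The Q-values, rewards, and discount rate below are invented; in the agent they come from the networks and the config.

    import torch

    discount_rate = 0.99
    rewards = torch.tensor([1.0, 0.0])
    not_done = torch.tensor([True, False])
    online_q_next = torch.tensor([[0.2, 0.8], [0.5, 0.1]])   # Q(s', .) from the online network
    target_q_next = torch.tensor([[0.3, 0.7], [0.4, 0.2]])   # Q(s', .) from the target network

    best_action = online_q_next.argmax(1)                    # online net chooses the action
    best_value = target_q_next.gather(1, best_action.view(-1, 1)).squeeze(1)
    best_value = best_value * not_done                       # terminal transitions contribute zero
    expected_values = rewards + discount_rate * best_value
    print(expected_values)
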
@ -85,8 +83,6 @@ class DQNAgent:
self.target_net.sync() self.target_net.sync()
# If we're sampling by TD error, readjust the weights of the experiences # If we're sampling by TD error, readjust the weights of the experiences
if (isinstance(self.memory, M.PrioritizedReplayMemory)): if isinstance(self.memory, M.PrioritizedReplayMemory):
td_error = (obtained_values - expected_values).detach().abs() td_error = (obtained_values - expected_values).detach().abs()
self.memory.update_priorities(batch_indexes, td_error) self.memory.update_priorities(batch_indexes, td_error)

View file

@ -1,14 +1,12 @@
import collections import collections
from copy import deepcopy
import rltorch.memory as M import rltorch.memory as M
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path
from rltorch.action_selector import ArgMaxSelector
class DQfDAgent: class DQfDAgent:
def __init__(self, net, memory, config, target_net = None, logger = None): def __init__(self, net, memory, config, target_net=None, logger=None):
self.net = net self.net = net
self.target_net = target_net self.target_net = target_net
self.memory = memory self.memory = memory
@ -21,7 +19,7 @@ class DQfDAgent:
self.net.model.to(self.net.device) self.net.model.to(self.net.device)
self.target_net.sync() self.target_net.sync()
def learn(self, logger = None): def learn(self, logger=None):
if len(self.memory) < self.config['batch_size']: if len(self.memory) < self.config['batch_size']:
return return
@ -32,29 +30,19 @@ class DQfDAgent:
batch_size = self.config['batch_size'] batch_size = self.config['batch_size']
steps = None steps = None
if isinstance(self.memory, M.DQfDMemory):
weight_importance = self.config['prioritized_replay_weight_importance'] weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value # If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) \
else weight_importance
# Check to see if we are doing N-Step DQN # Check to see if we are doing N-Step DQN
if steps is not None: if steps is not None:
minibatch = self.memory.sample_n_steps(batch_size, steps, beta) minibatch = self.memory.sample_n_steps(batch_size, steps, beta)
else: else:
minibatch = self.memory.sample(batch_size, beta = beta) minibatch = self.memory.sample(batch_size, beta=beta)
# Process batch # Process batch
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True) state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
else:
# Check to see if we're doing N-Step DQN
if steps is not None:
minibatch = self.memory.sample_n_steps(batch_size, steps)
else:
minibatch = self.memory.sample(batch_size)
# Process batch
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, batch_indexes = M.zip_batch(minibatch, want_indices = True)
batch_index_tensors = torch.tensor(batch_indexes) batch_index_tensors = torch.tensor(batch_indexes)
demo_mask = batch_index_tensors < self.memory.demo_position demo_mask = batch_index_tensors < self.memory.demo_position
@ -75,7 +63,7 @@ class DQfDAgent:
# and the regular net to select the action # and the regular net to select the action
# That way we decouple the value and action selecting processes (DOUBLE DQN) # That way we decouple the value and action selecting processes (DOUBLE DQN)
not_done_size = not_done_batch.sum() not_done_size = not_done_batch.sum()
next_state_values = torch.zeros_like(state_values, device = self.net.device) next_state_values = torch.zeros_like(state_values, device=self.net.device)
if self.target_net is not None: if self.target_net is not None:
next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch]) next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch])
next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1) next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1)
@ -83,14 +71,14 @@ class DQfDAgent:
next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch]) next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch])
next_best_action = next_state_values[not_done_batch].argmax(1) next_best_action = next_state_values[not_done_batch].argmax(1)
best_next_state_value = torch.zeros(batch_size, device = self.net.device) best_next_state_value = torch.zeros(batch_size, device=self.net.device)
best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1) best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1) expected_values = (reward_batch + (batch_size * best_next_state_value)).unsqueeze(1)
# N-Step DQN Loss # N-Step DQN Loss
# num_steps captures how many steps actually exist before the end of the episode # num_steps captures how many steps actually exist before the end of the episode
if steps != None: if steps is not None:
expected_n_step_values = [] expected_n_step_values = []
with torch.no_grad(): with torch.no_grad():
for i in range(0, len(state_batch), steps): for i in range(0, len(state_batch), steps):
@ -127,7 +115,7 @@ class DQfDAgent:
l = torch.ones_like(state_values[demo_mask]) l = torch.ones_like(state_values[demo_mask])
expert_actions = action_batch[demo_mask] expert_actions = action_batch[demo_mask]
# l(s, a) is zero for every action the expert doesn't take # l(s, a) is zero for every action the expert doesn't take
for i,a in zip(range(len(l)), expert_actions): for i, _, a in zip(enumerate(l), expert_actions):
l[i].fill_(0.8) # According to paper l[i].fill_(0.8) # According to paper
l[i, a] = 0 l[i, a] = 0
if self.target_net is not None: if self.target_net is not None:
@ -148,26 +136,17 @@ class DQfDAgent:
# Since dqn_loss and demo_loss are different sizes, the reduction has to happen before they are combined # Since dqn_loss and demo_loss are different sizes, the reduction has to happen before they are combined
if isinstance(self.memory, M.DQfDMemory): dqn_loss = (torch.as_tensor(importance_weights, device=self.net.device) * F.mse_loss(obtained_values, expected_values, reduction='none').squeeze(1)).mean()
dqn_loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.mse_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
else:
dqn_loss = F.mse_loss(obtained_values, expected_values)
if steps != None: if steps is not None:
if isinstance(self.memory, M.DQfDMemory): dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device=self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction='none')).mean()
dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device = self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none')).mean()
else: else:
dqn_n_step_loss = F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none').mean() dqn_n_step_loss = torch.tensor(0, device=self.net.device)
else:
dqn_n_step_loss = torch.tensor(0, device = self.net.device)
if demo_mask.sum() > 0: if demo_mask.sum() > 0:
if isinstance(self.memory, M.DQfDMemory): demo_loss = (torch.as_tensor(importance_weights, device=self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction='none').squeeze(1)).mean()
demo_loss = (torch.as_tensor(importance_weights, device = self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1)).mean()
else: else:
demo_loss = F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1).mean() demo_loss = 0
else:
demo_loss = 0.
loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss
if self.logger is not None: if self.logger is not None:

View file

@ -1,15 +1,11 @@
from copy import deepcopy from copy import deepcopy
import numpy as np
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from torch.distributions import Categorical from torch.distributions import Categorical
import rltorch import rltorch
import rltorch.memory as M
import collections
import random
class PPOAgent: class PPOAgent:
def __init__(self, policy_net, value_net, memory, config, logger = None): def __init__(self, policy_net, value_net, memory, config, logger=None):
self.policy_net = policy_net self.policy_net = policy_net
self.old_policy_net = rltorch.network.TargetNetwork(policy_net) self.old_policy_net = rltorch.network.TargetNetwork(policy_net)
self.value_net = value_net self.value_net = value_net
@ -20,10 +16,9 @@ class PPOAgent:
def _discount_rewards(self, rewards): def _discount_rewards(self, rewards):
gammas = torch.ones_like(rewards) gammas = torch.ones_like(rewards)
if len(rewards) > 1: if len(rewards) > 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0) gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim=0)
return gammas * rewards return gammas * rewards
def learn(self): def learn(self):
episode_batch = self.memory.recall() episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch) state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
@ -61,21 +56,17 @@ class PPOAgent:
# For PPO we want to stay within a certain ratio of the old policy # For PPO we want to stay within a certain ratio of the old policy
policy_ratio = torch.exp(log_prob_batch - old_log_probs) # Equivalent to (prob / old_prob) policy_ratio = torch.exp(log_prob_batch - old_log_probs) # Equivalent to (prob / old_prob)
policy_loss1 = policy_ratio * advantages policy_loss1 = policy_ratio * advantages
policy_loss2 = policy_ratio.clamp(min = 0.8, max = 1.2) * advantages # From original paper policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages # From original paper
policy_loss = -torch.min(policy_loss1, policy_loss2).sum() policy_loss = -torch.min(policy_loss1, policy_loss2).sum()
if self.logger is not None: if self.logger is not None:
self.logger.append("Loss/Policy", policy_loss.item()) self.logger.append("Loss/Policy", policy_loss.item())
self.logger.append("Loss/Value", value_loss.item()) self.logger.append("Loss/Value", value_loss.item())
self.old_policy_net.sync() self.old_policy_net.sync()
self.policy_net.zero_grad() self.policy_net.zero_grad()
policy_loss.backward() policy_loss.backward()
self.policy_net.step() self.policy_net.step()
# Memory under the old policy is not needed for future training # Memory under the old policy is not needed for future training
self.memory.clear() self.memory.clear()
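
The clamp to [0.8, 1.2] above is the PPO clipped surrogate with epsilon = 0.2. A self-contained sketch of that objective; the log-probabilities and advantages are made up:

    import torch

    epsilon = 0.2
    log_prob = torch.tensor([-0.2, -1.0])
    old_log_prob = torch.tensor([-0.3, -0.9])
    advantages = torch.tensor([1.5, -0.5])

    ratio = torch.exp(log_prob - old_log_prob)                      # pi(a|s) / pi_old(a|s)
    surrogate1 = ratio * advantages
    surrogate2 = ratio.clamp(min=1 - epsilon, max=1 + epsilon) * advantages
    policy_loss = -torch.min(surrogate1, surrogate2).sum()
    print(policy_loss)
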

View file

@ -2,16 +2,17 @@ from copy import deepcopy
import collections import collections
import numpy as np import numpy as np
import torch import torch
import torch.nn.functional as F
from torch.distributions import Categorical from torch.distributions import Categorical
import rltorch import rltorch
import rltorch.memory as M import rltorch.memory as M
import torch.nn.functional as F
# Q-Evolutionary Policy Agent # Q-Evolutionary Policy Agent
# Maximizes the policy with respect to the Q-Value function. # Maximizes the policy with respect to the Q-Value function.
# Since the function is non-differentiable, it depends on the Evolutionary Strategy algorithm # Since the function is non-differentiable, it depends on the Evolutionary Strategy algorithm
class QEPAgent: class QEPAgent:
def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None, entropy_importance = 0, policy_skip = 4, after_value_train = None): def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4):
self.policy_net = policy_net self.policy_net = policy_net
assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP) assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP)
self.policy_net.fitness = self.fitness self.policy_net.fitness = self.fitness
@ -22,7 +23,6 @@ class QEPAgent:
self.logger = logger self.logger = logger
self.policy_skip = policy_skip self.policy_skip = policy_skip
self.entropy_importance = entropy_importance self.entropy_importance = entropy_importance
self.after_value_train = after_value_train
def save(self, file_location): def save(self, file_location):
torch.save({ torch.save({
@ -42,10 +42,8 @@ class QEPAgent:
batch_size = len(state_batch) batch_size = len(state_batch)
with torch.no_grad(): with torch.no_grad():
action_probabilities = policy_net(state_batch) action_probabilities = policy_net(state_batch)
action_size = action_probabilities.shape[1] action_size = action_probabilities.shape[1]
distributions = list(map(Categorical, action_probabilities)) distributions = list(map(Categorical, action_probabilities))
actions = torch.stack([d.sample() for d in distributions]) actions = torch.stack([d.sample() for d in distributions])
with torch.no_grad(): with torch.no_grad():
@ -54,31 +52,31 @@ class QEPAgent:
# Weird hacky solution where in multiprocess, it sometimes spits out nans # Weird hacky solution where in multiprocess, it sometimes spits out nans
# So have it try again # So have it try again
while torch.isnan(state_values).any(): while torch.isnan(state_values).any():
print("NAN DETECTED")
with torch.no_grad(): with torch.no_grad():
state_values = value_net(state_batch) state_values = value_net(state_batch)
obtained_values = state_values.gather(1, actions.view(batch_size, 1)).squeeze(1) obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1)
# return -obtained_values.mean().item()
entropy_importance = 0 # Entropy accounting for 1% of loss seems to work well
entropy_importance = next(self.entropy_importance) if isinstance(self.entropy_importance, collections.Iterable) else self.entropy_importance entropy_importance = next(self.entropy_importance) if isinstance(self.entropy_importance, collections.Iterable) else self.entropy_importance
value_importance = 1 - entropy_importance value_importance = 1 - entropy_importance
# entropy_loss = (action_probabilities * torch.log2(action_probabilities)).sum(1) # Standard entropy loss from information theory # entropy_loss = (action_probabilities * torch.log2(action_probabilities)).sum(1) # Standard entropy loss from information theory
entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device = state_batch.device).repeat(batch_size, action_size)).abs().sum(1) entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device=state_batch.device).repeat(len(state_batch), action_size)).abs().sum(1)
return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item() return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item()
def learn(self, logger = None): def learn(self, logger=None):
if len(self.memory) < self.config['batch_size']: if len(self.memory) < self.config['batch_size']:
return return
if (isinstance(self.memory, M.PrioritizedReplayMemory)): if isinstance(self.memory, M.PrioritizedReplayMemory):
weight_importance = self.config['prioritized_replay_weight_importance'] weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value # If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
minibatch = self.memory.sample(self.config['batch_size'], beta = beta) minibatch = self.memory.sample(self.config['batch_size'], beta=beta)
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True) state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
else: else:
minibatch = self.memory.sample(self.config['batch_size']) minibatch = self.memory.sample(self.config['batch_size'])
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch) state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)
@ -98,7 +96,7 @@ class QEPAgent:
# and the regular net to select the action # and the regular net to select the action
# That way we decouple the value and action selecting processes (DOUBLE DQN) # That way we decouple the value and action selecting processes (DOUBLE DQN)
not_done_size = not_done_batch.sum() not_done_size = not_done_batch.sum()
next_state_values = torch.zeros_like(state_values, device = self.value_net.device) next_state_values = torch.zeros_like(state_values, device=self.value_net.device)
if self.target_value_net is not None: if self.target_value_net is not None:
next_state_values[not_done_batch] = self.target_value_net(next_state_batch[not_done_batch]) next_state_values[not_done_batch] = self.target_value_net(next_state_batch[not_done_batch])
next_best_action = self.value_net(next_state_batch[not_done_batch]).argmax(1) next_best_action = self.value_net(next_state_batch[not_done_batch]).argmax(1)
@ -106,13 +104,13 @@ class QEPAgent:
next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch]) next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
next_best_action = next_state_values[not_done_batch].argmax(1) next_best_action = next_state_values[not_done_batch].argmax(1)
best_next_state_value = torch.zeros(self.config['batch_size'], device = self.value_net.device) best_next_state_value = torch.zeros(self.config['batch_size'], device=self.value_net.device)
best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1) best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1) expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
if (isinstance(self.memory, M.PrioritizedReplayMemory)): if isinstance(self.memory, M.PrioritizedReplayMemory):
value_loss = (torch.as_tensor(importance_weights, device = self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean() value_loss = (torch.as_tensor(importance_weights, device=self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
else: else:
value_loss = F.mse_loss(obtained_values, expected_values) value_loss = F.mse_loss(obtained_values, expected_values)
@ -124,16 +122,13 @@ class QEPAgent:
self.value_net.clamp_gradients() self.value_net.clamp_gradients()
self.value_net.step() self.value_net.step()
if callable(self.after_value_train):
self.after_value_train()
if self.target_value_net is not None: if self.target_value_net is not None:
if 'target_sync_tau' in self.config: if 'target_sync_tau' in self.config:
self.target_value_net.partial_sync(self.config['target_sync_tau']) self.target_value_net.partial_sync(self.config['target_sync_tau'])
else: else:
self.target_value_net.sync() self.target_value_net.sync()
if (isinstance(self.memory, M.PrioritizedReplayMemory)): if isinstance(self.memory, M.PrioritizedReplayMemory):
td_error = (obtained_values - expected_values).detach().abs() td_error = (obtained_values - expected_values).detach().abs()
self.memory.update_priorities(batch_indexes, td_error) self.memory.update_priorities(batch_indexes, td_error)
@ -141,7 +136,8 @@ class QEPAgent:
if self.policy_skip > 0: if self.policy_skip > 0:
self.policy_skip -= 1 self.policy_skip -= 1
return return
self.policy_skip = self.config['policy_skip'] self.policy_skip = 4
if self.target_value_net is not None: if self.target_value_net is not None:
self.policy_net.calc_gradients(self.target_value_net, state_batch) self.policy_net.calc_gradients(self.target_value_net, state_batch)
else: else:

View file

@ -1,10 +1,10 @@
import rltorch
from copy import deepcopy from copy import deepcopy
import torch
import numpy as np import numpy as np
import torch
import rltorch
class REINFORCEAgent: class REINFORCEAgent:
def __init__(self, net , memory, config, target_net = None, logger = None): def __init__(self, net, memory, config, target_net=None, logger=None):
self.net = net self.net = net
if not isinstance(memory, rltorch.memory.EpisodeMemory): if not isinstance(memory, rltorch.memory.EpisodeMemory):
raise ValueError("Memory must be of instance EpisodeMemory") raise ValueError("Memory must be of instance EpisodeMemory")
@ -23,14 +23,14 @@ class REINFORCEAgent:
for i in range(len(rewards)): for i in range(len(rewards)):
gammas = torch.ones_like(rewards[i:]) gammas = torch.ones_like(rewards[i:])
if i != len(rewards) - 1: if i != len(rewards) - 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim = 0) gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim=0)
advantages = rewards[i:] - baseline advantages = rewards[i:] - baseline
shaped_rewards[i] = (gammas * advantages).sum() shaped_rewards[i] = (gammas * advantages).sum()
return shaped_rewards return shaped_rewards
def learn(self): def learn(self):
episode_batch = self.memory.recall() episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch) _, _, reward_batch, _, _, log_prob_batch = zip(*episode_batch)
# Calculate discounted rewards to place more importance on recent rewards # Calculate discounted rewards to place more importance on recent rewards
shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch)) shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch))

View file

@ -1,6 +1,6 @@
from .A2CSingleAgent import * from .A2CSingleAgent import A2CSingleAgent
from .DQNAgent import * from .DQNAgent import DQNAgent
from .DQfDAgent import * from .DQfDAgent import DQfDAgent
from .PPOAgent import * from .PPOAgent import PPOAgent
from .QEPAgent import * from .QEPAgent import QEPAgent
from .REINFORCEAgent import * from .REINFORCEAgent import REINFORCEAgent

View file

@ -1,8 +1,8 @@
from copy import deepcopy from copy import deepcopy
import rltorch
import time import time
import rltorch
def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger = None, name = "", render = False): def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False):
for episode in range(total_episodes): for episode in range(total_episodes):
state = env.reset() state = env.reset()
done = False done = False
@ -27,8 +27,8 @@ def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger
logger.append(name + '/EpisodeReward', episode_reward) logger.append(name + '/EpisodeReward', episode_reward)
class EnvironmentRunSync(): class EnvironmentRunSync:
def __init__(self, env, actor, config, memory = None, logwriter = None, name = "", render = False): def __init__(self, env, actor, config, memory=None, logwriter=None, name="", render=False):
self.env = env self.env = env
self.name = name self.name = name
self.actor = actor self.actor = actor
@ -72,8 +72,8 @@ class EnvironmentRunSync():
self.last_state = state self.last_state = state
class EnvironmentEpisodeSync(): class EnvironmentEpisodeSync:
def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""): def __init__(self, env, actor, config, memory=None, logwriter=None, name=""):
self.env = env self.env = env
self.name = name self.name = name
self.actor = actor self.actor = actor

View file

@ -1,8 +1,8 @@
from collections import deque
import gym import gym
import torch import torch
from gym import spaces from gym import spaces
import cv2 import cv2
from collections import deque
import numpy as np import numpy as np
class EpisodicLifeEnv(gym.Wrapper): class EpisodicLifeEnv(gym.Wrapper):
@ -170,7 +170,12 @@ class FrameStack(gym.Wrapper):
self.k = k self.k = k
self.frames = deque([], maxlen=k) self.frames = deque([], maxlen=k)
shp = env.observation_space.shape shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) self.observation_space = spaces.Box(
low=0,
high=255,
shape=(shp[:-1] + (shp[-1] * k,)),
dtype=env.observation_space.dtype
)
def reset(self): def reset(self):
ob = self.env.reset() ob = self.env.reset()
@ -189,7 +194,7 @@ class FrameStack(gym.Wrapper):
return torch.cat(list(self.frames)).unsqueeze(0) return torch.cat(list(self.frames)).unsqueeze(0)
class ProcessFrame(gym.Wrapper): class ProcessFrame(gym.Wrapper):
def __init__(self, env, resize_shape = None, crop_bounds = None, grayscale = False): def __init__(self, env, resize_shape=None, crop_bounds=None, grayscale=False):
gym.Wrapper.__init__(self, env) gym.Wrapper.__init__(self, env)
self.resize_shape = resize_shape self.resize_shape = resize_shape
self.crop_bounds = crop_bounds self.crop_bounds = crop_bounds
@ -207,14 +212,16 @@ class ProcessFrame(gym.Wrapper):
if self.grayscale: if self.grayscale:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
if self.crop_bounds is not None and len(self.crop_bounds) == 4: if self.crop_bounds is not None and len(self.crop_bounds) == 4:
frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]] frame = frame[
self.crop_bounds[0]:self.crop_bounds[1],
self.crop_bounds[2]:self.crop_bounds[3]
]
if self.resize_shape is not None and len(self.resize_shape) == 2: if self.resize_shape is not None and len(self.resize_shape) == 2:
frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA) frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA)
# Normalize # Normalize
frame = frame / 255 frame = frame / 255
return frame return frame
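
A sketch of the crop-then-resize order used in ProcessFrame, with made-up bounds for an Atari-sized frame (crop_bounds is top, bottom, left, right):

    import numpy as np
    import cv2

    frame = np.zeros((210, 160, 3), dtype=np.uint8)   # assumed 210x160 RGB frame
    crop_bounds = (34, 194, 0, 160)                    # assumed values
    resize_shape = (84, 84)

    cropped = frame[crop_bounds[0]:crop_bounds[1], crop_bounds[2]:crop_bounds[3]]
    resized = cv2.resize(cropped, resize_shape, interpolation=cv2.INTER_AREA)
    print(resized.shape)    # (84, 84, 3)
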
# Turns observations into torch tensors # Turns observations into torch tensors
# Adds an additional dimension that's supposed to represent the batch dim # Adds an additional dimension that's supposed to represent the batch dim
class TorchWrap(gym.Wrapper): class TorchWrap(gym.Wrapper):
@ -233,8 +240,6 @@ class TorchWrap(gym.Wrapper):
frame = torch.from_numpy(frame).unsqueeze(0).float() frame = torch.from_numpy(frame).unsqueeze(0).float()
return frame return frame
class ProcessFrame84(gym.ObservationWrapper): class ProcessFrame84(gym.ObservationWrapper):
def __init__(self, env=None): def __init__(self, env=None):
super(ProcessFrame84, self).__init__(env) super(ProcessFrame84, self).__init__(env)
@ -256,4 +261,3 @@ class ProcessFrame84(gym.ObservationWrapper):
x_t = resized_screen[18:102, :] x_t = resized_screen[18:102, :]
x_t = np.reshape(x_t, [84, 84]) x_t = np.reshape(x_t, [84, 84])
return x_t.astype(np.uint8) return x_t.astype(np.uint8)

View file

@ -1,13 +1,13 @@
from .PrioritizedReplayMemory import PrioritizedReplayMemory
from collections import namedtuple from collections import namedtuple
import numpy as np import numpy as np
from .PrioritizedReplayMemory import PrioritizedReplayMemory
Transition = namedtuple('Transition', Transition = namedtuple('Transition',
('state', 'action', 'reward', 'next_state', 'done')) ('state', 'action', 'reward', 'next_state', 'done'))
class DQfDMemory(PrioritizedReplayMemory): class DQfDMemory(PrioritizedReplayMemory):
def __init__(self, capacity, alpha, max_demo = -1): def __init__(self, capacity, alpha, max_demo=-1):
assert max_demo <= capacity assert max_demo <= capacity
super().__init__(capacity, alpha) super().__init__(capacity, alpha)
self.demo_position = 0 self.demo_position = 0
@ -47,7 +47,8 @@ class DQfDMemory(PrioritizedReplayMemory):
idxes = self._sample_proportional(sample_size) idxes = self._sample_proportional(sample_size)
step_idxes = [] step_idxes = []
for i in idxes: for i in idxes:
# If the interval of experiences falls between demonstration and obtained, move it over to the demonstration half # If the interval of experiences falls between demonstration and obtained,
# move it over to the demonstration half
if i < self.demo_position and i + steps > self.demo_position: if i < self.demo_position and i + steps > self.demo_position:
diff = i + steps - self.demo_position diff = i + steps - self.demo_position
step_idxes += range(i - diff, i + steps - diff) step_idxes += range(i - diff, i + steps - diff)

View file

@ -1,6 +1,4 @@
import random
from collections import namedtuple from collections import namedtuple
import torch
Transition = namedtuple('Transition', Transition = namedtuple('Transition',
('state', 'action', 'reward', 'next_state', 'done')) ('state', 'action', 'reward', 'next_state', 'done'))

View file

@ -1,10 +1,9 @@
# From OpenAI Baselines https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py # From OpenAI Baselines https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
from .ReplayMemory import ReplayMemory
import operator import operator
import random import random
import numpy as np import numpy as np
from numba import jit from numba import jit
from .ReplayMemory import ReplayMemory
class SegmentTree(object): class SegmentTree(object):
def __init__(self, capacity, operation, neutral_element): def __init__(self, capacity, operation, neutral_element):
@ -34,7 +33,7 @@ class SegmentTree(object):
self._value = [neutral_element for _ in range(2 * capacity)] self._value = [neutral_element for _ in range(2 * capacity)]
self._operation = operation self._operation = operation
@jit(forceobj = True) @jit(forceobj=True)
def _reduce_helper(self, start, end, node, node_start, node_end): def _reduce_helper(self, start, end, node, node_start, node_end):
if start == node_start and end == node_end: if start == node_start and end == node_end:
return self._value[node] return self._value[node]
@ -50,7 +49,7 @@ class SegmentTree(object):
self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
) )
@jit(forceobj = True) @jit(forceobj=True)
def reduce(self, start=0, end=None): def reduce(self, start=0, end=None):
"""Returns result of applying `self.operation` """Returns result of applying `self.operation`
to a contiguous subsequence of the array. to a contiguous subsequence of the array.
@ -73,7 +72,7 @@ class SegmentTree(object):
end -= 1 end -= 1
return self._reduce_helper(start, end, 1, 0, self._capacity - 1) return self._reduce_helper(start, end, 1, 0, self._capacity - 1)
@jit(forceobj = True) @jit(forceobj=True)
def __setitem__(self, idx, val): def __setitem__(self, idx, val):
# index of the leaf # index of the leaf
idx += self._capacity idx += self._capacity
@ -86,7 +85,7 @@ class SegmentTree(object):
) )
idx //= 2 idx //= 2
@jit(forceobj = True) @jit(forceobj=True)
def __getitem__(self, idx): def __getitem__(self, idx):
assert 0 <= idx < self._capacity assert 0 <= idx < self._capacity
return self._value[self._capacity + idx] return self._value[self._capacity + idx]
@ -100,12 +99,12 @@ class SumSegmentTree(SegmentTree):
neutral_element=0.0 neutral_element=0.0
) )
@jit(forceobj = True) @jit(forceobj=True)
def sum(self, start=0, end=None): def sum(self, start=0, end=None):
"""Returns arr[start] + ... + arr[end]""" """Returns arr[start] + ... + arr[end]"""
return super(SumSegmentTree, self).reduce(start, end) return super(SumSegmentTree, self).reduce(start, end)
@jit(forceobj = True, parallel = True) @jit(forceobj=True, parallel=True)
def find_prefixsum_idx(self, prefixsum): def find_prefixsum_idx(self, prefixsum):
"""Find the highest index `i` in the array such that """Find the highest index `i` in the array such that
sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum
@ -140,7 +139,7 @@ class MinSegmentTree(SegmentTree):
neutral_element=float('inf') neutral_element=float('inf')
) )
@jit(forceobj = True) @jit(forceobj=True)
def min(self, start=0, end=None): def min(self, start=0, end=None):
"""Returns min(arr[start], ..., arr[end])""" """Returns min(arr[start], ..., arr[end])"""
return super(MinSegmentTree, self).reduce(start, end) return super(MinSegmentTree, self).reduce(start, end)
@ -185,7 +184,7 @@ class PrioritizedReplayMemory(ReplayMemory):
self._it_sum[idx] = self._max_priority ** self._alpha self._it_sum[idx] = self._max_priority ** self._alpha
self._it_min[idx] = self._max_priority ** self._alpha self._it_min[idx] = self._max_priority ** self._alpha
@jit(forceobj = True) @jit(forceobj=True)
def _sample_proportional(self, batch_size): def _sample_proportional(self, batch_size):
res = [] res = []
p_total = self._it_sum.sum(0, len(self.memory) - 1) p_total = self._it_sum.sum(0, len(self.memory) - 1)
@ -294,7 +293,7 @@ class PrioritizedReplayMemory(ReplayMemory):
batch = list(zip(*encoded_sample, weights, step_idxes)) batch = list(zip(*encoded_sample, weights, step_idxes))
return batch return batch
@jit(forceobj = True) @jit(forceobj=True)
def update_priorities(self, idxes, priorities): def update_priorities(self, idxes, priorities):
""" """
Update priorities of sampled transitions. Update priorities of sampled transitions.
@ -320,4 +319,3 @@ class PrioritizedReplayMemory(ReplayMemory):
self._it_min[idx] = priority ** self._alpha self._it_min[idx] = priority ** self._alpha
self._max_priority = max(self._max_priority, priority) self._max_priority = max(self._max_priority, priority)

View file

@ -106,11 +106,9 @@ class ReplayMemory(object):
def __reversed__(self): def __reversed__(self):
return reversed(self.memory) return reversed(self.memory)
def zip_batch(minibatch, priority = False, want_indices = False): def zip_batch(minibatch, priority=False):
if priority: if priority:
state_batch, action_batch, reward_batch, next_state_batch, done_batch, weights, indexes = zip(*minibatch) state_batch, action_batch, reward_batch, next_state_batch, done_batch, weights, indexes = zip(*minibatch)
elif want_indices:
state_batch, action_batch, reward_batch, next_state_batch, done_batch, indexes = zip(*minibatch)
else: else:
state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch) state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)
@ -122,7 +120,5 @@ def zip_batch(minibatch, priority = False, want_indices = False):
if priority: if priority:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, weights, indexes return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, weights, indexes
elif want_indices:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, indexes
else: else:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch
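
A minimal sketch of the zip(*minibatch) transposition that zip_batch relies on, using plain tuples in place of real transitions:

    minibatch = [
        ("s0", 0, 1.0, "s1", False),
        ("s1", 1, 0.0, "s2", True),
    ]
    state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)
    print(state_batch)   # ('s0', 's1')
    print(done_batch)    # (False, True)
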

View file

@ -5,7 +5,7 @@ from copy import deepcopy
import torch.multiprocessing as mp import torch.multiprocessing as mp
class EnvironmentEpisode(mp.Process): class EnvironmentEpisode(mp.Process):
def __init__(self, env, actor, config, logger = None, name = ""): def __init__(self, env, actor, config, logger=None, name=""):
super(EnvironmentEpisode, self).__init__() super(EnvironmentEpisode, self).__init__()
self.env = env self.env = env
self.actor = actor self.actor = actor
@ -14,7 +14,7 @@ class EnvironmentEpisode(mp.Process):
self.name = name self.name = name
self.episode_num = 1 self.episode_num = 1
def run(self, printstat = False, memory = None): def run(self, printstat=False, memory=None):
state = self.env.reset() state = self.env.reset()
done = False done = False
episode_reward = 0 episode_reward = 0
@ -34,86 +34,3 @@ class EnvironmentEpisode(mp.Process):
self.logger.append(self.name + '/EpisodeReward', episode_reward) self.logger.append(self.name + '/EpisodeReward', episode_reward)
self.episode_num += 1 self.episode_num += 1
# from copy import deepcopy
# import torch.multiprocessing as mp
# from ctypes import *
# import rltorch.log
# def envepisode(actor, env, episode_num, config, runcondition, memoryqueue = None, logqueue = None, name = ""):
# # Wait for signal to start running through the environment
# while runcondition.wait():
# # Start a logger to log the rewards
# logger = rltorch.log.Logger()
# state = env.reset()
# episode_reward = 0
# done = False
# while not done:
# action = actor.act(state)
# next_state, reward, done, _ = env.step(action)
# episode_reward += reward
# if memoryqueue is not None:
# memoryqueue.put((state, action, reward, next_state, done))
# state = next_state
# if done:
# with episode_num.get_lock():
# if episode_num.value % config['print_stat_n_eps'] == 0:
# print("episode: {}/{}, score: {}"
# .format(episode_num.value, config['total_training_episodes'], episode_reward))
# if logger is not None:
# logger.append(name + '/EpisodeReward', episode_reward)
# episode_reward = 0
# state = env.reset()
# with episode_num.get_lock():
# episode_num.value += 1
# logqueue.put(logger)
# class EnvironmentRun():
# def __init__(self, env_func, actor, config, memory = None, name = ""):
# self.config = deepcopy(config)
# self.memory = memory
# self.episode_num = mp.Value(c_uint)
# self.runcondition = mp.Event()
# # Interestingly enough, there isn't a good reliable way to know how many states an episode will have
# # Perhaps we can share a uint to keep track...
# self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
# self.logqueue = mp.Queue(maxsize = 1)
# with self.episode_num.get_lock():
# self.episode_num.value = 1
# self.runner = mp.Process(target=envrun,
# args=(actor, env_func, self.episode_num, config, self.runcondition),
# kwargs = {'iterations': config['replay_skip'] + 1,
# 'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name})
# self.runner.start()
# def run(self):
# self.runcondition.set()
# def join(self):
# self._sync_memory()
# if self.logwriter is not None:
# self.logwriter.write(self._get_reward_logger())
# def sync_memory(self):
# if self.memory is not None:
# for i in range(self.config['replay_skip'] + 1):
# self.memory.append(*self.memory_queue.get())
# def get_reward_logger(self):
# return self.logqueue.get()
# def terminate(self):
# self.runner.terminate()

View file

@ -1,9 +1,9 @@
from copy import deepcopy from copy import deepcopy
from ctypes import c_uint
import torch.multiprocessing as mp import torch.multiprocessing as mp
from ctypes import *
import rltorch.log import rltorch.log
def envrun(actor, env, episode_num, config, runcondition, iterations = 1, memoryqueue = None, logqueue = None, name = ""): def envrun(actor, env, episode_num, config, runcondition, iterations=1, memoryqueue=None, logqueue=None, name=""):
state = env.reset() state = env.reset()
episode_reward = 0 episode_reward = 0
# Wait for signal to start running through the environment # Wait for signal to start running through the environment

View file

@ -1,7 +1,8 @@
from copy import deepcopy
import numpy as np import numpy as np
import torch import torch
from .Network import Network from .Network import Network
from copy import deepcopy
# [TODO] Should we torch.no_grad the __call__? # [TODO] Should we torch.no_grad the __call__?
# What if we want to sometimes do gradient descent as well? # What if we want to sometimes do gradient descent as well?
@ -38,7 +39,7 @@ class ESNetwork(Network):
name name
For use in logger to differentiate in analysis. For use in logger to differentiate in analysis.
""" """
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""): def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name) super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name)
self.population_size = population_size self.population_size = population_size
self.fitness = fitness_fn self.fitness = fitness_fn
@ -64,7 +65,11 @@ class ESNetwork(Network):
white_noise_dict = {} white_noise_dict = {}
noise_dict = {} noise_dict = {}
for key in model_dict.keys(): for key in model_dict.keys():
white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device) white_noise_dict[key] = torch.randn(
self.population_size,
*model_dict[key].shape,
device=self.device
)
noise_dict[key] = self.sigma * white_noise_dict[key] noise_dict[key] = self.sigma * white_noise_dict[key]
return white_noise_dict, noise_dict return white_noise_dict, noise_dict
@ -96,7 +101,10 @@ class ESNetwork(Network):
candidate_solutions = self._generate_candidate_solutions(noise_dict) candidate_solutions = self._generate_candidate_solutions(noise_dict)
## Calculate fitness then mean shift, scale ## Calculate fitness then mean shift, scale
fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions], device = self.device) fitness_values = torch.tensor(
[self.fitness(x, *args) for x in candidate_solutions],
device=self.device
)
if self.logger is not None: if self.logger is not None:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item()) self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps) fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
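
After scoring each candidate, the fitness values are mean-shifted and scaled by their standard deviation; the eps term guards against a zero standard deviation when every candidate scores the same. In a textbook evolution-strategies step (not shown in this hunk, and not necessarily identical to rltorch's update) the normalized fitness then weights the sampled noise to form a gradient estimate:

import numpy as np
import torch

population_size, sigma = 8, 0.05
white_noise = torch.randn(population_size, 4)                      # noise for one parameter tensor
fitness = torch.tensor([1.0, 0.5, 0.2, 0.9, 0.1, 0.3, 0.7, 0.4])   # made-up scores

# Mean shift and scale, as in the line above.
fitness = (fitness - fitness.mean()) / (fitness.std() + np.finfo('float').eps)

# Classic ES estimate: fitness-weighted average of the white noise.
grad_estimate = (fitness @ white_noise) / (population_size * sigma)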

View file

@@ -1,9 +1,8 @@
from copy import deepcopy
import numpy as np import numpy as np
import torch import torch
from .Network import Network
from copy import deepcopy
import torch.multiprocessing as mp import torch.multiprocessing as mp
import functools from .Network import Network
class fn_copy: class fn_copy:
def __init__(self, fn, args): def __init__(self, fn, args):
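
The body of fn_copy is not part of this hunk, but its constructor suggests the usual trick of binding extra arguments into a picklable callable so the fitness function can be shipped through a process pool. A hypothetical sketch of that pattern:

class BindArgs:
    # Hypothetical equivalent of fn_copy: store the function and its extra
    # arguments so instances pickle cleanly and can be handed to pool.map.
    def __init__(self, fn, args):
        self.fn = fn
        self.args = args
    def __call__(self, x):
        return self.fn(x, *self.args)

def fitness(candidate, scale):
    return scale * sum(candidate)

scorer = BindArgs(fitness, (2.0,))
print(scorer([1, 2, 3]))  # 12.0
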
@@ -20,14 +19,15 @@ class ESNetworkMP(Network):
fitness_fun := model, *args -> fitness_value (float) fitness_fun := model, *args -> fitness_value (float)
We wish to find a model that maximizes the fitness function We wish to find a model that maximizes the fitness function
""" """
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""): def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
super(ESNetworkMP, self).__init__(model, optimizer, config, device, logger, name) super(ESNetworkMP, self).__init__(model, optimizer, config, device, logger, name)
self.population_size = population_size self.population_size = population_size
self.fitness = fitness_fn self.fitness = fitness_fn
self.sigma = sigma self.sigma = sigma
assert self.sigma > 0 assert self.sigma > 0
mp_ctx = mp.get_context("spawn") mp_ctx = mp.get_context("spawn")
self.pool = mp_ctx.Pool(processes=2) #[TODO] Probably should make number of processes a config variable #[TODO] Probably should make number of processes a config variable
self.pool = mp_ctx.Pool(processes=2)
# We're not going to be calculating gradients in the traditional way # We're not going to be calculating gradients in the traditional way
# So there's no need to waste computation time keeping track # So there's no need to waste computation time keeping track
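
The [TODO] about the hard-coded pool size could be addressed by reading the worker count from the existing config dict. A small sketch under the assumption of a hypothetical pool_size key:

import torch.multiprocessing as mp

def make_pool(config):
    # 'pool_size' is a hypothetical config key; 2 matches the current default.
    ctx = mp.get_context("spawn")
    return ctx.Pool(processes=config.get("pool_size", 2))
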
@@ -42,7 +42,11 @@ class ESNetworkMP(Network):
white_noise_dict = {} white_noise_dict = {}
noise_dict = {} noise_dict = {}
for key in model_dict.keys(): for key in model_dict.keys():
white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device) white_noise_dict[key] = torch.randn(
self.population_size,
*model_dict[key].shape,
device=self.device
)
noise_dict[key] = self.sigma * white_noise_dict[key] noise_dict[key] = self.sigma * white_noise_dict[key]
return white_noise_dict, noise_dict return white_noise_dict, noise_dict
@@ -67,7 +71,10 @@ class ESNetworkMP(Network):
candidate_solutions = self._generate_candidate_solutions(noise_dict) candidate_solutions = self._generate_candidate_solutions(noise_dict)
## Calculate fitness then mean shift, scale ## Calculate fitness then mean shift, scale
fitness_values = torch.tensor(list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)), device = self.device) fitness_values = torch.tensor(
list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)),
device=self.device
)
if self.logger is not None: if self.logger is not None:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item()) self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())

View file

@@ -17,12 +17,16 @@ class Network:
name name
For use in logger to differentiate in analysis. For use in logger to differentiate in analysis.
""" """
def __init__(self, model, optimizer, config, device = None, logger = None, name = ""): def __init__(self, model, optimizer, config, device=None, logger=None, name=""):
self.model = model self.model = model
if 'weight_decay' in config: if 'weight_decay' in config:
self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay']) self.optimizer = optimizer(
model.parameters(),
lr=config['learning_rate'],
weight_decay=config['weight_decay']
)
else: else:
self.optimizer = optimizer(model.parameters(), lr = config['learning_rate']) self.optimizer = optimizer(model.parameters(), lr=config['learning_rate'])
self.logger = logger self.logger = logger
self.name = name self.name = name
self.device = device self.device = device
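
For reference, a hypothetical construction of the wrapper above: the optimizer is handed in as a class and instantiated internally with the config's learning rate and optional weight decay. The rltorch.network import path is assumed from the package's __init__.py, and the config values are made up.

import torch
import torch.nn as nn
from rltorch.network import Network  # import path assumed

model = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 2))
config = {'learning_rate': 1e-3, 'weight_decay': 1e-5}
net = Network(model, torch.optim.Adam, config, device=torch.device("cpu"), name="policy")
q_values = net(torch.randn(1, 4))  # __call__ forwards to the underlying model
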
@@ -32,7 +36,7 @@ class Network:
def __call__(self, *args): def __call__(self, *args):
return self.model(*args) return self.model(*args)
def clamp_gradients(self, x = 1): def clamp_gradients(self, x=1):
""" """
Forcing gradients to stay within a certain interval Forcing gradients to stay within a certain interval
by setting it to the bound if it goes over it. by setting it to the bound if it goes over it.
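
A stand-alone sketch of what the docstring describes: clip every gradient entry into [-x, x] after backward() and before the optimizer step (assuming the method walks over the wrapped model's parameters):

import torch
import torch.nn as nn

def clamp_gradients(model, x=1):
    # Element-wise clip of each gradient into [-x, x].
    for param in model.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-x, x)

model = nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
clamp_gradients(model, x=1)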

View file

@@ -1,7 +1,8 @@
import math
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import math
# This class utilizes this property of the normal distribution # This class utilizes this property of the normal distribution
# N(mu, sigma) = mu + sigma * N(0, 1) # N(mu, sigma) = mu + sigma * N(0, 1)
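
Concretely, the property above means a draw from N(mu, sigma) can be built from standard-normal noise, which is what lets mu and sigma stay ordinary learnable tensors:

import torch

mu, sigma = 0.5, 0.1
sample = mu + sigma * torch.randn(())  # a single draw from N(0.5, 0.1)
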
@@ -10,7 +11,6 @@ class NoisyLinear(nn.Linear):
Draws the parameters of nn.Linear from a normal distribution. Draws the parameters of nn.Linear from a normal distribution.
The parameters of the normal distribution are registered as The parameters of the normal distribution are registered as
learnable parameters in the neural network. learnable parameters in the neural network.
Parameters Parameters
---------- ----------
in_features in_features
@@ -24,8 +24,8 @@ class NoisyLinear(nn.Linear):
learn an additive bias. learn an additive bias.
Default: True Default: True
""" """
def __init__(self, in_features, out_features, sigma_init = 0.017, bias = True): def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
super(NoisyLinear, self).__init__(in_features, out_features, bias = bias) super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
# One of the parameters the network is going to tune is the # One of the parameters the network is going to tune is the
# standard deviation of the gaussian noise on the weights # standard deviation of the gaussian noise on the weights
self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init)) self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init))
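
Since the docstring presents NoisyLinear as a drop-in replacement for nn.Linear, a usage sketch might look like the following (the rltorch.network import path is assumed):

import torch
from rltorch.network import NoisyLinear  # import path assumed

layer = NoisyLinear(128, 4, sigma_init=0.017)
out = layer(torch.randn(1, 128))  # same call signature as nn.Linear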

View file

@@ -11,7 +11,7 @@ class TargetNetwork:
device device
The device to put the cloned parameters in. The device to put the cloned parameters in.
""" """
def __init__(self, network, device = None): def __init__(self, network, device=None):
self.model = network.model self.model = network.model
self.target_model = deepcopy(network.model) self.target_model = deepcopy(network.model)
if device is not None: if device is not None:
@@ -37,7 +37,8 @@ class TargetNetwork:
Parameters Parameters
---------- ----------
tau : number tau : number
A number between 0-1 which indicates the proportion of the originator and clone in the new clone. A number between 0-1 which indicates
the proportion of the originator and clone in the new clone.
""" """
assert isinstance(tau, float) assert isinstance(tau, float)
assert 0.0 < tau <= 1.0 assert 0.0 < tau <= 1.0
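
The tau parameter describes a soft (partial) synchronization: the clone becomes a tau-weighted mix of the live network and the old clone. A generic sketch of that update, not necessarily byte-for-byte what TargetNetwork does:

import copy
import torch
import torch.nn as nn

def partial_sync(model, target_model, tau):
    # new_clone = tau * originator + (1 - tau) * old_clone, parameter by parameter.
    assert 0.0 < tau <= 1.0
    for param, target_param in zip(model.parameters(), target_model.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

model = nn.Linear(4, 2)
target = copy.deepcopy(model)
partial_sync(model, target, tau=0.1)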

View file

@@ -1,5 +1,5 @@
from .ESNetwork import * from .ESNetwork import ESNetwork
from .ESNetworkMP import * from .ESNetworkMP import ESNetworkMP
from .Network import * from .Network import Network
from .NoisyLinear import * from .NoisyLinear import NoisyLinear
from .TargetNetwork import * from .TargetNetwork import TargetNetwork

View file

@@ -36,4 +36,3 @@ class ExponentialScheduler(Scheduler):
return self.initial_value * (self.base ** (self.current_iteration - 1)) return self.initial_value * (self.base ** (self.current_iteration - 1))
else: else:
return self.end_value return self.end_value
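
Until the scheduler switches over to end_value (that condition sits above this hunk), the returned value follows a plain geometric decay, value(t) = initial_value * base ** (t - 1):

# Illustrative numbers only; a base below 1 gives exponential decay per iteration.
initial_value, base = 1.0, 0.99
values = [initial_value * base ** (t - 1) for t in range(1, 6)]
# roughly [1.0, 0.99, 0.9801, 0.9703, 0.9606]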

View file

@@ -7,4 +7,4 @@ class Scheduler():
def __iter__(self): def __iter__(self):
return self return self
def __next__(self): def __next__(self):
raise NotImplementedError("Scheduler does not have it's function to create a value implemented") raise NotImplementedError("__next__ not implemented in Scheduler")
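
Since the base class only pins down the iterator protocol, a concrete scheduler just fills in __next__. A self-contained sketch (ConstantScheduler is hypothetical and not part of this diff; the base class is reduced to the two methods shown above so the snippet runs on its own):

class Scheduler:
    def __iter__(self):
        return self
    def __next__(self):
        raise NotImplementedError("__next__ not implemented in Scheduler")

class ConstantScheduler(Scheduler):
    # Hypothetical subclass: each training step can simply call next(schedule).
    def __init__(self, value):
        self.value = value
    def __next__(self):
        return self.value

epsilon = ConstantScheduler(0.1)
print(next(epsilon))  # 0.1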