PEP8 Conformance

This commit is contained in:
Brandon Rozek 2020-04-14 14:16:14 -04:00
parent 9b81188a77
commit 8fa4691511
29 changed files with 652 additions and 755 deletions

View file

@ -1,7 +1,7 @@
from random import randrange
import torch
class ArgMaxSelector:
def __init__(self, model, action_size, device = None):
def __init__(self, model, action_size, device=None):
self.model = model
self.action_size = action_size
self.device = device
@ -12,7 +12,8 @@ class ArgMaxSelector:
if self.device is not None:
state = state.to(self.device)
action_values = self.model(state).squeeze(0)
action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item()
action = self.random_act() if (action_values[0] == action_values).all() \
else action_values.argmax().item()
return action
def act(self, state):
return self.best_act(state)
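
A small standalone check of the tie-breaking behaviour in best_act above; the toy tensor values are an assumption, not taken from the library.

import torch
from random import randrange

# Toy check of the tie-breaking rule above: when every Q-value is identical,
# argmax would always return index 0, so a random action is chosen instead.
action_values = torch.tensor([0.5, 0.5, 0.5])
if (action_values[0] == action_values).all():
    action = randrange(len(action_values))   # break ties at random
else:
    action = action_values.argmax().item()
print(action)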

View file

@ -1,9 +1,10 @@
from .ArgMaxSelector import ArgMaxSelector
import numpy as np
import collections
import numpy as np
from .ArgMaxSelector import ArgMaxSelector
class EpsilonGreedySelector(ArgMaxSelector):
def __init__(self, model, action_size, device = None, epsilon = 0.1):
super(EpsilonGreedySelector, self).__init__(model, action_size, device = device)
def __init__(self, model, action_size, device=None, epsilon=0.1):
super(EpsilonGreedySelector, self).__init__(model, action_size, device=device)
self.epsilon = epsilon
# random_act is already implemented in ArgMaxSelector
# best_act is already implemented in ArgMaxSelector
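
The epsilon-greedy act override itself falls outside the hunk shown here; as a hedged sketch, the rule it implements reduces to a single coin flip per step. The function name and defaults below are illustrative only.

from random import random, randrange

def epsilon_greedy(best_action, action_size, epsilon=0.1):
    """Return best_action with probability 1 - epsilon, otherwise a uniformly random action."""
    return randrange(action_size) if random() < epsilon else best_action

print(epsilon_greedy(best_action=2, action_size=4, epsilon=0.1))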

View file

@ -1,8 +1,9 @@
from .ArgMaxSelector import ArgMaxSelector
import torch
from .ArgMaxSelector import ArgMaxSelector
class IdentitySelector(ArgMaxSelector):
def __init__(self, model, action_size, device = None):
super(IdentitySelector, self).__init__(model, action_size, device = device)
def __init__(self, model, action_size, device=None):
super(IdentitySelector, self).__init__(model, action_size, device=device)
# random_act is already implemented in ArgMaxSelector
def best_act(self, state):
with torch.no_grad():

View file

@ -1,10 +1,10 @@
from random import randrange
class RandomSelector():
class RandomSelector:
def __init__(self, action_size):
self.action_size = action_size
def random_act(self):
return randrange(action_size)
def best_act(self, state):
return randrange(self.action_size)
def best_act(self, _):
return self.random_act()
def act(self, state):
def act(self, _):
return self.random_act()

View file

@ -1,22 +1,19 @@
from random import randrange
import torch
from torch.distributions import Categorical
import rltorch
from rltorch.action_selector import ArgMaxSelector
from .ArgMaxSelector import ArgMaxSelector
from ..memory.EpisodeMemory import EpisodeMemory
class StochasticSelector(ArgMaxSelector):
def __init__(self, model, action_size, memory = None, device = None):
super(StochasticSelector, self).__init__(model, action_size, device = device)
def __init__(self, model, action_size, memory=None, device=None):
super(StochasticSelector, self).__init__(model, action_size, device=device)
self.model = model
self.action_size = action_size
self.device = device
self.memory = memory
def best_act(self, state, log_prob = True):
def best_act(self, state, log_prob=True):
if self.device is not None:
state = state.to(self.device)
action_probabilities = self.model(state)
distribution = Categorical(action_probabilities)
action = distribution.sample()
if log_prob and isinstance(self.memory, rltorch.memory.EpisodeMemory):
if log_prob and isinstance(self.memory, EpisodeMemory):
self.memory.append_log_probs(distribution.log_prob(action))
return action.item()
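
For readers unfamiliar with torch.distributions, a minimal standalone example of the sampling and log-probability bookkeeping used above; the probability values are toy assumptions.

import torch
from torch.distributions import Categorical

probs = torch.tensor([0.1, 0.2, 0.7])   # toy action probabilities from a policy net
dist = Categorical(probs)
action = dist.sample()                  # index drawn in proportion to probs
log_prob = dist.log_prob(action)        # what gets appended to EpisodeMemory above
print(action.item(), log_prob.item())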

View file

@ -1,5 +1,5 @@
from .ArgMaxSelector import *
from .EpsilonGreedySelector import *
from .IdentitySelector import *
from .RandomSelector import *
from .StochasticSelector import *
from .ArgMaxSelector import ArgMaxSelector
from .EpsilonGreedySelector import EpsilonGreedySelector
from .IdentitySelector import IdentitySelector
from .RandomSelector import RandomSelector
from .StochasticSelector import StochasticSelector

View file

@ -2,89 +2,91 @@ from copy import deepcopy
import numpy as np
import torch
import torch.nn.functional as F
import rltorch
import rltorch.memory as M
class A2CSingleAgent:
def __init__(self, policy_net, value_net, memory, config, logger = None):
self.policy_net = policy_net
self.value_net = value_net
self.memory = memory
self.config = deepcopy(config)
self.logger = logger
def __init__(self, policy_net, value_net, memory, config, logger=None):
self.policy_net = policy_net
self.value_net = value_net
self.memory = memory
self.config = deepcopy(config)
self.logger = logger
def _discount_rewards(self, rewards):
gammas = torch.ones_like(rewards)
if len(rewards) > 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
return gammas * rewards
def _discount_rewards(self, rewards):
gammas = torch.ones_like(rewards)
if len(rewards) > 1:
discount_tensor = torch.tensor(self.config['discount_rate'])
gammas[1:] = torch.cumprod(
discount_tensor.repeat(len(rewards) - 1),
dim=0
)
return gammas * rewards
# This function is currently not used since the performance gains haven't been shown
# May be due to a faulty implementation; need to investigate more.
def _generalized_advantage_estimation(self, states, rewards, next_states, not_done):
tradeoff = 0.5
with torch.no_grad():
next_values = torch.zeros_like(rewards)
next_values[not_done] = self.value_net(next_states[not_done]).squeeze(1)
values = self.value_net(states).squeeze(1)
# This function is currently not used since the performance gains haven't been shown
# May be due to a faulty implementation; need to investigate more.
def _generalized_advantage_estimation(self, states, rewards, next_states, not_done):
tradeoff = 0.5
with torch.no_grad():
next_values = torch.zeros_like(rewards)
next_values[not_done] = self.value_net(next_states[not_done]).squeeze(1)
values = self.value_net(states).squeeze(1)
generalized_advantages = torch.zeros_like(rewards)
for i in range(len(generalized_advantages)):
weights = torch.ones_like(rewards[i:])
if i != len(generalized_advantages) - 1:
weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0)
generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum()
generalized_advantages = torch.zeros_like(rewards)
discount_tensor = torch.tensor(self.config['discount_rate']) * tradeoff
for i, _ in enumerate(generalized_advantages):
weights = torch.ones_like(rewards[i:])
if i != len(generalized_advantages) - 1:
weights[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - i - 1), dim=0)
generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum()
return generalized_advantages
return generalized_advantages
def learn(self):
episode_batch = self.memory.recall()
state_batch, _, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
# Send batches to the appropriate device
state_batch = torch.cat(state_batch).to(self.value_net.device)
reward_batch = torch.tensor(reward_batch).to(self.value_net.device).float()
not_done_batch = ~torch.tensor(done_batch).to(self.value_net.device)
next_state_batch = torch.cat(next_state_batch).to(self.value_net.device)
log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)
## Value Loss
# In A2C, the value loss is the difference between the discounted reward
# and the value from the first state.
# The value of the first state is supposed to tell us
# the expected reward from the current policy of the whole episode
discounted_reward = self._discount_rewards(reward_batch)
observed_value = discounted_reward.sum()
value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0]))
self.value_net.zero_grad()
value_loss.backward()
self.value_net.step()
## Policy Loss
# Increase probabilities of advantageous states
# and decrease the probabilities of non-advantageous ones
with torch.no_grad():
state_values = self.value_net(state_batch)
next_state_values = torch.zeros_like(state_values)
next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values
advantages = advantages.squeeze(1)
# advantages = self._generalized_advantage_estimation(state_batch, reward_batch, next_state_batch, not_done_batch)
# Scale for more stable learning
advantages = advantages / (advantages.std() + np.finfo('float').eps)
policy_loss = (-log_prob_batch * advantages).sum()
if self.logger is not None:
self.logger.append("Loss/Policy", policy_loss.item())
self.logger.append("Loss/Value", value_loss.item())
def learn(self):
episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
# Send batches to the appropriate device
state_batch = torch.cat(state_batch).to(self.value_net.device)
reward_batch = torch.tensor(reward_batch).to(self.value_net.device).float()
not_done_batch = ~torch.tensor(done_batch).to(self.value_net.device)
next_state_batch = torch.cat(next_state_batch).to(self.value_net.device)
log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)
## Value Loss
# In A2C, the value loss is the difference between the discounted reward and the value from the first state
# The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
discounted_reward = self._discount_rewards(reward_batch)
observed_value = discounted_reward.sum()
value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0]))
self.value_net.zero_grad()
value_loss.backward()
self.value_net.step()
## Policy Loss
# Increase probabilities of advantageous states
# and decrease the probabilities of non-advantageous ones
with torch.no_grad():
state_values = self.value_net(state_batch)
next_state_values = torch.zeros_like(state_values)
next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values
advantages = advantages.squeeze(1)
# advantages = self._generalized_advantage_estimation(state_batch, reward_batch, next_state_batch, not_done_batch)
# Scale for more stable learning
advantages = advantages / (advantages.std() + np.finfo('float').eps)
policy_loss = (-log_prob_batch * advantages).sum()
if self.logger is not None:
self.logger.append("Loss/Policy", policy_loss.item())
self.logger.append("Loss/Value", value_loss.item())
self.policy_net.zero_grad()
policy_loss.backward()
self.policy_net.step()
# Memory under the old policy is not needed for future training
self.memory.clear()
self.policy_net.zero_grad()
policy_loss.backward()
self.policy_net.step()
# Memory under the old policy is not needed for future training
self.memory.clear()
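
A quick standalone check of the cumprod-based discounting used by _discount_rewards above; the reward values and discount rate are toy assumptions.

import torch

rewards = torch.tensor([1.0, 1.0, 1.0, 1.0])
discount_rate = 0.9
gammas = torch.ones_like(rewards)
gammas[1:] = torch.cumprod(torch.tensor(discount_rate).repeat(len(rewards) - 1), dim=0)
print(gammas)                    # tensor([1.0000, 0.9000, 0.8100, 0.7290])
print((gammas * rewards).sum())  # discounted return: 1 + 0.9 + 0.81 + 0.729 = 3.439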

View file

@ -1,13 +1,11 @@
import collections
from copy import deepcopy
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path
class DQNAgent:
def __init__(self, net , memory, config, target_net = None, logger = None):
def __init__(self, net, memory, config, target_net=None, logger=None):
self.net = net
self.target_net = target_net
self.memory = memory
@ -20,16 +18,16 @@ class DQNAgent:
self.net.model.to(self.net.device)
self.target_net.sync()
def learn(self, logger = None):
def learn(self, logger=None):
if len(self.memory) < self.config['batch_size']:
return
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
minibatch = self.memory.sample(self.config['batch_size'], beta = beta)
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
minibatch = self.memory.sample(self.config['batch_size'], beta=beta)
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
else:
minibatch = self.memory.sample(self.config['batch_size'])
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)
@ -49,7 +47,7 @@ class DQNAgent:
# and the regular net to select the action
# That way we decouple the value and action selecting processes (DOUBLE DQN)
not_done_size = not_done_batch.sum()
next_state_values = torch.zeros_like(state_values, device = self.net.device)
next_state_values = torch.zeros_like(state_values, device=self.net.device)
if self.target_net is not None:
next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch])
next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1)
@ -57,15 +55,15 @@ class DQNAgent:
next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch])
next_best_action = next_state_values[not_done_batch].argmax(1)
best_next_state_value = torch.zeros(self.config['batch_size'], device = self.net.device)
best_next_state_value = torch.zeros(self.config['batch_size'], device=self.net.device)
best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
# If we're sampling by TD error, multiply loss by an importance weight which helps decrease overfitting
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
# loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.smooth_l1_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
loss = (torch.as_tensor(importance_weights, device = self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
loss = (torch.as_tensor(importance_weights, device=self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
else:
# loss = F.smooth_l1_loss(obtained_values, expected_values)
loss = F.mse_loss(obtained_values, expected_values)
@ -85,8 +83,6 @@ class DQNAgent:
self.target_net.sync()
# If we're sampling by TD error, readjust the weights of the experiences
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
td_error = (obtained_values - expected_values).detach().abs()
self.memory.update_priorities(batch_indexes, td_error)
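
The target computation above decouples action selection (online net) from action evaluation (target net). A toy, network-free sketch of that Double DQN target, with made-up Q-values standing in for the network outputs:

import torch

online_q_next = torch.tensor([[1.0, 3.0], [2.0, 0.5]])   # stand-in for self.net(next_states)
target_q_next = torch.tensor([[0.9, 2.5], [1.8, 0.4]])   # stand-in for self.target_net(next_states)
rewards = torch.tensor([1.0, 0.0])
not_done = torch.tensor([True, True])
discount_rate = 0.99

best_action = online_q_next.argmax(1)                                       # chosen by the online net
best_value = target_q_next.gather(1, best_action.unsqueeze(1)).squeeze(1)   # evaluated by the target net
expected_values = rewards + discount_rate * best_value * not_done
print(expected_values)   # tensor([3.4750, 1.7820])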

View file

@ -1,14 +1,12 @@
import collections
from copy import deepcopy
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path
from rltorch.action_selector import ArgMaxSelector
class DQfDAgent:
def __init__(self, net, memory, config, target_net = None, logger = None):
def __init__(self, net, memory, config, target_net=None, logger=None):
self.net = net
self.target_net = target_net
self.memory = memory
@ -21,7 +19,7 @@ class DQfDAgent:
self.net.model.to(self.net.device)
self.target_net.sync()
def learn(self, logger = None):
def learn(self, logger=None):
if len(self.memory) < self.config['batch_size']:
return
@ -32,29 +30,19 @@ class DQfDAgent:
batch_size = self.config['batch_size']
steps = None
if isinstance(self.memory, M.DQfDMemory):
weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
# Check to see if we are doing N-Step DQN
if steps is not None:
minibatch = self.memory.sample_n_steps(batch_size, steps, beta)
else:
minibatch = self.memory.sample(batch_size, beta = beta)
# Process batch
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) \
else weight_importance
# Check to see if we are doing N-Step DQN
if steps is not None:
minibatch = self.memory.sample_n_steps(batch_size, steps, beta)
else:
# Check to see if we're doing N-Step DQN
if steps is not None:
minibatch = self.memory.sample_n_steps(batch_size, steps)
else:
minibatch = self.memory.sample(batch_size)
minibatch = self.memory.sample(batch_size, beta=beta)
# Process batch
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, batch_indexes = M.zip_batch(minibatch, want_indices = True)
# Process batch
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
batch_index_tensors = torch.tensor(batch_indexes)
demo_mask = batch_index_tensors < self.memory.demo_position
@ -75,7 +63,7 @@ class DQfDAgent:
# and the regular net to select the action
# That way we decouple the value and action selecting processes (DOUBLE DQN)
not_done_size = not_done_batch.sum()
next_state_values = torch.zeros_like(state_values, device = self.net.device)
next_state_values = torch.zeros_like(state_values, device=self.net.device)
if self.target_net is not None:
next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch])
next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1)
@ -83,14 +71,14 @@ class DQfDAgent:
next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch])
next_best_action = next_state_values[not_done_batch].argmax(1)
best_next_state_value = torch.zeros(batch_size, device = self.net.device)
best_next_state_value = torch.zeros(batch_size, device=self.net.device)
best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
expected_values = (reward_batch + (batch_size * best_next_state_value)).unsqueeze(1)
# N-Step DQN Loss
# num_steps captures how many steps actually exist before the end of the episode
if steps != None:
if steps is not None:
expected_n_step_values = []
with torch.no_grad():
for i in range(0, len(state_batch), steps):
@ -127,7 +115,7 @@ class DQfDAgent:
l = torch.ones_like(state_values[demo_mask])
expert_actions = action_batch[demo_mask]
# l(s, a) is zero for every action the expert doesn't take
for i,a in zip(range(len(l)), expert_actions):
for i, a in enumerate(expert_actions):
l[i].fill_(0.8) # According to paper
l[i, a] = 0
if self.target_net is not None:
@ -148,26 +136,17 @@ class DQfDAgent:
# Since dqn_loss and demo_loss are different sizes, the reduction has to happen before they are combined
if isinstance(self.memory, M.DQfDMemory):
dqn_loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.mse_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
else:
dqn_loss = F.mse_loss(obtained_values, expected_values)
dqn_loss = (torch.as_tensor(importance_weights, device=self.net.device) * F.mse_loss(obtained_values, expected_values, reduction='none').squeeze(1)).mean()
if steps != None:
if isinstance(self.memory, M.DQfDMemory):
dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device = self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none')).mean()
else:
dqn_n_step_loss = F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none').mean()
if steps is not None:
dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device=self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction='none')).mean()
else:
dqn_n_step_loss = torch.tensor(0, device = self.net.device)
dqn_n_step_loss = torch.tensor(0, device=self.net.device)
if demo_mask.sum() > 0:
if isinstance(self.memory, M.DQfDMemory):
demo_loss = (torch.as_tensor(importance_weights, device = self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1)).mean()
else:
demo_loss = F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1).mean()
demo_loss = (torch.as_tensor(importance_weights, device=self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction='none').squeeze(1)).mean()
else:
demo_loss = 0.
demo_loss = 0
loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss
if self.logger is not None:
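
The l(s, a) tensor above implements the large-margin supervised term from DQfD: every non-expert action gets a fixed margin (0.8 in this code) added to its Q-value before taking the max. A toy illustration with assumed numbers:

import torch

q_values = torch.tensor([[1.0, 1.5, 0.2]])   # Q(s, .) for one demonstration state
expert_action = 0

margin = torch.full_like(q_values, 0.8)      # 0.8 for every non-expert action, per the code above
margin[0, expert_action] = 0.0
supervised_term = (q_values + margin).max(1)[0] - q_values[0, expert_action]
print(supervised_term)   # tensor([1.3000]); zero only if the expert action already dominates by the margin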

View file

@ -1,81 +1,72 @@
from copy import deepcopy
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
import collections
import random
class PPOAgent:
def __init__(self, policy_net, value_net, memory, config, logger = None):
self.policy_net = policy_net
self.old_policy_net = rltorch.network.TargetNetwork(policy_net)
self.value_net = value_net
self.memory = memory
self.config = deepcopy(config)
self.logger = logger
def __init__(self, policy_net, value_net, memory, config, logger=None):
self.policy_net = policy_net
self.old_policy_net = rltorch.network.TargetNetwork(policy_net)
self.value_net = value_net
self.memory = memory
self.config = deepcopy(config)
self.logger = logger
def _discount_rewards(self, rewards):
gammas = torch.ones_like(rewards)
if len(rewards) > 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
return gammas * rewards
def _discount_rewards(self, rewards):
gammas = torch.ones_like(rewards)
if len(rewards) > 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim=0)
return gammas * rewards
def learn(self):
episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
def learn(self):
episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
# Send batches to the appropriate device
state_batch = torch.cat(state_batch).to(self.value_net.device)
action_batch = torch.tensor(action_batch).to(self.value_net.device)
reward_batch = torch.tensor(reward_batch).to(self.value_net.device).float()
not_done_batch = ~torch.tensor(done_batch).to(self.value_net.device)
next_state_batch = torch.cat(next_state_batch).to(self.value_net.device)
log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)
# Send batches to the appropriate device
state_batch = torch.cat(state_batch).to(self.value_net.device)
action_batch = torch.tensor(action_batch).to(self.value_net.device)
reward_batch = torch.tensor(reward_batch).to(self.value_net.device).float()
not_done_batch = ~torch.tensor(done_batch).to(self.value_net.device)
next_state_batch = torch.cat(next_state_batch).to(self.value_net.device)
log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)
## Value Loss
# In PPO, the value loss is the difference between the discounted reward and the value from the first state
# The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
value_loss = F.mse_loss(self._discount_rewards(reward_batch).sum(), self.value_net(state_batch[0]))
self.value_net.zero_grad()
value_loss.backward()
self.value_net.step()
## Value Loss
# In PPO, the value loss is the difference between the discounted reward and the value from the first state
# The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
value_loss = F.mse_loss(self._discount_rewards(reward_batch).sum(), self.value_net(state_batch[0]))
self.value_net.zero_grad()
value_loss.backward()
self.value_net.step()
## Policy Loss
# Increase probabilities of advantageous states
# and decrease the probabilities of non-advantageous ones
with torch.no_grad():
state_values = self.value_net(state_batch)
next_state_values = torch.zeros_like(state_values)
next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values
advantages = advantages.squeeze(1)
## Policy Loss
# Increase probabilities of advantageous states
# and decrease the probabilities of non-advantageous ones
with torch.no_grad():
state_values = self.value_net(state_batch)
next_state_values = torch.zeros_like(state_values)
next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values
advantages = advantages.squeeze(1)
action_probabilities = self.old_policy_net(state_batch).detach()
distributions = list(map(Categorical, action_probabilities))
old_log_probs = torch.stack(list(map(lambda distribution, action: distribution.log_prob(action), distributions, action_batch)))
action_probabilities = self.old_policy_net(state_batch).detach()
distributions = list(map(Categorical, action_probabilities))
old_log_probs = torch.stack(list(map(lambda distribution, action: distribution.log_prob(action), distributions, action_batch)))
# For PPO we want to stay within a certain ratio of the old policy
policy_ratio = torch.exp(log_prob_batch - old_log_probs) # Equivalent to (prob / old_prob)
policy_loss1 = policy_ratio * advantages
policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages # From original paper
policy_loss = -torch.min(policy_loss1, policy_loss2).sum()
# For PPO we want to stay within a certain ratio of the old policy
policy_ratio = torch.exp(log_prob_batch - old_log_probs) # Equivalent to (prob / old_prob)
policy_loss1 = policy_ratio * advantages
policy_loss2 = policy_ratio.clamp(min = 0.8, max = 1.2) * advantages # From original paper
policy_loss = -torch.min(policy_loss1, policy_loss2).sum()
if self.logger is not None:
self.logger.append("Loss/Policy", policy_loss.item())
self.logger.append("Loss/Value", value_loss.item())
self.old_policy_net.sync()
self.policy_net.zero_grad()
policy_loss.backward()
self.policy_net.step()
# Memory under the old policy is not needed for future training
self.memory.clear()
if self.logger is not None:
self.logger.append("Loss/Policy", policy_loss.item())
self.logger.append("Loss/Value", value_loss.item())
self.old_policy_net.sync()
self.policy_net.zero_grad()
policy_loss.backward()
self.policy_net.step()
# Memory under the old policy is not needed for future training
self.memory.clear()
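
A self-contained toy version of the clipped surrogate objective computed above, with assumed log-probabilities and advantages:

import torch

log_prob = torch.tensor([-0.1, -2.0])        # under the current policy
old_log_prob = torch.tensor([-0.5, -0.5])    # under the policy that collected the data
advantages = torch.tensor([1.0, -1.0])

ratio = torch.exp(log_prob - old_log_prob)              # pi_new / pi_old
unclipped = ratio * advantages
clipped = ratio.clamp(min=0.8, max=1.2) * advantages    # trust region from the original paper
policy_loss = -torch.min(unclipped, clipped).sum()
print(ratio, policy_loss)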

View file

@ -2,16 +2,17 @@ from copy import deepcopy
import collections
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
import torch.nn.functional as F
# Q-Evolutionary Policy Agent
# Maximizes the policy with respect to the Q-Value function.
# Since the function is non-differentiable, it depends on the Evolutionary Strategy algorithm
class QEPAgent:
def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None, entropy_importance = 0, policy_skip = 4, after_value_train = None):
def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4):
self.policy_net = policy_net
assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP)
self.policy_net.fitness = self.fitness
@ -22,7 +23,6 @@ class QEPAgent:
self.logger = logger
self.policy_skip = policy_skip
self.entropy_importance = entropy_importance
self.after_value_train = after_value_train
def save(self, file_location):
torch.save({
@ -42,10 +42,8 @@ class QEPAgent:
batch_size = len(state_batch)
with torch.no_grad():
action_probabilities = policy_net(state_batch)
action_size = action_probabilities.shape[1]
distributions = list(map(Categorical, action_probabilities))
actions = torch.stack([d.sample() for d in distributions])
with torch.no_grad():
@ -54,31 +52,31 @@ class QEPAgent:
# Weird hacky solution where in multiprocess, it sometimes spits out nans
# So have it try again
while torch.isnan(state_values).any():
print("NAN DETECTED")
with torch.no_grad():
state_values = value_net(state_batch)
obtained_values = state_values.gather(1, actions.view(batch_size, 1)).squeeze(1)
obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1)
# return -obtained_values.mean().item()
entropy_importance = 0 # Entropy accounting for 1% of loss seems to work well
entropy_importance = next(self.entropy_importance) if isinstance(self.entropy_importance, collections.Iterable) else self.entropy_importance
value_importance = 1 - entropy_importance
# entropy_loss = (action_probabilities * torch.log2(action_probabilities)).sum(1) # Standard entropy loss from information theory
entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device = state_batch.device).repeat(batch_size, action_size)).abs().sum(1)
entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device=state_batch.device).repeat(len(state_batch), action_size)).abs().sum(1)
return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item()
def learn(self, logger = None):
def learn(self, logger=None):
if len(self.memory) < self.config['batch_size']:
return
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
minibatch = self.memory.sample(self.config['batch_size'], beta = beta)
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
minibatch = self.memory.sample(self.config['batch_size'], beta=beta)
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
else:
minibatch = self.memory.sample(self.config['batch_size'])
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)
@ -98,7 +96,7 @@ class QEPAgent:
# and the regular net to select the action
# That way we decouple the value and action selecting processes (DOUBLE DQN)
not_done_size = not_done_batch.sum()
next_state_values = torch.zeros_like(state_values, device = self.value_net.device)
next_state_values = torch.zeros_like(state_values, device=self.value_net.device)
if self.target_value_net is not None:
next_state_values[not_done_batch] = self.target_value_net(next_state_batch[not_done_batch])
next_best_action = self.value_net(next_state_batch[not_done_batch]).argmax(1)
@ -106,13 +104,13 @@ class QEPAgent:
next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
next_best_action = next_state_values[not_done_batch].argmax(1)
best_next_state_value = torch.zeros(self.config['batch_size'], device = self.value_net.device)
best_next_state_value = torch.zeros(self.config['batch_size'], device=self.value_net.device)
best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
value_loss = (torch.as_tensor(importance_weights, device = self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
if isinstance(self.memory, M.PrioritizedReplayMemory):
value_loss = (torch.as_tensor(importance_weights, device=self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
else:
value_loss = F.mse_loss(obtained_values, expected_values)
@ -124,28 +122,26 @@ class QEPAgent:
self.value_net.clamp_gradients()
self.value_net.step()
if callable(self.after_value_train):
self.after_value_train()
if self.target_value_net is not None:
if 'target_sync_tau' in self.config:
self.target_value_net.partial_sync(self.config['target_sync_tau'])
else:
self.target_value_net.sync()
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
td_error = (obtained_values - expected_values).detach().abs()
self.memory.update_priorities(batch_indexes, td_error)
## Policy Training
if self.policy_skip > 0:
self.policy_skip -= 1
return
self.policy_skip = self.config['policy_skip']
self.policy_skip -= 1
return
self.policy_skip = 4
if self.target_value_net is not None:
self.policy_net.calc_gradients(self.target_value_net, state_batch)
self.policy_net.calc_gradients(self.target_value_net, state_batch)
else:
self.policy_net.calc_gradients(self.value_net, state_batch)
self.policy_net.calc_gradients(self.value_net, state_batch)
self.policy_net.step()
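
The fitness method that the evolutionary strategy optimizes is defined earlier in the file; as a rough sketch of its shape, with toy tensors standing in for the policy and value networks (the 0.01 entropy weight is an assumption):

import torch

action_probs = torch.tensor([[0.7, 0.2, 0.1]])   # stand-in for policy_net(state_batch)
q_values = torch.tensor([[1.0, 0.5, 0.2]])       # stand-in for value_net(state_batch)
actions = torch.tensor([0])                      # sampled from Categorical(action_probs)

obtained_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
action_size = action_probs.shape[1]
entropy_loss = (action_probs - 1.0 / action_size).abs().sum(1)   # distance from uniform, as above
entropy_importance = 0.01
fitness = (entropy_importance * entropy_loss
           - (1 - entropy_importance) * obtained_values).mean()
print(fitness)   # the sign convention above implies lower fitness is better: high Q-values lower it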

View file

@ -1,60 +1,60 @@
import rltorch
from copy import deepcopy
import torch
import numpy as np
import torch
import rltorch
class REINFORCEAgent:
def __init__(self, net , memory, config, target_net = None, logger = None):
self.net = net
if not isinstance(memory, rltorch.memory.EpisodeMemory):
raise ValueError("Memory must be of instance EpisodeMemory")
self.memory = memory
self.config = deepcopy(config)
self.target_net = target_net
self.logger = logger
def __init__(self, net, memory, config, target_net=None, logger=None):
self.net = net
if not isinstance(memory, rltorch.memory.EpisodeMemory):
raise ValueError("Memory must be of instance EpisodeMemory")
self.memory = memory
self.config = deepcopy(config)
self.target_net = target_net
self.logger = logger
# Shaped rewards implements three improvements to REINFORCE
# 1) Discounted rewards, future rewards matter less than current
# 2) Baselines: We use the mean reward to see if the current reward is advantageous or not
# 3) Causality: Your current actions do not affect your past. Only the present and future.
def _shape_rewards(self, rewards):
shaped_rewards = torch.zeros_like(rewards)
baseline = rewards.mean()
for i in range(len(rewards)):
gammas = torch.ones_like(rewards[i:])
if i != len(rewards) - 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim = 0)
advantages = rewards[i:] - baseline
shaped_rewards[i] = (gammas * advantages).sum()
return shaped_rewards
# Shaped rewards implements three improvements to REINFORCE
# 1) Discounted rewards, future rewards matter less than current
# 2) Baselines: We use the mean reward to see if the current reward is advantageous or not
# 3) Causality: Your current actions do not affect your past. Only the present and future.
def _shape_rewards(self, rewards):
shaped_rewards = torch.zeros_like(rewards)
baseline = rewards.mean()
for i in range(len(rewards)):
gammas = torch.ones_like(rewards[i:])
if i != len(rewards) - 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim=0)
advantages = rewards[i:] - baseline
shaped_rewards[i] = (gammas * advantages).sum()
return shaped_rewards
def learn(self):
episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
def learn(self):
episode_batch = self.memory.recall()
_, _, reward_batch, _, _, log_prob_batch = zip(*episode_batch)
# Calculate discounted rewards to place more importance on recent rewards
shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch))
# Calculate discounted rewards to place more importance on recent rewards
shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch))
# Scale discounted rewards to have variance 1 (stabilizes training)
shaped_reward_batch = shaped_reward_batch / (shaped_reward_batch.std() + np.finfo('float').eps)
# Scale discounted rewards to have variance 1 (stabilizes training)
shaped_reward_batch = shaped_reward_batch / (shaped_reward_batch.std() + np.finfo('float').eps)
log_prob_batch = torch.cat(log_prob_batch)
log_prob_batch = torch.cat(log_prob_batch)
policy_loss = (-log_prob_batch * shaped_reward_batch).sum()
policy_loss = (-log_prob_batch * shaped_reward_batch).sum()
if self.logger is not None:
if self.logger is not None:
self.logger.append("Loss", policy_loss.item())
self.net.zero_grad()
policy_loss.backward()
self.net.clamp_gradients()
self.net.step()
self.net.zero_grad()
policy_loss.backward()
self.net.clamp_gradients()
self.net.step()
if self.target_net is not None:
if 'target_sync_tau' in self.config:
self.target_net.partial_sync(self.config['target_sync_tau'])
else:
self.target_net.sync()
if self.target_net is not None:
if 'target_sync_tau' in self.config:
self.target_net.partial_sync(self.config['target_sync_tau'])
else:
self.target_net.sync()
# Memory under the old policy is not needed for future training
self.memory.clear()
# Memory under the old policy is not needed for future training
self.memory.clear()
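
A worked, standalone version of the reward shaping above for a three-step toy episode (reward values and discount rate are assumptions), showing discounting, the mean baseline, and causality:

import torch

rewards = torch.tensor([1.0, 0.0, 2.0])
discount_rate = 0.9
baseline = rewards.mean()                       # 1.0
shaped = torch.zeros_like(rewards)
for i in range(len(rewards)):
    gammas = torch.ones_like(rewards[i:])
    if i != len(rewards) - 1:
        gammas[1:] = torch.cumprod(torch.tensor(discount_rate).repeat(len(rewards) - i - 1), dim=0)
    # Only rewards from step i onward count (causality), each compared against the baseline
    shaped[i] = (gammas * (rewards[i:] - baseline)).sum()
print(shaped)   # tensor([-0.0900, -0.1000,  1.0000])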

View file

@ -1,6 +1,6 @@
from .A2CSingleAgent import *
from .DQNAgent import *
from .DQfDAgent import *
from .PPOAgent import *
from .QEPAgent import *
from .REINFORCEAgent import *
from .A2CSingleAgent import A2CSingleAgent
from .DQNAgent import DQNAgent
from .DQfDAgent import DQfDAgent
from .PPOAgent import PPOAgent
from .QEPAgent import QEPAgent
from .REINFORCEAgent import REINFORCEAgent

View file

@ -1,108 +1,108 @@
from copy import deepcopy
import rltorch
import time
import rltorch
def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger = None, name = "", render = False):
for episode in range(total_episodes):
state = env.reset()
done = False
episode_reward = 0
while not done:
action = actor.act(state)
next_state, reward, done, _ = env.step(action)
if render:
env.render()
time.sleep(0.01)
def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False):
for episode in range(total_episodes):
state = env.reset()
done = False
episode_reward = 0
while not done:
action = actor.act(state)
next_state, reward, done, _ = env.step(action)
if render:
env.render()
time.sleep(0.01)
episode_reward = episode_reward + reward
if memory is not None:
memory.append(state, action, reward, next_state, done)
state = next_state
episode_reward = episode_reward + reward
if memory is not None:
memory.append(state, action, reward, next_state, done)
state = next_state
if episode % config['print_stat_n_eps'] == 0:
print("episode: {}/{}, score: {}"
.format(episode, total_episodes, episode_reward), flush=True)
if episode % config['print_stat_n_eps'] == 0:
print("episode: {}/{}, score: {}"
.format(episode, total_episodes, episode_reward), flush=True)
if logger is not None:
logger.append(name + '/EpisodeReward', episode_reward)
if logger is not None:
logger.append(name + '/EpisodeReward', episode_reward)
class EnvironmentRunSync():
def __init__(self, env, actor, config, memory = None, logwriter = None, name = "", render = False):
self.env = env
self.name = name
self.actor = actor
self.config = deepcopy(config)
self.logwriter = logwriter
self.memory = memory
self.episode_num = 1
self.episode_reward = 0
self.last_state = env.reset()
self.render = render
class EnvironmentRunSync:
def __init__(self, env, actor, config, memory=None, logwriter=None, name="", render=False):
self.env = env
self.name = name
self.actor = actor
self.config = deepcopy(config)
self.logwriter = logwriter
self.memory = memory
self.episode_num = 1
self.episode_reward = 0
self.last_state = env.reset()
self.render = render
def run(self, iterations):
state = self.last_state
logger = rltorch.log.Logger() if self.logwriter is not None else None
for _ in range(iterations):
action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action)
if self.render:
self.env.render()
def run(self, iterations):
state = self.last_state
logger = rltorch.log.Logger() if self.logwriter is not None else None
for _ in range(iterations):
action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action)
if self.render:
self.env.render()
self.episode_reward += reward
if self.memory is not None:
self.memory.append(state, action, reward, next_state, done)
self.episode_reward += reward
if self.memory is not None:
self.memory.append(state, action, reward, next_state, done)
state = next_state
state = next_state
if done:
if self.episode_num % self.config['print_stat_n_eps'] == 0:
print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], self.episode_reward), flush=True)
if self.logwriter is not None:
logger.append(self.name + '/EpisodeReward', self.episode_reward)
self.episode_reward = 0
state = self.env.reset()
self.episode_num += 1
if self.logwriter is not None:
self.logwriter.write(logger)
self.last_state = state
class EnvironmentEpisodeSync:
def __init__(self, env, actor, config, memory=None, logwriter=None, name=""):
self.env = env
self.name = name
self.actor = actor
self.config = deepcopy(config)
self.logwriter = logwriter
self.memory = memory
self.episode_num = 1
def run(self):
state = self.env.reset()
done = False
episodeReward = 0
logger = rltorch.log.Logger() if self.logwriter is not None else None
while not done:
action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action)
episodeReward += reward
if self.memory is not None:
self.memory.append(state, action, reward, next_state, done)
state = next_state
if done:
if self.episode_num % self.config['print_stat_n_eps'] == 0:
print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], self.episode_reward), flush=True)
print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], episodeReward), flush=True)
if self.logwriter is not None:
logger.append(self.name + '/EpisodeReward', self.episode_reward)
self.episode_reward = 0
state = self.env.reset()
self.episode_num += 1
logger.append(self.name + '/EpisodeReward', episodeReward)
self.logwriter.write(logger)
if self.logwriter is not None:
self.logwriter.write(logger)
self.last_state = state
class EnvironmentEpisodeSync():
def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""):
self.env = env
self.name = name
self.actor = actor
self.config = deepcopy(config)
self.logwriter = logwriter
self.memory = memory
self.episode_num = 1
def run(self):
state = self.env.reset()
done = False
episodeReward = 0
logger = rltorch.log.Logger() if self.logwriter is not None else None
while not done:
action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action)
episodeReward += reward
if self.memory is not None:
self.memory.append(state, action, reward, next_state, done)
state = next_state
if self.episode_num % self.config['print_stat_n_eps'] == 0:
print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], episodeReward), flush=True)
if self.logwriter is not None:
logger.append(self.name + '/EpisodeReward', episodeReward)
self.logwriter.write(logger)
self.episode_num += 1
self.episode_num += 1

View file

@ -1,8 +1,8 @@
from collections import deque
import gym
import torch
from gym import spaces
import cv2
from collections import deque
import numpy as np
class EpisodicLifeEnv(gym.Wrapper):
@ -111,129 +111,134 @@ class ClippedRewardsWrapper(gym.RewardWrapper):
# Mostly derived from OpenAI baselines
class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset for environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def __init__(self, env):
"""Take action on reset for environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
def reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
def step(self, ac):
return self.env.step(ac)
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
self._out = None
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
self._out = None
def _force(self):
if self._out is None:
self._out = torch.stack(self._frames)
self._frames = None
return self._out
def _force(self):
if self._out is None:
self._out = torch.stack(self._frames)
self._frames = None
return self._out
def __array__(self, dtype=None):
out = self._force()
if dtype is not None:
out = out.astype(dtype)
return out
def __array__(self, dtype=None):
out = self._force()
if dtype is not None:
out = out.astype(dtype)
return out
def __len__(self):
return len(self._force())
def __len__(self):
return len(self._force())
def __getitem__(self, i):
return self._force()[i]
def __getitem__(self, i):
return self._force()[i]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(
low=0,
high=255,
shape=(shp[:-1] + (shp[-1] * k,)),
dtype=env.observation_space.dtype
)
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
# return LazyFrames(list(self.frames))
return torch.cat(list(self.frames)).unsqueeze(0)
def _get_ob(self):
assert len(self.frames) == self.k
# return LazyFrames(list(self.frames))
return torch.cat(list(self.frames)).unsqueeze(0)
class ProcessFrame(gym.Wrapper):
def __init__(self, env, resize_shape = None, crop_bounds = None, grayscale = False):
gym.Wrapper.__init__(self, env)
self.resize_shape = resize_shape
self.crop_bounds = crop_bounds
self.grayscale = grayscale
def __init__(self, env, resize_shape=None, crop_bounds=None, grayscale=False):
gym.Wrapper.__init__(self, env)
self.resize_shape = resize_shape
self.crop_bounds = crop_bounds
self.grayscale = grayscale
def reset(self):
return self._preprocess(self.env.reset())
def reset(self):
return self._preprocess(self.env.reset())
def step(self, action):
next_state, reward, done, info = self.env.step(action)
next_state = self._preprocess(next_state)
return next_state, reward, done, info
def _preprocess(self, frame):
if self.grayscale:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
if self.crop_bounds is not None and len(self.crop_bounds) == 4:
frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]]
if self.resize_shape is not None and len(self.resize_shape) == 2:
frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA)
# Normalize
frame = frame / 255
return frame
def step(self, action):
next_state, reward, done, info = self.env.step(action)
next_state = self._preprocess(next_state)
return next_state, reward, done, info
def _preprocess(self, frame):
if self.grayscale:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
if self.crop_bounds is not None and len(self.crop_bounds) == 4:
frame = frame[
self.crop_bounds[0]:self.crop_bounds[1],
self.crop_bounds[2]:self.crop_bounds[3]
]
if self.resize_shape is not None and len(self.resize_shape) == 2:
frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA)
# Normalize
frame = frame / 255
return frame
# Turns observations into torch tensors
# Adds an additional dimension that's supposed to represent the batch dim
class TorchWrap(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
def __init__(self, env):
gym.Wrapper.__init__(self, env)
def reset(self):
return self._convert(self.env.reset())
def step(self, action):
next_state, reward, done, info = self.env.step(action)
next_state = self._convert(next_state)
return next_state, reward, done, info
def _convert(self, frame):
frame = torch.from_numpy(frame).unsqueeze(0).float()
return frame
def reset(self):
return self._convert(self.env.reset())
def step(self, action):
next_state, reward, done, info = self.env.step(action)
next_state = self._convert(next_state)
return next_state, reward, done, info
def _convert(self, frame):
frame = torch.from_numpy(frame).unsqueeze(0).float()
return frame
class ProcessFrame84(gym.ObservationWrapper):
def __init__(self, env=None):
@ -256,4 +261,3 @@ class ProcessFrame84(gym.ObservationWrapper):
x_t = resized_screen[18:102, :]
x_t = np.reshape(x_t, [84, 84])
return x_t.astype(np.uint8)
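
The wrappers above are meant to be composed; the following is a hedged usage sketch. The environment id, crop bounds, and resize shape are illustrative assumptions, and the wrapper classes are taken to be in scope since they are defined in this file. TorchWrap is applied before FrameStack because FrameStack's _get_ob concatenates torch tensors.

import gym

def make_env(name="Pong-v0"):
    env = gym.make(name)
    env = ProcessFrame(env, resize_shape=(80, 80),
                       crop_bounds=[34, 194, 0, 160], grayscale=True)
    env = TorchWrap(env)       # numpy frame -> (1, H, W) float tensor
    env = FrameStack(env, 4)   # stack the last 4 frames for the model
    return env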

View file

@ -1,13 +1,13 @@
from .PrioritizedReplayMemory import PrioritizedReplayMemory
from collections import namedtuple
import numpy as np
from .PrioritizedReplayMemory import PrioritizedReplayMemory
Transition = namedtuple('Transition',
('state', 'action', 'reward', 'next_state', 'done'))
class DQfDMemory(PrioritizedReplayMemory):
def __init__(self, capacity, alpha, max_demo = -1):
def __init__(self, capacity, alpha, max_demo=-1):
assert max_demo <= capacity
super().__init__(capacity, alpha)
self.demo_position = 0
@ -47,7 +47,8 @@ class DQfDMemory(PrioritizedReplayMemory):
idxes = self._sample_proportional(sample_size)
step_idxes = []
for i in idxes:
# If the interval of experiences falls between demonstration and obtained, move it over to the demonstration half
# If the interval of experiences falls between demonstration and obtained,
# move it over to the demonstration half
if i < self.demo_position and i + steps > self.demo_position:
diff = i + steps - self.demo_position
step_idxes += range(i - diff, i + steps - diff)
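
A small numeric illustration of the boundary adjustment above (demo_position, steps, and i are assumed values): an n-step window that would straddle the demonstration boundary is shifted back so it stays entirely within the demonstration half.

demo_position, steps, i = 10, 4, 8   # window [8, 12) would cross the boundary at 10
if i < demo_position and i + steps > demo_position:
    diff = i + steps - demo_position
    window = list(range(i - diff, i + steps - diff))
print(window)   # [6, 7, 8, 9] -- fully inside the demonstration region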

View file

@ -1,6 +1,4 @@
import random
from collections import namedtuple
import torch
Transition = namedtuple('Transition',
('state', 'action', 'reward', 'next_state', 'done'))

View file

@ -1,10 +1,9 @@
# From OpenAI Baselines https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
from .ReplayMemory import ReplayMemory
import operator
import random
import numpy as np
from numba import jit
from .ReplayMemory import ReplayMemory
class SegmentTree(object):
def __init__(self, capacity, operation, neutral_element):
@ -34,7 +33,7 @@ class SegmentTree(object):
self._value = [neutral_element for _ in range(2 * capacity)]
self._operation = operation
@jit(forceobj = True)
@jit(forceobj=True)
def _reduce_helper(self, start, end, node, node_start, node_end):
if start == node_start and end == node_end:
return self._value[node]
@ -50,7 +49,7 @@ class SegmentTree(object):
self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
)
@jit(forceobj = True)
@jit(forceobj=True)
def reduce(self, start=0, end=None):
"""Returns result of applying `self.operation`
to a contiguous subsequence of the array.
@ -73,7 +72,7 @@ class SegmentTree(object):
end -= 1
return self._reduce_helper(start, end, 1, 0, self._capacity - 1)
@jit(forceobj = True)
@jit(forceobj=True)
def __setitem__(self, idx, val):
# index of the leaf
idx += self._capacity
@ -86,7 +85,7 @@ class SegmentTree(object):
)
idx //= 2
@jit(forceobj = True)
@jit(forceobj=True)
def __getitem__(self, idx):
assert 0 <= idx < self._capacity
return self._value[self._capacity + idx]
@ -100,12 +99,12 @@ class SumSegmentTree(SegmentTree):
neutral_element=0.0
)
@jit(forceobj = True)
@jit(forceobj=True)
def sum(self, start=0, end=None):
"""Returns arr[start] + ... + arr[end]"""
return super(SumSegmentTree, self).reduce(start, end)
@jit(forceobj = True, parallel = True)
@jit(forceobj=True, parallel=True)
def find_prefixsum_idx(self, prefixsum):
"""Find the highest index `i` in the array such that
sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum
@ -140,7 +139,7 @@ class MinSegmentTree(SegmentTree):
neutral_element=float('inf')
)
@jit(forceobj = True)
@jit(forceobj=True)
def min(self, start=0, end=None):
"""Returns min(arr[start], ..., arr[end])"""
return super(MinSegmentTree, self).reduce(start, end)
@ -185,7 +184,7 @@ class PrioritizedReplayMemory(ReplayMemory):
self._it_sum[idx] = self._max_priority ** self._alpha
self._it_min[idx] = self._max_priority ** self._alpha
@jit(forceobj = True)
@jit(forceobj=True)
def _sample_proportional(self, batch_size):
res = []
p_total = self._it_sum.sum(0, len(self.memory) - 1)
@ -294,7 +293,7 @@ class PrioritizedReplayMemory(ReplayMemory):
batch = list(zip(*encoded_sample, weights, step_idxes))
return batch
@jit(forceobj = True)
@jit(forceobj=True)
def update_priorities(self, idxes, priorities):
"""
Update priorities of sampled transitions.
@ -320,4 +319,3 @@ class PrioritizedReplayMemory(ReplayMemory):
self._it_min[idx] = priority ** self._alpha
self._max_priority = max(self._max_priority, priority)
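
For context on the beta parameter threaded through the agents above, here is a toy computation of the importance-sampling weights used by prioritized replay, w_i = (N * P(i))^(-beta) normalized by the largest weight; the priority values are assumptions.

import numpy as np

priorities = np.array([4.0, 1.0, 1.0, 2.0])
probs = priorities / priorities.sum()   # P(i), the role played by the sum segment tree
N = len(priorities)
beta = 0.4
weights = (N * probs) ** (-beta)
weights /= weights.max()                # the least-likely transition ends up with weight 1
print(weights.round(3))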

View file

@ -106,11 +106,9 @@ class ReplayMemory(object):
def __reversed__(self):
return reversed(self.memory)
def zip_batch(minibatch, priority = False, want_indices = False):
def zip_batch(minibatch, priority=False):
if priority:
state_batch, action_batch, reward_batch, next_state_batch, done_batch, weights, indexes = zip(*minibatch)
elif want_indices:
state_batch, action_batch, reward_batch, next_state_batch, done_batch, indexes = zip(*minibatch)
else:
state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)
@ -122,7 +120,5 @@ def zip_batch(minibatch, priority = False, want_indices = False):
if priority:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, weights, indexes
elif want_indices:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, indexes
else:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch
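
A minimal example of what zip_batch does with a uniform (non-priority) minibatch: it transposes a list of transition tuples into per-field batches and flips done into not_done. The toy tensors are assumptions.

import torch

minibatch = [
    (torch.zeros(1, 4), 0, 1.0, torch.ones(1, 4), False),
    (torch.ones(1, 4), 1, 0.0, torch.zeros(1, 4), True),
]
state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)
state_batch = torch.cat(state_batch)          # shape (2, 4)
not_done_batch = ~torch.tensor(done_batch)    # tensor([ True, False])
print(state_batch.shape, not_done_batch)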

View file

@ -5,115 +5,32 @@ from copy import deepcopy
import torch.multiprocessing as mp
class EnvironmentEpisode(mp.Process):
def __init__(self, env, actor, config, logger = None, name = ""):
super(EnvironmentEpisode, self).__init__()
self.env = env
self.actor = actor
self.config = deepcopy(config)
self.logger = logger
self.name = name
self.episode_num = 1
def __init__(self, env, actor, config, logger=None, name=""):
super(EnvironmentEpisode, self).__init__()
self.env = env
self.actor = actor
self.config = deepcopy(config)
self.logger = logger
self.name = name
self.episode_num = 1
def run(self, printstat = False, memory = None):
state = self.env.reset()
done = False
episode_reward = 0
while not done:
action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action)
def run(self, printstat=False, memory=None):
state = self.env.reset()
done = False
episode_reward = 0
while not done:
action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action)
episode_reward = episode_reward + reward
if memory is not None:
memory.put((state, action, reward, next_state, done))
state = next_state
episode_reward = episode_reward + reward
if memory is not None:
memory.put((state, action, reward, next_state, done))
state = next_state
if printstat:
print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], episode_reward))
if self.logger is not None:
self.logger.append(self.name + '/EpisodeReward', episode_reward)
self.episode_num += 1
# from copy import deepcopy
# import torch.multiprocessing as mp
# from ctypes import *
# import rltorch.log
# def envepisode(actor, env, episode_num, config, runcondition, memoryqueue = None, logqueue = None, name = ""):
# # Wait for signal to start running through the environment
# while runcondition.wait():
# # Start a logger to log the rewards
# logger = rltorch.log.Logger()
# state = env.reset()
# episode_reward = 0
# done = False
# while not done:
# action = actor.act(state)
# next_state, reward, done, _ = env.step(action)
# episode_reward += reward
# if memoryqueue is not None:
# memoryqueue.put((state, action, reward, next_state, done))
# state = next_state
# if done:
# with episode_num.get_lock():
# if episode_num.value % config['print_stat_n_eps'] == 0:
# print("episode: {}/{}, score: {}"
# .format(episode_num.value, config['total_training_episodes'], episode_reward))
# if logger is not None:
# logger.append(name + '/EpisodeReward', episode_reward)
# episode_reward = 0
# state = env.reset()
# with episode_num.get_lock():
# episode_num.value += 1
# logqueue.put(logger)
# class EnvironmentRun():
# def __init__(self, env_func, actor, config, memory = None, name = ""):
# self.config = deepcopy(config)
# self.memory = memory
# self.episode_num = mp.Value(c_uint)
# self.runcondition = mp.Event()
# # Interestingly enough, there isn't a good reliable way to know how many states an episode will have
# # Perhaps we can share a uint to keep track...
# self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
# self.logqueue = mp.Queue(maxsize = 1)
# with self.episode_num.get_lock():
# self.episode_num.value = 1
# self.runner = mp.Process(target=envrun,
# args=(actor, env_func, self.episode_num, config, self.runcondition),
# kwargs = {'iterations': config['replay_skip'] + 1,
# 'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name})
# self.runner.start()
# def run(self):
# self.runcondition.set()
# def join(self):
# self._sync_memory()
# if self.logwriter is not None:
# self.logwriter.write(self._get_reward_logger())
# def sync_memory(self):
# if self.memory is not None:
# for i in range(self.config['replay_skip'] + 1):
# self.memory.append(*self.memory_queue.get())
# def get_reward_logger(self):
# return self.logqueue.get()
# def terminate(self):
# self.runner.terminate()
if printstat:
print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], episode_reward))
if self.logger is not None:
self.logger.append(self.name + '/EpisodeReward', episode_reward)
self.episode_num += 1
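
A hedged usage sketch for `EnvironmentEpisode`: the module path, gym environment, and config key are assumptions for illustration (the older 4-tuple `env.step` API matches the unpacking above), and the `memory` argument only needs a `.put(transition)` method, so a plain queue stands in here:

```python
import queue
import gym
from rltorch.action_selector import RandomSelector
from rltorch.mp import EnvironmentEpisode   # module path assumed

config = {'total_training_episodes': 5}
env = gym.make('CartPole-v1')
actor = RandomSelector(env.action_space.n)
transitions = queue.Queue()                 # anything with .put() works

episode = EnvironmentEpisode(env, actor, config, name="CartPole")
for _ in range(config['total_training_episodes']):
    episode.run(printstat=True, memory=transitions)
```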

View file

@@ -1,40 +1,40 @@
from copy import deepcopy
from ctypes import c_uint
import torch.multiprocessing as mp
from ctypes import *
import rltorch.log
def envrun(actor, env, episode_num, config, runcondition, iterations = 1, memoryqueue = None, logqueue = None, name = ""):
state = env.reset()
episode_reward = 0
# Wait for signal to start running through the environment
while runcondition.wait():
# Start a logger to log the rewards
logger = rltorch.log.Logger() if logqueue is not None else None
def envrun(actor, env, episode_num, config, runcondition, iterations=1, memoryqueue=None, logqueue=None, name=""):
state = env.reset()
episode_reward = 0
# Wait for signal to start running through the environment
while runcondition.wait():
# Start a logger to log the rewards
logger = rltorch.log.Logger() if logqueue is not None else None
for _ in range(iterations):
action = actor.act(state)
next_state, reward, done, _ = env.step(action)
action = actor.act(state)
next_state, reward, done, _ = env.step(action)
episode_reward += reward
if memoryqueue is not None:
memoryqueue.put((state, action, reward, next_state, done))
episode_reward += reward
if memoryqueue is not None:
memoryqueue.put((state, action, reward, next_state, done))
state = next_state
state = next_state
if done:
with episode_num.get_lock():
if episode_num.value % config['print_stat_n_eps'] == 0:
print("episode: {}/{}, score: {}"
.format(episode_num.value, config['total_training_episodes'], episode_reward))
if done:
with episode_num.get_lock():
if episode_num.value % config['print_stat_n_eps'] == 0:
print("episode: {}/{}, score: {}"
.format(episode_num.value, config['total_training_episodes'], episode_reward))
if logger is not None:
logger.append(name + '/EpisodeReward', episode_reward)
episode_reward = 0
state = env.reset()
with episode_num.get_lock():
episode_num.value += 1
if logger is not None:
logger.append(name + '/EpisodeReward', episode_reward)
episode_reward = 0
state = env.reset()
with episode_num.get_lock():
episode_num.value += 1
if logqueue is not None:
logqueue.put(logger)
if logqueue is not None:
logqueue.put(logger)
class EnvironmentRun():
def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""):
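
For readers unfamiliar with the Event/Queue handshake `envrun` relies on: the parent sets `runcondition` to release a burst of environment steps and then drains the transition queue before the next learning update. A self-contained toy of that pattern (the worker clearing the event is a simplification; in the library the pacing appears to live in `EnvironmentRun`):

```python
import multiprocessing as mp

def worker(runcondition, outqueue, iterations=3):
    # Stand-in for envrun's loop: block until released, then emit
    # `iterations` fake transitions per release.
    while runcondition.wait():
        for step in range(iterations):
            outqueue.put(("state", step, 0.0, "next_state", False))
        runcondition.clear()    # simplification: real pacing is done elsewhere

if __name__ == "__main__":
    runcondition = mp.Event()
    outqueue = mp.Queue()
    proc = mp.Process(target=worker, args=(runcondition, outqueue), daemon=True)
    proc.start()

    runcondition.set()          # release one burst of steps
    for _ in range(3):          # drain it before learning
        print(outqueue.get())
    proc.terminate()
```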

View file

@@ -1,7 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
from .Network import Network
from copy import deepcopy
# [TODO] Should we torch.no_grad the __call__?
# What if we want to sometimes do gradient descent as well?
@@ -38,7 +39,7 @@ class ESNetwork(Network):
name
For use in logger to differentiate in analysis.
"""
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""):
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name)
self.population_size = population_size
self.fitness = fitness_fn
@@ -64,7 +65,11 @@ class ESNetwork(Network):
white_noise_dict = {}
noise_dict = {}
for key in model_dict.keys():
white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
white_noise_dict[key] = torch.randn(
self.population_size,
*model_dict[key].shape,
device=self.device
)
noise_dict[key] = self.sigma * white_noise_dict[key]
return white_noise_dict, noise_dict
@@ -96,7 +101,10 @@ class ESNetwork(Network):
candidate_solutions = self._generate_candidate_solutions(noise_dict)
## Calculate fitness then mean shift, scale
fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions], device = self.device)
fitness_values = torch.tensor(
[self.fitness(x, *args) for x in candidate_solutions],
device=self.device
)
if self.logger is not None:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
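
The mean-shift/scale above is the standard evolution-strategies trick: normalized fitness values weight the sampled noise to form a gradient estimate, which the optimizer then applies. A numpy-only toy of that estimator on a one-dimensional problem (an illustration of the idea, not the class's exact update):

```python
import numpy as np

def fitness(theta):
    # Toy objective to *maximize*: peak at theta = 3.
    return -(theta - 3.0) ** 2

rng = np.random.default_rng(0)
theta, sigma, population_size, lr = 0.0, 0.05, 50, 0.005
for _ in range(500):
    noise = rng.standard_normal(population_size)
    candidates = theta + sigma * noise
    values = np.array([fitness(c) for c in candidates])
    values = (values - values.mean()) / (values.std() + np.finfo('float').eps)  # same normalization as above
    grad_estimate = (values @ noise) / (population_size * sigma)
    theta += lr * grad_estimate       # gradient ascent on the fitness estimate
print(theta)  # hovers near 3.0; the normalization keeps a small exploration floor
```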

View file

@@ -1,9 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
from .Network import Network
from copy import deepcopy
import torch.multiprocessing as mp
import functools
from .Network import Network
class fn_copy:
def __init__(self, fn, args):
@@ -20,14 +19,15 @@ class ESNetworkMP(Network):
fitness_fun := model, *args -> fitness_value (float)
We wish to find a model that maximizes the fitness function
"""
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""):
def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
super(ESNetworkMP, self).__init__(model, optimizer, config, device, logger, name)
self.population_size = population_size
self.fitness = fitness_fn
self.sigma = sigma
assert self.sigma > 0
mp_ctx = mp.get_context("spawn")
self.pool = mp_ctx.Pool(processes=2) #[TODO] Probably should make number of processes a config variable
#[TODO] Probably should make number of processes a config variable
self.pool = mp_ctx.Pool(processes=2)
# We're not going to be calculating gradients in the traditional way
# So there's no need to waste computation time keeping track
@@ -42,7 +42,11 @@ class ESNetworkMP(Network):
white_noise_dict = {}
noise_dict = {}
for key in model_dict.keys():
white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
white_noise_dict[key] = torch.randn(
self.population_size,
*model_dict[key].shape,
device=self.device
)
noise_dict[key] = self.sigma * white_noise_dict[key]
return white_noise_dict, noise_dict
@@ -67,7 +71,10 @@ class ESNetworkMP(Network):
candidate_solutions = self._generate_candidate_solutions(noise_dict)
## Calculate fitness then mean shift, scale
fitness_values = torch.tensor(list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)), device = self.device)
fitness_values = torch.tensor(
list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)),
device=self.device
)
if self.logger is not None:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
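
`fn_copy` (its body is truncated in this hunk) appears to exist so that the fitness function plus its extra arguments travel to the spawn-context pool workers as one picklable callable; a hedged sketch of such a wrapper, consistent with the `fn_copy(self.fitness, args)` call above:

```python
class fn_copy:
    """Freeze `args` so pool.map only has to ship a single callable."""
    def __init__(self, fn, args):
        self.fn = fn
        self.args = args

    def __call__(self, x):
        return self.fn(x, *self.args)

# e.g. pool.map(fn_copy(fitness, (env, num_episodes)), candidate_solutions)
```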

View file

@@ -17,12 +17,16 @@ class Network:
name
For use in logger to differentiate in analysis.
"""
def __init__(self, model, optimizer, config, device = None, logger = None, name = ""):
def __init__(self, model, optimizer, config, device=None, logger=None, name=""):
self.model = model
if 'weight_decay' in config:
self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay'])
self.optimizer = optimizer(
model.parameters(),
lr=config['learning_rate'],
weight_decay=config['weight_decay']
)
else:
self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'])
self.optimizer = optimizer(model.parameters(), lr=config['learning_rate'])
self.logger = logger
self.name = name
self.device = device
@@ -32,7 +36,7 @@ class Network:
def __call__(self, *args):
return self.model(*args)
def clamp_gradients(self, x = 1):
def clamp_gradients(self, x=1):
"""
Forcing gradients to stay within a certain interval
by setting it to the bound if it goes over it.
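
A hedged usage sketch of the `Network` wrapper (import path assumed): the config dict drives the optimizer construction shown above, and `clamp_gradients` bounds each gradient after a backward pass.

```python
import torch
import torch.nn as nn
from rltorch.network import Network   # import path assumed

config = {'learning_rate': 1e-3, 'weight_decay': 1e-5}   # weight_decay is optional
model = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 2))
net = Network(model, torch.optim.Adam, config, name="policy")

q_values = net(torch.zeros(1, 4))     # __call__ forwards to the wrapped model
q_values.sum().backward()
net.clamp_gradients()                 # clamp each gradient to [-1, 1] (the default x=1)
net.optimizer.step()
```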

View file

@@ -1,67 +1,67 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
# This class utilizes this property of the normal distribution
# N(mu, sigma) = mu + sigma * N(0, 1)
class NoisyLinear(nn.Linear):
"""
Draws the parameters of nn.Linear from a normal distribution.
The parameters of the normal distribution are registered as
learnable parameters in the neural network.
Parameters
----------
in_features
Size of each input sample.
out_features
Size of each output sample.
sigma_init
The starting standard deviation of gaussian noise.
bias
If set to False, the layer will not
learn an additive bias.
Default: True
"""
def __init__(self, in_features, out_features, sigma_init = 0.017, bias = True):
super(NoisyLinear, self).__init__(in_features, out_features, bias = bias)
# One of the parameters the network is going to tune is the
# standard deviation of the gaussian noise on the weights
self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init))
# Reserve space for N(0, 1) of weights in the forward() call
self.register_buffer("s_normal_weight", torch.zeros(out_features, in_features))
if bias:
# If a bias exists, then we manipulate the standard deviation of the
# gaussian noise on them as well
self.sigma_bias = nn.Parameter(torch.Tensor(out_features).fill_(sigma_init))
# Reserve space for N(0, 1) of bias in the forward() call
self.register_buffer("s_normal_bias", torch.zeros(out_features))
self.reset_parameters()
def reset_parameters(self):
std = math.sqrt(3 / self.in_features)
nn.init.uniform_(self.weight, -std, std)
nn.init.uniform_(self.bias, -std, std)
def forward(self, x):
r"""
Calculates the output :math:`y` through the following:
:math:`weight \sim N(mu_1, std_1)`
:math:`bias \sim N(mu_2, std_2)`
:math:`y = weight \cdot x + bias`
"""
Draws the parameters of nn.Linear from a normal distribution.
The parameters of the normal distribution are registered as
learnable parameters in the neural network.
Parameters
----------
in_features
Size of each input sample.
out_features
Size of each output sample.
sigma_init
The starting standard deviation of gaussian noise.
bias
If set to False, the layer will not
learn an additive bias.
Default: True
"""
def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
# One of the parameters the network is going to tune is the
# standard deviation of the gaussian noise on the weights
self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init))
# Reserve space for N(0, 1) of weights in the forward() call
self.register_buffer("s_normal_weight", torch.zeros(out_features, in_features))
if bias:
# If a bias exists, then we manipulate the standard deviation of the
# gaussian noise on them as well
self.sigma_bias = nn.Parameter(torch.Tensor(out_features).fill_(sigma_init))
# Reserve space for N(0, 1) of bias in the forward() call
self.register_buffer("s_normal_bias", torch.zeros(out_features))
self.reset_parameters()
def reset_parameters(self):
std = math.sqrt(3 / self.in_features)
nn.init.uniform_(self.weight, -std, std)
nn.init.uniform_(self.bias, -std, std)
def forward(self, x):
r"""
Calculates the output :math:`y` through the following:
:math:`weight \sim N(mu_1, std_1)`
:math:`bias \sim N(mu_2, std_2)`
:math:`y = weight \cdot x + bias`
"""
# Fill s_normal_weight with values from the standard normal distribution
self.s_normal_weight.normal_()
weight_noise = self.sigma_weight * self.s_normal_weight.clone().requires_grad_()
self.s_normal_weight.normal_()
weight_noise = self.sigma_weight * self.s_normal_weight.clone().requires_grad_()
bias = None
if self.bias is not None:
# Fill s_normal_bias with values from standard normal
self.s_normal_bias.normal_()
bias = self.bias + self.sigma_bias * self.s_normal_bias.clone().requires_grad_()
bias = None
if self.bias is not None:
# Fill s_normal_bias with values from standard normal
self.s_normal_bias.normal_()
bias = self.bias + self.sigma_bias * self.s_normal_bias.clone().requires_grad_()
return F.linear(x, self.weight + weight_noise, bias)
return F.linear(x, self.weight + weight_noise, bias)
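
A hedged usage sketch (import path assumed): `NoisyLinear` drops in wherever `nn.Linear` would go, so exploration comes from the learned weight noise rather than an epsilon-greedy wrapper; two forward passes differ because fresh N(0, 1) samples are drawn on every call.

```python
import torch
import torch.nn as nn
from rltorch.network import NoisyLinear   # import path assumed

policy = nn.Sequential(
    nn.Linear(4, 64),
    nn.ReLU(),
    NoisyLinear(64, 2, sigma_init=0.017),
)

x = torch.randn(1, 4)
print(policy(x))   # differs from the next call:
print(policy(x))   # fresh noise is sampled in every forward()
```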

View file

@@ -11,7 +11,7 @@ class TargetNetwork:
device
The device to put the cloned parameters in.
"""
def __init__(self, network, device = None):
def __init__(self, network, device=None):
self.model = network.model
self.target_model = deepcopy(network.model)
if device is not None:
@@ -37,7 +37,8 @@ class TargetNetwork:
Parameters
----------
tau : number
A number between 0-1 which indicates the proportion of the originator and clone in the new clone.
A number between 0-1 which indicates
the proportion of the originator and clone in the new clone.
"""
assert isinstance(tau, float)
assert 0.0 < tau <= 1.0
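
The `tau` documented above is the usual soft-update (Polyak) coefficient. Spelled out for plain state dicts, as an illustration only rather than the library's method:

```python
import torch.nn as nn

def soft_update(target_model, online_model, tau=0.01):
    """new_target = tau * online + (1 - tau) * target, parameter by parameter."""
    target_state = target_model.state_dict()
    online_state = online_model.state_dict()
    for key in target_state:
        target_state[key] = tau * online_state[key] + (1.0 - tau) * target_state[key]
    target_model.load_state_dict(target_state)

online, target = nn.Linear(4, 2), nn.Linear(4, 2)
soft_update(target, online, tau=0.1)
```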

View file

@@ -1,5 +1,5 @@
from .ESNetwork import *
from .ESNetworkMP import *
from .Network import *
from .NoisyLinear import *
from .TargetNetwork import *
from .ESNetwork import ESNetwork
from .ESNetworkMP import ESNetworkMP
from .Network import Network
from .NoisyLinear import NoisyLinear
from .TargetNetwork import TargetNetwork

View file

@@ -36,4 +36,3 @@ class ExponentialScheduler(Scheduler):
return self.initial_value * (self.base ** (self.current_iteration - 1))
else:
return self.end_value
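
For a sense of scale, the decay rule above evaluated by hand for an epsilon schedule that starts at 1.0, multiplies by 0.99 per iteration, and bottoms out at 0.1 (the `max` below approximates the class's if/else cutoff, whose exact condition sits above this hunk):

```python
initial_value, base, end_value = 1.0, 0.99, 0.1
for iteration in (1, 10, 100, 500):
    value = max(initial_value * base ** (iteration - 1), end_value)
    print(iteration, round(value, 3))
# 1 1.0
# 10 0.914
# 100 0.37
# 500 0.1
```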

View file

@@ -7,4 +7,4 @@ class Scheduler():
def __iter__(self):
return self
def __next__(self):
raise NotImplementedError("Scheduler does not have it's function to create a value implemented")
raise NotImplementedError("__next__ not implemented in Scheduler")