PEP8 Conformance

Brandon Rozek 2020-04-14 14:16:14 -04:00
parent 9b81188a77
commit 8fa4691511
29 changed files with 652 additions and 755 deletions

View file

@ -12,7 +12,8 @@ class ArgMaxSelector:
if self.device is not None:
state = state.to(self.device)
action_values = self.model(state).squeeze(0)
action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item()
action = self.random_act() if (action_values[0] == action_values).all() \
else action_values.argmax().item()
return action
def act(self, state):
return self.best_act(state)
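The wrapped conditional above is greedy selection with a uniform tie-break: if the network assigns every action the same value, argmax would always return index 0, so a random action is drawn instead. A minimal standalone sketch of that pattern (the helper name is ours, not part of rltorch):

import random
import torch

def greedy_with_tie_break(action_values: torch.Tensor, action_size: int) -> int:
    # All values equal -> argmax degenerates to index 0, so pick uniformly at random.
    if (action_values[0] == action_values).all():
        return random.randrange(action_size)
    return action_values.argmax().item()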

View file

@ -1,6 +1,7 @@
from .ArgMaxSelector import ArgMaxSelector
import numpy as np
import collections
import numpy as np
from .ArgMaxSelector import ArgMaxSelector
class EpsilonGreedySelector(ArgMaxSelector):
def __init__(self, model, action_size, device=None, epsilon=0.1):
super(EpsilonGreedySelector, self).__init__(model, action_size, device=device)
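EpsilonGreedySelector layers exploration on top of ArgMaxSelector: with probability epsilon take a random action, otherwise the greedy one. A hedged sketch of that decision rule, assuming epsilon is a plain float here rather than one of the library's schedulers:

import numpy as np

def epsilon_greedy_act(selector, state, epsilon=0.1):
    # Explore with probability epsilon, exploit (argmax) otherwise.
    if np.random.random() < epsilon:
        return selector.random_act()
    return selector.best_act(state)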

View file

@ -1,5 +1,6 @@
from .ArgMaxSelector import ArgMaxSelector
import torch
from .ArgMaxSelector import ArgMaxSelector
class IdentitySelector(ArgMaxSelector):
def __init__(self, model, action_size, device=None):
super(IdentitySelector, self).__init__(model, action_size, device=device)

View file

@ -1,10 +1,10 @@
from random import randrange
class RandomSelector():
class RandomSelector:
def __init__(self, action_size):
self.action_size = action_size
def random_act(self):
return randrange(action_size)
def best_act(self, state):
return randrange(self.action_size)
def best_act(self, _):
return self.random_act()
def act(self, state):
def act(self, _):
return self.random_act()

View file

@ -1,9 +1,6 @@
from random import randrange
import torch
from torch.distributions import Categorical
import rltorch
from rltorch.action_selector import ArgMaxSelector
from .ArgMaxSelector import ArgMaxSelector
from ..memory.EpisodeMemory import EpisodeMemory
class StochasticSelector(ArgMaxSelector):
def __init__(self, model, action_size, memory=None, device=None):
super(StochasticSelector, self).__init__(model, action_size, device=device)
@ -17,6 +14,6 @@ class StochasticSelector(ArgMaxSelector):
action_probabilities = self.model(state)
distribution = Categorical(action_probabilities)
action = distribution.sample()
if log_prob and isinstance(self.memory, rltorch.memory.EpisodeMemory):
if log_prob and isinstance(self.memory, EpisodeMemory):
self.memory.append_log_probs(distribution.log_prob(action))
return action.item()
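best_act here samples from a Categorical distribution over the policy's action probabilities and, when an EpisodeMemory is attached, stores the log-probability that the policy-gradient update needs later. A stripped-down sketch of just the sampling step:

import torch
from torch.distributions import Categorical

def sample_action(action_probabilities: torch.Tensor):
    # action_probabilities: shape (1, action_size), each row summing to 1
    distribution = Categorical(action_probabilities)
    action = distribution.sample()
    return action.item(), distribution.log_prob(action)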

View file

@ -1,5 +1,5 @@
from .ArgMaxSelector import *
from .EpsilonGreedySelector import *
from .IdentitySelector import *
from .RandomSelector import *
from .StochasticSelector import *
from .ArgMaxSelector import ArgMaxSelector
from .EpsilonGreedySelector import EpsilonGreedySelector
from .IdentitySelector import IdentitySelector
from .RandomSelector import RandomSelector
from .StochasticSelector import StochasticSelector

View file

@ -2,8 +2,6 @@ from copy import deepcopy
import numpy as np
import torch
import torch.nn.functional as F
import rltorch
import rltorch.memory as M
class A2CSingleAgent:
def __init__(self, policy_net, value_net, memory, config, logger=None):
@ -16,7 +14,11 @@ class A2CSingleAgent:
def _discount_rewards(self, rewards):
gammas = torch.ones_like(rewards)
if len(rewards) > 1:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
discount_tensor = torch.tensor(self.config['discount_rate'])
gammas[1:] = torch.cumprod(
discount_tensor.repeat(len(rewards) - 1),
dim=0
)
return gammas * rewards
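The cumprod builds the vector [1, γ, γ², ...], so each reward r_t ends up weighted by γ^t; the function discounts rewards in place rather than computing returns-to-go. An equivalent, more explicit sketch:

import torch

def discount_rewards(rewards: torch.Tensor, gamma: float) -> torch.Tensor:
    # Weight the reward at step t by gamma**t (same result as the cumprod version).
    gammas = gamma ** torch.arange(len(rewards), dtype=rewards.dtype, device=rewards.device)
    return gammas * rewards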
# This function is currently not used since the performance gains haven't been shown
@ -29,18 +31,18 @@ class A2CSingleAgent:
values = self.value_net(states).squeeze(1)
generalized_advantages = torch.zeros_like(rewards)
for i in range(len(generalized_advantages)):
discount_tensor = torch.tensor(self.config['discount_rate']) * tradeoff
for i, _ in enumerate(generalized_advantages):
weights = torch.ones_like(rewards[i:])
if i != len(generalized_advantages) - 1:
weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0)
weights[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - i - 1), dim=0)
generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum()
return generalized_advantages
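The loop is computing Generalized Advantage Estimation: each advantage is a (γ·λ)-weighted sum of the one-step TD residuals r_t + γV(s_{t+1}) − V(s_t) from that index onward. A compact reverse-recursion sketch of the same estimator, assuming a single uninterrupted episode (no done masking, as above):

import torch

def generalized_advantages(rewards, values, next_values, discount_rate, tradeoff):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);  A_t = delta_t + gamma * lambda * A_{t+1}
    deltas = rewards + discount_rate * next_values - values
    advantages = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + discount_rate * tradeoff * running
        advantages[t] = running
    return advantages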
def learn(self):
episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
state_batch, _, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
# Send batches to the appropriate device
state_batch = torch.cat(state_batch).to(self.value_net.device)
@ -50,8 +52,10 @@ class A2CSingleAgent:
log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)
## Value Loss
# In A2C, the value loss is the difference between the discounted reward and the value from the first state
# The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
# In A2C, the value loss is the difference between the discounted reward
# and the value from the first state.
# The value of the first state is supposed to tell us
# the expected reward from the current policy of the whole episode
discounted_reward = self._discount_rewards(reward_batch)
observed_value = discounted_reward.sum()
value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0]))
@ -86,5 +90,3 @@ class A2CSingleAgent:
# Memory under the old policy is not needed for future training
self.memory.clear()

View file

@ -1,10 +1,8 @@
import collections
from copy import deepcopy
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path
class DQNAgent:
def __init__(self, net, memory, config, target_net=None, logger=None):
@ -24,7 +22,7 @@ class DQNAgent:
if len(self.memory) < self.config['batch_size']:
return
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
@ -63,7 +61,7 @@ class DQNAgent:
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
# If we're sampling by TD error, multiply the loss by an importance weight, which helps decrease overfitting
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
# loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.smooth_l1_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
loss = (torch.as_tensor(importance_weights, device=self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
else:
@ -85,8 +83,6 @@ class DQNAgent:
self.target_net.sync()
# If we're sampling by TD error, readjust the weights of the experiences
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
td_error = (obtained_values - expected_values).detach().abs()
self.memory.update_priorities(batch_indexes, td_error)
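When the memory is prioritized, the squared TD errors are scaled by the importance-sampling weights before averaging, which corrects the bias introduced by non-uniform sampling. A minimal sketch of that weighted loss, assuming obtained/expected tensors of shape (batch, 1) and one weight per sample:

import torch

def weighted_td_loss(obtained_values, expected_values, importance_weights, device):
    # Scale each squared TD error by its sample's importance weight, then average.
    weights = torch.as_tensor(importance_weights, device=device, dtype=obtained_values.dtype)
    return (weights * ((obtained_values - expected_values) ** 2).squeeze(1)).mean()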

View file

@ -1,11 +1,9 @@
import collections
from copy import deepcopy
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path
from rltorch.action_selector import ArgMaxSelector
class DQfDAgent:
def __init__(self, net, memory, config, target_net=None, logger=None):
@ -32,10 +30,10 @@ class DQfDAgent:
batch_size = self.config['batch_size']
steps = None
if isinstance(self.memory, M.DQfDMemory):
weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) \
else weight_importance
# Check to see if we are doing N-Step DQN
if steps is not None:
@ -46,16 +44,6 @@ class DQfDAgent:
# Process batch
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
else:
# Check to see if we're doing N-Step DQN
if steps is not None:
minibatch = self.memory.sample_n_steps(batch_size, steps)
else:
minibatch = self.memory.sample(batch_size)
# Process batch
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, batch_indexes = M.zip_batch(minibatch, want_indices = True)
batch_index_tensors = torch.tensor(batch_indexes)
demo_mask = batch_index_tensors < self.memory.demo_position
@ -86,11 +74,11 @@ class DQfDAgent:
best_next_state_value = torch.zeros(batch_size, device=self.net.device)
best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
expected_values = (reward_batch + (batch_size * best_next_state_value)).unsqueeze(1)
# N-Step DQN Loss
# num_steps captures how many steps actually exist before the end of the episode
if steps != None:
if steps is not None:
expected_n_step_values = []
with torch.no_grad():
for i in range(0, len(state_batch), steps):
@ -127,7 +115,7 @@ class DQfDAgent:
l = torch.ones_like(state_values[demo_mask])
expert_actions = action_batch[demo_mask]
# l(s, a) is zero for every action the expert doesn't take
for i,a in zip(range(len(l)), expert_actions):
for (i, _), a in zip(enumerate(l), expert_actions):
l[i].fill_(0.8) # According to paper
l[i, a] = 0
if self.target_net is not None:
@ -148,26 +136,17 @@ class DQfDAgent:
# Since dqn_loss and demo_loss are different sizes, the reduction has to happen before they are combined
if isinstance(self.memory, M.DQfDMemory):
dqn_loss = (torch.as_tensor(importance_weights, device=self.net.device) * F.mse_loss(obtained_values, expected_values, reduction='none').squeeze(1)).mean()
else:
dqn_loss = F.mse_loss(obtained_values, expected_values)
if steps != None:
if isinstance(self.memory, M.DQfDMemory):
if steps is not None:
dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device=self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction='none')).mean()
else:
dqn_n_step_loss = F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none').mean()
else:
dqn_n_step_loss = torch.tensor(0, device=self.net.device)
if demo_mask.sum() > 0:
if isinstance(self.memory, M.DQfDMemory):
demo_loss = (torch.as_tensor(importance_weights, device=self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction='none').squeeze(1)).mean()
else:
demo_loss = F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1).mean()
else:
demo_loss = 0.
demo_loss = 0
loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss
if self.logger is not None:
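The demonstration term above is DQfD's large-margin loss: a margin (0.8 here) is added to every action except the one the expert took, so the expert action has to beat all alternatives by at least that margin. A small sketch of building the margin matrix and the supervised term, assuming state_values of shape (n_demo, action_size), integer expert_actions, and expert_value = Q(s, a_expert) as a column vector:

import torch
import torch.nn.functional as F

def demo_margin_loss(state_values, expert_actions, expert_value, margin=0.8):
    # l(s, a) = margin for every non-expert action, 0 for the expert's action
    l = torch.full_like(state_values, margin)
    l.scatter_(1, expert_actions.view(-1, 1), 0.0)
    # Penalize max_a [Q(s, a) + l(s, a)] straying from Q(s, a_expert)
    return F.mse_loss((state_values + l).max(1)[0].unsqueeze(1), expert_value)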

View file

@ -1,12 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
import collections
import random
class PPOAgent:
def __init__(self, policy_net, value_net, memory, config, logger=None):
@ -23,7 +19,6 @@ class PPOAgent:
gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim=0)
return gammas * rewards
def learn(self):
episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
@ -68,14 +63,10 @@ class PPOAgent:
self.logger.append("Loss/Policy", policy_loss.item())
self.logger.append("Loss/Value", value_loss.item())
self.old_policy_net.sync()
self.policy_net.zero_grad()
policy_loss.backward()
self.policy_net.step()
# Memory under the old policy is not needed for future training
self.memory.clear()
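PPOAgent's surrogate objective is not visible in these hunks, but the usual clipped form it refers to compares the new and old policies through a probability ratio and clips that ratio to keep updates conservative. A generic sketch of that objective, not a transcription of rltorch's implementation:

import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip=0.2):
    # Probability ratio pi_new / pi_old, computed in log space for stability.
    ratio = torch.exp(new_log_probs - old_log_probs)
    clipped = torch.clamp(ratio, 1 - clip, 1 + clip)
    # Maximize the clipped surrogate, i.e. minimize its negative mean.
    return -torch.min(ratio * advantages, clipped * advantages).mean()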

View file

@ -2,16 +2,17 @@ from copy import deepcopy
import collections
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
import torch.nn.functional as F
# Q-Evolutionary Policy Agent
# Maximizes the policy with respect to the Q-Value function.
# Since the function is non-differentiable, it depends on the Evolutionary Strategy algorithm
class QEPAgent:
def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None, entropy_importance = 0, policy_skip = 4, after_value_train = None):
def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4):
self.policy_net = policy_net
assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP)
self.policy_net.fitness = self.fitness
@ -22,7 +23,6 @@ class QEPAgent:
self.logger = logger
self.policy_skip = policy_skip
self.entropy_importance = entropy_importance
self.after_value_train = after_value_train
def save(self, file_location):
torch.save({
@ -42,10 +42,8 @@ class QEPAgent:
batch_size = len(state_batch)
with torch.no_grad():
action_probabilities = policy_net(state_batch)
action_size = action_probabilities.shape[1]
distributions = list(map(Categorical, action_probabilities))
actions = torch.stack([d.sample() for d in distributions])
with torch.no_grad():
@ -54,17 +52,17 @@ class QEPAgent:
# Weird hacky solution where in multiprocess, it sometimes spits out nans
# So have it try again
while torch.isnan(state_values).any():
print("NAN DETECTED")
with torch.no_grad():
state_values = value_net(state_batch)
obtained_values = state_values.gather(1, actions.view(batch_size, 1)).squeeze(1)
obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1)
# return -obtained_values.mean().item()
entropy_importance = 0 # Entropy accounting for 1% of loss seems to work well
entropy_importance = next(self.entropy_importance) if isinstance(self.entropy_importance, collections.Iterable) else self.entropy_importance
value_importance = 1 - entropy_importance
# entropy_loss = (action_probabilities * torch.log2(action_probabilities)).sum(1) # Standard entropy loss from information theory
entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device = state_batch.device).repeat(batch_size, action_size)).abs().sum(1)
entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device=state_batch.device).repeat(len(state_batch), action_size)).abs().sum(1)
return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item()
@ -73,7 +71,7 @@ class QEPAgent:
if len(self.memory) < self.config['batch_size']:
return
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
weight_importance = self.config['prioritized_replay_weight_importance']
# If it's a scheduler then get the next value by calling next, otherwise just use its value
beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
@ -111,7 +109,7 @@ class QEPAgent:
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
value_loss = (torch.as_tensor(importance_weights, device=self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
else:
value_loss = F.mse_loss(obtained_values, expected_values)
@ -124,16 +122,13 @@ class QEPAgent:
self.value_net.clamp_gradients()
self.value_net.step()
if callable(self.after_value_train):
self.after_value_train()
if self.target_value_net is not None:
if 'target_sync_tau' in self.config:
self.target_value_net.partial_sync(self.config['target_sync_tau'])
else:
self.target_value_net.sync()
if (isinstance(self.memory, M.PrioritizedReplayMemory)):
if isinstance(self.memory, M.PrioritizedReplayMemory):
td_error = (obtained_values - expected_values).detach().abs()
self.memory.update_priorities(batch_indexes, td_error)
@ -141,7 +136,8 @@ class QEPAgent:
if self.policy_skip > 0:
self.policy_skip -= 1
return
self.policy_skip = self.config['policy_skip']
self.policy_skip = 4
if self.target_value_net is not None:
self.policy_net.calc_gradients(self.target_value_net, state_batch)
else:

View file

@ -1,7 +1,7 @@
import rltorch
from copy import deepcopy
import torch
import numpy as np
import torch
import rltorch
class REINFORCEAgent:
def __init__(self, net, memory, config, target_net=None, logger=None):
@ -30,7 +30,7 @@ class REINFORCEAgent:
def learn(self):
episode_batch = self.memory.recall()
state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
_, _, reward_batch, _, _, log_prob_batch = zip(*episode_batch)
# Calculate discounted rewards to place more importance on recent rewards
shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch))
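With the unused batch elements discarded, the update only needs the shaped (discounted) rewards and the stored log-probabilities; the REINFORCE objective is then −Σ_t G_t · log π(a_t|s_t) (or its mean). A hedged sketch of that loss, assuming both tensors cover the same episode:

import torch

def reinforce_loss(shaped_rewards: torch.Tensor, log_probs: torch.Tensor) -> torch.Tensor:
    # Push up log-probabilities in proportion to the (discounted) reward they earned.
    return -(shaped_rewards * log_probs).mean()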

View file

@ -1,6 +1,6 @@
from .A2CSingleAgent import *
from .DQNAgent import *
from .DQfDAgent import *
from .PPOAgent import *
from .QEPAgent import *
from .REINFORCEAgent import *
from .A2CSingleAgent import A2CSingleAgent
from .DQNAgent import DQNAgent
from .DQfDAgent import DQfDAgent
from .PPOAgent import PPOAgent
from .QEPAgent import QEPAgent
from .REINFORCEAgent import REINFORCEAgent

View file

@ -1,6 +1,6 @@
from copy import deepcopy
import rltorch
import time
import rltorch
def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False):
for episode in range(total_episodes):
@ -27,7 +27,7 @@ def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger
logger.append(name + '/EpisodeReward', episode_reward)
class EnvironmentRunSync():
class EnvironmentRunSync:
def __init__(self, env, actor, config, memory=None, logwriter=None, name="", render=False):
self.env = env
self.name = name
@ -72,7 +72,7 @@ class EnvironmentRunSync():
self.last_state = state
class EnvironmentEpisodeSync():
class EnvironmentEpisodeSync:
def __init__(self, env, actor, config, memory=None, logwriter=None, name=""):
self.env = env
self.name = name

View file

@ -1,8 +1,8 @@
from collections import deque
import gym
import torch
from gym import spaces
import cv2
from collections import deque
import numpy as np
class EpisodicLifeEnv(gym.Wrapper):
@ -170,7 +170,12 @@ class FrameStack(gym.Wrapper):
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
self.observation_space = spaces.Box(
low=0,
high=255,
shape=(shp[:-1] + (shp[-1] * k,)),
dtype=env.observation_space.dtype
)
def reset(self):
ob = self.env.reset()
@ -207,14 +212,16 @@ class ProcessFrame(gym.Wrapper):
if self.grayscale:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
if self.crop_bounds is not None and len(self.crop_bounds) == 4:
frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]]
frame = frame[
self.crop_bounds[0]:self.crop_bounds[1],
self.crop_bounds[2]:self.crop_bounds[3]
]
if self.resize_shape is not None and len(self.resize_shape) == 2:
frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA)
# Normalize
frame = frame / 255
return frame
# Turns observations into torch tensors
# Adds an additional dimension that's supposed to represent the batch dim
class TorchWrap(gym.Wrapper):
@ -233,8 +240,6 @@ class TorchWrap(gym.Wrapper):
frame = torch.from_numpy(frame).unsqueeze(0).float()
return frame
class ProcessFrame84(gym.ObservationWrapper):
def __init__(self, env=None):
super(ProcessFrame84, self).__init__(env)
@ -256,4 +261,3 @@ class ProcessFrame84(gym.ObservationWrapper):
x_t = resized_screen[18:102, :]
x_t = np.reshape(x_t, [84, 84])
return x_t.astype(np.uint8)
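Taken together, ProcessFrame runs each observation through grayscale conversion, cropping, resizing, and scaling into [0, 1]. A standalone sketch of that per-frame pipeline, mirroring the branches above outside the wrapper class:

import cv2

def process_frame(frame, crop_bounds=None, resize_shape=None, grayscale=True):
    if grayscale:
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    if crop_bounds is not None and len(crop_bounds) == 4:
        # crop_bounds = (row_start, row_end, col_start, col_end)
        frame = frame[crop_bounds[0]:crop_bounds[1], crop_bounds[2]:crop_bounds[3]]
    if resize_shape is not None and len(resize_shape) == 2:
        frame = cv2.resize(frame, resize_shape, interpolation=cv2.INTER_AREA)
    return frame / 255   # normalize pixel values into [0, 1]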

View file

@ -1,6 +1,6 @@
from .PrioritizedReplayMemory import PrioritizedReplayMemory
from collections import namedtuple
import numpy as np
from .PrioritizedReplayMemory import PrioritizedReplayMemory
Transition = namedtuple('Transition',
('state', 'action', 'reward', 'next_state', 'done'))
@ -47,7 +47,8 @@ class DQfDMemory(PrioritizedReplayMemory):
idxes = self._sample_proportional(sample_size)
step_idxes = []
for i in idxes:
# If the interval of experiences falls between demonstration and obtained, move it over to the demonstration half
# If the interval of experiences falls between demonstration and obtained,
# move it over to the demonstration half
if i < self.demo_position and i + steps > self.demo_position:
diff = i + steps - self.demo_position
step_idxes += range(i - diff, i + steps - diff)

View file

@ -1,6 +1,4 @@
import random
from collections import namedtuple
import torch
Transition = namedtuple('Transition',
('state', 'action', 'reward', 'next_state', 'done'))

View file

@ -1,10 +1,9 @@
# From OpenAI Baselines https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
from .ReplayMemory import ReplayMemory
import operator
import random
import numpy as np
from numba import jit
from .ReplayMemory import ReplayMemory
class SegmentTree(object):
def __init__(self, capacity, operation, neutral_element):
@ -320,4 +319,3 @@ class PrioritizedReplayMemory(ReplayMemory):
self._it_min[idx] = priority ** self._alpha
self._max_priority = max(self._max_priority, priority)
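The segment trees keep sampling with probability P(i) = p_i^α / Σ_k p_k^α and querying the minimum priority at O(log n). Each sampled transition is then corrected with an importance weight w_i = (N · P(i))^(−β), normalized by the largest possible weight. A small sketch of those two formulas with the tree machinery replaced by plain numpy, for illustration only:

import numpy as np

def priority_sample(priorities, batch_size, alpha=0.6, beta=0.4):
    scaled = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = scaled / scaled.sum()
    indexes = np.random.choice(len(priorities), size=batch_size, p=probs)
    # Importance-sampling weights, normalized so the largest weight is 1.
    weights = (len(priorities) * probs[indexes]) ** (-beta)
    weights /= (len(priorities) * probs.min()) ** (-beta)
    return indexes, weights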

View file

@ -106,11 +106,9 @@ class ReplayMemory(object):
def __reversed__(self):
return reversed(self.memory)
def zip_batch(minibatch, priority = False, want_indices = False):
def zip_batch(minibatch, priority=False):
if priority:
state_batch, action_batch, reward_batch, next_state_batch, done_batch, weights, indexes = zip(*minibatch)
elif want_indices:
state_batch, action_batch, reward_batch, next_state_batch, done_batch, indexes = zip(*minibatch)
else:
state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)
@ -122,7 +120,5 @@ def zip_batch(minibatch, priority = False, want_indices = False):
if priority:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, weights, indexes
elif want_indices:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, indexes
else:
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch

View file

@ -34,86 +34,3 @@ class EnvironmentEpisode(mp.Process):
self.logger.append(self.name + '/EpisodeReward', episode_reward)
self.episode_num += 1
# from copy import deepcopy
# import torch.multiprocessing as mp
# from ctypes import *
# import rltorch.log
# def envepisode(actor, env, episode_num, config, runcondition, memoryqueue = None, logqueue = None, name = ""):
# # Wait for signal to start running through the environment
# while runcondition.wait():
# # Start a logger to log the rewards
# logger = rltorch.log.Logger()
# state = env.reset()
# episode_reward = 0
# done = False
# while not done:
# action = actor.act(state)
# next_state, reward, done, _ = env.step(action)
# episode_reward += reward
# if memoryqueue is not None:
# memoryqueue.put((state, action, reward, next_state, done))
# state = next_state
# if done:
# with episode_num.get_lock():
# if episode_num.value % config['print_stat_n_eps'] == 0:
# print("episode: {}/{}, score: {}"
# .format(episode_num.value, config['total_training_episodes'], episode_reward))
# if logger is not None:
# logger.append(name + '/EpisodeReward', episode_reward)
# episode_reward = 0
# state = env.reset()
# with episode_num.get_lock():
# episode_num.value += 1
# logqueue.put(logger)
# class EnvironmentRun():
# def __init__(self, env_func, actor, config, memory = None, name = ""):
# self.config = deepcopy(config)
# self.memory = memory
# self.episode_num = mp.Value(c_uint)
# self.runcondition = mp.Event()
# # Interestingly enough, there isn't a good reliable way to know how many states an episode will have
# # Perhaps we can share a uint to keep track...
# self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
# self.logqueue = mp.Queue(maxsize = 1)
# with self.episode_num.get_lock():
# self.episode_num.value = 1
# self.runner = mp.Process(target=envrun,
# args=(actor, env_func, self.episode_num, config, self.runcondition),
# kwargs = {'iterations': config['replay_skip'] + 1,
# 'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name})
# self.runner.start()
# def run(self):
# self.runcondition.set()
# def join(self):
# self._sync_memory()
# if self.logwriter is not None:
# self.logwriter.write(self._get_reward_logger())
# def sync_memory(self):
# if self.memory is not None:
# for i in range(self.config['replay_skip'] + 1):
# self.memory.append(*self.memory_queue.get())
# def get_reward_logger(self):
# return self.logqueue.get()
# def terminate(self):
# self.runner.terminate()

View file

@ -1,6 +1,6 @@
from copy import deepcopy
from ctypes import c_uint
import torch.multiprocessing as mp
from ctypes import *
import rltorch.log
def envrun(actor, env, episode_num, config, runcondition, iterations=1, memoryqueue=None, logqueue=None, name=""):

View file

@ -1,7 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
from .Network import Network
from copy import deepcopy
# [TODO] Should we torch.no_grad the __call__?
# What if we want to sometimes do gradient descent as well?
@ -64,7 +65,11 @@ class ESNetwork(Network):
white_noise_dict = {}
noise_dict = {}
for key in model_dict.keys():
white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
white_noise_dict[key] = torch.randn(
self.population_size,
*model_dict[key].shape,
device=self.device
)
noise_dict[key] = self.sigma * white_noise_dict[key]
return white_noise_dict, noise_dict
@ -96,7 +101,10 @@ class ESNetwork(Network):
candidate_solutions = self._generate_candidate_solutions(noise_dict)
## Calculate fitness then mean shift, scale
fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions], device = self.device)
fitness_values = torch.tensor(
[self.fitness(x, *args) for x in candidate_solutions],
device=self.device
)
if self.logger is not None:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
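Once the fitness values are mean-shifted and scaled, evolution strategies estimates the gradient of expected fitness as the noise-weighted average (1/(nσ)) Σ_i F_i ε_i, which can then be applied through an ordinary optimizer step. A minimal single-parameter sketch of that estimator, under the same population/σ convention used here:

import torch

def es_gradient_estimate(fitness_values, white_noise, sigma):
    # fitness_values: (population,); white_noise: (population, *param_shape)
    population = fitness_values.shape[0]
    weighted = fitness_values.view(population, *([1] * (white_noise.dim() - 1))) * white_noise
    # Estimate of d E[F(theta + sigma * eps)] / d theta
    return weighted.mean(0) / sigma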

View file

@ -1,9 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
from .Network import Network
from copy import deepcopy
import torch.multiprocessing as mp
import functools
from .Network import Network
class fn_copy:
def __init__(self, fn, args):
@ -27,7 +26,8 @@ class ESNetworkMP(Network):
self.sigma = sigma
assert self.sigma > 0
mp_ctx = mp.get_context("spawn")
self.pool = mp_ctx.Pool(processes=2) #[TODO] Probably should make number of processes a config variable
#[TODO] Probably should make number of processes a config variable
self.pool = mp_ctx.Pool(processes=2)
# We're not going to be calculating gradients in the traditional way
# So there's no need to waste computation time keeping track
@ -42,7 +42,11 @@ class ESNetworkMP(Network):
white_noise_dict = {}
noise_dict = {}
for key in model_dict.keys():
white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
white_noise_dict[key] = torch.randn(
self.population_size,
*model_dict[key].shape,
device=self.device
)
noise_dict[key] = self.sigma * white_noise_dict[key]
return white_noise_dict, noise_dict
@ -67,7 +71,10 @@ class ESNetworkMP(Network):
candidate_solutions = self._generate_candidate_solutions(noise_dict)
## Calculate fitness then mean shift, scale
fitness_values = torch.tensor(list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)), device = self.device)
fitness_values = torch.tensor(
list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)),
device=self.device
)
if self.logger is not None:
self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())

View file

@ -20,7 +20,11 @@ class Network:
def __init__(self, model, optimizer, config, device=None, logger=None, name=""):
self.model = model
if 'weight_decay' in config:
self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay'])
self.optimizer = optimizer(
model.parameters(),
lr=config['learning_rate'],
weight_decay=config['weight_decay']
)
else:
self.optimizer = optimizer(model.parameters(), lr=config['learning_rate'])
self.logger = logger

View file

@ -1,7 +1,8 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
# This class utilizes this property of the normal distribution
# N(mu, sigma) = mu + sigma * N(0, 1)
@ -10,7 +11,6 @@ class NoisyLinear(nn.Linear):
Draws the parameters of nn.Linear from a normal distribution.
The parameters of the normal distribution are registered as
learnable parameters in the neural network.
Parameters
----------
in_features
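The reparameterization noted above is what makes the noise trainable: sampling W ~ N(μ, σ) as μ + σ·ε keeps μ and σ as ordinary learnable tensors while ε is redrawn each forward pass. A bare-bones sketch of that idea (independent noise per weight, bias omitted), not the exact rltorch layer:

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyNoisyLinear(nn.Module):
    def __init__(self, in_features, out_features, sigma_init=0.017):
        super().__init__()
        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features).uniform_(-0.1, 0.1))
        self.weight_sigma = nn.Parameter(torch.full((out_features, in_features), sigma_init))
    def forward(self, x):
        # W = mu + sigma * eps, with eps ~ N(0, 1) redrawn every call
        eps = torch.randn_like(self.weight_mu)
        return F.linear(x, self.weight_mu + self.weight_sigma * eps)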

View file

@ -37,7 +37,8 @@ class TargetNetwork:
Parameters
----------
tau : number
A number between 0-1 which indicates the proportion of the originator and clone in the new clone.
A number between 0-1 which indicates
the proportion of the originator and clone in the new clone.
"""
assert isinstance(tau, float)
assert 0.0 < tau <= 1.0
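partial_sync is Polyak averaging: every target parameter moves a fraction τ of the way toward the online network, target ← τ·online + (1 − τ)·target. A minimal sketch of that update over two modules with matching parameter order:

import torch

@torch.no_grad()
def partial_sync(target_model, online_model, tau):
    assert 0.0 < tau <= 1.0
    for target_param, online_param in zip(target_model.parameters(), online_model.parameters()):
        # target <- tau * online + (1 - tau) * target
        target_param.mul_(1.0 - tau).add_(tau * online_param)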

View file

@ -1,5 +1,5 @@
from .ESNetwork import *
from .ESNetworkMP import *
from .Network import *
from .NoisyLinear import *
from .TargetNetwork import *
from .ESNetwork import ESNetwork
from .ESNetworkMP import ESNetworkMP
from .Network import Network
from .NoisyLinear import NoisyLinear
from .TargetNetwork import TargetNetwork

View file

@ -36,4 +36,3 @@ class ExponentialScheduler(Scheduler):
return self.initial_value * (self.base ** (self.current_iteration - 1))
else:
return self.end_value

View file

@ -7,4 +7,4 @@ class Scheduler():
def __iter__(self):
return self
def __next__(self):
raise NotImplementedError("Scheduler does not have it's function to create a value implemented")
raise NotImplementedError("__next__ not implemented in Scheduler")