PEP8 Conformance

This commit is contained in:
parent 9b81188a77
commit 8fa4691511

29 changed files with 652 additions and 755 deletions
@@ -1,7 +1,7 @@
from random import randrange
import torch

class ArgMaxSelector:
    def __init__(self, model, action_size, device = None):
    def __init__(self, model, action_size, device=None):
        self.model = model
        self.action_size = action_size
        self.device = device

@@ -12,7 +12,8 @@ class ArgMaxSelector:
            if self.device is not None:
                state = state.to(self.device)
            action_values = self.model(state).squeeze(0)
            action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item()
            action = self.random_act() if (action_values[0] == action_values).all() \
                        else action_values.argmax().item()
        return action
    def act(self, state):
        return self.best_act(state)
@@ -1,9 +1,10 @@
from .ArgMaxSelector import ArgMaxSelector
import numpy as np
import collections
import numpy as np
from .ArgMaxSelector import ArgMaxSelector

class EpsilonGreedySelector(ArgMaxSelector):
    def __init__(self, model, action_size, device = None, epsilon = 0.1):
        super(EpsilonGreedySelector, self).__init__(model, action_size, device = device)
    def __init__(self, model, action_size, device=None, epsilon=0.1):
        super(EpsilonGreedySelector, self).__init__(model, action_size, device=device)
        self.epsilon = epsilon
    # random_act is already implemented in ArgMaxSelector
    # best_act is already implemented in ArgMaxSelector
@@ -1,8 +1,9 @@
from .ArgMaxSelector import ArgMaxSelector
import torch
from .ArgMaxSelector import ArgMaxSelector

class IdentitySelector(ArgMaxSelector):
    def __init__(self, model, action_size, device = None):
        super(IdentitySelector, self).__init__(model, action_size, device = device)
    def __init__(self, model, action_size, device=None):
        super(IdentitySelector, self).__init__(model, action_size, device=device)
    # random_act is already implemented in ArgMaxSelector
    def best_act(self, state):
        with torch.no_grad():
@@ -1,10 +1,10 @@
from random import randrange
class RandomSelector():
class RandomSelector:
    def __init__(self, action_size):
        self.action_size = action_size
    def random_act(self):
        return randrange(action_size)
    def best_act(self, state):
        return randrange(self.action_size)
    def best_act(self, _):
        return self.random_act()
    def act(self, state):
    def act(self, _):
        return self.random_act()
@@ -1,22 +1,19 @@
from random import randrange
import torch
from torch.distributions import Categorical
import rltorch
from rltorch.action_selector import ArgMaxSelector

from .ArgMaxSelector import ArgMaxSelector
from ..memory.EpisodeMemory import EpisodeMemory
class StochasticSelector(ArgMaxSelector):
    def __init__(self, model, action_size, memory = None, device = None):
        super(StochasticSelector, self).__init__(model, action_size, device = device)
    def __init__(self, model, action_size, memory=None, device=None):
        super(StochasticSelector, self).__init__(model, action_size, device=device)
        self.model = model
        self.action_size = action_size
        self.device = device
        self.memory = memory
    def best_act(self, state, log_prob = True):
    def best_act(self, state, log_prob=True):
        if self.device is not None:
            state = state.to(self.device)
        action_probabilities = self.model(state)
        distribution = Categorical(action_probabilities)
        action = distribution.sample()
        if log_prob and isinstance(self.memory, rltorch.memory.EpisodeMemory):
        if log_prob and isinstance(self.memory, EpisodeMemory):
            self.memory.append_log_probs(distribution.log_prob(action))
        return action.item()
@@ -1,5 +1,5 @@
from .ArgMaxSelector import *
from .EpsilonGreedySelector import *
from .IdentitySelector import *
from .RandomSelector import *
from .StochasticSelector import *
from .ArgMaxSelector import ArgMaxSelector
from .EpsilonGreedySelector import EpsilonGreedySelector
from .IdentitySelector import IdentitySelector
from .RandomSelector import RandomSelector
from .StochasticSelector import StochasticSelector
@@ -2,11 +2,9 @@ from copy import deepcopy
import numpy as np
import torch
import torch.nn.functional as F
import rltorch
import rltorch.memory as M

class A2CSingleAgent:
  def __init__(self, policy_net, value_net, memory, config, logger = None):
    def __init__(self, policy_net, value_net, memory, config, logger=None):
        self.policy_net = policy_net
        self.value_net = value_net
        self.memory = memory

@@ -16,7 +14,11 @@ class A2CSingleAgent:
    def _discount_rewards(self, rewards):
        gammas = torch.ones_like(rewards)
        if len(rewards) > 1:
      gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
            discount_tensor = torch.tensor(self.config['discount_rate'])
            gammas[1:] = torch.cumprod(
                discount_tensor.repeat(len(rewards) - 1),
                dim=0
            )
        return gammas * rewards

    # This function is currently not used since the performance gains haven't been shown

@@ -29,18 +31,18 @@ class A2CSingleAgent:
            values = self.value_net(states).squeeze(1)

        generalized_advantages = torch.zeros_like(rewards)
    for i in range(len(generalized_advantages)):
        discount_tensor = torch.tensor(self.config['discount_rate']) * tradeoff
        for i, _ in enumerate(generalized_advantages):
            weights = torch.ones_like(rewards[i:])
            if i != len(generalized_advantages) - 1:
        weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0)
                weights[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - i - 1), dim=0)
            generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum()

        return generalized_advantages

    def learn(self):
        episode_batch = self.memory.recall()
    state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
        state_batch, _, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)

        # Send batches to the appropriate device
        state_batch = torch.cat(state_batch).to(self.value_net.device)

@@ -50,8 +52,10 @@ class A2CSingleAgent:
        log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)

        ## Value Loss
    # In A2C, the value loss is the difference between the discounted reward and the value from the first state
    # The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
        # In A2C, the value loss is the difference between the discounted reward
        # and the value from the first state.
        # The value of the first state is supposed to tell us
        # the expected reward from the current policy of the whole episode
        discounted_reward = self._discount_rewards(reward_batch)
        observed_value = discounted_reward.sum()
        value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0]))

@@ -86,5 +90,3 @@ class A2CSingleAgent:

        # Memory under the old policy is not needed for future training
        self.memory.clear()
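For readers unfamiliar with the torch.cumprod trick used by _discount_rewards above, here is a small standalone sketch (my own example, not part of this commit; the discount rate is a hypothetical value of config['discount_rate']):

# Standalone sketch (not from the commit): _discount_rewards builds
# gammas = [1, g, g^2, ...] so that gammas * rewards gives per-step discounted rewards.
import torch

rewards = torch.tensor([1.0, 1.0, 1.0, 1.0])
discount_rate = 0.99  # hypothetical config['discount_rate']

gammas = torch.ones_like(rewards)
if len(rewards) > 1:
    discount_tensor = torch.tensor(discount_rate)
    # repeat -> [g, g, g]; cumprod along dim 0 -> [g, g^2, g^3]; index 0 stays 1
    gammas[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - 1), dim=0)

print(gammas)            # tensor([1.0000, 0.9900, 0.9801, 0.9703])
print(gammas * rewards)  # the discounted reward sequence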
@@ -1,13 +1,11 @@
import collections
from copy import deepcopy
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path

class DQNAgent:
    def __init__(self, net , memory, config, target_net = None, logger = None):
    def __init__(self, net, memory, config, target_net=None, logger=None):
        self.net = net
        self.target_net = target_net
        self.memory = memory

@@ -20,16 +18,16 @@ class DQNAgent:
        self.net.model.to(self.net.device)
        self.target_net.sync()

    def learn(self, logger = None):
    def learn(self, logger=None):
        if len(self.memory) < self.config['batch_size']:
            return

        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            weight_importance = self.config['prioritized_replay_weight_importance']
            # If it's a scheduler then get the next value by calling next, otherwise just use its value
            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
            minibatch = self.memory.sample(self.config['batch_size'], beta = beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
            minibatch = self.memory.sample(self.config['batch_size'], beta=beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
        else:
            minibatch = self.memory.sample(self.config['batch_size'])
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)

@@ -49,7 +47,7 @@ class DQNAgent:
            # and the regular net to select the action
            # That way we decouple the value and action selecting processes (DOUBLE DQN)
            not_done_size = not_done_batch.sum()
            next_state_values = torch.zeros_like(state_values, device = self.net.device)
            next_state_values = torch.zeros_like(state_values, device=self.net.device)
            if self.target_net is not None:
                next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch])
                next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1)

@@ -57,15 +55,15 @@ class DQNAgent:
                next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch])
                next_best_action = next_state_values[not_done_batch].argmax(1)

            best_next_state_value = torch.zeros(self.config['batch_size'], device = self.net.device)
            best_next_state_value = torch.zeros(self.config['batch_size'], device=self.net.device)
            best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)

        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)

        # If we're sampling by TD error, multiply loss by an importance weight which helps decrease overfitting
        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            # loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.smooth_l1_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
             loss = (torch.as_tensor(importance_weights, device = self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
            loss = (torch.as_tensor(importance_weights, device=self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
        else:
            # loss = F.smooth_l1_loss(obtained_values, expected_values)
            loss = F.mse_loss(obtained_values, expected_values)

@@ -85,8 +83,6 @@ class DQNAgent:
                self.target_net.sync()

        # If we're sampling by TD error, readjust the weights of the experiences
        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            td_error = (obtained_values - expected_values).detach().abs()
            self.memory.update_priorities(batch_indexes, td_error)
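The importance-weighted loss line above is dense; the following minimal sketch (my own example with hypothetical weights, not code from this commit) shows the same idea in isolation:

# Sketch: with prioritized replay, the per-sample squared errors are scaled by the
# sampler's importance weights before the mean, correcting the bias of non-uniform sampling.
import torch

obtained_values = torch.tensor([[1.0], [2.0], [3.0]])
expected_values = torch.tensor([[1.5], [2.0], [2.0]])
importance_weights = [0.5, 1.0, 0.25]  # hypothetical weights returned by the sampler

weights = torch.as_tensor(importance_weights)
loss = (weights * ((obtained_values - expected_values) ** 2).squeeze(1)).mean()
print(loss)  # tensor(0.1250) -- the weighted mean squared error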
@@ -1,14 +1,12 @@
import collections
from copy import deepcopy
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path
from rltorch.action_selector import ArgMaxSelector

class DQfDAgent:
    def __init__(self, net, memory, config, target_net = None, logger = None):
    def __init__(self, net, memory, config, target_net=None, logger=None):
        self.net = net
        self.target_net = target_net
        self.memory = memory

@@ -21,7 +19,7 @@ class DQfDAgent:
        self.net.model.to(self.net.device)
        self.target_net.sync()

    def learn(self, logger = None):
    def learn(self, logger=None):
        if len(self.memory) < self.config['batch_size']:
            return

@@ -32,29 +30,19 @@ class DQfDAgent:
            batch_size = self.config['batch_size']
            steps = None

        if isinstance(self.memory, M.DQfDMemory):
        weight_importance = self.config['prioritized_replay_weight_importance']
        # If it's a scheduler then get the next value by calling next, otherwise just use its value
            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
        beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) \
            else weight_importance

        # Check to see if we are doing N-Step DQN
        if steps is not None:
            minibatch = self.memory.sample_n_steps(batch_size, steps, beta)
        else:
                minibatch = self.memory.sample(batch_size, beta = beta)
            minibatch = self.memory.sample(batch_size, beta=beta)

        # Process batch
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)

        else:
            # Check to see if we're doing N-Step DQN
            if steps is not None:
                minibatch = self.memory.sample_n_steps(batch_size, steps)
            else:
                minibatch = self.memory.sample(batch_size)

            # Process batch
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, batch_indexes = M.zip_batch(minibatch, want_indices = True)
        state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)

        batch_index_tensors = torch.tensor(batch_indexes)
        demo_mask = batch_index_tensors < self.memory.demo_position

@@ -75,7 +63,7 @@ class DQfDAgent:
            # and the regular net to select the action
            # That way we decouple the value and action selecting processes (DOUBLE DQN)
            not_done_size = not_done_batch.sum()
            next_state_values = torch.zeros_like(state_values, device = self.net.device)
            next_state_values = torch.zeros_like(state_values, device=self.net.device)
            if self.target_net is not None:
                next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch])
                next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1)

@@ -83,14 +71,14 @@ class DQfDAgent:
                next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch])
                next_best_action = next_state_values[not_done_batch].argmax(1)

            best_next_state_value = torch.zeros(batch_size, device = self.net.device)
            best_next_state_value = torch.zeros(batch_size, device=self.net.device)
            best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)

        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
        expected_values = (reward_batch + (batch_size * best_next_state_value)).unsqueeze(1)

        # N-Step DQN Loss
        # num_steps captures how many steps actually exist before the end of episode
        if steps != None:
        if steps is not None:
            expected_n_step_values = []
            with torch.no_grad():
                for i in range(0, len(state_batch), steps):

@@ -127,7 +115,7 @@ class DQfDAgent:
            l = torch.ones_like(state_values[demo_mask])
            expert_actions = action_batch[demo_mask]
            # l(s, a) is zero for every action the expert doesn't take
            for i,a in zip(range(len(l)), expert_actions):
            for (i, _), a in zip(enumerate(l), expert_actions):
                l[i].fill_(0.8) # According to paper
                l[i, a] = 0
            if self.target_net is not None:

@@ -148,26 +136,17 @@ class DQfDAgent:

        # Since dqn_loss and demo_loss are different sizes, the reduction has to happen before they are combined
        if isinstance(self.memory, M.DQfDMemory):
            dqn_loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.mse_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
        else:
            dqn_loss = F.mse_loss(obtained_values, expected_values)
        dqn_loss = (torch.as_tensor(importance_weights, device=self.net.device) * F.mse_loss(obtained_values, expected_values, reduction='none').squeeze(1)).mean()

        if steps != None:
            if isinstance(self.memory, M.DQfDMemory):
                dqn_n_step_loss =  (torch.as_tensor(importance_weights[::steps], device = self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none')).mean()
        if steps is not None:
            dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device=self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction='none')).mean()
            else:
                dqn_n_step_loss =  F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none').mean()
        else:
            dqn_n_step_loss = torch.tensor(0, device = self.net.device)
            dqn_n_step_loss = torch.tensor(0, device=self.net.device)

        if demo_mask.sum() > 0:
            if isinstance(self.memory, M.DQfDMemory):
                demo_loss = (torch.as_tensor(importance_weights, device = self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1)).mean()
            demo_loss = (torch.as_tensor(importance_weights, device=self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction='none').squeeze(1)).mean()
            else:
                demo_loss = F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1).mean()
        else:
            demo_loss = 0.
            demo_loss = 0
        loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss

        if self.logger is not None:
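The margin term l(s, a) built in the loop above is easier to see in isolation. The following is my own illustrative sketch with hypothetical values (not code from this commit): every non-expert action gets a fixed margin of 0.8 added to its Q-value, so the expert's action must beat the rest by that margin before the demonstration loss goes to zero.

# Sketch (assumed/hypothetical values): the large-margin term used for the demo loss.
import torch

state_values = torch.tensor([[0.1, 0.4, 0.2],
                             [0.3, 0.0, 0.5]])   # hypothetical Q-values for demo states
expert_actions = torch.tensor([1, 2])            # actions the expert took

l = torch.full_like(state_values, 0.8)           # margin for every action...
for i, a in enumerate(expert_actions):
    l[i, a] = 0.0                                # ...except the expert's action

expert_value = state_values.gather(1, expert_actions.unsqueeze(1))
margin_term = (state_values + l).max(1)[0].unsqueeze(1)
print(margin_term - expert_value)                # how far each margin is violated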
@@ -1,15 +1,11 @@
from copy import deepcopy
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
import collections
import random

class PPOAgent:
  def __init__(self, policy_net, value_net, memory, config, logger = None):
    def __init__(self, policy_net, value_net, memory, config, logger=None):
        self.policy_net = policy_net
        self.old_policy_net = rltorch.network.TargetNetwork(policy_net)
        self.value_net = value_net

@@ -20,10 +16,9 @@ class PPOAgent:
    def _discount_rewards(self, rewards):
        gammas = torch.ones_like(rewards)
        if len(rewards) > 1:
      gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
            gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim=0)
        return gammas * rewards

    def learn(self):
        episode_batch = self.memory.recall()
        state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)

@@ -61,21 +56,17 @@ class PPOAgent:
        # For PPO we want to stay within a certain ratio of the old policy
        policy_ratio = torch.exp(log_prob_batch - old_log_probs) # Equivalent to (log_prob / old_log_prob)
        policy_loss1 = policy_ratio * advantages
    policy_loss2 = policy_ratio.clamp(min = 0.8, max = 1.2) * advantages # From original paper
        policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages # From original paper
        policy_loss = -torch.min(policy_loss1, policy_loss2).sum()

        if self.logger is not None:
            self.logger.append("Loss/Policy", policy_loss.item())
            self.logger.append("Loss/Value", value_loss.item())

        self.old_policy_net.sync()
        self.policy_net.zero_grad()
        policy_loss.backward()
        self.policy_net.step()

        # Memory under the old policy is not needed for future training
        self.memory.clear()
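The clamp(min=0.8, max=1.2) line above is PPO's clipped surrogate objective. Here is a small self-contained sketch with my own illustrative values (not taken from the commit) of what that computation does:

# Sketch: the new/old probability ratio is clipped to [0.8, 1.2], and the pessimistic
# (element-wise minimum) of the clipped and unclipped terms is summed and negated.
import torch

log_prob_batch = torch.tensor([-0.9, -1.2, -0.3])  # hypothetical new log-probs
old_log_probs = torch.tensor([-1.0, -1.0, -1.0])   # hypothetical old log-probs
advantages = torch.tensor([1.0, -0.5, 2.0])        # hypothetical advantages

policy_ratio = torch.exp(log_prob_batch - old_log_probs)
policy_loss1 = policy_ratio * advantages
policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages
policy_loss = -torch.min(policy_loss1, policy_loss2).sum()
print(policy_loss)  # the clipped surrogate loss for this toy batch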
@@ -2,16 +2,17 @@ from copy import deepcopy
import collections
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
import torch.nn.functional as F

# Q-Evolutionary Policy Agent
# Maximizes the policy with respect to the Q-Value function.
# Since the function is non-differentiable, depends on the Evolutionary Strategy algorithm
class QEPAgent:
    def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None, entropy_importance = 0, policy_skip = 4, after_value_train = None):
    def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4):
        self.policy_net = policy_net
        assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP)
        self.policy_net.fitness = self.fitness

@@ -22,7 +23,6 @@ class QEPAgent:
        self.logger = logger
        self.policy_skip = policy_skip
        self.entropy_importance = entropy_importance
        self.after_value_train = after_value_train

    def save(self, file_location):
        torch.save({

@@ -42,10 +42,8 @@ class QEPAgent:
        batch_size = len(state_batch)
        with torch.no_grad():
            action_probabilities = policy_net(state_batch)

        action_size = action_probabilities.shape[1]
        distributions = list(map(Categorical, action_probabilities))

        actions = torch.stack([d.sample() for d in distributions])

        with torch.no_grad():

@@ -54,31 +52,31 @@ class QEPAgent:
        # Weird hacky solution where in multiprocess, it sometimes spits out nans
        # So have it try again
        while torch.isnan(state_values).any():
            print("NAN DETECTED")
            with torch.no_grad():
                state_values = value_net(state_batch)

        obtained_values = state_values.gather(1, actions.view(batch_size, 1)).squeeze(1)

        obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1)
        # return -obtained_values.mean().item()
        entropy_importance = 0 # Entropy accounting for 1% of loss seems to work well
        entropy_importance = next(self.entropy_importance) if isinstance(self.entropy_importance, collections.Iterable) else self.entropy_importance
        value_importance = 1 - entropy_importance

        # entropy_loss = (action_probabilities * torch.log2(action_probabilities)).sum(1) # Standard entropy loss from information theory
        entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device = state_batch.device).repeat(batch_size, action_size)).abs().sum(1)
        entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device=state_batch.device).repeat(len(state_batch), action_size)).abs().sum(1)

        return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item()

    def learn(self, logger = None):
    def learn(self, logger=None):
        if len(self.memory) < self.config['batch_size']:
            return

        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            weight_importance = self.config['prioritized_replay_weight_importance']
            # If it's a scheduler then get the next value by calling next, otherwise just use its value
            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
            minibatch = self.memory.sample(self.config['batch_size'], beta = beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
            minibatch = self.memory.sample(self.config['batch_size'], beta=beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
        else:
            minibatch = self.memory.sample(self.config['batch_size'])
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)

@@ -98,7 +96,7 @@ class QEPAgent:
            # and the regular net to select the action
            # That way we decouple the value and action selecting processes (DOUBLE DQN)
            not_done_size = not_done_batch.sum()
            next_state_values = torch.zeros_like(state_values, device = self.value_net.device)
            next_state_values = torch.zeros_like(state_values, device=self.value_net.device)
            if self.target_value_net is not None:
                next_state_values[not_done_batch] = self.target_value_net(next_state_batch[not_done_batch])
                next_best_action = self.value_net(next_state_batch[not_done_batch]).argmax(1)

@@ -106,13 +104,13 @@ class QEPAgent:
                next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
                next_best_action = next_state_values[not_done_batch].argmax(1)

            best_next_state_value = torch.zeros(self.config['batch_size'], device = self.value_net.device)
            best_next_state_value = torch.zeros(self.config['batch_size'], device=self.value_net.device)
            best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)

        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)

        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
            value_loss = (torch.as_tensor(importance_weights, device = self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            value_loss = (torch.as_tensor(importance_weights, device=self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
        else:
            value_loss = F.mse_loss(obtained_values, expected_values)

@@ -124,16 +122,13 @@ class QEPAgent:
        self.value_net.clamp_gradients()
        self.value_net.step()

        if callable(self.after_value_train):
            self.after_value_train()

        if self.target_value_net is not None:
            if 'target_sync_tau' in self.config:
                self.target_value_net.partial_sync(self.config['target_sync_tau'])
            else:
                self.target_value_net.sync()

        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            td_error = (obtained_values - expected_values).detach().abs()
            self.memory.update_priorities(batch_indexes, td_error)

@@ -141,7 +136,8 @@ class QEPAgent:
        if self.policy_skip > 0:
            self.policy_skip -= 1
            return
        self.policy_skip = self.config['policy_skip']
        self.policy_skip = 4

        if self.target_value_net is not None:
            self.policy_net.calc_gradients(self.target_value_net, state_batch)
        else:
@@ -1,10 +1,10 @@
import rltorch
from copy import deepcopy
import torch
import numpy as np
import torch
import rltorch

class REINFORCEAgent:
  def __init__(self, net , memory, config, target_net = None, logger = None):
    def __init__(self, net, memory, config, target_net=None, logger=None):
        self.net = net
        if not isinstance(memory, rltorch.memory.EpisodeMemory):
            raise ValueError("Memory must be of instance EpisodeMemory")

@@ -23,14 +23,14 @@ class REINFORCEAgent:
        for i in range(len(rewards)):
            gammas = torch.ones_like(rewards[i:])
            if i != len(rewards) - 1:
        gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim = 0)
                gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim=0)
            advantages = rewards[i:] - baseline
            shaped_rewards[i] = (gammas * advantages).sum()
        return shaped_rewards

    def learn(self):
        episode_batch = self.memory.recall()
    state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
        _, _, reward_batch, _, _, log_prob_batch = zip(*episode_batch)

        # Calculate discounted rewards to place more importance on recent rewards
        shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch))
@@ -1,6 +1,6 @@
from .A2CSingleAgent import *
from .DQNAgent import *
from .DQfDAgent import *
from .PPOAgent import *
from .QEPAgent import *
from .REINFORCEAgent import *
from .A2CSingleAgent import A2CSingleAgent
from .DQNAgent import DQNAgent
from .DQfDAgent import DQfDAgent
from .PPOAgent import PPOAgent
from .QEPAgent import QEPAgent
from .REINFORCEAgent import REINFORCEAgent
rltorch/env/simulate.py (12 changes, vendored)

@@ -1,8 +1,8 @@
from copy import deepcopy
import rltorch
import time
import rltorch

def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger = None, name = "", render = False):
def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False):
    for episode in range(total_episodes):
        state = env.reset()
        done = False

@@ -27,8 +27,8 @@ def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger
            logger.append(name + '/EpisodeReward', episode_reward)


class EnvironmentRunSync():
  def __init__(self, env, actor, config, memory = None, logwriter = None, name = "", render = False):
class EnvironmentRunSync:
    def __init__(self, env, actor, config, memory=None, logwriter=None, name="", render=False):
        self.env = env
        self.name = name
        self.actor = actor

@@ -72,8 +72,8 @@ class EnvironmentRunSync():
        self.last_state = state


class EnvironmentEpisodeSync():
  def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""):
class EnvironmentEpisodeSync:
    def __init__(self, env, actor, config, memory=None, logwriter=None, name=""):
        self.env = env
        self.name = name
        self.actor = actor
rltorch/env/wrappers.py (20 changes, vendored)

@@ -1,8 +1,8 @@
from collections import deque
import gym
import torch
from gym import spaces
import cv2
from collections import deque
import numpy as np

class EpisodicLifeEnv(gym.Wrapper):

@@ -170,7 +170,12 @@ class FrameStack(gym.Wrapper):
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
    self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(shp[:-1] + (shp[-1] * k,)),
            dtype=env.observation_space.dtype
        )

    def reset(self):
        ob = self.env.reset()

@@ -189,7 +194,7 @@ class FrameStack(gym.Wrapper):
        return torch.cat(list(self.frames)).unsqueeze(0)

class ProcessFrame(gym.Wrapper):
  def __init__(self, env, resize_shape = None, crop_bounds = None, grayscale = False):
    def __init__(self, env, resize_shape=None, crop_bounds=None, grayscale=False):
        gym.Wrapper.__init__(self, env)
        self.resize_shape = resize_shape
        self.crop_bounds = crop_bounds

@@ -207,14 +212,16 @@ class ProcessFrame(gym.Wrapper):
        if self.grayscale:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        if self.crop_bounds is not None and len(self.crop_bounds) == 4:
        frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]]
            frame = frame[
                self.crop_bounds[0]:self.crop_bounds[1],
                self.crop_bounds[2]:self.crop_bounds[3]
            ]
        if self.resize_shape is not None and len(self.resize_shape) == 2:
            frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA)
        # Normalize
        frame = frame / 255
        return frame


# Turns observations into torch tensors
# Adds an additional dimension that's supposed to represent the batch dim
class TorchWrap(gym.Wrapper):

@@ -233,8 +240,6 @@ class TorchWrap(gym.Wrapper):
        frame = torch.from_numpy(frame).unsqueeze(0).float()
        return frame


class ProcessFrame84(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)

@@ -256,4 +261,3 @@ class ProcessFrame84(gym.ObservationWrapper):
        x_t = resized_screen[18:102, :]
        x_t = np.reshape(x_t, [84, 84])
        return x_t.astype(np.uint8)
@@ -1,13 +1,13 @@
from .PrioritizedReplayMemory import PrioritizedReplayMemory
from collections import namedtuple
import numpy as np
from .PrioritizedReplayMemory import PrioritizedReplayMemory

Transition = namedtuple('Transition',
    ('state', 'action', 'reward', 'next_state', 'done'))


class DQfDMemory(PrioritizedReplayMemory):
    def __init__(self, capacity, alpha, max_demo = -1):
    def __init__(self, capacity, alpha, max_demo=-1):
        assert max_demo <= capacity
        super().__init__(capacity, alpha)
        self.demo_position = 0

@@ -47,7 +47,8 @@ class DQfDMemory(PrioritizedReplayMemory):
        idxes = self._sample_proportional(sample_size)
        step_idxes = []
        for i in idxes:
            # If the interval of experiences fall between demonstration and obtained, move it over to the demonstration half
            # If the interval of experiences fall between demonstration and obtained,
            # move it over to the demonstration half
            if i < self.demo_position and i + steps > self.demo_position:
                diff = i + steps - self.demo_position
                step_idxes += range(i - diff, i + steps - diff)
@@ -1,6 +1,4 @@
import random
from collections import namedtuple
import torch
Transition = namedtuple('Transition',
    ('state', 'action', 'reward', 'next_state', 'done'))
@@ -1,10 +1,9 @@
# From OpenAI Baselines https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py

from .ReplayMemory import ReplayMemory
import operator
import random
import numpy as np
from numba import jit
from .ReplayMemory import ReplayMemory

class SegmentTree(object):
    def __init__(self, capacity, operation, neutral_element):

@@ -34,7 +33,7 @@ class SegmentTree(object):
        self._value = [neutral_element for _ in range(2 * capacity)]
        self._operation = operation

    @jit(forceobj = True)
    @jit(forceobj=True)
    def _reduce_helper(self, start, end, node, node_start, node_end):
        if start == node_start and end == node_end:
            return self._value[node]

@@ -50,7 +49,7 @@ class SegmentTree(object):
                    self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
                )

    @jit(forceobj = True)
    @jit(forceobj=True)
    def reduce(self, start=0, end=None):
        """Returns result of applying `self.operation`
        to a contiguous subsequence of the array.

@@ -73,7 +72,7 @@ class SegmentTree(object):
        end -= 1
        return self._reduce_helper(start, end, 1, 0, self._capacity - 1)

    @jit(forceobj = True)
    @jit(forceobj=True)
    def __setitem__(self, idx, val):
        # index of the leaf
        idx += self._capacity

@@ -86,7 +85,7 @@ class SegmentTree(object):
            )
            idx //= 2

    @jit(forceobj = True)
    @jit(forceobj=True)
    def __getitem__(self, idx):
        assert 0 <= idx < self._capacity
        return self._value[self._capacity + idx]

@@ -100,12 +99,12 @@ class SumSegmentTree(SegmentTree):
            neutral_element=0.0
        )

    @jit(forceobj = True)
    @jit(forceobj=True)
    def sum(self, start=0, end=None):
        """Returns arr[start] + ... + arr[end]"""
        return super(SumSegmentTree, self).reduce(start, end)

    @jit(forceobj = True, parallel = True)
    @jit(forceobj=True, parallel=True)
    def find_prefixsum_idx(self, prefixsum):
        """Find the highest index `i` in the array such that
            sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum

@@ -140,7 +139,7 @@ class MinSegmentTree(SegmentTree):
            neutral_element=float('inf')
        )

    @jit(forceobj = True)
    @jit(forceobj=True)
    def min(self, start=0, end=None):
        """Returns min(arr[start], ...,  arr[end])"""
        return super(MinSegmentTree, self).reduce(start, end)

@@ -185,7 +184,7 @@ class PrioritizedReplayMemory(ReplayMemory):
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    @jit(forceobj = True)
    @jit(forceobj=True)
    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self.memory) - 1)

@@ -294,7 +293,7 @@ class PrioritizedReplayMemory(ReplayMemory):
        batch = list(zip(*encoded_sample, weights, step_idxes))
        return batch

    @jit(forceobj = True)
    @jit(forceobj=True)
    def update_priorities(self, idxes, priorities):
        """
        Update priorities of sampled transitions.

@@ -320,4 +319,3 @@ class PrioritizedReplayMemory(ReplayMemory):
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)
@@ -106,11 +106,9 @@ class ReplayMemory(object):
    def __reversed__(self):
        return reversed(self.memory)

def zip_batch(minibatch, priority = False, want_indices = False):
def zip_batch(minibatch, priority=False):
    if priority:
        state_batch, action_batch, reward_batch, next_state_batch, done_batch, weights, indexes = zip(*minibatch)
    elif want_indices:
        state_batch, action_batch, reward_batch, next_state_batch, done_batch, indexes = zip(*minibatch)
    else:
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)

@@ -122,7 +120,5 @@ def zip_batch(minibatch, priority = False, want_indices = False):

    if priority:
        return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, weights, indexes
    elif want_indices:
        return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, indexes
    else:
        return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch
@@ -5,7 +5,7 @@ from copy import deepcopy
import torch.multiprocessing as mp

class EnvironmentEpisode(mp.Process):
  def __init__(self, env, actor, config, logger = None, name = ""):
    def __init__(self, env, actor, config, logger=None, name=""):
        super(EnvironmentEpisode, self).__init__()
        self.env = env
        self.actor = actor
@@ -14,7 +14,7 @@ class EnvironmentEpisode(mp.Process):
        self.name = name
        self.episode_num = 1

  def run(self, printstat = False, memory = None):
    def run(self, printstat=False, memory=None):
        state = self.env.reset()
        done = False
        episode_reward = 0
@@ -34,86 +34,3 @@ class EnvironmentEpisode(mp.Process):
            self.logger.append(self.name + '/EpisodeReward', episode_reward)

        self.episode_num += 1

# from copy import deepcopy
# import torch.multiprocessing as mp
# from ctypes import *
# import rltorch.log

# def envepisode(actor, env, episode_num, config, runcondition, memoryqueue = None, logqueue = None, name = ""):
#   # Wait for signal to start running through the environment
#   while runcondition.wait():
#     # Start a logger to log the rewards
#     logger = rltorch.log.Logger()
#     state = env.reset()
#     episode_reward = 0
#     done = False
#     while not done:
#       action = actor.act(state)
#       next_state, reward, done, _ = env.step(action)

#       episode_reward += reward
#       if memoryqueue is not None:
#         memoryqueue.put((state, action, reward, next_state, done))

#       state = next_state

#       if done:
#         with episode_num.get_lock():
#           if episode_num.value % config['print_stat_n_eps'] == 0:
#             print("episode: {}/{}, score: {}"
#               .format(episode_num.value, config['total_training_episodes'], episode_reward))

#         if logger is not None:
#           logger.append(name + '/EpisodeReward', episode_reward)
#         episode_reward = 0
#         state = env.reset()
#         with episode_num.get_lock():
#           episode_num.value +=  1

#     logqueue.put(logger)

# class EnvironmentRun():
#   def __init__(self, env_func, actor, config, memory = None, name = ""):
#     self.config = deepcopy(config)
#     self.memory = memory
#     self.episode_num = mp.Value(c_uint)
#     self.runcondition = mp.Event()
#     # Interestingly enough, there isn't a good reliable way to know how many states an episode will have
#     # Perhaps we can share a uint to keep track...
#     self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
#     self.logqueue = mp.Queue(maxsize = 1)
#     with self.episode_num.get_lock():
#       self.episode_num.value = 1
#     self.runner = mp.Process(target=envrun,
#       args=(actor, env_func, self.episode_num, config, self.runcondition),
#       kwargs = {'iterations': config['replay_skip'] + 1,
#         'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name})
#     self.runner.start()

#   def run(self):
#     self.runcondition.set()

#   def join(self):
#     self._sync_memory()
#     if self.logwriter is not None:
#       self.logwriter.write(self._get_reward_logger())

#   def sync_memory(self):
#     if self.memory is not None:
#       for i in range(self.config['replay_skip'] + 1):
#         self.memory.append(*self.memory_queue.get())

#   def get_reward_logger(self):
#     return self.logqueue.get()

#   def terminate(self):
#     self.runner.terminate()
@@ -1,9 +1,9 @@
from copy import deepcopy
from ctypes import c_uint
import torch.multiprocessing as mp
from ctypes import *
import rltorch.log

def envrun(actor, env, episode_num, config, runcondition, iterations = 1, memoryqueue = None, logqueue = None, name = ""):
def envrun(actor, env, episode_num, config, runcondition, iterations=1, memoryqueue=None, logqueue=None, name=""):
    state = env.reset()
    episode_reward = 0
    # Wait for signal to start running through the environment
@@ -1,7 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
from .Network import Network
from copy import deepcopy


# [TODO] Should we torch.no_grad the __call__?
# What if we want to sometimes do gradient descent as well?
@@ -38,7 +39,7 @@ class ESNetwork(Network):
    name
      For use in logger to differentiate in analysis.
    """
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""):
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
        super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name)
        self.population_size = population_size
        self.fitness = fitness_fn
@@ -64,7 +65,11 @@ class ESNetwork(Network):
        white_noise_dict = {}
        noise_dict = {}
        for key in model_dict.keys():
            white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
            white_noise_dict[key] = torch.randn(
                self.population_size,
                *model_dict[key].shape,
                device=self.device
            )
            noise_dict[key] = self.sigma * white_noise_dict[key]
        return white_noise_dict, noise_dict

@@ -96,7 +101,10 @@ class ESNetwork(Network):
        candidate_solutions = self._generate_candidate_solutions(noise_dict)

        ## Calculate fitness then mean shift, scale
        fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions], device = self.device)
        fitness_values = torch.tensor(
            [self.fitness(x, *args) for x in candidate_solutions],
            device=self.device
        )
        if self.logger is not None:
            self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
        fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
@@ -1,9 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
from .Network import Network
from copy import deepcopy
import torch.multiprocessing as mp
import functools
from .Network import Network

class fn_copy:
    def __init__(self, fn, args):
@@ -20,14 +19,15 @@ class ESNetworkMP(Network):
    fitness_fun := model, *args -> fitness_value (float)
    We wish to find a model that maximizes the fitness function
    """
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""):
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
        super(ESNetworkMP, self).__init__(model, optimizer, config, device, logger, name)
        self.population_size = population_size
        self.fitness = fitness_fn
        self.sigma = sigma
        assert self.sigma > 0
        mp_ctx = mp.get_context("spawn")
        self.pool = mp_ctx.Pool(processes=2) #[TODO] Probably should make number of processes a config variable
        #[TODO] Probably should make number of processes a config variable
        self.pool = mp_ctx.Pool(processes=2)

    # We're not going to be calculating gradients in the traditional way
    # So there's no need to waste computation time keeping track
@@ -42,7 +42,11 @@ class ESNetworkMP(Network):
        white_noise_dict = {}
        noise_dict = {}
        for key in model_dict.keys():
            white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
            white_noise_dict[key] = torch.randn(
                self.population_size,
                *model_dict[key].shape,
                device=self.device
            )
            noise_dict[key] = self.sigma * white_noise_dict[key]
        return white_noise_dict, noise_dict

@@ -67,7 +71,10 @@ class ESNetworkMP(Network):
        candidate_solutions = self._generate_candidate_solutions(noise_dict)

        ## Calculate fitness then mean shift, scale
        fitness_values = torch.tensor(list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)), device = self.device)
        fitness_values = torch.tensor(
            list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)),
            device=self.device
        )

        if self.logger is not None:
            self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
@@ -17,12 +17,16 @@ class Network:
    name
      For use in logger to differentiate in analysis.
    """
    def __init__(self, model, optimizer, config, device = None, logger = None, name = ""):
    def __init__(self, model, optimizer, config, device=None, logger=None, name=""):
        self.model = model
        if 'weight_decay' in config:
            self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay'])
            self.optimizer = optimizer(
                model.parameters(),
                lr=config['learning_rate'],
                weight_decay=config['weight_decay']
            )
        else:
            self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'])
            self.optimizer = optimizer(model.parameters(), lr=config['learning_rate'])
        self.logger = logger
        self.name = name
        self.device = device
@@ -32,7 +36,7 @@ class Network:
    def __call__(self, *args):
        return self.model(*args)

    def clamp_gradients(self, x = 1):
    def clamp_gradients(self, x=1):
        """
        Forcing gradients to stay within a certain interval
        by setting it to the bound if it goes over it.
@@ -1,7 +1,8 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


# This class utilizes this property of the normal distribution
# N(mu, sigma) = mu + sigma * N(0, 1)
@@ -10,7 +11,6 @@ class NoisyLinear(nn.Linear):
    Draws the parameters of nn.Linear from a normal distribution.
    The parameters of the normal distribution are registered as
    learnable parameters in the neural network.

    Parameters
    ----------
    in_features
@@ -24,8 +24,8 @@ class NoisyLinear(nn.Linear):
       learn an additive bias.
       Default: True
    """
  def __init__(self, in_features, out_features, sigma_init = 0.017, bias = True):
    super(NoisyLinear, self).__init__(in_features, out_features, bias = bias)
    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
        # One of the parameters the network is going to tune is the
        # standard deviation of the gaussian noise on the weights
        self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init))
@@ -11,7 +11,7 @@ class TargetNetwork:
    device
      The device to put the cloned parameters in.
    """
    def __init__(self, network, device = None):
    def __init__(self, network, device=None):
        self.model = network.model
        self.target_model = deepcopy(network.model)
        if device is not None:
@@ -37,7 +37,8 @@ class TargetNetwork:
        Parameters
        ----------
        tau : number
          A number between 0-1 which indicates the proportion of the originator and clone in the new clone.
          A number between 0-1 which indicates
          the proportion of the originator and clone in the new clone.
        """
        assert isinstance(tau, float)
        assert 0.0 < tau <= 1.0
@@ -1,5 +1,5 @@
from .ESNetwork import *
from .ESNetworkMP import *
from .Network import *
from .NoisyLinear import *
from .TargetNetwork import *
from .ESNetwork import ESNetwork
from .ESNetworkMP import ESNetworkMP
from .Network import Network
from .NoisyLinear import NoisyLinear
from .TargetNetwork import TargetNetwork
@@ -36,4 +36,3 @@ class ExponentialScheduler(Scheduler):
            return self.initial_value * (self.base ** (self.current_iteration - 1))
        else:
            return self.end_value
@@ -7,4 +7,4 @@ class Scheduler():
    def __iter__(self):
        return self
    def __next__(self):
        raise NotImplementedError("Scheduler does not have it's function to create a value implemented")
        raise NotImplementedError("__next__ not implemented in Scheduler")