Initial Commit

Brandon Rozek 2019-01-31 23:34:32 -05:00
commit a03abe2bb1
27 changed files with 906 additions and 0 deletions

4
.gitignore vendored Normal file

@@ -0,0 +1,4 @@
__pycache__/
*.py[cod]
rlenv/
runs/

37
Readme.md Normal file

@@ -0,0 +1,37 @@
# rltorch
A reinforcement learning framework, built primarily as a learning exercise and as a way to clean up my personal scripts.
## Installation
From GitHub
```
pip install git+https://github.com/brandon-rozek/rltorch
```
## Components
### Config
A dictionary that is shared across the different components; it contains hyperparameters and other configuration values.
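As a rough illustration, here are a few of the entries used by `examples/acrobot.py` in this repository:
```python
config = {}
config['environment_name'] = 'Acrobot-v1'
config['memory_size'] = 2000        # replay memory capacity
config['batch_size'] = 32
config['learning_rate'] = 1e-3
config['discount_rate'] = 0.99
config['target_sync_tau'] = 1e-1    # blend factor for partial target network syncs
```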
### Environment
This component needs to support the standard OpenAI Gym functions `reset` and `step`.
### Logger
For Tensorboard support, define a Logger and (optionally) pass it into the network, runner, and agent/trainer.
Because of issues with multiprocessing, the Logger is a shared dictionary of lists that components append to, while the LogWriter writes the collected values out on the main thread.
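A minimal sketch of that setup, mirroring `examples/acrobot.py`:
```python
from tensorboardX import SummaryWriter
import rltorch

logger = rltorch.log.Logger()                         # shared dictionary of lists
logwriter = rltorch.log.LogWriter(logger, SummaryWriter())

# ... components append values to `logger` while training ...

logwriter.write()   # flush the accumulated values to Tensorboard on the main thread
logwriter.close()
```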
### Network
A Network wraps a PyTorch `nn.Module` together with a PyTorch optimizer, the configuration, and an optional logger.
### Target Network
Takes in a Network and maintains a copy of its model, with methods to fully or partially sync the copy with the original.
### Action Selector
Typically takes in a network, which it then uses to decide which actions to take.
For example, the ArgMaxSelector chooses the action with the highest entry in the network's output vector.
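A short sketch, assuming a `net` (see Network above), the environment's `action_size`, and an observation `state` are already available:
```python
from rltorch.action_selector import ArgMaxSelector

actor = ArgMaxSelector(net, action_size)
action = actor.act(state)   # argmax of net(state); falls back to a random action when all values are equal
```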
### Memory
Stores experiences gathered while simulating the environment, for use in later training.
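For example, with the bundled `ReplayMemory` (assuming `config` and a transition are already in scope):
```python
import rltorch.memory as M

memory = M.ReplayMemory(capacity=config['memory_size'])
memory.append(state, action, reward, next_state, done)   # store one transition
minibatch = memory.sample(config['batch_size'])           # uniform random sample for training
```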
### Agents
Takes in a network and a memory and performs some form of training on it; the DQNAgent, for example, trains on minibatches sampled from the replay memory.
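Putting the components together, a trimmed-down sketch of `examples/acrobot.py` (assuming a `Value` model, a `config` dictionary, and `state_size`/`action_size` are already defined) looks roughly like this:
```python
import gym
import torch
import rltorch
import rltorch.network as rn
import rltorch.memory as M
import rltorch.env as E
from rltorch.action_selector import ArgMaxSelector

env = E.TorchWrap(gym.make(config['environment_name']))

net = rn.Network(Value(state_size, action_size), torch.optim.Adam, config, name="DQN")
target_net = rn.TargetNetwork(net)
actor = ArgMaxSelector(net, action_size)
memory = M.ReplayMemory(capacity=config['memory_size'])

runner = rltorch.mp.EnvironmentRun(env, actor, config, memory=memory, name="Training")
agent = rltorch.agents.DQNAgent(net, memory, config, target_net=target_net)

runner.start()
runner.run(config['replay_skip'] + 1)   # gather experience in the environment process
agent.learn()                           # train on a minibatch sampled from the replay memory
runner.join()
```
The full scripts in `examples/` show the complete training loop, evaluation, and Tensorboard logging.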

123
examples/acrobot.py Normal file

@@ -0,0 +1,123 @@
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import rltorch
import rltorch.network as rn
import rltorch.memory as M
import rltorch.env as E
from rltorch.action_selector import ArgMaxSelector
from tensorboardX import SummaryWriter
class Value(nn.Module):
def __init__(self, state_size, action_size):
super(Value, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.fc1 = rn.NoisyLinear(state_size, 64)
self.value_fc = rn.NoisyLinear(64, 64)
self.value = rn.NoisyLinear(64, 1)
self.advantage_fc = rn.NoisyLinear(64, 64)
self.advantage = rn.NoisyLinear(64, action_size)
def forward(self, x):
x = F.relu(self.fc1(x))
state_value = F.relu(self.value_fc(x))
state_value = self.value(state_value)
advantage = F.relu(self.advantage_fc(x))
advantage = self.advantage(advantage)
x = state_value + advantage - advantage.mean()
return x
config = {}
config['seed'] = 901
config['environment_name'] = 'Acrobot-v1'
config['memory_size'] = 2000
config['total_training_episodes'] = 50
config['total_evaluation_episodes'] = 10
config['batch_size'] = 32
config['learning_rate'] = 1e-3
config['target_sync_tau'] = 1e-1
config['weight_decay'] = 1e-5
config['discount_rate'] = 0.99
config['replay_skip'] = 0
# How many episodes between printing out the episode stats
config['print_stat_n_eps'] = 1
config['disable_cuda'] = False
def train(runner, agent, config, logwriter = None):
finished = False
episode_num = 1
while not finished:
runner.run(config['replay_skip'] + 1, printstat = runner.episode_num % config['print_stat_n_eps'] == 0)
agent.learn()
runner.join()
# When the episode number changes, write out the weight histograms
if logwriter is not None and episode_num < runner.episode_num:
episode_num = runner.episode_num
agent.net.log_named_parameters()
if logwriter is not None:
logwriter.write()
finished = runner.episode_num > config['total_training_episodes']
# Setting up the environment
rltorch.set_seed(config['seed'])
print("Setting up environment...", end = " ")
env = E.TorchWrap(gym.make(config['environment_name']))
env.seed(config['seed'])
print("Done.")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Logging
logger = rltorch.log.Logger()
logwriter = rltorch.log.LogWriter(logger, SummaryWriter())
# Setting up the networks
device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
net = rn.Network(Value(state_size, action_size),
torch.optim.Adam, config, logger = logger, name = "DQN")
target_net = rn.TargetNetwork(net)
# Actor takes a net and uses it to produce actions from given states
actor = ArgMaxSelector(net, action_size)
# Memory stores experiences for later training
memory = M.ReplayMemory(capacity = config['memory_size'])
# Runner performs a certain number of steps in the environment
runner = rltorch.mp.EnvironmentRun(env, actor, config, memory = memory, logger = logger, name = "Training")
runner.start()
# Agent is what performs the training
agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
print("Training...")
train(runner, agent, config, logwriter = logwriter)
# For profiling...
# import cProfile
# cProfile.run('train(runner, agent, config, logwriter = logwriter )')
# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
print("Training Finished.")
runner.terminate() # We don't need the extra process anymore
print("Evaluating...")
rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
print("Evaulations Done.")
logwriter.close() # We don't need to write anything out to disk anymore

140
examples/pong.py Normal file

@@ -0,0 +1,140 @@
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import rltorch
import rltorch.network as rn
import rltorch.memory as M
import rltorch.env as E
from rltorch.action_selector import ArgMaxSelector
from tensorboardX import SummaryWriter
class Value(nn.Module):
def __init__(self, state_size, action_size):
super(Value, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
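# With the 4 stacked 80x80 frames produced by the wrappers below,
# the conv stack above outputs a 6x6x64 feature map, hence the 64 * 6 * 6 input size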
self.fc1 = rn.NoisyLinear(64 * 6 * 6, 384)
self.value_fc = rn.NoisyLinear(384, 384)
self.value = rn.NoisyLinear(384, 1)
self.advantage_fc = rn.NoisyLinear(384, 384)
self.advantage = rn.NoisyLinear(384, action_size)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
x = F.relu(self.conv3(x))
# Makes batch_size dimension again
x = x.view(-1, 64 * 6 * 6)
x = F.relu(self.fc1(x))
state_value = F.relu(self.value_fc(x))
state_value = self.value(state_value)
advantage = F.relu(self.advantage_fc(x))
advantage = self.advantage(advantage)
x = state_value + advantage - advantage.mean()
# For debugging purposes...
if torch.isnan(x).any().item():
print("WARNING NAN IN MODEL DETECTED")
return x
config = {}
config['seed'] = 901
config['environment_name'] = 'PongNoFrameskip-v4'
config['memory_size'] = 4000
config['total_training_episodes'] = 50
config['total_evaluation_episodes'] = 10
config['learning_rate'] = 1e-4
config['target_sync_tau'] = 1e-3
config['weight_decay'] = 1e-8
config['discount_rate'] = 0.999
config['replay_skip'] = 4
config['batch_size'] = 32 * (config['replay_skip'] + 1)
# How many episodes between printing out the episode stats
config['print_stat_n_eps'] = 1
config['disable_cuda'] = False
def train(runner, agent, config, logwriter = None):
finished = False
episode_num = 1
while not finished:
runner.run(config['replay_skip'] + 1, printstat = runner.episode_num % config['print_stat_n_eps'] == 0)
agent.learn()
runner.join()
# When the episode number changes, write out the weight histograms
if logwriter is not None and episode_num < runner.episode_num:
episode_num = runner.episode_num
agent.net.log_named_parameters()
if logwriter is not None:
logwriter.write()
finished = runner.episode_num > config['total_training_episodes']
rltorch.set_seed(config['seed'])
print("Setting up environment...", end = " ")
env = E.FrameStack(E.TorchWrap(
E.ProcessFrame(E.FireResetEnv(gym.make(config['environment_name'])),
resize_shape = (80, 80), crop_bounds = [34, 194, 15, 145], grayscale = True))
, 4)
env.seed(config['seed'])
print("Done.")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Logging
logger = rltorch.log.Logger()
logwriter = rltorch.log.LogWriter(logger, SummaryWriter())
# Setting up the networks
device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
net = rn.Network(Value(state_size, action_size),
torch.optim.Adam, config, logger = logger, name = "DQN")
target_net = rn.TargetNetwork(net)
# Actor takes a network and uses it to produce actions from given states
actor = ArgMaxSelector(net, action_size)
# Memory stores experiences for later training
memory = M.ReplayMemory(capacity = config['memory_size'])
# Runner performs a certain number of steps in the environment
runner = rltorch.mp.EnvironmentRun(env, actor, config, memory = memory, logger = logger, name = "Training")
runner.start()
# Agent is what performs the training
agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
print("Training...")
train(runner, agent, config, logwriter = logwriter)
# For profiling...
# import cProfile
# cProfile.run('train(runner, agent, config, logwriter = logwriter )')
# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
print("Training Finished.")
runner.terminate() # We don't need the extra process anymore
print("Evaluating...")
rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
print("Evaulations Done.")
logwriter.close() # We don't need to write anything out to disk anymore

31
requirements.txt Normal file

@@ -0,0 +1,31 @@
absl-py==0.7.0
astor==0.7.1
atari-py==0.1.7
certifi==2018.11.29
chardet==3.0.4
future==0.17.1
gast==0.2.2
grpcio==1.18.0
gym==0.10.11
h5py==2.9.0
idna==2.8
Keras-Applications==1.0.7
Keras-Preprocessing==1.0.8
Markdown==3.0.1
numpy==1.16.0
opencv-python==4.0.0.21
Pillow==5.4.1
pkg-resources==0.0.0
protobuf==3.6.1
pyglet==1.3.2
PyOpenGL==3.1.0
requests==2.21.0
scipy==1.2.0
six==1.12.0
tensorboard==1.12.2
tensorboardX==1.6
tensorflow==1.12.0
termcolor==1.1.0
torch==1.0.0
urllib3==1.24.1
Werkzeug==0.14.1

8
rltorch/__init__.py Normal file

@@ -0,0 +1,8 @@
from . import action_selector
from . import agents
from . import env
from . import memory
from . import network
from . import mp
from .seed import *
from . import log

18
rltorch/action_selector/ArgMaxSelector.py Normal file

@@ -0,0 +1,18 @@
from random import randrange
import torch
class ArgMaxSelector:
def __init__(self, model, action_size, device = None):
self.model = model
self.action_size = action_size
self.device = device
def random_act(self):
return randrange(self.action_size)
def best_act(self, state):
with torch.no_grad():
if self.device is not None:
state = state.to(self.device)
action_values = self.model(state).squeeze(0)
action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item()
return action
def act(self, state):
return self.best_act(state)

14
rltorch/action_selector/EpsilonGreedySelector.py Normal file

@@ -0,0 +1,14 @@
import numpy as np
from .ArgMaxSelector import ArgMaxSelector
class EpsilonGreedySelector(ArgMaxSelector):
def __init__(self, model, action_size, device = None, epsilon = 0.1, epsilon_decay = 1, epsilon_min = 0.1):
super(EpsilonGreedySelector, self).__init__(model, action_size, device = device)
self.epsilon = epsilon
self.epsilon_decay = epsilon_decay
self.epsilon_min = epsilon_min
# random_act is already implemented in ArgMaxSelector
# best_act is already implemented in ArgMaxSelector
def act(self, state):
action = self.random_act() if np.random.rand() < self.epsilon else self.best_act(state)
if self.epsilon > self.epsilon_min:
self.epsilon = self.epsilon * self.epsilon_decay
return action

10
rltorch/action_selector/RandomSelector.py Normal file

@@ -0,0 +1,10 @@
from random import randrange
class RandomSelector():
def __init__(self, action_size):
self.action_size = action_size
def random_act(self):
return randrange(self.action_size)
def best_act(self, state):
return self.random_act()
def act(self, state):
return self.random_act()

3
rltorch/action_selector/__init__.py Normal file

@@ -0,0 +1,3 @@
from .ArgMaxSelector import *
from .EpsilonGreedySelector import *
from .RandomSelector import *

54
rltorch/agents/DQNAgent.py Normal file

@@ -0,0 +1,54 @@
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
class DQNAgent:
def __init__(self, net , memory, config, target_net = None, logger = None):
self.net = net
self.target_net = target_net
self.memory = memory
self.config = deepcopy(config)
self.logger = logger
def learn(self):
if len(self.memory) < self.config['batch_size']:
return
minibatch = self.memory.sample(self.config['batch_size'])
state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)
obtained_values = self.net(state_batch).gather(1, action_batch.view(self.config['batch_size'], 1))
with torch.no_grad():
# Use the target net to produce action values for the next state
# and the regular net to select the action
# That way we decouple the value and action selecting processes (DOUBLE DQN)
not_done_size = not_done_batch.sum()
if self.target_net is not None:
next_state_values = self.target_net(next_state_batch)
next_best_action = self.net(next_state_batch).argmax(1)
else:
next_state_values = self.net(next_state_batch)
next_best_action = next_state_values.argmax(1)
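# Terminal transitions keep a next-state value of zero; only the entries for
# non-terminal transitions are filled in from the (already filtered) next state batch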
best_next_state_value = torch.zeros(self.config['batch_size'])
best_next_state_value[not_done_batch] = next_state_values.gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
loss = F.mse_loss(obtained_values, expected_values)
if self.logger is not None:
self.logger.append("Loss", loss.item())
self.net.zero_grad()
loss.backward()
self.net.clamp_gradients()
self.net.step()
if self.target_net is not None:
if 'target_sync_tau' in self.config:
self.target_net.partial_sync(self.config['target_sync_tau'])
else:
self.target_net.sync()

1
rltorch/agents/__init__.py Normal file

@@ -0,0 +1 @@
from .DQNAgent import *

2
rltorch/env/__init__.py vendored Normal file

@@ -0,0 +1,2 @@
from .wrappers import *
from .simulate import *

21
rltorch/env/simulate.py vendored Normal file

@@ -0,0 +1,21 @@
def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger = None, name = ""):
for episode in range(total_episodes):
state = env.reset()
done = False
episode_reward = 0
while not done:
action = actor.act(state)
next_state, reward, done, _ = env.step(action)
episode_reward = episode_reward + reward
if memory is not None:
memory.append(state, action, reward, next_state, done)
state = next_state
if episode % config['print_stat_n_eps'] == 0:
print("episode: {}/{}, score: {}"
.format(episode, total_episodes, episode_reward))
if logger is not None:
logger.append(name + '/EpisodeReward', episode_reward)

129
rltorch/env/wrappers.py vendored Normal file

@@ -0,0 +1,129 @@
import gym
import torch
from gym import spaces
import cv2
from collections import deque
# Mostly derived from OpenAI baselines
class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset for environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
self._out = None
def _force(self):
if self._out is None:
self._out = torch.stack(self._frames)
self._frames = None
return self._out
def __array__(self, dtype=None):
out = self._force()
if dtype is not None:
out = out.astype(dtype)
return out
def __len__(self):
return len(self._force())
def __getitem__(self, i):
return self._force()[i]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
# return LazyFrames(list(self.frames))
return torch.cat(list(self.frames)).unsqueeze(0)
class ProcessFrame(gym.Wrapper):
def __init__(self, env, resize_shape = None, crop_bounds = None, grayscale = False):
gym.Wrapper.__init__(self, env)
self.resize_shape = resize_shape
self.crop_bounds = crop_bounds
self.grayscale = grayscale
def reset(self):
return self._preprocess(self.env.reset())
def step(self, action):
next_state, reward, done, info = self.env.step(action)
next_state = self._preprocess(next_state)
return next_state, reward, done, info
def _preprocess(self, frame):
if self.grayscale:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
if self.crop_bounds is not None and len(self.crop_bounds) == 4:
frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]]
if self.resize_shape is not None and len(self.resize_shape) == 2:
frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA)
# Normalize
frame = frame / 255
return frame
# Turns observations into torch tensors
# Adds an additional dimension that's supposed to represent the batch dimension
class TorchWrap(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
def reset(self):
return self._convert(self.env.reset())
def step(self, action):
next_state, reward, done, info = self.env.step(action)
next_state = self._convert(next_state)
return next_state, reward, done, info
def _convert(self, frame):
frame = torch.from_numpy(frame).unsqueeze(0).float()
return frame

44
rltorch/log.py Normal file

@@ -0,0 +1,44 @@
from collections import Counter
import numpy as np
import torch
class Logger:
def __init__(self):
self.log = {}
def append(self, tag, value):
if tag not in self.log.keys():
self.log[tag] = []
self.log[tag].append(value)
def keys(self):
return self.log.keys()
def __len__(self):
return len(self.log)
def __iter__(self):
return iter(self.log)
def __contains__(self, value):
return value in self.log
def __getitem__(self, index):
return self.log[index]
def __setitem__(self, index, value):
self.log[index] = value
def __reversed__(self):
return reversed(self.log)
# Workaround since we can't use SummaryWriter in a different process
class LogWriter:
def __init__(self, logger, writer):
self.logger = logger
self.writer = writer
self.steps = Counter()
def write(self):
for key in self.logger.keys():
for value in self.logger[key]:
self.steps[key] += 1
if isinstance(value, int) or isinstance(value, float):
self.writer.add_scalar(key, value, self.steps[key])
if isinstance(value, np.ndarray) or isinstance(value, torch.Tensor):
self.writer.add_histogram(key, value, self.steps[key])
self.logger.log = {}
def close(self):
self.writer.close()

55
rltorch/memory/ReplayMemory.py Normal file

@@ -0,0 +1,55 @@
from random import sample
from collections import namedtuple
import torch
Transition = namedtuple('Transition',
('state', 'action', 'reward', 'next_state', 'done'))
# Implements a Ring Buffer
class ReplayMemory(object):
def __init__(self, capacity):
self.capacity = capacity
self.memory = []
self.position = 0
def append(self, *args):
"""Saves a transition."""
if len(self.memory) < self.capacity:
self.memory.append(None)
self.memory[self.position] = Transition(*args)
self.position = (self.position + 1) % self.capacity
def clear(self):
self.memory.clear()
self.position = 0
def sample(self, batch_size):
return sample(self.memory, batch_size)
def __len__(self):
return len(self.memory)
def __iter__(self):
return iter(self.memory)
def __contains__(self, value):
return value in self.memory
def __getitem__(self, index):
return self.memory[index]
def __setitem__(self, index, value):
self.memory[index] = value
def __reversed__(self):
return reversed(self.memory)
def zip_batch(minibatch):
state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)
state_batch = torch.cat(state_batch)
action_batch = torch.tensor(action_batch)
reward_batch = torch.tensor(reward_batch)
not_done_batch = ~torch.tensor(done_batch)
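# Keep only the next states of non-terminal transitions; terminal transitions have no successor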
next_state_batch = torch.cat(next_state_batch)[not_done_batch]
return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch

1
rltorch/memory/__init__.py Normal file

@@ -0,0 +1 @@
from .ReplayMemory import *

35
rltorch/mp/EnvironmentEpisode.py Normal file

@@ -0,0 +1,35 @@
from copy import deepcopy
import torch.multiprocessing as mp
class EnvironmentEpisode(mp.Process):
def __init__(self, env, actor, config, memory = None, logger = None, name = ""):
super(EnvironmentEpisode, self).__init__()
self.env = env
self.actor = actor
self.memory = memory
self.config = deepcopy(config)
self.logger = logger
self.name = name
self.episode_num = 1
def run(self, printstat = False):
state = self.env.reset()
done = False
episode_reward = 0
while not done:
action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action)
episode_reward = episode_reward + reward
if self.memory is not None:
self.memory.append(state, action, reward, next_state, done)
state = next_state
if printstat:
print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], episode_reward))
if self.logger is not None:
self.logger.append(self.name + '/EpisodeReward', episode_reward)
self.episode_num += 1

39
rltorch/mp/EnvironmentRun.py Normal file

@@ -0,0 +1,39 @@
from copy import deepcopy
import torch.multiprocessing as mp
class EnvironmentRun(mp.Process):
def __init__(self, env, actor, config, memory = None, logger = None, name = ""):
super(EnvironmentRun, self).__init__()
self.env = env
self.actor = actor
self.memory = memory
self.config = deepcopy(config)
self.logger = logger
self.name = name
self.episode_num = 1
self.episode_reward = 0
self.last_state = env.reset()
def run(self, iterations = 1, printstat = False):
state = self.last_state
for _ in range(iterations):
action = self.actor.act(state)
next_state, reward, done, _ = self.env.step(action)
self.episode_reward = self.episode_reward + reward
if self.memory is not None:
self.memory.append(state, action, reward, next_state, done)
state = next_state
if done:
if printstat:
print("episode: {}/{}, score: {}"
.format(self.episode_num, self.config['total_training_episodes'], self.episode_reward))
if self.logger is not None:
self.logger.append(self.name + '/EpisodeReward', self.episode_reward)
self.episode_num = self.episode_num + 1
self.episode_reward = 0
state = self.env.reset()
self.last_state = state

2
rltorch/mp/__init__.py Normal file

@@ -0,0 +1,2 @@
from .EnvironmentEpisode import *
from .EnvironmentRun import *

29
rltorch/network/Network.py Normal file

@@ -0,0 +1,29 @@
class Network:
"""
Wrapper around a model and its optimizer which provides convenience methods for gradient clamping, optimizer steps, and parameter logging
"""
def __init__(self, model, optimizer, config, logger = None, name = ""):
self.model = model
self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay'])
self.logger = logger
self.name = name
def __call__(self, *args):
return self.model(*args)
def clamp_gradients(self):
for param in self.model.parameters():
param.grad.data.clamp_(-1, 1)
def zero_grad(self):
self.model.zero_grad()
def step(self):
self.optimizer.step()
def log_named_parameters(self):
if self.logger is not None:
for name, param in self.model.named_parameters():
self.logger.append(self.name + "/" + name, param.cpu().detach().numpy())

44
rltorch/network/NoisyLinear.py Normal file

@@ -0,0 +1,44 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
# This class utilizes this property of the normal distribution
# N(mu, sigma) = mu + sigma * N(0, 1)
class NoisyLinear(nn.Linear):
def __init__(self, in_features, out_features, sigma_init = 0.017, bias = True):
super(NoisyLinear, self).__init__(in_features, out_features, bias = bias)
# One of the parameters the network is going to tune is the
# standard deviation of the gaussian noise on the weights
self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init))
# Reserve space for N(0, 1) of weights in the forward() call
self.register_buffer("s_normal_weight", torch.zeros(out_features, in_features))
if bias:
# If a bias exists, then we manipulate the standard deviation of the
# Gaussian noise on them as well
self.sigma_bias = nn.Parameter(torch.Tensor(out_features).fill_(sigma_init))
# Reserve space for N(0, 1) of bias in the forward() call
self.register_buffer("s_normal_bias", torch.zeros(out_features))
self.reset_parameters()
def reset_parameters(self):
std = math.sqrt(3 / self.in_features)
nn.init.uniform_(self.weight, -std, std)
nn.init.uniform_(self.bias, -std, std)
def forward(self, x):
# Fill s_normal_weight with values from the standard normal distribution
torch.randn(self.s_normal_weight.size(), out = self.s_normal_weight,
dtype = self.s_normal_weight.dtype, layout = self.s_normal_weight.layout, device = self.s_normal_weight.device)
# Multiply by the standard deviation to correct the spread of Gaussian noise
weight_noise = self.sigma_weight * self.s_normal_weight.clone().requires_grad_()
bias = None
if self.bias is not None:
# Fill s_normal_bias with values from standard normal
torch.randn(self.s_normal_bias.size(), out = self.s_normal_bias,
dtype = self.s_normal_bias.dtype, layout = self.s_normal_bias.layout, device = self.s_normal_bias.device)
# Add Gaussian noise to the original bias
bias = self.bias + self.sigma_bias * self.s_normal_bias.clone().requires_grad_()
return F.linear(x, self.weight + weight_noise, bias)

28
rltorch/network/TargetNetwork.py Normal file

@@ -0,0 +1,28 @@
from copy import deepcopy
# Derived from ptan library
class TargetNetwork:
"""
Wrapper around model which provides copy of it instead of trained weights
"""
def __init__(self, network):
self.model = network.model
self.target_model = deepcopy(network.model)
def __call__(self, *args):
return self.model(*args)
def sync(self):
self.target_model.load_state_dict(self.model.state_dict())
def partial_sync(self, tau):
"""
Blend params of target net with params from the model
:param tau: fraction of the model's parameters to blend into the target network (0 < tau <= 1)
"""
assert isinstance(tau, float)
assert 0.0 < tau <= 1.0
model_state = self.model.state_dict()
target_state = self.target_model.state_dict()
for grad_index, grad in model_state.items():
target_state[grad_index].copy_((1 - tau) * target_state[grad_index] + tau * grad)
self.target_model.load_state_dict(target_state)

3
rltorch/network/__init__.py Normal file

@@ -0,0 +1,3 @@
from .Network import *
from .NoisyLinear import *
from .TargetNetwork import *

16
rltorch/seed.py Normal file

@@ -0,0 +1,16 @@
from os import environ
import numpy as np
import random
import torch
def set_seed(SEED):
# Set `PYTHONHASHSEED` environment variable at a fixed value
environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)
# Pytorch
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

15
setup.py Normal file

@@ -0,0 +1,15 @@
"""
rltorch stands for Reinforcement Learning Torch, an RL library built on top of PyTorch
"""
import setuptools
setuptools.setup(
name="rltorch",
author="Brandon Rozek",
author_email="rozekbrandon@gmail.com",
license='MIT',
description="Reinforcement Learning Framework for PyTorch",
version="0.1",
packages=setuptools.find_packages(),
)