commit a03abe2bb18da3c59503177b9ca857d2fb9fb0c9
Author: Brandon Rozek
Date:   Thu Jan 31 23:34:32 2019 -0500

    Initial Commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1f091d4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+*.py[cod]
+rlenv/
+runs/
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..da5dc4b
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,37 @@
+# rltorch
+A reinforcement learning framework built primarily for learning and for cleaning up personal experiment scripts.
+
+## Installation
+From GitHub
+```
+pip install git+https://github.com/brandon-rozek/rltorch
+```
+
+## Components
+### Config
+A dictionary that is shared between the different components. It contains hyperparameters and other configuration values.
+
+### Environment
+This component needs to support the standard OpenAI Gym functions `reset` and `step`.
+
+### Logger
+For TensorBoard to work, you need to define a logger that is (optionally) passed to the network, runner, and agent/trainer.
+
+Due to issues with multiprocessing, the Logger is a shared dictionary of lists that the components append to, while the LogWriter writes the entries out on the main thread.
+
+### Network
+A network wraps a PyTorch nn.Module together with a PyTorch optimizer, the configuration, and an optional logger.
+
+### Target Network
+Takes in a network and provides methods to sync a copy of the original network.
+
+### Action Selector
+Typically takes in a network, which it then uses to help decide which actions to take.
+
+For example, the ArgMaxSelector chooses the action that produces the highest entry in the output vector of the network.
+
+### Memory
+Stores experiences during simulations of the environment. Useful for later training.
+
+### Agents
+Takes in a network and performs some form of training on it.
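To see how these components fit together, here is a condensed wiring sketch modeled on examples/acrobot.py from this commit. The QNetwork module and the hyperparameter values are illustrative stand-ins, not part of the library:

```
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import rltorch
import rltorch.network as rn
import rltorch.memory as M
import rltorch.env as E
from rltorch.action_selector import ArgMaxSelector

# Hyperparameters mirror examples/acrobot.py; the exact values are illustrative.
config = {
    'seed': 901, 'environment_name': 'Acrobot-v1', 'memory_size': 2000,
    'total_training_episodes': 50, 'batch_size': 32, 'learning_rate': 1e-3,
    'target_sync_tau': 1e-1, 'weight_decay': 1e-5, 'discount_rate': 0.99,
    'replay_skip': 0, 'print_stat_n_eps': 1,
}

class QNetwork(nn.Module):
    # A small fully-connected Q-network used only for this sketch.
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, action_size)
    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))

rltorch.set_seed(config['seed'])
env = E.TorchWrap(gym.make(config['environment_name']))   # Environment
env.seed(config['seed'])

net = rn.Network(QNetwork(env.observation_space.shape[0], env.action_space.n),
                 torch.optim.Adam, config, name = "DQN")  # Network
target_net = rn.TargetNetwork(net)                        # Target Network
actor = ArgMaxSelector(net, env.action_space.n)           # Action Selector
memory = M.ReplayMemory(capacity = config['memory_size']) # Memory
runner = rltorch.mp.EnvironmentRun(env, actor, config, memory = memory, name = "Training")
agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net)  # Agent

runner.start()
runner.run(config['replay_skip'] + 1)  # collect a few transitions into memory
agent.learn()                          # one training step sampled from memory
runner.join()
runner.terminate()
```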
\ No newline at end of file diff --git a/examples/acrobot.py b/examples/acrobot.py new file mode 100644 index 0000000..c3c337d --- /dev/null +++ b/examples/acrobot.py @@ -0,0 +1,123 @@ +import gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import rltorch +import rltorch.network as rn +import rltorch.memory as M +import rltorch.env as E +from rltorch.action_selector import ArgMaxSelector +from tensorboardX import SummaryWriter + + +class Value(nn.Module): + def __init__(self, state_size, action_size): + super(Value, self).__init__() + self.state_size = state_size + self.action_size = action_size + + self.fc1 = rn.NoisyLinear(state_size, 64) + + self.value_fc = rn.NoisyLinear(64, 64) + self.value = rn.NoisyLinear(64, 1) + + self.advantage_fc = rn.NoisyLinear(64, 64) + self.advantage = rn.NoisyLinear(64, action_size) + + + def forward(self, x): + x = F.relu(self.fc1(x)) + + state_value = F.relu(self.value_fc(x)) + state_value = self.value(state_value) + + advantage = F.relu(self.advantage_fc(x)) + advantage = self.advantage(advantage) + + x = state_value + advantage - advantage.mean() + + return x + + +config = {} +config['seed'] = 901 +config['environment_name'] = 'Acrobot-v1' +config['memory_size'] = 2000 +config['total_training_episodes'] = 50 +config['total_evaluation_episodes'] = 10 +config['batch_size'] = 32 +config['learning_rate'] = 1e-3 +config['target_sync_tau'] = 1e-1 +config['weight_decay'] = 1e-5 +config['discount_rate'] = 0.99 +config['replay_skip'] = 0 +# How many episodes between printing out the episode stats +config['print_stat_n_eps'] = 1 +config['disable_cuda'] = False + +def train(runner, agent, config, logwriter = None): + finished = False + episode_num = 1 + while not finished: + runner.run(config['replay_skip'] + 1, printstat = runner.episode_num % config['print_stat_n_eps'] == 0) + agent.learn() + runner.join() + # When the episode number changes, write out the weight histograms + if logwriter is not None and episode_num < runner.episode_num: + episode_num = runner.episode_num + agent.net.log_named_parameters() + + if logwriter is not None: + logwriter.write() + finished = runner.episode_num > config['total_training_episodes'] + + +# Setting up the environment +rltorch.set_seed(config['seed']) +print("Setting up environment...", end = " ") +env = E.TorchWrap(gym.make(config['environment_name'])) +env.seed(config['seed']) +print("Done.") + +state_size = env.observation_space.shape[0] +action_size = env.action_space.n + +# Logging +logger = rltorch.log.Logger() +logwriter = rltorch.log.LogWriter(logger, SummaryWriter()) + +# Setting up the networks +device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") +net = rn.Network(Value(state_size, action_size), + torch.optim.Adam, config, logger = logger, name = "DQN") +target_net = rn.TargetNetwork(net) + +# Actor takes a net and uses it to produce actions from given states +actor = ArgMaxSelector(net, action_size) +# Memory stores experiences for later training +memory = M.ReplayMemory(capacity = config['memory_size']) + +# Runner performs a certain number of steps in the environment +runner = rltorch.mp.EnvironmentRun(env, actor, config, memory = memory, logger = logger, name = "Training") +runner.start() + +# Agent is what performs the training +agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger) + +print("Training...") +train(runner, agent, config, logwriter = logwriter) + +# For 
profiling...
+# import cProfile
+# cProfile.run('train(runner, agent, config, logwriter = logwriter )')
+# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
+
+print("Training Finished.")
+runner.terminate() # We don't need the extra process anymore
+
+print("Evaluating...")
+rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+print("Evaluations Done.")
+
+logwriter.close() # We don't need to write anything out to disk anymore
\ No newline at end of file
diff --git a/examples/pong.py b/examples/pong.py
new file mode 100644
index 0000000..074b9b1
--- /dev/null
+++ b/examples/pong.py
@@ -0,0 +1,140 @@
+import gym
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import rltorch
+import rltorch.network as rn
+import rltorch.memory as M
+import rltorch.env as E
+from rltorch.action_selector import ArgMaxSelector
+from tensorboardX import SummaryWriter
+
+class Value(nn.Module):
+    def __init__(self, state_size, action_size):
+        super(Value, self).__init__()
+        self.state_size = state_size
+        self.action_size = action_size
+
+        self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4))
+        self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2))
+        self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1))
+
+        self.fc1 = rn.NoisyLinear(64 * 6 * 6, 384)
+
+        self.value_fc = rn.NoisyLinear(384, 384)
+        self.value = rn.NoisyLinear(384, 1)
+
+        self.advantage_fc = rn.NoisyLinear(384, 384)
+        self.advantage = rn.NoisyLinear(384, action_size)
+
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.relu(self.conv2(x))
+        x = F.relu(self.conv3(x))
+
+        # Flatten to (batch_size, 64 * 6 * 6)
+        x = x.view(-1, 64 * 6 * 6)
+        x = F.relu(self.fc1(x))
+
+        state_value = F.relu(self.value_fc(x))
+        state_value = self.value(state_value)
+
+        advantage = F.relu(self.advantage_fc(x))
+        advantage = self.advantage(advantage)
+
+        x = state_value + advantage - advantage.mean()
+
+        # For debugging purposes...
+ if torch.isnan(x).any().item(): + print("WARNING NAN IN MODEL DETECTED") + + return x + + + + +config = {} +config['seed'] = 901 +config['environment_name'] = 'PongNoFrameskip-v4' +config['memory_size'] = 4000 +config['total_training_episodes'] = 50 +config['total_evaluation_episodes'] = 10 +config['learning_rate'] = 1e-4 +config['target_sync_tau'] = 1e-3 +config['weight_decay'] = 1e-8 +config['discount_rate'] = 0.999 +config['replay_skip'] = 4 +config['batch_size'] = 32 * (config['replay_skip'] + 1) +# How many episodes between printing out the episode stats +config['print_stat_n_eps'] = 1 +config['disable_cuda'] = False + +def train(runner, agent, config, logwriter = None): + finished = False + episode_num = 1 + while not finished: + runner.run(config['replay_skip'] + 1, printstat = runner.episode_num % config['print_stat_n_eps'] == 0) + agent.learn() + runner.join() + # When the episode number changes, write out the weight histograms + if logwriter is not None and episode_num < runner.episode_num: + episode_num = runner.episode_num + agent.net.log_named_parameters() + + if logwriter is not None: + logwriter.write() + finished = runner.episode_num > config['total_training_episodes'] + + +rltorch.set_seed(config['seed']) +print("Setting up environment...", end = " ") +env = E.FrameStack(E.TorchWrap( + E.ProcessFrame(E.FireResetEnv(gym.make(config['environment_name'])), + resize_shape = (80, 80), crop_bounds = [34, 194, 15, 145], grayscale = True)) +, 4) +env.seed(config['seed']) +print("Done.") + +state_size = env.observation_space.shape[0] +action_size = env.action_space.n + +# Logging +logger = rltorch.log.Logger() +logwriter = rltorch.log.LogWriter(logger, SummaryWriter()) + +# Setting up the networks +device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") +net = rn.Network(Value(state_size, action_size), + torch.optim.Adam, config, logger = logger, name = "DQN") +target_net = rn.TargetNetwork(net) + +# Actor takes a network and uses it to produce actions from given states +actor = ArgMaxSelector(net, action_size) +# Memory stores experiences for later training +memory = M.ReplayMemory(capacity = config['memory_size']) + +# Runner performs a certain number of steps in the environment +runner = rltorch.mp.EnvironmentRun(env, actor, config, memory = memory, logger = logger, name = "Training") +runner.start() + +# Agent is what performs the training +agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger) + +print("Training...") +train(runner, agent, config, logwriter = logwriter) + +# For profiling... +# import cProfile +# cProfile.run('train(runner, agent, config, logwriter = logwriter )') +# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... 
+
+print("Training Finished.")
+runner.terminate() # We don't need the extra process anymore
+
+print("Evaluating...")
+rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+print("Evaluations Done.")
+
+logwriter.close() # We don't need to write anything out to disk anymore
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fb01538
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,31 @@
+absl-py==0.7.0
+astor==0.7.1
+atari-py==0.1.7
+certifi==2018.11.29
+chardet==3.0.4
+future==0.17.1
+gast==0.2.2
+grpcio==1.18.0
+gym==0.10.11
+h5py==2.9.0
+idna==2.8
+Keras-Applications==1.0.7
+Keras-Preprocessing==1.0.8
+Markdown==3.0.1
+numpy==1.16.0
+opencv-python==4.0.0.21
+Pillow==5.4.1
+pkg-resources==0.0.0
+protobuf==3.6.1
+pyglet==1.3.2
+PyOpenGL==3.1.0
+requests==2.21.0
+scipy==1.2.0
+six==1.12.0
+tensorboard==1.12.2
+tensorboardX==1.6
+tensorflow==1.12.0
+termcolor==1.1.0
+torch==1.0.0
+urllib3==1.24.1
+Werkzeug==0.14.1
\ No newline at end of file
diff --git a/rltorch/__init__.py b/rltorch/__init__.py
new file mode 100644
index 0000000..d2e8f73
--- /dev/null
+++ b/rltorch/__init__.py
@@ -0,0 +1,8 @@
+from . import action_selector
+from . import agents
+from . import env
+from . import memory
+from . import network
+from . import mp
+from .seed import *
+from . import log
\ No newline at end of file
diff --git a/rltorch/action_selector/ArgMaxSelector.py b/rltorch/action_selector/ArgMaxSelector.py
new file mode 100644
index 0000000..2b7b2a1
--- /dev/null
+++ b/rltorch/action_selector/ArgMaxSelector.py
@@ -0,0 +1,18 @@
+from random import randrange
+import torch
+class ArgMaxSelector:
+    def __init__(self, model, action_size, device = None):
+        self.model = model
+        self.action_size = action_size
+        self.device = device
+    def random_act(self):
+        return randrange(self.action_size)
+    def best_act(self, state):
+        with torch.no_grad():
+            if self.device is not None:
+                state = state.to(self.device)
+            action_values = self.model(state).squeeze(0)
+            action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item()
+        return action
+    def act(self, state):
+        return self.best_act(state)
\ No newline at end of file
diff --git a/rltorch/action_selector/EpsilonGreedySelector.py b/rltorch/action_selector/EpsilonGreedySelector.py
new file mode 100644
index 0000000..7bb04c1
--- /dev/null
+++ b/rltorch/action_selector/EpsilonGreedySelector.py
@@ -0,0 +1,15 @@
+import numpy as np
+from .ArgMaxSelector import ArgMaxSelector
+class EpsilonGreedySelector(ArgMaxSelector):
+    def __init__(self, model, action_size, device = None, epsilon = 0.1, epsilon_decay = 1, epsilon_min = 0.1):
+        super(EpsilonGreedySelector, self).__init__(model, action_size, device = device)
+        self.epsilon = epsilon
+        self.epsilon_decay = epsilon_decay
+        self.epsilon_min = epsilon_min
+    # random_act is already implemented in ArgMaxSelector
+    # best_act is already implemented in ArgMaxSelector
+    def act(self, state):
+        action = self.random_act() if np.random.rand() < self.epsilon else self.best_act(state)
+        if self.epsilon > self.epsilon_min:
+            self.epsilon = self.epsilon * self.epsilon_decay
+        return action
\ No newline at end of file
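All of the selectors expose the same act(state) interface, so they can be swapped without touching the agent. Below is a small sketch of how the epsilon schedule above behaves; the nn.Linear stand-in model and the decay values are illustrative and not part of the commit:

```
import torch
import torch.nn as nn
from rltorch.action_selector import EpsilonGreedySelector

model = nn.Linear(4, 2)  # any module mapping a state batch to action values works here
selector = EpsilonGreedySelector(model, action_size = 2,
                                 epsilon = 1.0, epsilon_decay = 0.99, epsilon_min = 0.1)

state = torch.rand(1, 4)
for _ in range(5):
    action = selector.act(state)      # random with probability epsilon, argmax otherwise
    print(action, selector.epsilon)   # epsilon shrinks by a factor of 0.99 per call until it reaches 0.1
```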
diff --git a/rltorch/action_selector/RandomSelector.py b/rltorch/action_selector/RandomSelector.py
new file mode 100644
index 0000000..441c512
--- /dev/null
+++ b/rltorch/action_selector/RandomSelector.py
@@ -0,0 +1,10 @@
+from random import randrange
+class RandomSelector():
+    def __init__(self, action_size):
+        self.action_size = action_size
+    def random_act(self):
+        return randrange(self.action_size)
+    def best_act(self, state):
+        return self.random_act()
+    def act(self, state):
+        return self.random_act()
diff --git a/rltorch/action_selector/__init__.py b/rltorch/action_selector/__init__.py
new file mode 100644
index 0000000..3c24389
--- /dev/null
+++ b/rltorch/action_selector/__init__.py
@@ -0,0 +1,3 @@
+from .ArgMaxSelector import *
+from .EpsilonGreedySelector import *
+from .RandomSelector import *
\ No newline at end of file
diff --git a/rltorch/agents/DQNAgent.py b/rltorch/agents/DQNAgent.py
new file mode 100644
index 0000000..cd57913
--- /dev/null
+++ b/rltorch/agents/DQNAgent.py
@@ -0,0 +1,54 @@
+import rltorch.memory as M
+import torch
+import torch.nn.functional as F
+from copy import deepcopy
+
+class DQNAgent:
+    def __init__(self, net, memory, config, target_net = None, logger = None):
+        self.net = net
+        self.target_net = target_net
+        self.memory = memory
+        self.config = deepcopy(config)
+        self.logger = logger
+
+    def learn(self):
+        if len(self.memory) < self.config['batch_size']:
+            return
+
+        minibatch = self.memory.sample(self.config['batch_size'])
+        state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)
+
+        obtained_values = self.net(state_batch).gather(1, action_batch.view(self.config['batch_size'], 1))
+
+        with torch.no_grad():
+            # Use the target net to produce action values for the next state
+            # and the regular net to select the action
+            # That way we decouple the value and action selecting processes (DOUBLE DQN)
+            not_done_size = not_done_batch.sum()
+            if self.target_net is not None:
+                next_state_values = self.target_net(next_state_batch)
+                next_best_action = self.net(next_state_batch).argmax(1)
+            else:
+                next_state_values = self.net(next_state_batch)
+                next_best_action = next_state_values.argmax(1)
+
+            best_next_state_value = torch.zeros(self.config['batch_size'])
+            best_next_state_value[not_done_batch] = next_state_values.gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)
+
+        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
+
+        loss = F.mse_loss(obtained_values, expected_values)
+
+        if self.logger is not None:
+            self.logger.append("Loss", loss.item())
+
+        self.net.zero_grad()
+        loss.backward()
+        self.net.clamp_gradients()
+        self.net.step()
+
+        if self.target_net is not None:
+            if 'target_sync_tau' in self.config:
+                self.target_net.partial_sync(self.config['target_sync_tau'])
+            else:
+                self.target_net.sync()
diff --git a/rltorch/agents/__init__.py b/rltorch/agents/__init__.py
new file mode 100644
index 0000000..205fd9c
--- /dev/null
+++ b/rltorch/agents/__init__.py
@@ -0,0 +1 @@
+from .DQNAgent import *
\ No newline at end of file
diff --git a/rltorch/env/__init__.py b/rltorch/env/__init__.py
new file mode 100644
index 0000000..bdf3087
--- /dev/null
+++ b/rltorch/env/__init__.py
@@ -0,0 +1,2 @@
+from .wrappers import *
+from .simulate import *
\ No newline at end of file
diff --git a/rltorch/env/simulate.py b/rltorch/env/simulate.py
new file mode 100644
index 0000000..42ca46f
--- /dev/null
+++ b/rltorch/env/simulate.py
@@ -0,0 +1,21 @@
+def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger = None, name = ""):
+    for episode in range(total_episodes):
+        state = env.reset()
+        done = False
+        episode_reward = 0
+        while not done:
+            action = actor.act(state)
+            next_state, reward, done, _ =
env.step(action) + + episode_reward = episode_reward + reward + if memory is not None: + memory.append(state, action, reward, next_state, done) + state = next_state + + if episode % config['print_stat_n_eps'] == 0: + print("episode: {}/{}, score: {}" + .format(episode, total_episodes, episode_reward)) + + if logger is not None: + logger.append(name + '/EpisodeReward', episode_reward) + diff --git a/rltorch/env/wrappers.py b/rltorch/env/wrappers.py new file mode 100644 index 0000000..065b931 --- /dev/null +++ b/rltorch/env/wrappers.py @@ -0,0 +1,129 @@ +import gym +import torch +from gym import spaces +import cv2 +from collections import deque + +# Mostly derived from OpenAI baselines +class FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + gym.Wrapper.__init__(self, env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + +class LazyFrames(object): + def __init__(self, frames): + """This object ensures that common frames between the observations are only stored once. + It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay + buffers. + This object should only be converted to numpy array before being passed to the model. + You'd not believe how complex the previous solution was.""" + self._frames = frames + self._out = None + + def _force(self): + if self._out is None: + self._out = torch.stack(self._frames) + self._frames = None + return self._out + + def __array__(self, dtype=None): + out = self._force() + if dtype is not None: + out = out.astype(dtype) + return out + + def __len__(self): + return len(self._force()) + + def __getitem__(self, i): + return self._force()[i] + +class FrameStack(gym.Wrapper): + def __init__(self, env, k): + """Stack k last frames. + Returns lazy array, which is much more memory efficient. 
+ See Also + -------- + baselines.common.atari_wrappers.LazyFrames + """ + gym.Wrapper.__init__(self, env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return self._get_ob() + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return self._get_ob(), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + # return LazyFrames(list(self.frames)) + return torch.cat(list(self.frames)).unsqueeze(0) + +class ProcessFrame(gym.Wrapper): + def __init__(self, env, resize_shape = None, crop_bounds = None, grayscale = False): + gym.Wrapper.__init__(self, env) + self.resize_shape = resize_shape + self.crop_bounds = crop_bounds + self.grayscale = grayscale + + def reset(self): + return self._preprocess(self.env.reset()) + + def step(self, action): + next_state, reward, done, info = self.env.step(action) + next_state = self._preprocess(next_state) + return next_state, reward, done, info + + def _preprocess(self, frame): + if self.grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + if self.crop_bounds is not None and len(self.crop_bounds) == 4: + frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]] + if self.resize_shape is not None and len(self.resize_shape) == 2: + frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA) + # Normalize + frame = frame / 255 + return frame + + +# Turns observations into torch tensors +# Adds an additional dimension that's suppose to represent the batch dim +class TorchWrap(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + + def reset(self): + return self._convert(self.env.reset()) + + def step(self, action): + next_state, reward, done, info = self.env.step(action) + next_state = self._convert(next_state) + return next_state, reward, done, info + + def _convert(self, frame): + frame = torch.from_numpy(frame).unsqueeze(0).float() + return frame \ No newline at end of file diff --git a/rltorch/log.py b/rltorch/log.py new file mode 100644 index 0000000..37e5343 --- /dev/null +++ b/rltorch/log.py @@ -0,0 +1,44 @@ +from collections import Counter +import numpy as np +import torch + +class Logger: + def __init__(self): + self.log = {} + def append(self, tag, value): + if tag not in self.log.keys(): + self.log[tag] = [] + self.log[tag].append(value) + def keys(self): + return self.log.keys() + def __len__(self): + return len(self.log) + def __iter__(self): + return iter(self.log) + def __contains__(self, value): + return value in self.log + def __getitem__(self, index): + return self.log[index] + def __setitem__(self, index, value): + self.log[index] = value + def __reversed__(self): + return reversed(self.log) + +# Workaround since we can't use SummaryWriter in a different process +class LogWriter: + def __init__(self, logger, writer): + self.logger = logger + self.writer = writer + self.steps = Counter() + def write(self): + for key in self.logger.keys(): + for value in self.logger[key]: + self.steps[key] += 1 + if isinstance(value, int) or isinstance(value, float): + self.writer.add_scalar(key, value, self.steps[key]) + if isinstance(value, np.ndarray) or isinstance(value, torch.Tensor): + self.writer.add_histogram(key, value, self.steps[key]) + 
self.logger.log = {} + def close(self): + self.writer.close() + diff --git a/rltorch/memory/ReplayMemory.py b/rltorch/memory/ReplayMemory.py new file mode 100644 index 0000000..f9d6b2f --- /dev/null +++ b/rltorch/memory/ReplayMemory.py @@ -0,0 +1,55 @@ +from random import sample +from collections import namedtuple +import torch +Transition = namedtuple('Transition', + ('state', 'action', 'reward', 'next_state', 'done')) + +# Implements a Ring Buffer +class ReplayMemory(object): + def __init__(self, capacity): + self.capacity = capacity + self.memory = [] + self.position = 0 + + def append(self, *args): + """Saves a transition.""" + if len(self.memory) < self.capacity: + self.memory.append(None) + self.memory[self.position] = Transition(*args) + self.position = (self.position + 1) % self.capacity + + def clear(self): + self.memory.clear() + self.position = 0 + + def sample(self, batch_size): + return sample(self.memory, batch_size) + + def __len__(self): + return len(self.memory) + + def __iter__(self): + return iter(self.memory) + + def __contains__(self, value): + return value in self.memory + + def __getitem__(self, index): + return self.memory[index] + + def __setitem__(self, index, value): + self.memory[index] = value + + def __reversed__(self): + return reversed(self.memory) + +def zip_batch(minibatch): + state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch) + + state_batch = torch.cat(state_batch) + action_batch = torch.tensor(action_batch) + reward_batch = torch.tensor(reward_batch) + not_done_batch = ~torch.tensor(done_batch) + next_state_batch = torch.cat(next_state_batch)[not_done_batch] + + return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch \ No newline at end of file diff --git a/rltorch/memory/__init__.py b/rltorch/memory/__init__.py new file mode 100644 index 0000000..d4f414f --- /dev/null +++ b/rltorch/memory/__init__.py @@ -0,0 +1 @@ +from .ReplayMemory import * diff --git a/rltorch/mp/EnvironmentEpisode.py b/rltorch/mp/EnvironmentEpisode.py new file mode 100644 index 0000000..a0f03f2 --- /dev/null +++ b/rltorch/mp/EnvironmentEpisode.py @@ -0,0 +1,35 @@ +from copy import deepcopy +import torch.multiprocessing as mp + +class EnvironmentEpisode(mp.Process): + def __init__(self, env, actor, config, memory = None, logger = None, name = ""): + super(EnvironmentEpisode, self).__init__() + self.env = env + self.actor = actor + self.memory = memory + self.config = deepcopy(config) + self.logger = logger + self.name = name + self.episode_num = 1 + + def run(self, printstat = False): + state = self.env.reset() + done = False + episode_reward = 0 + while not done: + action = self.actor.act(state) + next_state, reward, done, _ = self.env.step(action) + + episode_reward = episode_reward + reward + if self.memory is not None: + self.memory.append(state, action, reward, next_state, done) + state = next_state + + if printstat: + print("episode: {}/{}, score: {}" + .format(self.episode_num, self.config['total_training_episodes'], episode_reward)) + if self.logger is not None: + self.logger.append(self.name + '/EpisodeReward', episode_reward) + + self.episode_num += 1 + diff --git a/rltorch/mp/EnvironmentRun.py b/rltorch/mp/EnvironmentRun.py new file mode 100644 index 0000000..b73f8a2 --- /dev/null +++ b/rltorch/mp/EnvironmentRun.py @@ -0,0 +1,39 @@ +from copy import deepcopy +import torch.multiprocessing as mp + +class EnvironmentRun(mp.Process): + def __init__(self, env, actor, config, memory = None, logger = None, name = ""): 
+        super(EnvironmentRun, self).__init__()
+        self.env = env
+        self.actor = actor
+        self.memory = memory
+        self.config = deepcopy(config)
+        self.logger = logger
+        self.name = name
+        self.episode_num = 1
+        self.episode_reward = 0
+        self.last_state = env.reset()
+
+    def run(self, iterations = 1, printstat = False):
+        state = self.last_state
+        for _ in range(iterations):
+            action = self.actor.act(state)
+            next_state, reward, done, _ = self.env.step(action)
+
+            self.episode_reward = self.episode_reward + reward
+            if self.memory is not None:
+                self.memory.append(state, action, reward, next_state, done)
+            state = next_state
+
+            if done:
+                if printstat:
+                    print("episode: {}/{}, score: {}"
+                        .format(self.episode_num, self.config['total_training_episodes'], self.episode_reward))
+                if self.logger is not None:
+                    self.logger.append(self.name + '/EpisodeReward', self.episode_reward)
+                self.episode_num = self.episode_num + 1
+                self.episode_reward = 0
+                state = self.env.reset()
+
+        self.last_state = state
+
diff --git a/rltorch/mp/__init__.py b/rltorch/mp/__init__.py
new file mode 100644
index 0000000..b79f4a9
--- /dev/null
+++ b/rltorch/mp/__init__.py
@@ -0,0 +1,2 @@
+from .EnvironmentEpisode import *
+from .EnvironmentRun import *
\ No newline at end of file
diff --git a/rltorch/network/Network.py b/rltorch/network/Network.py
new file mode 100644
index 0000000..0fbe73f
--- /dev/null
+++ b/rltorch/network/Network.py
@@ -0,0 +1,29 @@
+class Network:
+    """
+    Wrapper around a model and its optimizer that provides convenience methods for training and logging
+    """
+    def __init__(self, model, optimizer, config, logger = None, name = ""):
+        self.model = model
+        self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay'])
+        self.logger = logger
+        self.name = name
+
+    def __call__(self, *args):
+        return self.model(*args)
+
+    def clamp_gradients(self):
+        for param in self.model.parameters():
+            param.grad.data.clamp_(-1, 1)
+
+    def zero_grad(self):
+        self.model.zero_grad()
+
+    def step(self):
+        self.optimizer.step()
+
+    def log_named_parameters(self):
+        if self.logger is not None:
+            for name, param in self.model.named_parameters():
+                self.logger.append(self.name + "/" + name, param.cpu().detach().numpy())
+
+
diff --git a/rltorch/network/NoisyLinear.py b/rltorch/network/NoisyLinear.py
new file mode 100644
index 0000000..066069b
--- /dev/null
+++ b/rltorch/network/NoisyLinear.py
@@ -0,0 +1,44 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+# This class utilizes the following property of the normal distribution:
+# N(mu, sigma) = mu + sigma * N(0, 1)
+class NoisyLinear(nn.Linear):
+    def __init__(self, in_features, out_features, sigma_init = 0.017, bias = True):
+        super(NoisyLinear, self).__init__(in_features, out_features, bias = bias)
+        # One of the parameters the network is going to tune is the
+        # standard deviation of the Gaussian noise on the weights
+        self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init))
+        # Reserve space for N(0, 1) samples of the weights in the forward() call
+        self.register_buffer("s_normal_weight", torch.zeros(out_features, in_features))
+        if bias:
+            # If a bias exists, then we manipulate the standard deviation of the
+            # Gaussian noise on it as well
+            self.sigma_bias = nn.Parameter(torch.Tensor(out_features).fill_(sigma_init))
+            # Reserve space for N(0, 1) samples of the bias in the forward() call
+            self.register_buffer("s_normal_bias", torch.zeros(out_features))
+        self.reset_parameters()
+
+    def
reset_parameters(self): + std = math.sqrt(3 / self.in_features) + nn.init.uniform_(self.weight, -std, std) + nn.init.uniform_(self.bias, -std, std) + + def forward(self, x): + # Fill s_normal_weight with values from the standard normal distribution + torch.randn(self.s_normal_weight.size(), out = self.s_normal_weight, + dtype = self.s_normal_weight.dtype, layout = self.s_normal_weight.layout, device = self.s_normal_weight.device) + # Multiply by the standard deviation to correct the spread of Gaussian noise + weight_noise = self.sigma_weight * self.s_normal_weight.clone().requires_grad_() + + bias = None + if self.bias is not None: + # Fill s_normal_bias with values from standard normal + torch.randn(self.s_normal_bias.size(), out = self.s_normal_bias, + dtype = self.s_normal_bias.dtype, layout = self.s_normal_bias.layout, device = self.s_normal_bias.device) + # Add guassian noise to original bias + bias = self.bias + self.sigma_bias * self.s_normal_bias.clone().requires_grad_() + + return F.linear(x, self.weight + weight_noise, bias) \ No newline at end of file diff --git a/rltorch/network/TargetNetwork.py b/rltorch/network/TargetNetwork.py new file mode 100644 index 0000000..c3d9184 --- /dev/null +++ b/rltorch/network/TargetNetwork.py @@ -0,0 +1,28 @@ +from copy import deepcopy +# Derived from ptan library +class TargetNetwork: + """ + Wrapper around model which provides copy of it instead of trained weights + """ + def __init__(self, network): + self.model = network.model + self.target_model = deepcopy(network.model) + + def __call__(self, *args): + return self.model(*args) + + def sync(self): + self.target_model.load_state_dict(self.model.state_dict()) + + def partial_sync(self, tau): + """ + Blend params of target net with params from the model + :param tau: + """ + assert isinstance(tau, float) + assert 0.0 < tau <= 1.0 + model_state = self.model.state_dict() + target_state = self.target_model.state_dict() + for grad_index, grad in model_state.items(): + target_state[grad_index].copy_((1 - tau) * target_state[grad_index] + tau * grad) + self.target_model.load_state_dict(target_state) \ No newline at end of file diff --git a/rltorch/network/__init__.py b/rltorch/network/__init__.py new file mode 100644 index 0000000..3d85005 --- /dev/null +++ b/rltorch/network/__init__.py @@ -0,0 +1,3 @@ +from .Network import * +from .NoisyLinear import * +from .TargetNetwork import * \ No newline at end of file diff --git a/rltorch/seed.py b/rltorch/seed.py new file mode 100644 index 0000000..74d685b --- /dev/null +++ b/rltorch/seed.py @@ -0,0 +1,16 @@ +from os import environ +import numpy as np +import random +import torch + +def set_seed(SEED): + # Set `PYTHONHASHSEED` environment variable at a fixed value + environ['PYTHONHASHSEED'] = str(SEED) + + np.random.seed(SEED) + random.seed(SEED) + + # Pytorch + torch.manual_seed(SEED) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..c0763be --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +""" +rltorch stands for Reinforcement Learning Torch -- RL library built on top of PyTorch +""" +import setuptools + + +setuptools.setup( + name="rltorch", + author="Brandon Rozek", + author_email="rozekbrandon@gmail.com", + license='MIT', + description="Reinforcement Learning Framework for PyTorch", + version="0.1", + packages=setuptools.find_packages(), +) \ No newline at end of file
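For reference, TargetNetwork.partial_sync above implements the usual soft-update rule, new_target = (1 - tau) * old_target + tau * online, while sync performs a hard copy. A minimal sketch of the difference, using an nn.Linear stand-in for the model (the layer and the config values are illustrative):

```
import torch
import torch.nn as nn
import rltorch.network as rn

config = {'learning_rate': 1e-3, 'weight_decay': 0}
net = rn.Network(nn.Linear(2, 2), torch.optim.Adam, config)
target = rn.TargetNetwork(net)

with torch.no_grad():
    net.model.weight.add_(1.0)  # drift the online weights away from the target copy

target.partial_sync(0.1)  # target moves 10% of the way toward the online weights
target.sync()             # hard copy: target now matches the online network exactly
```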