diff --git a/pz_risk/train_v.py b/pz_risk/train_v.py new file mode 100644 index 0000000..37e2e37 --- /dev/null +++ b/pz_risk/train_v.py @@ -0,0 +1,156 @@ +import os +import torch +import random +import numpy as np + +from risk_env import env +import training.utils as utils +from training.dvn import DVNAgent +from training.arguments import get_args +from wrappers import GraphObservationWrapper, DenseRewardWrapper, SparseRewardWrapper + +from agents.sampling import SAMPLING +from agents.value import get_future, get_attack_dist +from utils import get_feat_adj_from_board +from agents import GreedyAgent, RandomAgent +from copy import deepcopy + +from tqdm import tqdm +import matplotlib.pyplot as plt + + +def main(): + args = get_args() + + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + log_dir = os.path.expanduser(args.log_dir) + eval_log_dir = log_dir + "_eval" + utils.cleanup_log_dir(log_dir) + utils.cleanup_log_dir(eval_log_dir) + + torch.set_num_threads(1) + device = torch.device("cuda:0" if args.cuda else "cpu") + + e = env(n_agent=2, board_name='4node') + e = GraphObservationWrapper(e) + e = SparseRewardWrapper(e) + e.reset() + _, _, _, info = e.last() + n_nodes = info['nodes'] + n_agents = info['agents'] + max_episode = 3000 + max_epi_step = 200 + + epsilon = 0.9 + epsilon_min = 0.005 + decay_rate = 0.005 + + feat_size = e.observation_spaces['feat'].shape[0] + hidden_size = 20 + + critic = DVNAgent(n_nodes, n_agents, feat_size, hidden_size) + save_path = './mini_7/' + load = 0 + # critic.load_state_dict(torch.load(os.path.join(save_path, str(load) + ".pt"))) + loss_list = [] + reward_list = [] + + # players = [None] + # players = [RandomAgent(i) for i in range(1, 6)] + + for episode in tqdm(range(load, max_episode)): + + e.reset() + state, _, _, _ = e.last() + loss_epi = [] + reward_epi = [] + for agent_id in e.agent_iter(max_iter=max_epi_step): + # for a in e.possible_agents: + # e.unwrapped.land_hist[a].append(len(e.unwrapped.board.player_nodes(a))) + # e.unwrapped.unit_hist[a].append(e.unwrapped.board.player_units(a)) + # e.unwrapped.place_hist[a].append(e.unwrapped.board.calc_units(a)) + # make an action based on epsilon greedy action + state, _, _, info = e.last() + critic.eval() + if agent_id != 0: + task_id = state['task_id'] + action = SAMPLING[task_id](e.unwrapped.board, agent_id) + else: + # Use Model to Gather Future State per Valid Actions + action_scores = [] + deterministic, valid_actions = e.unwrapped.board.valid_actions(agent_id) + for valid_action in valid_actions: + sim = deepcopy(e.unwrapped.board) + if deterministic: + sim.step(agent_id, valid_action) + else: + dist = get_attack_dist(e.unwrapped.board, valid_action) + if len(dist): # TODO: Change to sampling + left = get_future(dist, mode='two', risk=0.2) + sim.step(agent_id, valid_action, left) + else: + sim.step(agent_id, valid_action) + sim_feat, sim_adj = get_feat_adj_from_board(sim, agent_id, e.unwrapped.n_agents, e.unwrapped.n_grps) + sim_feat = torch.tensor(sim_feat, dtype=torch.float32, device=device).reshape(-1, + n_nodes + n_agents, + feat_size) + sim_adj = torch.tensor(sim_adj, dtype=torch.float32, device=device).reshape(-1, n_nodes + n_agents, + n_nodes + n_agents) + action_scores.append(critic(sim_feat, sim_adj).detach().cpu().numpy()[:, n_nodes + agent_id]) + action = valid_actions[np.argmax(action_scores)] + before_feat 
= torch.tensor(state['feat'], dtype=torch.float32, device=device).reshape(-1, + n_nodes + n_agents, + feat_size) + before_adj = torch.tensor(state['adj'], dtype=torch.float32, device=device).reshape(-1, n_nodes + n_agents, + n_nodes + n_agents) + + e.step(action) + state, _, _, info = e.last() + feat = torch.tensor(state['feat'], dtype=torch.float32, device=device).reshape(-1, n_nodes + n_agents, + feat_size) + adj = torch.tensor(state['adj'], dtype=torch.float32, device=device).reshape(-1, n_nodes + n_agents, + n_nodes + n_agents) + reward = torch.tensor(state['rewards'], dtype=torch.float32, device=device).reshape(-1, n_agents) + done = torch.tensor(state['dones'], dtype=torch.bool, device=device).reshape(-1, n_agents) + reward_epi.append(reward.cpu().numpy()[0]) + # e.render() + + # make a transition and save to replay memory + transition = [before_feat, before_adj, reward, feat, adj, done] + critic.save_memory(transition) + critic.train() + if critic.train_start(): + loss = critic.train_() + loss_epi.append(loss) + # print('Loss: {}, Reward: {}'.format(loss, reward)) + if all(done[0].cpu().numpy()): + break + + if epsilon > epsilon_min: + epsilon -= decay_rate + else: + epsilon = epsilon_min + + if critic.train_start(): + loss_list.append(sum(loss_epi) / len(loss_epi)) + # plt.show() + e.close() + reward_list.append(sum(reward_epi)) + + if critic.train_start(): + print(episode + 1, reward_list[-1], loss_list[-1]) + + if episode % 10 == 0: + torch.save(critic.state_dict(), os.path.join(save_path, str(episode // 10) + ".pt")) + + return loss_list, reward_list + + +if __name__ == "__main__": + main() diff --git a/pz_risk/training/__init__.py b/pz_risk/training/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pz_risk/training/arguments.py b/pz_risk/training/arguments.py new file mode 100644 index 0000000..1cba9c9 --- /dev/null +++ b/pz_risk/training/arguments.py @@ -0,0 +1,145 @@ +import argparse + +import torch + + +def get_args(): + parser = argparse.ArgumentParser(description='RL') + parser.add_argument( + '--lr', type=float, default=7e-4, help='learning rate (default: 7e-4)') + parser.add_argument( + '--eps', + type=float, + default=1e-5, + help='RMSprop optimizer epsilon (default: 1e-5)') + parser.add_argument( + '--alpha', + type=float, + default=0.99, + help='RMSprop optimizer alpha (default: 0.99)') + parser.add_argument( + '--gamma', + type=float, + default=0.99, + help='discount factor for rewards (default: 0.99)') + parser.add_argument( + '--use-gae', + action='store_true', + default=False, + help='use generalized advantage estimation') + parser.add_argument( + '--gae-lambda', + type=float, + default=0.95, + help='gae lambda parameter (default: 0.95)') + parser.add_argument( + '--entropy-coef', + type=float, + default=0.01, + help='entropy term coefficient (default: 0.01)') + parser.add_argument( + '--value-loss-coef', + type=float, + default=0.5, + help='value loss coefficient (default: 0.5)') + parser.add_argument( + '--max-grad-norm', + type=float, + default=0.5, + help='max norm of gradients (default: 0.5)') + parser.add_argument( + '--seed', type=int, default=1, help='random seed (default: 1)') + parser.add_argument( + '--cuda-deterministic', + action='store_true', + default=False, + help="sets flags for determinism when using CUDA (potentially slow!)") + parser.add_argument( + '--num-processes', + type=int, + default=1, + help='how many training CPU processes to use (default: 1)') + parser.add_argument( + '--num-steps', + type=int, + default=5, +
help='number of forward steps in A2C (default: 5)') + parser.add_argument( + '--ppo-epoch', + type=int, + default=4, + help='number of ppo epochs (default: 4)') + parser.add_argument( + '--num-mini-batch', + type=int, + default=32, + help='number of batches for ppo (default: 32)') + parser.add_argument( + '--clip-param', + type=float, + default=0.2, + help='ppo clip parameter (default: 0.2)') + parser.add_argument( + '--log-interval', + type=int, + default=10, + help='log interval, one log per n updates (default: 10)') + parser.add_argument( + '--save-interval', + type=int, + default=100, + help='save interval, one save per n updates (default: 100)') + parser.add_argument( + '--eval-interval', + type=int, + default=None, + help='eval interval, one eval per n updates (default: None)') + parser.add_argument( + '--num-env-steps', + type=int, + default=int(10e6), + help='number of environment steps to train (default: 10e6)') + parser.add_argument( + '--env-name', + default='Risk-Normal-6-v0', + help='environment to train on (default: Risk-Normal-6-v0)') + parser.add_argument( + '--log-dir', + default='/tmp/gym/', + help='directory to save agent logs (default: /tmp/gym)') + parser.add_argument( + '--save-dir', + default='./trained_models/', + help='directory to save trained models (default: ./trained_models/)') + parser.add_argument( + '--no-cuda', + action='store_true', + default=False, + help='disables CUDA training') + parser.add_argument( + '--use-proper-time-limits', + action='store_true', + default=False, + help='compute returns taking into account time limits') + parser.add_argument( + '--recurrent-policy', + action='store_true', + default=False, + help='use a recurrent policy') + parser.add_argument( + '--use-linear-lr-decay', + action='store_true', + default=False, + help='use a linear schedule on the learning rate') + parser.add_argument( + '--dir', + default='./mini_7/100.pt', + help='checkpoint file to load (default: ./mini_7/100.pt)') + + + + args = parser.parse_args() + + args.cuda = not args.no_cuda and torch.cuda.is_available() + + return args diff --git a/pz_risk/training/distributions.py b/pz_risk/training/distributions.py new file mode 100644 index 0000000..481cb39 --- /dev/null +++ b/pz_risk/training/distributions.py @@ -0,0 +1,109 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from training.utils import AddBias, init + +""" +Modify standard PyTorch distributions so they are compatible with this code.
+""" + +# +# Standardize distribution interfaces +# + +# Categorical +class FixedCategorical(torch.distributions.Categorical): + def sample(self): + return super().sample().unsqueeze(-1) + + def log_probs(self, actions): + return ( + super() + .log_prob(actions.squeeze(-1)) + .view(actions.size(0), -1) + .sum(-1) + .unsqueeze(-1) + ) + + def mode(self): + return self.probs.argmax(dim=-1, keepdim=True) + + +# Normal +class FixedNormal(torch.distributions.Normal): + def log_probs(self, actions): + return super().log_prob(actions).sum(-1, keepdim=True) + + def entropy(self): + return super().entropy().sum(-1) + + def mode(self): + return self.mean + + +# Bernoulli +class FixedBernoulli(torch.distributions.Bernoulli): + def log_probs(self, actions): + return super.log_prob(actions).view(actions.size(0), -1).sum(-1).unsqueeze(-1) + + def entropy(self): + return super().entropy().sum(-1) + + def mode(self): + return torch.gt(self.probs, 0.5).float() + + +class Categorical(nn.Module): + def __init__(self, num_inputs, num_outputs): + super(Categorical, self).__init__() + + init_ = lambda m: init( + m, + nn.init.orthogonal_, + lambda x: nn.init.constant_(x, 0), + gain=0.01) + + self.linear = init_(nn.Linear(num_inputs, num_outputs)) + + def forward(self, x): + x = self.linear(x) + return FixedCategorical(logits=x) + + +class DiagGaussian(nn.Module): + def __init__(self, num_inputs, num_outputs): + super(DiagGaussian, self).__init__() + + init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. + constant_(x, 0)) + + self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) + self.logstd = AddBias(torch.zeros(num_outputs)) + + def forward(self, x): + action_mean = self.fc_mean(x) + + # An ugly hack for my KFAC implementation. + zeros = torch.zeros(action_mean.size()) + if x.is_cuda: + zeros = zeros.cuda() + + action_logstd = self.logstd(zeros) + return FixedNormal(action_mean, action_logstd.exp()) + + +class Bernoulli(nn.Module): + def __init__(self, num_inputs, num_outputs): + super(Bernoulli, self).__init__() + + init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 
+ constant_(x, 0)) + + self.linear = init_(nn.Linear(num_inputs, num_outputs)) + + def forward(self, x): + x = self.linear(x) + return FixedBernoulli(logits=x) diff --git a/pz_risk/training/dvn.py b/pz_risk/training/dvn.py new file mode 100644 index 0000000..68305a3 --- /dev/null +++ b/pz_risk/training/dvn.py @@ -0,0 +1,141 @@ +### DVN ### +### Deep Value Network ### +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import torchvision.transforms as T + +import math +import random +import numpy as np + +from itertools import count +from collections import namedtuple + +from training.utils import * + +Transition = namedtuple('Transition', ('before_feat', 'before_adj', 'reward', 'feat', 'adj', 'done')) + + +class ReplayMemory(object): + + def __init__(self, capacity): + self.capacity = capacity + self.memory = [] + self.position = 0 + + def push(self, *args): + """Saves a transition.""" + if len(self.memory) < self.capacity: + self.memory.append(None) + self.memory[self.position] = Transition(*args) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + return random.sample(self.memory, batch_size) + + def __len__(self): + return len(self.memory) + + +class GNN(nn.Module): + def __init__(self, transform, activation): + super(GNN, self).__init__() + + self.transform = transform + self.activation = activation + + def forward(self, feat, adj): + seq = self.transform(feat) + ret = torch.matmul(adj, seq) + return self.activation(ret) + + +class DVN(nn.Module): + def __init__(self, feat_space, hidden_size): + super(DVN, self).__init__() + + self.h1 = GNN(init_(nn.Linear(feat_space, hidden_size)), nn.Tanh()) + self.h2 = GNN(init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh()) + + self.critic_linear = init_(nn.Linear(hidden_size, 1)) + self.train() + + def forward(self, feat, adj): + hidden = self.h1(feat, adj) + hidden = self.h2(hidden, adj) + return self.critic_linear(hidden) + + +class DVNAgent(nn.Module): + def __init__(self, num_nodes, num_agents, feat_size, hidden_size, device='cuda:0'): + super(DVNAgent, self).__init__() + self.device = device + self.policy_network = DVN(feat_size, hidden_size).to(device) + self.target_network = DVN(feat_size, hidden_size).to(device) + + self.memory = ReplayMemory(2 ** 10) + self.target_network.load_state_dict(self.policy_network.state_dict()) + self.target_network.eval() + + self.optimizer = optim.RMSprop(self.policy_network.parameters()) + + self.batch_size = 100 + self.gamma = 0.9999 + self.target_update = 2000 + self.num_train = 0 + self.n_nodes = num_nodes + self.n_agents = num_agents + self.feat_size = feat_size + + def forward(self, feat, adj): + with torch.no_grad(): + return self.policy_network(feat, adj) + + def train_(self): + self.num_train += 1 + transitions = self.memory.sample(self.batch_size) + batch = Transition(*zip(*transitions)) + + non_final_mask_feat = torch.tensor(tuple(map(lambda s: s is not None, batch.feat))) + # non_final_mask_adj = torch.tensor(tuple(map(lambda s: s is not None, batch.adj)), torch.bool, self.device) + non_final_next_feat = torch.cat([s for s in batch.feat if s is not None]).view(-1, self.n_nodes + self.n_agents, self.feat_size) + non_final_next_adj = torch.cat([s for s in batch.adj if s is not None]).view(-1, self.n_nodes + self.n_agents, self.n_nodes + self.n_agents) + + feat_batch = torch.cat(batch.before_feat).view(-1, self.n_nodes + self.n_agents, self.feat_size) + adj_batch = torch.cat(batch.before_adj).view(-1, self.n_nodes 
+ self.n_agents, self.n_nodes + self.n_agents) + reward_batch = torch.cat(batch.reward) + + q = self.policy_network(feat_batch, adj_batch).squeeze()[:, self.n_nodes:] + + y = torch.zeros([self.batch_size, self.n_agents], device=self.device) + y[non_final_mask_feat] = self.target_network(non_final_next_feat, non_final_next_adj)[:, self.n_nodes:].squeeze().detach() * self.gamma + y += reward_batch + + # Compute Huber loss + loss = F.smooth_l1_loss(y, q) + + # Optimize the model + self.optimizer.zero_grad() + loss.backward() + for param in self.policy_network.parameters(): + param.grad.data.clamp_(-1, 1) + self.optimizer.step() + + if self.num_train % self.target_update == 0: + self.update_target() + + return loss.item() + + def train_start(self): + return len(self.memory) > self.batch_size # Boolean + + def update_target(self): + self.target_network.load_state_dict(self.policy_network.state_dict()) + self.target_network.eval() + + def save_memory(self, transition): + # transition[1] = torch.tensor([[transition[1]]], device=self.device, dtype=torch.long) # Action + # transition[2] = torch.tensor([transition[2]], device=self.device) # Reward + self.memory.push(*transition) diff --git a/pz_risk/training/enjoy.py b/pz_risk/training/enjoy.py new file mode 100644 index 0000000..a645fa7 --- /dev/null +++ b/pz_risk/training/enjoy.py @@ -0,0 +1,96 @@ +import argparse +import os +# workaround to unpickle olf model files +import sys + +import numpy as np +import torch + +from a2c_ppo_acktr.envs import VecPyTorch, make_vec_envs +from a2c_ppo_acktr.utils import get_render_func, get_vec_normalize + +sys.path.append('a2c_ppo_acktr') + +parser = argparse.ArgumentParser(description='RL') +parser.add_argument( + '--seed', type=int, default=1, help='random seed (default: 1)') +parser.add_argument( + '--log-interval', + type=int, + default=10, + help='log interval, one log per n updates (default: 10)') +parser.add_argument( + '--env-name', + default='PongNoFrameskip-v4', + help='environment to train on (default: PongNoFrameskip-v4)') +parser.add_argument( + '--load-dir', + default='./trained_models/', + help='directory to save agent logs (default: ./trained_models/)') +parser.add_argument( + '--non-det', + action='store_true', + default=False, + help='whether to use a non-deterministic policy') +args = parser.parse_args() + +args.det = not args.non_det + +env = make_vec_envs( + args.env_name, + args.seed + 1000, + 1, + None, + None, + device='cpu', + allow_early_resets=False) + +# Get a render function +render_func = get_render_func(env) + +# We need to use the same statistics for normalization as used in training +actor_critic, obs_rms = \ + torch.load(os.path.join(args.load_dir, args.env_name + ".pt"), + map_location='cpu') + +vec_norm = get_vec_normalize(env) +if vec_norm is not None: + vec_norm.eval() + vec_norm.obs_rms = obs_rms + +recurrent_hidden_states = torch.zeros(1, + actor_critic.recurrent_hidden_state_size) +masks = torch.zeros(1, 1) + +obs = env.reset() + +if render_func is not None: + render_func('human') + +if args.env_name.find('Bullet') > -1: + import pybullet as p + + torsoId = -1 + for i in range(p.getNumBodies()): + if (p.getBodyInfo(i)[0].decode() == "torso"): + torsoId = i + +while True: + with torch.no_grad(): + value, action, _, recurrent_hidden_states = actor_critic.act( + obs, recurrent_hidden_states, masks, deterministic=args.det) + + # Obser reward and next obs + obs, reward, done, _ = env.step(action) + + masks.fill_(0.0 if done else 1.0) + + if args.env_name.find('Bullet') > -1: + 
if torsoId > -1: + distance = 5 + yaw = 0 + humanPos, humanOrn = p.getBasePositionAndOrientation(torsoId) + p.resetDebugVisualizerCamera(distance, yaw, -20, humanPos) + + if render_func is not None: + render_func('human') diff --git a/pz_risk/training/envs.py b/pz_risk/training/envs.py new file mode 100644 index 0000000..6d26e7f --- /dev/null +++ b/pz_risk/training/envs.py @@ -0,0 +1,188 @@ +import os + +import gym +import numpy as np +import torch +from gym.spaces.box import Box + + +from common.vec_env import (DummyVecEnv, SubprocVecEnv, VecEnvWrapper) +from common.vec_env.vec_normalize import VecNormalize as VecNormalize_ + +try: + import pz_risk.envs +except ImportError: + pass + + +def make_env(env_id, seed, rank): + def _thunk(): + env = gym.make(env_id) + env.seed(seed + rank) + + return env + + return _thunk + + +def make_vec_envs(env_name, + seed, + num_processes, + gamma, + log_dir, + device, + allow_early_resets, + num_frame_stack=None): + envs = [make_env(env_name, seed, i) for i in range(num_processes)] + + if len(envs) > 1: + envs = SubprocVecEnv(envs) + else: + envs = DummyVecEnv(envs) + + if len(envs.observation_space.shape) == 1: + if gamma is None: + envs = VecNormalize(envs, norm_reward=False) + else: + envs = VecNormalize(envs, gamma=gamma) + + envs = VecPyTorch(envs, device) + + if num_frame_stack is not None: + envs = VecPyTorchFrameStack(envs, num_frame_stack, device) + elif len(envs.observation_space.shape) == 3: + envs = VecPyTorchFrameStack(envs, 4, device) + + return envs + + +# Can be used to test recurrent policies for Reacher-v2 +class MaskGoal(gym.ObservationWrapper): + def observation(self, observation): + if self.env._elapsed_steps > 0: + observation[-2:] = 0 + return observation + + +class TransposeObs(gym.ObservationWrapper): + def __init__(self, env=None): + """ + Transpose observation space (base class) + """ + super(TransposeObs, self).__init__(env) + + +class TransposeImage(TransposeObs): + def __init__(self, env=None, op=[2, 0, 1]): + """ + Transpose observation space for images + """ + super(TransposeImage, self).__init__(env) + assert len(op) == 3, "Error: Operation, " + str(op) + ", must be dim3" + self.op = op + obs_shape = self.observation_space.shape + self.observation_space = Box( + self.observation_space.low[0, 0, 0], + self.observation_space.high[0, 0, 0], [ + obs_shape[self.op[0]], obs_shape[self.op[1]], + obs_shape[self.op[2]] + ], + dtype=self.observation_space.dtype) + + def observation(self, ob): + return ob.transpose(self.op[0], self.op[1], self.op[2]) + + +class VecPyTorch(VecEnvWrapper): + def __init__(self, venv, device): + """Return only every `skip`-th frame""" + super(VecPyTorch, self).__init__(venv) + self.device = device + # TODO: Fix data types + + def reset(self): + obs = self.venv.reset() + obs = torch.from_numpy(obs).float().to(self.device) + return obs + + def step_async(self, actions): + if isinstance(actions, torch.LongTensor): + # Squeeze the dimension for discrete actions + actions = actions.squeeze(1) + actions = actions.cpu().numpy() + self.venv.step_async(actions) + + def step_wait(self): + obs, reward, done, info = self.venv.step_wait() + obs = torch.from_numpy(obs).float().to(self.device) + reward = torch.from_numpy(reward).unsqueeze(dim=1).float() + return obs, reward, done, info + + +class VecNormalize(VecNormalize_): + def __init__(self, *args, **kwargs): + super(VecNormalize, self).__init__(*args, **kwargs) + self.training = True + + def _obfilt(self, obs, update=True): + if self.obs_rms: + if self.training 
and update: + self.obs_rms.update(obs) + obs = np.clip((obs - self.obs_rms.mean) / + np.sqrt(self.obs_rms.var + self.epsilon), + -self.clip_obs, self.clip_obs) + return obs + else: + return obs + + def train(self): + self.training = True + + def eval(self): + self.training = False + + +# Derived from +# https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_frame_stack.py +class VecPyTorchFrameStack(VecEnvWrapper): + def __init__(self, venv, nstack, device=None): + self.venv = venv + self.nstack = nstack + + wos = venv.observation_space # wrapped ob space + self.shape_dim0 = wos.shape[0] + + low = np.repeat(wos.low, self.nstack, axis=0) + high = np.repeat(wos.high, self.nstack, axis=0) + + if device is None: + device = torch.device('cpu') + self.stacked_obs = torch.zeros((venv.num_envs,) + + low.shape).to(device) + + observation_space = gym.spaces.Box(low=low, + high=high, + dtype=venv.observation_space.dtype) + VecEnvWrapper.__init__(self, venv, observation_space=observation_space) + + def step_wait(self): + obs, rews, news, infos = self.venv.step_wait() + self.stacked_obs[:, :-self.shape_dim0] = \ + self.stacked_obs[:, self.shape_dim0:].clone() + for (i, new) in enumerate(news): + if new: + self.stacked_obs[i] = 0 + self.stacked_obs[:, -self.shape_dim0:] = obs + return self.stacked_obs, rews, news, infos + + def reset(self): + obs = self.venv.reset() + if torch.backends.cudnn.deterministic: + self.stacked_obs = torch.zeros(self.stacked_obs.shape) + else: + self.stacked_obs.zero_() + self.stacked_obs[:, -self.shape_dim0:] = obs + return self.stacked_obs + + def close(self): + self.venv.close() diff --git a/pz_risk/training/evaluation.py b/pz_risk/training/evaluation.py new file mode 100644 index 0000000..7f67a7a --- /dev/null +++ b/pz_risk/training/evaluation.py @@ -0,0 +1,48 @@ +import numpy as np +import torch + +from training import utils +from training.envs import make_vec_envs + + +def evaluate(actor_critic, obs_rms, env_name, seed, num_processes, eval_log_dir, + device): + eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes, + None, eval_log_dir, device, True) + + vec_norm = utils.get_vec_normalize(eval_envs) + if vec_norm is not None: + vec_norm.eval() + vec_norm.obs_rms = obs_rms + + eval_episode_rewards = [] + + obs = eval_envs.reset() + eval_recurrent_hidden_states = torch.zeros( + num_processes, actor_critic.recurrent_hidden_state_size, device=device) + eval_masks = torch.zeros(num_processes, 1, device=device) + + while len(eval_episode_rewards) < 10: + with torch.no_grad(): + _, action, _, eval_recurrent_hidden_states = actor_critic.act( + obs, + eval_recurrent_hidden_states, + eval_masks, + deterministic=True) + + # Obser reward and next obs + obs, _, done, infos = eval_envs.step(action) + + eval_masks = torch.tensor( + [[0.0] if done_ else [1.0] for done_ in done], + dtype=torch.float32, + device=device) + + for info in infos: + if 'episode' in info.keys(): + eval_episode_rewards.append(info['episode']['r']) + + eval_envs.close() + + print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( + len(eval_episode_rewards), np.mean(eval_episode_rewards))) diff --git a/pz_risk/training/model.py b/pz_risk/training/model.py new file mode 100644 index 0000000..df015d9 --- /dev/null +++ b/pz_risk/training/model.py @@ -0,0 +1,164 @@ +# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail + +import numpy as np + +from loguru import logger + +import torch as t +import torch.nn as nn +import torch.functional as F + +from 
training.distributions import Bernoulli, Categorical, DiagGaussian +from training.utils import * + + +class NNBase(nn.Module): + def __init__(self, hidden_size): + super(NNBase, self).__init__() + self._hidden_size = hidden_size + + @property + def output_size(self): + return self._hidden_size + + +class MLPBase(NNBase): + def __init__(self, num_inputs, hidden_size=64): + super(MLPBase, self).__init__(hidden_size) + + init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), np.sqrt(2)) + + self.actor = nn.Sequential( + init_(nn.Linear(num_inputs, hidden_size)), nn.Tanh(), + init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh()) + + self.critic = nn.Sequential( + init_(nn.Linear(num_inputs, hidden_size)), nn.Tanh(), + init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh()) + + self.critic_linear = init_(nn.Linear(hidden_size, 1)) + + self.train() + + def forward(self, inputs, rnn_hxs, masks): + x = inputs + + hidden_critic = self.critic(x) + hidden_actor = self.actor(x) + + return self.critic_linear(hidden_critic), hidden_actor, rnn_hxs + + +class GNN(nn.Module): + def __init__(self, transform, activation): + super(GNN, self).__init__() + + self.transform = transform + self.activation = activation + + def forward(self, adj, feat): + seq = self.transform(feat) + ret = t.matmul(seq, adj) + return self.activation(ret) + + +class GNNBase(NNBase): + def __init__(self, num_inputs, hidden_size=64): + super(GNNBase, self).__init__(hidden_size) + + self.actor = nn.Sequential( + GNN(init_(nn.Linear(num_inputs, hidden_size)), nn.Tanh()), + GNN(init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh())) + + self.critic = nn.Sequential( + GNN(init_(nn.Linear(num_inputs, hidden_size)), nn.Tanh()), + GNN(init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh())) + + self.critic_linear = init_(nn.Linear(hidden_size, 1)) + + self.train() + + def forward(self, features, adj, rnn_hxs, masks): + hidden_critic = self.critic(features, adj) + hidden_actor = self.actor(features, adj) + + return self.critic_linear(hidden_critic), hidden_actor, rnn_hxs + + +class Flatten(nn.Module): + def forward(self, x): + return x.view(x.size(0), -1) + + +class Policy(nn.Module): + def __init__(self, obs_shape, action_space, base_kwargs=None): + super(Policy, self).__init__() + if base_kwargs is None: + base_kwargs = {} + self.base = GNNBase(obs_shape['feat'].shape[0], **base_kwargs) + + self.dist = {} + for k, space in action_space.items(): + if space.__class__.__name__ == "Discrete": + self.dist[k] = [Categorical(self.base.output_size, space.n)] + elif space.__class__.__name__ == "MultiDiscrete": + self.dist[k] = [Categorical(self.base.output_size, v) for v in space.nvec] + elif space.__class__.__name__ == "Box": + num_outputs = action_space.shape[0] + self.dist[k] = [DiagGaussian(self.base.output_size, num_outputs)] + elif space.__class__.__name__ == "MultiBinary": + num_outputs = action_space.shape[0] + self.dist[k] = [Bernoulli(self.base.output_size, num_outputs)] + else: + raise NotImplementedError + + @property + def is_recurrent(self): + return self.base.is_recurrent + + @property + def recurrent_hidden_state_size(self): + """Size of rnn_hx.""" + return self.base.recurrent_hidden_state_size + + def forward(self, inputs, rnn_hxs, masks): + raise NotImplementedError + + def act(self, feat, task_id, masks, deterministic=False): + # adj = inputs['adj'] + # feat = inputs['feat'] + # game_state = inputs['task_id'] + # placement = inputs['placement'] + # cards = inputs['cards'] + + value, actor_features = 
self.base(feat, self.adj, masks) + + dists = [dist(actor_features) for dist in self.dist[task_id]] + + if deterministic: + actions = [dist.mode() for dist in dists] + else: + actions = [dist.sample() for dist in dists] + + action_log_probs = [dist.log_probs(action) for dist, action in zip(dists, actions)] + dist_entropy = [dist.entropy().mean() for dist in dists] + + return value, actions, action_log_probs + + def get_value(self, inputs, rnn_hxs, masks): + value, _, _ = self.base(inputs, rnn_hxs, masks) + return value + + def evaluate_actions(self, feat, task_id, masks, actions): + # adj = inputs['adj'] + # feat = inputs['feat'] + # game_state = inputs['task_id'] + # placement = inputs['placement'] + # cards = inputs['cards'] + value, actor_features = self.base(feat, self.adj, masks) + dists = [dist(actor_features) for dist in self.dist[task_id]] + + action_log_probs = [dist.log_probs(action) for dist, action in zip(dists, actions)] + dist_entropy = [dist.entropy().mean() for dist in dists] + + return value, action_log_probs, dist_entropy diff --git a/pz_risk/training/ppo.py b/pz_risk/training/ppo.py new file mode 100644 index 0000000..1bb77ff --- /dev/null +++ b/pz_risk/training/ppo.py @@ -0,0 +1,88 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + + +class PPO: + def __init__(self, + actor_critic, + clip_param, + ppo_epoch, + num_mini_batch, + value_loss_coef, + entropy_coef, + lr=None, + eps=None, + max_grad_norm=None, + use_clipped_value_loss=True): + + self.actor_critic = actor_critic + + self.clip_param = clip_param + self.ppo_epoch = ppo_epoch + self.num_mini_batch = num_mini_batch + + self.value_loss_coef = value_loss_coef + self.entropy_coef = entropy_coef + + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps) + + def update(self, rollouts): + advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] + advantages = (advantages - advantages.mean()) / ( + advantages.std() + 1e-5) + + value_loss_epoch = 0 + action_loss_epoch = 0 + dist_entropy_epoch = 0 + + for e in range(self.ppo_epoch): + data_generator = rollouts.feed_forward_generator(advantages, self.num_mini_batch) + + for sample in data_generator: + obs_batch, task_batch, actions_batch, \ + value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ + adv_targ = sample + + # Reshape to do in a single forward pass for all steps + values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( + obs_batch, task_batch, masks_batch, actions_batch) + + ratio = torch.exp(action_log_probs - old_action_log_probs_batch) + surr1 = ratio * adv_targ + surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targ + action_loss = -torch.min(surr1, surr2).mean() + + if self.use_clipped_value_loss: + value_pred_clipped = value_preds_batch + \ + (values - value_preds_batch).clamp(-self.clip_param, self.clip_param) + value_losses = (values - return_batch).pow(2) + value_losses_clipped = ( + value_pred_clipped - return_batch).pow(2) + value_loss = 0.5 * torch.max(value_losses, + value_losses_clipped).mean() + else: + value_loss = 0.5 * (return_batch - values).pow(2).mean() + + self.optimizer.zero_grad() + (value_loss * self.value_loss_coef + action_loss - + dist_entropy * self.entropy_coef).backward() + nn.utils.clip_grad_norm_(self.actor_critic.parameters(), + self.max_grad_norm) + self.optimizer.step() + + 
value_loss_epoch += value_loss.item() + action_loss_epoch += action_loss.item() + dist_entropy_epoch += dist_entropy.item() + + num_updates = self.ppo_epoch * self.num_mini_batch + + value_loss_epoch /= num_updates + action_loss_epoch /= num_updates + dist_entropy_epoch /= num_updates + + return value_loss_epoch, action_loss_epoch, dist_entropy_epoch diff --git a/pz_risk/training/storage.py b/pz_risk/training/storage.py new file mode 100644 index 0000000..5fda143 --- /dev/null +++ b/pz_risk/training/storage.py @@ -0,0 +1,112 @@ +# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail + +import torch +from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler + + +def _flatten_helper(T, N, _tensor): + return _tensor.view(T * N, *_tensor.size()[2:]) + + +class RolloutStorage(object): + def __init__(self, num_steps, num_processes, obs_shape, action_space, task_space): + self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape) + self.task_id = torch.zeros(num_steps + 1, num_processes, task_space) + self.rewards = torch.zeros(num_steps, num_processes, 1) + self.value_preds = torch.zeros(num_steps + 1, num_processes, 1) + self.returns = torch.zeros(num_steps + 1, num_processes, 1) + self.action_log_probs = torch.zeros(num_steps, num_processes) + if action_space.__class__.__name__ == 'Discrete': + action_shape = 1 + else: + action_shape = action_space.shape[0] + self.actions = torch.zeros(num_steps, num_processes, action_shape) + + self.masks = torch.ones(num_steps + 1, num_processes, 1) + + # Masks that indicate whether it's a true terminal state + # or time limit end state + self.bad_masks = torch.ones(num_steps + 1, num_processes, 1) + + self.num_steps = num_steps + self.step = 0 + + def to(self, device): + self.obs = self.obs.to(device) + self.task_id = self.task_id.to(device) + self.rewards = self.rewards.to(device) + self.value_preds = self.value_preds.to(device) + self.returns = self.returns.to(device) + self.action_log_probs = self.action_log_probs.to(device) + self.actions = self.actions.to(device) + self.masks = self.masks.to(device) + self.bad_masks = self.bad_masks.to(device) + + def insert(self, obs, task_id, actions, action_log_probs, + value_preds, rewards, masks, bad_masks): + self.obs[self.step + 1].copy_(obs) + self.task_id[self.step + 1].copy_(task_id) + self.actions[self.step].copy_(actions) + self.action_log_probs[self.step].copy_(action_log_probs) + self.value_preds[self.step].copy_(value_preds) + self.rewards[self.step].copy_(rewards) + self.masks[self.step + 1].copy_(masks) + self.bad_masks[self.step + 1].copy_(bad_masks) + + self.step = (self.step + 1) % self.num_steps + + def after_update(self): + self.obs[0].copy_(self.obs[-1]) + self.task_id[0].copy_(self.task_id[-1]) + self.masks[0].copy_(self.masks[-1]) + self.bad_masks[0].copy_(self.bad_masks[-1]) + + def compute_returns(self, + next_value, + gamma, + use_proper_time_limits=True): + if use_proper_time_limits: + self.returns[-1] = next_value + for step in reversed(range(self.rewards.size(0))): + self.returns[step] = (self.returns[step + 1] * gamma * self.masks[step + 1] + self.rewards[step]) * \ + self.bad_masks[step + 1] + (1 - self.bad_masks[step + 1]) * self.value_preds[step] + else: + self.returns[-1] = next_value + for step in reversed(range(self.rewards.size(0))): + self.returns[step] = self.returns[step + 1] * gamma * self.masks[step + 1] + self.rewards[step] + + def feed_forward_generator(self, + advantages, + num_mini_batch=None, + mini_batch_size=None): + num_steps, num_processes = 
self.rewards.size()[0:2] + batch_size = num_processes * num_steps + + if mini_batch_size is None: + assert batch_size >= num_mini_batch, ( + "PPO requires the number of processes ({}) " + "* number of steps ({}) = {} " + "to be greater than or equal to the number of PPO mini batches ({})." + "".format(num_processes, num_steps, num_processes * num_steps, + num_mini_batch)) + mini_batch_size = batch_size // num_mini_batch + sampler = BatchSampler( + SubsetRandomSampler(range(batch_size)), + mini_batch_size, + drop_last=True) + for indices in sampler: + obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices] + task_batch = self.task_id[:-1].view(-1, *self.task_id.size()[2:])[indices] + actions_batch = self.actions.view(-1, self.actions.size(-1))[indices] + value_preds_batch = self.value_preds[:-1].view(-1, 1)[indices] + return_batch = self.returns[:-1].view(-1, 1)[indices] + masks_batch = self.masks[:-1].view(-1, 1)[indices] + old_action_log_probs_batch = self.action_log_probs.view(-1, + 1)[indices] + if advantages is None: + adv_targ = None + else: + adv_targ = advantages.view(-1, 1)[indices] + + yield obs_batch, task_batch, actions_batch, value_preds_batch, return_batch, masks_batch, \ + old_action_log_probs_batch, adv_targ diff --git a/pz_risk/training/utils.py b/pz_risk/training/utils.py new file mode 100644 index 0000000..7ef1200 --- /dev/null +++ b/pz_risk/training/utils.py @@ -0,0 +1,66 @@ +import glob +import os + +import torch +import torch.nn as nn +import numpy as np +# from a2c_ppo_acktr.envs import VecNormalize + +init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0), np.sqrt(2)) + +# Get a render function +def get_render_func(venv): + if hasattr(venv, 'envs'): + return venv.envs[0].render + elif hasattr(venv, 'venv'): + return get_render_func(venv.venv) + elif hasattr(venv, 'env'): + return get_render_func(venv.env) + + return None + + +def get_vec_normalize(venv): + # if isinstance(venv, VecNormalize): + # return venv + # elif hasattr(venv, 'venv'): + # return get_vec_normalize(venv.venv) + + return None + + +# Necessary for my KFAC implementation. +class AddBias(nn.Module): + def __init__(self, bias): + super(AddBias, self).__init__() + self._bias = nn.Parameter(bias.unsqueeze(1)) + + def forward(self, x): + if x.dim() == 2: + bias = self._bias.t().view(1, -1) + else: + bias = self._bias.t().view(1, -1, 1, 1) + + return x + bias + + +def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): + """Decreases the learning rate linearly""" + lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def init(module, weight_init, bias_init, gain=1): + weight_init(module.weight.data, gain=gain) + bias_init(module.bias.data) + return module + + +def cleanup_log_dir(log_dir): + try: + os.makedirs(log_dir) + except OSError: + files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) + for f in files: + os.remove(f)
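Note on the epsilon-greedy comment in train_v.py: the loop defines epsilon, epsilon_min and decay_rate and decays epsilon at the end of every episode, but the action-selection branch for agent 0 always takes the argmax over the simulated action scores, so epsilon is never consulted. Below is a minimal sketch of how the decayed epsilon could gate exploration; it reuses the valid_actions and action_scores names from that branch, and the helper name epsilon_greedy_action is hypothetical, not part of this patch.

    import random
    import numpy as np

    def epsilon_greedy_action(valid_actions, action_scores, epsilon):
        """With probability epsilon explore a random valid action; otherwise
        pick the action whose simulated successor state the critic values highest."""
        if random.random() < epsilon:
            return random.choice(valid_actions)
        return valid_actions[int(np.argmax(action_scores))]

    # usage sketch inside the agent_id == 0 branch of train_v.py:
    # action = epsilon_greedy_action(valid_actions, action_scores, epsilon)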
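Note on the TD target in DVNAgent.train_(): each stored Transition carries a done tensor, but the target y = r + gamma * V_target(s') is built without masking terminal successor states, since the non_final_mask_feat test (s is not None) is always true for the tensors that save_memory pushes. The sketch below shows one way the stored done flags could zero out the bootstrap term; it assumes done means "the game has ended for that agent" and is illustrative, not the committed behaviour.

    import torch

    def td_target_with_done(reward_batch, next_values, done_batch, gamma):
        """Sketch: r + gamma * V_target(s'), with the bootstrap term dropped
        for transitions whose done flag is set. All inputs are
        [batch, n_agents] tensors; done_batch is boolean."""
        return reward_batch + gamma * next_values * (~done_batch).float()

    # usage sketch inside DVNAgent.train_():
    # next_values = self.target_network(non_final_next_feat, non_final_next_adj)[:, self.n_nodes:].squeeze(-1).detach()
    # done_batch = torch.cat(batch.done).view(-1, self.n_agents)
    # y = td_target_with_done(reward_batch, next_values, done_batch, self.gamma)
    # loss = F.smooth_l1_loss(y, q)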