my RL added
mahi97 committed Oct 26, 2021
1 parent 7a3669e commit e9abdab
Showing 12 changed files with 1,313 additions and 0 deletions.
156 changes: 156 additions & 0 deletions pz_risk/train_v.py
@@ -0,0 +1,156 @@
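# Train a DVNAgent critic on the pz_risk environment: player 0 chooses actions by
# one-step lookahead scored with the critic, the other players act via the SAMPLING
# heuristics, and every transition is stored in the critic's replay memory for updates.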
import os
import torch
import random
import numpy as np

from risk_env import env
import training.utils as utils
from training.dvn import DVNAgent
from training.arguments import get_args
from wrappers import GraphObservationWrapper, DenseRewardWrapper, SparseRewardWrapper

from agents.sampling import SAMPLING
from agents.value import get_future, get_attack_dist
from utils import get_feat_adj_from_board
from agents import GreedyAgent, RandomAgent
from copy import deepcopy

from tqdm import tqdm
import matplotlib.pyplot as plt


def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    e = env(n_agent=2, board_name='4node')
    e = GraphObservationWrapper(e)
    e = SparseRewardWrapper(e)
    e.reset()
    _, _, _, info = e.last()
    n_nodes = info['nodes']
    n_agents = info['agents']
    max_episode = 3000
    max_epi_step = 200

    epsilon = 0.9
    epsilon_min = 0.005
    decay_rate = 0.005
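    # NOTE: epsilon is decayed at the end of every episode (see below) but is never
    # consulted when actions are chosen, so the schedule currently has no effect on behavior.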

    feat_size = e.observation_spaces['feat'].shape[0]
    hidden_size = 20

    critic = DVNAgent(n_nodes, n_agents, feat_size, hidden_size)
    save_path = './mini_7/'
    load = 0
    # critic.load_state_dict(torch.load(os.path.join(save_path, str(load) + ".pt")))
    loss_list = []
    reward_list = []

    # players = [None]
    # players = [RandomAgent(i) for i in range(1, 6)]

    for episode in tqdm(range(load, max_episode)):

        e.reset()
        state, _, _, _ = e.last()
        loss_epi = []
        reward_epi = []
        for agent_id in e.agent_iter(max_iter=max_epi_step):
            # for a in e.possible_agents:
            #     e.unwrapped.land_hist[a].append(len(e.unwrapped.board.player_nodes(a)))
            #     e.unwrapped.unit_hist[a].append(e.unwrapped.board.player_units(a))
            #     e.unwrapped.place_hist[a].append(e.unwrapped.board.calc_units(a))
            # choose an action: SAMPLING heuristics for other players, greedy critic lookahead for player 0
            state, _, _, info = e.last()
            critic.eval()
            if agent_id != 0:
                task_id = state['task_id']
                action = SAMPLING[task_id](e.unwrapped.board, agent_id)
            else:
                # Use Model to Gather Future State per Valid Actions
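                # One-step lookahead: simulate each valid action on a copy of the board
                # (using get_future() to pick a likely outcome of the attack distribution
                # when the action is stochastic), encode the resulting board as
                # (features, adjacency), and score it with the critic. The critic's output
                # at index n_nodes + agent_id is read as the value of that player's node,
                # and the highest-scoring action is played.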
                action_scores = []
                deterministic, valid_actions = e.unwrapped.board.valid_actions(agent_id)
                for valid_action in valid_actions:
                    sim = deepcopy(e.unwrapped.board)
                    if deterministic:
                        sim.step(agent_id, valid_action)
                    else:
                        dist = get_attack_dist(e.unwrapped.board, valid_action)
                        if len(dist):  # TODO: Change to sampling
                            left = get_future(dist, mode='two', risk=0.2)
                            sim.step(agent_id, valid_action, left)
                        else:
                            sim.step(agent_id, valid_action)
                    sim_feat, sim_adj = get_feat_adj_from_board(sim, agent_id, e.unwrapped.n_agents, e.unwrapped.n_grps)
                    sim_feat = torch.tensor(sim_feat, dtype=torch.float32, device=device).reshape(
                        -1, n_nodes + n_agents, feat_size)
                    sim_adj = torch.tensor(sim_adj, dtype=torch.float32, device=device).reshape(
                        -1, n_nodes + n_agents, n_nodes + n_agents)
                    action_scores.append(critic(sim_feat, sim_adj).detach().cpu().numpy()[:, n_nodes + agent_id])
                action = valid_actions[np.argmax(action_scores)]
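            # Cache the graph encoding of the pre-action state so the transition
            # (state, reward, next state) can be stored in the replay memory below.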
            before_feat = torch.tensor(state['feat'], dtype=torch.float32, device=device).reshape(
                -1, n_nodes + n_agents, feat_size)
            before_adj = torch.tensor(state['adj'], dtype=torch.float32, device=device).reshape(
                -1, n_nodes + n_agents, n_nodes + n_agents)

            e.step(action)
            state, _, _, info = e.last()
            feat = torch.tensor(state['feat'], dtype=torch.float32, device=device).reshape(
                -1, n_nodes + n_agents, feat_size)
            adj = torch.tensor(state['adj'], dtype=torch.float32, device=device).reshape(
                -1, n_nodes + n_agents, n_nodes + n_agents)
            reward = torch.tensor(state['rewards'], dtype=torch.float32, device=device).reshape(-1, n_agents)
            done = torch.tensor(state['dones'], dtype=torch.bool, device=device).reshape(-1, n_agents)
            reward_epi.append(reward.cpu().numpy()[0])
            # e.render()

            # make a transition and save to replay memory
            transition = [before_feat, before_adj, reward, feat, adj, done]
            critic.save_memory(transition)
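            # critic.train() only puts the network back in training mode (eval() was set
            # above); the actual update is performed by critic.train_() once train_start()
            # allows it, presumably when enough transitions have been collected.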
            critic.train()
            if critic.train_start():
                loss = critic.train_()
                loss_epi.append(loss)
                # print('Loss: {}, Reward: {}'.format(loss, reward))
            if all(done[0].cpu().numpy()):
                break

        if epsilon > epsilon_min:
            epsilon -= decay_rate
        else:
            epsilon = epsilon_min

        if critic.train_start():
            loss_list.append(sum(loss_epi) / len(loss_epi))
        # plt.show()
        e.close()
        reward_list.append(sum(reward_epi))

        if critic.train_start():
            print(episode + 1, reward_list[-1], loss_list[-1])

        if episode % 10 == 0:
            torch.save(critic.state_dict(), os.path.join(save_path, str(episode // 10) + ".pt"))

    return loss_list, reward_list


if __name__ == "__main__":
    main()
Empty file added pz_risk/training/__init__.py
Empty file.
145 changes: 145 additions & 0 deletions pz_risk/training/arguments.py
@@ -0,0 +1,145 @@
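# Argument definitions for training. Of these, train_v.py (above) reads --seed,
# the CUDA flags, and --log-dir; the remaining options are available to other
# training code.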
import argparse

import torch


def get_args():
    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument(
        '--lr', type=float, default=7e-4, help='learning rate (default: 7e-4)')
    parser.add_argument(
        '--eps',
        type=float,
        default=1e-5,
        help='RMSprop optimizer epsilon (default: 1e-5)')
    parser.add_argument(
        '--alpha',
        type=float,
        default=0.99,
        help='RMSprop optimizer alpha (default: 0.99)')
    parser.add_argument(
        '--gamma',
        type=float,
        default=0.99,
        help='discount factor for rewards (default: 0.99)')
    parser.add_argument(
        '--use-gae',
        action='store_true',
        default=False,
        help='use generalized advantage estimation')
    parser.add_argument(
        '--gae-lambda',
        type=float,
        default=0.95,
        help='gae lambda parameter (default: 0.95)')
    parser.add_argument(
        '--entropy-coef',
        type=float,
        default=0.01,
        help='entropy term coefficient (default: 0.01)')
    parser.add_argument(
        '--value-loss-coef',
        type=float,
        default=0.5,
        help='value loss coefficient (default: 0.5)')
    parser.add_argument(
        '--max-grad-norm',
        type=float,
        default=0.5,
        help='max norm of gradients (default: 0.5)')
    parser.add_argument(
        '--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument(
        '--cuda-deterministic',
        action='store_true',
        default=False,
        help="sets flags for determinism when using CUDA (potentially slow!)")
    parser.add_argument(
        '--num-processes',
        type=int,
        default=1,
        help='how many training CPU processes to use (default: 1)')
    parser.add_argument(
        '--num-steps',
        type=int,
        default=5,
        help='number of forward steps in A2C (default: 5)')
    parser.add_argument(
        '--ppo-epoch',
        type=int,
        default=4,
        help='number of ppo epochs (default: 4)')
    parser.add_argument(
        '--num-mini-batch',
        type=int,
        default=32,
        help='number of batches for ppo (default: 32)')
    parser.add_argument(
        '--clip-param',
        type=float,
        default=0.2,
        help='ppo clip parameter (default: 0.2)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        help='log interval, one log per n updates (default: 10)')
    parser.add_argument(
        '--save-interval',
        type=int,
        default=100,
        help='save interval, one save per n updates (default: 100)')
    parser.add_argument(
        '--eval-interval',
        type=int,
        default=None,
        help='eval interval, one eval per n updates (default: None)')
    parser.add_argument(
        '--num-env-steps',
        type=int,
        default=10e6,
        help='number of environment steps to train (default: 10e6)')
    parser.add_argument(
        '--env-name',
        default='Risk-Normal-6-v0',
        help='environment to train on (default: Risk-Normal-6-v0)')
    parser.add_argument(
        '--log-dir',
        default='/tmp/gym/',
        help='directory to save agent logs (default: /tmp/gym)')
    parser.add_argument(
        '--save-dir',
        default='./trained_models/',
        help='directory to save trained models (default: ./trained_models/)')
    parser.add_argument(
        '--no-cuda',
        action='store_true',
        default=False,
        help='disables CUDA training')
    parser.add_argument(
        '--use-proper-time-limits',
        action='store_true',
        default=False,
        help='compute returns taking into account time limits')
    parser.add_argument(
        '--recurrent-policy',
        action='store_true',
        default=False,
        help='use a recurrent policy')
    parser.add_argument(
        '--use-linear-lr-decay',
        action='store_true',
        default=False,
        help='use a linear schedule on the learning rate')
    parser.add_argument(
        '--dir',
        default='./mini_7/100.pt',
        help='checkpoint file to load (default: ./mini_7/100.pt)')

    args = parser.parse_args()

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    return args
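# Example (hypothetical) invocation of the training script with these flags:
#   python train_v.py --seed 1 --no-cuda --log-dir /tmp/gym/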