Make number of sampled weights used to compute utility metrics parameterizable (#95)
LucasAlegre authored Feb 25, 2024
1 parent c145a11 commit f67a0a7
Showing 10 changed files with 64 additions and 5 deletions.
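For context, the user-facing effect of this commit is one extra keyword argument on each algorithm's train() method. The sketch below is hypothetical usage rather than code from the commit: the environment id, the CAPQL constructor call, and the total_timesteps/eval_env/ref_point values are assumptions; only num_eval_weights_for_eval (default 50) and the neighbouring evaluation parameters come from the diffs that follow.

# Hypothetical usage sketch (not part of this commit).
import mo_gymnasium as mo_gym
import numpy as np

from morl_baselines.multi_policy.capql.capql import CAPQL  # class name assumed from the module path

env = mo_gym.make("mo-hopper-v4")       # assumed environment id
eval_env = mo_gym.make("mo-hopper-v4")

agent = CAPQL(env)                      # assumed constructor arguments

agent.train(
    total_timesteps=100_000,            # assumed; not shown in the hunks below
    eval_env=eval_env,                  # assumed; not shown in the hunks below
    ref_point=np.array([-100.0, -100.0, -100.0]),  # assumed; must match the env's reward dimension
    num_eval_weights_for_front=100,
    num_eval_episodes_for_front=5,
    num_eval_weights_for_eval=100,      # new in this commit; previously fixed at the default of 50
    eval_freq=10_000,
)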
2 changes: 1 addition & 1 deletion morl_baselines/common/evaluation.py
@@ -148,7 +148,7 @@ def log_all_multi_policy_metrics(
hv_ref_point: np.ndarray,
reward_dim: int,
global_step: int,
- n_sample_weights: int = 50,
+ n_sample_weights: int,
ref_front: Optional[List[np.ndarray]] = None,
):
"""Logs all metrics for multi-policy training.
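Because n_sample_weights no longer has a default, any direct caller of log_all_multi_policy_metrics must now pass it explicitly. A minimal call sketch, mirroring the call sites updated in this commit; the front values are made up, and an active logging backend (e.g. a wandb run) is assumed since this helper logs metrics.

# Minimal call sketch; argument names follow the call sites in this commit,
# the numeric values are made up.
import numpy as np

from morl_baselines.common.evaluation import log_all_multi_policy_metrics

current_front = [np.array([5.0, -1.0]), np.array([8.0, -3.0])]  # evaluated returns, one vector per policy

log_all_multi_policy_metrics(
    current_front,
    hv_ref_point=np.array([0.0, -10.0]),
    reward_dim=2,
    global_step=10_000,
    n_sample_weights=50,   # now required; 50 was the previous default
    ref_front=None,        # optional known Pareto front
)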
4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/capql/capql.py
@@ -384,6 +384,7 @@ def train(
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
eval_freq: int = 10000,
reset_num_timesteps: bool = False,
checkpoints: bool = False,
@@ -397,6 +398,7 @@ def train(
known_pareto_front (Optional[List[np.ndarray]]): Optimal Pareto front, if known.
num_eval_weights_for_front (int): Number of weights to evaluate for the Pareto front.
num_eval_episodes_for_front: number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
eval_freq (int): Number of timesteps between evaluations during an iteration.
reset_num_timesteps (bool): Whether to reset the number of timesteps.
checkpoints (bool): Whether to save checkpoints.
@@ -409,6 +411,7 @@ def train(
"known_front": known_pareto_front,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"eval_freq": eval_freq,
"reset_num_timesteps": reset_num_timesteps,
}
@@ -467,6 +470,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/envelope/envelope.py
@@ -482,6 +482,7 @@ def train(
eval_freq: int = 10000,
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
reset_learning_starts: bool = False,
verbose: bool = False,
):
@@ -498,6 +499,7 @@ def train(
eval_freq: policy evaluation frequency (in number of steps).
num_eval_weights_for_front: number of weights to sample for creating the pareto front when evaluating.
num_eval_episodes_for_front: number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
reset_learning_starts: whether to reset the learning starts. Useful when training multiple times.
verbose: whether to print the episode info.
"""
@@ -515,6 +517,7 @@ def train(
"eval_freq": eval_freq,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"reset_learning_starts": reset_learning_starts,
}
)
@@ -557,6 +560,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/gpi_pd/gpi_pd.py
@@ -797,6 +797,7 @@ def train(
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
timesteps_per_iter: int = 10000,
weight_selection_algo: str = "gpi-ls",
eval_freq: int = 1000,
@@ -812,6 +813,7 @@ def train(
known_pareto_front (Optional[List[np.ndarray]]): Optimal Pareto front if known.
num_eval_weights_for_front: Number of weights to evaluate for the Pareto front.
num_eval_episodes_for_front: number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
timesteps_per_iter (int): Number of timesteps to train for per iteration.
weight_selection_algo (str): Weight selection algorithm to use.
eval_freq (int): Number of timesteps between evaluations.
@@ -826,6 +828,7 @@ def train(
"known_front": known_pareto_front,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"timesteps_per_iter": timesteps_per_iter,
"weight_selection_algo": weight_selection_algo,
"eval_freq": eval_freq,
@@ -895,6 +898,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)
# This is the EU computed in the paper
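The comment above ("This is the EU computed in the paper") refers to the expected utility metric, which is what the sampled weights are used for. As a reminder of what that computation looks like, here is a generic sketch, not morl_baselines' implementation: sample n_sample_weights weight vectors uniformly from the simplex and average the best scalarized return over the current front.

# Generic expected-utility sketch (not morl_baselines' code):
# average, over sampled weight vectors w, of max over the front of w . v
import numpy as np

def expected_utility(front, n_sample_weights, seed=None):
    rng = np.random.default_rng(seed)
    reward_dim = len(front[0])
    weights = rng.dirichlet(np.ones(reward_dim), size=n_sample_weights)  # uniform samples on the simplex
    front_matrix = np.stack(front)            # (num_points, reward_dim)
    scalarized = weights @ front_matrix.T     # (n_sample_weights, num_points)
    return float(scalarized.max(axis=1).mean())

# More sampled weights gives a lower-variance estimate of the same quantity.
front = [np.array([5.0, -1.0]), np.array([8.0, -3.0]), np.array([2.0, 0.0])]
print(expected_utility(front, n_sample_weights=50))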
(another changed file; file header not captured)
@@ -594,6 +594,7 @@ def train(
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
weight_selection_algo: str = "gpi-ls",
timesteps_per_iter: int = 10000,
eval_freq: int = 1000,
@@ -609,6 +610,7 @@ def train(
known_pareto_front (Optional[List[np.ndarray]]): Optimal Pareto front, if known.
num_eval_weights_for_front (int): Number of weights to evaluate for the Pareto front.
num_eval_episodes_for_front: number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
weight_selection_algo (str): Weight selection algorithm to use.
timesteps_per_iter (int): Number of timesteps to train the agent for each iteration.
eval_freq (int): Number of timesteps between evaluations during an iteration.
@@ -623,6 +625,7 @@ def train(
"known_front": known_pareto_front,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"weight_selection_algo": weight_selection_algo,
"timesteps_per_iter": timesteps_per_iter,
"eval_freq": eval_freq,
@@ -687,6 +690,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)
# This is the EU computed in the paper
29 changes: 26 additions & 3 deletions morl_baselines/multi_policy/morld/morld.py
@@ -290,6 +290,7 @@ def __eval_all_policies(
self,
eval_env: gym.Env,
num_eval_episodes_for_front: int,
num_eval_weights_for_eval: int,
ref_point: np.ndarray,
known_front: Optional[List[np.ndarray]] = None,
):
@@ -307,7 +308,12 @@ def __eval_all_policies(

if self.log:
log_all_multi_policy_metrics(
self.archive.evaluations, ref_point, self.reward_dim, self.global_step, ref_front=known_front
self.archive.evaluations,
ref_point,
self.reward_dim,
self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_front,
)
return evals

@@ -415,6 +421,7 @@ def train(
ref_point: np.ndarray,
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
reset_num_timesteps: bool = False,
):
"""Trains the algorithm.
@@ -425,16 +432,30 @@ def train(
ref_point: reference point for the hypervolume metric
known_pareto_front: optimal pareto front for the problem if known
num_eval_episodes_for_front: number of episodes for each policy evaluation
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
reset_num_timesteps: whether to reset the number of timesteps or not
"""
if self.log:
self.register_additional_config(
{
"total_timesteps": total_timesteps,
"ref_point": ref_point.tolist(),
"known_front": known_pareto_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
}
)

# Init
self.global_step = 0 if reset_num_timesteps else self.global_step
self.num_episodes = 0 if reset_num_timesteps else self.num_episodes
start_time = time.time()

obs, _ = self.env.reset()
print("Starting training...")
self.__eval_all_policies(eval_env, num_eval_episodes_for_front, ref_point, known_pareto_front)
self.__eval_all_policies(
eval_env, num_eval_episodes_for_front, num_eval_weights_for_eval, ref_point, known_pareto_front
)

while self.global_step < total_timesteps:
# selection
@@ -448,7 +469,9 @@ def train(
self.__update_others(policy)

# Update archive
evals = self.__eval_all_policies(eval_env, num_eval_episodes_for_front, ref_point, known_pareto_front)
evals = self.__eval_all_policies(
eval_env, num_eval_episodes_for_front, num_eval_weights_for_eval, ref_point, known_pareto_front
)

# cooperation
self.__share(policy)
(another changed file; file header not captured)
@@ -163,6 +163,7 @@ def train(
timesteps_per_iteration: int = int(2e5),
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
eval_freq: int = 1000,
):
"""Learn a set of policies.
@@ -175,6 +176,7 @@ def train(
timesteps_per_iteration: The number of timesteps per iteration.
num_eval_weights_for_front: The number of weights to use to construct a Pareto front for evaluation.
num_eval_episodes_for_front: The number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
eval_freq: The frequency of evaluation.
"""
if self.log:
@@ -186,6 +188,7 @@
"timesteps_per_iteration": timesteps_per_iteration,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"eval_freq": eval_freq,
}
)
@@ -267,6 +270,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/pareto_q_learning/pql.py
@@ -197,6 +197,7 @@ def train(
eval_env: gym.Env,
ref_point: Optional[np.ndarray] = None,
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_eval: int = 50,
log_every: Optional[int] = 10000,
action_eval: Optional[str] = "hypervolume",
):
@@ -207,6 +208,7 @@ def train(
eval_env (gym.Env): The environment to evaluate the policies on.
ref_point (ndarray, optional): The reference point for the hypervolume metric during evaluation. If None, use the same ref point as training.
known_pareto_front (List[ndarray], optional): The optimal Pareto front, if known.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
log_every (int, optional): Interval (in timesteps) at which results are logged. (Default value = 10000)
action_eval (str, optional): The action evaluation function name. (Default value = 'hypervolume')
@@ -227,6 +229,7 @@
"total_timesteps": total_timesteps,
"ref_point": ref_point.tolist(),
"known_front": known_pareto_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"log_every": log_every,
"action_eval": action_eval,
}
@@ -257,6 +260,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/pcn/pcn.py
@@ -386,6 +386,7 @@ def train(
eval_env: gym.Env,
ref_point: np.ndarray,
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_eval: int = 50,
num_er_episodes: int = 20,
num_step_episodes: int = 10,
num_model_updates: int = 50,
@@ -400,6 +401,7 @@ def train(
eval_env: environment for evaluation
ref_point: reference point for hypervolume calculation
known_pareto_front: Optimal pareto front for metrics calculation, if known.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
num_er_episodes: number of episodes to fill experience replay buffer
num_step_episodes: number of steps per episode
num_model_updates: number of model updates per episode
@@ -414,6 +416,7 @@
"total_timesteps": total_timesteps,
"ref_point": ref_point.tolist(),
"known_front": known_pareto_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"num_er_episodes": num_er_episodes,
"num_step_episodes": num_step_episodes,
"num_model_updates": num_model_updates,
@@ -523,5 +526,6 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)
10 changes: 9 additions & 1 deletion morl_baselines/multi_policy/pgmorl/pgmorl.py
@@ -538,6 +538,7 @@ def __eval_all_agents(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=self.num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

@@ -617,12 +618,19 @@ def train(
eval_env: gym.Env,
ref_point: np.ndarray,
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_eval: int = 50,
):
"""Trains the agents."""
if self.log:
self.register_additional_config(
{"total_timesteps": total_timesteps, "ref_point": ref_point.tolist(), "known_front": known_pareto_front}
{
"total_timesteps": total_timesteps,
"ref_point": ref_point.tolist(),
"known_front": known_pareto_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
}
)
self.num_eval_weights_for_eval = num_eval_weights_for_eval
max_iterations = total_timesteps // self.steps_per_iteration // self.num_envs
iteration = 0
# Init
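PGMORL is wired slightly differently from the other algorithms above: train() stores the value on the instance, and __eval_all_agents later reads self.num_eval_weights_for_eval when logging, instead of receiving it as an argument. A toy sketch of that pattern (not the actual class):

# Toy sketch of the attribute-passing pattern used for PGMORL (not the real class).
class SketchAgent:
    def train(self, num_eval_weights_for_eval: int = 50) -> None:
        # Stash the value so helpers called later can read it without an extra argument.
        self.num_eval_weights_for_eval = num_eval_weights_for_eval
        self._eval_all_agents()

    def _eval_all_agents(self) -> None:
        # ...evaluate policies, then log metrics with the stored value...
        print(f"logging with n_sample_weights={self.num_eval_weights_for_eval}")

SketchAgent().train(num_eval_weights_for_eval=100)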
