Make number of sampled weights used to compute utility metrics parameterizable (#95)
LucasAlegre authored Feb 25, 2024
1 parent c145a11 commit f67a0a7
Showing 10 changed files with 64 additions and 5 deletions.
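For context, the user-facing effect of this commit is one extra keyword argument on each algorithm's train() method. The sketch below is hypothetical usage rather than code from the commit: the environment id, the CAPQL constructor call, and the total_timesteps/eval_env/ref_point values are assumptions; only num_eval_weights_for_eval (default 50) and the neighbouring evaluation parameters come from the diffs that follow.

# Hypothetical usage sketch (not part of this commit).
import mo_gymnasium as mo_gym
import numpy as np

from morl_baselines.multi_policy.capql.capql import CAPQL  # class name assumed from the module path

env = mo_gym.make("mo-hopper-v4")       # assumed environment id
eval_env = mo_gym.make("mo-hopper-v4")

agent = CAPQL(env)                      # assumed constructor arguments

agent.train(
    total_timesteps=100_000,            # assumed; not shown in the hunks below
    eval_env=eval_env,                  # assumed; not shown in the hunks below
    ref_point=np.array([-100.0, -100.0, -100.0]),  # assumed; must match the env's reward dimension
    num_eval_weights_for_front=100,
    num_eval_episodes_for_front=5,
    num_eval_weights_for_eval=100,      # new in this commit; previously fixed at the default of 50
    eval_freq=10_000,
)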
2 changes: 1 addition & 1 deletion morl_baselines/common/evaluation.py
@@ -148,7 +148,7 @@ def log_all_multi_policy_metrics(
hv_ref_point: np.ndarray,
reward_dim: int,
global_step: int,
- n_sample_weights: int = 50,
+ n_sample_weights: int,
ref_front: Optional[List[np.ndarray]] = None,
):
"""Logs all metrics for multi-policy training.
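Because n_sample_weights no longer has a default, any direct caller of log_all_multi_policy_metrics must now pass it explicitly. A minimal call sketch, mirroring the call sites updated in this commit; the front values are made up, and an active logging backend (e.g. a wandb run) is assumed since this helper logs metrics.

# Minimal call sketch; argument names follow the call sites in this commit,
# the numeric values are made up.
import numpy as np

from morl_baselines.common.evaluation import log_all_multi_policy_metrics

current_front = [np.array([5.0, -1.0]), np.array([8.0, -3.0])]  # evaluated returns, one vector per policy

log_all_multi_policy_metrics(
    current_front,
    hv_ref_point=np.array([0.0, -10.0]),
    reward_dim=2,
    global_step=10_000,
    n_sample_weights=50,   # now required; 50 was the previous default
    ref_front=None,        # optional known Pareto front
)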
4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/capql/capql.py
@@ -384,6 +384,7 @@ def train(
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
eval_freq: int = 10000,
reset_num_timesteps: bool = False,
checkpoints: bool = False,
@@ -397,6 +398,7 @@ def train(
known_pareto_front (Optional[List[np.ndarray]]): Optimal Pareto front, if known.
num_eval_weights_for_front (int): Number of weights to evaluate for the Pareto front.
num_eval_episodes_for_front: number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
eval_freq (int): Number of timesteps between evaluations during an iteration.
reset_num_timesteps (bool): Whether to reset the number of timesteps.
checkpoints (bool): Whether to save checkpoints.
@@ -409,6 +411,7 @@ def train(
"known_front": known_pareto_front,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"eval_freq": eval_freq,
"reset_num_timesteps": reset_num_timesteps,
}
@@ -467,6 +470,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/envelope/envelope.py
@@ -482,6 +482,7 @@ def train(
eval_freq: int = 10000,
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
reset_learning_starts: bool = False,
verbose: bool = False,
):
@@ -498,6 +499,7 @@ def train(
eval_freq: policy evaluation frequency (in number of steps).
num_eval_weights_for_front: number of weights to sample for creating the pareto front when evaluating.
num_eval_episodes_for_front: number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
reset_learning_starts: whether to reset the learning starts. Useful when training multiple times.
verbose: whether to print the episode info.
"""
@@ -515,6 +517,7 @@ def train(
"eval_freq": eval_freq,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"reset_learning_starts": reset_learning_starts,
}
)
@@ -557,6 +560,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/gpi_pd/gpi_pd.py
@@ -797,6 +797,7 @@ def train(
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
timesteps_per_iter: int = 10000,
weight_selection_algo: str = "gpi-ls",
eval_freq: int = 1000,
@@ -812,6 +813,7 @@ def train(
known_pareto_front (Optional[List[np.ndarray]]): Optimal Pareto front if known.
num_eval_weights_for_front: Number of weights to evaluate for the Pareto front.
num_eval_episodes_for_front: number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
timesteps_per_iter (int): Number of timesteps to train for per iteration.
weight_selection_algo (str): Weight selection algorithm to use.
eval_freq (int): Number of timesteps between evaluations.
@@ -826,6 +828,7 @@ def train(
"known_front": known_pareto_front,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"timesteps_per_iter": timesteps_per_iter,
"weight_selection_algo": weight_selection_algo,
"eval_freq": eval_freq,
@@ -895,6 +898,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)
# This is the EU computed in the paper
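The comment above ("This is the EU computed in the paper") refers to the expected utility metric, which is what the sampled weights are used for. As a reminder of what that computation looks like, here is a generic sketch, not morl_baselines' implementation: sample n_sample_weights weight vectors uniformly from the simplex and average the best scalarized return over the current front.

# Generic expected-utility sketch (not morl_baselines' code):
# average, over sampled weight vectors w, of max over the front of w . v
import numpy as np

def expected_utility(front, n_sample_weights, seed=None):
    rng = np.random.default_rng(seed)
    reward_dim = len(front[0])
    weights = rng.dirichlet(np.ones(reward_dim), size=n_sample_weights)  # uniform samples on the simplex
    front_matrix = np.stack(front)            # (num_points, reward_dim)
    scalarized = weights @ front_matrix.T     # (n_sample_weights, num_points)
    return float(scalarized.max(axis=1).mean())

# More sampled weights gives a lower-variance estimate of the same quantity.
front = [np.array([5.0, -1.0]), np.array([8.0, -3.0]), np.array([2.0, 0.0])]
print(expected_utility(front, n_sample_weights=50))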
(another changed file; file header not captured)
@@ -594,6 +594,7 @@ def train(
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
weight_selection_algo: str = "gpi-ls",
timesteps_per_iter: int = 10000,
eval_freq: int = 1000,
@@ -609,6 +610,7 @@ def train(
known_pareto_front (Optional[List[np.ndarray]]): Optimal Pareto front, if known.
num_eval_weights_for_front (int): Number of weights to evaluate for the Pareto front.
num_eval_episodes_for_front: number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
weight_selection_algo (str): Weight selection algorithm to use.
timesteps_per_iter (int): Number of timesteps to train the agent for each iteration.
eval_freq (int): Number of timesteps between evaluations during an iteration.
@@ -623,6 +625,7 @@ def train(
"known_front": known_pareto_front,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"weight_selection_algo": weight_selection_algo,
"timesteps_per_iter": timesteps_per_iter,
"eval_freq": eval_freq,
@@ -687,6 +690,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)
# This is the EU computed in the paper
29 changes: 26 additions & 3 deletions morl_baselines/multi_policy/morld/morld.py
@@ -290,6 +290,7 @@ def __eval_all_policies(
self,
eval_env: gym.Env,
num_eval_episodes_for_front: int,
num_eval_weights_for_eval: int,
ref_point: np.ndarray,
known_front: Optional[List[np.ndarray]] = None,
):
@@ -307,7 +308,12 @@ def __eval_all_policies(

if self.log:
log_all_multi_policy_metrics(
self.archive.evaluations, ref_point, self.reward_dim, self.global_step, ref_front=known_front
self.archive.evaluations,
ref_point,
self.reward_dim,
self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_front,
)
return evals

@@ -415,6 +421,7 @@ def train(
ref_point: np.ndarray,
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
reset_num_timesteps: bool = False,
):
"""Trains the algorithm.
@@ -425,16 +432,30 @@ def train(
ref_point: reference point for the hypervolume metric
known_pareto_front: optimal pareto front for the problem if known
num_eval_episodes_for_front: number of episodes for each policy evaluation
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
reset_num_timesteps: whether to reset the number of timesteps or not
"""
if self.log:
self.register_additional_config(
{
"total_timesteps": total_timesteps,
"ref_point": ref_point.tolist(),
"known_front": known_pareto_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
}
)

# Init
self.global_step = 0 if reset_num_timesteps else self.global_step
self.num_episodes = 0 if reset_num_timesteps else self.num_episodes
start_time = time.time()

obs, _ = self.env.reset()
print("Starting training...")
self.__eval_all_policies(eval_env, num_eval_episodes_for_front, ref_point, known_pareto_front)
self.__eval_all_policies(
eval_env, num_eval_episodes_for_front, num_eval_weights_for_eval, ref_point, known_pareto_front
)

while self.global_step < total_timesteps:
# selection
@@ -448,7 +469,9 @@ def train(
self.__update_others(policy)

# Update archive
evals = self.__eval_all_policies(eval_env, num_eval_episodes_for_front, ref_point, known_pareto_front)
evals = self.__eval_all_policies(
eval_env, num_eval_episodes_for_front, num_eval_weights_for_eval, ref_point, known_pareto_front
)

# cooperation
self.__share(policy)
(another changed file; file header not captured)
@@ -163,6 +163,7 @@ def train(
timesteps_per_iteration: int = int(2e5),
num_eval_weights_for_front: int = 100,
num_eval_episodes_for_front: int = 5,
num_eval_weights_for_eval: int = 50,
eval_freq: int = 1000,
):
"""Learn a set of policies.
@@ -175,6 +176,7 @@ def train(
timesteps_per_iteration: The number of timesteps per iteration.
num_eval_weights_for_front: The number of weights to use to construct a Pareto front for evaluation.
num_eval_episodes_for_front: The number of episodes to run when evaluating the policy.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
eval_freq: The frequency of evaluation.
"""
if self.log:
@@ -186,6 +188,7 @@
"timesteps_per_iteration": timesteps_per_iteration,
"num_eval_weights_for_front": num_eval_weights_for_front,
"num_eval_episodes_for_front": num_eval_episodes_for_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"eval_freq": eval_freq,
}
)
@@ -267,6 +270,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/pareto_q_learning/pql.py
@@ -197,6 +197,7 @@ def train(
eval_env: gym.Env,
ref_point: Optional[np.ndarray] = None,
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_eval: int = 50,
log_every: Optional[int] = 10000,
action_eval: Optional[str] = "hypervolume",
):
@@ -207,6 +208,7 @@ def train(
eval_env (gym.Env): The environment to evaluate the policies on.
ref_point (ndarray, optional): The reference point for the hypervolume metric during evaluation. If None, use the same ref point as training.
known_pareto_front (List[ndarray], optional): The optimal Pareto front, if known.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
log_every (int, optional): Interval (in timesteps) at which results are logged. (Default value = 10000)
action_eval (str, optional): The action evaluation function name. (Default value = 'hypervolume')
@@ -227,6 +229,7 @@
"total_timesteps": total_timesteps,
"ref_point": ref_point.tolist(),
"known_front": known_pareto_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"log_every": log_every,
"action_eval": action_eval,
}
@@ -257,6 +260,7 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

4 changes: 4 additions & 0 deletions morl_baselines/multi_policy/pcn/pcn.py
@@ -386,6 +386,7 @@ def train(
eval_env: gym.Env,
ref_point: np.ndarray,
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_eval: int = 50,
num_er_episodes: int = 20,
num_step_episodes: int = 10,
num_model_updates: int = 50,
@@ -400,6 +401,7 @@ def train(
eval_env: environment for evaluation
ref_point: reference point for hypervolume calculation
known_pareto_front: Optimal pareto front for metrics calculation, if known.
num_eval_weights_for_eval (int): Number of weights used when evaluating the Pareto front, e.g., for computing expected utility.
num_er_episodes: number of episodes to fill experience replay buffer
num_step_episodes: number of steps per episode
num_model_updates: number of model updates per episode
@@ -414,6 +416,7 @@
"total_timesteps": total_timesteps,
"ref_point": ref_point.tolist(),
"known_front": known_pareto_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
"num_er_episodes": num_er_episodes,
"num_step_episodes": num_step_episodes,
"num_model_updates": num_model_updates,
@@ -523,5 +526,6 @@ def train(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=num_eval_weights_for_eval,
ref_front=known_pareto_front,
)
10 changes: 9 additions & 1 deletion morl_baselines/multi_policy/pgmorl/pgmorl.py
@@ -538,6 +538,7 @@ def __eval_all_agents(
hv_ref_point=ref_point,
reward_dim=self.reward_dim,
global_step=self.global_step,
n_sample_weights=self.num_eval_weights_for_eval,
ref_front=known_pareto_front,
)

@@ -617,12 +618,19 @@ def train(
eval_env: gym.Env,
ref_point: np.ndarray,
known_pareto_front: Optional[List[np.ndarray]] = None,
num_eval_weights_for_eval: int = 50,
):
"""Trains the agents."""
if self.log:
self.register_additional_config(
{"total_timesteps": total_timesteps, "ref_point": ref_point.tolist(), "known_front": known_pareto_front}
{
"total_timesteps": total_timesteps,
"ref_point": ref_point.tolist(),
"known_front": known_pareto_front,
"num_eval_weights_for_eval": num_eval_weights_for_eval,
}
)
self.num_eval_weights_for_eval = num_eval_weights_for_eval
max_iterations = total_timesteps // self.steps_per_iteration // self.num_envs
iteration = 0
# Init
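PGMORL is wired slightly differently from the other algorithms above: train() stores the value on the instance, and __eval_all_agents later reads self.num_eval_weights_for_eval when logging, instead of receiving it as an argument. A toy sketch of that pattern (not the actual class):

# Toy sketch of the attribute-passing pattern used for PGMORL (not the real class).
class SketchAgent:
    def train(self, num_eval_weights_for_eval: int = 50) -> None:
        # Stash the value so helpers called later can read it without an extra argument.
        self.num_eval_weights_for_eval = num_eval_weights_for_eval
        self._eval_all_agents()

    def _eval_all_agents(self) -> None:
        # ...evaluate policies, then log metrics with the stored value...
        print(f"logging with n_sample_weights={self.num_eval_weights_for_eval}")

SketchAgent().train(num_eval_weights_for_eval=100)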
