facebookexperimental · alxlyj · Dec 3, 2024 · Dec 3, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/.gitignore b/.gitignore
@@ -50,4 +50,5 @@ python/src/tutorials/data/R/*
 python/src/tutorials/data/*
 *.pkl
 python/src/robyn/_deprecate/*
-python/src/robyn/tutorials/output/*
+python/src/robyn/tutorials/output/*
+python/src/robyn/debug/*
diff --git a/python/src/robyn/data/entities/mmmdata.py b/python/src/robyn/data/entities/mmmdata.py
@@ -196,59 +196,55 @@ def remove_column(self, column_name: str) -> None:
         self.data.drop(columns=[column_name], inplace=True)
 
     def calculate_rolling_window_indices(self) -> None:
-        # Ensure the date column is in datetime format
+        """Match R's window index calculation exactly"""
+        # Convert dates to pandas datetime
         self.data[self.mmmdata_spec.date_var] = pd.to_datetime(
             self.data[self.mmmdata_spec.date_var]
         )
-        # Convert window_start and window_end to datetime if they are strings
-        window_start = (
-            pd.to_datetime(self.mmmdata_spec.window_start)
-            if isinstance(self.mmmdata_spec.window_start, str)
-            else self.mmmdata_spec.window_start
-        )
-        window_end = (
-            pd.to_datetime(self.mmmdata_spec.window_end)
-            if isinstance(self.mmmdata_spec.window_end, str)
-            else self.mmmdata_spec.window_end
-        )
+        dates_vec = self.data[self.mmmdata_spec.date_var]
 
-        # Calculate the index for the rolling window start
+        # Convert window dates to datetime
+        window_start = pd.to_datetime(self.mmmdata_spec.window_start)
+        window_end = pd.to_datetime(self.mmmdata_spec.window_end)
+
+        # Calculate start index using days difference like R
         if window_start is not None:
-            closest_start_idx = (
-                (self.data[self.mmmdata_spec.date_var] - window_start).abs().idxmin()
-            )
-            closest_start_date = self.data[self.mmmdata_spec.date_var].iloc[
-                closest_start_idx
-            ]
+            days_diff = (dates_vec - window_start).dt.total_seconds() / (24 * 3600)
+            closest_start_idx = days_diff.abs().argmin()
+            closest_start_date = dates_vec.iloc[closest_start_idx]
+
+            print(f"Python window calculation (start):")
+            print(f"window_start: {window_start}")
+            print(f"closest_start_date: {closest_start_date}")
+            print(f"closest_start_idx: {closest_start_idx}")
+            print(f"days_diff first few values: {days_diff.head().values}")
+
             self.mmmdata_spec.rolling_window_start_which = closest_start_idx
-            # Adjust window_start to the closest date in the data
             self.mmmdata_spec.window_start = closest_start_date
-            print(
-                f"Adjusted window_start to the closest date in the data: {closest_start_date}"
-            )
 
-        # Calculate the index for the rolling window end
+        # Calculate end index using days difference like R
         if window_end is not None:
-            closest_end_idx = (
-                (self.data[self.mmmdata_spec.date_var] - window_end).abs().idxmin()
-            )
-            closest_end_date = self.data[self.mmmdata_spec.date_var].iloc[
-                closest_end_idx
-            ]
+            days_diff = (dates_vec - window_end).dt.total_seconds() / (24 * 3600)
+            closest_end_idx = days_diff.abs().argmin()
+            closest_end_date = dates_vec.iloc[closest_end_idx]
+
+            print(f"Python window calculation (end):")
+            print(f"window_end: {window_end}")
+            print(f"closest_end_date: {closest_end_date}")
+            print(f"closest_end_idx: {closest_end_idx}")
+            print(f"days_diff first few values: {days_diff.head().values}")
+
             self.mmmdata_spec.rolling_window_end_which = closest_end_idx
-            # Adjust window_end to the closest date in the data
             self.mmmdata_spec.window_end = closest_end_date
-            print(
-                f"Adjusted window_end to the closest date in the data: {closest_end_date}"
-            )
 
-        # Calculate rolling window length
+        # Calculate window length
         if window_start is not None and window_end is not None:
             self.mmmdata_spec.rolling_window_length = (
                 self.mmmdata_spec.rolling_window_end_which
                 - self.mmmdata_spec.rolling_window_start_which
                 + 1
             )
+            print(f"Window length: {self.mmmdata_spec.rolling_window_length}")
 
     def set_default_factor_vars(self) -> None:
         """

diff --git a/python/src/robyn/modeling/base_model_executor.py b/python/src/robyn/modeling/base_model_executor.py
@@ -318,25 +318,3 @@ def _apply_transformations(
                 "Error applying transformations for channel %s: %s", channel, str(e)
             )
             raise
-
-    def _prepare_model_data(self, hyperparameters: Dict[str, Any]) -> np.ndarray:
-        logger.info("Preparing model data")
-        transformed_data = {}
-
-        for channel in self.mmmdata.paid_media_vars:
-            logger.debug("Processing channel: %s", channel)
-            transformed_data[channel] = self._apply_transformations(
-                channel, hyperparameters
-            )
-
-        logger.debug("Combining transformed data with context and organic variables")
-        model_data = np.column_stack(
-            [transformed_data[channel] for channel in self.mmmdata.paid_media_vars]
-            + [
-                self.mmmdata.data[var]
-                for var in self.mmmdata.context_vars + self.mmmdata.organic_vars
-            ]
-        )
-
-        logger.info("Model data preparation complete. Shape: %s", model_data.shape)
-        return model_data
diff --git a/python/src/robyn/modeling/ridge/ridge_data_builder.py b/python/src/robyn/modeling/ridge/ridge_data_builder.py
@@ -166,16 +166,19 @@ def _hyper_collector(
         return hyper_collect
 
     def _geometric_adstock(self, x: pd.Series, theta: float) -> pd.Series:
+        # print(f"Before adstock: {x.head()}")
         y = x.copy()
         for i in range(1, len(x)):
             y.iloc[i] += theta * y.iloc[i - 1]
+        # print(f"After adstock: {y.head()}")
         return y
 
     def _hill_transformation(
         self, x: pd.Series, alpha: float, gamma: float
     ) -> pd.Series:
-
+        # Add debug self.logger.debugs
+        # print(f"Before hill: {x.head()}")
         x_scaled = (x - x.min()) / (x.max() - x.min())
         result = x_scaled**alpha / (x_scaled**alpha + gamma**alpha)
-
+        # print(f"After hill: {result.head()}")
         return result
diff --git a/python/src/robyn/modeling/ridge/ridge_evaluate_model.py b/python/src/robyn/modeling/ridge/ridge_evaluate_model.py
@@ -227,12 +227,10 @@ def _evaluate_model(
         debug = True
 
         if debug and (iter_ng == 0 or iter_ng % 25 == 0):
-            self.logger.debug(
-                f"\nEvaluation Debug (trial {trial}, iteration {iter_ng}):"
-            )
-            self.logger.debug(f"X shape: {X.shape}")
-            self.logger.debug(f"y shape: {y.shape}")
-            self.logger.debug("Parameters:", params)
+            print(f"\nEvaluation Debug (trial {trial}, iteration {iter_ng}):")
+            print(f"X shape: {X.shape}")
+            print(f"y shape: {y.shape}")
+            print("Parameters:", params)
 
         # Split data using R's approach
         train_size = params.get("train_size", 1.0) if ts_validation else 1.0
@@ -272,7 +270,7 @@ def _evaluate_model(
         # Calculate metrics using R-style calculations
         y_train_pred = model.predict(x_norm)
         metrics["rsq_train"] = self.ridge_metrics_calculator.calculate_r2_score(
-            y_norm, y_train_pred, x_norm.shape[1], debug=debug, iteration=iter_ng
+            y_norm, y_train_pred, x_norm.shape[1]
         )
         metrics["nrmse_train"] = self.ridge_metrics_calculator.calculate_nrmse(
             y_norm, y_train_pred, debug=debug, iteration=iter_ng

diff --git a/python/src/robyn/modeling/ridge/ridge_metrics_calculator.py b/python/src/robyn/modeling/ridge/ridge_metrics_calculator.py
@@ -147,11 +147,13 @@ def _calculate_decomp_spend_dist(
             "iterPar",
             "Elapsed",
         ]
-        # self.logger.debug(f"Decomp spend distribution debug:")
-        # self.logger.debug(f"Total media spend: {total_media_spend}")
-        # self.logger.debug(f"Total effect: {total_effect}")
-        # for col in paid_media_cols:
-        # self.logger.debug(f"{col} - effect: {all_effects[col]}, spend: {all_spends[col]}")
+        self.logger.debug(f"Decomp spend distribution debug:")
+        self.logger.debug(f"Total media spend: {total_media_spend}")
+        self.logger.debug(f"Total effect: {total_effect}")
+        for col in paid_media_cols:
+            self.logger.debug(
+                f"{col} - effect: {all_effects[col]}, spend: {all_spends[col]}"
+            )
         return df[required_cols]
 
     def _calculate_lambda(
@@ -163,44 +165,45 @@ def _calculate_lambda(
         iteration: int = 0,
     ) -> Tuple[float, float]:
         """Match R's glmnet lambda calculation exactly"""
-
-        def mysd(y: np.ndarray) -> float:
-            """R's implementation of sd"""
-            return np.sqrt(np.sum((y - np.mean(y)) ** 2) / len(y))
-
-        # Scale X using R's approach
-        x_means = np.mean(x_norm, axis=0)
-        x_sds = np.apply_along_axis(mysd, 0, x_norm)
-        sx = (x_norm - x_means) / x_sds[:, np.newaxis].T
-
-        # Handle NaN values like R does
-        check_nan = np.all(np.isnan(sx), axis=0)
-        sx = np.where(check_nan, 0, sx)
-
-        # R does not scale y in this case
-        sy = y_norm
-
-        # Calculate lambda_max using R's approach
-        # 0.001 is glmnet's default alpha value for ridge regression
-        lambda_max = np.max(np.abs(np.sum(sx * sy[:, np.newaxis], axis=0))) / (
-            0.001 * len(x_norm)
-        )
-
-        # Calculate lambda using lambda_hp
-        lambda_min = lambda_max * 0.0001  # R's default lambda_min_ratio
+        n_samples = len(y_norm)
+
+        # Standardize first before scale factor calculation
+        def r_scale(x):
+            mean = np.mean(x)
+            sd = np.sqrt(np.sum((x - mean) ** 2) / (len(x) - 1))
+            return (x - mean) / (sd if sd > 1e-10 else 1.0)
+
+        # Scale X and y first
+        x_scaled = np.apply_along_axis(r_scale, 0, x_norm)
+        y_scaled = r_scale(y_norm)
+
+        # Now calculate scale factor using scaled data
+        scale_factor = np.mean(np.abs(x_scaled)) * np.mean(np.abs(y_scaled))
+
+        if debug and (iteration == 0 or iteration % 25 == 0):
+            print(f"\nLambda Calculation Debug (iteration {iteration}):")
+            print(f"n_samples: {n_samples}")
+            print(f"x_scaled mean abs: {np.mean(np.abs(x_scaled)):.6f}")
+            print(f"y_scaled mean abs: {np.mean(np.abs(y_scaled)):.6f}")
+            print(f"Scale factor: {scale_factor:.6f}")
+            print(f"lambda_hp: {lambda_hp:.6f}")
+
+        # R's lambda calculation
+        alpha = 0.001
+        gram = x_scaled.T @ x_scaled / n_samples
+        ctx = np.abs(x_scaled.T @ y_scaled) / n_samples
+
+        lambda_max = np.max(ctx) / alpha
+        lambda_min = lambda_max * 0.0001
         lambda_ = lambda_min + lambda_hp * (lambda_max - lambda_min)
 
-        if debug and (iteration % 10 == 0):
-            self.logger.debug(f"\nLambda Debug (iter {iteration}):")
-            self.logger.debug(f"x_means: {np.mean(np.abs(x_means)):.6f}")
-            self.logger.debug(f"x_sds mean: {np.mean(x_sds):.6f}")
-            self.logger.debug(f"sx mean: {np.mean(np.abs(sx)):.6f}")
-            self.logger.debug(f"sy mean: {np.mean(np.abs(sy)):.6f}")
-            self.logger.debug(f"lambda_max: {lambda_max:.6f}")
-            self.logger.debug(f"lambda_: {lambda_:.6f}")
-            self.logger.debug(f"lambda_hp: {lambda_hp:.6f}")
+        if debug and (iteration == 0 or iteration % 25 == 0):
+            print(f"max ctx: {np.max(ctx):.6f}")
+            print(f"lambda_max: {lambda_max:.6f}")
+            print(f"lambda_min: {lambda_min:.6f}")
+            print(f"final lambda_: {lambda_:.6f}")
 
-        return float(lambda_), float(lambda_max)
+        return lambda_, lambda_max
 
     def _calculate_rssd(
         self,
@@ -225,11 +228,11 @@ def _calculate_rssd(
             effects.append(effect)
             spends.append(raw_spend)
 
-            # if debug and (iteration % 50 == 0):
-            #     self.logger.debug(f"{col}:")
-            #     self.logger.debug(f"  coefficient: {coef:.6f}")
-            #     self.logger.debug(f"  raw spend: {raw_spend:.6f}")
-            #     self.logger.debug(f"  effect: {effect:.6f}")
+            if debug and (iteration == 0 or iteration % 25 == 0):
+                print(f"{col}:")
+                print(f"  coefficient: {coef:.6f}")
+                print(f"  raw spend: {raw_spend:.6f}")
+                print(f"  effect: {effect:.6f}")
 
         # Convert to numpy arrays
         effects = np.array(effects)
@@ -244,12 +247,12 @@ def _calculate_rssd(
             effects_norm = effects / total_effect
             spends_norm = spends / total_spend
 
-            # if debug and (iteration % 50 == 0):
-            #     self.logger.debug("\nNormalized values:")
-            #     self.logger.debug("Effects:", effects_norm)
-            #     self.logger.debug("Spends:", spends_norm)
-            #     self.logger.debug("Effect total (check=1):", np.sum(effects_norm))
-            #     self.logger.debug("Spend total (check=1):", np.sum(spends_norm))
+            if debug and (iteration == 0 or iteration % 25 == 0):
+                print("\nNormalized values:")
+                print("Effects:", effects_norm)
+                print("Spends:", spends_norm)
+                print("Effect total (check=1):", np.sum(effects_norm))
+                print("Spend total (check=1):", np.sum(spends_norm))
 
             # Calculate RSSD
             squared_diff = (effects_norm - spends_norm) ** 2
@@ -427,18 +430,12 @@ def _calculate_x_decomp_agg(
         return df
 
     def calculate_r2_score(
-        self,
-        y_true: np.ndarray,
-        y_pred: np.ndarray,
-        n_features: int,
-        df_int: int = 1,
-        debug: bool = True,
-        iteration: int = 0,
+        self, y_true: np.ndarray, y_pred: np.ndarray, n_features: int, df_int: int = 1
     ) -> float:
         """Match R's R² calculation exactly"""
         n = len(y_true)
 
-        # Calculate R's version of R²
+        # Scale data like R
         y_mean = np.mean(y_true)
         ss_tot = np.sum((y_true - y_mean) ** 2)
         ss_res = np.sum((y_true - y_pred) ** 2)
@@ -449,17 +446,10 @@ def calculate_r2_score(
         # R's adjustment formula
         adj_r2 = 1 - ((1 - r2) * (n - df_int) / (n - n_features - df_int))
 
+        # Match R's scale
         if adj_r2 < 0:
             adj_r2 = -np.sqrt(np.abs(adj_r2))  # R-style negative scaling
 
-        # if debug and (iteration % 50 == 0):
-        #     self.logger.debug(f"R² Calculation Debug:")
-        #     self.logger.debug(f"n: {n}")
-        #     self.logger.debug(f"ss_tot: {ss_tot:.6f}")
-        #     self.logger.debug(f"ss_res: {ss_res:.6f}")
-        #     self.logger.debug(f"R²: {r2:.6f}")
-        #     self.logger.debug(f"Adjusted R²: {adj_r2:.6f}")
-
         return float(adj_r2)
 
     def calculate_nrmse(
@@ -469,7 +459,7 @@ def calculate_nrmse(
         debug: bool = True,
         iteration: int = 0,
     ) -> float:
-        """Calculate NRMSE with R-matching normalization"""
+        """Calculate NRMSE with detailed debugging"""
         n = len(y_true)
         y_true = np.asarray(y_true)
         y_pred = np.asarray(y_pred)
@@ -478,18 +468,29 @@ def calculate_nrmse(
         residuals = y_true - y_pred
         rss = np.sum(residuals**2)
 
-        # Calculate range same as R
-        y_range = np.max(y_true) - np.min(y_true)
+        # Calculate range from true values
+        y_min = np.min(y_true)
+        y_max = np.max(y_true)
+        scale = y_max - y_min
 
-        # Calculate RMSE
+        # Calculate RMSE first
         rmse = np.sqrt(rss / n)
-        nrmse = rmse / y_range if y_range > 0 else rmse
-
-        # if debug and (iteration % 50 == 0):
-        #     self.logger.debug(f"\nNRMSE Calculation Debug (iteration {iteration}):")
-        #     self.logger.debug(f"RSS: {rss:.6f}")
-        #     self.logger.debug(f"RMSE: {rmse:.6f}")
-        #     self.logger.debug(f"Range: {y_range:.6f}")
-        #     self.logger.debug(f"NRMSE: {nrmse:.6f}")
+
+        if debug and (iteration == 0 or iteration % 25 == 0):
+            print(f"\nNRMSE Calculation Debug (iteration {iteration}):")
+            print(f"n: {n}")
+            print(f"RSS: {rss:.6f}")
+            print(f"RMSE: {rmse:.6f}")
+            print(f"y_true range: [{y_min:.6f}, {y_max:.6f}]")
+            print(f"scale: {scale:.6f}")
+            print("First 5 pairs (true, pred, residual):")
+            for i in range(min(5, len(y_true))):
+                print(f"  {y_true[i]:.6f}, {y_pred[i]:.6f}, {residuals[i]:.6f}")
+
+        # Calculate final NRMSE
+        nrmse = rmse / scale if scale > 0 else rmse
+
+        if debug and (iteration == 0 or iteration % 25 == 0):
+            print(f"Final NRMSE: {nrmse:.6f}")
 
         return float(nrmse)