Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update modeling column discrepancies #1170

Draft
wants to merge 4 commits into
base: robynpy_release
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,5 @@ python/src/tutorials/data/R/*
python/src/tutorials/data/*
*.pkl
python/src/robyn/_deprecate/*
python/src/robyn/tutorials/output/*
python/src/robyn/tutorials/output/*
python/src/robyn/debug/*
66 changes: 31 additions & 35 deletions python/src/robyn/data/entities/mmmdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,59 +196,55 @@ def remove_column(self, column_name: str) -> None:
self.data.drop(columns=[column_name], inplace=True)

def calculate_rolling_window_indices(self) -> None:
# Ensure the date column is in datetime format
"""Match R's window index calculation exactly"""
# Convert dates to pandas datetime
self.data[self.mmmdata_spec.date_var] = pd.to_datetime(
self.data[self.mmmdata_spec.date_var]
)
# Convert window_start and window_end to datetime if they are strings
window_start = (
pd.to_datetime(self.mmmdata_spec.window_start)
if isinstance(self.mmmdata_spec.window_start, str)
else self.mmmdata_spec.window_start
)
window_end = (
pd.to_datetime(self.mmmdata_spec.window_end)
if isinstance(self.mmmdata_spec.window_end, str)
else self.mmmdata_spec.window_end
)
dates_vec = self.data[self.mmmdata_spec.date_var]

# Calculate the index for the rolling window start
# Convert window dates to datetime
window_start = pd.to_datetime(self.mmmdata_spec.window_start)
window_end = pd.to_datetime(self.mmmdata_spec.window_end)

# Calculate start index using days difference like R
if window_start is not None:
closest_start_idx = (
(self.data[self.mmmdata_spec.date_var] - window_start).abs().idxmin()
)
closest_start_date = self.data[self.mmmdata_spec.date_var].iloc[
closest_start_idx
]
days_diff = (dates_vec - window_start).dt.total_seconds() / (24 * 3600)
closest_start_idx = days_diff.abs().argmin()
closest_start_date = dates_vec.iloc[closest_start_idx]

print(f"Python window calculation (start):")
print(f"window_start: {window_start}")
print(f"closest_start_date: {closest_start_date}")
print(f"closest_start_idx: {closest_start_idx}")
print(f"days_diff first few values: {days_diff.head().values}")

self.mmmdata_spec.rolling_window_start_which = closest_start_idx
# Adjust window_start to the closest date in the data
self.mmmdata_spec.window_start = closest_start_date
print(
f"Adjusted window_start to the closest date in the data: {closest_start_date}"
)

# Calculate the index for the rolling window end
# Calculate end index using days difference like R
if window_end is not None:
closest_end_idx = (
(self.data[self.mmmdata_spec.date_var] - window_end).abs().idxmin()
)
closest_end_date = self.data[self.mmmdata_spec.date_var].iloc[
closest_end_idx
]
days_diff = (dates_vec - window_end).dt.total_seconds() / (24 * 3600)
closest_end_idx = days_diff.abs().argmin()
closest_end_date = dates_vec.iloc[closest_end_idx]

print(f"Python window calculation (end):")
print(f"window_end: {window_end}")
print(f"closest_end_date: {closest_end_date}")
print(f"closest_end_idx: {closest_end_idx}")
print(f"days_diff first few values: {days_diff.head().values}")

self.mmmdata_spec.rolling_window_end_which = closest_end_idx
# Adjust window_end to the closest date in the data
self.mmmdata_spec.window_end = closest_end_date
print(
f"Adjusted window_end to the closest date in the data: {closest_end_date}"
)

# Calculate rolling window length
# Calculate window length
if window_start is not None and window_end is not None:
self.mmmdata_spec.rolling_window_length = (
self.mmmdata_spec.rolling_window_end_which
- self.mmmdata_spec.rolling_window_start_which
+ 1
)
print(f"Window length: {self.mmmdata_spec.rolling_window_length}")

def set_default_factor_vars(self) -> None:
"""
Expand Down
22 changes: 0 additions & 22 deletions python/src/robyn/modeling/base_model_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,25 +318,3 @@ def _apply_transformations(
"Error applying transformations for channel %s: %s", channel, str(e)
)
raise

def _prepare_model_data(self, hyperparameters: Dict[str, Any]) -> np.ndarray:
logger.info("Preparing model data")
transformed_data = {}

for channel in self.mmmdata.paid_media_vars:
logger.debug("Processing channel: %s", channel)
transformed_data[channel] = self._apply_transformations(
channel, hyperparameters
)

logger.debug("Combining transformed data with context and organic variables")
model_data = np.column_stack(
[transformed_data[channel] for channel in self.mmmdata.paid_media_vars]
+ [
self.mmmdata.data[var]
for var in self.mmmdata.context_vars + self.mmmdata.organic_vars
]
)

logger.info("Model data preparation complete. Shape: %s", model_data.shape)
return model_data
7 changes: 5 additions & 2 deletions python/src/robyn/modeling/ridge/ridge_data_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,16 +166,19 @@ def _hyper_collector(
return hyper_collect

def _geometric_adstock(self, x: pd.Series, theta: float) -> pd.Series:
# print(f"Before adstock: {x.head()}")
y = x.copy()
for i in range(1, len(x)):
y.iloc[i] += theta * y.iloc[i - 1]
# print(f"After adstock: {y.head()}")
return y

def _hill_transformation(
self, x: pd.Series, alpha: float, gamma: float
) -> pd.Series:

# Add debug self.logger.debugs
# print(f"Before hill: {x.head()}")
x_scaled = (x - x.min()) / (x.max() - x.min())
result = x_scaled**alpha / (x_scaled**alpha + gamma**alpha)

# print(f"After hill: {result.head()}")
return result
12 changes: 5 additions & 7 deletions python/src/robyn/modeling/ridge/ridge_evaluate_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,12 +227,10 @@ def _evaluate_model(
debug = True

if debug and (iter_ng == 0 or iter_ng % 25 == 0):
self.logger.debug(
f"\nEvaluation Debug (trial {trial}, iteration {iter_ng}):"
)
self.logger.debug(f"X shape: {X.shape}")
self.logger.debug(f"y shape: {y.shape}")
self.logger.debug("Parameters:", params)
print(f"\nEvaluation Debug (trial {trial}, iteration {iter_ng}):")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print("Parameters:", params)

# Split data using R's approach
train_size = params.get("train_size", 1.0) if ts_validation else 1.0
Expand Down Expand Up @@ -272,7 +270,7 @@ def _evaluate_model(
# Calculate metrics using R-style calculations
y_train_pred = model.predict(x_norm)
metrics["rsq_train"] = self.ridge_metrics_calculator.calculate_r2_score(
y_norm, y_train_pred, x_norm.shape[1], debug=debug, iteration=iter_ng
y_norm, y_train_pred, x_norm.shape[1]
)
metrics["nrmse_train"] = self.ridge_metrics_calculator.calculate_nrmse(
y_norm, y_train_pred, debug=debug, iteration=iter_ng
Expand Down
159 changes: 80 additions & 79 deletions python/src/robyn/modeling/ridge/ridge_metrics_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,13 @@ def _calculate_decomp_spend_dist(
"iterPar",
"Elapsed",
]
# self.logger.debug(f"Decomp spend distribution debug:")
# self.logger.debug(f"Total media spend: {total_media_spend}")
# self.logger.debug(f"Total effect: {total_effect}")
# for col in paid_media_cols:
# self.logger.debug(f"{col} - effect: {all_effects[col]}, spend: {all_spends[col]}")
self.logger.debug(f"Decomp spend distribution debug:")
self.logger.debug(f"Total media spend: {total_media_spend}")
self.logger.debug(f"Total effect: {total_effect}")
for col in paid_media_cols:
self.logger.debug(
f"{col} - effect: {all_effects[col]}, spend: {all_spends[col]}"
)
return df[required_cols]

def _calculate_lambda(
Expand All @@ -163,44 +165,45 @@ def _calculate_lambda(
iteration: int = 0,
) -> Tuple[float, float]:
"""Match R's glmnet lambda calculation exactly"""

def mysd(y: np.ndarray) -> float:
"""R's implementation of sd"""
return np.sqrt(np.sum((y - np.mean(y)) ** 2) / len(y))

# Scale X using R's approach
x_means = np.mean(x_norm, axis=0)
x_sds = np.apply_along_axis(mysd, 0, x_norm)
sx = (x_norm - x_means) / x_sds[:, np.newaxis].T

# Handle NaN values like R does
check_nan = np.all(np.isnan(sx), axis=0)
sx = np.where(check_nan, 0, sx)

# R does not scale y in this case
sy = y_norm

# Calculate lambda_max using R's approach
# 0.001 is glmnet's default alpha value for ridge regression
lambda_max = np.max(np.abs(np.sum(sx * sy[:, np.newaxis], axis=0))) / (
0.001 * len(x_norm)
)

# Calculate lambda using lambda_hp
lambda_min = lambda_max * 0.0001 # R's default lambda_min_ratio
n_samples = len(y_norm)

# Standardize first before scale factor calculation
def r_scale(x):
mean = np.mean(x)
sd = np.sqrt(np.sum((x - mean) ** 2) / (len(x) - 1))
return (x - mean) / (sd if sd > 1e-10 else 1.0)

# Scale X and y first
x_scaled = np.apply_along_axis(r_scale, 0, x_norm)
y_scaled = r_scale(y_norm)

# Now calculate scale factor using scaled data
scale_factor = np.mean(np.abs(x_scaled)) * np.mean(np.abs(y_scaled))

if debug and (iteration == 0 or iteration % 25 == 0):
print(f"\nLambda Calculation Debug (iteration {iteration}):")
print(f"n_samples: {n_samples}")
print(f"x_scaled mean abs: {np.mean(np.abs(x_scaled)):.6f}")
print(f"y_scaled mean abs: {np.mean(np.abs(y_scaled)):.6f}")
print(f"Scale factor: {scale_factor:.6f}")
print(f"lambda_hp: {lambda_hp:.6f}")

# R's lambda calculation
alpha = 0.001
gram = x_scaled.T @ x_scaled / n_samples
ctx = np.abs(x_scaled.T @ y_scaled) / n_samples

lambda_max = np.max(ctx) / alpha
lambda_min = lambda_max * 0.0001
lambda_ = lambda_min + lambda_hp * (lambda_max - lambda_min)

if debug and (iteration % 10 == 0):
self.logger.debug(f"\nLambda Debug (iter {iteration}):")
self.logger.debug(f"x_means: {np.mean(np.abs(x_means)):.6f}")
self.logger.debug(f"x_sds mean: {np.mean(x_sds):.6f}")
self.logger.debug(f"sx mean: {np.mean(np.abs(sx)):.6f}")
self.logger.debug(f"sy mean: {np.mean(np.abs(sy)):.6f}")
self.logger.debug(f"lambda_max: {lambda_max:.6f}")
self.logger.debug(f"lambda_: {lambda_:.6f}")
self.logger.debug(f"lambda_hp: {lambda_hp:.6f}")
if debug and (iteration == 0 or iteration % 25 == 0):
print(f"max ctx: {np.max(ctx):.6f}")
print(f"lambda_max: {lambda_max:.6f}")
print(f"lambda_min: {lambda_min:.6f}")
print(f"final lambda_: {lambda_:.6f}")

return float(lambda_), float(lambda_max)
return lambda_, lambda_max

def _calculate_rssd(
self,
Expand All @@ -225,11 +228,11 @@ def _calculate_rssd(
effects.append(effect)
spends.append(raw_spend)

# if debug and (iteration % 50 == 0):
# self.logger.debug(f"{col}:")
# self.logger.debug(f" coefficient: {coef:.6f}")
# self.logger.debug(f" raw spend: {raw_spend:.6f}")
# self.logger.debug(f" effect: {effect:.6f}")
if debug and (iteration == 0 or iteration % 25 == 0):
print(f"{col}:")
print(f" coefficient: {coef:.6f}")
print(f" raw spend: {raw_spend:.6f}")
print(f" effect: {effect:.6f}")

# Convert to numpy arrays
effects = np.array(effects)
Expand All @@ -244,12 +247,12 @@ def _calculate_rssd(
effects_norm = effects / total_effect
spends_norm = spends / total_spend

# if debug and (iteration % 50 == 0):
# self.logger.debug("\nNormalized values:")
# self.logger.debug("Effects:", effects_norm)
# self.logger.debug("Spends:", spends_norm)
# self.logger.debug("Effect total (check=1):", np.sum(effects_norm))
# self.logger.debug("Spend total (check=1):", np.sum(spends_norm))
if debug and (iteration == 0 or iteration % 25 == 0):
print("\nNormalized values:")
print("Effects:", effects_norm)
print("Spends:", spends_norm)
print("Effect total (check=1):", np.sum(effects_norm))
print("Spend total (check=1):", np.sum(spends_norm))

# Calculate RSSD
squared_diff = (effects_norm - spends_norm) ** 2
Expand Down Expand Up @@ -427,18 +430,12 @@ def _calculate_x_decomp_agg(
return df

def calculate_r2_score(
self,
y_true: np.ndarray,
y_pred: np.ndarray,
n_features: int,
df_int: int = 1,
debug: bool = True,
iteration: int = 0,
self, y_true: np.ndarray, y_pred: np.ndarray, n_features: int, df_int: int = 1
) -> float:
"""Match R's R² calculation exactly"""
n = len(y_true)

# Calculate R's version of R²
# Scale data like R
y_mean = np.mean(y_true)
ss_tot = np.sum((y_true - y_mean) ** 2)
ss_res = np.sum((y_true - y_pred) ** 2)
Expand All @@ -449,17 +446,10 @@ def calculate_r2_score(
# R's adjustment formula
adj_r2 = 1 - ((1 - r2) * (n - df_int) / (n - n_features - df_int))

# Match R's scale
if adj_r2 < 0:
adj_r2 = -np.sqrt(np.abs(adj_r2)) # R-style negative scaling

# if debug and (iteration % 50 == 0):
# self.logger.debug(f"R² Calculation Debug:")
# self.logger.debug(f"n: {n}")
# self.logger.debug(f"ss_tot: {ss_tot:.6f}")
# self.logger.debug(f"ss_res: {ss_res:.6f}")
# self.logger.debug(f"R²: {r2:.6f}")
# self.logger.debug(f"Adjusted R²: {adj_r2:.6f}")

return float(adj_r2)

def calculate_nrmse(
Expand All @@ -469,7 +459,7 @@ def calculate_nrmse(
debug: bool = True,
iteration: int = 0,
) -> float:
"""Calculate NRMSE with R-matching normalization"""
"""Calculate NRMSE with detailed debugging"""
n = len(y_true)
y_true = np.asarray(y_true)
y_pred = np.asarray(y_pred)
Expand All @@ -478,18 +468,29 @@ def calculate_nrmse(
residuals = y_true - y_pred
rss = np.sum(residuals**2)

# Calculate range same as R
y_range = np.max(y_true) - np.min(y_true)
# Calculate range from true values
y_min = np.min(y_true)
y_max = np.max(y_true)
scale = y_max - y_min

# Calculate RMSE
# Calculate RMSE first
rmse = np.sqrt(rss / n)
nrmse = rmse / y_range if y_range > 0 else rmse

# if debug and (iteration % 50 == 0):
# self.logger.debug(f"\nNRMSE Calculation Debug (iteration {iteration}):")
# self.logger.debug(f"RSS: {rss:.6f}")
# self.logger.debug(f"RMSE: {rmse:.6f}")
# self.logger.debug(f"Range: {y_range:.6f}")
# self.logger.debug(f"NRMSE: {nrmse:.6f}")

if debug and (iteration == 0 or iteration % 25 == 0):
print(f"\nNRMSE Calculation Debug (iteration {iteration}):")
print(f"n: {n}")
print(f"RSS: {rss:.6f}")
print(f"RMSE: {rmse:.6f}")
print(f"y_true range: [{y_min:.6f}, {y_max:.6f}]")
print(f"scale: {scale:.6f}")
print("First 5 pairs (true, pred, residual):")
for i in range(min(5, len(y_true))):
print(f" {y_true[i]:.6f}, {y_pred[i]:.6f}, {residuals[i]:.6f}")

# Calculate final NRMSE
nrmse = rmse / scale if scale > 0 else rmse

if debug and (iteration == 0 or iteration % 25 == 0):
print(f"Final NRMSE: {nrmse:.6f}")

return float(nrmse)
Loading
Loading