
Knockoffs (1/4): add comments and docstrings of the functions #128

Open: wants to merge 47 commits into base: main

Commits (47)
e202b84
Remove unnecessary argument
lionelkusch Jan 10, 2025
82b829c
Change the behavior a bit
lionelkusch Jan 10, 2025
d4f80c2
Update the variables
lionelkusch Jan 10, 2025
47dbe24
Put all the knockoff tests together
lionelkusch Jan 10, 2025
81e64c2
Fix a bug
lionelkusch Jan 10, 2025
d6ded88
remove the function for estimating covariance
lionelkusch Jan 10, 2025
801c5f9
Remove unnecessary file
lionelkusch Jan 10, 2025
4c61dd7
Remove a function
lionelkusch Jan 10, 2025
b890cba
comparison with original code
lionelkusch Jan 10, 2025
b6fe948
improve docstring and function
lionelkusch Jan 10, 2025
43a3e9a
Merge file for knockoff together
lionelkusch Jan 10, 2025
a116b53
Add function to repeat the Gaussian knockoff
lionelkusch Jan 10, 2025
1699675
Put all the tests for knockoff in one file
lionelkusch Jan 10, 2025
db6098f
Include the new function in the init
lionelkusch Jan 10, 2025
e38f376
Fix bug
lionelkusch Jan 10, 2025
8861480
Fix bugs
lionelkusch Jan 10, 2025
a0d53b0
Fix test for new signature of the function
lionelkusch Jan 10, 2025
4bb9eb4
Improve the docstring
lionelkusch Jan 10, 2025
d21f46a
Improve docstring knockoff
lionelkusch Jan 10, 2025
f752f9e
/bin/bash: line 1: :wq: command not found
lionelkusch Jan 10, 2025
071ea6d
Remove the beginning of the file
lionelkusch Jan 10, 2025
68dd7e6
Add equations
lionelkusch Jan 13, 2025
727ee7d
Change reference for paper
lionelkusch Jan 13, 2025
efbe49a
add a reference
lionelkusch Jan 15, 2025
bfe111a
Add new tests
lionelkusch Jan 15, 2025
211599e
Rename function and remove warning for test
lionelkusch Jan 15, 2025
beeda0a
Add parameters of knockoff
lionelkusch Jan 15, 2025
3736661
Merge branch 'main' into PR_knockoffs
lionelkusch Jan 15, 2025
d066c5a
Format files
lionelkusch Jan 15, 2025
f6c699f
format file
lionelkusch Jan 15, 2025
bf621d2
Fix bugs
lionelkusch Jan 15, 2025
779288b
Fix bug in utils
lionelkusch Jan 15, 2025
20d0567
Update example knockoff
lionelkusch Jan 15, 2025
c71cc22
Formatting
lionelkusch Jan 15, 2025
bbd3252
Update function
lionelkusch Jan 15, 2025
9af787f
Apply suggestions from code review
lionelkusch Jan 16, 2025
548e283
Fix name variables
lionelkusch Jan 16, 2025
0ad893e
Fix name variables
lionelkusch Jan 16, 2025
252a012
Fix test and name variables
lionelkusch Jan 16, 2025
b34cf1f
Add tests and fix bugs
lionelkusch Jan 16, 2025
2ddfd9d
Add a test and format file
lionelkusch Jan 16, 2025
df45c82
Undo delete of the tests
lionelkusch Jan 16, 2025
e3eeb72
Improve coverage and test
lionelkusch Jan 17, 2025
4df5d90
Format
lionelkusch Jan 17, 2025
8aee168
Group the aggregation and non-aggregation functions together
lionelkusch Jan 17, 2025
bfe6346
Formatting
lionelkusch Jan 17, 2025
4e3f4d2
Replace lambda by alpha
lionelkusch Jan 20, 2025
5 changes: 4 additions & 1 deletion doc_conf/api.rst
@@ -23,8 +23,11 @@ Functions
ensemble_clustered_inference
group_reid
hd_inference
knockoff_aggregation
model_x_knockoff
model_x_knockoff_filter
model_x_knockoff_pvalue
model_x_knockoff_bootstrap_quantile
model_x_knockoff_bootstrap_e_value
Contributor:

Are all these functions meant to be public?

Collaborator (Author):

Yes, they should be public.

multivariate_1D_simulation
permutation_test_cv
reid
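The `model_x_knockoff_filter` function added above performs the selection step on precomputed knockoff statistics. As a rough illustration of what that step computes, here is a self-contained sketch of the knockoff+ selection rule of Barber and Candès (2015); this is not hidimstat's actual implementation, and the name `knockoff_filter_sketch` is hypothetical:

```python
import numpy as np

def knockoff_filter_sketch(W, fdr=0.1, offset=1):
    """Select variables via the knockoff+ threshold.

    W : signed knockoff statistics, one per variable; a large positive
        value is evidence that the variable beats its knockoff copy.
    offset=1 gives the knockoff+ variant with provable FDR control;
    offset=0 gives the more liberal original knockoff filter.
    """
    W = np.asarray(W, dtype=float)
    # Candidate thresholds are the magnitudes of the nonzero statistics.
    thresholds = np.sort(np.abs(W[W != 0]))
    for t in thresholds:
        # Estimated FDP at threshold t: strongly negative statistics
        # stand in for the unobservable false positives.
        fdp_estimate = (offset + np.sum(W <= -t)) / max(np.sum(W >= t), 1)
        if fdp_estimate <= fdr:
            return np.where(W >= t)[0]
    # No threshold achieves the target FDR: select nothing.
    return np.array([], dtype=int)
```

The selection is the set of variables above the smallest magnitude at which the estimated FDP drops below the target; if no threshold qualifies, the procedure returns an empty selection.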
60 changes: 28 additions & 32 deletions doc_conf/references.bib
@@ -77,22 +77,6 @@ @article{Ren_2023
eprint = {https://academic.oup.com/jrsssb/article-pdf/86/1/122/56629998/qkad085.pdf},
}

@article{Candes_2018,
author = {Candès, Emmanuel and Fan, Yingying and Janson, Lucas and Lv, Jinchi},
title = "{Panning for Gold: ‘Model-X’ Knockoffs for High Dimensional Controlled Variable Selection}",
journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology},
volume = {80},
number = {3},
pages = {551-577},
year = {2018},
month = {01},
abstract = "{ Many contemporary large-scale applications involve building interpretable models linking a large set of potential covariates to a response in a non-linear fashion, such as when the response is binary. Although this modelling problem has been extensively studied, it remains unclear how to control the fraction of false discoveries effectively even in high dimensional logistic regression, not to mention general high dimensional non-linear models. To address such a practical problem, we propose a new framework of ‘model-X’ knockoffs, which reads from a different perspective the knockoff procedure that was originally designed for controlling the false discovery rate in linear models. Whereas the knockoffs procedure is constrained to homoscedastic linear models with n⩾p, the key innovation here is that model-X knockoffs provide valid inference from finite samples in settings in which the conditional distribution of the response is arbitrary and completely unknown. Furthermore, this holds no matter the number of covariates. Correct inference in such a broad setting is achieved by constructing knockoff variables probabilistically instead of geometrically. To do this, our approach requires that the covariates are random (independent and identically distributed rows) with a distribution that is known, although we provide preliminary experimental evidence that our procedure is robust to unknown or estimated distributions. To our knowledge, no other procedure solves the controlled variable selection problem in such generality but, in the restricted settings where competitors exist, we demonstrate the superior power of knockoffs through simulations. Finally, we apply our procedure to data from a case–control study of Crohn's disease in the UK, making twice as many discoveries as the original analysis of the same data.}",
issn = {1369-7412},
doi = {10.1111/rssb.12265},
url = {https://doi.org/10.1111/rssb.12265},
eprint = {https://academic.oup.com/jrsssb/article-pdf/80/3/551/49274696/jrsssb\_80\_3\_551.pdf},
}

@article{breimanRandomForests2001,
title = {Random {{Forests}}},
author = {Breiman, Leo},
@@ -148,20 +132,24 @@ @article{miPermutationbasedIdentificationImportant2021
keywords = {Cancer,Data mining,Machine learning,Statistical methods},
}

@article{candesPanningGoldModelX2017,
title = {Panning for {{Gold}}: {{Model-X Knockoffs}} for {{High-dimensional Controlled Variable Selection}}},
shorttitle = {Panning for {{Gold}}},
author = {Candes, Emmanuel and Fan, Yingying and Janson, Lucas and Lv, Jinchi},
year = {2017},
month = dec,
journal = {arXiv:1610.02351 [math, stat]},
eprint = {1610.02351},
primaryclass = {math, stat},
urldate = {2022-01-12},
abstract = {Many contemporary large-scale applications involve building interpretable models linking a large set of potential covariates to a response in a nonlinear fashion, such as when the response is binary. Although this modeling problem has been extensively studied, it remains unclear how to effectively control the fraction of false discoveries even in high-dimensional logistic regression, not to mention general high-dimensional nonlinear models. To address such a practical problem, we propose a new framework of \$model\$-\$X\$ knockoffs, which reads from a different perspective the knockoff procedure (Barber and Cand{\textbackslash}`es, 2015) originally designed for controlling the false discovery rate in linear models. Whereas the knockoffs procedure is constrained to homoscedastic linear models with \$n{\textbackslash}ge p\$, the key innovation here is that model-X knockoffs provide valid inference from finite samples in settings in which the conditional distribution of the response is arbitrary and completely unknown. Furthermore, this holds no matter the number of covariates. Correct inference in such a broad setting is achieved by constructing knockoff variables probabilistically instead of geometrically. To do this, our approach requires the covariates be random (independent and identically distributed rows) with a distribution that is known, although we provide preliminary experimental evidence that our procedure is robust to unknown/estimated distributions. To our knowledge, no other procedure solves the \$controlled\$ variable selection problem in such generality, but in the restricted settings where competitors exist, we demonstrate the superior power of knockoffs through simulations. Finally, we apply our procedure to data from a case-control study of Crohn's disease in the United Kingdom, making twice as many discoveries as the original analysis of the same data.},
archiveprefix = {arxiv},
keywords = {Mathematics - Statistics Theory,Statistics - Applications,Statistics - Methodology},
file = {/home/ahmad/Zotero/storage/YZ23F3Q5/Candes et al. - 2017 - Panning for Gold Model-X Knockoffs for High-dimen.pdf;/home/ahmad/Zotero/storage/ZSN64F6N/1610.html}
@article{candes2018panning,
title={Panning for gold: ‘model-X’ knockoffs for high dimensional controlled variable selection},
author={Candes, Emmanuel and Fan, Yingying and Janson, Lucas and Lv, Jinchi},
journal={Journal of the Royal Statistical Society Series B: Statistical Methodology},
volume={80},
number={3},
pages={551--577},
year={2018},
publisher={Oxford University Press}
}

@article{barber2015controlling,
title={Controlling the false discovery rate via knockoffs},
author={Barber, Rina Foygel and Cand{\`e}s, Emmanuel J},
journal={The Annals of statistics},
pages={2055--2085},
year={2015},
publisher={JSTOR}
}

@article{liuFastPowerfulConditional2021,
@@ -176,5 +164,13 @@ @article{liuFastPowerfulConditional2021
abstract = {We consider the problem of conditional independence testing: given a response Y and covariates (X,Z), we test the null hypothesis that Y is independent of X given Z. The conditional randomization test (CRT) was recently proposed as a way to use distributional information about X{\textbar}Z to exactly (non-asymptotically) control Type-I error using any test statistic in any dimensionality without assuming anything about Y{\textbar}(X,Z). This flexibility in principle allows one to derive powerful test statistics from complex prediction algorithms while maintaining statistical validity. Yet the direct use of such advanced test statistics in the CRT is prohibitively computationally expensive, especially with multiple testing, due to the CRT's requirement to recompute the test statistic many times on resampled data. We propose the distilled CRT, a novel approach to using state-of-the-art machine learning algorithms in the CRT while drastically reducing the number of times those algorithms need to be run, thereby taking advantage of their power and the CRT's statistical guarantees without suffering the usual computational expense. In addition to distillation, we propose a number of other tricks like screening and recycling computations to further speed up the CRT without sacrificing its high power and exact validity. Indeed, we show in simulations that all our proposals combined lead to a test that has similar power to the most powerful existing CRT implementations but requires orders of magnitude less computation, making it a practical tool even for large data sets. We demonstrate these benefits on a breast cancer dataset by identifying biomarkers related to cancer stage.},
archiveprefix = {arxiv},
keywords = {Statistics - Methodology},
file = {/home/ahmad/Zotero/storage/8HRQZX3H/Liu et al. - 2021 - Fast and Powerful Conditional Randomization Testin.pdf;/home/ahmad/Zotero/storage/YFNDKN2B/2006.html}
}
}

@article{reid2016study,
title={A study of error variance estimation in lasso regression},
author={Reid, Stephen and Tibshirani, Robert and Friedman, Jerome},
journal={Statistica Sinica},
pages={35--67},
year={2016},
publisher={JSTOR}
}
52 changes: 36 additions & 16 deletions examples/plot_knockoff_aggregation.py
@@ -24,10 +24,16 @@

import numpy as np
from hidimstat.data_simulation import simu_data
from hidimstat.knockoffs import model_x_knockoff
from hidimstat.knockoff_aggregation import knockoff_aggregation
from hidimstat.knockoffs import (
model_x_knockoff,
model_x_knockoff_filter,
model_x_knockoff_bootstrap_quantile,
model_x_knockoff_bootstrap_e_value,
)
from hidimstat.utils import cal_fdp_power
from sklearn.utils import check_random_state
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

plt.rcParams.update({"font.size": 26})
@@ -61,32 +67,46 @@ def single_run(
)

# Use model-X Knockoffs [1]
mx_selection = model_x_knockoff(X, y, fdr=fdr, n_jobs=n_jobs, seed=seed)

test_scores = model_x_knockoff(
X,
y,
estimator=LassoCV(
n_jobs=n_jobs,
verbose=0,
max_iter=1000,
cv=KFold(n_splits=5, shuffle=True, random_state=0),
tol=1e-6,
),
n_bootstraps=1,
random_state=seed,
)
mx_selection = model_x_knockoff_filter(test_scores, fdr=fdr)
fdp_mx, power_mx = cal_fdp_power(mx_selection, non_zero_index)

# Use p-values aggregation [2]
aggregated_ko_selection = knockoff_aggregation(
test_scores = model_x_knockoff(
X,
y,
fdr=fdr,
estimator=LassoCV(
n_jobs=n_jobs,
verbose=0,
max_iter=1000,
cv=KFold(n_splits=5, shuffle=True, random_state=0),
tol=1e-6,
),
n_bootstraps=n_bootstraps,
n_jobs=n_jobs,
gamma=0.3,
random_state=seed,
)
aggregated_ko_selection = model_x_knockoff_bootstrap_quantile(
test_scores, fdr=fdr, gamma=0.3, selection_only=True
)

fdp_pval, power_pval = cal_fdp_power(aggregated_ko_selection, non_zero_index)

# Use e-values aggregation [1]
eval_selection = knockoff_aggregation(
X,
y,
fdr=fdr,
method="e-values",
n_bootstraps=n_bootstraps,
n_jobs=n_jobs,
gamma=0.3,
random_state=seed,
eval_selection = model_x_knockoff_bootstrap_e_value(
test_scores, fdr=fdr, selection_only=True
)

fdp_eval, power_eval = cal_fdp_power(eval_selection, non_zero_index)
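The example scores each method with `cal_fdp_power`. As a minimal sketch of what such a helper presumably computes from a selection set and the true support (an illustration under assumed semantics, not hidimstat's code; `fdp_power_sketch` is a hypothetical name):

```python
import numpy as np

def fdp_power_sketch(selected, true_support):
    """False discovery proportion and power of a selection set.

    selected : indices returned by the selection procedure.
    true_support : indices of the truly non-zero coefficients.
    """
    selected = set(np.atleast_1d(np.asarray(selected, dtype=int)).tolist())
    true_support = set(np.atleast_1d(np.asarray(true_support, dtype=int)).tolist())
    # FDP: fraction of selections that are not in the true support.
    fdp = len(selected - true_support) / max(len(selected), 1)
    # Power: fraction of the true support that was recovered.
    power = len(selected & true_support) / max(len(true_support), 1)
    return fdp, power
```

The `max(..., 1)` guards define both quantities as 0 when nothing is selected, which matches the usual convention for empty selections.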
14 changes: 11 additions & 3 deletions src/hidimstat/__init__.py
@@ -3,8 +3,13 @@
from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso
from .Dnn_learner_single import DnnLearnerSingle
from .ensemble_clustered_inference import ensemble_clustered_inference
from .knockoff_aggregation import knockoff_aggregation
from .knockoffs import model_x_knockoff
from .knockoffs import (
model_x_knockoff,
model_x_knockoff_filter,
model_x_knockoff_pvalue,
model_x_knockoff_bootstrap_quantile,
model_x_knockoff_bootstrap_e_value,
)
from .multi_sample_split import aggregate_quantiles
from .noise_std import group_reid, reid
from .permutation_test import permutation_test_cv
@@ -31,8 +36,11 @@
"ensemble_clustered_inference",
"group_reid",
"hd_inference",
"knockoff_aggregation",
"model_x_knockoff",
"model_x_knockoff_filter",
"model_x_knockoff_pvalue",
"model_x_knockoff_bootstrap_quantile",
"model_x_knockoff_bootstrap_e_value",
"multivariate_1D_simulation",
"permutation_test_cv",
"reid",
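Among the newly exported names, `model_x_knockoff_bootstrap_quantile` aggregates the per-bootstrap p-values into a single selection. A self-contained sketch of the standard gamma-quantile aggregation rule (Meinshausen, Meier, and Bühlmann, 2009) that such a function is typically built on; the function name and exact behavior here are illustrative, not hidimstat's implementation:

```python
import numpy as np

def quantile_aggregate_pvalues(pvals, gamma=0.3):
    """Aggregate one p-value per bootstrap into one p-value per variable.

    pvals : array of shape (n_bootstraps, n_variables).
    The rule takes the empirical gamma-quantile across bootstraps and
    inflates it by 1/gamma, which preserves validity of the aggregated
    p-value; the result is clipped at 1.
    """
    pvals = np.asarray(pvals, dtype=float)
    q = np.quantile(pvals, gamma, axis=0)
    return np.minimum(1.0, q / gamma)
```

A smaller `gamma` keys the aggregate to the most significant bootstraps but pays a larger 1/gamma inflation penalty; gamma=0.3 (as in the example above) is a common middle ground.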