diff --git a/paper/paper.bib b/paper/paper.bib index 63704bc..5f64af1 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -21,17 +21,19 @@ @misc{cmpnn_amended_results } @article{representation_review, - title = {From intuition to AI: evolution of small molecule representations in drug discovery}, - volume = {25}, - ISSN = {1477-4054}, - url = {http://dx.doi.org/10.1093/bib/bbad422}, - DOI = {10.1093/bib/bbad422}, - number = {1}, - journal = {Briefings in Bioinformatics}, - publisher = {Oxford University Press (OUP)}, - author = {McGibbon, Miles and Shave, Steven and Dong, Jie and Gao, Yumiao and Houston, Douglas R and Xie, Jiancong and Yang, Yuedong and Schwaller, Philippe and Blay, Vincent}, - year = {2023}, - month = nov + author = {McGibbon, Miles and Shave, Steven and Dong, Jie and Gao, Yumiao and Houston, Douglas R and Xie, Jiancong and Yang, Yuedong and Schwaller, Philippe and Blay, Vincent}, + title = "{From intuition to AI: evolution of small molecule representations in drug discovery}", + journal = {Briefings in Bioinformatics}, + volume = {25}, + number = {1}, + pages = {bbad422}, + year = {2023}, + month = {11}, + abstract = "{Within drug discovery, the goal of AI scientists and cheminformaticians is to help identify molecular starting points that will develop into safe and efficacious drugs while reducing costs, time and failure rates. To achieve this goal, it is crucial to represent molecules in a digital format that makes them machine-readable and facilitates the accurate prediction of properties that drive decision-making. Over the years, molecular representations have evolved from intuitive and human-readable formats to bespoke numerical descriptors and fingerprints, and now to learned representations that capture patterns and salient features across vast chemical spaces. Among these, sequence-based and graph-based representations of small molecules have become highly popular. 
However, each approach has strengths and weaknesses across dimensions such as generality, computational cost, inversibility for generative applications and interpretability, which can be critical in informing practitioners’ decisions. As the drug discovery landscape evolves, opportunities for innovation continue to emerge. These include the creation of molecular representations for high-value, low-data regimes, the distillation of broader biological and chemical knowledge into novel learned representations and the modeling of up-and-coming therapeutic modalities.}", + issn = {1477-4054}, + doi = {10.1093/bib/bbad422}, + url = {https://doi.org/10.1093/bib/bbad422}, + eprint = {https://academic.oup.com/bib/article-pdf/25/1/bbad422/53933271/bbad422.pdf}, } @article{estrada_abc, @@ -39,9 +41,19 @@ @article{estrada_abc author={Estrada, Ernesto and Torres, Luis and Rodriguez, Lissette and Gutman, Ivan}, year={1998}, publisher={NISCAIR-CSIR, India}, - url={http://nopr.niscpr.res.in/handle/123456789/40308}, + url={http://nopr.niscpr.res.in/handle/123456789/40308}, } + @article{estrada_abc_ijc, + title={An atom-bond connectivity index: Modelling the enthalpy of formation of alkanes}, + volume={37}, + url={http://nopr.niscpr.res.in/handle/123456789/40308}, + journal={Indian Journal of Chemistry}, + author={Estrada, Ernesto and Torres, Luis and Rodriguez, Lissette and Gutman, Ivan}, + year={1998}, + pages={849--855} + } + @article{quantumscents, author = {Burns, Jackson W. 
and Rogers, David M.}, title = {QuantumScents: Quantum-Mechanical Properties for 3.5k Olfactory Molecules}, @@ -120,18 +132,19 @@ @inproceedings{unimol author={Gengmo Zhou and Zhifeng Gao and Qiankun Ding and Hang Zheng and Hongteng Xu and Zhewei Wei and Linfeng Zhang and Guolin Ke}, booktitle={The Eleventh International Conference on Learning Representations }, year={2023}, - url={https://openreview.net/forum?id=6K2RM6wVqKu}, doi={10.26434/chemrxiv-2022-jjm0j}, url={https://doi.org/10.26434/chemrxiv-2022-jjm0j} } -@misc{sggrl, - title={Multi-Modal Representation Learning for Molecular Property Prediction: Sequence, Graph, Geometry}, - author={Zeyu Wang and Tianyi Jiang and Jinhuan Wang and Qi Xuan}, - year={2024}, - eprint={2401.03369}, - archivePrefix={arXiv}, - primaryClass={q-bio.MN} +@article{sggrl, + doi = {10.48550/ARXIV.2401.03369}, + url = {https://arxiv.org/abs/2401.03369}, + author = {Wang, Zeyu and Jiang, Tianyi and Wang, Jinhuan and Xuan, Qi}, + keywords = {Molecular Networks (q-bio.MN), Machine Learning (cs.LG), Biomolecules (q-bio.BM), FOS: Biological sciences, FOS: Biological sciences, FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Multi-Modal Representation Learning for Molecular Property Prediction: Sequence, Graph, Geometry}, + publisher = {arXiv}, + year = {2024}, + copyright = {arXiv.org perpetual, non-exclusive license} } @article{gslmpp, @@ -272,7 +285,7 @@ @article{ma_deep_qsar } } -@misc{mhnn, +@article{mhnn, title={Molecular Hypergraph Neural Networks}, author={Junwu Chen and Philippe Schwaller}, year={2023}, @@ -312,17 +325,22 @@ @article{biswas_critical } @article{mordred, - title = {Mordred: a molecular descriptor calculator}, - volume = {10}, - ISSN = {1758-2946}, - url = {http://dx.doi.org/10.1186/s13321-018-0258-y}, - DOI = {10.1186/s13321-018-0258-y}, - number = {1}, - journal = {Journal of Cheminformatics}, - publisher = {Springer Science and Business Media LLC}, - author = {Moriwaki, 
Hirotomo and Tian, Yu-Shi and Kawashita, Norihito and Takagi, Tatsuya}, - year = {2018}, - month = feb + author={Moriwaki, Hirotomo + and Tian, Yu-Shi + and Kawashita, Norihito + and Takagi, Tatsuya}, + title={Mordred: a molecular descriptor calculator}, + journal={Journal of Cheminformatics}, + year={2018}, + month={Feb}, + day={06}, + volume={10}, + number={1}, + pages={4}, + abstract={Molecular descriptors are widely employed to present molecular characteristics in cheminformatics. Various molecular-descriptor-calculation software programs have been developed. However, users of those programs must contend with several issues, including software bugs, insufficient update frequencies, and software licensing constraints. To address these issues, we propose Mordred, a developed descriptor-calculation software application that can calculate more than 1800 two- and three-dimensional descriptors. It is freely available via GitHub. Mordred can be easily installed and used in the command line interface, as a web application, or as a high-flexibility Python package on all major platforms (Windows, Linux, and macOS). Performance benchmark results show that Mordred is at least twice as fast as the well-known PaDEL-Descriptor and it can calculate descriptors for large molecules, which cannot be accomplished by other software. 
Owing to its good performance, convenience, number of descriptors, and a lax licensing constraint, Mordred is a promising choice of molecular descriptor calculation software that can be utilized for cheminformatics studies, such as those on quantitative structure--property relationships.}, + issn={1758-2946}, + doi={10.1186/s13321-018-0258-y}, + url={https://doi.org/10.1186/s13321-018-0258-y} } @article{smiles, @@ -348,7 +366,8 @@ @software{lightning title = {{PyTorch Lightning}}, url = {https://github.com/Lightning-AI/lightning}, version = {1.4}, -year = {2019} +year = {2019}, +doi = {10.5281/zenodo.3828935} } @article{stereo_signature, @@ -366,7 +385,7 @@ @article{stereo_signature pages = {887-897} } -@misc{shap, +@article{shap, doi = {10.48550/ARXIV.1705.07874}, url = {https://arxiv.org/abs/1705.07874}, author = {Lundberg, Scott and Lee, Su-In}, @@ -431,7 +450,7 @@ @article{wu_photovoltaic url = {http://dx.doi.org/10.1021/acs.iecr.0c03880}, DOI = {10.1021/acs.iecr.0c03880}, number = {42}, - journal = {Industrial & Engineering Chemistry Research}, + journal = {Industrial and Engineering Chemistry Research}, publisher = {American Chemical Society (ACS)}, author = {Wu, Jinkui and Wang, Shihui and Zhou, Li and Ji, Xu and Dai, Yiyang and Dang, Yagu and Kraft, Markus}, year = {2020}, @@ -484,19 +503,26 @@ @article{pah } @article{ara, - title = {DeepAR: a novel deep learning-based hybrid framework for the interpretable prediction of androgen receptor antagonists}, - volume = {15}, - ISSN = {1758-2946}, - url = {http://dx.doi.org/10.1186/s13321-023-00721-z}, - DOI = {10.1186/s13321-023-00721-z}, - number = {1}, - journal = {Journal of Cheminformatics}, - publisher = {Springer Science and Business Media LLC}, - author = {Schaduangrat, Nalini and Anuwongcharoen, Nuttapat and Charoenkwan, Phasit and Shoombuatong, Watshara}, - year = {2023}, - month = may + author={Schaduangrat, Nalini + and Anuwongcharoen, Nuttapat + and Charoenkwan, Phasit + and Shoombuatong, Watshara}, + 
title={DeepAR: a novel deep learning-based hybrid framework for the interpretable prediction of androgen receptor antagonists}, + journal={Journal of Cheminformatics}, + year={2023}, + month={May}, + day={06}, + volume={15}, + number={1}, + pages={50}, + abstract={Drug resistance represents a major obstacle to therapeutic innovations and is a prevalent feature in prostate cancer (PCa). Androgen receptors (ARs) are the hallmark therapeutic target for prostate cancer modulation and AR antagonists have achieved great success. However, rapid emergence of resistance contributing to PCa progression is the ultimate burden of their long-term usage. Hence, the discovery and development of AR antagonists with capability to combat the resistance, remains an avenue for further exploration. Therefore, this study proposes a novel deep learning (DL)-based hybrid framework, named DeepAR, to accurately and rapidly identify AR antagonists by using only the SMILES notation. Specifically, DeepAR is capable of extracting and learning the key information embedded in AR antagonists. Firstly, we established a benchmark dataset by collecting active and inactive compounds against AR from the ChEMBL database. Based on this dataset, we developed and optimized a collection of baseline models by using a comprehensive set of well-known molecular descriptors and machine learning algorithms. Then, these baseline models were utilized for creating probabilistic features. Finally, these probabilistic features were combined and used for the construction of a meta-model based on a one-dimensional convolutional neural network. Experimental results indicated that DeepAR is a more accurate and stable approach for identifying AR antagonists in terms of the independent test dataset, by achieving an accuracy of 0.911 and MCC of 0.823. 
In addition, our proposed framework is able to provide feature importance information by leveraging a popular computational approach, named SHapley Additive exPlanations (SHAP). In the meanwhile, the characterization and analysis of potential AR antagonist candidates were achieved through the SHAP waterfall plot and molecular docking. The analysis inferred that N-heterocyclic moieties, halogenated substituents, and a cyano functional group were significant determinants of potential AR antagonists. Lastly, we implemented an online web server by using DeepAR (at http://pmlabstack.pythonanywhere.com/DeepAR). We anticipate that DeepAR could be a useful computational tool for community-wide facilitation of AR candidates from a large number of uncharacterized compounds.}, + issn={1758-2946}, + doi={10.1186/s13321-023-00721-z}, + url={https://doi.org/10.1186/s13321-023-00721-z} } + + @article{qm9, title = {Quantum chemistry structures and properties of 134 kilo molecules}, volume = {1}, @@ -506,12 +532,12 @@ @article{qm9 number = {1}, journal = {Scientific Data}, publisher = {Springer Science and Business Media LLC}, - author = {Ramakrishnan, Raghunathan and Dral, Pavlo O. and Rupp, Matthias and von Lilienfeld, O. Anatole}, + author = {Ramakrishnan, Raghunathan and Dral, Pavlo O. and Rupp, Matthias and {\noop{Lilienfeld}} von Lilienfeld, O. Anatole}, year = {2014}, month = aug } -@misc{moleculenet, +@article{moleculenet, title={MoleculeNet: A Benchmark for Molecular Machine Learning}, author={Zhenqin Wu and Bharath Ramsundar and Evan N. Feinberg and Joseph Gomes and Caleb Geniesse and Aneesh S. Pappu and Karl Leswing and Vijay Pande}, year={2018}, @@ -530,7 +556,8 @@ @article{astartes volume = {8}, number = {91}, pages = {5996}, - author = {Jackson W. Burns and Kevin A. Spiekermann and Himaghna Bhattacharjee and Dionisios G. Vlachos and William H. 
Green}, title = {Machine Learning Validation via Rational Dataset Sampling with astartes}, + author = {Jackson W. Burns and Kevin A. Spiekermann and Himaghna Bhattacharjee and Dionisios G. Vlachos and William H. Green}, + title = {Machine Learning Validation via Rational Dataset Sampling with astartes}, journal = {Journal of Open Source Software} } @@ -557,7 +584,7 @@ @article{qm8 number = {8}, journal = {The Journal of Chemical Physics}, publisher = {AIP Publishing}, - author = {Ramakrishnan, Raghunathan and Hartmann, Mia and Tapavicza, Enrico and von Lilienfeld, O. Anatole}, + author = {Ramakrishnan, Raghunathan and Hartmann, Mia and Tapavicza, Enrico and {\noop{Lilienfeld}} von Lilienfeld, O. Anatole}, year = {2015}, month = aug } @@ -670,7 +697,7 @@ @article{tdc doi = {10.48550/arXiv.2102.09548}, } -@misc{pgp_best, +@article{pgp_best, doi = {10.48550/ARXIV.2310.00174}, url = {https://arxiv.org/abs/2310.00174}, author = {Notwell, James H. and Wood, Michael W.}, @@ -682,3 +709,18 @@ @misc{pgp_best doi = {10.48550/arXiv.2310.00174}, url = {https://doi.org/10.48550/arXiv.2310.00174} } + +@article{Coley2017, + title = {Prediction of Organic Reaction Outcomes Using Machine Learning}, + volume = {3}, + ISSN = {2374-7951}, + url = {http://dx.doi.org/10.1021/acscentsci.7b00064}, + DOI = {10.1021/acscentsci.7b00064}, + number = {5}, + journal = {ACS Central Science}, + publisher = {American Chemical Society (ACS)}, + author = {Coley, Connor W. and Barzilay, Regina and Jaakkola, Tommi S. and Green, William H. and Jensen, Klavs F.}, + year = {2017}, + month = apr, + pages = {434--443} } diff --git a/paper/paper.md b/paper/paper.md index aa50d3c..5b1a0b7 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -41,15 +41,16 @@ note: | # Abstract -Quantitative Structure-Property/Activity Relationship studies, often referred to interchangeably as QS(P/A)R, seek to establish a mapping between molecular structure and an arbitrary Quantity of Interest (QOI). 
-Since its inception this was done on a QOI-by-QOI basis with new descriptors being devised by researchers to _specifically_ map to their QOI. -This continued for years and culminated in packages like DRAGON (later E-dragon), PaDEL-descriptor (and padelpy), Mordred, and many others. -The sheer number of different packages resulted in the creation of 'meta-packages' which served only to aggregate these other calculators, including tools like molfeat, ChemDes, Parameter Client, and AIMSim. - -Generalizable descriptor-based modeling was a natural evolution of these meta-packages' development, though it historically focused almost exclusively on linear methods. -Efforts to incorporate Deep Learning (DL) as a regression technique (Deep-QSPR), which would be capable of capturing the non-linear behavior of arbitrary QOIs, have instead focused on using molecular fingerprints as inputs. -The combination of bulk molecular-level descriptors with DL has remained largely unexplored, in significant part due to the orthogonal domain expertise required to combine the two. -Generalizable QSPR has turned to learned representations primarily via message passing graph neural networks. +Quantitative Structure-Property Relationship studies (QSPR), often referred to interchangeably as QSAR, seek to establish a mapping between molecular structure and an arbitrary Quantity of Interest (QOI). +Historically this was done on a QOI-by-QOI basis with new descriptors being devised by researchers to _specifically_ map to their QOI. +A large number of descriptors have been invented, and can be computed using packages like DRAGON (later E-dragon), PaDEL-descriptor (and padelpy), Mordred, CODESSA, and many others. +The sheer number of different descriptor packages resulted in the creation of 'meta-packages' which served only to aggregate these other calculators, including tools like molfeat, ChemDes, Parameter Client, and AIMSim. 
+ +Generalizable descriptor-based modeling was a natural evolution of these meta-packages' development. +Historically QSPR researchers focused almost exclusively on linear methods. +Another community of researchers focused on finding nonlinear correlations between molecular structures and a QOI, often using Deep learning (DL). +The DL community typically used molecular fingerprints instead of the complex descriptors popular in QSPR community. +Recently the DL community has turned to learned representations primarily via message passing graph neural networks. This approach has proved remarkably effective but is not without drawbacks. Learning a representation requires large datasets to avoid over-fitting or even learn at all, loses interpretability since an embedding's meaning can only be induced retroactively, and needs significant execution time given the complexity of the underlying message passing algorithm. This paper introduces `fastprop`, a software package and general Deep-QSPR framework that combines a cogent set of molecular descriptors with DL to achieve state-of-the-art performance on datasets ranging from tens to tens of thousands of molecules. @@ -85,10 +86,10 @@ As explained by Muratov et. al [@muratov_qsar] QSPR uses linear methods (some of The over-reliance on this category of approaches may be due to priorities; domain experts seek interpretability in their work, especially given that the inputs are physically meaningful descriptors, and linear methods lend themselves well to this approach. Practice may also have been a limitation, since historically training and deploying neural networks required more computer science expertise than linear methods. -All of this is not to say that DL has _never_ been applied to QSPR. +All of this is not to say that Deep Learning (DL) has _never_ been applied to QSPR. Applications of DL to QSPR, i.e. 
DeepQSPR, were attempted throughout this time period but focused on the use of molecular fingerprints rather than descriptors. This may be at least partially attributed to knowledge overlap between deep learning experts and this sub-class of descriptors. -Molecular fingerprints are bit vectors which encode the presence or absence of human-chosen sub-structures in an analogous manner to the "bag of words" featurization strategy common to natural language processing. +Molecular fingerprints are bit vectors which encode the presence or absence of sub-structures in an analogous manner to the "bag of words" featurization strategy common to natural language processing. Experts have bridged this gap to open this subdomain and proved its effectiveness. In Ma and coauthors' review of DL for QSPR [@ma_deep_qsar], for example, it is claimed that DL with fingerprint descriptors is more effective than with molecular-level descriptors. They also demonstrate that DL outperforms or at least matches classical machine learning methods across a number of ADME-related datasets. @@ -99,12 +100,12 @@ Beyond the domains of chemistry where many of the descriptors had been originall As interest began to shift toward the prediction of molecular properties which were themselves descriptors (i.e. derived from quantum mechanics simulations) - to which none of the devised molecular descriptors were designed to be correlated - learned representations (LRs) emerged. ## Shift to Learned Representations -The exact timing of the transition from descriptors (molecular-level or fingerprints) to LRs is difficult to ascertain. +The exact timing of the transition from fixed descriptors (molecular-level or fingerprints) to LRs is difficult to ascertain [@Coley2017]. Among the most cited at least is the work of Yang and coworkers in 2019 [@chemprop_theory] which laid the groundwork for applying LRs to "Property Prediction" - QSPR by another name. 
In short, the basic idea is to initialize a molecular graph with only information about its bonds and atoms such as order, aromaticity, atomic number, etc. Then via a Message Passing Neural Network (MPNN) architecture, which is able to aggregate these atom- and bond-level features into a vector in a manner which can be updated, the 'best' representation of the molecule is found during training. This method proved highly accurate _and_ achieved the generalizability apparently lacking in descriptor-based modeling. -The corresponding software package Chemprop (later described in [@chemprop_software]) has become a _de facto_ standard for property prediction, partially because of the significant development and maintenance effort surrounding the software itself. +The modern version of the corresponding software package Chemprop (described in [@chemprop_software]) has become a _de facto_ standard for property prediction, partially because of the significant development and maintenance effort supporting that open source software project. Following the initial success of Chemprop numerous representation learning frameworks have been devised, all of which slightly improve performance. The Communicative-MPNN (CMPNN) framework is a modified version of Chemprop with a different message passing scheme to increase the interactions between node and edge features [@cmpnn]. @@ -117,30 +118,31 @@ Despite the continuous incremental performance improvements, this area of resear A thru-theme in these frameworks is the increasing complexity of DL techniques and consequent un-interpretability. This also means that actually _using_ these methods to do research on real-world dataset requires varying amounts of DL expertise, creating a rift between domain experts and these methods. Perhaps the most significant failure is the inability to achieve good predictions on small [^1] datasets. 
-This is a long-standing limitation, with the original Chemprop paper stating that datasets with fewer than 1000 entries see fingerprint-based linear on par with Chemprop [@chemprop_theory]. +This is a long-standing limitation, with the original Chemprop paper stating that linear models are about on par with Chemprop for datasets with fewer than 1000 entries [@chemprop_theory]. This limitation is especially challenging because it is a _fundamental_ drawback of the LR approach. Without the use of advanced DL techniques like pre-training or transfer learning, the model is essentially starting from near-zero information every time a model is created. -This inherently requires larger datasets to allow the model to effectively 're-learn' the chemical intuition which was built in to descriptor- and fingerprint-based representations. +This inherently requires larger datasets to allow the model to effectively 're-learn' the chemical intuition which was built in to descriptor- and fixed fingerprint-based representations. Efforts are of course underway to address this limitation, though none are broadly successful. One simple but incredibly computationally expensive approach is to use delta learning, which artificially increases dataset size by generating all possible _pairs_ of molecules from the available data (thus squaring the size of the dataset). -This was attempted by Nalini et al. [@deepdelta], who use an unmodified version of Chemprop referred to as 'DeepDelta' to predict _differences_ in molecular properties for _pairs_ of molecules. +This was attempted by Nalini et al. [@deepdelta], who used an unmodified version of Chemprop referred to as 'DeepDelta' to predict _differences_ in molecular properties for _pairs_ of molecules. They achieve increased performance over standard LR approaches but _lost_ the ability to train on large datasets due to simple runtime limitations. 
-Other increasingly complex approaches are discussed in the outstanding review by van Tilborg et al. [@low_data_review], though such techniques are furthering the consequences of complexity mentioned above. +Other increasingly complex approaches are discussed in the outstanding review by van Tilborg et al. [@low_data_review]. While iterations on LRs and novel approaches to low-data regimes have been in development, the classical QSPR community has continued their work. -A turning point in this domain was the release of `mordred`, a fast and well-developed package capable of calculating more than 1600 molecular descriptors. +A turning point in this domain was the release of `mordred`, a fast and well-developed package capable of calculating more than 1600 molecular descriptors [@mordred]. Critically this package was fully open source and written in Python, allowing it to readily interoperate with the world-class Python DL software ecosystem that greatly benefitted the LR community. -Now despite previous evidence that molecular descriptors _cannot_ achieve generalizable QSPR in combination with DL, the opposite is shown with `fastprop`. +Despite previous claims that molecular descriptors _cannot_ achieve generalizable QSPR in combination with DL, the opposite is shown here with `fastprop`. [^1]: What constitutes a 'small' dataset is decidedly _not_ agreed upon by researchers. -For the purposes of this study, it will be used to refer to datasets with ~1000 samples or less, which the authors believe better reflects the size of real-world datasets. +For the purposes of this study, it will be used to refer to datasets with ~1000 molecules or fewer, which the authors believe better reflects the size of real-world datasets. 
# Implementation -At its core the `fastprop` 'architecture' is simply the `mordred` molecular descriptor calculator [^2] [@mordred] connected to a Feedforward Neural Network (FNN) implemented in PyTorch Lightning [@lightning] (Figure \ref{logo}) - an existing approach formalized into an easy to use, reliable, and correct implementation. -The user simply specifies a set of SMILES [@smiles], a linear textual encoding of molecules, and their corresponding properties. -`fastprop` automatically calculates and caches the corresponding molecular descriptors with `mordred`, re-scales both the descriptors and the targets appropriately, and then trains an FNN with to predict the indicated target properties. +At its core the `fastprop` 'architecture' is simply the `mordred` molecular descriptor calculator [^2] [@mordred] connected to a Feedforward Neural Network (FNN) implemented in PyTorch Lightning [@lightning] (Figure \ref{logo}) - an existing approach formalized into an easy-to-use, reliable, and correct implementation. +`fastprop` is highly modular for seamless integration into existing workflows and includes end-to-end interfaces for general use. +In the latter mode the user simply specifies a set of SMILES [@smiles], a linear textual encoding of molecules, and their corresponding properties. +`fastprop` automatically calculates and caches the corresponding molecular descriptors with `mordred`, re-scales both the descriptors and the targets appropriately, and then trains an FNN to predict the indicated target properties. By default this FNN is two hidden layers with 1800 neurons each connected by ReLU activation functions, though the configuration can be readily changed via the command line interface or configuration file. 
`fastprop` owes its success to the cogent set of descriptors assembled by the developers of `mordred`, the ease of training FNNs with modern software like PyTorch Lightning, and the careful application of Research Software Engineering best practices that make it as user friendly as the best-maintained alternatives. @@ -201,19 +203,19 @@ test_size: 0.1 sampler: random ``` -Training, prediction, and feature importance and then readily accessible via the commands `fastprop train`, `fastprop predict`, or `fastprop shap`, respectively. +Training, prediction, and feature importance are then readily accessible via the commands `fastprop train`, `fastprop predict`, or `fastprop shap`, respectively. The `fastprop` GitHub repository contains a Jupyter notebook runnable from the browser via Google colab which allows users to actually execute the above example, which is also discussed at length in the [PAHs section](#pah), as well as further details about each configurable option. # Results & Discussion There are a number of established molecular property prediction benchmarks commonly used in LR studies, especially those standardized by MoleculeNet [@moleculenet]. Principal among them are QM8 [@qm8] and QM9 [@qm9], often regarded as _the_ standard benchmark for property prediction. -These are important benchmarks and QM9 is included for completeness, though the enormous size and rich coverage of chemical space (inherent in the design of the combinatorial datasets) means that nearly all model architectures are highly accurate, including `fastprop`. +These are important benchmarks and QM9 is included for completeness, though the enormous size and rich coverage of chemical space in the QM9 dataset means that nearly all model architectures are highly accurate, including `fastprop`. -Real world datasets, particularly those common in QSPR studies, often number in the hundreds. 
+Real world experimental datasets, particularly those common in QSPR studies, often number in the hundreds. To demonstrate the applicability of `fastprop` to these regimes, many smaller datasets are selected including some from the QSPR literature that are not established benchmarks. These studies relied on more complex and slow modeling techniques ([ARA](#ara)) or the design of a bespoke descriptor ([PAHs](#pah)) and have not yet come to rely on learned representations as a go-to tool. In these data-limited regimes where LRs sometimes struggle, `fastprop` and its intuition-loaded initialization are highly powerful. -To emphasize this point further, the benchmarks are presented in order of size, descending. +To emphasize this point further, the benchmarks are presented in order of dataset size, descending. Two additional benchmarks showing the limitations of `fastprop` are included after the main group of benchmarks: Fubrain and QuantumScents. The former demonstrates how `fastprop` can outperform LRs but still trail approaches like delta learning. @@ -254,6 +256,7 @@ Those presented here are summarized below, first for regression: - Mean Absolute Percentage Error (MAPE): MAE except that differences are relative (i.e. divided by the ground truth); scale-independent, range 0 (best) and up. - Weighted Mean Absolute Percentage Error (WMAPE): MAPE except the average is a weighted average, where the weight is the magnitude of the ground truth; scale-independent, range 0 (best) and up. - Coefficient of Determination (R2): Proportion of variance explained; scale-independent, range 0 (worst) to 1 (best). + and classification: - Area Under the Receiver Operating Curve (AUROC, AUC, or ROC-AUC): Summary statistic combining all possible classification errors; scale-independent, range 0.5 (worst, random guessing) to 1.0 (perfect classifier). 
- Accuracy: Fraction of correct classifications, expressed as either a percentage or a number; scale-independent, range 0 (worst) to 1 (perfect classifier). @@ -262,7 +265,7 @@ and classification: See Table \ref{results_table} for a summary of all the results. Subsequent sections explore each in greater detail. -Table: Summary of benchmark results. \label{results_table} +Table: Summary of benchmark results, best state-of-the-art method vs. `fastprop` and Chemprop. \label{results_table} +---------------+--------------------+-------------+--------------+------------+-------------------------+------+ | Benchmark | Samples (k) | Metric | SOTA | `fastprop` | Chemprop | p | @@ -287,7 +290,7 @@ Values are only shown for results generated in this study which are known to be Only the results for Flash and PAH are statistically significant at 95% confidence (p<0.05). ### QM9 -Originally described in Scientific Data [@qm9] and perhaps the most established property prediction benchmark, Quantum Machine 9 (QM9) provides quantum mechanics derived descriptors for all small molecules containing one to nine heavy atoms, totaling ~134k. +Originally described in Scientific Data [@qm9] and perhaps the most established property prediction benchmark, Quantum Machine 9 (QM9) provides quantum mechanics derived descriptors for many small molecules containing one to nine heavy atoms, totaling ~134k. The data was retrieved from MoleculeNet [@moleculenet] in a readily usable format. As a point of comparison, performance metrics are retrieved from the paper presenting the UniMol architecture [@unimol] previously mentioned. In that study they trained on only three especially difficult QOIs (homo, lumo, and gap) using scaffold-based splitting (a more challenging alternative to random splitting), reporting mean and standard deviation across 3 repetitions. @@ -379,7 +382,7 @@ test_gap_root_mean_squared_error_loss 3.0 1.556471e-02 8. 
### Pgp First reported in 2011 by Broccatelli and coworkers [@pgp], this dataset has since become a standard benchmark and is included in the Therapeutic Data Commons (TDC) [@tdc] model benchmarking suite. -the dataset maps approximately 1.2k small molecule drugs to a binary label indicating if they inhibit P-glycoprotein (Pgp). +The dataset maps approximately 1.2k small molecule drugs to a binary label indicating if they inhibit P-glycoprotein (Pgp). TDC serves this data through a Python package, but due to installation issues the data was retrieved from the original study instead. The recommended splitting approach is a 70/10/20 scaffold-based split which is done here with 4 replicates. @@ -387,7 +390,7 @@ The model in the original study uses a molecular interaction field but has since According to TDC the current leader [@pgp_best] on this benchmark has achieved an AUROC of 0.938 $\pm$ 0.002 [^3]. On the same leaderboard Chemprop [@chemprop_theory] achieves 0.886 $\pm$ 0.016 with the inclusion of additional molecular features. `fastprop` yet again approaches the performance of the leading methods and outperforms Chemprop, here with an AUROC of 0.903 $\pm$ 0.033 and an accuracy of 83.6 $\pm$ 4.6%. -Remarkably, the linear model outperforms both Chemprop and `fastprop`, approaching the performance of the current leader with an AUROC of 0.917 $\pm$ 0.016 and an accuracy of 83.8 $\pm$ 3.9%. +Remarkably, the linear QSPR model outperforms both Chemprop and `fastprop`, approaching the performance of the current leader with an AUROC of 0.917 $\pm$ 0.016 and an accuracy of 83.8 $\pm$ 3.9%.