Skip to content

Commit

Permalink
completed draft up to regression benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
JacksonBurns committed Feb 14, 2024
1 parent eed6115 commit 89f7966
Show file tree
Hide file tree
Showing 2 changed files with 368 additions and 117 deletions.
205 changes: 204 additions & 1 deletion paper/paper.bib
Original file line number Diff line number Diff line change
@@ -1,3 +1,39 @@
% Perspective on QSAR modelling and its use outside traditional QSAR boundaries.
% The numeric issue is stored in `number`, the page range uses a double hyphen,
% and the acronym in the title is braced so sentence-casing styles keep it.
@article{muratov_qsar,
  author    = {Muratov, Eugene N. and Bajorath, Jürgen and Sheridan, Robert P. and Tetko, Igor V. and Filimonov, Dmitry and Poroikov, Vladimir and Oprea, Tudor I. and Baskin, Igor I. and Varnek, Alexandre and Roitberg, Adrian and Isayev, Olexandr and Curtarolo, Stefano and Fourches, Denis and Cohen, Yoram and Aspuru-Guzik, Alan and Winkler, David A. and Agrafiotis, Dimitris and Cherkasov, Artem and Tropsha, Alexander},
  title     = {{QSAR} without borders},
  journal   = {Chemical Society Reviews},
  year      = {2020},
  volume    = {49},
  number    = {11},
  pages     = {3525--3564},
  publisher = {The Royal Society of Chemistry},
  doi       = {10.1039/D0CS00098A},
  url       = {https://doi.org/10.1039/D0CS00098A},
  abstract  = {Prediction of chemical bioactivity and physical properties has been one of the most important applications of statistical and more recently{,} machine learning and artificial intelligence methods in chemical sciences. This field of research{,} broadly known as quantitative structure–activity relationships (QSAR) modeling{,} has developed many important algorithms and has found a broad range of applications in physical organic and medicinal chemistry in the past 55+ years. This Perspective summarizes recent technological advances in QSAR modeling but it also highlights the applicability of algorithms{,} modeling methods{,} and validation practices developed in QSAR to a wide range of research areas outside of traditional QSAR boundaries including synthesis planning{,} nanotechnology{,} materials science{,} biomaterials{,} and clinical informatics. As modern research methods generate rapidly increasing amounts of data{,} the knowledge of robust data-driven modelling methods professed within the QSAR field can become essential for scientists working both within and outside of chemical research. We hope that this contribution highlighting the generalizable components of QSAR modeling will serve to address this challenge.}
}

% Review of small-molecule representations in drug discovery.
% The acronym in the title is braced so sentence-casing styles do not lowercase it.
@article{representation_review,
  author    = {McGibbon, Miles and Shave, Steven and Dong, Jie and Gao, Yumiao and Houston, Douglas R. and Xie, Jiancong and Yang, Yuedong and Schwaller, Philippe and Blay, Vincent},
  title     = {From intuition to {AI}: evolution of small molecule representations in drug discovery},
  journal   = {Briefings in Bioinformatics},
  volume    = {25},
  number    = {1},
  year      = {2023},
  month     = nov,
  publisher = {Oxford University Press (OUP)},
  issn      = {1477-4054},
  doi       = {10.1093/bib/bbad422},
  url       = {https://doi.org/10.1093/bib/bbad422}
}

% Original atom-bond connectivity (ABC) index paper.
% An article entry requires `journal`, which was missing.
% NOTE(review): journal, volume and pages are filled in from the commonly cited
% reference for this work; confirm against the NOPR record linked in `url`.
@article{estrada_abc,
  author    = {Estrada, Ernesto and Torres, Luis and Rodriguez, Lissette and Gutman, Ivan},
  title     = {An atom-bond connectivity index: modelling the enthalpy of formation of alkanes},
  journal   = {Indian Journal of Chemistry, Section A},
  volume    = {37A},
  pages     = {849--855},
  year      = {1998},
  publisher = {NISCAIR-CSIR, India},
  url       = {http://nopr.niscpr.res.in/handle/123456789/40308}
}

@article{quantumscents,
author = {Burns, Jackson W. and Rogers, David M.},
title = {QuantumScents: Quantum-Mechanical Properties for 3.5k Olfactory Molecules},
Expand All @@ -16,6 +52,60 @@ @article{quantumscents
}
}

% Wiener's 1947 paper relating paraffin structure to boiling point.
% The DOI link is kept once, in `url`, without padding whitespace; `eprint` is
% reserved for preprint identifiers and previously held a duplicate of that link.
@article{wiener_index,
  author  = {Wiener, Harry},
  title   = {Structural Determination of Paraffin Boiling Points},
  journal = {Journal of the American Chemical Society},
  volume  = {69},
  number  = {1},
  pages   = {17--20},
  year    = {1947},
  doi     = {10.1021/ja01193a005},
  url     = {https://doi.org/10.1021/ja01193a005},
  note    = {PMID: 20291038}
}

% Contrastive pretraining across multiple molecular featurizations.
% The publisher export carried placeholder values (volume 0, number 0,
% pages null, year 0); these would be typeset literally, so they are not kept.
% NOTE(review): year is set to 2024 from the publisher record; add the final
% volume, issue and page range once confirmed.
@article{moco,
  author  = {Zhu, Yanqiao and Chen, Dingshuo and Du, Yuanqi and Wang, Yingze and Liu, Qiang and Wu, Shu},
  title   = {Molecular Contrastive Pretraining with Collaborative Featurizations},
  journal = {Journal of Chemical Information and Modeling},
  year    = {2024},
  doi     = {10.1021/acs.jcim.3c01468},
  url     = {https://doi.org/10.1021/acs.jcim.3c01468},
  note    = {PMID: 38315002}
}

% ChemRxiv preprint on deep learning for low-data drug discovery.
@article{low_data_review,
  author  = {van Tilborg, Derek and Brinkmann, Helena and Criscuolo, Emanuele and Rossen, Luke and Özçelik, Riza and Grisoni, Francesca},
  title   = {Deep learning for low-data drug discovery: hurdles and opportunities},
  journal = {ChemRxiv},
  year    = {2024},
  doi     = {10.26434/chemrxiv-2024-w0wvl}
}

@inproceedings{unimol,
title={Uni-Mol: A Universal 3D Molecular Representation Learning Framework},
author={Gengmo Zhou and Zhifeng Gao and Qiankun Ding and Hang Zheng and Hongteng Xu and Zhewei Wei and Linfeng Zhang and Guolin Ke},
Expand All @@ -26,6 +116,22 @@ @inproceedings{unimol
url={https://doi.org/10.26434/chemrxiv-2022-jjm0j}
}

% arXiv preprint; the identifier is carried by the eprint fields.
@misc{sggrl,
  author        = {Zeyu Wang and Tianyi Jiang and Jinhuan Wang and Qi Xuan},
  title         = {Multi-Modal Representation Learning for Molecular Property Prediction: Sequence, Graph, Geometry},
  year          = {2024},
  archivePrefix = {arXiv},
  eprint        = {2401.03369},
  primaryClass  = {q-bio.MN}
}

% arXiv preprint. The arXiv identifier belongs in the eprint fields rather than
% in `journal`, matching how the other arXiv entries in this file are stored.
@misc{gslmpp,
  author        = {Zhao, Bangyi and Xu, Weixia and Guan, Jihong and Zhou, Shuigeng},
  title         = {Molecular Property Prediction Based on Graph Structure Learning},
  year          = {2023},
  eprint        = {2312.16855},
  archivePrefix = {arXiv},
  url           = {https://doi.org/10.48550/arXiv.2312.16855}
}

@inproceedings{cmpnn,
author = {Song, Ying and Zheng, Shuangjia and Niu, Zhangming and Fu, Zhang-Hua and Lu, Yutong and Yang, Yuedong},
title = {Communicative representation learning on attributed molecular graphs},
Expand Down Expand Up @@ -91,6 +197,20 @@ @article{fuels_qsar_method
abstract = {Machine learning has proven to be a powerful tool for accelerating biofuel development. Although numerous models are available to predict a range of properties using chemical descriptors, there is a trade-off between interpretability and performance. Neural networks provide predictive models with high accuracy at the expense of some interpretability, while simpler models such as linear regression often lack in accuracy. In addition to model architecture, feature selection is also critical for developing interpretable and accurate predictive models. We present a method for systematically selecting molecular descriptor features and developing interpretable machine learning models without sacrificing accuracy. Our method simplifies the process of selecting features by reducing feature multicollinearity and enables discoveries of new relationships between global properties and molecular descriptors. To demonstrate our approach, we developed models for predicting melting point, boiling point, flash point, yield sooting index, and net heat of combustion with the help of the Tree-based Pipeline Optimization Tool (TPOT). For training, we used publicly available experimental data for up to 8351 molecules. Our models accurately predict various molecular properties for organic molecules (mean absolute percent error (MAPE) ranges from 3.3% to 10.5%) and provide a set of features that are well-correlated to the property. This method enables researchers to explore sets of features that significantly contribute to the prediction of the property, offering new scientific insights. To help accelerate early stage biofuel research and development, we also integrated the data and models into a open-source, interactive web tool.}
}

% Uncertainty quantification study for a deep learning fuel property model.
% `doi` holds the bare identifier; styles prepend the resolver themselves, so a
% full link in this field renders as a doubled, broken URL.
@article{yalamanchi,
  author   = {Kiran K. Yalamanchi and Sahil Kommalapati and Pinaki Pal and Nursulu Kuzhagaliyeva and Abdullah S AlRamadan and Balaji Mohan and Yuanjiang Pei and S. Mani Sarathy and Emre Cenker and Jihad Badra},
  title    = {Uncertainty quantification of a deep learning fuel property prediction model},
  journal  = {Applications in Energy and Combustion Science},
  volume   = {16},
  pages    = {100211},
  year     = {2023},
  issn     = {2666-352X},
  doi      = {10.1016/j.jaecs.2023.100211},
  url      = {https://www.sciencedirect.com/science/article/pii/S2666352X23001000},
  keywords = {Fuel property prediction, Deep learning, Uncertainty quantification, Monte Carlo ensemble methods, Bayesian neural network, Epistemic uncertainty, Aleatoric uncertainty},
  abstract = {Deep learning models are being widely used in the field of combustion. Given the black-box nature of typical neural network based models, uncertainty quantification (UQ) is critical to ensure the reliability of predictions as well as the training datasets, and for a principled quantification of noise and its various sources. Deep learning surrogate models for predicting properties of chemical compounds and mixtures have been recently shown to be promising for enabling data-driven fuel design and optimization, with the ultimate goal of improving efficiency and lowering emissions from combustion engines. In this study, UQ is performed for a multi-task deep learning model that simultaneously predicts the research octane number (RON), Motor Octane Number (MON), and Yield Sooting Index (YSI) of pure components and multicomponent blends. The deep learning model is comprised of three smaller networks: Extractor 1, Extractor 2, and Predictor, and a mixing operator. The molecular fingerprints of individual components are encoded via Extractor 1 and Extractor 2, the mixing operator generates fingerprints for mixtures/blends based on linear mixing operation, and the predictor maps the fingerprint to the target properties. Two different classes of UQ methods, Monte Carlo ensemble methods and Bayesian neural networks (BNNs), are employed for quantifying the epistemic uncertainty. Combinations of Bernoulli and Gaussian distributions with DropConnect and DropOut techniques are explored as ensemble methods. All the DropConnect, DropOut and Bayesian layers are applied to the predictor network. Aleatoric uncertainty is modeled by assuming that each data point has an independent uncertainty associated with it. The results of the UQ study are further analyzed to compare the performance of BNN and ensemble methods. 
Although this study is confined to UQ of fuel property prediction, the methodologies are applicable to other deep learning frameworks that are being widely used in the combustion community.}
}

@article{fubrain,
author = {Esaki, Tsuyoshi and Ohashi, Rikiya and Watanabe, Reiko and Natsume-Kitatani, Yayoi and Kawashima, Hitoshi and Nagao, Chioko and Mizuguchi, Kenji},
title = {Computational Model To Predict the Fraction of Unbound Drug in the Brain},
Expand All @@ -111,7 +231,7 @@ @article{fubrain

@article{ma_deep_qsar,
author = {Ma, Junshui and Sheridan, Robert P. and Liaw, Andy and Dahl, George E. and Svetnik, Vladimir},
title = {Deep Neural Nets as a Method for Quantitative StructureActivity Relationships},
title = {Deep Neural Nets as a Method for Quantitative Structure-Activity Relationships},
journal = {Journal of Chemical Information and Modeling},
volume = {55},
number = {2},
Expand All @@ -138,6 +258,74 @@ @misc{mhnn
url={https://doi.org/10.48550/arXiv.2312.13136}
}

% Multitask machine learning for critical properties and acentric factors.
% The DOI link is kept once, in `url`, without padding whitespace; `eprint` is
% reserved for preprint identifiers and previously held a duplicate of that link.
@article{biswas_critical,
  author  = {Biswas, Sayandeep and Chung, Yunsie and Ramirez, Josephine and Wu, Haoyang and Green, William H.},
  title   = {Predicting Critical Properties and Acentric Factors of Fluids Using Multitask Machine Learning},
  journal = {Journal of Chemical Information and Modeling},
  volume  = {63},
  number  = {15},
  pages   = {4574--4588},
  year    = {2023},
  doi     = {10.1021/acs.jcim.3c00546},
  url     = {https://doi.org/10.1021/acs.jcim.3c00546},
  note    = {PMID: 37487557}
}

% Mordred molecular descriptor calculator.
% The journal uses article numbers instead of page ranges, so `pages` holds
% the article number. The tool name is braced to survive title recasing.
@article{mordred,
  author    = {Moriwaki, Hirotomo and Tian, Yu-Shi and Kawashita, Norihito and Takagi, Tatsuya},
  title     = {{Mordred}: a molecular descriptor calculator},
  journal   = {Journal of Cheminformatics},
  volume    = {10},
  number    = {1},
  pages     = {4},
  year      = {2018},
  month     = feb,
  publisher = {Springer Science and Business Media LLC},
  issn      = {1758-2946},
  doi       = {10.1186/s13321-018-0258-y},
  url       = {https://doi.org/10.1186/s13321-018-0258-y}
}

% Weininger's original SMILES paper.
% The acronym is braced so sentence-casing styles do not turn it into "Smiles";
% the page range uses a double hyphen so it typesets as an en dash.
@article{smiles,
  author    = {Weininger, David},
  title     = {{SMILES}, a chemical language and information system. 1. Introduction to methodology and encoding rules},
  journal   = {Journal of Chemical Information and Computer Sciences},
  volume    = {28},
  number    = {1},
  pages     = {31--36},
  year      = {1988},
  month     = feb,
  publisher = {American Chemical Society (ACS)},
  issn      = {1520-5142},
  doi       = {10.1021/ci00057a005},
  url       = {https://doi.org/10.1021/ci00057a005}
}

% PyTorch Lightning software citation. The team name is wrapped in an extra
% brace pair so it is parsed as a single corporate author.
@software{lightning,
  title   = {{PyTorch Lightning}},
  author  = {Falcon, William and {The PyTorch Lightning team}},
  version = {1.4},
  year    = {2019},
  month   = mar,
  license = {Apache-2.0},
  doi     = {10.5281/zenodo.3828935},
  url     = {https://github.com/Lightning-AI/lightning}
}

@Article{deepdelta,
author={Schaduangrat, Nalini and Anuwongcharoen, Nuttapat and Charoenkwan, Phasit and Shoombuatong, Watshara},
title={DeepAR: a novel deep learning-based hybrid framework for the interpretable prediction of androgen receptor antagonists},
Expand Down Expand Up @@ -185,6 +373,21 @@ @article{ysi
pages = {349-364}
}

% Deep-learning QSPR model for solar-cell energy conversion efficiency.
% The ampersand in the journal name is escaped because a bare one is a LaTeX
% alignment character and breaks compilation of the bibliography.
@article{wu_photovoltaic,
  author    = {Wu, Jinkui and Wang, Shihui and Zhou, Li and Ji, Xu and Dai, Yiyang and Dang, Yagu and Kraft, Markus},
  title     = {Deep-Learning Architecture in {QSPR} Modeling for the Prediction of Energy Conversion Efficiency of Solar Cells},
  journal   = {Industrial \& Engineering Chemistry Research},
  volume    = {59},
  number    = {42},
  pages     = {18991--19000},
  year      = {2020},
  month     = oct,
  publisher = {American Chemical Society (ACS)},
  issn      = {1520-5045},
  doi       = {10.1021/acs.iecr.0c03880},
  url       = {https://doi.org/10.1021/acs.iecr.0c03880}
}

@article{hopv15_subset,
title = {Predicting Power Conversion Efficiency of Organic Photovoltaics: Models and Data Analysis},
volume = {6},
Expand Down
Loading

0 comments on commit 89f7966

Please sign in to comment.