From 5e016245e34a0b1d48050633a99b49fc8ab6c478 Mon Sep 17 00:00:00 2001 From: memgonzales Date: Thu, 21 Nov 2024 01:50:07 +0800 Subject: [PATCH] Address reviewers' comments --- ...fier Building & Evaluation (XGBoost).ipynb | 1025 ----------------- 1 file changed, 1025 deletions(-) delete mode 100644 experiments/5.13. Benchmarking - Classifier Building & Evaluation (XGBoost).ipynb diff --git a/experiments/5.13. Benchmarking - Classifier Building & Evaluation (XGBoost).ipynb b/experiments/5.13. Benchmarking - Classifier Building & Evaluation (XGBoost).ipynb deleted file mode 100644 index 4b66c89..0000000 --- a/experiments/5.13. Benchmarking - Classifier Building & Evaluation (XGBoost).ipynb +++ /dev/null @@ -1,1025 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "864700f6", - "metadata": {}, - "source": [ - "# PHIStruct: Improving phage-host interaction prediction at low sequence similarity settings using structure-aware protein embeddings\n", - "\n", - "Mark Edward M. Gonzales1, 2, Jennifer C. Ureta1, 2, 3 & Anish M.S. Shrestha1, 2\n", - "\n", - "1 Bioinformatics Lab, Advanced Research Institute for Informatics, Computing and Networking, De La Salle University, Manila 1004, Philippines
\n", - "2 Department of Software Technology, College of Computer Studies, De La Salle University, Manila 1004, Philippines
\n", - "3 Walter and Eliza Hall Institute of Medical Research, Melbourne, Victoria, 3052, Australia\n", - "\n", - "✉️ gonzales.markedward@gmail.com, jennifer.ureta@gmail.com, anish.shrestha@dlsu.edu.ph" - ] - }, - { - "cell_type": "markdown", - "id": "e9619d49", - "metadata": {}, - "source": [ - "
" - ] - }, - { - "cell_type": "markdown", - "id": "1eefb76b", - "metadata": {}, - "source": [ - "# 💡 Prerequisites\n", - "\n", - "### Option 1: Download the prerequisite files\n", - "1. Download `consolidated.tar.gz` from this [link](https://drive.google.com/file/d/1yQSXwlb37dm2ZLXGJHdIM5vmrzwPAwvI/view?usp=sharing), and unzip it. This should result in a folder named `consolidated`.
Technically, this notebook only needs `consolidated/rbp_embeddings_saprot_relaxed_r3.csv`.\n", - "1. Create a folder named `inphared` inside `data`, and save the extracted `consolidated` folder inside `data/inphared`. \n", - "1. Download `fasta.tar.gz` from this [link](https://drive.google.com/file/d/1NMFR3JrrrCHLoCMQp2nia4dgtcXs5x05/view?usp=sharing), and unzip it. This should result in a folder named `fasta`.
Technically, this notebook only needs the `.clstr` files inside `fasta`.\n", - "1. Save the extracted `fasta` folder inside `data/inphared`.\n", - "\n", - "### Option 2: Generate the prerequisite files yourself\n", - "1. If you have run `3.0. Data Consolidation (SaProt).ipynb`, then `data/inphared/consolidated` should have already been populated with the prerequisite files.\n", - "1. Consolidate the sequences of the proteins with predicted structures into a single FASTA file.
\n", - " For reproducibility, we provide our consolidated FASTA file [here](https://drive.google.com/file/d/1LTZte1f4lreQ5MXWeM-y2Mtp9z96pXS7/view?usp=sharing).\n", - "1. Generate the protein clusters by running CD-HIT on this FASTA file at a sequence similarity threshold of 100%, following the instructions [here](https://github.com/weizhongli/cdhit). \n", - "1. Rename the resulting `.clstr` file to `complete-struct-100.fasta.clstr` and the resulting FASTA file (containing only the representative sequences) to `complete-struct-100.fasta`. \n", - "1. Generate `complete-struct-80.fasta.clstr`, `complete-struct-60.fasta.clstr`, and `complete-struct-40.fasta.clstr` by running CD-HIT on `complete-struct-100.fasta` at sequence similarity thresholds of 80%, 60%, and 40%, respectively.\n", - "1. Create a folder named `fasta` inside `data/inphared`, and save the four `.clstr` files inside `data/inphared/fasta`.\n", - "\n", - "### Resulting folder structure\n", - "\n", - "`experiments` (parent folder of this notebook)
\n", - "↳ `data`
\n", - "   ↳ `inphared`
\n", - "       ↳ `consolidated`
\n", - "          ↳ `rbp_embeddings_saprot_relaxed_r3.csv`
\n", - "       ↳ `fasta`
\n", - "          ↳ `complete-struct-100.fasta.clstr`
\n", - "          ↳ `complete-struct-80.fasta.clstr`
\n", - "          ↳ `complete-struct-40.fasta.clstr`
\n", - "          ↳ `complete-struct-60.fasta.clstr`
\n", - "↳ `5.0. Classifier Building & Evaluation (XGBoost).ipynb` (this notebook)
" - ] - }, - { - "cell_type": "markdown", - "id": "f3015489", - "metadata": {}, - "source": [ - "
" - ] - }, - { - "cell_type": "markdown", - "id": "c059ce7f", - "metadata": {}, - "source": [ - "# 📁 Output files\n", - "\n", - "The output files (i.e., the results of evaluating the model's performance) — which are saved in `temp/results` — are already included when the repository was cloned.
" - ] - }, - { - "cell_type": "markdown", - "id": "cecf5469", - "metadata": {}, - "source": [ - "
" - ] - }, - { - "cell_type": "markdown", - "id": "f274d083", - "metadata": {}, - "source": [ - "# Part I: Preliminaries" - ] - }, - { - "cell_type": "markdown", - "id": "fc7d25a1", - "metadata": {}, - "source": [ - "Import the necessary libraries and modules." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0840061c", - "metadata": {}, - "outputs": [], - "source": [ - "import math\n", - "import pickle\n", - "import os\n", - "import warnings\n", - "\n", - "import pandas as pd\n", - "import numpy as np\n", - "import sklearn\n", - "\n", - "import ConstantsUtil\n", - "import ClassificationUtil\n", - "\n", - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "98fb65d4", - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option(\"display.max_rows\", None)\n", - "pd.set_option(\"display.max_columns\", 50)\n", - "\n", - "pd.options.mode.chained_assignment = None\n", - "\n", - "with warnings.catch_warnings():\n", - " warnings.filterwarnings(\n", - " \"ignore\", category=sklearn.exceptions.UndefinedMetricWarning\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e27723c6", - "metadata": {}, - "outputs": [], - "source": [ - "constants = ConstantsUtil.ConstantsUtil()\n", - "util = ClassificationUtil.ClassificationUtil()" - ] - }, - { - "cell_type": "markdown", - "id": "0ada3031", - "metadata": {}, - "source": [ - "
" - ] - }, - { - "cell_type": "markdown", - "id": "1f9f7cc0", - "metadata": {}, - "source": [ - "# Part II: Classifier Building and Evaluation" - ] - }, - { - "cell_type": "markdown", - "id": "53d0124d", - "metadata": {}, - "source": [ - "Train an XGBoost classifier, and evaluate its performance at different train-versus-test similarity and confidence thresholds." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "0faa8454", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "*** saprot_relaxed_r3, similarity = 100% ***\n", - "Constructing training and test sets...\n", - "Training set shape: (16942, 1280)\n", - "Test set shape: (2340, 1280)\n", - "Training the model...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Edward\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving evaluation results...\n", - "Confidence threshold k: 0.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9341 0.7658 0.8416 111\n", - " enterobacter 0.3967 0.4138 0.4051 116\n", - " enterococcus 0.8723 0.8039 0.8367 51\n", - " escherichia 0.8694 0.8837 0.8765 1040\n", - " klebsiella 0.8316 0.8677 0.8493 461\n", - " others 0.0000 0.0000 0.0000 51\n", - " pseudomonas 0.8560 0.9407 0.8964 354\n", - "staphylococcus 0.9545 0.9423 0.9484 156\n", - "\n", - " accuracy 0.8432 2340\n", - " macro avg 0.7143 0.7022 0.7067 2340\n", - " weighted avg 0.8264 0.8432 0.8339 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 10.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9438 0.7568 0.8400 111\n", - " enterobacter 0.4018 0.3879 0.3947 116\n", - " enterococcus 0.8723 0.8039 0.8367 51\n", - " escherichia 0.8807 0.8731 0.8769 1040\n", - " klebsiella 0.8483 0.8612 0.8547 461\n", - " others 0.0656 0.0784 0.0714 51\n", - " pseudomonas 0.8757 0.9350 0.9044 354\n", - "staphylococcus 0.9545 0.9423 0.9484 156\n", - "\n", - " accuracy 0.8363 2340\n", - " macro avg 0.7303 0.7048 0.7159 2340\n", - " weighted avg 0.8398 0.8363 0.8373 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 20.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9529 0.7297 0.8265 111\n", - " enterobacter 0.4151 0.3793 0.3964 116\n", - " enterococcus 0.8913 0.8039 0.8454 51\n", - " escherichia 0.8909 0.8635 0.8770 1040\n", - " klebsiella 0.8621 0.8547 0.8584 461\n", - " others 0.0442 0.0980 0.0610 51\n", - " pseudomonas 0.8922 0.9350 0.9131 354\n", - "staphylococcus 0.9545 0.9423 0.9484 156\n", - "\n", - " accuracy 0.8295 2340\n", - " macro avg 0.7379 0.7008 0.7158 2340\n", - " weighted avg 0.8506 0.8295 0.8388 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 30.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9524 0.7207 0.8205 111\n", - " enterobacter 0.4375 0.3621 0.3962 116\n", - " enterococcus 0.8889 0.7843 0.8333 51\n", - " escherichia 0.8959 0.8442 0.8693 1040\n", - " klebsiella 0.8781 0.8438 0.8606 461\n", - " others 0.0398 0.1373 0.0617 51\n", - " pseudomonas 0.9036 0.9266 0.9149 354\n", - "staphylococcus 0.9608 0.9423 0.9515 156\n", - "\n", - " accuracy 0.8167 2340\n", - " macro avg 0.7446 0.6952 0.7135 2340\n", - " weighted avg 0.8590 0.8167 0.8358 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 40.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9512 0.7027 0.8083 111\n", - " enterobacter 0.4659 0.3534 0.4020 116\n", - " enterococcus 0.9302 0.7843 0.8511 51\n", - " escherichia 0.8998 0.8288 0.8629 1040\n", - " klebsiella 0.8951 0.8330 0.8629 461\n", - " others 0.0684 0.3137 0.1123 51\n", - " pseudomonas 0.9157 0.9209 0.9183 354\n", - "staphylococcus 0.9667 0.9295 0.9477 156\n", - "\n", - " accuracy 0.8085 2340\n", - " macro avg 0.7616 0.7083 0.7207 2340\n", - " weighted avg 0.8692 0.8085 0.8349 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 50.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9512 0.7027 0.8083 111\n", - " enterobacter 0.4737 0.3103 0.3750 116\n", - " enterococcus 0.9302 0.7843 0.8511 51\n", - " escherichia 0.9061 0.8077 0.8541 1040\n", - " klebsiella 0.9021 0.8200 0.8591 461\n", - " others 0.0600 0.3529 0.1026 51\n", - " pseudomonas 0.9300 0.9011 0.9154 354\n", - "staphylococcus 0.9667 0.9295 0.9477 156\n", - "\n", - " accuracy 0.7923 2340\n", - " macro avg 0.7650 0.7011 0.7141 2340\n", - " weighted avg 0.8758 0.7923 0.8282 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 60.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9620 0.6847 0.8000 111\n", - " enterobacter 0.5152 0.2931 0.3736 116\n", - " enterococcus 0.9268 0.7451 0.8261 51\n", - " escherichia 0.9120 0.7875 0.8452 1040\n", - " klebsiella 0.9109 0.7983 0.8509 461\n", - " others 0.0654 0.4706 0.1148 51\n", - " pseudomonas 0.9349 0.8927 0.9133 354\n", - "staphylococcus 0.9796 0.9231 0.9505 156\n", - "\n", - " accuracy 0.7774 2340\n", - " macro avg 0.7759 0.6994 0.7093 2340\n", - " weighted avg 0.8843 0.7774 0.8218 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 70.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9733 0.6577 0.7849 111\n", - " enterobacter 0.5254 0.2672 0.3543 116\n", - " enterococcus 0.9268 0.7451 0.8261 51\n", - " escherichia 0.9182 0.7558 0.8291 1040\n", - " klebsiella 0.9158 0.7787 0.8417 461\n", - " others 0.0587 0.5098 0.1053 51\n", - " pseudomonas 0.9451 0.8757 0.9091 354\n", - "staphylococcus 0.9863 0.9231 0.9536 156\n", - "\n", - " accuracy 0.7551 2340\n", - " macro avg 0.7812 0.6891 0.7005 2340\n", - " weighted avg 0.8910 0.7551 0.8105 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 80.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9722 0.6306 0.7650 111\n", - " enterobacter 0.5490 0.2414 0.3353 116\n", - " enterococcus 0.9268 0.7451 0.8261 51\n", - " escherichia 0.9255 0.7048 0.8002 1040\n", - " klebsiella 0.9348 0.7462 0.8299 461\n", - " others 0.0613 0.6667 0.1122 51\n", - " pseudomonas 0.9651 0.8588 0.9088 354\n", - "staphylococcus 0.9863 0.9231 0.9536 156\n", - "\n", - " accuracy 0.7244 2340\n", - " macro avg 0.7901 0.6896 0.6914 2340\n", - " weighted avg 0.9021 0.7244 0.7936 2340\n", - "\n", - "===================\n", - "Confidence threshold k: 90.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 1.0000 0.5676 0.7241 111\n", - " enterobacter 0.5588 0.1638 0.2533 116\n", - " enterococcus 0.9211 0.6863 0.7865 51\n", - " escherichia 0.9335 0.6212 0.7460 1040\n", - " klebsiella 0.9605 0.6855 0.8000 461\n", - " others 0.0507 0.7451 0.0950 51\n", - " pseudomonas 0.9728 0.8079 0.8827 354\n", - "staphylococcus 0.9929 0.8974 0.9428 156\n", - "\n", - " accuracy 0.6594 2340\n", - " macro avg 0.7988 0.6468 0.6538 2340\n", - " weighted avg 0.9138 0.6594 0.7517 2340\n", - "\n", - "===================\n", - "Finished\n", - "===================\n", - "*** saprot_relaxed_r3, similarity = 80% ***\n", - "Constructing training and test sets...\n", - "Training set shape: (14822, 1280)\n", - "Test set shape: (2822, 1280)\n", - "Training the model...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Edward\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving evaluation results...\n", - "Confidence threshold k: 0.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.4933 0.6789 0.5714 109\n", - " enterobacter 0.3214 0.2156 0.2581 167\n", - " enterococcus 0.7826 0.6000 0.6792 60\n", - " escherichia 0.7337 0.7359 0.7348 1344\n", - " klebsiella 0.6939 0.6530 0.6728 559\n", - " others 0.0000 0.0000 0.0000 60\n", - " pseudomonas 0.5183 0.6467 0.5754 351\n", - "staphylococcus 0.7871 0.9244 0.8503 172\n", - "\n", - " accuracy 0.6683 2822\n", - " macro avg 0.5413 0.5568 0.5427 2822\n", - " weighted avg 0.6540 0.6683 0.6584 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 10.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.5221 0.6514 0.5796 109\n", - " enterobacter 0.3478 0.1916 0.2471 167\n", - " enterococcus 0.8182 0.6000 0.6923 60\n", - " escherichia 0.7461 0.7128 0.7291 1344\n", - " klebsiella 0.7166 0.6243 0.6673 559\n", - " others 0.0114 0.0333 0.0169 60\n", - " pseudomonas 0.5369 0.6211 0.5760 351\n", - "staphylococcus 0.8020 0.9186 0.8564 172\n", - "\n", - " accuracy 0.6464 2822\n", - " macro avg 0.5626 0.5441 0.5456 2822\n", - " weighted avg 0.6713 0.6464 0.6553 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 20.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.5420 0.6514 0.5917 109\n", - " enterobacter 0.3718 0.1737 0.2367 167\n", - " enterococcus 0.8000 0.5333 0.6400 60\n", - " escherichia 0.7608 0.6838 0.7202 1344\n", - " klebsiella 0.7451 0.6064 0.6686 559\n", - " others 0.0178 0.1000 0.0302 60\n", - " pseudomonas 0.5605 0.6068 0.5828 351\n", - "staphylococcus 0.8125 0.9070 0.8571 172\n", - "\n", - " accuracy 0.6254 2822\n", - " macro avg 0.5763 0.5328 0.5409 2822\n", - " weighted avg 0.6895 0.6254 0.6513 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 30.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.5476 0.6330 0.5872 109\n", - " enterobacter 0.4386 0.1497 0.2232 167\n", - " enterococcus 0.8205 0.5333 0.6465 60\n", - " escherichia 0.7732 0.6518 0.7073 1344\n", - " klebsiella 0.7658 0.5850 0.6633 559\n", - " others 0.0285 0.2333 0.0508 60\n", - " pseudomonas 0.5762 0.5926 0.5843 351\n", - "staphylococcus 0.8298 0.9070 0.8667 172\n", - "\n", - " accuracy 0.6049 2822\n", - " macro avg 0.5975 0.5357 0.5412 2822\n", - " weighted avg 0.7073 0.6049 0.6445 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 40.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.5630 0.6147 0.5877 109\n", - " enterobacter 0.4423 0.1377 0.2100 167\n", - " enterococcus 0.8333 0.5000 0.6250 60\n", - " escherichia 0.7895 0.6250 0.6977 1344\n", - " klebsiella 0.7839 0.5581 0.6520 559\n", - " others 0.0283 0.3000 0.0517 60\n", - " pseudomonas 0.6114 0.5783 0.5944 351\n", - "staphylococcus 0.8432 0.9070 0.8739 172\n", - "\n", - " accuracy 0.5843 2822\n", - " macro avg 0.6119 0.5276 0.5366 2822\n", - " weighted avg 0.7250 0.5843 0.6382 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 50.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.5946 0.6055 0.6000 109\n", - " enterobacter 0.4889 0.1317 0.2075 167\n", - " enterococcus 0.8529 0.4833 0.6170 60\n", - " escherichia 0.7931 0.5818 0.6712 1344\n", - " klebsiella 0.8152 0.5367 0.6472 559\n", - " others 0.0326 0.4333 0.0606 60\n", - " pseudomonas 0.6482 0.5670 0.6049 351\n", - "staphylococcus 0.8902 0.8953 0.8928 172\n", - "\n", - " accuracy 0.5592 2822\n", - " macro avg 0.6395 0.5293 0.5377 2822\n", - " weighted avg 0.7448 0.5592 0.6274 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 60.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.6263 0.5688 0.5962 109\n", - " enterobacter 0.5238 0.1317 0.2105 167\n", - " enterococcus 0.8667 0.4333 0.5778 60\n", - " escherichia 0.8101 0.5394 0.6476 1344\n", - " klebsiella 0.8323 0.4973 0.6226 559\n", - " others 0.0299 0.4833 0.0564 60\n", - " pseudomonas 0.6561 0.5328 0.5881 351\n", - "staphylococcus 0.9167 0.8953 0.9059 172\n", - "\n", - " accuracy 0.5255 2822\n", - " macro avg 0.6577 0.5103 0.5256 2822\n", - " weighted avg 0.7624 0.5255 0.6091 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 70.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.6552 0.5229 0.5816 109\n", - " enterobacter 0.6364 0.1257 0.2100 167\n", - " enterococcus 0.9286 0.4333 0.5909 60\n", - " escherichia 0.8245 0.4963 0.6196 1344\n", - " klebsiella 0.8449 0.4776 0.6103 559\n", - " others 0.0276 0.5167 0.0524 60\n", - " pseudomonas 0.6947 0.5185 0.5938 351\n", - "staphylococcus 0.9448 0.8953 0.9194 172\n", - "\n", - " accuracy 0.4979 2822\n", - " macro avg 0.6946 0.4983 0.5222 2822\n", - " weighted avg 0.7873 0.4979 0.5944 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 80.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.7200 0.4954 0.5870 109\n", - " enterobacter 0.6207 0.1078 0.1837 167\n", - " enterococcus 0.9286 0.4333 0.5909 60\n", - " escherichia 0.8393 0.4353 0.5732 1344\n", - " klebsiella 0.8520 0.4222 0.5646 559\n", - " others 0.0303 0.6667 0.0579 60\n", - " pseudomonas 0.7342 0.4957 0.5918 351\n", - "staphylococcus 0.9684 0.8895 0.9273 172\n", - "\n", - " accuracy 0.4557 2822\n", - " macro avg 0.7117 0.4932 0.5096 2822\n", - " weighted avg 0.8038 0.4557 0.5623 2822\n", - "\n", - "===================\n", - "Confidence threshold k: 90.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9057 0.4404 0.5926 109\n", - " enterobacter 0.6000 0.0719 0.1283 167\n", - " enterococcus 0.9259 0.4167 0.5747 60\n", - " escherichia 0.8621 0.3490 0.4968 1344\n", - " klebsiella 0.8917 0.3828 0.5357 559\n", - " others 0.0277 0.7333 0.0534 60\n", - " pseudomonas 0.7500 0.4274 0.5445 351\n", - "staphylococcus 0.9867 0.8605 0.9193 172\n", - "\n", - " accuracy 0.3933 2822\n", - " macro avg 0.7437 0.4602 0.4807 2822\n", - " weighted avg 0.8314 0.3933 0.5103 2822\n", - "\n", - "===================\n", - "Finished\n", - "===================\n", - "*** saprot_relaxed_r3, similarity = 60% ***\n", - "Constructing training and test sets...\n", - "Training set shape: (13099, 1280)\n", - "Test set shape: (3153, 1280)\n", - "Training the model...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Edward\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving evaluation results...\n", - "Confidence threshold k: 0.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.5235 0.7355 0.6117 121\n", - " enterobacter 0.2000 0.1276 0.1558 196\n", - " enterococcus 0.7083 0.6939 0.7010 49\n", - " escherichia 0.7328 0.7508 0.7417 1589\n", - " klebsiella 0.4883 0.4949 0.4916 588\n", - " others 0.0000 0.0000 0.0000 49\n", - " pseudomonas 0.6863 0.7203 0.7029 404\n", - "staphylococcus 0.7778 0.8025 0.7900 157\n", - "\n", - " accuracy 0.6499 3153\n", - " macro avg 0.5146 0.5407 0.5243 3153\n", - " weighted avg 0.6306 0.6499 0.6389 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 10.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.5658 0.7107 0.6300 121\n", - " enterobacter 0.2019 0.1071 0.1400 196\n", - " enterococcus 0.7391 0.6939 0.7158 49\n", - " escherichia 0.7415 0.7111 0.7260 1589\n", - " klebsiella 0.5177 0.4728 0.4942 588\n", - " others 0.0207 0.1020 0.0345 49\n", - " pseudomonas 0.7197 0.7054 0.7125 404\n", - "staphylococcus 0.8170 0.7962 0.8065 157\n", - "\n", - " accuracy 0.6229 3153\n", - " macro avg 0.5404 0.5374 0.5324 3153\n", - " weighted avg 0.6492 0.6229 0.6340 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 20.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.6043 0.6942 0.6462 121\n", - " enterobacter 0.2353 0.1020 0.1423 196\n", - " enterococcus 0.7907 0.6939 0.7391 49\n", - " escherichia 0.7572 0.6753 0.7139 1589\n", - " klebsiella 0.5398 0.4609 0.4972 588\n", - " others 0.0227 0.2041 0.0409 49\n", - " pseudomonas 0.7434 0.6955 0.7187 404\n", - "staphylococcus 0.8389 0.7962 0.8170 157\n", - "\n", - " accuracy 0.6020 3153\n", - " macro avg 0.5666 0.5403 0.5394 3153\n", - " weighted avg 0.6698 0.6020 0.6310 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 30.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.6480 0.6694 0.6585 121\n", - " enterobacter 0.2105 0.0816 0.1176 196\n", - " enterococcus 0.8500 0.6939 0.7640 49\n", - " escherichia 0.7724 0.6514 0.7067 1589\n", - " klebsiella 0.5614 0.4354 0.4904 588\n", - " others 0.0245 0.3061 0.0453 49\n", - " pseudomonas 0.7570 0.6708 0.7113 404\n", - "staphylococcus 0.8621 0.7962 0.8278 157\n", - "\n", - " accuracy 0.5814 3153\n", - " macro avg 0.5857 0.5381 0.5402 3153\n", - " weighted avg 0.6854 0.5814 0.6251 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 40.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.7064 0.6364 0.6696 121\n", - " enterobacter 0.2353 0.0816 0.1212 196\n", - " enterococcus 0.8718 0.6939 0.7727 49\n", - " escherichia 0.7865 0.6098 0.6870 1589\n", - " klebsiella 0.5855 0.4133 0.4845 588\n", - " others 0.0206 0.3469 0.0389 49\n", - " pseudomonas 0.7975 0.6436 0.7123 404\n", - "staphylococcus 0.8786 0.7834 0.8283 157\n", - "\n", - " accuracy 0.5515 3153\n", - " macro avg 0.6103 0.5261 0.5393 3153\n", - " weighted avg 0.7071 0.5515 0.6149 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 50.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.7500 0.6198 0.6787 121\n", - " enterobacter 0.2258 0.0714 0.1085 196\n", - " enterococcus 0.8919 0.6735 0.7674 49\n", - " escherichia 0.8020 0.5658 0.6635 1589\n", - " klebsiella 0.6324 0.3980 0.4885 588\n", - " others 0.0226 0.4694 0.0432 49\n", - " pseudomonas 0.8173 0.6312 0.7123 404\n", - "staphylococcus 0.8963 0.7707 0.8288 157\n", - "\n", - " accuracy 0.5246 3153\n", - " macro avg 0.6298 0.5250 0.5364 3153\n", - " weighted avg 0.7285 0.5246 0.6034 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 60.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.8605 0.6116 0.7150 121\n", - " enterobacter 0.2449 0.0612 0.0980 196\n", - " enterococcus 0.9167 0.6735 0.7765 49\n", - " escherichia 0.8095 0.5242 0.6364 1589\n", - " klebsiella 0.6749 0.3707 0.4786 588\n", - " others 0.0224 0.5510 0.0430 49\n", - " pseudomonas 0.8304 0.5941 0.6926 404\n", - "staphylococcus 0.9098 0.7707 0.8345 157\n", - "\n", - " accuracy 0.4941 3153\n", - " macro avg 0.6586 0.5196 0.5343 3153\n", - " weighted avg 0.7484 0.4941 0.5865 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 70.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9342 0.5868 0.7208 121\n", - " enterobacter 0.2973 0.0561 0.0944 196\n", - " enterococcus 0.9429 0.6735 0.7857 49\n", - " escherichia 0.8305 0.4688 0.5994 1589\n", - " klebsiella 0.7402 0.3537 0.4787 588\n", - " others 0.0225 0.6531 0.0434 49\n", - " pseudomonas 0.8448 0.5792 0.6872 404\n", - "staphylococcus 0.9524 0.7643 0.8481 157\n", - "\n", - " accuracy 0.4611 3153\n", - " macro avg 0.6956 0.5169 0.5322 3153\n", - " weighted avg 0.7816 0.4611 0.5680 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 80.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9859 0.5785 0.7292 121\n", - " enterobacter 0.3200 0.0408 0.0724 196\n", - " enterococcus 0.9677 0.6122 0.7500 49\n", - " escherichia 0.8476 0.3990 0.5426 1589\n", - " klebsiella 0.7689 0.3112 0.4431 588\n", - " others 0.0205 0.6939 0.0397 49\n", - " pseudomonas 0.8803 0.5644 0.6878 404\n", - "staphylococcus 0.9916 0.7516 0.8551 157\n", - "\n", - " accuracy 0.4139 3153\n", - " macro avg 0.7228 0.4940 0.5150 3153\n", - " weighted avg 0.8058 0.4139 0.5315 3153\n", - "\n", - "===================\n", - "Confidence threshold k: 90.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.9844 0.5207 0.6811 121\n", - " enterobacter 0.3333 0.0306 0.0561 196\n", - " enterococcus 0.9565 0.4490 0.6111 49\n", - " escherichia 0.8765 0.3128 0.4610 1589\n", - " klebsiella 0.8400 0.2500 0.3853 588\n", - " others 0.0194 0.7755 0.0378 49\n", - " pseudomonas 0.9095 0.5223 0.6635 404\n", - "staphylococcus 0.9912 0.7197 0.8339 157\n", - "\n", - " accuracy 0.3479 3153\n", - " macro avg 0.7389 0.4476 0.4662 3153\n", - " weighted avg 0.8380 0.3479 0.4705 3153\n", - "\n", - "===================\n", - "Finished\n", - "===================\n", - "*** saprot_relaxed_r3, similarity = 40% ***\n", - "Constructing training and test sets...\n", - "Training set shape: (11662, 1280)\n", - "Test set shape: (3203, 1280)\n", - "Training the model...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Edward\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", - " y = column_or_1d(y, warn=True)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving evaluation results...\n", - "Confidence threshold k: 0.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.2692 0.5506 0.3616 89\n", - " enterobacter 0.2025 0.1481 0.1711 216\n", - " enterococcus 0.5345 0.7561 0.6263 41\n", - " escherichia 0.7430 0.6375 0.6862 1796\n", - " klebsiella 0.4540 0.4497 0.4519 527\n", - " others 0.0000 0.0000 0.0000 41\n", - " pseudomonas 0.3939 0.7106 0.5069 311\n", - "staphylococcus 0.7072 0.7033 0.7052 182\n", - "\n", - " accuracy 0.5754 3203\n", - " macro avg 0.4131 0.4945 0.4387 3203\n", - " weighted avg 0.5977 0.5754 0.5780 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 10.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.3121 0.5506 0.3984 89\n", - " enterobacter 0.2362 0.1389 0.1749 216\n", - " enterococcus 0.5370 0.7073 0.6105 41\n", - " escherichia 0.7553 0.5997 0.6685 1796\n", - " klebsiella 0.4792 0.4156 0.4451 527\n", - " others 0.0067 0.0488 0.0118 41\n", - " pseudomonas 0.4197 0.6977 0.5242 311\n", - "staphylococcus 0.7381 0.6813 0.7086 182\n", - "\n", - " accuracy 0.5454 3203\n", - " macro avg 0.4355 0.4800 0.4428 3203\n", - " weighted avg 0.6166 0.5454 0.5701 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 20.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.3475 0.5506 0.4261 89\n", - " enterobacter 0.2385 0.1204 0.1600 216\n", - " enterococcus 0.6000 0.6585 0.6279 41\n", - " escherichia 0.7636 0.5629 0.6481 1796\n", - " klebsiella 0.5109 0.3985 0.4478 527\n", - " others 0.0130 0.1707 0.0241 41\n", - " pseudomonas 0.4459 0.6752 0.5371 311\n", - "staphylococcus 0.7407 0.6593 0.6977 182\n", - "\n", - " accuracy 0.5183 3203\n", - " macro avg 0.4575 0.4745 0.4461 3203\n", - " weighted avg 0.6312 0.5183 0.5598 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 30.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.3729 0.4944 0.4251 89\n", - " enterobacter 0.2737 0.1204 0.1672 216\n", - " enterococcus 0.6190 0.6341 0.6265 41\n", - " escherichia 0.7722 0.5267 0.6263 1796\n", - " klebsiella 0.5571 0.3700 0.4447 527\n", - " others 0.0127 0.2439 0.0242 41\n", - " pseudomonas 0.4606 0.6399 0.5357 311\n", - "staphylococcus 0.7436 0.6374 0.6864 182\n", - "\n", - " accuracy 0.4877 3203\n", - " macro avg 0.4765 0.4583 0.4420 3203\n", - " weighted avg 0.6486 0.4877 0.5468 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 40.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.4118 0.4719 0.4398 89\n", - " enterobacter 0.2466 0.0833 0.1246 216\n", - " enterococcus 0.6410 0.6098 0.6250 41\n", - " escherichia 0.7868 0.4994 0.6110 1796\n", - " klebsiella 0.5968 0.3510 0.4421 527\n", - " others 0.0141 0.3415 0.0270 41\n", - " pseudomonas 0.4770 0.6013 0.5320 311\n", - "staphylococcus 0.7616 0.6319 0.6907 182\n", - "\n", - " accuracy 0.4630 3203\n", - " macro avg 0.4920 0.4488 0.4365 3203\n", - " weighted avg 0.6654 0.4630 0.5352 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 50.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.4318 0.4270 0.4294 89\n", - " enterobacter 0.2807 0.0741 0.1172 216\n", - " enterococcus 0.6486 0.5854 0.6154 41\n", - " escherichia 0.7938 0.4671 0.5882 1796\n", - " klebsiella 0.6331 0.3340 0.4373 527\n", - " others 0.0151 0.4390 0.0292 41\n", - " pseudomonas 0.4986 0.5627 0.5287 311\n", - "staphylococcus 0.7708 0.6099 0.6810 182\n", - "\n", - " accuracy 0.4362 3203\n", - " macro avg 0.5091 0.4374 0.4283 3203\n", - " weighted avg 0.6809 0.4362 0.5199 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 60.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.4444 0.3596 0.3975 89\n", - " enterobacter 0.3200 0.0741 0.1203 216\n", - " enterococcus 0.7273 0.5854 0.6486 41\n", - " escherichia 0.8056 0.4293 0.5601 1796\n", - " klebsiella 0.6793 0.3055 0.4215 527\n", - " others 0.0155 0.5366 0.0301 41\n", - " pseudomonas 0.5395 0.5273 0.5333 311\n", - "staphylococcus 0.8189 0.5714 0.6731 182\n", - "\n", - " accuracy 0.4040 3203\n", - " macro avg 0.5438 0.4236 0.4231 3203\n", - " weighted avg 0.7059 0.4040 0.5013 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 70.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.5088 0.3258 0.3973 89\n", - " enterobacter 0.4000 0.0741 0.1250 216\n", - " enterococcus 0.7308 0.4634 0.5672 41\n", - " escherichia 0.8213 0.3814 0.5209 1796\n", - " klebsiella 0.7310 0.2732 0.3978 527\n", - " others 0.0143 0.5854 0.0279 41\n", - " pseudomonas 0.5953 0.4920 0.5387 311\n", - "staphylococcus 0.8214 0.5055 0.6259 182\n", - "\n", - " accuracy 0.3628 3203\n", - " macro avg 0.5779 0.3876 0.4001 3203\n", - " weighted avg 0.7359 0.3628 0.4725 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 80.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.6250 0.2809 0.3876 89\n", - " enterobacter 0.5238 0.0509 0.0928 216\n", - " enterococcus 0.7857 0.2683 0.4000 41\n", - " escherichia 0.8369 0.3285 0.4718 1796\n", - " klebsiella 0.7764 0.2372 0.3634 527\n", - " others 0.0138 0.6585 0.0271 41\n", - " pseudomonas 0.6827 0.4566 0.5472 311\n", - "staphylococcus 0.8431 0.4725 0.6056 182\n", - "\n", - " accuracy 0.3175 3203\n", - " macro avg 0.6359 0.3442 0.3619 3203\n", - " weighted avg 0.7741 0.3175 0.4344 3203\n", - "\n", - "===================\n", - "Confidence threshold k: 90.0%\n", - " precision recall f1-score support\n", - "\n", - " acinetobacter 0.6897 0.2247 0.3390 89\n", - " enterobacter 0.7500 0.0278 0.0536 216\n", - " enterococcus 0.8182 0.2195 0.3462 41\n", - " escherichia 0.8621 0.2472 0.3842 1796\n", - " klebsiella 0.8034 0.1784 0.2919 527\n", - " others 0.0136 0.7561 0.0267 41\n", - " pseudomonas 0.7908 0.3891 0.5216 311\n", - "staphylococcus 0.8876 0.4341 0.5830 182\n", - "\n", - " accuracy 0.2510 3203\n", - " macro avg 0.7019 0.3096 0.3183 3203\n", - " weighted avg 0.8232 0.2510 0.3651 3203\n", - "\n", - "===================\n", - "Finished\n", - "===================\n" - ] - } - ], - "source": [ - "models = list(constants.SAPROT_PLM.keys())\n", - "\n", - "for similarity in range(100, 39, -20):\n", - " for model in models:\n", - " model = model.lower()\n", - " df, df_all, protein_clusters = util.filter_proteins_based_on_struct_and_seq_sim(\n", - " f\"{constants.INPHARED}/{constants.CONSOLIDATED}/rbp_embeddings_{model}.csv\",\n", - " f\"{constants.INPHARED}/{constants.CONSOLIDATED}/rbp_embeddings_saprot_relaxed_r3.csv\",\n", - " f\"{constants.INPHARED}/{constants.FASTA}/complete-struct-{similarity}.fasta.clstr\",\n", - " )\n", - "\n", - " include_proteins_in_cluster = True\n", - " if similarity == 100:\n", - " include_proteins_in_cluster = False\n", - "\n", - " print(f\"*** {model}, similarity = {similarity}% ***\")\n", - " util.classify(\n", - " df,\n", - " model + \"-xgboost-eskapee-smotetomek\",\n", - " similarity,\n", - " genus=[\n", - " \"enterococcus\",\n", - " \"staphylococcus\",\n", - " \"klebsiella\",\n", - " \"acinetobacter\",\n", - " \"pseudomonas\",\n", - " \"enterobacter\",\n", - " \"escherichia\",\n", - " ],\n", - " feature_columns=[f\"s{i}\" for i in range(1, 1281)],\n", - " include_proteins_in_cluster=include_proteins_in_cluster,\n", - " rbp_embeddings_all=df_all,\n", - " protein_clusters=protein_clusters,\n", - " undersample_others=True,\n", - " oversample_technique=\"SMOTETomek\",\n", - " model=\"xgboost\",\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}