From 72e2e1ca1300b9c82cea31ba28bcd990d02b6102 Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Wed, 20 Nov 2024 19:12:48 -0800
Subject: [PATCH 1/8] Removed druggability-only genes from the gene_metadata
 pre-processing step, and bumped the version of the file in the config to
 match the new file on Synapse

---
 config.yaml                                   |    4 +-
 .../AG-896_Preprocess_Gene_Annotations.ipynb  | 1824 ++++++++---------
 .../preprocessing/preprocessing_utils.py      |  209 +-
 test_config.yaml                              |    4 +-
 4 files changed, 1012 insertions(+), 1029 deletions(-)

diff --git a/config.yaml b/config.yaml
index 7b8b4f4b..1ace0892 100644
--- a/config.yaml
+++ b/config.yaml
@@ -144,7 +144,7 @@ datasets:
   - gene_info:
       files:
         - name: gene_metadata
-          id: syn25953363.13
+          id: syn25953363.14
           format: feather
         - name: igap
           id: syn12514826.5
@@ -187,7 +187,7 @@ datasets:
         possible_replacement: ensembl_possible_replacements
         permalink: ensembl_permalink
       provenance:
-        - syn25953363.13
+        - syn25953363.14
         - syn12514826.5
         - syn12514912.3
         - *agora_proteomics_provenance
diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
index 9ef8fedf..c0f2ad33 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
@@ -108,7 +108,7 @@
    "source": [
     "## Get Ensembl IDs from data sets that will be processed by agora-data-tools\n",
     "\n",
-    "Loop through all data sets in the config file to get all Ensembl IDs used in every data set."
+    "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. Exclude `gene_metadata` since that's the file we are building, and `druggability` since that data is deprecated."
    ]
   },
   {
@@ -118,73 +118,6 @@
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'genes_biodomains': ('syn44151254.5', 'csv'),\n",
-       " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n",
-       " 'proteomics': ('syn18689335.3', 'csv'),\n",
-       " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n",
-       " 'proteomics_srm': ('syn52579640.4', 'csv'),\n",
-       " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n",
-       " 'metabolomics': ('syn26064497.1', 'feather'),\n",
-       " 'igap': ('syn12514826.5', 'csv'),\n",
-       " 'eqtl': ('syn12514912.3', 'csv'),\n",
-       " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n",
-       " 'target_list': ('syn12540368.47', 'csv'),\n",
-       " 'median_expression': ('syn27211878.2', 'csv'),\n",
-       " 'druggability': ('syn13363443.11', 'csv'),\n",
-       " 'tep_adi_info': ('syn51942280.2', 'csv'),\n",
-       " 'team_info': ('syn12615624.18', 'csv'),\n",
-       " 'team_member_info': ('syn12615633.18', 'csv'),\n",
-       " 'overall_scores': ('syn25575156.13', 'table'),\n",
-       " 'networks': ('syn11685347.1', 'csv')}"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "config = utils._get_config(config_path=config_filename)\n",
-    "datasets = config[\"datasets\"]\n",
-    "\n",
-    "files = {}\n",
-    "\n",
-    "for dataset in datasets:\n",
-    "    dataset_name = list(dataset.keys())[0]\n",
-    "\n",
-    "    for entity in dataset[dataset_name][\"files\"]:\n",
-    "        entity_id = entity[\"id\"]\n",
-    "        entity_format = entity[\"format\"]\n",
-    "        entity_name = entity[\"name\"]\n",
-    "\n",
-    "        # Ignore json files, which are post-processed and not what we're interested in.\n",
-    "        # Also ignore \"gene_metadata\" since that's the file we're making here.\n",
-    "        if entity_format != \"json\" and entity_name != \"gene_metadata\":\n",
-    "            files[entity_name] = (entity_id, entity_format)\n",
-    "\n",
-    "# There are some duplicate synID's in this list but that doesn't really matter\n",
-    "files"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8f1a2120",
-   "metadata": {},
-   "source": [
-    "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "9843689d",
-   "metadata": {
-    "scrolled": true
-   },
    "outputs": [
     {
      "name": "stderr",
@@ -193,10 +126,10 @@
       "\n",
       "UPGRADE AVAILABLE\n",
       "\n",
-      "A more recent version of the Synapse Client (4.2.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
+      "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
       "    pip install --upgrade synapseclient\n",
       "\n",
-      "Python Synapse Client version 4.2.0 release notes\n",
+      "Python Synapse Client version 4.6.0 release notes\n",
       "\n",
       "https://python-docs.synapse.org/news/\n",
       "\n"
@@ -222,50 +155,58 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "genes_biodomains has an NaN Ensembl ID\n",
+      "Found 19 files:\n",
+      "genes_biodomains:\tsyn44151254.5\n",
+      "neuropath_regression_results:\tsyn22017882.5\n",
+      "proteomics:\tsyn18689335.4\n",
+      "proteomics_tmt:\tsyn35221005.2\n",
+      "proteomics_srm:\tsyn52579640.4\n",
+      "target_exp_validation_harmonized:\tsyn24184512.9\n",
+      "metabolomics:\tsyn26064497.1\n",
+      "gene_metadata:\tsyn25953363.13\n",
+      "igap:\tsyn12514826.5\n",
+      "eqtl:\tsyn12514912.3\n",
+      "diff_exp_data:\tsyn27211942.1\n",
+      "target_list:\tsyn12540368.51\n",
+      "median_expression:\tsyn27211878.2\n",
+      "druggability:\tsyn13363443.11\n",
+      "tep_adi_info:\tsyn51942280.3\n",
+      "team_info:\tsyn12615624.18\n",
+      "team_member_info:\tsyn12615633.19\n",
+      "overall_scores:\tsyn25575156.13\n",
+      "networks:\tsyn11685347.1\n",
+      "\n",
+      "genes_biodomains has 591 NaN Ensembl IDs\n",
       "WARNING: no Ensembl ID column found for team_info!\n",
-      "WARNING: no Ensembl ID column found for team_member_info!\n"
+      "WARNING: no Ensembl ID column found for team_member_info!\n",
+      "\n",
+      "35858 Ensembl IDs found.\n",
+      "['ENSG00000151650', 'ENSG00000168268', 'ENSG00000186310', 'ENSG00000204616', 'ENSG00000158467']\n"
      ]
     }
    ],
    "source": [
-    "syn = utils._login_to_synapse(\n",
-    "    token=None\n",
-    ")  # Assumes you have already logged in with a valid token\n",
-    "\n",
-    "# The various column names used to store Ensembl IDs in the files\n",
-    "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n",
-    "file_ensembl_list = []\n",
-    "\n",
-    "for file in files.keys():\n",
-    "    df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n",
-    "\n",
-    "    file_ensembl_ids = None\n",
-    "\n",
-    "    for C in col_names:\n",
-    "        if C in df.columns:\n",
-    "            file_ensembl_ids = df[C]\n",
-    "\n",
-    "    # networks file is a special case\n",
-    "    if file == \"networks\":\n",
-    "        file_ensembl_ids = pd.melt(\n",
-    "            df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n",
-    "        )[\"value\"]\n",
-    "\n",
-    "    if file_ensembl_ids is not None:\n",
-    "        file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n",
-    "        if \"n/A\" in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an n/A Ensembl ID\")\n",
-    "            file_ensembl_list.remove(\"n/A\")\n",
-    "        if np.NaN in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an NaN Ensembl ID\")\n",
-    "    else:\n",
-    "        print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")"
+    "file_ensembl_list = preprocessing_utils.get_all_adt_ensembl_ids(\n",
+    "    config_filename=config_filename,\n",
+    "    exclude_files=[\"gene_metadata\", \"druggability\"],\n",
+    "    token=None,\n",
+    ")\n",
+    "print(\"\")\n",
+    "print(str(len(file_ensembl_list)) + \" Ensembl IDs found.\")\n",
+    "print(file_ensembl_list[0:5])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5fa76bfb",
+   "metadata": {},
+   "source": [
+    "Create a data frame with these IDs so it can be merged with the MyGene query results below."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "f1303e5b",
    "metadata": {},
    "outputs": [
@@ -273,13 +214,11 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "37452\n"
+      "35858\n"
      ]
     }
    ],
    "source": [
-    "file_ensembl_list = list(set(file_ensembl_list))\n",
-    "\n",
     "ensembl_ids_df = pd.DataFrame({\"ensembl_gene_id\": file_ensembl_list})\n",
     "\n",
     "\"\"\" Removed due to no longer getting genes from BioMart, but saving code\n",
@@ -300,7 +239,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "id": "4e7a37c8",
    "metadata": {},
    "outputs": [],
@@ -321,7 +260,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "id": "7ebd03d4",
    "metadata": {
     "scrolled": true
@@ -401,11 +340,7 @@
       "INFO:biothings.client:done.\n",
       "INFO:biothings.client:querying 34001-35000...\n",
       "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 35001-36000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 36001-37000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 37001-37452...\n",
+      "INFO:biothings.client:querying 35001-35858...\n",
       "INFO:biothings.client:done.\n"
      ]
     },
@@ -453,57 +388,57 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>ENSG00000164972</th>\n",
-       "      <td>84688</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]</td>\n",
-       "      <td>sperm microtubule inner protein 6</td>\n",
-       "      <td>This gene encodes a nuclear- or perinuclear-lo...</td>\n",
-       "      <td>SPMIP6</td>\n",
+       "      <th>ENSG00000151650</th>\n",
+       "      <td>27287</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>[HPX42B, NA88A, VENTX2]</td>\n",
+       "      <td>VENT homeobox</td>\n",
+       "      <td>This gene encodes a member of the Vent family ...</td>\n",
+       "      <td>VENTX</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>ENSG00000169105</th>\n",
-       "      <td>113189</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[ATCS, D4ST1, EDSMC1, HNK1ST]</td>\n",
-       "      <td>carbohydrate sulfotransferase 14</td>\n",
-       "      <td>This gene encodes a member of the HNK-1 family...</td>\n",
-       "      <td>CHST14</td>\n",
+       "      <th>ENSG00000168268</th>\n",
+       "      <td>64943</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>5'-nucleotidase domain containing 2</td>\n",
+       "      <td>Predicted to enable 5'-nucleotidase activity. ...</td>\n",
+       "      <td>NT5DC2</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>ENSG00000255136</th>\n",
-       "      <td>ENSG00000255136</td>\n",
+       "      <th>ENSG00000186310</th>\n",
+       "      <td>4675</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL antisense RNA 1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL-AS1</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>[MB20, NPL3]</td>\n",
+       "      <td>nucleosome assembly protein 1 like 3</td>\n",
+       "      <td>This gene is intronless and encodes a member o...</td>\n",
+       "      <td>NAP1L3</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>ENSG00000105499</th>\n",
-       "      <td>8605</td>\n",
+       "      <th>ENSG00000204616</th>\n",
+       "      <td>11074</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>CPLA2-gamma</td>\n",
-       "      <td>phospholipase A2 group IVC</td>\n",
-       "      <td>This gene encodes a protein which is a member ...</td>\n",
-       "      <td>PLA2G4C</td>\n",
+       "      <td>[C6orf13, HCG1, HCGI, RNF]</td>\n",
+       "      <td>tripartite motif containing 31</td>\n",
+       "      <td>This gene encodes a protein that functions as ...</td>\n",
+       "      <td>TRIM31</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>ENSG00000104611</th>\n",
-       "      <td>63898</td>\n",
+       "      <th>ENSG00000158467</th>\n",
+       "      <td>23382</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[PPP1R38, SH2A]</td>\n",
-       "      <td>SH2 domain containing 4A</td>\n",
-       "      <td>Enables phosphatase binding activity. Located ...</td>\n",
-       "      <td>SH2D4A</td>\n",
+       "      <td>[ADOHCYASE3, IRBIT2]</td>\n",
+       "      <td>adenosylhomocysteinase like 2</td>\n",
+       "      <td>The protein encoded by this gene acts as a hom...</td>\n",
+       "      <td>AHCYL2</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
@@ -512,48 +447,40 @@
        "</div>"
       ],
       "text/plain": [
-       "                             _id  _version  \\\n",
-       "ensembl_gene_id                              \n",
-       "ENSG00000164972            84688       2.0   \n",
-       "ENSG00000169105           113189       2.0   \n",
-       "ENSG00000255136  ENSG00000255136       1.0   \n",
-       "ENSG00000105499             8605       1.0   \n",
-       "ENSG00000104611            63898       1.0   \n",
-       "\n",
-       "                                                        alias  \\\n",
-       "ensembl_gene_id                                                 \n",
-       "ENSG00000164972  [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]   \n",
-       "ENSG00000169105                 [ATCS, D4ST1, EDSMC1, HNK1ST]   \n",
-       "ENSG00000255136                                           NaN   \n",
-       "ENSG00000105499                                   CPLA2-gamma   \n",
-       "ENSG00000104611                               [PPP1R38, SH2A]   \n",
+       "                   _id  _version                       alias  \\\n",
+       "ensembl_gene_id                                                \n",
+       "ENSG00000151650  27287       1.0     [HPX42B, NA88A, VENTX2]   \n",
+       "ENSG00000168268  64943       1.0                         NaN   \n",
+       "ENSG00000186310   4675       1.0                [MB20, NPL3]   \n",
+       "ENSG00000204616  11074       1.0  [C6orf13, HCG1, HCGI, RNF]   \n",
+       "ENSG00000158467  23382       1.0        [ADOHCYASE3, IRBIT2]   \n",
        "\n",
-       "                                              name  \\\n",
-       "ensembl_gene_id                                      \n",
-       "ENSG00000164972  sperm microtubule inner protein 6   \n",
-       "ENSG00000169105   carbohydrate sulfotransferase 14   \n",
-       "ENSG00000255136              TPBGL antisense RNA 1   \n",
-       "ENSG00000105499         phospholipase A2 group IVC   \n",
-       "ENSG00000104611           SH2 domain containing 4A   \n",
+       "                                                 name  \\\n",
+       "ensembl_gene_id                                         \n",
+       "ENSG00000151650                         VENT homeobox   \n",
+       "ENSG00000168268   5'-nucleotidase domain containing 2   \n",
+       "ENSG00000186310  nucleosome assembly protein 1 like 3   \n",
+       "ENSG00000204616        tripartite motif containing 31   \n",
+       "ENSG00000158467         adenosylhomocysteinase like 2   \n",
        "\n",
-       "                                                           summary     symbol  \\\n",
-       "ensembl_gene_id                                                                 \n",
-       "ENSG00000164972  This gene encodes a nuclear- or perinuclear-lo...     SPMIP6   \n",
-       "ENSG00000169105  This gene encodes a member of the HNK-1 family...     CHST14   \n",
-       "ENSG00000255136                                                NaN  TPBGL-AS1   \n",
-       "ENSG00000105499  This gene encodes a protein which is a member ...    PLA2G4C   \n",
-       "ENSG00000104611  Enables phosphatase binding activity. Located ...     SH2D4A   \n",
+       "                                                           summary  symbol  \\\n",
+       "ensembl_gene_id                                                              \n",
+       "ENSG00000151650  This gene encodes a member of the Vent family ...   VENTX   \n",
+       "ENSG00000168268  Predicted to enable 5'-nucleotidase activity. ...  NT5DC2   \n",
+       "ENSG00000186310  This gene is intronless and encodes a member o...  NAP1L3   \n",
+       "ENSG00000204616  This gene encodes a protein that functions as ...  TRIM31   \n",
+       "ENSG00000158467  The protein encoded by this gene acts as a hom...  AHCYL2   \n",
        "\n",
        "                   type_of_gene notfound  \n",
        "ensembl_gene_id                           \n",
-       "ENSG00000164972  protein-coding      NaN  \n",
-       "ENSG00000169105  protein-coding      NaN  \n",
-       "ENSG00000255136             NaN      NaN  \n",
-       "ENSG00000105499  protein-coding      NaN  \n",
-       "ENSG00000104611  protein-coding      NaN  "
+       "ENSG00000151650  protein-coding      NaN  \n",
+       "ENSG00000168268  protein-coding      NaN  \n",
+       "ENSG00000186310  protein-coding      NaN  \n",
+       "ENSG00000204616  protein-coding      NaN  \n",
+       "ENSG00000158467  protein-coding      NaN  "
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -573,7 +500,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "id": "23bb114e",
    "metadata": {
     "scrolled": true
@@ -583,8 +510,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Annotations found for 36284 genes.\n",
-      "No annotations found for 1175 genes.\n"
+      "Annotations found for 34655 genes.\n",
+      "No annotations found for 1206 genes.\n"
      ]
     }
    ],
@@ -611,7 +538,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 27,
    "id": "186d8cb8",
    "metadata": {
     "scrolled": true
@@ -621,7 +548,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(37459, 9)\n"
+      "(35861, 9)\n"
      ]
     },
     {
@@ -659,61 +586,61 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>ENSG00000164972</td>\n",
-       "      <td>84688</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]</td>\n",
-       "      <td>sperm microtubule inner protein 6</td>\n",
-       "      <td>This gene encodes a nuclear- or perinuclear-lo...</td>\n",
-       "      <td>SPMIP6</td>\n",
+       "      <td>ENSG00000151650</td>\n",
+       "      <td>27287</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>[HPX42B, NA88A, VENTX2]</td>\n",
+       "      <td>VENT homeobox</td>\n",
+       "      <td>This gene encodes a member of the Vent family ...</td>\n",
+       "      <td>VENTX</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>ENSG00000169105</td>\n",
-       "      <td>113189</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[ATCS, D4ST1, EDSMC1, HNK1ST]</td>\n",
-       "      <td>carbohydrate sulfotransferase 14</td>\n",
-       "      <td>This gene encodes a member of the HNK-1 family...</td>\n",
-       "      <td>CHST14</td>\n",
+       "      <td>ENSG00000168268</td>\n",
+       "      <td>64943</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>5'-nucleotidase domain containing 2</td>\n",
+       "      <td>Predicted to enable 5'-nucleotidase activity. ...</td>\n",
+       "      <td>NT5DC2</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>ENSG00000255136</td>\n",
+       "      <td>ENSG00000186310</td>\n",
+       "      <td>4675</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL antisense RNA 1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL-AS1</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>[MB20, NPL3]</td>\n",
+       "      <td>nucleosome assembly protein 1 like 3</td>\n",
+       "      <td>This gene is intronless and encodes a member o...</td>\n",
+       "      <td>NAP1L3</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>ENSG00000105499</td>\n",
-       "      <td>8605</td>\n",
+       "      <td>ENSG00000204616</td>\n",
+       "      <td>11074</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>CPLA2-gamma</td>\n",
-       "      <td>phospholipase A2 group IVC</td>\n",
-       "      <td>This gene encodes a protein which is a member ...</td>\n",
-       "      <td>PLA2G4C</td>\n",
+       "      <td>[C6orf13, HCG1, HCGI, RNF]</td>\n",
+       "      <td>tripartite motif containing 31</td>\n",
+       "      <td>This gene encodes a protein that functions as ...</td>\n",
+       "      <td>TRIM31</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>ENSG00000104611</td>\n",
-       "      <td>63898</td>\n",
+       "      <td>ENSG00000158467</td>\n",
+       "      <td>23382</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[PPP1R38, SH2A]</td>\n",
-       "      <td>SH2 domain containing 4A</td>\n",
-       "      <td>Enables phosphatase binding activity. Located ...</td>\n",
-       "      <td>SH2D4A</td>\n",
+       "      <td>[ADOHCYASE3, IRBIT2]</td>\n",
+       "      <td>adenosylhomocysteinase like 2</td>\n",
+       "      <td>The protein encoded by this gene acts as a hom...</td>\n",
+       "      <td>AHCYL2</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
@@ -722,43 +649,36 @@
        "</div>"
       ],
       "text/plain": [
-       "   ensembl_gene_id              _id  _version  \\\n",
-       "0  ENSG00000164972            84688       2.0   \n",
-       "1  ENSG00000169105           113189       2.0   \n",
-       "2  ENSG00000255136  ENSG00000255136       1.0   \n",
-       "3  ENSG00000105499             8605       1.0   \n",
-       "4  ENSG00000104611            63898       1.0   \n",
-       "\n",
-       "                                          alias  \\\n",
-       "0  [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]   \n",
-       "1                 [ATCS, D4ST1, EDSMC1, HNK1ST]   \n",
-       "2                                           NaN   \n",
-       "3                                   CPLA2-gamma   \n",
-       "4                               [PPP1R38, SH2A]   \n",
+       "   ensembl_gene_id    _id  _version                       alias  \\\n",
+       "0  ENSG00000151650  27287       1.0     [HPX42B, NA88A, VENTX2]   \n",
+       "1  ENSG00000168268  64943       1.0                         NaN   \n",
+       "2  ENSG00000186310   4675       1.0                [MB20, NPL3]   \n",
+       "3  ENSG00000204616  11074       1.0  [C6orf13, HCG1, HCGI, RNF]   \n",
+       "4  ENSG00000158467  23382       1.0        [ADOHCYASE3, IRBIT2]   \n",
        "\n",
-       "                                name  \\\n",
-       "0  sperm microtubule inner protein 6   \n",
-       "1   carbohydrate sulfotransferase 14   \n",
-       "2              TPBGL antisense RNA 1   \n",
-       "3         phospholipase A2 group IVC   \n",
-       "4           SH2 domain containing 4A   \n",
+       "                                   name  \\\n",
+       "0                         VENT homeobox   \n",
+       "1   5'-nucleotidase domain containing 2   \n",
+       "2  nucleosome assembly protein 1 like 3   \n",
+       "3        tripartite motif containing 31   \n",
+       "4         adenosylhomocysteinase like 2   \n",
        "\n",
-       "                                             summary     symbol  \\\n",
-       "0  This gene encodes a nuclear- or perinuclear-lo...     SPMIP6   \n",
-       "1  This gene encodes a member of the HNK-1 family...     CHST14   \n",
-       "2                                                NaN  TPBGL-AS1   \n",
-       "3  This gene encodes a protein which is a member ...    PLA2G4C   \n",
-       "4  Enables phosphatase binding activity. Located ...     SH2D4A   \n",
+       "                                             summary  symbol    type_of_gene  \\\n",
+       "0  This gene encodes a member of the Vent family ...   VENTX  protein-coding   \n",
+       "1  Predicted to enable 5'-nucleotidase activity. ...  NT5DC2  protein-coding   \n",
+       "2  This gene is intronless and encodes a member o...  NAP1L3  protein-coding   \n",
+       "3  This gene encodes a protein that functions as ...  TRIM31  protein-coding   \n",
+       "4  The protein encoded by this gene acts as a hom...  AHCYL2  protein-coding   \n",
        "\n",
-       "     type_of_gene notfound  \n",
-       "0  protein-coding      NaN  \n",
-       "1  protein-coding      NaN  \n",
-       "2             NaN      NaN  \n",
-       "3  protein-coding      NaN  \n",
-       "4  protein-coding      NaN  "
+       "  notfound  \n",
+       "0      NaN  \n",
+       "1      NaN  \n",
+       "2      NaN  \n",
+       "3      NaN  \n",
+       "4      NaN  "
       ]
      },
-     "execution_count": 9,
+     "execution_count": 27,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -791,7 +711,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 28,
    "id": "285c10d2",
    "metadata": {
     "scrolled": true
@@ -799,8 +719,9 @@
    "outputs": [],
    "source": [
     "# NaN or NULL alias values become empty lists\n",
-    "for row in gene_table_merged.loc[gene_table_merged[\"alias\"].isnull(), \"alias\"].index:\n",
-    "    gene_table_merged.at[row, \"alias\"] = []\n",
+    "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n",
+    "    lambda cell: cell if isinstance(cell, (list, str)) else []  # NaN/None -> []\n",
+    ")\n",
     "\n",
     "# Some alias values are a single string, not a list. Turn them into lists here.\n",
     "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n",
@@ -836,7 +757,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 29,
    "id": "bc63cc53",
    "metadata": {
     "scrolled": true
@@ -876,67 +797,55 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>6011</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966722</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966722</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6012</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966732</td>\n",
+       "      <th>19626</th>\n",
+       "      <td>ENSG00000249738</td>\n",
+       "      <td>285626</td>\n",
        "      <td>1.0</td>\n",
        "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>uncharacterized LOC285626</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC128966732</td>\n",
-       "      <td>protein-coding</td>\n",
+       "      <td>LOC285626</td>\n",
+       "      <td>ncRNA</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6013</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966730</td>\n",
+       "      <th>19627</th>\n",
+       "      <td>ENSG00000249738</td>\n",
+       "      <td>105377683</td>\n",
        "      <td>1.0</td>\n",
        "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>uncharacterized LOC105377683</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC128966730</td>\n",
-       "      <td>protein-coding</td>\n",
+       "      <td>LOC105377683</td>\n",
+       "      <td>ncRNA</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6014</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966731</td>\n",
+       "      <th>24698</th>\n",
+       "      <td>ENSG00000276387</td>\n",
+       "      <td>3802</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966731</td>\n",
+       "      <td>[CD158A, NKAT1, KIR2DL3, KIR-K64, NKAT-1, p58....</td>\n",
+       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
+       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
+       "      <td>KIR2DL1</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6015</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966733</td>\n",
+       "      <th>24699</th>\n",
+       "      <td>ENSG00000276387</td>\n",
+       "      <td>124900571</td>\n",
        "      <td>1.0</td>\n",
        "      <td>[]</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>killer cell immunoglobulin-like receptor 2DS1</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC128966733</td>\n",
+       "      <td>LOC124900571</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>12139</th>\n",
+       "      <th>29514</th>\n",
        "      <td>ENSG00000230373</td>\n",
        "      <td>100133220</td>\n",
        "      <td>1.0</td>\n",
@@ -948,7 +857,7 @@
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>12140</th>\n",
+       "      <th>29515</th>\n",
        "      <td>ENSG00000230373</td>\n",
        "      <td>642402</td>\n",
        "      <td>1.0</td>\n",
@@ -959,126 +868,53 @@
        "      <td>pseudo</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23329</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>124900571</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>killer cell immunoglobulin-like receptor 2DS1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC124900571</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23330</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>3802</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[NKAT1, KIR2DL3, NKAT, KIR221, CD158A, p58.1, ...</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>31304</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>285626</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>uncharacterized LOC285626</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC285626</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>31305</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>105377683</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>uncharacterized LOC105377683</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC105377683</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
        "       ensembl_gene_id        _id  _version  \\\n",
-       "6011   ENSG00000276518  128966722       1.0   \n",
-       "6012   ENSG00000276518  128966732       1.0   \n",
-       "6013   ENSG00000276518  128966730       1.0   \n",
-       "6014   ENSG00000276518  128966731       1.0   \n",
-       "6015   ENSG00000276518  128966733       1.0   \n",
-       "12139  ENSG00000230373  100133220       1.0   \n",
-       "12140  ENSG00000230373     642402       1.0   \n",
-       "23329  ENSG00000276387  124900571       1.0   \n",
-       "23330  ENSG00000276387       3802       2.0   \n",
-       "31304  ENSG00000249738     285626       1.0   \n",
-       "31305  ENSG00000249738  105377683       1.0   \n",
+       "19626  ENSG00000249738     285626       1.0   \n",
+       "19627  ENSG00000249738  105377683       1.0   \n",
+       "24698  ENSG00000276387       3802       1.0   \n",
+       "24699  ENSG00000276387  124900571       1.0   \n",
+       "29514  ENSG00000230373  100133220       1.0   \n",
+       "29515  ENSG00000230373     642402       1.0   \n",
        "\n",
        "                                                   alias  \\\n",
-       "6011                                                  []   \n",
-       "6012                                                  []   \n",
-       "6013                                                  []   \n",
-       "6014                                                  []   \n",
-       "6015                                                  []   \n",
-       "12139                                         [GOLGA6L3]   \n",
-       "12140                                       [GOLGA6L21P]   \n",
-       "23329                                                 []   \n",
-       "23330  [NKAT1, KIR2DL3, NKAT, KIR221, CD158A, p58.1, ...   \n",
-       "31304                                                 []   \n",
-       "31305                                                 []   \n",
+       "19626                                                 []   \n",
+       "19627                                                 []   \n",
+       "24698  [CD158A, NKAT1, KIR2DL3, KIR-K64, NKAT-1, p58....   \n",
+       "24699                                                 []   \n",
+       "29514                                         [GOLGA6L3]   \n",
+       "29515                                       [GOLGA6L21P]   \n",
        "\n",
        "                                                    name  \\\n",
-       "6011   putative killer cell immunoglobulin-like recep...   \n",
-       "6012   putative killer cell immunoglobulin-like recep...   \n",
-       "6013   putative killer cell immunoglobulin-like recep...   \n",
-       "6014   putative killer cell immunoglobulin-like recep...   \n",
-       "6015   putative killer cell immunoglobulin-like recep...   \n",
-       "12139                golgin A6 family like 3, pseudogene   \n",
-       "12140               golgin A6 family like 17, pseudogene   \n",
-       "23329      killer cell immunoglobulin-like receptor 2DS1   \n",
-       "23330  killer cell immunoglobulin like receptor, two ...   \n",
-       "31304                          uncharacterized LOC285626   \n",
-       "31305                       uncharacterized LOC105377683   \n",
+       "19626                          uncharacterized LOC285626   \n",
+       "19627                       uncharacterized LOC105377683   \n",
+       "24698  killer cell immunoglobulin like receptor, two ...   \n",
+       "24699      killer cell immunoglobulin-like receptor 2DS1   \n",
+       "29514                golgin A6 family like 3, pseudogene   \n",
+       "29515               golgin A6 family like 17, pseudogene   \n",
        "\n",
        "                                                 summary        symbol  \\\n",
-       "6011                                                 NaN  LOC128966722   \n",
-       "6012                                                 NaN  LOC128966732   \n",
-       "6013                                                 NaN  LOC128966730   \n",
-       "6014                                                 NaN  LOC128966731   \n",
-       "6015                                                 NaN  LOC128966733   \n",
-       "12139                                                NaN     GOLGA6L3P   \n",
-       "12140                                                NaN    GOLGA6L17P   \n",
-       "23329                                                NaN  LOC124900571   \n",
-       "23330  Killer cell immunoglobulin-like receptors (KIR...       KIR2DL1   \n",
-       "31304                                                NaN     LOC285626   \n",
-       "31305                                                NaN  LOC105377683   \n",
+       "19626                                                NaN     LOC285626   \n",
+       "19627                                                NaN  LOC105377683   \n",
+       "24698  Killer cell immunoglobulin-like receptors (KIR...       KIR2DL1   \n",
+       "24699                                                NaN  LOC124900571   \n",
+       "29514                                                NaN     GOLGA6L3P   \n",
+       "29515                                                NaN    GOLGA6L17P   \n",
        "\n",
        "         type_of_gene notfound  \n",
-       "6011   protein-coding      NaN  \n",
-       "6012   protein-coding      NaN  \n",
-       "6013   protein-coding      NaN  \n",
-       "6014   protein-coding      NaN  \n",
-       "6015   protein-coding      NaN  \n",
-       "12139          pseudo      NaN  \n",
-       "12140          pseudo      NaN  \n",
-       "23329  protein-coding      NaN  \n",
-       "23330  protein-coding      NaN  \n",
-       "31304           ncRNA      NaN  \n",
-       "31305           ncRNA      NaN  "
+       "19626           ncRNA      NaN  \n",
+       "19627           ncRNA      NaN  \n",
+       "24698  protein-coding      NaN  \n",
+       "24699  protein-coding      NaN  \n",
+       "29514          pseudo      NaN  \n",
+       "29515          pseudo      NaN  "
       ]
      },
-     "execution_count": 11,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1097,7 +933,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "093a2e98",
    "metadata": {},
    "outputs": [
@@ -1105,7 +941,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "4 duplicated genes have been processed.\n"
+      "3 duplicated genes have been processed.\n"
      ]
     },
     {
@@ -1142,107 +978,95 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>37442</th>\n",
-       "      <td>ENSG00000163811</td>\n",
-       "      <td>23160</td>\n",
+       "      <th>35848</th>\n",
+       "      <td>ENSG00000085998</td>\n",
+       "      <td>55624</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[NET12, UTP5]</td>\n",
-       "      <td>WD repeat domain 43</td>\n",
-       "      <td>Enables RNA binding activity. Involved in posi...</td>\n",
-       "      <td>WDR43</td>\n",
+       "      <td>[RP76, LGMDR15, LGMD2O, gnT-I.2, GNTI.2, GnT I...</td>\n",
+       "      <td>protein O-linked mannose N-acetylglucosaminylt...</td>\n",
+       "      <td>This gene encodes a type II transmembrane prot...</td>\n",
+       "      <td>POMGNT1</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37443</th>\n",
-       "      <td>ENSG00000226467</td>\n",
-       "      <td>10554</td>\n",
+       "      <th>35849</th>\n",
+       "      <td>ENSG00000285081</td>\n",
+       "      <td>ENSG00000285081</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[G15, LPLAT1, 1-AGPAT1, LPAATA, LPAAT-alpha]</td>\n",
-       "      <td>1-acylglycerol-3-phosphate O-acyltransferase 1</td>\n",
-       "      <td>This gene encodes an enzyme that converts lyso...</td>\n",
-       "      <td>AGPAT1</td>\n",
-       "      <td>protein-coding</td>\n",
+       "      <td>[]</td>\n",
        "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37444</th>\n",
-       "      <td>ENSG00000120533</td>\n",
-       "      <td>56943</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[Sus1, e(y)2, DC6]</td>\n",
-       "      <td>ENY2 transcription and export complex 2 subunit</td>\n",
-       "      <td>Enables nuclear receptor coactivator activity....</td>\n",
-       "      <td>ENY2</td>\n",
-       "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37445</th>\n",
-       "      <td>ENSG00000214759</td>\n",
-       "      <td>ENSG00000214759</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>ribosomal protein L36a pseudogene 2</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>RPL36AP2</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37446</th>\n",
-       "      <td>ENSG00000253981</td>\n",
-       "      <td>ENSG00000253981</td>\n",
+       "      <th>35850</th>\n",
+       "      <td>ENSG00000126822</td>\n",
+       "      <td>26030</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>ALG1 like 13, pseudogene</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ALG1L13P</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>[ARHGEF43, KIAA0599]</td>\n",
+       "      <td>pleckstrin homology and RhoGEF domain containi...</td>\n",
+       "      <td>Predicted to enable guanyl-nucleotide exchange...</td>\n",
+       "      <td>PLEKHG3</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37447</th>\n",
-       "      <td>ENSG00000267206</td>\n",
-       "      <td>158062</td>\n",
+       "      <th>35851</th>\n",
+       "      <td>ENSG00000187240</td>\n",
+       "      <td>79659</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[hLcn5, LCN5, UNQ643]</td>\n",
-       "      <td>lipocalin 6</td>\n",
-       "      <td>Predicted to enable small molecule binding act...</td>\n",
-       "      <td>LCN6</td>\n",
+       "      <td>[DHC2, hdhc11, DNCH2, SRTD3, SRPS2B, ATD3, DHC...</td>\n",
+       "      <td>dynein cytoplasmic 2 heavy chain 1</td>\n",
+       "      <td>This gene encodes a large cytoplasmic dynein p...</td>\n",
+       "      <td>DYNC2H1</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37448</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>3802</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
+       "      <th>35852</th>\n",
+       "      <td>ENSG00000101470</td>\n",
+       "      <td>7125</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>[CMYP15, CMYO15, CFAP85, FAP85, MYONRI]</td>\n",
+       "      <td>troponin C2, fast skeletal type</td>\n",
+       "      <td>Troponin (Tn), a key protein complex in the re...</td>\n",
+       "      <td>TNNC2</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37449</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>128966722</td>\n",
+       "      <th>35853</th>\n",
+       "      <td>ENSG00000241472</td>\n",
+       "      <td>100506994</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[LOC128966730, LOC128966732, LOC128966731, LOC...</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>PTPRG antisense RNA 1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>PTPRG-AS1</td>\n",
+       "      <td>ncRNA</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC128966722</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35854</th>\n",
+       "      <td>ENSG00000133106</td>\n",
+       "      <td>94240</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>[BRESI1]</td>\n",
+       "      <td>epithelial stromal interaction 1</td>\n",
+       "      <td>The protein encoded by this gene has been show...</td>\n",
+       "      <td>EPSTI1</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37450</th>\n",
+       "      <th>35855</th>\n",
        "      <td>ENSG00000230373</td>\n",
        "      <td>100133220</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[GOLGA6L21P, GOLGA6L17P, GOLGA6L3]</td>\n",
+       "      <td>[GOLGA6L3, GOLGA6L21P, GOLGA6L17P]</td>\n",
        "      <td>golgin A6 family like 3, pseudogene</td>\n",
        "      <td>NaN</td>\n",
        "      <td>GOLGA6L3P</td>\n",
@@ -1250,7 +1074,7 @@
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37451</th>\n",
+       "      <th>35856</th>\n",
        "      <td>ENSG00000249738</td>\n",
        "      <td>285626</td>\n",
        "      <td>1.0</td>\n",
@@ -1261,80 +1085,91 @@
        "      <td>ncRNA</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35857</th>\n",
+       "      <td>ENSG00000276387</td>\n",
+       "      <td>3802</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>[CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...</td>\n",
+       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
+       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
+       "      <td>KIR2DL1</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
        "       ensembl_gene_id              _id _version  \\\n",
-       "37442  ENSG00000163811            23160      1.0   \n",
-       "37443  ENSG00000226467            10554      1.0   \n",
-       "37444  ENSG00000120533            56943      1.0   \n",
-       "37445  ENSG00000214759  ENSG00000214759      1.0   \n",
-       "37446  ENSG00000253981  ENSG00000253981      1.0   \n",
-       "37447  ENSG00000267206           158062      1.0   \n",
-       "37448  ENSG00000276387             3802      2.0   \n",
-       "37449  ENSG00000276518        128966722      1.0   \n",
-       "37450  ENSG00000230373        100133220      1.0   \n",
-       "37451  ENSG00000249738           285626      1.0   \n",
+       "35848  ENSG00000085998            55624      1.0   \n",
+       "35849  ENSG00000285081  ENSG00000285081      1.0   \n",
+       "35850  ENSG00000126822            26030      1.0   \n",
+       "35851  ENSG00000187240            79659      1.0   \n",
+       "35852  ENSG00000101470             7125      1.0   \n",
+       "35853  ENSG00000241472        100506994      1.0   \n",
+       "35854  ENSG00000133106            94240      1.0   \n",
+       "35855  ENSG00000230373        100133220      1.0   \n",
+       "35856  ENSG00000249738           285626      1.0   \n",
+       "35857  ENSG00000276387             3802      1.0   \n",
        "\n",
        "                                                   alias  \\\n",
-       "37442                                      [NET12, UTP5]   \n",
-       "37443       [G15, LPLAT1, 1-AGPAT1, LPAATA, LPAAT-alpha]   \n",
-       "37444                                 [Sus1, e(y)2, DC6]   \n",
-       "37445                                                 []   \n",
-       "37446                                                 []   \n",
-       "37447                              [hLcn5, LCN5, UNQ643]   \n",
-       "37448  [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...   \n",
-       "37449  [LOC128966730, LOC128966732, LOC128966731, LOC...   \n",
-       "37450                 [GOLGA6L21P, GOLGA6L17P, GOLGA6L3]   \n",
-       "37451                                     [LOC105377683]   \n",
+       "35848  [RP76, LGMDR15, LGMD2O, gnT-I.2, GNTI.2, GnT I...   \n",
+       "35849                                                 []   \n",
+       "35850                               [ARHGEF43, KIAA0599]   \n",
+       "35851  [DHC2, hdhc11, DNCH2, SRTD3, SRPS2B, ATD3, DHC...   \n",
+       "35852            [CMYP15, CMYO15, CFAP85, FAP85, MYONRI]   \n",
+       "35853                                                 []   \n",
+       "35854                                           [BRESI1]   \n",
+       "35855                 [GOLGA6L3, GOLGA6L21P, GOLGA6L17P]   \n",
+       "35856                                     [LOC105377683]   \n",
+       "35857  [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...   \n",
        "\n",
        "                                                    name  \\\n",
-       "37442                                WD repeat domain 43   \n",
-       "37443     1-acylglycerol-3-phosphate O-acyltransferase 1   \n",
-       "37444    ENY2 transcription and export complex 2 subunit   \n",
-       "37445                ribosomal protein L36a pseudogene 2   \n",
-       "37446                           ALG1 like 13, pseudogene   \n",
-       "37447                                        lipocalin 6   \n",
-       "37448  killer cell immunoglobulin like receptor, two ...   \n",
-       "37449  putative killer cell immunoglobulin-like recep...   \n",
-       "37450                golgin A6 family like 3, pseudogene   \n",
-       "37451                          uncharacterized LOC285626   \n",
+       "35848  protein O-linked mannose N-acetylglucosaminylt...   \n",
+       "35849                                                NaN   \n",
+       "35850  pleckstrin homology and RhoGEF domain containi...   \n",
+       "35851                 dynein cytoplasmic 2 heavy chain 1   \n",
+       "35852                    troponin C2, fast skeletal type   \n",
+       "35853                              PTPRG antisense RNA 1   \n",
+       "35854                   epithelial stromal interaction 1   \n",
+       "35855                golgin A6 family like 3, pseudogene   \n",
+       "35856                          uncharacterized LOC285626   \n",
+       "35857  killer cell immunoglobulin like receptor, two ...   \n",
        "\n",
-       "                                                 summary        symbol  \\\n",
-       "37442  Enables RNA binding activity. Involved in posi...         WDR43   \n",
-       "37443  This gene encodes an enzyme that converts lyso...        AGPAT1   \n",
-       "37444  Enables nuclear receptor coactivator activity....          ENY2   \n",
-       "37445                                                NaN      RPL36AP2   \n",
-       "37446                                                NaN      ALG1L13P   \n",
-       "37447  Predicted to enable small molecule binding act...          LCN6   \n",
-       "37448  Killer cell immunoglobulin-like receptors (KIR...       KIR2DL1   \n",
-       "37449                                                NaN  LOC128966722   \n",
-       "37450                                                NaN     GOLGA6L3P   \n",
-       "37451                                                NaN     LOC285626   \n",
+       "                                                 summary     symbol  \\\n",
+       "35848  This gene encodes a type II transmembrane prot...    POMGNT1   \n",
+       "35849                                                NaN        NaN   \n",
+       "35850  Predicted to enable guanyl-nucleotide exchange...    PLEKHG3   \n",
+       "35851  This gene encodes a large cytoplasmic dynein p...    DYNC2H1   \n",
+       "35852  Troponin (Tn), a key protein complex in the re...      TNNC2   \n",
+       "35853                                                NaN  PTPRG-AS1   \n",
+       "35854  The protein encoded by this gene has been show...     EPSTI1   \n",
+       "35855                                                NaN  GOLGA6L3P   \n",
+       "35856                                                NaN  LOC285626   \n",
+       "35857  Killer cell immunoglobulin-like receptors (KIR...    KIR2DL1   \n",
        "\n",
        "         type_of_gene notfound  \n",
-       "37442  protein-coding      NaN  \n",
-       "37443  protein-coding      NaN  \n",
-       "37444  protein-coding      NaN  \n",
-       "37445             NaN      NaN  \n",
-       "37446             NaN      NaN  \n",
-       "37447  protein-coding      NaN  \n",
-       "37448  protein-coding      NaN  \n",
-       "37449  protein-coding      NaN  \n",
-       "37450          pseudo      NaN  \n",
-       "37451           ncRNA      NaN  "
+       "35848  protein-coding      NaN  \n",
+       "35849             NaN      NaN  \n",
+       "35850  protein-coding      NaN  \n",
+       "35851  protein-coding      NaN  \n",
+       "35852  protein-coding      NaN  \n",
+       "35853           ncRNA      NaN  \n",
+       "35854  protein-coding      NaN  \n",
+       "35855          pseudo      NaN  \n",
+       "35856           ncRNA      NaN  \n",
+       "35857  protein-coding      NaN  "
       ]
      },
-     "execution_count": 12,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "non_dupes = set(gene_table_merged.index) - set(all_duplicated.index)\n",
-    "keep_df = gene_table_merged.loc[list(non_dupes)].copy(deep=True)\n",
+    "keep_df = gene_table_merged.drop(all_duplicated.index)\n",
     "\n",
     "# For each duplicated Ensembl ID, collapse to 1 row and append that row to keep_df\n",
     "for ens_id in set(all_duplicated[\"ensembl_gene_id\"]):\n",
@@ -1383,7 +1218,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "id": "4a1bbdee",
    "metadata": {
     "scrolled": true
@@ -1395,27 +1230,26 @@
      "text": [
       "             name     date                                 url version\n",
       "1  Ensembl GRCh37 Feb 2014          https://grch37.ensembl.org  GRCh37\n",
-      "2     Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org     111\n",
-      "3     Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org     110\n",
-      "4     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109\n",
-      "5     Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org     108\n",
-      "6     Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org     107\n",
-      "7     Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org     106\n",
-      "8     Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org     105\n",
-      "9     Ensembl 104 May 2021 https://may2021.archive.ensembl.org     104\n",
-      "10    Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org     103\n",
-      "11    Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org     102\n",
-      "12    Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org     101\n",
-      "13    Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org     100\n",
-      "14     Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org      99\n",
-      "15     Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org      98\n",
-      "16     Ensembl 97 Jul 2019 https://jul2019.archive.ensembl.org      97\n",
-      "17     Ensembl 96 Apr 2019 https://apr2019.archive.ensembl.org      96\n",
-      "18     Ensembl 95 Jan 2019 https://jan2019.archive.ensembl.org      95\n",
-      "19     Ensembl 80 May 2015 https://may2015.archive.ensembl.org      80\n",
-      "20     Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org      77\n",
-      "21     Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org      75\n",
-      "22     Ensembl 54 May 2009 https://may2009.archive.ensembl.org      54\n",
+      "2     Ensembl 113 Oct 2024 https://oct2024.archive.ensembl.org     113\n",
+      "3     Ensembl 112 May 2024 https://may2024.archive.ensembl.org     112\n",
+      "4     Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org     111\n",
+      "5     Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org     110\n",
+      "6     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109\n",
+      "7     Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org     108\n",
+      "8     Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org     107\n",
+      "9     Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org     106\n",
+      "10    Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org     105\n",
+      "11    Ensembl 104 May 2021 https://may2021.archive.ensembl.org     104\n",
+      "12    Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org     103\n",
+      "13    Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org     102\n",
+      "14    Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org     101\n",
+      "15    Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org     100\n",
+      "16     Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org      99\n",
+      "17     Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org      98\n",
+      "18     Ensembl 80 May 2015 https://may2015.archive.ensembl.org      80\n",
+      "19     Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org      77\n",
+      "20     Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org      75\n",
+      "21     Ensembl 54 May 2009 https://may2009.archive.ensembl.org      54\n",
       "   current_release\n",
       "1                 \n",
       "2                *\n",
@@ -1438,7 +1272,6 @@
       "19                \n",
       "20                \n",
       "21                \n",
-      "22                \n",
       "\n"
      ]
     }
@@ -1462,7 +1295,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "id": "9a747309",
    "metadata": {
     "scrolled": true
@@ -1472,7 +1305,6 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "37452\n",
       "Querying genes 1 - 1000\n",
       "Querying genes 1001 - 2000\n",
       "Querying genes 2001 - 3000\n",
@@ -1508,10 +1340,7 @@
       "Querying genes 32001 - 33000\n",
       "Querying genes 33001 - 34000\n",
       "Querying genes 34001 - 35000\n",
-      "Querying genes 35001 - 36000\n",
-      "Querying genes 36001 - 37000\n",
-      "Querying genes 37001 - 37452\n",
-      "37452\n"
+      "Querying genes 35001 - 35858\n"
      ]
     },
     {
@@ -1535,156 +1364,114 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>is_current</th>\n",
        "      <th>assembly</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>type</th>\n",
        "      <th>peptide</th>\n",
-       "      <th>latest</th>\n",
        "      <th>possible_replacement</th>\n",
        "      <th>release</th>\n",
+       "      <th>latest</th>\n",
+       "      <th>type</th>\n",
+       "      <th>id</th>\n",
+       "      <th>version</th>\n",
+       "      <th>is_current</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>37447</th>\n",
-       "      <td>1</td>\n",
+       "      <th>35853</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000267206</td>\n",
-       "      <td>6</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000267206.6</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000241472.9</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000241472</td>\n",
+       "      <td>9</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37448</th>\n",
-       "      <td>1</td>\n",
+       "      <th>35854</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>4</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000276387.4</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000133106.15</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000133106</td>\n",
+       "      <td>15</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37449</th>\n",
-       "      <td>1</td>\n",
+       "      <th>35855</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000276518.1</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000230373.9</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000230373</td>\n",
+       "      <td>9</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37450</th>\n",
-       "      <td>1</td>\n",
+       "      <th>35856</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>9</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000230373.9</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000249738.11</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000249738</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37451</th>\n",
-       "      <td>1</td>\n",
+       "      <th>35857</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>10</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000249738.10</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000276387.4</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000276387</td>\n",
+       "      <td>4</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "      is_current assembly               id  version  type peptide  \\\n",
-       "37447          1   GRCh38  ENSG00000267206        6  Gene    None   \n",
-       "37448          1   GRCh38  ENSG00000276387        4  Gene    None   \n",
-       "37449          1   GRCh38  ENSG00000276518        1  Gene    None   \n",
-       "37450          1   GRCh38  ENSG00000230373        9  Gene    None   \n",
-       "37451          1   GRCh38  ENSG00000249738       10  Gene    None   \n",
+       "      assembly peptide possible_replacement release              latest  type  \\\n",
+       "35853   GRCh38    None                   []     113   ENSG00000241472.9  Gene   \n",
+       "35854   GRCh38    None                   []     113  ENSG00000133106.15  Gene   \n",
+       "35855   GRCh38    None                   []     113   ENSG00000230373.9  Gene   \n",
+       "35856   GRCh38    None                   []     113  ENSG00000249738.11  Gene   \n",
+       "35857   GRCh38    None                   []     113   ENSG00000276387.4  Gene   \n",
        "\n",
-       "                   latest possible_replacement release  \n",
-       "37447   ENSG00000267206.6                   []     111  \n",
-       "37448   ENSG00000276387.4                   []     111  \n",
-       "37449   ENSG00000276518.1                   []     111  \n",
-       "37450   ENSG00000230373.9                   []     111  \n",
-       "37451  ENSG00000249738.10                   []     111  "
+       "                    id  version is_current  \n",
+       "35853  ENSG00000241472        9          1  \n",
+       "35854  ENSG00000133106       15          1  \n",
+       "35855  ENSG00000230373        9          1  \n",
+       "35856  ENSG00000249738       11          1  \n",
+       "35857  ENSG00000276387        4          1  "
       ]
      },
-     "execution_count": 14,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "url = \"https://rest.ensembl.org/archive/id\"\n",
-    "headers = {\"Content-Type\": \"application/json\", \"Accept\": \"application/json\"}\n",
-    "\n",
-    "ids = gene_table_merged[\"ensembl_gene_id\"].tolist()\n",
-    "print(len(ids))\n",
-    "\n",
-    "# We can only query 1000 genes at a time\n",
-    "batch_ind = range(0, len(ids), 1000)\n",
-    "results = []\n",
-    "\n",
-    "for B in batch_ind:\n",
-    "    end = min(len(ids), B + 1000)\n",
-    "    print(\"Querying genes \" + str(B + 1) + \" - \" + str(end))\n",
-    "\n",
-    "    request_data = '{ \"id\" : ' + str(ids[B:end]) + \" }\"\n",
-    "    request_data = request_data.replace(\"'\", '\"')\n",
-    "\n",
-    "    ok = False\n",
-    "    tries = 0\n",
-    "\n",
-    "    while tries < 5 and not ok:\n",
-    "        try:\n",
-    "            res = requests.post(url, headers=headers, data=request_data)\n",
-    "            ok = res.ok\n",
-    "        except:\n",
-    "            ok = False\n",
-    "\n",
-    "        tries = tries + 1\n",
-    "\n",
-    "        if not ok:\n",
-    "            # res.raise_for_status()\n",
-    "            print(\n",
-    "                \"Error retrieving Ensembl versions for genes \"\n",
-    "                + str(B + 1)\n",
-    "                + \" - \"\n",
-    "                + str(end)\n",
-    "                + \". Trying again...\"\n",
-    "            )\n",
-    "        else:\n",
-    "            results = results + res.json()\n",
-    "            break\n",
-    "\n",
-    "print(len(results))\n",
-    "\n",
-    "versions = pd.json_normalize(results)\n",
+    "versions = preprocessing_utils.query_ensembl_version_api(\n",
+    "    ensembl_ids=gene_table_merged[\"ensembl_gene_id\"].tolist()\n",
+    ")\n",
     "\n",
     "versions.tail()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "id": "5c108238",
    "metadata": {},
    "outputs": [
@@ -1692,35 +1479,37 @@
      "data": {
       "text/plain": [
        "release\n",
-       "100       22\n",
+       "100       21\n",
        "101        8\n",
        "102       16\n",
-       "103       15\n",
-       "104       19\n",
-       "105        9\n",
+       "103       12\n",
+       "104       17\n",
+       "105       10\n",
        "106       35\n",
-       "107       10\n",
+       "107       12\n",
        "108        4\n",
        "109        4\n",
        "110       11\n",
-       "111    36286\n",
+       "111       52\n",
+       "112      354\n",
+       "113    34303\n",
        "80        21\n",
        "81         2\n",
        "82        10\n",
        "84       673\n",
        "87        61\n",
        "89        20\n",
-       "91        75\n",
-       "93        53\n",
+       "91        67\n",
+       "93        50\n",
        "95        33\n",
        "96        31\n",
-       "97        18\n",
+       "97        17\n",
        "98         9\n",
-       "99         7\n",
+       "99         5\n",
        "dtype: int64"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1731,7 +1520,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "id": "bf5aecb1",
    "metadata": {
     "scrolled": true
@@ -1741,8 +1530,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "37452\n",
-      "37452\n",
+      "35858\n",
+      "35858\n",
       "True\n"
      ]
     }
@@ -1759,7 +1548,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "id": "7fc8bbcd",
    "metadata": {},
    "outputs": [
@@ -1769,7 +1558,7 @@
        "True"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1791,7 +1580,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 17,
    "id": "0d5b5652",
    "metadata": {
     "scrolled": true
@@ -1815,7 +1604,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
    "id": "337b2890",
    "metadata": {},
    "outputs": [
@@ -1823,28 +1612,27 @@
      "data": {
       "text/plain": [
        "closest_release\n",
-       "80       915\n",
-       "95        33\n",
-       "96        31\n",
-       "97        18\n",
+       "80       985\n",
        "98         9\n",
-       "99         7\n",
-       "100       22\n",
+       "99         5\n",
+       "100       21\n",
        "101        8\n",
        "102       16\n",
-       "103       15\n",
-       "104       19\n",
-       "105        9\n",
+       "103       12\n",
+       "104       17\n",
+       "105       10\n",
        "106       35\n",
-       "107       10\n",
+       "107       12\n",
        "108        4\n",
        "109        4\n",
        "110       11\n",
-       "111    36286\n",
+       "111       52\n",
+       "112      354\n",
+       "113    34303\n",
        "dtype: int64"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1865,7 +1653,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 19,
    "id": "343e5006",
    "metadata": {
     "scrolled": false
@@ -1892,15 +1680,15 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>is_current</th>\n",
        "      <th>assembly</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>type</th>\n",
        "      <th>peptide</th>\n",
-       "      <th>latest</th>\n",
        "      <th>possible_replacement</th>\n",
        "      <th>release</th>\n",
+       "      <th>latest</th>\n",
+       "      <th>type</th>\n",
+       "      <th>id</th>\n",
+       "      <th>version</th>\n",
+       "      <th>is_current</th>\n",
        "      <th>closest_release</th>\n",
        "      <th>permalink</th>\n",
        "    </tr>\n",
@@ -1908,102 +1696,102 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>1</td>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000164972</td>\n",
-       "      <td>14</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000164972.14</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000151650.8</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000151650</td>\n",
+       "      <td>8</td>\n",
+       "      <td>1</td>\n",
+       "      <td>113</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>1</td>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000169105</td>\n",
-       "      <td>8</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000169105.8</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000168268.11</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000168268</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1</td>\n",
+       "      <td>113</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>1</td>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>3</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000255136.3</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000186310.10</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000186310</td>\n",
+       "      <td>10</td>\n",
+       "      <td>1</td>\n",
+       "      <td>113</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>1</td>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000105499</td>\n",
-       "      <td>14</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000105499.14</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000204616.11</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000204616</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1</td>\n",
+       "      <td>113</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>1</td>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000104611</td>\n",
-       "      <td>12</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000104611.12</td>\n",
        "      <td>[]</td>\n",
-       "      <td>111</td>\n",
-       "      <td>111</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>113</td>\n",
+       "      <td>ENSG00000158467.17</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000158467</td>\n",
+       "      <td>17</td>\n",
+       "      <td>1</td>\n",
+       "      <td>113</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "  is_current assembly               id  version  type peptide  \\\n",
-       "0          1   GRCh38  ENSG00000164972       14  Gene    None   \n",
-       "1          1   GRCh38  ENSG00000169105        8  Gene    None   \n",
-       "2          1   GRCh38  ENSG00000255136        3  Gene    None   \n",
-       "3          1   GRCh38  ENSG00000105499       14  Gene    None   \n",
-       "4          1   GRCh38  ENSG00000104611       12  Gene    None   \n",
+       "  assembly peptide possible_replacement release              latest  type  \\\n",
+       "0   GRCh38    None                   []     113   ENSG00000151650.8  Gene   \n",
+       "1   GRCh38    None                   []     113  ENSG00000168268.11  Gene   \n",
+       "2   GRCh38    None                   []     113  ENSG00000186310.10  Gene   \n",
+       "3   GRCh38    None                   []     113  ENSG00000204616.11  Gene   \n",
+       "4   GRCh38    None                   []     113  ENSG00000158467.17  Gene   \n",
        "\n",
-       "               latest possible_replacement release  closest_release  \\\n",
-       "0  ENSG00000164972.14                   []     111              111   \n",
-       "1   ENSG00000169105.8                   []     111              111   \n",
-       "2   ENSG00000255136.3                   []     111              111   \n",
-       "3  ENSG00000105499.14                   []     111              111   \n",
-       "4  ENSG00000104611.12                   []     111              111   \n",
+       "                id  version is_current  closest_release  \\\n",
+       "0  ENSG00000151650        8          1              113   \n",
+       "1  ENSG00000168268       11          1              113   \n",
+       "2  ENSG00000186310       10          1              113   \n",
+       "3  ENSG00000204616       11          1              113   \n",
+       "4  ENSG00000158467       17          1              113   \n",
        "\n",
        "                                           permalink  \n",
-       "0  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4  https://jan2024.archive.ensembl.org/Homo_sapie...  "
+       "0  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "1  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "2  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "3  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "4  https://oct2024.archive.ensembl.org/Homo_sapie...  "
       ]
      },
-     "execution_count": 20,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2024,7 +1812,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 20,
    "id": "4b01719d",
    "metadata": {},
    "outputs": [
@@ -2049,87 +1837,87 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>is_current</th>\n",
        "      <th>assembly</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>type</th>\n",
        "      <th>peptide</th>\n",
-       "      <th>latest</th>\n",
        "      <th>possible_replacement</th>\n",
        "      <th>release</th>\n",
+       "      <th>latest</th>\n",
+       "      <th>type</th>\n",
+       "      <th>id</th>\n",
+       "      <th>version</th>\n",
+       "      <th>is_current</th>\n",
        "      <th>closest_release</th>\n",
        "      <th>permalink</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>51</th>\n",
-       "      <td></td>\n",
+       "      <th>67</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000266701</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000266701.1</td>\n",
        "      <td>[]</td>\n",
        "      <td>84</td>\n",
+       "      <td>ENSG00000265108.1</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000265108</td>\n",
+       "      <td>1</td>\n",
+       "      <td></td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>99</th>\n",
-       "      <td></td>\n",
+       "      <th>68</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000268225</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000268225.2</td>\n",
        "      <td>[]</td>\n",
-       "      <td>98</td>\n",
-       "      <td>98</td>\n",
-       "      <td>https://sep2019.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>80</td>\n",
+       "      <td>ENSG00000280803.1</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000280803</td>\n",
+       "      <td>1</td>\n",
+       "      <td></td>\n",
+       "      <td>80</td>\n",
+       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>119</th>\n",
-       "      <td></td>\n",
+       "      <th>111</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000281018</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000281018.1</td>\n",
        "      <td>[]</td>\n",
        "      <td>84</td>\n",
+       "      <td>ENSG00000281672.1</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000281672</td>\n",
+       "      <td>1</td>\n",
+       "      <td></td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>120</th>\n",
-       "      <td></td>\n",
+       "      <th>135</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000216011</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000216011.2</td>\n",
        "      <td>[]</td>\n",
-       "      <td>84</td>\n",
+       "      <td>87</td>\n",
+       "      <td>ENSG00000279857.1</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000279857</td>\n",
+       "      <td>1</td>\n",
+       "      <td></td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>135</th>\n",
-       "      <td></td>\n",
+       "      <th>141</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000264103</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Gene</td>\n",
        "      <td>None</td>\n",
-       "      <td>ENSG00000264103.1</td>\n",
        "      <td>[]</td>\n",
        "      <td>84</td>\n",
+       "      <td>ENSG00000274483.1</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>ENSG00000274483</td>\n",
+       "      <td>1</td>\n",
+       "      <td></td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
@@ -2138,29 +1926,29 @@
        "</div>"
       ],
       "text/plain": [
-       "    is_current assembly               id  version  type peptide  \\\n",
-       "51               GRCh38  ENSG00000266701        1  Gene    None   \n",
-       "99               GRCh38  ENSG00000268225        2  Gene    None   \n",
-       "119              GRCh38  ENSG00000281018        1  Gene    None   \n",
-       "120              GRCh38  ENSG00000216011        2  Gene    None   \n",
-       "135              GRCh38  ENSG00000264103        1  Gene    None   \n",
+       "    assembly peptide possible_replacement release             latest  type  \\\n",
+       "67    GRCh38    None                   []      84  ENSG00000265108.1  Gene   \n",
+       "68    GRCh38    None                   []      80  ENSG00000280803.1  Gene   \n",
+       "111   GRCh38    None                   []      84  ENSG00000281672.1  Gene   \n",
+       "135   GRCh38    None                   []      87  ENSG00000279857.1  Gene   \n",
+       "141   GRCh38    None                   []      84  ENSG00000274483.1  Gene   \n",
        "\n",
-       "                latest possible_replacement release  closest_release  \\\n",
-       "51   ENSG00000266701.1                   []      84               80   \n",
-       "99   ENSG00000268225.2                   []      98               98   \n",
-       "119  ENSG00000281018.1                   []      84               80   \n",
-       "120  ENSG00000216011.2                   []      84               80   \n",
-       "135  ENSG00000264103.1                   []      84               80   \n",
+       "                  id  version is_current  closest_release  \\\n",
+       "67   ENSG00000265108        1                          80   \n",
+       "68   ENSG00000280803        1                          80   \n",
+       "111  ENSG00000281672        1                          80   \n",
+       "135  ENSG00000279857        1                          80   \n",
+       "141  ENSG00000274483        1                          80   \n",
        "\n",
        "                                             permalink  \n",
-       "51   https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "99   https://sep2019.archive.ensembl.org/Homo_sapie...  \n",
-       "119  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "120  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "135  https://may2015.archive.ensembl.org/Homo_sapie...  "
+       "67   https://may2015.archive.ensembl.org/Homo_sapie...  \n",
+       "68   https://may2015.archive.ensembl.org/Homo_sapie...  \n",
+       "111  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
+       "135  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
+       "141  https://may2015.archive.ensembl.org/Homo_sapie...  "
       ]
      },
-     "execution_count": 21,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2171,7 +1959,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 21,
    "id": "c4128cc9",
    "metadata": {},
    "outputs": [
@@ -2179,8 +1967,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000164972\n",
-      "https://jul2023.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000279049\n"
+      "https://oct2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000151650\n",
+      "https://oct2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000142192\n"
      ]
     }
    ],
@@ -2191,7 +1979,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 22,
    "id": "73791e6c",
    "metadata": {},
    "outputs": [
@@ -2201,7 +1989,7 @@
        "True"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2222,7 +2010,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 23,
    "id": "f3edfd2f",
    "metadata": {},
    "outputs": [
@@ -2230,7 +2018,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(37452, 12)\n"
+      "(35858, 12)\n"
      ]
     },
     {
@@ -2271,128 +2059,121 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>ENSG00000164972</td>\n",
-       "      <td>84688</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]</td>\n",
-       "      <td>sperm microtubule inner protein 6</td>\n",
-       "      <td>This gene encodes a nuclear- or perinuclear-lo...</td>\n",
-       "      <td>SPMIP6</td>\n",
+       "      <td>ENSG00000151650</td>\n",
+       "      <td>27287</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>[NA88A, HPX42B, VENTX2]</td>\n",
+       "      <td>VENT homeobox</td>\n",
+       "      <td>This gene encodes a member of the Vent family ...</td>\n",
+       "      <td>VENTX</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>ENSG00000169105</td>\n",
-       "      <td>113189</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>[ATCS, EDSMC1, HNK1ST, D4ST1]</td>\n",
-       "      <td>carbohydrate sulfotransferase 14</td>\n",
-       "      <td>This gene encodes a member of the HNK-1 family...</td>\n",
-       "      <td>CHST14</td>\n",
+       "      <td>ENSG00000168268</td>\n",
+       "      <td>64943</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>5'-nucleotidase domain containing 2</td>\n",
+       "      <td>Predicted to enable 5'-nucleotidase activity. ...</td>\n",
+       "      <td>NT5DC2</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>ENSG00000255136</td>\n",
+       "      <td>ENSG00000186310</td>\n",
+       "      <td>4675</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>TPBGL antisense RNA 1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL-AS1</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>[MB20, NPL3]</td>\n",
+       "      <td>nucleosome assembly protein 1 like 3</td>\n",
+       "      <td>This gene is intronless and encodes a member o...</td>\n",
+       "      <td>NAP1L3</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>ENSG00000105499</td>\n",
-       "      <td>8605</td>\n",
+       "      <td>ENSG00000204616</td>\n",
+       "      <td>11074</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[CPLA2-gamma]</td>\n",
-       "      <td>phospholipase A2 group IVC</td>\n",
-       "      <td>This gene encodes a protein which is a member ...</td>\n",
-       "      <td>PLA2G4C</td>\n",
+       "      <td>[C6orf13, RNF, HCGI, HCG1]</td>\n",
+       "      <td>tripartite motif containing 31</td>\n",
+       "      <td>This gene encodes a protein that functions as ...</td>\n",
+       "      <td>TRIM31</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>ENSG00000104611</td>\n",
-       "      <td>63898</td>\n",
+       "      <td>ENSG00000158467</td>\n",
+       "      <td>23382</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>[PPP1R38, SH2A]</td>\n",
-       "      <td>SH2 domain containing 4A</td>\n",
-       "      <td>Enables phosphatase binding activity. Located ...</td>\n",
-       "      <td>SH2D4A</td>\n",
+       "      <td>[IRBIT2, ADOHCYASE3]</td>\n",
+       "      <td>adenosylhomocysteinase like 2</td>\n",
+       "      <td>The protein encoded by this gene acts as a hom...</td>\n",
+       "      <td>AHCYL2</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   ensembl_gene_id              _id _version  \\\n",
-       "0  ENSG00000164972            84688      2.0   \n",
-       "1  ENSG00000169105           113189      2.0   \n",
-       "2  ENSG00000255136  ENSG00000255136      1.0   \n",
-       "3  ENSG00000105499             8605      1.0   \n",
-       "4  ENSG00000104611            63898      1.0   \n",
-       "\n",
-       "                                          alias  \\\n",
-       "0  [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]   \n",
-       "1                 [ATCS, EDSMC1, HNK1ST, D4ST1]   \n",
-       "2                                            []   \n",
-       "3                                 [CPLA2-gamma]   \n",
-       "4                               [PPP1R38, SH2A]   \n",
+       "   ensembl_gene_id    _id _version                       alias  \\\n",
+       "0  ENSG00000151650  27287      1.0     [NA88A, HPX42B, VENTX2]   \n",
+       "1  ENSG00000168268  64943      1.0                          []   \n",
+       "2  ENSG00000186310   4675      1.0                [MB20, NPL3]   \n",
+       "3  ENSG00000204616  11074      1.0  [C6orf13, RNF, HCGI, HCG1]   \n",
+       "4  ENSG00000158467  23382      1.0        [IRBIT2, ADOHCYASE3]   \n",
        "\n",
-       "                                name  \\\n",
-       "0  sperm microtubule inner protein 6   \n",
-       "1   carbohydrate sulfotransferase 14   \n",
-       "2              TPBGL antisense RNA 1   \n",
-       "3         phospholipase A2 group IVC   \n",
-       "4           SH2 domain containing 4A   \n",
+       "                                   name  \\\n",
+       "0                         VENT homeobox   \n",
+       "1   5'-nucleotidase domain containing 2   \n",
+       "2  nucleosome assembly protein 1 like 3   \n",
+       "3        tripartite motif containing 31   \n",
+       "4         adenosylhomocysteinase like 2   \n",
        "\n",
-       "                                             summary     symbol  \\\n",
-       "0  This gene encodes a nuclear- or perinuclear-lo...     SPMIP6   \n",
-       "1  This gene encodes a member of the HNK-1 family...     CHST14   \n",
-       "2                                                NaN  TPBGL-AS1   \n",
-       "3  This gene encodes a protein which is a member ...    PLA2G4C   \n",
-       "4  Enables phosphatase binding activity. Located ...     SH2D4A   \n",
+       "                                             summary  symbol    type_of_gene  \\\n",
+       "0  This gene encodes a member of the Vent family ...   VENTX  protein-coding   \n",
+       "1  Predicted to enable 5'-nucleotidase activity. ...  NT5DC2  protein-coding   \n",
+       "2  This gene is intronless and encodes a member o...  NAP1L3  protein-coding   \n",
+       "3  This gene encodes a protein that functions as ...  TRIM31  protein-coding   \n",
+       "4  The protein encoded by this gene acts as a hom...  AHCYL2  protein-coding   \n",
        "\n",
-       "     type_of_gene notfound ensembl_release possible_replacement  \\\n",
-       "0  protein-coding      NaN             111                   []   \n",
-       "1  protein-coding      NaN             111                   []   \n",
-       "2             NaN      NaN             111                   []   \n",
-       "3  protein-coding      NaN             111                   []   \n",
-       "4  protein-coding      NaN             111                   []   \n",
+       "  notfound ensembl_release possible_replacement  \\\n",
+       "0      NaN             113                   []   \n",
+       "1      NaN             113                   []   \n",
+       "2      NaN             113                   []   \n",
+       "3      NaN             113                   []   \n",
+       "4      NaN             113                   []   \n",
        "\n",
        "                                           permalink  \n",
-       "0  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4  https://jan2024.archive.ensembl.org/Homo_sapie...  "
+       "0  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "1  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "2  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "3  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "4  https://oct2024.archive.ensembl.org/Homo_sapie...  "
       ]
      },
-     "execution_count": 24,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2430,7 +2211,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 24,
    "id": "d0c07b7a",
    "metadata": {},
    "outputs": [
@@ -2469,63 +2250,63 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>ENSG00000164972</td>\n",
-       "      <td>sperm microtubule inner protein 6</td>\n",
-       "      <td>[SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]</td>\n",
-       "      <td>This gene encodes a nuclear- or perinuclear-lo...</td>\n",
-       "      <td>SPMIP6</td>\n",
+       "      <td>ENSG00000151650</td>\n",
+       "      <td>VENT homeobox</td>\n",
+       "      <td>[NA88A, HPX42B, VENTX2]</td>\n",
+       "      <td>This gene encodes a member of the Vent family ...</td>\n",
+       "      <td>VENTX</td>\n",
        "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>ENSG00000169105</td>\n",
-       "      <td>carbohydrate sulfotransferase 14</td>\n",
-       "      <td>[ATCS, EDSMC1, HNK1ST, D4ST1]</td>\n",
-       "      <td>This gene encodes a member of the HNK-1 family...</td>\n",
-       "      <td>CHST14</td>\n",
+       "      <td>ENSG00000168268</td>\n",
+       "      <td>5'-nucleotidase domain containing 2</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>Predicted to enable 5'-nucleotidase activity. ...</td>\n",
+       "      <td>NT5DC2</td>\n",
        "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>ENSG00000255136</td>\n",
-       "      <td>TPBGL antisense RNA 1</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>TPBGL-AS1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>111</td>\n",
+       "      <td>ENSG00000186310</td>\n",
+       "      <td>nucleosome assembly protein 1 like 3</td>\n",
+       "      <td>[MB20, NPL3]</td>\n",
+       "      <td>This gene is intronless and encodes a member o...</td>\n",
+       "      <td>NAP1L3</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>ENSG00000105499</td>\n",
-       "      <td>phospholipase A2 group IVC</td>\n",
-       "      <td>[CPLA2-gamma]</td>\n",
-       "      <td>This gene encodes a protein which is a member ...</td>\n",
-       "      <td>PLA2G4C</td>\n",
+       "      <td>ENSG00000204616</td>\n",
+       "      <td>tripartite motif containing 31</td>\n",
+       "      <td>[C6orf13, RNF, HCGI, HCG1]</td>\n",
+       "      <td>This gene encodes a protein that functions as ...</td>\n",
+       "      <td>TRIM31</td>\n",
        "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>ENSG00000104611</td>\n",
-       "      <td>SH2 domain containing 4A</td>\n",
-       "      <td>[PPP1R38, SH2A]</td>\n",
-       "      <td>Enables phosphatase binding activity. Located ...</td>\n",
-       "      <td>SH2D4A</td>\n",
+       "      <td>ENSG00000158467</td>\n",
+       "      <td>adenosylhomocysteinase like 2</td>\n",
+       "      <td>[IRBIT2, ADOHCYASE3]</td>\n",
+       "      <td>The protein encoded by this gene acts as a hom...</td>\n",
+       "      <td>AHCYL2</td>\n",
        "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -2540,149 +2321,148 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37447</th>\n",
-       "      <td>ENSG00000267206</td>\n",
-       "      <td>lipocalin 6</td>\n",
-       "      <td>[hLcn5, LCN5, UNQ643]</td>\n",
-       "      <td>Predicted to enable small molecule binding act...</td>\n",
-       "      <td>LCN6</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
+       "      <th>35853</th>\n",
+       "      <td>ENSG00000241472</td>\n",
+       "      <td>PTPRG antisense RNA 1</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37448</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>[NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>PTPRG-AS1</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37449</th>\n",
-       "      <td>ENSG00000276518</td>\n",
-       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
-       "      <td>[LOC128966730, LOC128966732, LOC128966731, LOC...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC128966722</td>\n",
+       "      <th>35854</th>\n",
+       "      <td>ENSG00000133106</td>\n",
+       "      <td>epithelial stromal interaction 1</td>\n",
+       "      <td>[BRESI1]</td>\n",
+       "      <td>The protein encoded by this gene has been show...</td>\n",
+       "      <td>EPSTI1</td>\n",
        "      <td>protein-coding</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37450</th>\n",
+       "      <th>35855</th>\n",
        "      <td>ENSG00000230373</td>\n",
        "      <td>golgin A6 family like 3, pseudogene</td>\n",
-       "      <td>[GOLGA6L21P, GOLGA6L17P, GOLGA6L3]</td>\n",
+       "      <td>[GOLGA6L3, GOLGA6L21P, GOLGA6L17P]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>GOLGA6L3P</td>\n",
        "      <td>pseudo</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37451</th>\n",
+       "      <th>35856</th>\n",
        "      <td>ENSG00000249738</td>\n",
        "      <td>uncharacterized LOC285626</td>\n",
        "      <td>[LOC105377683]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>LOC285626</td>\n",
        "      <td>ncRNA</td>\n",
-       "      <td>111</td>\n",
+       "      <td>113</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35857</th>\n",
+       "      <td>ENSG00000276387</td>\n",
+       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
+       "      <td>[CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...</td>\n",
+       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
+       "      <td>KIR2DL1</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>113</td>\n",
        "      <td>[]</td>\n",
-       "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
+       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>37452 rows × 9 columns</p>\n",
+       "<p>35858 rows × 9 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
        "       ensembl_gene_id                                               name  \\\n",
-       "0      ENSG00000164972                  sperm microtubule inner protein 6   \n",
-       "1      ENSG00000169105                   carbohydrate sulfotransferase 14   \n",
-       "2      ENSG00000255136                              TPBGL antisense RNA 1   \n",
-       "3      ENSG00000105499                         phospholipase A2 group IVC   \n",
-       "4      ENSG00000104611                           SH2 domain containing 4A   \n",
+       "0      ENSG00000151650                                      VENT homeobox   \n",
+       "1      ENSG00000168268                5'-nucleotidase domain containing 2   \n",
+       "2      ENSG00000186310               nucleosome assembly protein 1 like 3   \n",
+       "3      ENSG00000204616                     tripartite motif containing 31   \n",
+       "4      ENSG00000158467                      adenosylhomocysteinase like 2   \n",
        "...                ...                                                ...   \n",
-       "37447  ENSG00000267206                                        lipocalin 6   \n",
-       "37448  ENSG00000276387  killer cell immunoglobulin like receptor, two ...   \n",
-       "37449  ENSG00000276518  putative killer cell immunoglobulin-like recep...   \n",
-       "37450  ENSG00000230373                golgin A6 family like 3, pseudogene   \n",
-       "37451  ENSG00000249738                          uncharacterized LOC285626   \n",
+       "35853  ENSG00000241472                              PTPRG antisense RNA 1   \n",
+       "35854  ENSG00000133106                   epithelial stromal interaction 1   \n",
+       "35855  ENSG00000230373                golgin A6 family like 3, pseudogene   \n",
+       "35856  ENSG00000249738                          uncharacterized LOC285626   \n",
+       "35857  ENSG00000276387  killer cell immunoglobulin like receptor, two ...   \n",
        "\n",
        "                                                   alias  \\\n",
-       "0           [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]   \n",
-       "1                          [ATCS, EDSMC1, HNK1ST, D4ST1]   \n",
-       "2                                                     []   \n",
-       "3                                          [CPLA2-gamma]   \n",
-       "4                                        [PPP1R38, SH2A]   \n",
+       "0                                [NA88A, HPX42B, VENTX2]   \n",
+       "1                                                     []   \n",
+       "2                                           [MB20, NPL3]   \n",
+       "3                             [C6orf13, RNF, HCGI, HCG1]   \n",
+       "4                                   [IRBIT2, ADOHCYASE3]   \n",
        "...                                                  ...   \n",
-       "37447                              [hLcn5, LCN5, UNQ643]   \n",
-       "37448  [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...   \n",
-       "37449  [LOC128966730, LOC128966732, LOC128966731, LOC...   \n",
-       "37450                 [GOLGA6L21P, GOLGA6L17P, GOLGA6L3]   \n",
-       "37451                                     [LOC105377683]   \n",
+       "35853                                                 []   \n",
+       "35854                                           [BRESI1]   \n",
+       "35855                 [GOLGA6L3, GOLGA6L21P, GOLGA6L17P]   \n",
+       "35856                                     [LOC105377683]   \n",
+       "35857  [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...   \n",
        "\n",
-       "                                                 summary        symbol  \\\n",
-       "0      This gene encodes a nuclear- or perinuclear-lo...        SPMIP6   \n",
-       "1      This gene encodes a member of the HNK-1 family...        CHST14   \n",
-       "2                                                    NaN     TPBGL-AS1   \n",
-       "3      This gene encodes a protein which is a member ...       PLA2G4C   \n",
-       "4      Enables phosphatase binding activity. Located ...        SH2D4A   \n",
-       "...                                                  ...           ...   \n",
-       "37447  Predicted to enable small molecule binding act...          LCN6   \n",
-       "37448  Killer cell immunoglobulin-like receptors (KIR...       KIR2DL1   \n",
-       "37449                                                NaN  LOC128966722   \n",
-       "37450                                                NaN     GOLGA6L3P   \n",
-       "37451                                                NaN     LOC285626   \n",
+       "                                                 summary     symbol  \\\n",
+       "0      This gene encodes a member of the Vent family ...      VENTX   \n",
+       "1      Predicted to enable 5'-nucleotidase activity. ...     NT5DC2   \n",
+       "2      This gene is intronless and encodes a member o...     NAP1L3   \n",
+       "3      This gene encodes a protein that functions as ...     TRIM31   \n",
+       "4      The protein encoded by this gene acts as a hom...     AHCYL2   \n",
+       "...                                                  ...        ...   \n",
+       "35853                                                NaN  PTPRG-AS1   \n",
+       "35854  The protein encoded by this gene has been show...     EPSTI1   \n",
+       "35855                                                NaN  GOLGA6L3P   \n",
+       "35856                                                NaN  LOC285626   \n",
+       "35857  Killer cell immunoglobulin-like receptors (KIR...    KIR2DL1   \n",
        "\n",
        "         type_of_gene ensembl_release possible_replacement  \\\n",
-       "0      protein-coding             111                   []   \n",
-       "1      protein-coding             111                   []   \n",
-       "2                 NaN             111                   []   \n",
-       "3      protein-coding             111                   []   \n",
-       "4      protein-coding             111                   []   \n",
+       "0      protein-coding             113                   []   \n",
+       "1      protein-coding             113                   []   \n",
+       "2      protein-coding             113                   []   \n",
+       "3      protein-coding             113                   []   \n",
+       "4      protein-coding             113                   []   \n",
        "...               ...             ...                  ...   \n",
-       "37447  protein-coding             111                   []   \n",
-       "37448  protein-coding             111                   []   \n",
-       "37449  protein-coding             111                   []   \n",
-       "37450          pseudo             111                   []   \n",
-       "37451           ncRNA             111                   []   \n",
+       "35853           ncRNA             113                   []   \n",
+       "35854  protein-coding             113                   []   \n",
+       "35855          pseudo             113                   []   \n",
+       "35856           ncRNA             113                   []   \n",
+       "35857  protein-coding             113                   []   \n",
        "\n",
        "                                               permalink  \n",
-       "0      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
+       "0      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "1      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "2      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "3      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "4      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
        "...                                                  ...  \n",
-       "37447  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "37448  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "37449  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "37450  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "37451  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
+       "35853  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "35854  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "35855  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "35856  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
+       "35857  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
        "\n",
-       "[37452 rows x 9 columns]"
+       "[35858 rows x 9 columns]"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "for row in gene_table_merged.loc[\n",
-    "    gene_table_merged[\"possible_replacement\"].isnull(), \"possible_replacement\"\n",
-    "].index:\n",
-    "    gene_table_merged.at[row, \"possible_replacement\"] = []\n",
+    "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n",
+    "    \"possible_replacement\"\n",
+    "].apply(lambda cell: cell if cell is not np.NaN else [])\n",
     "\n",
     "gene_table_merged[\"possible_replacement\"] = gene_table_merged.apply(\n",
     "    lambda row: (\n",
@@ -2721,7 +2501,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 25,
    "id": "f2287922",
    "metadata": {},
    "outputs": [],
@@ -2736,7 +2516,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "agora-data-tools-ywFp1Gf9",
    "language": "python",
    "name": "python3"
   },
diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
index fbc1a2dc..d5f0bc5f 100644
--- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
+++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
@@ -1,12 +1,27 @@
+"""
+This file includes several helper functions that are called from one or more of the pre-processing
+notebooks. This helps avoid code duplication and/or keeps the notebooks cleaner and more straightforward.
+Current public-facing functions:
+    manual_query_biomart - queries Biomart with a GET request
+    query_ensembl_version_api - queries the Ensembl API for Ensembl ID version info
+    r_query_biomart - queries Biomart using rpy2
+    filter_hasgs - removes human alternative sequence genes from a data frame
+    get_all_adt_ensembl_ids - gets the Ensembl IDs in all of the files ingested by ADT
+"""
+
 import pandas as pd
+import numpy as np
 import requests
 import re
+import synapseclient
 from io import StringIO
-from typing import Union
+from typing import Union, Dict, List, Set, Tuple
+import agoradatatools.etl.utils as utils
+import agoradatatools.etl.extract as extract
 
 
 def manual_query_biomart(
-    attributes: list[str], filters: dict[Union[list, set]]
+    attributes: List[str], filters: Dict[str, Union[List[str], Set[str]]]
 ) -> pd.DataFrame:
     """Performs a GET request to the Biomart web service and returns the response. There is no
     canonical Python library to query Biomart and no Python library at all to query on
@@ -49,6 +64,62 @@ def manual_query_biomart(
     return result
 
 
+def query_ensembl_version_api(ensembl_ids: List[str]) -> pd.DataFrame:
+    """
+    Queries the Ensembl API via POST to get version information for each Ensembl ID. The API can only
+    process 1000 IDs at a time so the query is broken into batches of 1000. If a request fails, this
+    function will try again up to 5 times on that batch before quitting and raising an error.
+
+    Args:
+        ensembl_ids: a list of Ensembl IDs to query
+
+    Returns:
+        a pandas data frame with Ensembl IDs, version, and release information
+    """
+    url = "https://rest.ensembl.org/archive/id"
+    headers = {"Content-Type": "application/json", "Accept": "application/json"}
+
+    # We can only query 1000 genes at a time
+    batch_ind = range(0, len(ensembl_ids), 1000)
+    results = []
+
+    for B in batch_ind:
+        end = min(len(ensembl_ids), B + 1000)
+        print("Querying genes " + str(B + 1) + " - " + str(end))
+
+        request_data = '{ "id" : ' + str(ensembl_ids[B:end]) + " }"
+        request_data = request_data.replace("'", '"')
+
+        ok = False
+        tries = 0
+
+        while tries < 5 and not ok:
+            try:
+                res = requests.post(url, headers=headers, data=request_data)
+                ok = res.ok
+            except:
+                ok = False
+
+            tries = tries + 1
+
+            if not ok and tries == 5:
+                res.raise_for_status()
+            elif not ok:
+                print(
+                    "Error retrieving Ensembl versions for genes "
+                    + str(B + 1)
+                    + " - "
+                    + str(end)
+                    + ". Trying again..."
+                )
+            else:
+                results = results + res.json()
+                break
+
+    versions = pd.json_normalize(results)
+    return versions
+
+
 def filter_hasgs(df: pd.DataFrame, chromosome_name_column: str) -> pd.DataFrame:
     """Filters human alternative sequence genes (HASGs) from a data frame by using a regex to
     identify them for removal. Valid genes will either have a numerical chromosome name or have
@@ -96,7 +167,7 @@ def r_query_biomart() -> pd.DataFrame:
     r.library("biomaRt")
 
     # Sometimes Biomart doesn't respond and the command needs to be sent again. Try up to 5 times.
-    for T in range(5):
+    for _ in range(5):
         try:
             mart = r.useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
             ensembl_ids = r.getBM(
@@ -124,3 +195,135 @@ def r_query_biomart() -> pd.DataFrame:
             }
         )
         return ensembl_ids_df
+
+
+def get_all_adt_ensembl_ids(
+    config_filename: str, exclude_files: List[str] = [], token: str = None
+) -> List[str]:
+    """
+    Loops through an ADT config file, finds all data files that are ingested by ADT, and returns a
+    list containing all Ensembl IDs present in those files. Specific files can be excluded from the
+    list with the exclude_files argument.
+
+    Args:
+        config_filename: full or relative file path to the ADT config.yaml file
+        exclude_files: list of file names to exclude when searching files for IDs. These names must
+                       match what is in "name" field of the file specification in the config.yaml
+                       file. Typical values are "gene_metadata" and "druggability".
+        token: a Synapse auth token, or None if the user has Synapse credentials saved.
+
+    Returns:
+        a list of unique Ensembl IDs that exist in at least one data set ingested by ADT
+    """
+    syn = utils._login_to_synapse(token=token)
+    config = utils._get_config(config_path=config_filename)
+    datasets = config["datasets"]
+
+    # Get all unique files in the config since some files are listed multiple times by being
+    # included in multiple data sets. Also fetch all column rename values for standardizing Ensembl
+    # ID column names
+    unique_files = {}
+    column_renames = {}
+
+    for dataset in datasets:
+        dataset_name = list(dataset.keys())[0]
+
+        for file in dataset[dataset_name]["files"]:
+            # Make the Synapse ID the key so that "update" will only add a new item if the ID doesn't
+            # already exist
+            unique_files.update({file["id"]: file})
+
+        # Only some data sets have column rename values
+        if "column_rename" in dataset[dataset_name].keys():
+            column_renames.update(dataset[dataset_name]["column_rename"])
+
+    # Print all the files we found
+    print("Found " + str(len(unique_files)) + " files:")
+    [print(x["name"] + ":\t" + x["id"]) for x in unique_files.values()]
+    print("")
+
+    # Create a list of all Ensembl IDs in all files
+    file_ensembl_list = []
+
+    for entity in unique_files.values():
+        # Ignore json files, which are post-processed and not what we're interested in.
+        # Also ignore any other files specified by 'exclude_files', which likely includes
+        # "gene_metadata" and "druggability".
+        if entity["format"] == "json" or entity["name"] in exclude_files:
+            continue
+
+        file_ensembl_ids = _extract_ensembl_ids(syn, entity, column_renames)
+        file_ensembl_list = file_ensembl_list + file_ensembl_ids
+
+    # Remove duplicate values
+    return list(set(file_ensembl_list))
+
+
+def _extract_ensembl_ids(
+    syn: synapseclient.Synapse, entity: Dict[str, str], column_renames: Dict[str, str]
+) -> List[str]:
+    """
+    Internal function used by get_all_adt_ensembl_ids to extract a list of Ensembl IDs from a file.
+    The file is downloaded from Synapse and read in as a pandas data frame, column names are renamed
+    if necessary to ensure that most Ensembl ID columns are renamed to "ensembl_gene_id", and all
+    Ensembl IDs from relevant columns are put in a list.
+
+    Note that the "networks" data set contains two columns with Ensembl IDs (genea_ensembl_gene_id
+    and geneb_ensembl_gene_id) which are not renamed, so this function searches for columns named
+    with either of those two names or with "ensembl_gene_id" when finding Ensembl ID columns.
+
+    Note that this function depends on the column_rename specifications in the config to accurately
+    convert all Ensembl ID-containing columns in all files except networks to "ensembl_gene_id", so
+    that we don't have to hard-code a list of all possible column names. This assumption is valid
+    for the current set of data files and will likely remain valid for future data, but a warning
+    is printed out if no matching column is found, just in case.
+
+    Args:
+        syn: a synapseclient object which has already been initialized and successfully logged in
+        entity: a dictionary containing keys "id", "name", and "format"
+        column_renames: a dictionary containing all column rename pairs from the config file, where
+                        key = old column name, and value = new column name
+
+    Returns:
+        a list of unique Ensembl IDs in the file, or an empty list if no Ensembl ID column found
+    """
+    df = extract.get_entity_as_df(syn_id=entity["id"], source=entity["format"], syn=syn)
+
+    # Use column_renames from the config to convert most Ensembl ID column names to "ensembl_gene_id".
+    df = utils.standardize_column_names(df=df)
+    df = utils.rename_columns(df=df, column_map=column_renames)
+
+    # Exception to the above comment: the 'networks' file has two ID columns (genea_ and geneb_ ensembl_gene_id)
+    # which do not get renamed
+    possible_col_names = [
+        "ensembl_gene_id",
+        "genea_ensembl_gene_id",
+        "geneb_ensembl_gene_id",
+    ]
+
+    file_ensembl_ids = []
+
+    # The data may have zero, one, or more than one (in the case of 'networks') column of Ensembl IDs
+    for C in possible_col_names:
+        if C in df.columns:
+            file_ensembl_ids = file_ensembl_ids + df[C].tolist()
+
+    # Print any warnings and remove any NA values from the list before returning
+    if len(file_ensembl_ids) == 0:
+        print("WARNING: no Ensembl ID column found for " + entity["name"] + "!")
+
+    if "n/A" in file_ensembl_ids:
+        print(entity["name"] + " has an n/A Ensembl ID")
+        file_ensembl_ids.remove("n/A")
+
+    if np.NaN in file_ensembl_ids:
+        print(
+            entity["name"]
+            + " has "
+            + str(file_ensembl_ids.count(np.NaN))
+            + " NaN Ensembl IDs"
+        )
+        file_ensembl_ids = [x for x in file_ensembl_ids if x is not np.NaN]
+
+    # Remove duplicate values
+    return list(set(file_ensembl_ids))
diff --git a/test_config.yaml b/test_config.yaml
index 02bd0f18..a4d59ad3 100644
--- a/test_config.yaml
+++ b/test_config.yaml
@@ -144,7 +144,7 @@ datasets:
   - gene_info:
       files:
         - name: gene_metadata
-          id: syn25953363.13
+          id: syn25953363.14
           format: feather
         - name: igap
           id: syn12514826.5
@@ -187,7 +187,7 @@ datasets:
         possible_replacement: ensembl_possible_replacements
         permalink: ensembl_permalink
       provenance:
-        - syn25953363.13
+        - syn25953363.14
         - syn12514826.5
         - syn12514912.3
         - *agora_proteomics_provenance

From 5e2e3ea154bbefcff00d745afdcbd21f70ba99f3 Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Thu, 21 Nov 2024 11:43:04 -0800
Subject: [PATCH 2/8] Undid bump in gene_metadata version, it can't be
 increased until druggability removed from gene_info

---
 config.yaml      | 4 ++--
 test_config.yaml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/config.yaml b/config.yaml
index 1ace0892..7b8b4f4b 100644
--- a/config.yaml
+++ b/config.yaml
@@ -144,7 +144,7 @@ datasets:
   - gene_info:
       files:
         - name: gene_metadata
-          id: syn25953363.14
+          id: syn25953363.13
           format: feather
         - name: igap
           id: syn12514826.5
@@ -187,7 +187,7 @@ datasets:
         possible_replacement: ensembl_possible_replacements
         permalink: ensembl_permalink
       provenance:
-        - syn25953363.14
+        - syn25953363.13
         - syn12514826.5
         - syn12514912.3
         - *agora_proteomics_provenance
diff --git a/test_config.yaml b/test_config.yaml
index a4d59ad3..02bd0f18 100644
--- a/test_config.yaml
+++ b/test_config.yaml
@@ -144,7 +144,7 @@ datasets:
   - gene_info:
       files:
         - name: gene_metadata
-          id: syn25953363.14
+          id: syn25953363.13
           format: feather
         - name: igap
           id: syn12514826.5
@@ -187,7 +187,7 @@ datasets:
         possible_replacement: ensembl_possible_replacements
         permalink: ensembl_permalink
       provenance:
-        - syn25953363.14
+        - syn25953363.13
         - syn12514826.5
         - syn12514912.3
         - *agora_proteomics_provenance

From bba8fc17c3f930b1c4b10e4c090d8e4832af63db Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Thu, 21 Nov 2024 13:00:38 -0800
Subject: [PATCH 3/8] Addressed SonarCloud issue with exceptions, updated
 gitignore with a few more local files to ignore

---
 .gitignore                                       |   3 +++
 .../preprocessing/preprocessing_utils.py         |   9 ++++++---
 tests/test_assets/.DS_Store                      | Bin 6148 -> 0 bytes
 3 files changed, 9 insertions(+), 3 deletions(-)
 delete mode 100644 tests/test_assets/.DS_Store

diff --git a/.gitignore b/.gitignore
index 20c38245..909837f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,6 +133,7 @@ dmypy.json
 
 # local generated files
 staging/*
+data_analysis/*/output/*
 
 #test staging location
 test_staging_dir/
@@ -141,3 +142,5 @@ test_staging_dir/
 dev_config.yaml
 
 .vscode/
+.ipynb_checkpoints/
+.Rhistory
diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
index d5f0bc5f..4ee36b02 100644
--- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
+++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
@@ -15,7 +15,7 @@
 import re
 import synapseclient
 from io import StringIO
-from typing import Union, Dict, List, Set, Tuple
+from typing import Union, Dict, List, Set
 import agoradatatools.etl.utils as utils
 import agoradatatools.etl.extract as extract
 
@@ -97,7 +97,8 @@ def query_ensembl_version_api(ensembl_ids: List[str]) -> pd.DataFrame:
             try:
                 res = requests.post(url, headers=headers, data=request_data)
                 ok = res.ok
-            except:
+            except requests.RequestException as ex:
+                print(ex)
                 ok = False
 
             tries = tries + 1
@@ -158,6 +159,7 @@ def r_query_biomart() -> pd.DataFrame:
                                       "chromosome_name", and "hgnc_symbol" retrived from BioMart
     """
     from rpy2.robjects import r
+    from rpy2.rinterface_lib.embedded import RRuntimeError
 
     r(
         'if (!require("BiocManager", character.only = TRUE)) { install.packages("BiocManager") }'
@@ -176,7 +178,8 @@ def r_query_biomart() -> pd.DataFrame:
                 useCache=False,
             )
 
-        except:
+        except RRuntimeError as ex:
+            print(ex)
             print("Trying again...")
             ensembl_ids = None
         else:
diff --git a/tests/test_assets/.DS_Store b/tests/test_assets/.DS_Store
deleted file mode 100644
index 46b71f5c36027abd59a6285a7688b02ddb6a4ed4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKu}T9$5S`H!0THgW+%FLPgC)em+7D0?6@-HnCf3*a+4|njC^;_yD~s?3X5Q`W
zyxY51?ChGEFLqBiW}TVM;Y9mjm>TEl6FbWc(c3#7?8RQKw`aSaR3A=|dudxR-Vpcs
zYh>Q<`ptUPZ&s-4K7ZL!=JpTF!(XGU6p#W^Knh5K(^mjJn>N1)RFnczKnnaQ!2O}X
zi8XNuY+DBckMEqH5ovgCcL`w0aZMZoF$43U0)y&##qgjbU$U+y4uL^8Z!6<Hxw7XC
z#oOx0msW0G0~Mu!6c{UT9>b3P{}z7a{y!#ZBn70vNh#p7<^6JjU&`J(`8oF50)K>m
n8){oQ!srly0bB9aUR~ic>T2Q;*yzYNI#34z>LQZ@|DnJa#>_qE


From e5f7597509686ece683f9a20d85cbb8f0bbe9b0d Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Thu, 21 Nov 2024 17:23:43 -0800
Subject: [PATCH 4/8] More code cleanup, moved duplicate ensembl ID handling to
 preprocessing_utils

---
 .../AG-896_Preprocess_Gene_Annotations.ipynb  | 2009 +----------------
 .../preprocessing/preprocessing_utils.py      |   78 +
 2 files changed, 137 insertions(+), 1950 deletions(-)

diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
index c0f2ad33..bfbab4b4 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
@@ -113,78 +113,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "a3fdbeec",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "UPGRADE AVAILABLE\n",
-      "\n",
-      "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
-      "    pip install --upgrade synapseclient\n",
-      "\n",
-      "Python Synapse Client version 4.6.0 release notes\n",
-      "\n",
-      "https://python-docs.synapse.org/news/\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Welcome, Jaclyn Beck!\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:synapseclient_default:Welcome, Jaclyn Beck!\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Found 19 files:\n",
-      "genes_biodomains:\tsyn44151254.5\n",
-      "neuropath_regression_results:\tsyn22017882.5\n",
-      "proteomics:\tsyn18689335.4\n",
-      "proteomics_tmt:\tsyn35221005.2\n",
-      "proteomics_srm:\tsyn52579640.4\n",
-      "target_exp_validation_harmonized:\tsyn24184512.9\n",
-      "metabolomics:\tsyn26064497.1\n",
-      "gene_metadata:\tsyn25953363.13\n",
-      "igap:\tsyn12514826.5\n",
-      "eqtl:\tsyn12514912.3\n",
-      "diff_exp_data:\tsyn27211942.1\n",
-      "target_list:\tsyn12540368.51\n",
-      "median_expression:\tsyn27211878.2\n",
-      "druggability:\tsyn13363443.11\n",
-      "tep_adi_info:\tsyn51942280.3\n",
-      "team_info:\tsyn12615624.18\n",
-      "team_member_info:\tsyn12615633.19\n",
-      "overall_scores:\tsyn25575156.13\n",
-      "networks:\tsyn11685347.1\n",
-      "\n",
-      "genes_biodomains has 591 NaN Ensembl IDs\n",
-      "WARNING: no Ensembl ID column found for team_info!\n",
-      "WARNING: no Ensembl ID column found for team_member_info!\n",
-      "\n",
-      "35858 Ensembl IDs found.\n",
-      "['ENSG00000151650', 'ENSG00000168268', 'ENSG00000186310', 'ENSG00000204616', 'ENSG00000158467']\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "file_ensembl_list = preprocessing_utils.get_all_adt_ensembl_ids(\n",
     "    config_filename=config_filename,\n",
@@ -206,18 +140,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "f1303e5b",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "35858\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "ensembl_ids_df = pd.DataFrame({\"ensembl_gene_id\": file_ensembl_list})\n",
     "\n",
@@ -239,7 +165,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "4e7a37c8",
    "metadata": {},
    "outputs": [],
@@ -260,231 +186,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "7ebd03d4",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:biothings.client:querying 1-1000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 1001-2000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 2001-3000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 3001-4000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 4001-5000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 5001-6000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 6001-7000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 7001-8000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 8001-9000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 9001-10000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 10001-11000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 11001-12000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 12001-13000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 13001-14000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 14001-15000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 15001-16000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 16001-17000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 17001-18000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 18001-19000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 19001-20000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 20001-21000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 21001-22000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 22001-23000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 23001-24000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 24001-25000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 25001-26000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 26001-27000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 27001-28000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 28001-29000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 29001-30000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 30001-31000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 31001-32000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 32001-33000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 33001-34000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 34001-35000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 35001-35858...\n",
-      "INFO:biothings.client:done.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000151650</th>\n",
-       "      <td>27287</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[HPX42B, NA88A, VENTX2]</td>\n",
-       "      <td>VENT homeobox</td>\n",
-       "      <td>This gene encodes a member of the Vent family ...</td>\n",
-       "      <td>VENTX</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000168268</th>\n",
-       "      <td>64943</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>5'-nucleotidase domain containing 2</td>\n",
-       "      <td>Predicted to enable 5'-nucleotidase activity. ...</td>\n",
-       "      <td>NT5DC2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000186310</th>\n",
-       "      <td>4675</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[MB20, NPL3]</td>\n",
-       "      <td>nucleosome assembly protein 1 like 3</td>\n",
-       "      <td>This gene is intronless and encodes a member o...</td>\n",
-       "      <td>NAP1L3</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000204616</th>\n",
-       "      <td>11074</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[C6orf13, HCG1, HCGI, RNF]</td>\n",
-       "      <td>tripartite motif containing 31</td>\n",
-       "      <td>This gene encodes a protein that functions as ...</td>\n",
-       "      <td>TRIM31</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>ENSG00000158467</th>\n",
-       "      <td>23382</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[ADOHCYASE3, IRBIT2]</td>\n",
-       "      <td>adenosylhomocysteinase like 2</td>\n",
-       "      <td>The protein encoded by this gene acts as a hom...</td>\n",
-       "      <td>AHCYL2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                   _id  _version                       alias  \\\n",
-       "ensembl_gene_id                                                \n",
-       "ENSG00000151650  27287       1.0     [HPX42B, NA88A, VENTX2]   \n",
-       "ENSG00000168268  64943       1.0                         NaN   \n",
-       "ENSG00000186310   4675       1.0                [MB20, NPL3]   \n",
-       "ENSG00000204616  11074       1.0  [C6orf13, HCG1, HCGI, RNF]   \n",
-       "ENSG00000158467  23382       1.0        [ADOHCYASE3, IRBIT2]   \n",
-       "\n",
-       "                                                 name  \\\n",
-       "ensembl_gene_id                                         \n",
-       "ENSG00000151650                         VENT homeobox   \n",
-       "ENSG00000168268   5'-nucleotidase domain containing 2   \n",
-       "ENSG00000186310  nucleosome assembly protein 1 like 3   \n",
-       "ENSG00000204616        tripartite motif containing 31   \n",
-       "ENSG00000158467         adenosylhomocysteinase like 2   \n",
-       "\n",
-       "                                                           summary  symbol  \\\n",
-       "ensembl_gene_id                                                              \n",
-       "ENSG00000151650  This gene encodes a member of the Vent family ...   VENTX   \n",
-       "ENSG00000168268  Predicted to enable 5'-nucleotidase activity. ...  NT5DC2   \n",
-       "ENSG00000186310  This gene is intronless and encodes a member o...  NAP1L3   \n",
-       "ENSG00000204616  This gene encodes a protein that functions as ...  TRIM31   \n",
-       "ENSG00000158467  The protein encoded by this gene acts as a hom...  AHCYL2   \n",
-       "\n",
-       "                   type_of_gene notfound  \n",
-       "ensembl_gene_id                           \n",
-       "ENSG00000151650  protein-coding      NaN  \n",
-       "ENSG00000168268  protein-coding      NaN  \n",
-       "ENSG00000186310  protein-coding      NaN  \n",
-       "ENSG00000204616  protein-coding      NaN  \n",
-       "ENSG00000158467  protein-coding      NaN  "
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "mg = mygene.MyGeneInfo()\n",
     "\n",
@@ -500,21 +207,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "23bb114e",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Annotations found for 34655 genes.\n",
-      "No annotations found for 1206 genes.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Annotations found for \" + str(sum(mygene_output[\"notfound\"].isna())) + \" genes.\")\n",
     "print(\n",
@@ -538,151 +236,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "id": "186d8cb8",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(35861, 9)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ENSG00000151650</td>\n",
-       "      <td>27287</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[HPX42B, NA88A, VENTX2]</td>\n",
-       "      <td>VENT homeobox</td>\n",
-       "      <td>This gene encodes a member of the Vent family ...</td>\n",
-       "      <td>VENTX</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>ENSG00000168268</td>\n",
-       "      <td>64943</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>5'-nucleotidase domain containing 2</td>\n",
-       "      <td>Predicted to enable 5'-nucleotidase activity. ...</td>\n",
-       "      <td>NT5DC2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ENSG00000186310</td>\n",
-       "      <td>4675</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[MB20, NPL3]</td>\n",
-       "      <td>nucleosome assembly protein 1 like 3</td>\n",
-       "      <td>This gene is intronless and encodes a member o...</td>\n",
-       "      <td>NAP1L3</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ENSG00000204616</td>\n",
-       "      <td>11074</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[C6orf13, HCG1, HCGI, RNF]</td>\n",
-       "      <td>tripartite motif containing 31</td>\n",
-       "      <td>This gene encodes a protein that functions as ...</td>\n",
-       "      <td>TRIM31</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>ENSG00000158467</td>\n",
-       "      <td>23382</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[ADOHCYASE3, IRBIT2]</td>\n",
-       "      <td>adenosylhomocysteinase like 2</td>\n",
-       "      <td>The protein encoded by this gene acts as a hom...</td>\n",
-       "      <td>AHCYL2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   ensembl_gene_id    _id  _version                       alias  \\\n",
-       "0  ENSG00000151650  27287       1.0     [HPX42B, NA88A, VENTX2]   \n",
-       "1  ENSG00000168268  64943       1.0                         NaN   \n",
-       "2  ENSG00000186310   4675       1.0                [MB20, NPL3]   \n",
-       "3  ENSG00000204616  11074       1.0  [C6orf13, HCG1, HCGI, RNF]   \n",
-       "4  ENSG00000158467  23382       1.0        [ADOHCYASE3, IRBIT2]   \n",
-       "\n",
-       "                                   name  \\\n",
-       "0                         VENT homeobox   \n",
-       "1   5'-nucleotidase domain containing 2   \n",
-       "2  nucleosome assembly protein 1 like 3   \n",
-       "3        tripartite motif containing 31   \n",
-       "4         adenosylhomocysteinase like 2   \n",
-       "\n",
-       "                                             summary  symbol    type_of_gene  \\\n",
-       "0  This gene encodes a member of the Vent family ...   VENTX  protein-coding   \n",
-       "1  Predicted to enable 5'-nucleotidase activity. ...  NT5DC2  protein-coding   \n",
-       "2  This gene is intronless and encodes a member o...  NAP1L3  protein-coding   \n",
-       "3  This gene encodes a protein that functions as ...  TRIM31  protein-coding   \n",
-       "4  The protein encoded by this gene acts as a hom...  AHCYL2  protein-coding   \n",
-       "\n",
-       "  notfound  \n",
-       "0      NaN  \n",
-       "1      NaN  \n",
-       "2      NaN  \n",
-       "3      NaN  \n",
-       "4      NaN  "
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "gene_table_merged = pd.merge(\n",
     "    left=ensembl_ids_df,\n",
@@ -711,37 +270,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": null,
    "id": "285c10d2",
    "metadata": {
     "scrolled": true
    },
    "outputs": [],
    "source": [
-    "# NaN or NULL alias values become empty lists\n",
-    "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n",
-    "    lambda cell: cell if cell is not np.NaN else []\n",
-    ")\n",
-    "\n",
-    "# Some alias values are a single string, not a list. Turn them into lists here.\n",
-    "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n",
-    "    lambda cell: cell if isinstance(cell, list) else [cell]\n",
-    ")\n",
-    "\n",
-    "\n",
-    "# Some alias values are lists of lists or have duplicate values\n",
-    "def flatten(row):\n",
-    "    flattened = []\n",
-    "    for item in row:\n",
-    "        if isinstance(item, list):\n",
-    "            flattened = flattened + item\n",
-    "        else:\n",
-    "            flattened.append(item)\n",
-    "    return flattened\n",
-    "\n",
-    "\n",
     "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n",
-    "    lambda row: list(set(flatten(row)))\n",
+    "    preprocessing_utils.standardize_list_item\n",
     ")"
    ]
   },
@@ -752,456 +289,42 @@
    "source": [
     "## Remove duplicate Ensembl IDs from the list. \n",
     "\n",
-    "Duplicates in the list typically have the same Ensembl ID but different gene symbols. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. There's not a good way to reconcile this, so we first check for entries whose `symbol` is something other than \"LOC#######\", and designate that entry as the main row. If there are multiple or zero entries meeting that criteria, we just use the first entry in the list for each ensembl ID and discard the rest, which is what the Agora front end does. The gene symbols of duplicate rows are then added as aliases to the matching unique row."
+    "Duplicates in the list typically have the same Ensembl ID but different gene symbols. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. For every set of duplicated rows with the same Ensembl ID, we remove all rows but the first row in the set, and the symbols and aliases of the removed rows get added to the \"alias\" field of the first row."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": null,
    "id": "bc63cc53",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>19626</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>285626</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>uncharacterized LOC285626</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC285626</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19627</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>105377683</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>uncharacterized LOC105377683</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC105377683</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>24698</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>3802</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[CD158A, NKAT1, KIR2DL3, KIR-K64, NKAT-1, p58....</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>24699</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>124900571</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>killer cell immunoglobulin-like receptor 2DS1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC124900571</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>29514</th>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>100133220</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[GOLGA6L3]</td>\n",
-       "      <td>golgin A6 family like 3, pseudogene</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>GOLGA6L3P</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>29515</th>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>642402</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[GOLGA6L21P]</td>\n",
-       "      <td>golgin A6 family like 17, pseudogene</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>GOLGA6L17P</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       ensembl_gene_id        _id  _version  \\\n",
-       "19626  ENSG00000249738     285626       1.0   \n",
-       "19627  ENSG00000249738  105377683       1.0   \n",
-       "24698  ENSG00000276387       3802       1.0   \n",
-       "24699  ENSG00000276387  124900571       1.0   \n",
-       "29514  ENSG00000230373  100133220       1.0   \n",
-       "29515  ENSG00000230373     642402       1.0   \n",
-       "\n",
-       "                                                   alias  \\\n",
-       "19626                                                 []   \n",
-       "19627                                                 []   \n",
-       "24698  [CD158A, NKAT1, KIR2DL3, KIR-K64, NKAT-1, p58....   \n",
-       "24699                                                 []   \n",
-       "29514                                         [GOLGA6L3]   \n",
-       "29515                                       [GOLGA6L21P]   \n",
-       "\n",
-       "                                                    name  \\\n",
-       "19626                          uncharacterized LOC285626   \n",
-       "19627                       uncharacterized LOC105377683   \n",
-       "24698  killer cell immunoglobulin like receptor, two ...   \n",
-       "24699      killer cell immunoglobulin-like receptor 2DS1   \n",
-       "29514                golgin A6 family like 3, pseudogene   \n",
-       "29515               golgin A6 family like 17, pseudogene   \n",
-       "\n",
-       "                                                 summary        symbol  \\\n",
-       "19626                                                NaN     LOC285626   \n",
-       "19627                                                NaN  LOC105377683   \n",
-       "24698  Killer cell immunoglobulin-like receptors (KIR...       KIR2DL1   \n",
-       "24699                                                NaN  LOC124900571   \n",
-       "29514                                                NaN     GOLGA6L3P   \n",
-       "29515                                                NaN    GOLGA6L17P   \n",
-       "\n",
-       "         type_of_gene notfound  \n",
-       "19626           ncRNA      NaN  \n",
-       "19627           ncRNA      NaN  \n",
-       "24698  protein-coding      NaN  \n",
-       "24699  protein-coding      NaN  \n",
-       "29514          pseudo      NaN  \n",
-       "29515          pseudo      NaN  "
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "# duplicated() will return true if the ID is a duplicate and is not the first one to appear the list.\n",
+    "# For printing only\n",
     "dupes = gene_table_merged[\"ensembl_gene_id\"].duplicated()\n",
-    "dupe_vals = gene_table_merged[dupes]\n",
+    "dupe_ids = gene_table_merged.loc[dupes, \"ensembl_gene_id\"]\n",
+    "print(\n",
+    "    gene_table_merged.loc[\n",
+    "        gene_table_merged[\"ensembl_gene_id\"].isin(dupe_ids),\n",
+    "        [\"ensembl_gene_id\", \"symbol\", \"alias\"],\n",
+    "    ]\n",
+    ")\n",
     "\n",
-    "# Rows with duplicated Ensembl IDs\n",
-    "all_duplicated = gene_table_merged.loc[\n",
-    "    gene_table_merged[\"ensembl_gene_id\"].isin(dupe_vals[\"ensembl_gene_id\"])\n",
-    "]\n",
-    "all_duplicated"
+    "# Remove duplicates\n",
+    "gene_table_merged = preprocessing_utils.merge_duplicate_ensembl_ids(gene_table_merged)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "093a2e98",
+   "id": "bc76d96e",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "3 duplicated genes have been processed.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>35848</th>\n",
-       "      <td>ENSG00000085998</td>\n",
-       "      <td>55624</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[RP76, LGMDR15, LGMD2O, gnT-I.2, GNTI.2, GnT I...</td>\n",
-       "      <td>protein O-linked mannose N-acetylglucosaminylt...</td>\n",
-       "      <td>This gene encodes a type II transmembrane prot...</td>\n",
-       "      <td>POMGNT1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35849</th>\n",
-       "      <td>ENSG00000285081</td>\n",
-       "      <td>ENSG00000285081</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35850</th>\n",
-       "      <td>ENSG00000126822</td>\n",
-       "      <td>26030</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[ARHGEF43, KIAA0599]</td>\n",
-       "      <td>pleckstrin homology and RhoGEF domain containi...</td>\n",
-       "      <td>Predicted to enable guanyl-nucleotide exchange...</td>\n",
-       "      <td>PLEKHG3</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35851</th>\n",
-       "      <td>ENSG00000187240</td>\n",
-       "      <td>79659</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[DHC2, hdhc11, DNCH2, SRTD3, SRPS2B, ATD3, DHC...</td>\n",
-       "      <td>dynein cytoplasmic 2 heavy chain 1</td>\n",
-       "      <td>This gene encodes a large cytoplasmic dynein p...</td>\n",
-       "      <td>DYNC2H1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35852</th>\n",
-       "      <td>ENSG00000101470</td>\n",
-       "      <td>7125</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[CMYP15, CMYO15, CFAP85, FAP85, MYONRI]</td>\n",
-       "      <td>troponin C2, fast skeletal type</td>\n",
-       "      <td>Troponin (Tn), a key protein complex in the re...</td>\n",
-       "      <td>TNNC2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35853</th>\n",
-       "      <td>ENSG00000241472</td>\n",
-       "      <td>100506994</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>PTPRG antisense RNA 1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>PTPRG-AS1</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35854</th>\n",
-       "      <td>ENSG00000133106</td>\n",
-       "      <td>94240</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[BRESI1]</td>\n",
-       "      <td>epithelial stromal interaction 1</td>\n",
-       "      <td>The protein encoded by this gene has been show...</td>\n",
-       "      <td>EPSTI1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35855</th>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>100133220</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[GOLGA6L3, GOLGA6L21P, GOLGA6L17P]</td>\n",
-       "      <td>golgin A6 family like 3, pseudogene</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>GOLGA6L3P</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35856</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>285626</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[LOC105377683]</td>\n",
-       "      <td>uncharacterized LOC285626</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC285626</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35857</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>3802</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       ensembl_gene_id              _id _version  \\\n",
-       "35848  ENSG00000085998            55624      1.0   \n",
-       "35849  ENSG00000285081  ENSG00000285081      1.0   \n",
-       "35850  ENSG00000126822            26030      1.0   \n",
-       "35851  ENSG00000187240            79659      1.0   \n",
-       "35852  ENSG00000101470             7125      1.0   \n",
-       "35853  ENSG00000241472        100506994      1.0   \n",
-       "35854  ENSG00000133106            94240      1.0   \n",
-       "35855  ENSG00000230373        100133220      1.0   \n",
-       "35856  ENSG00000249738           285626      1.0   \n",
-       "35857  ENSG00000276387             3802      1.0   \n",
-       "\n",
-       "                                                   alias  \\\n",
-       "35848  [RP76, LGMDR15, LGMD2O, gnT-I.2, GNTI.2, GnT I...   \n",
-       "35849                                                 []   \n",
-       "35850                               [ARHGEF43, KIAA0599]   \n",
-       "35851  [DHC2, hdhc11, DNCH2, SRTD3, SRPS2B, ATD3, DHC...   \n",
-       "35852            [CMYP15, CMYO15, CFAP85, FAP85, MYONRI]   \n",
-       "35853                                                 []   \n",
-       "35854                                           [BRESI1]   \n",
-       "35855                 [GOLGA6L3, GOLGA6L21P, GOLGA6L17P]   \n",
-       "35856                                     [LOC105377683]   \n",
-       "35857  [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...   \n",
-       "\n",
-       "                                                    name  \\\n",
-       "35848  protein O-linked mannose N-acetylglucosaminylt...   \n",
-       "35849                                                NaN   \n",
-       "35850  pleckstrin homology and RhoGEF domain containi...   \n",
-       "35851                 dynein cytoplasmic 2 heavy chain 1   \n",
-       "35852                    troponin C2, fast skeletal type   \n",
-       "35853                              PTPRG antisense RNA 1   \n",
-       "35854                   epithelial stromal interaction 1   \n",
-       "35855                golgin A6 family like 3, pseudogene   \n",
-       "35856                          uncharacterized LOC285626   \n",
-       "35857  killer cell immunoglobulin like receptor, two ...   \n",
-       "\n",
-       "                                                 summary     symbol  \\\n",
-       "35848  This gene encodes a type II transmembrane prot...    POMGNT1   \n",
-       "35849                                                NaN        NaN   \n",
-       "35850  Predicted to enable guanyl-nucleotide exchange...    PLEKHG3   \n",
-       "35851  This gene encodes a large cytoplasmic dynein p...    DYNC2H1   \n",
-       "35852  Troponin (Tn), a key protein complex in the re...      TNNC2   \n",
-       "35853                                                NaN  PTPRG-AS1   \n",
-       "35854  The protein encoded by this gene has been show...     EPSTI1   \n",
-       "35855                                                NaN  GOLGA6L3P   \n",
-       "35856                                                NaN  LOC285626   \n",
-       "35857  Killer cell immunoglobulin-like receptors (KIR...    KIR2DL1   \n",
-       "\n",
-       "         type_of_gene notfound  \n",
-       "35848  protein-coding      NaN  \n",
-       "35849             NaN      NaN  \n",
-       "35850  protein-coding      NaN  \n",
-       "35851  protein-coding      NaN  \n",
-       "35852  protein-coding      NaN  \n",
-       "35853           ncRNA      NaN  \n",
-       "35854  protein-coding      NaN  \n",
-       "35855          pseudo      NaN  \n",
-       "35856           ncRNA      NaN  \n",
-       "35857  protein-coding      NaN  "
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "keep_df = gene_table_merged.drop(all_duplicated.index)\n",
-    "\n",
-    "# For each duplicated Ensembl ID, collapse to 1 row and append that row to keep_df\n",
-    "for ens_id in set(all_duplicated[\"ensembl_gene_id\"]):\n",
-    "    group = all_duplicated.loc[all_duplicated[\"ensembl_gene_id\"] == ens_id].copy(\n",
-    "        deep=True\n",
-    "    )\n",
-    "    # Put any entries with symbols that aren't \"LOC#####\" at the top of the data frame\n",
-    "    matches = group[\"symbol\"].str.startswith(\"LOC\") == False\n",
-    "    group = pd.concat([group.loc[matches], group.loc[matches == False]]).reset_index(\n",
-    "        drop=True\n",
-    "    )\n",
-    "\n",
-    "    # Add all duplicate symbols and their aliases to the alias field of the first entry\n",
-    "    for row in group.index[1:]:\n",
-    "        group.at[group.index[0], \"alias\"].append(group[\"symbol\"][row])\n",
-    "        if len(group.at[row, \"alias\"]) > 0:\n",
-    "            group.at[group.index[0], \"alias\"] = (\n",
-    "                group.at[group.index[0], \"alias\"] + group[\"alias\"][row]\n",
-    "            )\n",
-    "\n",
-    "    # Make sure we didn't add duplicate aliases\n",
-    "    group.at[group.index[0], \"alias\"] = list(set(group.at[group.index[0], \"alias\"]))\n",
-    "\n",
-    "    # Keep the first row only, which now has all the aliases\n",
-    "    keep_df = pd.concat([keep_df, group.iloc[0].to_frame().T], ignore_index=True)\n",
-    "\n",
-    "print(\n",
-    "    str(len(all_duplicated.drop_duplicates(\"ensembl_gene_id\")))\n",
-    "    + \" duplicated genes have been processed.\"\n",
-    ")\n",
-    "gene_table_merged = keep_df.reset_index(drop=True)\n",
-    "gene_table_merged.tail(n=10)"
+    "print(str(len(dupe_ids.drop_duplicates())) + \" duplicated genes have been processed.\")\n",
+    "print(gene_table_merged.shape)\n",
+    "print(gene_table_merged.loc[gene_table_merged[\"ensembl_gene_id\"].isin(dupe_ids), \"alias\"])"
    ]
   },
   {
@@ -1218,64 +341,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "4a1bbdee",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "             name     date                                 url version\n",
-      "1  Ensembl GRCh37 Feb 2014          https://grch37.ensembl.org  GRCh37\n",
-      "2     Ensembl 113 Oct 2024 https://oct2024.archive.ensembl.org     113\n",
-      "3     Ensembl 112 May 2024 https://may2024.archive.ensembl.org     112\n",
-      "4     Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org     111\n",
-      "5     Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org     110\n",
-      "6     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109\n",
-      "7     Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org     108\n",
-      "8     Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org     107\n",
-      "9     Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org     106\n",
-      "10    Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org     105\n",
-      "11    Ensembl 104 May 2021 https://may2021.archive.ensembl.org     104\n",
-      "12    Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org     103\n",
-      "13    Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org     102\n",
-      "14    Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org     101\n",
-      "15    Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org     100\n",
-      "16     Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org      99\n",
-      "17     Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org      98\n",
-      "18     Ensembl 80 May 2015 https://may2015.archive.ensembl.org      80\n",
-      "19     Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org      77\n",
-      "20     Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org      75\n",
-      "21     Ensembl 54 May 2009 https://may2009.archive.ensembl.org      54\n",
-      "   current_release\n",
-      "1                 \n",
-      "2                *\n",
-      "3                 \n",
-      "4                 \n",
-      "5                 \n",
-      "6                 \n",
-      "7                 \n",
-      "8                 \n",
-      "9                 \n",
-      "10                \n",
-      "11                \n",
-      "12                \n",
-      "13                \n",
-      "14                \n",
-      "15                \n",
-      "16                \n",
-      "17                \n",
-      "18                \n",
-      "19                \n",
-      "20                \n",
-      "21                \n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "archive_df = r.listEnsemblArchives()\n",
     "archive_df.to_csvfile(path=archive_filename, row_names=False, quote=False)\n",
@@ -1295,172 +366,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "id": "9a747309",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Querying genes 1 - 1000\n",
-      "Querying genes 1001 - 2000\n",
-      "Querying genes 2001 - 3000\n",
-      "Querying genes 3001 - 4000\n",
-      "Querying genes 4001 - 5000\n",
-      "Querying genes 5001 - 6000\n",
-      "Querying genes 6001 - 7000\n",
-      "Querying genes 7001 - 8000\n",
-      "Querying genes 8001 - 9000\n",
-      "Querying genes 9001 - 10000\n",
-      "Querying genes 10001 - 11000\n",
-      "Querying genes 11001 - 12000\n",
-      "Querying genes 12001 - 13000\n",
-      "Querying genes 13001 - 14000\n",
-      "Querying genes 14001 - 15000\n",
-      "Querying genes 15001 - 16000\n",
-      "Querying genes 16001 - 17000\n",
-      "Querying genes 17001 - 18000\n",
-      "Querying genes 18001 - 19000\n",
-      "Querying genes 19001 - 20000\n",
-      "Querying genes 20001 - 21000\n",
-      "Querying genes 21001 - 22000\n",
-      "Querying genes 22001 - 23000\n",
-      "Querying genes 23001 - 24000\n",
-      "Querying genes 24001 - 25000\n",
-      "Querying genes 25001 - 26000\n",
-      "Querying genes 26001 - 27000\n",
-      "Querying genes 27001 - 28000\n",
-      "Querying genes 28001 - 29000\n",
-      "Querying genes 29001 - 30000\n",
-      "Querying genes 30001 - 31000\n",
-      "Querying genes 31001 - 32000\n",
-      "Querying genes 32001 - 33000\n",
-      "Querying genes 33001 - 34000\n",
-      "Querying genes 34001 - 35000\n",
-      "Querying genes 35001 - 35858\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>assembly</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>release</th>\n",
-       "      <th>latest</th>\n",
-       "      <th>type</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>is_current</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>35853</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000241472.9</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000241472</td>\n",
-       "      <td>9</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35854</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000133106.15</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000133106</td>\n",
-       "      <td>15</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35855</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000230373.9</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>9</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35856</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000249738.11</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>11</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35857</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000276387.4</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>4</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      assembly peptide possible_replacement release              latest  type  \\\n",
-       "35853   GRCh38    None                   []     113   ENSG00000241472.9  Gene   \n",
-       "35854   GRCh38    None                   []     113  ENSG00000133106.15  Gene   \n",
-       "35855   GRCh38    None                   []     113   ENSG00000230373.9  Gene   \n",
-       "35856   GRCh38    None                   []     113  ENSG00000249738.11  Gene   \n",
-       "35857   GRCh38    None                   []     113   ENSG00000276387.4  Gene   \n",
-       "\n",
-       "                    id  version is_current  \n",
-       "35853  ENSG00000241472        9          1  \n",
-       "35854  ENSG00000133106       15          1  \n",
-       "35855  ENSG00000230373        9          1  \n",
-       "35856  ENSG00000249738       11          1  \n",
-       "35857  ENSG00000276387        4          1  "
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions = preprocessing_utils.query_ensembl_version_api(\n",
     "    ensembl_ids=gene_table_merged[\"ensembl_gene_id\"].tolist()\n",
@@ -1471,71 +382,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "5c108238",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "release\n",
-       "100       21\n",
-       "101        8\n",
-       "102       16\n",
-       "103       12\n",
-       "104       17\n",
-       "105       10\n",
-       "106       35\n",
-       "107       12\n",
-       "108        4\n",
-       "109        4\n",
-       "110       11\n",
-       "111       52\n",
-       "112      354\n",
-       "113    34303\n",
-       "80        21\n",
-       "81         2\n",
-       "82        10\n",
-       "84       673\n",
-       "87        61\n",
-       "89        20\n",
-       "91        67\n",
-       "93        50\n",
-       "95        33\n",
-       "96        31\n",
-       "97        17\n",
-       "98         9\n",
-       "99         5\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions.groupby(\"release\").size()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "bf5aecb1",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "35858\n",
-      "35858\n",
-      "True\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Check that all IDs are the same between the result and the gene table\n",
     "print(len(versions[\"id\"]))\n",
@@ -1548,21 +410,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "id": "7fc8bbcd",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Make sure everything is GRCh38, not GRCh37\n",
     "all(versions[\"assembly\"] == \"GRCh38\")"
@@ -1580,7 +431,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "0d5b5652",
    "metadata": {
     "scrolled": true
@@ -1604,39 +455,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "id": "337b2890",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "closest_release\n",
-       "80       985\n",
-       "98         9\n",
-       "99         5\n",
-       "100       21\n",
-       "101        8\n",
-       "102       16\n",
-       "103       12\n",
-       "104       17\n",
-       "105       10\n",
-       "106       35\n",
-       "107       12\n",
-       "108        4\n",
-       "109        4\n",
-       "110       11\n",
-       "111       52\n",
-       "112      354\n",
-       "113    34303\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions[\"closest_release\"] = 0\n",
     "\n",
@@ -1653,149 +475,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "id": "343e5006",
    "metadata": {
     "scrolled": false
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>assembly</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>release</th>\n",
-       "      <th>latest</th>\n",
-       "      <th>type</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>is_current</th>\n",
-       "      <th>closest_release</th>\n",
-       "      <th>permalink</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000151650.8</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000151650</td>\n",
-       "      <td>8</td>\n",
-       "      <td>1</td>\n",
-       "      <td>113</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000168268.11</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000168268</td>\n",
-       "      <td>11</td>\n",
-       "      <td>1</td>\n",
-       "      <td>113</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000186310.10</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000186310</td>\n",
-       "      <td>10</td>\n",
-       "      <td>1</td>\n",
-       "      <td>113</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000204616.11</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000204616</td>\n",
-       "      <td>11</td>\n",
-       "      <td>1</td>\n",
-       "      <td>113</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>113</td>\n",
-       "      <td>ENSG00000158467.17</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000158467</td>\n",
-       "      <td>17</td>\n",
-       "      <td>1</td>\n",
-       "      <td>113</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "  assembly peptide possible_replacement release              latest  type  \\\n",
-       "0   GRCh38    None                   []     113   ENSG00000151650.8  Gene   \n",
-       "1   GRCh38    None                   []     113  ENSG00000168268.11  Gene   \n",
-       "2   GRCh38    None                   []     113  ENSG00000186310.10  Gene   \n",
-       "3   GRCh38    None                   []     113  ENSG00000204616.11  Gene   \n",
-       "4   GRCh38    None                   []     113  ENSG00000158467.17  Gene   \n",
-       "\n",
-       "                id  version is_current  closest_release  \\\n",
-       "0  ENSG00000151650        8          1              113   \n",
-       "1  ENSG00000168268       11          1              113   \n",
-       "2  ENSG00000186310       10          1              113   \n",
-       "3  ENSG00000204616       11          1              113   \n",
-       "4  ENSG00000158467       17          1              113   \n",
-       "\n",
-       "                                           permalink  \n",
-       "0  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4  https://oct2024.archive.ensembl.org/Homo_sapie...  "
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions[\"permalink\"] = \"\"\n",
     "\n",
@@ -1812,166 +497,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "id": "4b01719d",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>assembly</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>release</th>\n",
-       "      <th>latest</th>\n",
-       "      <th>type</th>\n",
-       "      <th>id</th>\n",
-       "      <th>version</th>\n",
-       "      <th>is_current</th>\n",
-       "      <th>closest_release</th>\n",
-       "      <th>permalink</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>67</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>84</td>\n",
-       "      <td>ENSG00000265108.1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000265108</td>\n",
-       "      <td>1</td>\n",
-       "      <td></td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>68</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>80</td>\n",
-       "      <td>ENSG00000280803.1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000280803</td>\n",
-       "      <td>1</td>\n",
-       "      <td></td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>111</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>84</td>\n",
-       "      <td>ENSG00000281672.1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000281672</td>\n",
-       "      <td>1</td>\n",
-       "      <td></td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>135</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>87</td>\n",
-       "      <td>ENSG00000279857.1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000279857</td>\n",
-       "      <td>1</td>\n",
-       "      <td></td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>141</th>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>84</td>\n",
-       "      <td>ENSG00000274483.1</td>\n",
-       "      <td>Gene</td>\n",
-       "      <td>ENSG00000274483</td>\n",
-       "      <td>1</td>\n",
-       "      <td></td>\n",
-       "      <td>80</td>\n",
-       "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "    assembly peptide possible_replacement release             latest  type  \\\n",
-       "67    GRCh38    None                   []      84  ENSG00000265108.1  Gene   \n",
-       "68    GRCh38    None                   []      80  ENSG00000280803.1  Gene   \n",
-       "111   GRCh38    None                   []      84  ENSG00000281672.1  Gene   \n",
-       "135   GRCh38    None                   []      87  ENSG00000279857.1  Gene   \n",
-       "141   GRCh38    None                   []      84  ENSG00000274483.1  Gene   \n",
-       "\n",
-       "                  id  version is_current  closest_release  \\\n",
-       "67   ENSG00000265108        1                          80   \n",
-       "68   ENSG00000280803        1                          80   \n",
-       "111  ENSG00000281672        1                          80   \n",
-       "135  ENSG00000279857        1                          80   \n",
-       "141  ENSG00000274483        1                          80   \n",
-       "\n",
-       "                                             permalink  \n",
-       "67   https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "68   https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "111  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "135  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "141  https://may2015.archive.ensembl.org/Homo_sapie...  "
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions[versions[\"closest_release\"] < 100].head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "id": "c4128cc9",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "https://oct2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000151650\n",
-      "https://oct2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000142192\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(versions[\"permalink\"][0])\n",
     "print(versions[\"permalink\"][25])"
@@ -1979,21 +518,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "id": "73791e6c",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Does every gene have an associated URL?\n",
     "url_base_len = len(archive_table[\"url\"][0]) + 1\n",
@@ -2010,174 +538,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
    "id": "f3edfd2f",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(35858, 12)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>_id</th>\n",
-       "      <th>_version</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>name</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>notfound</th>\n",
-       "      <th>ensembl_release</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>permalink</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ENSG00000151650</td>\n",
-       "      <td>27287</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[NA88A, HPX42B, VENTX2]</td>\n",
-       "      <td>VENT homeobox</td>\n",
-       "      <td>This gene encodes a member of the Vent family ...</td>\n",
-       "      <td>VENTX</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>ENSG00000168268</td>\n",
-       "      <td>64943</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>5'-nucleotidase domain containing 2</td>\n",
-       "      <td>Predicted to enable 5'-nucleotidase activity. ...</td>\n",
-       "      <td>NT5DC2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ENSG00000186310</td>\n",
-       "      <td>4675</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[MB20, NPL3]</td>\n",
-       "      <td>nucleosome assembly protein 1 like 3</td>\n",
-       "      <td>This gene is intronless and encodes a member o...</td>\n",
-       "      <td>NAP1L3</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ENSG00000204616</td>\n",
-       "      <td>11074</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[C6orf13, RNF, HCGI, HCG1]</td>\n",
-       "      <td>tripartite motif containing 31</td>\n",
-       "      <td>This gene encodes a protein that functions as ...</td>\n",
-       "      <td>TRIM31</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>ENSG00000158467</td>\n",
-       "      <td>23382</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>[IRBIT2, ADOHCYASE3]</td>\n",
-       "      <td>adenosylhomocysteinase like 2</td>\n",
-       "      <td>The protein encoded by this gene acts as a hom...</td>\n",
-       "      <td>AHCYL2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   ensembl_gene_id    _id _version                       alias  \\\n",
-       "0  ENSG00000151650  27287      1.0     [NA88A, HPX42B, VENTX2]   \n",
-       "1  ENSG00000168268  64943      1.0                          []   \n",
-       "2  ENSG00000186310   4675      1.0                [MB20, NPL3]   \n",
-       "3  ENSG00000204616  11074      1.0  [C6orf13, RNF, HCGI, HCG1]   \n",
-       "4  ENSG00000158467  23382      1.0        [IRBIT2, ADOHCYASE3]   \n",
-       "\n",
-       "                                   name  \\\n",
-       "0                         VENT homeobox   \n",
-       "1   5'-nucleotidase domain containing 2   \n",
-       "2  nucleosome assembly protein 1 like 3   \n",
-       "3        tripartite motif containing 31   \n",
-       "4         adenosylhomocysteinase like 2   \n",
-       "\n",
-       "                                             summary  symbol    type_of_gene  \\\n",
-       "0  This gene encodes a member of the Vent family ...   VENTX  protein-coding   \n",
-       "1  Predicted to enable 5'-nucleotidase activity. ...  NT5DC2  protein-coding   \n",
-       "2  This gene is intronless and encodes a member o...  NAP1L3  protein-coding   \n",
-       "3  This gene encodes a protein that functions as ...  TRIM31  protein-coding   \n",
-       "4  The protein encoded by this gene acts as a hom...  AHCYL2  protein-coding   \n",
-       "\n",
-       "  notfound ensembl_release possible_replacement  \\\n",
-       "0      NaN             113                   []   \n",
-       "1      NaN             113                   []   \n",
-       "2      NaN             113                   []   \n",
-       "3      NaN             113                   []   \n",
-       "4      NaN             113                   []   \n",
-       "\n",
-       "                                           permalink  \n",
-       "0  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4  https://oct2024.archive.ensembl.org/Homo_sapie...  "
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "versions = versions[[\"id\", \"release\", \"possible_replacement\", \"permalink\"]]\n",
     "versions.rename(\n",
@@ -2202,276 +566,21 @@
    "metadata": {},
    "source": [
     "### Final cleanup\n",
-    "Unfilled \"possible_replacement\" entries should be changed from NaN to empty lists. \n",
-    "\n",
-    "\"possible_replacement\" entries that have data in them exist as a list of dicts, and need to have the Ensembl IDs pulled out of them as a list of strings. \n",
+    "\"possible_replacement\" entries will either be an empty list or a list of dictionaries. Entries that have data in them need to have the Ensembl IDs pulled out of them as a list of strings.\n",
     "\n",
     "Remove unneeded columns. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
    "id": "d0c07b7a",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ensembl_gene_id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>alias</th>\n",
-       "      <th>summary</th>\n",
-       "      <th>symbol</th>\n",
-       "      <th>type_of_gene</th>\n",
-       "      <th>ensembl_release</th>\n",
-       "      <th>possible_replacement</th>\n",
-       "      <th>permalink</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>ENSG00000151650</td>\n",
-       "      <td>VENT homeobox</td>\n",
-       "      <td>[NA88A, HPX42B, VENTX2]</td>\n",
-       "      <td>This gene encodes a member of the Vent family ...</td>\n",
-       "      <td>VENTX</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>ENSG00000168268</td>\n",
-       "      <td>5'-nucleotidase domain containing 2</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>Predicted to enable 5'-nucleotidase activity. ...</td>\n",
-       "      <td>NT5DC2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>ENSG00000186310</td>\n",
-       "      <td>nucleosome assembly protein 1 like 3</td>\n",
-       "      <td>[MB20, NPL3]</td>\n",
-       "      <td>This gene is intronless and encodes a member o...</td>\n",
-       "      <td>NAP1L3</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>ENSG00000204616</td>\n",
-       "      <td>tripartite motif containing 31</td>\n",
-       "      <td>[C6orf13, RNF, HCGI, HCG1]</td>\n",
-       "      <td>This gene encodes a protein that functions as ...</td>\n",
-       "      <td>TRIM31</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>ENSG00000158467</td>\n",
-       "      <td>adenosylhomocysteinase like 2</td>\n",
-       "      <td>[IRBIT2, ADOHCYASE3]</td>\n",
-       "      <td>The protein encoded by this gene acts as a hom...</td>\n",
-       "      <td>AHCYL2</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35853</th>\n",
-       "      <td>ENSG00000241472</td>\n",
-       "      <td>PTPRG antisense RNA 1</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>PTPRG-AS1</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35854</th>\n",
-       "      <td>ENSG00000133106</td>\n",
-       "      <td>epithelial stromal interaction 1</td>\n",
-       "      <td>[BRESI1]</td>\n",
-       "      <td>The protein encoded by this gene has been show...</td>\n",
-       "      <td>EPSTI1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35855</th>\n",
-       "      <td>ENSG00000230373</td>\n",
-       "      <td>golgin A6 family like 3, pseudogene</td>\n",
-       "      <td>[GOLGA6L3, GOLGA6L21P, GOLGA6L17P]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>GOLGA6L3P</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35856</th>\n",
-       "      <td>ENSG00000249738</td>\n",
-       "      <td>uncharacterized LOC285626</td>\n",
-       "      <td>[LOC105377683]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>LOC285626</td>\n",
-       "      <td>ncRNA</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35857</th>\n",
-       "      <td>ENSG00000276387</td>\n",
-       "      <td>killer cell immunoglobulin like receptor, two ...</td>\n",
-       "      <td>[CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...</td>\n",
-       "      <td>Killer cell immunoglobulin-like receptors (KIR...</td>\n",
-       "      <td>KIR2DL1</td>\n",
-       "      <td>protein-coding</td>\n",
-       "      <td>113</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>https://oct2024.archive.ensembl.org/Homo_sapie...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>35858 rows × 9 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       ensembl_gene_id                                               name  \\\n",
-       "0      ENSG00000151650                                      VENT homeobox   \n",
-       "1      ENSG00000168268                5'-nucleotidase domain containing 2   \n",
-       "2      ENSG00000186310               nucleosome assembly protein 1 like 3   \n",
-       "3      ENSG00000204616                     tripartite motif containing 31   \n",
-       "4      ENSG00000158467                      adenosylhomocysteinase like 2   \n",
-       "...                ...                                                ...   \n",
-       "35853  ENSG00000241472                              PTPRG antisense RNA 1   \n",
-       "35854  ENSG00000133106                   epithelial stromal interaction 1   \n",
-       "35855  ENSG00000230373                golgin A6 family like 3, pseudogene   \n",
-       "35856  ENSG00000249738                          uncharacterized LOC285626   \n",
-       "35857  ENSG00000276387  killer cell immunoglobulin like receptor, two ...   \n",
-       "\n",
-       "                                                   alias  \\\n",
-       "0                                [NA88A, HPX42B, VENTX2]   \n",
-       "1                                                     []   \n",
-       "2                                           [MB20, NPL3]   \n",
-       "3                             [C6orf13, RNF, HCGI, HCG1]   \n",
-       "4                                   [IRBIT2, ADOHCYASE3]   \n",
-       "...                                                  ...   \n",
-       "35853                                                 []   \n",
-       "35854                                           [BRESI1]   \n",
-       "35855                 [GOLGA6L3, GOLGA6L21P, GOLGA6L17P]   \n",
-       "35856                                     [LOC105377683]   \n",
-       "35857  [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...   \n",
-       "\n",
-       "                                                 summary     symbol  \\\n",
-       "0      This gene encodes a member of the Vent family ...      VENTX   \n",
-       "1      Predicted to enable 5'-nucleotidase activity. ...     NT5DC2   \n",
-       "2      This gene is intronless and encodes a member o...     NAP1L3   \n",
-       "3      This gene encodes a protein that functions as ...     TRIM31   \n",
-       "4      The protein encoded by this gene acts as a hom...     AHCYL2   \n",
-       "...                                                  ...        ...   \n",
-       "35853                                                NaN  PTPRG-AS1   \n",
-       "35854  The protein encoded by this gene has been show...     EPSTI1   \n",
-       "35855                                                NaN  GOLGA6L3P   \n",
-       "35856                                                NaN  LOC285626   \n",
-       "35857  Killer cell immunoglobulin-like receptors (KIR...    KIR2DL1   \n",
-       "\n",
-       "         type_of_gene ensembl_release possible_replacement  \\\n",
-       "0      protein-coding             113                   []   \n",
-       "1      protein-coding             113                   []   \n",
-       "2      protein-coding             113                   []   \n",
-       "3      protein-coding             113                   []   \n",
-       "4      protein-coding             113                   []   \n",
-       "...               ...             ...                  ...   \n",
-       "35853           ncRNA             113                   []   \n",
-       "35854  protein-coding             113                   []   \n",
-       "35855          pseudo             113                   []   \n",
-       "35856           ncRNA             113                   []   \n",
-       "35857  protein-coding             113                   []   \n",
-       "\n",
-       "                                               permalink  \n",
-       "0      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "1      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "2      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "3      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "4      https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "...                                                  ...  \n",
-       "35853  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "35854  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "35855  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "35856  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "35857  https://oct2024.archive.ensembl.org/Homo_sapie...  \n",
-       "\n",
-       "[35858 rows x 9 columns]"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n",
     "    \"possible_replacement\"\n",
-    "].apply(lambda cell: cell if cell is not np.NaN else [])\n",
-    "\n",
-    "gene_table_merged[\"possible_replacement\"] = gene_table_merged.apply(\n",
-    "    lambda row: (\n",
-    "        row[\"possible_replacement\"]\n",
-    "        if len(row[\"possible_replacement\"]) == 0\n",
-    "        else [x[\"stable_id\"] for x in row[\"possible_replacement\"]]\n",
-    "    ),\n",
-    "    axis=1,\n",
-    ")\n",
+    "].apply(lambda pr: pr if len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n",
     "\n",
     "gene_table_merged = gene_table_merged[\n",
     "    [\n",
@@ -2501,7 +610,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "id": "f2287922",
    "metadata": {},
    "outputs": [],
diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
index 4ee36b02..c9735fd2 100644
--- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
+++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
@@ -7,6 +7,10 @@
     r_query_biomart - queries Biomart using rpy2
     filter_hasgs - removes human alternative sequence genes from a data frame
     get_all_adt_ensembl_ids - gets the Ensembl IDs in all of the files ingested by ADT
+    standardize_list_item - turn values of varying types into a list. Used for fixing the "alias" and
+                            "possible_replacement" fields of gene_metadata.
+    merge_duplicate_ensembl_ids - collapse rows with the same Ensembl ID but different gene symbols
+                                  or aliases into one row
 """
 
 import pandas as pd
@@ -330,3 +334,77 @@ def _extract_ensembl_ids(
 
     # Remove duplicate values
     return list(set(file_ensembl_ids))
+
+
+def standardize_list_item(item: Union[str, List[str]]) -> List[str]:
+    """
+    For the gene_metadata data frame, some queries return columns that are a mixture of None/NaN,
+    a single string, and a list of strings. This function standardizes the column values so that
+    everything is a list, either empty (if NaN) or a list of strings. The final list is sorted
+    alphabetically to make comparison between different versions of the file easier.
+
+    This function is intended to be called as part of an apply() statement on a pandas data frame
+    column.
+
+    Args:
+        item: either a list of strings, a list of lists of strings, or np.NaN
+
+    Returns:
+        A single-level list of strings, which may be empty. The list is sorted alphabetically.
+    """
+    # Convert NaN to an empty list
+    if item is np.NaN:
+        return []
+
+    # Convert plain strings to a list of one string
+    if isinstance(item, str):
+        return [item]
+
+    # Get unique values only and sort them
+    item = list(set(item))
+    item.sort()
+    return item
+
+
+def merge_duplicate_ensembl_ids(gene_table: pd.DataFrame) -> pd.DataFrame:
+    """
+    MyGene queries sometimes return multiple rows with the same Ensembl ID but different symbols
+    or other information. This usually happens when a single Ensembl ID maps to multiple Entrez IDs
+    in the NCBI database. There's not a good way to reconcile this, so for every set of rows with the
+    same Ensembl ID, we designate the first entry in the set as the main row. The gene symbols of the
+    remaining rows in the set are then added as aliases to the "main" row, and all of their aliases
+    are added to the main row alias field as well. All rows in the set except the main row are then
+    deleted from the data frame, leaving a single row for that Ensembl ID with all symbols and aliases
+    from the duplicate rows merged into the alias field.
+
+    Args:
+        gene_table: a pandas DataFrame containing gene metadata results from MyGene
+
+    Returns:
+        a data frame with duplicate rows removed
+    """
+    dupes = gene_table["ensembl_gene_id"].duplicated()
+    dupe_ids = gene_table.loc[dupes, "ensembl_gene_id"].drop_duplicates().tolist()
+
+    for ens_id in dupe_ids:
+        rows = gene_table.loc[gene_table["ensembl_gene_id"] == ens_id]
+
+        # Add duplicate rows' symbols to the alias field of the first row, then add duplicate rows'
+        # aliases to the first row's alias field. All other information in the duplicate rows is
+        # discarded.
+        new_alias = rows.iloc[0]["alias"]
+
+        for row in rows.index[1:]:
+            new_alias.append(rows.loc[row, "symbol"])
+            new_alias = new_alias + rows.loc[row, "alias"]
+
+        # Remove any duplicate aliases and sort them
+        new_alias = list(set(new_alias))
+        new_alias.sort()
+
+        # Set the new aliases to the first row in this group and remove all duplicate rows from the
+        # data frame
+        gene_table.at[rows.index[0], "alias"] = new_alias
+        gene_table = gene_table.drop(rows.index[1:])
+
+    return gene_table

From 503d3012b7dcaeae967a9c13f55a823b1338ce88 Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Fri, 22 Nov 2024 11:32:13 -0800
Subject: [PATCH 5/8] Updated uniprot mapping script to use new preprocessing
 function to get all ADT ids

---
 .../AG-1388_ENSG_Uniprot_Mapping.ipynb        | 160 ++----------------
 1 file changed, 10 insertions(+), 150 deletions(-)

diff --git a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
index ba477beb..2b369886 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb
@@ -20,16 +20,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "from unipressed import IdMappingClient\n",
     "import time\n",
     "import pandas as pd\n",
-    "import numpy as np\n",
-    "import agoradatatools.etl.utils as utils\n",
-    "import agoradatatools.etl.extract as extract\n",
+    "import preprocessing_utils\n",
     "\n",
     "config_filename = \"../../../../config.yaml\""
    ]
@@ -43,157 +41,19 @@
     "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. NOTE: In the future, it would be simpler to just load the `gene_metadata` data set once druggability genes are removed from it, rather than looping through all of these files. "
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'genes_biodomains': ('syn44151254.5', 'csv'),\n",
-       " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n",
-       " 'proteomics': ('syn18689335.3', 'csv'),\n",
-       " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n",
-       " 'proteomics_srm': ('syn52579640.4', 'csv'),\n",
-       " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n",
-       " 'metabolomics': ('syn26064497.1', 'feather'),\n",
-       " 'igap': ('syn12514826.5', 'csv'),\n",
-       " 'eqtl': ('syn12514912.3', 'csv'),\n",
-       " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n",
-       " 'target_list': ('syn12540368.47', 'csv'),\n",
-       " 'median_expression': ('syn27211878.2', 'csv'),\n",
-       " 'tep_adi_info': ('syn51942280.2', 'csv'),\n",
-       " 'team_info': ('syn12615624.18', 'csv'),\n",
-       " 'team_member_info': ('syn12615633.18', 'csv'),\n",
-       " 'overall_scores': ('syn25575156.13', 'table'),\n",
-       " 'networks': ('syn11685347.1', 'csv')}"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "config = utils._get_config(config_path=config_filename)\n",
-    "datasets = config[\"datasets\"]\n",
-    "\n",
-    "files = {}\n",
-    "\n",
-    "for dataset in datasets:\n",
-    "    dataset_name = list(dataset.keys())[0]\n",
-    "\n",
-    "    for entity in dataset[dataset_name][\"files\"]:\n",
-    "        entity_id = entity[\"id\"]\n",
-    "        entity_format = entity[\"format\"]\n",
-    "        entity_name = entity[\"name\"]\n",
-    "\n",
-    "        # Ignore json files, which are post-processed and not what we're interested in.\n",
-    "        # Also ignore \"druggability\" since we want to exclude druggability-only genes, and \n",
-    "        # \"gene_metadata\" which includes druggability genes.\n",
-    "        if entity_format != \"json\" and entity_name not in [\"druggability\", \"gene_metadata\"]:\n",
-    "            files[entity_name] = (entity_id, entity_format)\n",
-    "\n",
-    "# There are some duplicate synID's in this list but that doesn't really matter\n",
-    "files"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "UPGRADE AVAILABLE\n",
-      "\n",
-      "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n",
-      "    pip install --upgrade synapseclient\n",
-      "\n",
-      "Python Synapse Client version 4.6.0 release notes\n",
-      "\n",
-      "https://python-docs.synapse.org/news/\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Welcome, Jaclyn Beck!\n",
-      "\n",
-      "INFO: 2024-11-15 11:43:36 | synapseclient_default | Welcome, Jaclyn Beck!\n",
-      "\n",
-      "genes_biodomains has an NaN Ensembl ID\n",
-      "WARNING: no Ensembl ID column found for team_info!\n",
-      "WARNING: no Ensembl ID column found for team_member_info!\n"
-     ]
-    }
-   ],
-   "source": [
-    "syn = utils._login_to_synapse(token=None)  # Assumes you have already logged in with a valid token\n",
-    "\n",
-    "# The various column names used to store Ensembl IDs in the files\n",
-    "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n",
-    "file_ensembl_list = []\n",
-    "\n",
-    "for file in files.keys():\n",
-    "    df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n",
-    "\n",
-    "    file_ensembl_ids = None\n",
-    "\n",
-    "    for C in col_names:\n",
-    "        if C in df.columns:\n",
-    "            file_ensembl_ids = df[C]\n",
-    "\n",
-    "    # networks file is a special case\n",
-    "    if file == \"networks\":\n",
-    "        file_ensembl_ids = pd.melt(\n",
-    "            df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n",
-    "        )[\"value\"]\n",
-    "\n",
-    "    if file_ensembl_ids is not None:\n",
-    "        file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n",
-    "        if \"n/A\" in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an n/A Ensembl ID\")\n",
-    "            file_ensembl_list.remove(\"n/A\")\n",
-    "        if np.NaN in file_ensembl_ids.tolist():\n",
-    "            print(file + \" has an NaN Ensembl ID\")\n",
-    "    else:\n",
-    "        print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "35858\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "file_ensembl_list = list(set(file_ensembl_list))\n",
-    "\n",
-    "# NaNs will be floats, so this removes them. Using np.isnan() on strings throws an error.\n",
-    "ensembl_ids = [x for x in file_ensembl_list if isinstance(x, str)]\n",
-    "\n",
-    "print(len(ensembl_ids))"
+    "ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n",
+    "    config_filename=config_filename,\n",
+    "    exclude_files=[\"gene_metadata\", \"druggability\"],\n",
+    "    token=None,\n",
+    ")\n",
+    "print(\"\")\n",
+    "print(str(len(ensembl_ids)) + \" Ensembl IDs found.\")"
    ]
   },
   {

From ab1bb82407000bdb046180992be108d2f8a5ffac Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Fri, 22 Nov 2024 13:45:49 -0800
Subject: [PATCH 6/8] Fixed standardize_list_item to work for
 possible_replacement

---
 .../AG-896_Preprocess_Gene_Annotations.ipynb           |  4 ++++
 .../notebooks/preprocessing/preprocessing_utils.py     | 10 +++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
index bfbab4b4..e09882a4 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
@@ -578,6 +578,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n",
+    "    \"possible_replacement\"\n",
+    "].apply(preprocessing_utils.standardize_list_item)\n",
+    "\n",
     "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n",
     "    \"possible_replacement\"\n",
     "].apply(lambda pr: pr if len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n",
diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
index c9735fd2..f9119d01 100644
--- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
+++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
@@ -360,9 +360,13 @@ def standardize_list_item(item: Union[str, List[str]]) -> List[str]:
     if isinstance(item, str):
         return [item]
 
-    # Get unique values only and sort them
-    item = list(set(item))
-    item.sort()
+    if isinstance(item, list):
+        # Get unique values only and sort them
+        item = list(set(item))
+        item.sort()
+
+    # No extra handling necessary for other data types
+
     return item
 
 
From 3a6065544b493e3efc528cb9c8c41a267589b565 Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Fri, 22 Nov 2024 14:50:50 -0800
Subject: [PATCH 7/8] Fix to possible_replacement so the list field is
 standardized after it's actually a list of strings

---
 .../preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
index e09882a4..7550d17c 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
@@ -580,11 +580,11 @@
    "source": [
     "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n",
     "    \"possible_replacement\"\n",
-    "].apply(preprocessing_utils.standardize_list_item)\n",
+    "].apply(lambda pr: pr if pr is np.NaN or len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n",
     "\n",
     "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n",
     "    \"possible_replacement\"\n",
-    "].apply(lambda pr: pr if len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n",
+    "].apply(preprocessing_utils.standardize_list_item)\n",
     "\n",
     "gene_table_merged = gene_table_merged[\n",
     "    [\n",

From ce9dc5e4eb3b6b6b14a122168de679a31423adc0 Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Fri, 22 Nov 2024 14:55:31 -0800
Subject: [PATCH 8/8] Updated comment in the standardize list function

---
 .../agora/notebooks/preprocessing/preprocessing_utils.py      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
index f9119d01..e85f441a 100644
--- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
+++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py
@@ -347,10 +347,10 @@ def standardize_list_item(item: Union[str, List[str]]) -> List[str]:
     column.
 
     Args:
-        item: either a list of strings, a list of lists of strings, or np.NaN
+        item: either a string, a list of strings, or np.NaN
 
     Returns:
-        A single-level list of strings, which may be empty. The list is sorted alphabetically.
+        A list of strings or an empty list. The list is sorted alphabetically.
     """
     # Convert NaN to an empty list
     if item is np.NaN: