From 72e2e1ca1300b9c82cea31ba28bcd990d02b6102 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 20 Nov 2024 19:12:48 -0800 Subject: [PATCH 1/8] Removed druggability-only genes from the gene_metadata pre-processing step, and bumped the version of the file in the config to match the new file on Synapse --- config.yaml | 4 +- .../AG-896_Preprocess_Gene_Annotations.ipynb | 1824 ++++++++--------- .../preprocessing/preprocessing_utils.py | 209 +- test_config.yaml | 4 +- 4 files changed, 1012 insertions(+), 1029 deletions(-) diff --git a/config.yaml b/config.yaml index 7b8b4f4b..1ace0892 100644 --- a/config.yaml +++ b/config.yaml @@ -144,7 +144,7 @@ datasets: - gene_info: files: - name: gene_metadata - id: syn25953363.13 + id: syn25953363.14 format: feather - name: igap id: syn12514826.5 @@ -187,7 +187,7 @@ datasets: possible_replacement: ensembl_possible_replacements permalink: ensembl_permalink provenance: - - syn25953363.13 + - syn25953363.14 - syn12514826.5 - syn12514912.3 - *agora_proteomics_provenance diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb index 9ef8fedf..c0f2ad33 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb @@ -108,7 +108,7 @@ "source": [ "## Get Ensembl IDs from data sets that will be processed by agora-data-tools\n", "\n", - "Loop through all data sets in the config file to get all Ensembl IDs used in every data set." + "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. Exclude `gene_metadata` since that's the file we are building, and `druggability` since that data is deprecated." 
] }, { @@ -118,73 +118,6 @@ "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'genes_biodomains': ('syn44151254.5', 'csv'),\n", - " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n", - " 'proteomics': ('syn18689335.3', 'csv'),\n", - " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n", - " 'proteomics_srm': ('syn52579640.4', 'csv'),\n", - " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n", - " 'metabolomics': ('syn26064497.1', 'feather'),\n", - " 'igap': ('syn12514826.5', 'csv'),\n", - " 'eqtl': ('syn12514912.3', 'csv'),\n", - " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n", - " 'target_list': ('syn12540368.47', 'csv'),\n", - " 'median_expression': ('syn27211878.2', 'csv'),\n", - " 'druggability': ('syn13363443.11', 'csv'),\n", - " 'tep_adi_info': ('syn51942280.2', 'csv'),\n", - " 'team_info': ('syn12615624.18', 'csv'),\n", - " 'team_member_info': ('syn12615633.18', 'csv'),\n", - " 'overall_scores': ('syn25575156.13', 'table'),\n", - " 'networks': ('syn11685347.1', 'csv')}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config = utils._get_config(config_path=config_filename)\n", - "datasets = config[\"datasets\"]\n", - "\n", - "files = {}\n", - "\n", - "for dataset in datasets:\n", - " dataset_name = list(dataset.keys())[0]\n", - "\n", - " for entity in dataset[dataset_name][\"files\"]:\n", - " entity_id = entity[\"id\"]\n", - " entity_format = entity[\"format\"]\n", - " entity_name = entity[\"name\"]\n", - "\n", - " # Ignore json files, which are post-processed and not what we're interested in.\n", - " # Also ignore \"gene_metadata\" since that's the file we're making here.\n", - " if entity_format != \"json\" and entity_name != \"gene_metadata\":\n", - " files[entity_name] = (entity_id, entity_format)\n", - "\n", - "# There are some duplicate synID's in this list but that doesn't really matter\n", - "files" - ] - }, - { - "cell_type": 
"markdown", - "id": "8f1a2120", - "metadata": {}, - "source": [ - "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "9843689d", - "metadata": { - "scrolled": true - }, "outputs": [ { "name": "stderr", @@ -193,10 +126,10 @@ "\n", "UPGRADE AVAILABLE\n", "\n", - "A more recent version of the Synapse Client (4.2.0) is available. Your version (4.0.0) can be upgraded by typing:\n", + "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n", " pip install --upgrade synapseclient\n", "\n", - "Python Synapse Client version 4.2.0 release notes\n", + "Python Synapse Client version 4.6.0 release notes\n", "\n", "https://python-docs.synapse.org/news/\n", "\n" @@ -222,50 +155,58 @@ "name": "stdout", "output_type": "stream", "text": [ - "genes_biodomains has an NaN Ensembl ID\n", + "Found 19 files:\n", + "genes_biodomains:\tsyn44151254.5\n", + "neuropath_regression_results:\tsyn22017882.5\n", + "proteomics:\tsyn18689335.4\n", + "proteomics_tmt:\tsyn35221005.2\n", + "proteomics_srm:\tsyn52579640.4\n", + "target_exp_validation_harmonized:\tsyn24184512.9\n", + "metabolomics:\tsyn26064497.1\n", + "gene_metadata:\tsyn25953363.13\n", + "igap:\tsyn12514826.5\n", + "eqtl:\tsyn12514912.3\n", + "diff_exp_data:\tsyn27211942.1\n", + "target_list:\tsyn12540368.51\n", + "median_expression:\tsyn27211878.2\n", + "druggability:\tsyn13363443.11\n", + "tep_adi_info:\tsyn51942280.3\n", + "team_info:\tsyn12615624.18\n", + "team_member_info:\tsyn12615633.19\n", + "overall_scores:\tsyn25575156.13\n", + "networks:\tsyn11685347.1\n", + "\n", + "genes_biodomains has 591 NaN Ensembl IDs\n", "WARNING: no Ensembl ID column found for team_info!\n", - "WARNING: no Ensembl ID column found for team_member_info!\n" + "WARNING: no Ensembl ID column found for team_member_info!\n", + "\n", + "35858 Ensembl IDs found.\n", + 
"['ENSG00000151650', 'ENSG00000168268', 'ENSG00000186310', 'ENSG00000204616', 'ENSG00000158467']\n" ] } ], "source": [ - "syn = utils._login_to_synapse(\n", - " token=None\n", - ") # Assumes you have already logged in with a valid token\n", - "\n", - "# The various column names used to store Ensembl IDs in the files\n", - "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n", - "file_ensembl_list = []\n", - "\n", - "for file in files.keys():\n", - " df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n", - "\n", - " file_ensembl_ids = None\n", - "\n", - " for C in col_names:\n", - " if C in df.columns:\n", - " file_ensembl_ids = df[C]\n", - "\n", - " # networks file is a special case\n", - " if file == \"networks\":\n", - " file_ensembl_ids = pd.melt(\n", - " df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n", - " )[\"value\"]\n", - "\n", - " if file_ensembl_ids is not None:\n", - " file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n", - " if \"n/A\" in file_ensembl_ids.tolist():\n", - " print(file + \" has an n/A Ensembl ID\")\n", - " file_ensembl_list.remove(\"n/A\")\n", - " if np.NaN in file_ensembl_ids.tolist():\n", - " print(file + \" has an NaN Ensembl ID\")\n", - " else:\n", - " print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")" + "file_ensembl_list = preprocessing_utils.get_all_adt_ensembl_ids(\n", + " config_filename=config_filename,\n", + " exclude_files=[\"gene_metadata\", \"druggability\"],\n", + " token=None,\n", + ")\n", + "print(\"\")\n", + "print(str(len(file_ensembl_list)) + \" Ensembl IDs found.\")\n", + "print(file_ensembl_list[0:5])" + ] + }, + { + "cell_type": "markdown", + "id": "5fa76bfb", + "metadata": {}, + "source": [ + "Create a data frame with these IDs so it can be merged with the MyGene query results below." 
] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "f1303e5b", "metadata": {}, "outputs": [ @@ -273,13 +214,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "37452\n" + "35858\n" ] } ], "source": [ - "file_ensembl_list = list(set(file_ensembl_list))\n", - "\n", "ensembl_ids_df = pd.DataFrame({\"ensembl_gene_id\": file_ensembl_list})\n", "\n", "\"\"\" Removed due to no longer getting genes from BioMart, but saving code\n", @@ -300,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "4e7a37c8", "metadata": {}, "outputs": [], @@ -321,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "7ebd03d4", "metadata": { "scrolled": true @@ -401,11 +340,7 @@ "INFO:biothings.client:done.\n", "INFO:biothings.client:querying 34001-35000...\n", "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 35001-36000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 36001-37000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 37001-37452...\n", + "INFO:biothings.client:querying 35001-35858...\n", "INFO:biothings.client:done.\n" ] }, @@ -453,57 +388,57 @@ " \n", " \n", " \n", - " ENSG00000164972\n", - " 84688\n", - " 2.0\n", - " [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4]\n", - " sperm microtubule inner protein 6\n", - " This gene encodes a nuclear- or perinuclear-lo...\n", - " SPMIP6\n", + " ENSG00000151650\n", + " 27287\n", + " 1.0\n", + " [HPX42B, NA88A, VENTX2]\n", + " VENT homeobox\n", + " This gene encodes a member of the Vent family ...\n", + " VENTX\n", " protein-coding\n", " NaN\n", " \n", " \n", - " ENSG00000169105\n", - " 113189\n", - " 2.0\n", - " [ATCS, D4ST1, EDSMC1, HNK1ST]\n", - " carbohydrate sulfotransferase 14\n", - " This gene encodes a member of the HNK-1 family...\n", - " CHST14\n", + " ENSG00000168268\n", + " 64943\n", + " 1.0\n", + " NaN\n", + " 5'-nucleotidase domain containing 
2\n", + " Predicted to enable 5'-nucleotidase activity. ...\n", + " NT5DC2\n", " protein-coding\n", " NaN\n", " \n", " \n", - " ENSG00000255136\n", - " ENSG00000255136\n", + " ENSG00000186310\n", + " 4675\n", " 1.0\n", - " NaN\n", - " TPBGL antisense RNA 1\n", - " NaN\n", - " TPBGL-AS1\n", - " NaN\n", + " [MB20, NPL3]\n", + " nucleosome assembly protein 1 like 3\n", + " This gene is intronless and encodes a member o...\n", + " NAP1L3\n", + " protein-coding\n", " NaN\n", " \n", " \n", - " ENSG00000105499\n", - " 8605\n", + " ENSG00000204616\n", + " 11074\n", " 1.0\n", - " CPLA2-gamma\n", - " phospholipase A2 group IVC\n", - " This gene encodes a protein which is a member ...\n", - " PLA2G4C\n", + " [C6orf13, HCG1, HCGI, RNF]\n", + " tripartite motif containing 31\n", + " This gene encodes a protein that functions as ...\n", + " TRIM31\n", " protein-coding\n", " NaN\n", " \n", " \n", - " ENSG00000104611\n", - " 63898\n", + " ENSG00000158467\n", + " 23382\n", " 1.0\n", - " [PPP1R38, SH2A]\n", - " SH2 domain containing 4A\n", - " Enables phosphatase binding activity. 
Located ...\n", - " SH2D4A\n", + " [ADOHCYASE3, IRBIT2]\n", + " adenosylhomocysteinase like 2\n", + " The protein encoded by this gene acts as a hom...\n", + " AHCYL2\n", " protein-coding\n", " NaN\n", " \n", @@ -512,48 +447,40 @@ "" ], "text/plain": [ - " _id _version \\\n", - "ensembl_gene_id \n", - "ENSG00000164972 84688 2.0 \n", - "ENSG00000169105 113189 2.0 \n", - "ENSG00000255136 ENSG00000255136 1.0 \n", - "ENSG00000105499 8605 1.0 \n", - "ENSG00000104611 63898 1.0 \n", - "\n", - " alias \\\n", - "ensembl_gene_id \n", - "ENSG00000164972 [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4] \n", - "ENSG00000169105 [ATCS, D4ST1, EDSMC1, HNK1ST] \n", - "ENSG00000255136 NaN \n", - "ENSG00000105499 CPLA2-gamma \n", - "ENSG00000104611 [PPP1R38, SH2A] \n", + " _id _version alias \\\n", + "ensembl_gene_id \n", + "ENSG00000151650 27287 1.0 [HPX42B, NA88A, VENTX2] \n", + "ENSG00000168268 64943 1.0 NaN \n", + "ENSG00000186310 4675 1.0 [MB20, NPL3] \n", + "ENSG00000204616 11074 1.0 [C6orf13, HCG1, HCGI, RNF] \n", + "ENSG00000158467 23382 1.0 [ADOHCYASE3, IRBIT2] \n", "\n", - " name \\\n", - "ensembl_gene_id \n", - "ENSG00000164972 sperm microtubule inner protein 6 \n", - "ENSG00000169105 carbohydrate sulfotransferase 14 \n", - "ENSG00000255136 TPBGL antisense RNA 1 \n", - "ENSG00000105499 phospholipase A2 group IVC \n", - "ENSG00000104611 SH2 domain containing 4A \n", + " name \\\n", + "ensembl_gene_id \n", + "ENSG00000151650 VENT homeobox \n", + "ENSG00000168268 5'-nucleotidase domain containing 2 \n", + "ENSG00000186310 nucleosome assembly protein 1 like 3 \n", + "ENSG00000204616 tripartite motif containing 31 \n", + "ENSG00000158467 adenosylhomocysteinase like 2 \n", "\n", - " summary symbol \\\n", - "ensembl_gene_id \n", - "ENSG00000164972 This gene encodes a nuclear- or perinuclear-lo... SPMIP6 \n", - "ENSG00000169105 This gene encodes a member of the HNK-1 family... 
CHST14 \n", - "ENSG00000255136 NaN TPBGL-AS1 \n", - "ENSG00000105499 This gene encodes a protein which is a member ... PLA2G4C \n", - "ENSG00000104611 Enables phosphatase binding activity. Located ... SH2D4A \n", + " summary symbol \\\n", + "ensembl_gene_id \n", + "ENSG00000151650 This gene encodes a member of the Vent family ... VENTX \n", + "ENSG00000168268 Predicted to enable 5'-nucleotidase activity. ... NT5DC2 \n", + "ENSG00000186310 This gene is intronless and encodes a member o... NAP1L3 \n", + "ENSG00000204616 This gene encodes a protein that functions as ... TRIM31 \n", + "ENSG00000158467 The protein encoded by this gene acts as a hom... AHCYL2 \n", "\n", " type_of_gene notfound \n", "ensembl_gene_id \n", - "ENSG00000164972 protein-coding NaN \n", - "ENSG00000169105 protein-coding NaN \n", - "ENSG00000255136 NaN NaN \n", - "ENSG00000105499 protein-coding NaN \n", - "ENSG00000104611 protein-coding NaN " + "ENSG00000151650 protein-coding NaN \n", + "ENSG00000168268 protein-coding NaN \n", + "ENSG00000186310 protein-coding NaN \n", + "ENSG00000204616 protein-coding NaN \n", + "ENSG00000158467 protein-coding NaN " ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -573,7 +500,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "23bb114e", "metadata": { "scrolled": true @@ -583,8 +510,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Annotations found for 36284 genes.\n", - "No annotations found for 1175 genes.\n" + "Annotations found for 34655 genes.\n", + "No annotations found for 1206 genes.\n" ] } ], @@ -611,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 27, "id": "186d8cb8", "metadata": { "scrolled": true @@ -621,7 +548,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(37459, 9)\n" + "(35861, 9)\n" ] }, { @@ -659,61 +586,61 @@ " \n", " \n", " 0\n", - " ENSG00000164972\n", - " 84688\n", - " 2.0\n", - " [C9orf24, 
CBE1, NYD-SP22, SMRP1, bA573M23.4]\n", - " sperm microtubule inner protein 6\n", - " This gene encodes a nuclear- or perinuclear-lo...\n", - " SPMIP6\n", + " ENSG00000151650\n", + " 27287\n", + " 1.0\n", + " [HPX42B, NA88A, VENTX2]\n", + " VENT homeobox\n", + " This gene encodes a member of the Vent family ...\n", + " VENTX\n", " protein-coding\n", " NaN\n", " \n", " \n", " 1\n", - " ENSG00000169105\n", - " 113189\n", - " 2.0\n", - " [ATCS, D4ST1, EDSMC1, HNK1ST]\n", - " carbohydrate sulfotransferase 14\n", - " This gene encodes a member of the HNK-1 family...\n", - " CHST14\n", + " ENSG00000168268\n", + " 64943\n", + " 1.0\n", + " NaN\n", + " 5'-nucleotidase domain containing 2\n", + " Predicted to enable 5'-nucleotidase activity. ...\n", + " NT5DC2\n", " protein-coding\n", " NaN\n", " \n", " \n", " 2\n", - " ENSG00000255136\n", - " ENSG00000255136\n", + " ENSG00000186310\n", + " 4675\n", " 1.0\n", - " NaN\n", - " TPBGL antisense RNA 1\n", - " NaN\n", - " TPBGL-AS1\n", - " NaN\n", + " [MB20, NPL3]\n", + " nucleosome assembly protein 1 like 3\n", + " This gene is intronless and encodes a member o...\n", + " NAP1L3\n", + " protein-coding\n", " NaN\n", " \n", " \n", " 3\n", - " ENSG00000105499\n", - " 8605\n", + " ENSG00000204616\n", + " 11074\n", " 1.0\n", - " CPLA2-gamma\n", - " phospholipase A2 group IVC\n", - " This gene encodes a protein which is a member ...\n", - " PLA2G4C\n", + " [C6orf13, HCG1, HCGI, RNF]\n", + " tripartite motif containing 31\n", + " This gene encodes a protein that functions as ...\n", + " TRIM31\n", " protein-coding\n", " NaN\n", " \n", " \n", " 4\n", - " ENSG00000104611\n", - " 63898\n", + " ENSG00000158467\n", + " 23382\n", " 1.0\n", - " [PPP1R38, SH2A]\n", - " SH2 domain containing 4A\n", - " Enables phosphatase binding activity. 
Located ...\n", - " SH2D4A\n", + " [ADOHCYASE3, IRBIT2]\n", + " adenosylhomocysteinase like 2\n", + " The protein encoded by this gene acts as a hom...\n", + " AHCYL2\n", " protein-coding\n", " NaN\n", " \n", @@ -722,43 +649,36 @@ "" ], "text/plain": [ - " ensembl_gene_id _id _version \\\n", - "0 ENSG00000164972 84688 2.0 \n", - "1 ENSG00000169105 113189 2.0 \n", - "2 ENSG00000255136 ENSG00000255136 1.0 \n", - "3 ENSG00000105499 8605 1.0 \n", - "4 ENSG00000104611 63898 1.0 \n", - "\n", - " alias \\\n", - "0 [C9orf24, CBE1, NYD-SP22, SMRP1, bA573M23.4] \n", - "1 [ATCS, D4ST1, EDSMC1, HNK1ST] \n", - "2 NaN \n", - "3 CPLA2-gamma \n", - "4 [PPP1R38, SH2A] \n", + " ensembl_gene_id _id _version alias \\\n", + "0 ENSG00000151650 27287 1.0 [HPX42B, NA88A, VENTX2] \n", + "1 ENSG00000168268 64943 1.0 NaN \n", + "2 ENSG00000186310 4675 1.0 [MB20, NPL3] \n", + "3 ENSG00000204616 11074 1.0 [C6orf13, HCG1, HCGI, RNF] \n", + "4 ENSG00000158467 23382 1.0 [ADOHCYASE3, IRBIT2] \n", "\n", - " name \\\n", - "0 sperm microtubule inner protein 6 \n", - "1 carbohydrate sulfotransferase 14 \n", - "2 TPBGL antisense RNA 1 \n", - "3 phospholipase A2 group IVC \n", - "4 SH2 domain containing 4A \n", + " name \\\n", + "0 VENT homeobox \n", + "1 5'-nucleotidase domain containing 2 \n", + "2 nucleosome assembly protein 1 like 3 \n", + "3 tripartite motif containing 31 \n", + "4 adenosylhomocysteinase like 2 \n", "\n", - " summary symbol \\\n", - "0 This gene encodes a nuclear- or perinuclear-lo... SPMIP6 \n", - "1 This gene encodes a member of the HNK-1 family... CHST14 \n", - "2 NaN TPBGL-AS1 \n", - "3 This gene encodes a protein which is a member ... PLA2G4C \n", - "4 Enables phosphatase binding activity. Located ... SH2D4A \n", + " summary symbol type_of_gene \\\n", + "0 This gene encodes a member of the Vent family ... VENTX protein-coding \n", + "1 Predicted to enable 5'-nucleotidase activity. ... NT5DC2 protein-coding \n", + "2 This gene is intronless and encodes a member o... 
NAP1L3 protein-coding \n", + "3 This gene encodes a protein that functions as ... TRIM31 protein-coding \n", + "4 The protein encoded by this gene acts as a hom... AHCYL2 protein-coding \n", "\n", - " type_of_gene notfound \n", - "0 protein-coding NaN \n", - "1 protein-coding NaN \n", - "2 NaN NaN \n", - "3 protein-coding NaN \n", - "4 protein-coding NaN " + " notfound \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " ] }, - "execution_count": 9, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +711,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 28, "id": "285c10d2", "metadata": { "scrolled": true @@ -799,8 +719,9 @@ "outputs": [], "source": [ "# NaN or NULL alias values become empty lists\n", - "for row in gene_table_merged.loc[gene_table_merged[\"alias\"].isnull(), \"alias\"].index:\n", - " gene_table_merged.at[row, \"alias\"] = []\n", + "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n", + " lambda cell: cell if cell is not np.NaN else []\n", + ")\n", "\n", "# Some alias values are a single string, not a list. 
Turn them into lists here.\n", "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n", @@ -836,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 29, "id": "bc63cc53", "metadata": { "scrolled": true @@ -876,67 +797,55 @@ " \n", " \n", " \n", - " 6011\n", - " ENSG00000276518\n", - " 128966722\n", - " 1.0\n", - " []\n", - " putative killer cell immunoglobulin-like recep...\n", - " NaN\n", - " LOC128966722\n", - " protein-coding\n", - " NaN\n", - " \n", - " \n", - " 6012\n", - " ENSG00000276518\n", - " 128966732\n", + " 19626\n", + " ENSG00000249738\n", + " 285626\n", " 1.0\n", " []\n", - " putative killer cell immunoglobulin-like recep...\n", + " uncharacterized LOC285626\n", " NaN\n", - " LOC128966732\n", - " protein-coding\n", + " LOC285626\n", + " ncRNA\n", " NaN\n", " \n", " \n", - " 6013\n", - " ENSG00000276518\n", - " 128966730\n", + " 19627\n", + " ENSG00000249738\n", + " 105377683\n", " 1.0\n", " []\n", - " putative killer cell immunoglobulin-like recep...\n", + " uncharacterized LOC105377683\n", " NaN\n", - " LOC128966730\n", - " protein-coding\n", + " LOC105377683\n", + " ncRNA\n", " NaN\n", " \n", " \n", - " 6014\n", - " ENSG00000276518\n", - " 128966731\n", + " 24698\n", + " ENSG00000276387\n", + " 3802\n", " 1.0\n", - " []\n", - " putative killer cell immunoglobulin-like recep...\n", - " NaN\n", - " LOC128966731\n", + " [CD158A, NKAT1, KIR2DL3, KIR-K64, NKAT-1, p58....\n", + " killer cell immunoglobulin like receptor, two ...\n", + " Killer cell immunoglobulin-like receptors (KIR...\n", + " KIR2DL1\n", " protein-coding\n", " NaN\n", " \n", " \n", - " 6015\n", - " ENSG00000276518\n", - " 128966733\n", + " 24699\n", + " ENSG00000276387\n", + " 124900571\n", " 1.0\n", " []\n", - " putative killer cell immunoglobulin-like recep...\n", + " killer cell immunoglobulin-like receptor 2DS1\n", " NaN\n", - " LOC128966733\n", + " LOC124900571\n", " protein-coding\n", " NaN\n", " \n", " \n", - " 12139\n", + " 29514\n", 
" ENSG00000230373\n", " 100133220\n", " 1.0\n", @@ -948,7 +857,7 @@ " NaN\n", " \n", " \n", - " 12140\n", + " 29515\n", " ENSG00000230373\n", " 642402\n", " 1.0\n", @@ -959,126 +868,53 @@ " pseudo\n", " NaN\n", " \n", - " \n", - " 23329\n", - " ENSG00000276387\n", - " 124900571\n", - " 1.0\n", - " []\n", - " killer cell immunoglobulin-like receptor 2DS1\n", - " NaN\n", - " LOC124900571\n", - " protein-coding\n", - " NaN\n", - " \n", - " \n", - " 23330\n", - " ENSG00000276387\n", - " 3802\n", - " 2.0\n", - " [NKAT1, KIR2DL3, NKAT, KIR221, CD158A, p58.1, ...\n", - " killer cell immunoglobulin like receptor, two ...\n", - " Killer cell immunoglobulin-like receptors (KIR...\n", - " KIR2DL1\n", - " protein-coding\n", - " NaN\n", - " \n", - " \n", - " 31304\n", - " ENSG00000249738\n", - " 285626\n", - " 1.0\n", - " []\n", - " uncharacterized LOC285626\n", - " NaN\n", - " LOC285626\n", - " ncRNA\n", - " NaN\n", - " \n", - " \n", - " 31305\n", - " ENSG00000249738\n", - " 105377683\n", - " 1.0\n", - " []\n", - " uncharacterized LOC105377683\n", - " NaN\n", - " LOC105377683\n", - " ncRNA\n", - " NaN\n", - " \n", " \n", "\n", "" ], "text/plain": [ " ensembl_gene_id _id _version \\\n", - "6011 ENSG00000276518 128966722 1.0 \n", - "6012 ENSG00000276518 128966732 1.0 \n", - "6013 ENSG00000276518 128966730 1.0 \n", - "6014 ENSG00000276518 128966731 1.0 \n", - "6015 ENSG00000276518 128966733 1.0 \n", - "12139 ENSG00000230373 100133220 1.0 \n", - "12140 ENSG00000230373 642402 1.0 \n", - "23329 ENSG00000276387 124900571 1.0 \n", - "23330 ENSG00000276387 3802 2.0 \n", - "31304 ENSG00000249738 285626 1.0 \n", - "31305 ENSG00000249738 105377683 1.0 \n", + "19626 ENSG00000249738 285626 1.0 \n", + "19627 ENSG00000249738 105377683 1.0 \n", + "24698 ENSG00000276387 3802 1.0 \n", + "24699 ENSG00000276387 124900571 1.0 \n", + "29514 ENSG00000230373 100133220 1.0 \n", + "29515 ENSG00000230373 642402 1.0 \n", "\n", " alias \\\n", - "6011 [] \n", - "6012 [] \n", - "6013 [] \n", - "6014 [] \n", 
- "6015 [] \n", - "12139 [GOLGA6L3] \n", - "12140 [GOLGA6L21P] \n", - "23329 [] \n", - "23330 [NKAT1, KIR2DL3, NKAT, KIR221, CD158A, p58.1, ... \n", - "31304 [] \n", - "31305 [] \n", + "19626 [] \n", + "19627 [] \n", + "24698 [CD158A, NKAT1, KIR2DL3, KIR-K64, NKAT-1, p58.... \n", + "24699 [] \n", + "29514 [GOLGA6L3] \n", + "29515 [GOLGA6L21P] \n", "\n", " name \\\n", - "6011 putative killer cell immunoglobulin-like recep... \n", - "6012 putative killer cell immunoglobulin-like recep... \n", - "6013 putative killer cell immunoglobulin-like recep... \n", - "6014 putative killer cell immunoglobulin-like recep... \n", - "6015 putative killer cell immunoglobulin-like recep... \n", - "12139 golgin A6 family like 3, pseudogene \n", - "12140 golgin A6 family like 17, pseudogene \n", - "23329 killer cell immunoglobulin-like receptor 2DS1 \n", - "23330 killer cell immunoglobulin like receptor, two ... \n", - "31304 uncharacterized LOC285626 \n", - "31305 uncharacterized LOC105377683 \n", + "19626 uncharacterized LOC285626 \n", + "19627 uncharacterized LOC105377683 \n", + "24698 killer cell immunoglobulin like receptor, two ... \n", + "24699 killer cell immunoglobulin-like receptor 2DS1 \n", + "29514 golgin A6 family like 3, pseudogene \n", + "29515 golgin A6 family like 17, pseudogene \n", "\n", " summary symbol \\\n", - "6011 NaN LOC128966722 \n", - "6012 NaN LOC128966732 \n", - "6013 NaN LOC128966730 \n", - "6014 NaN LOC128966731 \n", - "6015 NaN LOC128966733 \n", - "12139 NaN GOLGA6L3P \n", - "12140 NaN GOLGA6L17P \n", - "23329 NaN LOC124900571 \n", - "23330 Killer cell immunoglobulin-like receptors (KIR... KIR2DL1 \n", - "31304 NaN LOC285626 \n", - "31305 NaN LOC105377683 \n", + "19626 NaN LOC285626 \n", + "19627 NaN LOC105377683 \n", + "24698 Killer cell immunoglobulin-like receptors (KIR... 
KIR2DL1 \n", + "24699 NaN LOC124900571 \n", + "29514 NaN GOLGA6L3P \n", + "29515 NaN GOLGA6L17P \n", "\n", " type_of_gene notfound \n", - "6011 protein-coding NaN \n", - "6012 protein-coding NaN \n", - "6013 protein-coding NaN \n", - "6014 protein-coding NaN \n", - "6015 protein-coding NaN \n", - "12139 pseudo NaN \n", - "12140 pseudo NaN \n", - "23329 protein-coding NaN \n", - "23330 protein-coding NaN \n", - "31304 ncRNA NaN \n", - "31305 ncRNA NaN " + "19626 ncRNA NaN \n", + "19627 ncRNA NaN \n", + "24698 protein-coding NaN \n", + "24699 protein-coding NaN \n", + "29514 pseudo NaN \n", + "29515 pseudo NaN " ] }, - "execution_count": 11, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1097,7 +933,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "093a2e98", "metadata": {}, "outputs": [ @@ -1105,7 +941,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "4 duplicated genes have been processed.\n" + "3 duplicated genes have been processed.\n" ] }, { @@ -1142,107 +978,95 @@ " \n", " \n", " \n", - " 37442\n", - " ENSG00000163811\n", - " 23160\n", + " 35848\n", + " ENSG00000085998\n", + " 55624\n", " 1.0\n", - " [NET12, UTP5]\n", - " WD repeat domain 43\n", - " Enables RNA binding activity. 
Involved in posi...\n", - " WDR43\n", + " [RP76, LGMDR15, LGMD2O, gnT-I.2, GNTI.2, GnT I...\n", + " protein O-linked mannose N-acetylglucosaminylt...\n", + " This gene encodes a type II transmembrane prot...\n", + " POMGNT1\n", " protein-coding\n", " NaN\n", " \n", " \n", - " 37443\n", - " ENSG00000226467\n", - " 10554\n", + " 35849\n", + " ENSG00000285081\n", + " ENSG00000285081\n", " 1.0\n", - " [G15, LPLAT1, 1-AGPAT1, LPAATA, LPAAT-alpha]\n", - " 1-acylglycerol-3-phosphate O-acyltransferase 1\n", - " This gene encodes an enzyme that converts lyso...\n", - " AGPAT1\n", - " protein-coding\n", + " []\n", " NaN\n", - " \n", - " \n", - " 37444\n", - " ENSG00000120533\n", - " 56943\n", - " 1.0\n", - " [Sus1, e(y)2, DC6]\n", - " ENY2 transcription and export complex 2 subunit\n", - " Enables nuclear receptor coactivator activity....\n", - " ENY2\n", - " protein-coding\n", " NaN\n", - " \n", - " \n", - " 37445\n", - " ENSG00000214759\n", - " ENSG00000214759\n", - " 1.0\n", - " []\n", - " ribosomal protein L36a pseudogene 2\n", " NaN\n", - " RPL36AP2\n", " NaN\n", " NaN\n", " \n", " \n", - " 37446\n", - " ENSG00000253981\n", - " ENSG00000253981\n", + " 35850\n", + " ENSG00000126822\n", + " 26030\n", " 1.0\n", - " []\n", - " ALG1 like 13, pseudogene\n", - " NaN\n", - " ALG1L13P\n", - " NaN\n", + " [ARHGEF43, KIAA0599]\n", + " pleckstrin homology and RhoGEF domain containi...\n", + " Predicted to enable guanyl-nucleotide exchange...\n", + " PLEKHG3\n", + " protein-coding\n", " NaN\n", " \n", " \n", - " 37447\n", - " ENSG00000267206\n", - " 158062\n", + " 35851\n", + " ENSG00000187240\n", + " 79659\n", " 1.0\n", - " [hLcn5, LCN5, UNQ643]\n", - " lipocalin 6\n", - " Predicted to enable small molecule binding act...\n", - " LCN6\n", + " [DHC2, hdhc11, DNCH2, SRTD3, SRPS2B, ATD3, DHC...\n", + " dynein cytoplasmic 2 heavy chain 1\n", + " This gene encodes a large cytoplasmic dynein p...\n", + " DYNC2H1\n", " protein-coding\n", " NaN\n", " \n", " \n", - " 37448\n", - " 
ENSG00000276387\n", - " 3802\n", - " 2.0\n", - " [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...\n", - " killer cell immunoglobulin like receptor, two ...\n", - " Killer cell immunoglobulin-like receptors (KIR...\n", - " KIR2DL1\n", + " 35852\n", + " ENSG00000101470\n", + " 7125\n", + " 1.0\n", + " [CMYP15, CMYO15, CFAP85, FAP85, MYONRI]\n", + " troponin C2, fast skeletal type\n", + " Troponin (Tn), a key protein complex in the re...\n", + " TNNC2\n", " protein-coding\n", " NaN\n", " \n", " \n", - " 37449\n", - " ENSG00000276518\n", - " 128966722\n", + " 35853\n", + " ENSG00000241472\n", + " 100506994\n", " 1.0\n", - " [LOC128966730, LOC128966732, LOC128966731, LOC...\n", - " putative killer cell immunoglobulin-like recep...\n", + " []\n", + " PTPRG antisense RNA 1\n", + " NaN\n", + " PTPRG-AS1\n", + " ncRNA\n", " NaN\n", - " LOC128966722\n", + " \n", + " \n", + " 35854\n", + " ENSG00000133106\n", + " 94240\n", + " 1.0\n", + " [BRESI1]\n", + " epithelial stromal interaction 1\n", + " The protein encoded by this gene has been show...\n", + " EPSTI1\n", " protein-coding\n", " NaN\n", " \n", " \n", - " 37450\n", + " 35855\n", " ENSG00000230373\n", " 100133220\n", " 1.0\n", - " [GOLGA6L21P, GOLGA6L17P, GOLGA6L3]\n", + " [GOLGA6L3, GOLGA6L21P, GOLGA6L17P]\n", " golgin A6 family like 3, pseudogene\n", " NaN\n", " GOLGA6L3P\n", @@ -1250,7 +1074,7 @@ " NaN\n", " \n", " \n", - " 37451\n", + " 35856\n", " ENSG00000249738\n", " 285626\n", " 1.0\n", @@ -1261,80 +1085,91 @@ " ncRNA\n", " NaN\n", " \n", + " \n", + " 35857\n", + " ENSG00000276387\n", + " 3802\n", + " 1.0\n", + " [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...\n", + " killer cell immunoglobulin like receptor, two ...\n", + " Killer cell immunoglobulin-like receptors (KIR...\n", + " KIR2DL1\n", + " protein-coding\n", + " NaN\n", + " \n", " \n", "\n", "" ], "text/plain": [ " ensembl_gene_id _id _version \\\n", - "37442 ENSG00000163811 23160 1.0 \n", - "37443 ENSG00000226467 10554 1.0 \n", - "37444 
ENSG00000120533 56943 1.0 \n", - "37445 ENSG00000214759 ENSG00000214759 1.0 \n", - "37446 ENSG00000253981 ENSG00000253981 1.0 \n", - "37447 ENSG00000267206 158062 1.0 \n", - "37448 ENSG00000276387 3802 2.0 \n", - "37449 ENSG00000276518 128966722 1.0 \n", - "37450 ENSG00000230373 100133220 1.0 \n", - "37451 ENSG00000249738 285626 1.0 \n", + "35848 ENSG00000085998 55624 1.0 \n", + "35849 ENSG00000285081 ENSG00000285081 1.0 \n", + "35850 ENSG00000126822 26030 1.0 \n", + "35851 ENSG00000187240 79659 1.0 \n", + "35852 ENSG00000101470 7125 1.0 \n", + "35853 ENSG00000241472 100506994 1.0 \n", + "35854 ENSG00000133106 94240 1.0 \n", + "35855 ENSG00000230373 100133220 1.0 \n", + "35856 ENSG00000249738 285626 1.0 \n", + "35857 ENSG00000276387 3802 1.0 \n", "\n", " alias \\\n", - "37442 [NET12, UTP5] \n", - "37443 [G15, LPLAT1, 1-AGPAT1, LPAATA, LPAAT-alpha] \n", - "37444 [Sus1, e(y)2, DC6] \n", - "37445 [] \n", - "37446 [] \n", - "37447 [hLcn5, LCN5, UNQ643] \n", - "37448 [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C... \n", - "37449 [LOC128966730, LOC128966732, LOC128966731, LOC... \n", - "37450 [GOLGA6L21P, GOLGA6L17P, GOLGA6L3] \n", - "37451 [LOC105377683] \n", + "35848 [RP76, LGMDR15, LGMD2O, gnT-I.2, GNTI.2, GnT I... \n", + "35849 [] \n", + "35850 [ARHGEF43, KIAA0599] \n", + "35851 [DHC2, hdhc11, DNCH2, SRTD3, SRPS2B, ATD3, DHC... \n", + "35852 [CMYP15, CMYO15, CFAP85, FAP85, MYONRI] \n", + "35853 [] \n", + "35854 [BRESI1] \n", + "35855 [GOLGA6L3, GOLGA6L21P, GOLGA6L17P] \n", + "35856 [LOC105377683] \n", + "35857 [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64... \n", "\n", " name \\\n", - "37442 WD repeat domain 43 \n", - "37443 1-acylglycerol-3-phosphate O-acyltransferase 1 \n", - "37444 ENY2 transcription and export complex 2 subunit \n", - "37445 ribosomal protein L36a pseudogene 2 \n", - "37446 ALG1 like 13, pseudogene \n", - "37447 lipocalin 6 \n", - "37448 killer cell immunoglobulin like receptor, two ... 
\n", - "37449 putative killer cell immunoglobulin-like recep... \n", - "37450 golgin A6 family like 3, pseudogene \n", - "37451 uncharacterized LOC285626 \n", + "35848 protein O-linked mannose N-acetylglucosaminylt... \n", + "35849 NaN \n", + "35850 pleckstrin homology and RhoGEF domain containi... \n", + "35851 dynein cytoplasmic 2 heavy chain 1 \n", + "35852 troponin C2, fast skeletal type \n", + "35853 PTPRG antisense RNA 1 \n", + "35854 epithelial stromal interaction 1 \n", + "35855 golgin A6 family like 3, pseudogene \n", + "35856 uncharacterized LOC285626 \n", + "35857 killer cell immunoglobulin like receptor, two ... \n", "\n", - " summary symbol \\\n", - "37442 Enables RNA binding activity. Involved in posi... WDR43 \n", - "37443 This gene encodes an enzyme that converts lyso... AGPAT1 \n", - "37444 Enables nuclear receptor coactivator activity.... ENY2 \n", - "37445 NaN RPL36AP2 \n", - "37446 NaN ALG1L13P \n", - "37447 Predicted to enable small molecule binding act... LCN6 \n", - "37448 Killer cell immunoglobulin-like receptors (KIR... KIR2DL1 \n", - "37449 NaN LOC128966722 \n", - "37450 NaN GOLGA6L3P \n", - "37451 NaN LOC285626 \n", + " summary symbol \\\n", + "35848 This gene encodes a type II transmembrane prot... POMGNT1 \n", + "35849 NaN NaN \n", + "35850 Predicted to enable guanyl-nucleotide exchange... PLEKHG3 \n", + "35851 This gene encodes a large cytoplasmic dynein p... DYNC2H1 \n", + "35852 Troponin (Tn), a key protein complex in the re... TNNC2 \n", + "35853 NaN PTPRG-AS1 \n", + "35854 The protein encoded by this gene has been show... EPSTI1 \n", + "35855 NaN GOLGA6L3P \n", + "35856 NaN LOC285626 \n", + "35857 Killer cell immunoglobulin-like receptors (KIR... 
KIR2DL1 \n", "\n", " type_of_gene notfound \n", - "37442 protein-coding NaN \n", - "37443 protein-coding NaN \n", - "37444 protein-coding NaN \n", - "37445 NaN NaN \n", - "37446 NaN NaN \n", - "37447 protein-coding NaN \n", - "37448 protein-coding NaN \n", - "37449 protein-coding NaN \n", - "37450 pseudo NaN \n", - "37451 ncRNA NaN " + "35848 protein-coding NaN \n", + "35849 NaN NaN \n", + "35850 protein-coding NaN \n", + "35851 protein-coding NaN \n", + "35852 protein-coding NaN \n", + "35853 ncRNA NaN \n", + "35854 protein-coding NaN \n", + "35855 pseudo NaN \n", + "35856 ncRNA NaN \n", + "35857 protein-coding NaN " ] }, - "execution_count": 12, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "non_dupes = set(gene_table_merged.index) - set(all_duplicated.index)\n", - "keep_df = gene_table_merged.loc[list(non_dupes)].copy(deep=True)\n", + "keep_df = gene_table_merged.drop(all_duplicated.index)\n", "\n", "# For each duplicated Ensembl ID, collapse to 1 row and append that row to keep_df\n", "for ens_id in set(all_duplicated[\"ensembl_gene_id\"]):\n", @@ -1383,7 +1218,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "4a1bbdee", "metadata": { "scrolled": true @@ -1395,27 +1230,26 @@ "text": [ " name date url version\n", "1 Ensembl GRCh37 Feb 2014 https://grch37.ensembl.org GRCh37\n", - "2 Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org 111\n", - "3 Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org 110\n", - "4 Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org 109\n", - "5 Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org 108\n", - "6 Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org 107\n", - "7 Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org 106\n", - "8 Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org 105\n", - "9 Ensembl 104 May 2021 https://may2021.archive.ensembl.org 104\n", - "10 Ensembl 103 Feb 2021 
https://feb2021.archive.ensembl.org 103\n", - "11 Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org 102\n", - "12 Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org 101\n", - "13 Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org 100\n", - "14 Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org 99\n", - "15 Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org 98\n", - "16 Ensembl 97 Jul 2019 https://jul2019.archive.ensembl.org 97\n", - "17 Ensembl 96 Apr 2019 https://apr2019.archive.ensembl.org 96\n", - "18 Ensembl 95 Jan 2019 https://jan2019.archive.ensembl.org 95\n", - "19 Ensembl 80 May 2015 https://may2015.archive.ensembl.org 80\n", - "20 Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org 77\n", - "21 Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org 75\n", - "22 Ensembl 54 May 2009 https://may2009.archive.ensembl.org 54\n", + "2 Ensembl 113 Oct 2024 https://oct2024.archive.ensembl.org 113\n", + "3 Ensembl 112 May 2024 https://may2024.archive.ensembl.org 112\n", + "4 Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org 111\n", + "5 Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org 110\n", + "6 Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org 109\n", + "7 Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org 108\n", + "8 Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org 107\n", + "9 Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org 106\n", + "10 Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org 105\n", + "11 Ensembl 104 May 2021 https://may2021.archive.ensembl.org 104\n", + "12 Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org 103\n", + "13 Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org 102\n", + "14 Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org 101\n", + "15 Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org 100\n", + "16 Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org 99\n", + "17 Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org 
98\n", + "18 Ensembl 80 May 2015 https://may2015.archive.ensembl.org 80\n", + "19 Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org 77\n", + "20 Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org 75\n", + "21 Ensembl 54 May 2009 https://may2009.archive.ensembl.org 54\n", " current_release\n", "1 \n", "2 *\n", @@ -1438,7 +1272,6 @@ "19 \n", "20 \n", "21 \n", - "22 \n", "\n" ] } @@ -1462,7 +1295,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "9a747309", "metadata": { "scrolled": true @@ -1472,7 +1305,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "37452\n", "Querying genes 1 - 1000\n", "Querying genes 1001 - 2000\n", "Querying genes 2001 - 3000\n", @@ -1508,10 +1340,7 @@ "Querying genes 32001 - 33000\n", "Querying genes 33001 - 34000\n", "Querying genes 34001 - 35000\n", - "Querying genes 35001 - 36000\n", - "Querying genes 36001 - 37000\n", - "Querying genes 37001 - 37452\n", - "37452\n" + "Querying genes 35001 - 35858\n" ] }, { @@ -1535,156 +1364,114 @@ " \n", " \n", " \n", - " is_current\n", " assembly\n", - " id\n", - " version\n", - " type\n", " peptide\n", - " latest\n", " possible_replacement\n", " release\n", + " latest\n", + " type\n", + " id\n", + " version\n", + " is_current\n", " \n", " \n", " \n", " \n", - " 37447\n", - " 1\n", + " 35853\n", " GRCh38\n", - " ENSG00000267206\n", - " 6\n", - " Gene\n", " None\n", - " ENSG00000267206.6\n", " []\n", - " 111\n", + " 113\n", + " ENSG00000241472.9\n", + " Gene\n", + " ENSG00000241472\n", + " 9\n", + " 1\n", " \n", " \n", - " 37448\n", - " 1\n", + " 35854\n", " GRCh38\n", - " ENSG00000276387\n", - " 4\n", - " Gene\n", " None\n", - " ENSG00000276387.4\n", " []\n", - " 111\n", + " 113\n", + " ENSG00000133106.15\n", + " Gene\n", + " ENSG00000133106\n", + " 15\n", + " 1\n", " \n", " \n", - " 37449\n", - " 1\n", + " 35855\n", " GRCh38\n", - " ENSG00000276518\n", - " 1\n", - " Gene\n", " None\n", - " ENSG00000276518.1\n", " []\n", - " 111\n", + " 113\n", 
+ " ENSG00000230373.9\n", + " Gene\n", + " ENSG00000230373\n", + " 9\n", + " 1\n", " \n", " \n", - " 37450\n", - " 1\n", + " 35856\n", " GRCh38\n", - " ENSG00000230373\n", - " 9\n", - " Gene\n", " None\n", - " ENSG00000230373.9\n", " []\n", - " 111\n", + " 113\n", + " ENSG00000249738.11\n", + " Gene\n", + " ENSG00000249738\n", + " 11\n", + " 1\n", " \n", " \n", - " 37451\n", - " 1\n", + " 35857\n", " GRCh38\n", - " ENSG00000249738\n", - " 10\n", - " Gene\n", " None\n", - " ENSG00000249738.10\n", " []\n", - " 111\n", + " 113\n", + " ENSG00000276387.4\n", + " Gene\n", + " ENSG00000276387\n", + " 4\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " is_current assembly id version type peptide \\\n", - "37447 1 GRCh38 ENSG00000267206 6 Gene None \n", - "37448 1 GRCh38 ENSG00000276387 4 Gene None \n", - "37449 1 GRCh38 ENSG00000276518 1 Gene None \n", - "37450 1 GRCh38 ENSG00000230373 9 Gene None \n", - "37451 1 GRCh38 ENSG00000249738 10 Gene None \n", + " assembly peptide possible_replacement release latest type \\\n", + "35853 GRCh38 None [] 113 ENSG00000241472.9 Gene \n", + "35854 GRCh38 None [] 113 ENSG00000133106.15 Gene \n", + "35855 GRCh38 None [] 113 ENSG00000230373.9 Gene \n", + "35856 GRCh38 None [] 113 ENSG00000249738.11 Gene \n", + "35857 GRCh38 None [] 113 ENSG00000276387.4 Gene \n", "\n", - " latest possible_replacement release \n", - "37447 ENSG00000267206.6 [] 111 \n", - "37448 ENSG00000276387.4 [] 111 \n", - "37449 ENSG00000276518.1 [] 111 \n", - "37450 ENSG00000230373.9 [] 111 \n", - "37451 ENSG00000249738.10 [] 111 " + " id version is_current \n", + "35853 ENSG00000241472 9 1 \n", + "35854 ENSG00000133106 15 1 \n", + "35855 ENSG00000230373 9 1 \n", + "35856 ENSG00000249738 11 1 \n", + "35857 ENSG00000276387 4 1 " ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "url = \"https://rest.ensembl.org/archive/id\"\n", - "headers = {\"Content-Type\": \"application/json\", 
\"Accept\": \"application/json\"}\n", - "\n", - "ids = gene_table_merged[\"ensembl_gene_id\"].tolist()\n", - "print(len(ids))\n", - "\n", - "# We can only query 1000 genes at a time\n", - "batch_ind = range(0, len(ids), 1000)\n", - "results = []\n", - "\n", - "for B in batch_ind:\n", - " end = min(len(ids), B + 1000)\n", - " print(\"Querying genes \" + str(B + 1) + \" - \" + str(end))\n", - "\n", - " request_data = '{ \"id\" : ' + str(ids[B:end]) + \" }\"\n", - " request_data = request_data.replace(\"'\", '\"')\n", - "\n", - " ok = False\n", - " tries = 0\n", - "\n", - " while tries < 5 and not ok:\n", - " try:\n", - " res = requests.post(url, headers=headers, data=request_data)\n", - " ok = res.ok\n", - " except:\n", - " ok = False\n", - "\n", - " tries = tries + 1\n", - "\n", - " if not ok:\n", - " # res.raise_for_status()\n", - " print(\n", - " \"Error retrieving Ensembl versions for genes \"\n", - " + str(B + 1)\n", - " + \" - \"\n", - " + str(end)\n", - " + \". Trying again...\"\n", - " )\n", - " else:\n", - " results = results + res.json()\n", - " break\n", - "\n", - "print(len(results))\n", - "\n", - "versions = pd.json_normalize(results)\n", + "versions = preprocessing_utils.query_ensembl_version_api(\n", + " ensembl_ids=gene_table_merged[\"ensembl_gene_id\"].tolist()\n", + ")\n", "\n", "versions.tail()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "5c108238", "metadata": {}, "outputs": [ @@ -1692,35 +1479,37 @@ "data": { "text/plain": [ "release\n", - "100 22\n", + "100 21\n", "101 8\n", "102 16\n", - "103 15\n", - "104 19\n", - "105 9\n", + "103 12\n", + "104 17\n", + "105 10\n", "106 35\n", - "107 10\n", + "107 12\n", "108 4\n", "109 4\n", "110 11\n", - "111 36286\n", + "111 52\n", + "112 354\n", + "113 34303\n", "80 21\n", "81 2\n", "82 10\n", "84 673\n", "87 61\n", "89 20\n", - "91 75\n", - "93 53\n", + "91 67\n", + "93 50\n", "95 33\n", "96 31\n", - "97 18\n", + "97 17\n", "98 9\n", - "99 7\n", + "99 5\n", 
"dtype: int64" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1731,7 +1520,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "bf5aecb1", "metadata": { "scrolled": true @@ -1741,8 +1530,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "37452\n", - "37452\n", + "35858\n", + "35858\n", "True\n" ] } @@ -1759,7 +1548,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "7fc8bbcd", "metadata": {}, "outputs": [ @@ -1769,7 +1558,7 @@ "True" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1791,7 +1580,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "0d5b5652", "metadata": { "scrolled": true @@ -1815,7 +1604,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "337b2890", "metadata": {}, "outputs": [ @@ -1823,28 +1612,27 @@ "data": { "text/plain": [ "closest_release\n", - "80 915\n", - "95 33\n", - "96 31\n", - "97 18\n", + "80 985\n", "98 9\n", - "99 7\n", - "100 22\n", + "99 5\n", + "100 21\n", "101 8\n", "102 16\n", - "103 15\n", - "104 19\n", - "105 9\n", + "103 12\n", + "104 17\n", + "105 10\n", "106 35\n", - "107 10\n", + "107 12\n", "108 4\n", "109 4\n", "110 11\n", - "111 36286\n", + "111 52\n", + "112 354\n", + "113 34303\n", "dtype: int64" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1865,7 +1653,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "343e5006", "metadata": { "scrolled": false @@ -1892,15 +1680,15 @@ " \n", " \n", " \n", - " is_current\n", " assembly\n", - " id\n", - " version\n", - " type\n", " peptide\n", - " latest\n", " possible_replacement\n", " release\n", + " latest\n", + " type\n", + " id\n", + " version\n", + " is_current\n", " closest_release\n", " permalink\n", " \n", @@ 
-1908,102 +1696,102 @@ " \n", " \n", " 0\n", - " 1\n", " GRCh38\n", - " ENSG00000164972\n", - " 14\n", - " Gene\n", " None\n", - " ENSG00000164972.14\n", " []\n", - " 111\n", - " 111\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " 113\n", + " ENSG00000151650.8\n", + " Gene\n", + " ENSG00000151650\n", + " 8\n", + " 1\n", + " 113\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 1\n", - " 1\n", " GRCh38\n", - " ENSG00000169105\n", - " 8\n", - " Gene\n", " None\n", - " ENSG00000169105.8\n", " []\n", - " 111\n", - " 111\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " 113\n", + " ENSG00000168268.11\n", + " Gene\n", + " ENSG00000168268\n", + " 11\n", + " 1\n", + " 113\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 2\n", - " 1\n", " GRCh38\n", - " ENSG00000255136\n", - " 3\n", - " Gene\n", " None\n", - " ENSG00000255136.3\n", " []\n", - " 111\n", - " 111\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " 113\n", + " ENSG00000186310.10\n", + " Gene\n", + " ENSG00000186310\n", + " 10\n", + " 1\n", + " 113\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 3\n", - " 1\n", " GRCh38\n", - " ENSG00000105499\n", - " 14\n", - " Gene\n", " None\n", - " ENSG00000105499.14\n", " []\n", - " 111\n", - " 111\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " 113\n", + " ENSG00000204616.11\n", + " Gene\n", + " ENSG00000204616\n", + " 11\n", + " 1\n", + " 113\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 4\n", - " 1\n", " GRCh38\n", - " ENSG00000104611\n", - " 12\n", - " Gene\n", " None\n", - " ENSG00000104611.12\n", " []\n", - " 111\n", - " 111\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " 113\n", + " ENSG00000158467.17\n", + " Gene\n", + " ENSG00000158467\n", + " 17\n", + " 1\n", + " 113\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", "\n", "" ], 
"text/plain": [ - " is_current assembly id version type peptide \\\n", - "0 1 GRCh38 ENSG00000164972 14 Gene None \n", - "1 1 GRCh38 ENSG00000169105 8 Gene None \n", - "2 1 GRCh38 ENSG00000255136 3 Gene None \n", - "3 1 GRCh38 ENSG00000105499 14 Gene None \n", - "4 1 GRCh38 ENSG00000104611 12 Gene None \n", + " assembly peptide possible_replacement release latest type \\\n", + "0 GRCh38 None [] 113 ENSG00000151650.8 Gene \n", + "1 GRCh38 None [] 113 ENSG00000168268.11 Gene \n", + "2 GRCh38 None [] 113 ENSG00000186310.10 Gene \n", + "3 GRCh38 None [] 113 ENSG00000204616.11 Gene \n", + "4 GRCh38 None [] 113 ENSG00000158467.17 Gene \n", "\n", - " latest possible_replacement release closest_release \\\n", - "0 ENSG00000164972.14 [] 111 111 \n", - "1 ENSG00000169105.8 [] 111 111 \n", - "2 ENSG00000255136.3 [] 111 111 \n", - "3 ENSG00000105499.14 [] 111 111 \n", - "4 ENSG00000104611.12 [] 111 111 \n", + " id version is_current closest_release \\\n", + "0 ENSG00000151650 8 1 113 \n", + "1 ENSG00000168268 11 1 113 \n", + "2 ENSG00000186310 10 1 113 \n", + "3 ENSG00000204616 11 1 113 \n", + "4 ENSG00000158467 17 1 113 \n", "\n", " permalink \n", - "0 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://jan2024.archive.ensembl.org/Homo_sapie... " + "0 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "1 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "2 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "3 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "4 https://oct2024.archive.ensembl.org/Homo_sapie... 
" ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2024,7 +1812,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "4b01719d", "metadata": {}, "outputs": [ @@ -2049,87 +1837,87 @@ " \n", " \n", " \n", - " is_current\n", " assembly\n", - " id\n", - " version\n", - " type\n", " peptide\n", - " latest\n", " possible_replacement\n", " release\n", + " latest\n", + " type\n", + " id\n", + " version\n", + " is_current\n", " closest_release\n", " permalink\n", " \n", " \n", " \n", " \n", - " 51\n", - " \n", + " 67\n", " GRCh38\n", - " ENSG00000266701\n", - " 1\n", - " Gene\n", " None\n", - " ENSG00000266701.1\n", " []\n", " 84\n", + " ENSG00000265108.1\n", + " Gene\n", + " ENSG00000265108\n", + " 1\n", + " \n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 99\n", - " \n", + " 68\n", " GRCh38\n", - " ENSG00000268225\n", - " 2\n", - " Gene\n", " None\n", - " ENSG00000268225.2\n", " []\n", - " 98\n", - " 98\n", - " https://sep2019.archive.ensembl.org/Homo_sapie...\n", + " 80\n", + " ENSG00000280803.1\n", + " Gene\n", + " ENSG00000280803\n", + " 1\n", + " \n", + " 80\n", + " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 119\n", - " \n", + " 111\n", " GRCh38\n", - " ENSG00000281018\n", - " 1\n", - " Gene\n", " None\n", - " ENSG00000281018.1\n", " []\n", " 84\n", + " ENSG00000281672.1\n", + " Gene\n", + " ENSG00000281672\n", + " 1\n", + " \n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 120\n", - " \n", + " 135\n", " GRCh38\n", - " ENSG00000216011\n", - " 2\n", - " Gene\n", " None\n", - " ENSG00000216011.2\n", " []\n", - " 84\n", + " 87\n", + " ENSG00000279857.1\n", + " Gene\n", + " ENSG00000279857\n", + " 1\n", + " \n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 135\n", - " \n", + " 141\n", " GRCh38\n", - " ENSG00000264103\n", - " 1\n", - " 
Gene\n", " None\n", - " ENSG00000264103.1\n", " []\n", " 84\n", + " ENSG00000274483.1\n", + " Gene\n", + " ENSG00000274483\n", + " 1\n", + " \n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", @@ -2138,29 +1926,29 @@ "" ], "text/plain": [ - " is_current assembly id version type peptide \\\n", - "51 GRCh38 ENSG00000266701 1 Gene None \n", - "99 GRCh38 ENSG00000268225 2 Gene None \n", - "119 GRCh38 ENSG00000281018 1 Gene None \n", - "120 GRCh38 ENSG00000216011 2 Gene None \n", - "135 GRCh38 ENSG00000264103 1 Gene None \n", + " assembly peptide possible_replacement release latest type \\\n", + "67 GRCh38 None [] 84 ENSG00000265108.1 Gene \n", + "68 GRCh38 None [] 80 ENSG00000280803.1 Gene \n", + "111 GRCh38 None [] 84 ENSG00000281672.1 Gene \n", + "135 GRCh38 None [] 87 ENSG00000279857.1 Gene \n", + "141 GRCh38 None [] 84 ENSG00000274483.1 Gene \n", "\n", - " latest possible_replacement release closest_release \\\n", - "51 ENSG00000266701.1 [] 84 80 \n", - "99 ENSG00000268225.2 [] 98 98 \n", - "119 ENSG00000281018.1 [] 84 80 \n", - "120 ENSG00000216011.2 [] 84 80 \n", - "135 ENSG00000264103.1 [] 84 80 \n", + " id version is_current closest_release \\\n", + "67 ENSG00000265108 1 80 \n", + "68 ENSG00000280803 1 80 \n", + "111 ENSG00000281672 1 80 \n", + "135 ENSG00000279857 1 80 \n", + "141 ENSG00000274483 1 80 \n", "\n", " permalink \n", - "51 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "99 https://sep2019.archive.ensembl.org/Homo_sapie... \n", - "119 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "120 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "135 https://may2015.archive.ensembl.org/Homo_sapie... " + "67 https://may2015.archive.ensembl.org/Homo_sapie... \n", + "68 https://may2015.archive.ensembl.org/Homo_sapie... \n", + "111 https://may2015.archive.ensembl.org/Homo_sapie... \n", + "135 https://may2015.archive.ensembl.org/Homo_sapie... \n", + "141 https://may2015.archive.ensembl.org/Homo_sapie... 
" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2171,7 +1959,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "id": "c4128cc9", "metadata": {}, "outputs": [ @@ -2179,8 +1967,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000164972\n", - "https://jul2023.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000279049\n" + "https://oct2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000151650\n", + "https://oct2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000142192\n" ] } ], @@ -2191,7 +1979,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "id": "73791e6c", "metadata": {}, "outputs": [ @@ -2201,7 +1989,7 @@ "True" ] }, - "execution_count": 23, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2222,7 +2010,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "id": "f3edfd2f", "metadata": {}, "outputs": [ @@ -2230,7 +2018,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(37452, 12)\n" + "(35858, 12)\n" ] }, { @@ -2271,128 +2059,121 @@ " \n", " \n", " 0\n", - " ENSG00000164972\n", - " 84688\n", - " 2.0\n", - " [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]\n", - " sperm microtubule inner protein 6\n", - " This gene encodes a nuclear- or perinuclear-lo...\n", - " SPMIP6\n", + " ENSG00000151650\n", + " 27287\n", + " 1.0\n", + " [NA88A, HPX42B, VENTX2]\n", + " VENT homeobox\n", + " This gene encodes a member of the Vent family ...\n", + " VENTX\n", " protein-coding\n", " NaN\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 1\n", - " ENSG00000169105\n", - " 113189\n", - " 2.0\n", - " [ATCS, EDSMC1, HNK1ST, D4ST1]\n", - " 
carbohydrate sulfotransferase 14\n", - " This gene encodes a member of the HNK-1 family...\n", - " CHST14\n", + " ENSG00000168268\n", + " 64943\n", + " 1.0\n", + " []\n", + " 5'-nucleotidase domain containing 2\n", + " Predicted to enable 5'-nucleotidase activity. ...\n", + " NT5DC2\n", " protein-coding\n", " NaN\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 2\n", - " ENSG00000255136\n", - " ENSG00000255136\n", + " ENSG00000186310\n", + " 4675\n", " 1.0\n", - " []\n", - " TPBGL antisense RNA 1\n", - " NaN\n", - " TPBGL-AS1\n", - " NaN\n", + " [MB20, NPL3]\n", + " nucleosome assembly protein 1 like 3\n", + " This gene is intronless and encodes a member o...\n", + " NAP1L3\n", + " protein-coding\n", " NaN\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 3\n", - " ENSG00000105499\n", - " 8605\n", + " ENSG00000204616\n", + " 11074\n", " 1.0\n", - " [CPLA2-gamma]\n", - " phospholipase A2 group IVC\n", - " This gene encodes a protein which is a member ...\n", - " PLA2G4C\n", + " [C6orf13, RNF, HCGI, HCG1]\n", + " tripartite motif containing 31\n", + " This gene encodes a protein that functions as ...\n", + " TRIM31\n", " protein-coding\n", " NaN\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 4\n", - " ENSG00000104611\n", - " 63898\n", + " ENSG00000158467\n", + " 23382\n", " 1.0\n", - " [PPP1R38, SH2A]\n", - " SH2 domain containing 4A\n", - " Enables phosphatase binding activity. 
Located ...\n", - " SH2D4A\n", + " [IRBIT2, ADOHCYASE3]\n", + " adenosylhomocysteinase like 2\n", + " The protein encoded by this gene acts as a hom...\n", + " AHCYL2\n", " protein-coding\n", " NaN\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ensembl_gene_id _id _version \\\n", - "0 ENSG00000164972 84688 2.0 \n", - "1 ENSG00000169105 113189 2.0 \n", - "2 ENSG00000255136 ENSG00000255136 1.0 \n", - "3 ENSG00000105499 8605 1.0 \n", - "4 ENSG00000104611 63898 1.0 \n", - "\n", - " alias \\\n", - "0 [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22] \n", - "1 [ATCS, EDSMC1, HNK1ST, D4ST1] \n", - "2 [] \n", - "3 [CPLA2-gamma] \n", - "4 [PPP1R38, SH2A] \n", + " ensembl_gene_id _id _version alias \\\n", + "0 ENSG00000151650 27287 1.0 [NA88A, HPX42B, VENTX2] \n", + "1 ENSG00000168268 64943 1.0 [] \n", + "2 ENSG00000186310 4675 1.0 [MB20, NPL3] \n", + "3 ENSG00000204616 11074 1.0 [C6orf13, RNF, HCGI, HCG1] \n", + "4 ENSG00000158467 23382 1.0 [IRBIT2, ADOHCYASE3] \n", "\n", - " name \\\n", - "0 sperm microtubule inner protein 6 \n", - "1 carbohydrate sulfotransferase 14 \n", - "2 TPBGL antisense RNA 1 \n", - "3 phospholipase A2 group IVC \n", - "4 SH2 domain containing 4A \n", + " name \\\n", + "0 VENT homeobox \n", + "1 5'-nucleotidase domain containing 2 \n", + "2 nucleosome assembly protein 1 like 3 \n", + "3 tripartite motif containing 31 \n", + "4 adenosylhomocysteinase like 2 \n", "\n", - " summary symbol \\\n", - "0 This gene encodes a nuclear- or perinuclear-lo... SPMIP6 \n", - "1 This gene encodes a member of the HNK-1 family... CHST14 \n", - "2 NaN TPBGL-AS1 \n", - "3 This gene encodes a protein which is a member ... PLA2G4C \n", - "4 Enables phosphatase binding activity. Located ... SH2D4A \n", + " summary symbol type_of_gene \\\n", + "0 This gene encodes a member of the Vent family ... 
VENTX protein-coding \n", + "1 Predicted to enable 5'-nucleotidase activity. ... NT5DC2 protein-coding \n", + "2 This gene is intronless and encodes a member o... NAP1L3 protein-coding \n", + "3 This gene encodes a protein that functions as ... TRIM31 protein-coding \n", + "4 The protein encoded by this gene acts as a hom... AHCYL2 protein-coding \n", "\n", - " type_of_gene notfound ensembl_release possible_replacement \\\n", - "0 protein-coding NaN 111 [] \n", - "1 protein-coding NaN 111 [] \n", - "2 NaN NaN 111 [] \n", - "3 protein-coding NaN 111 [] \n", - "4 protein-coding NaN 111 [] \n", + " notfound ensembl_release possible_replacement \\\n", + "0 NaN 113 [] \n", + "1 NaN 113 [] \n", + "2 NaN 113 [] \n", + "3 NaN 113 [] \n", + "4 NaN 113 [] \n", "\n", " permalink \n", - "0 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://jan2024.archive.ensembl.org/Homo_sapie... " + "0 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "1 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "2 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "3 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "4 https://oct2024.archive.ensembl.org/Homo_sapie... 
" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -2430,7 +2211,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "id": "d0c07b7a", "metadata": {}, "outputs": [ @@ -2469,63 +2250,63 @@ " \n", " \n", " 0\n", - " ENSG00000164972\n", - " sperm microtubule inner protein 6\n", - " [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22]\n", - " This gene encodes a nuclear- or perinuclear-lo...\n", - " SPMIP6\n", + " ENSG00000151650\n", + " VENT homeobox\n", + " [NA88A, HPX42B, VENTX2]\n", + " This gene encodes a member of the Vent family ...\n", + " VENTX\n", " protein-coding\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 1\n", - " ENSG00000169105\n", - " carbohydrate sulfotransferase 14\n", - " [ATCS, EDSMC1, HNK1ST, D4ST1]\n", - " This gene encodes a member of the HNK-1 family...\n", - " CHST14\n", + " ENSG00000168268\n", + " 5'-nucleotidase domain containing 2\n", + " []\n", + " Predicted to enable 5'-nucleotidase activity. 
...\n", + " NT5DC2\n", " protein-coding\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 2\n", - " ENSG00000255136\n", - " TPBGL antisense RNA 1\n", - " []\n", - " NaN\n", - " TPBGL-AS1\n", - " NaN\n", - " 111\n", + " ENSG00000186310\n", + " nucleosome assembly protein 1 like 3\n", + " [MB20, NPL3]\n", + " This gene is intronless and encodes a member o...\n", + " NAP1L3\n", + " protein-coding\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 3\n", - " ENSG00000105499\n", - " phospholipase A2 group IVC\n", - " [CPLA2-gamma]\n", - " This gene encodes a protein which is a member ...\n", - " PLA2G4C\n", + " ENSG00000204616\n", + " tripartite motif containing 31\n", + " [C6orf13, RNF, HCGI, HCG1]\n", + " This gene encodes a protein that functions as ...\n", + " TRIM31\n", " protein-coding\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 4\n", - " ENSG00000104611\n", - " SH2 domain containing 4A\n", - " [PPP1R38, SH2A]\n", - " Enables phosphatase binding activity. 
Located ...\n", - " SH2D4A\n", + " ENSG00000158467\n", + " adenosylhomocysteinase like 2\n", + " [IRBIT2, ADOHCYASE3]\n", + " The protein encoded by this gene acts as a hom...\n", + " AHCYL2\n", " protein-coding\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " ...\n", @@ -2540,149 +2321,148 @@ " ...\n", " \n", " \n", - " 37447\n", - " ENSG00000267206\n", - " lipocalin 6\n", - " [hLcn5, LCN5, UNQ643]\n", - " Predicted to enable small molecule binding act...\n", - " LCN6\n", - " protein-coding\n", - " 111\n", + " 35853\n", + " ENSG00000241472\n", + " PTPRG antisense RNA 1\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", - " \n", - " \n", - " 37448\n", - " ENSG00000276387\n", - " killer cell immunoglobulin like receptor, two ...\n", - " [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C...\n", - " Killer cell immunoglobulin-like receptors (KIR...\n", - " KIR2DL1\n", - " protein-coding\n", - " 111\n", + " NaN\n", + " PTPRG-AS1\n", + " ncRNA\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 37449\n", - " ENSG00000276518\n", - " putative killer cell immunoglobulin-like recep...\n", - " [LOC128966730, LOC128966732, LOC128966731, LOC...\n", - " NaN\n", - " LOC128966722\n", + " 35854\n", + " ENSG00000133106\n", + " epithelial stromal interaction 1\n", + " [BRESI1]\n", + " The protein encoded by this gene has been show...\n", + " EPSTI1\n", " protein-coding\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 37450\n", + " 35855\n", " ENSG00000230373\n", " golgin A6 family like 3, pseudogene\n", - " [GOLGA6L21P, GOLGA6L17P, GOLGA6L3]\n", + " [GOLGA6L3, GOLGA6L21P, GOLGA6L17P]\n", " NaN\n", " GOLGA6L3P\n", " 
pseudo\n", - " 111\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 37451\n", + " 35856\n", " ENSG00000249738\n", " uncharacterized LOC285626\n", " [LOC105377683]\n", " NaN\n", " LOC285626\n", " ncRNA\n", - " 111\n", + " 113\n", + " []\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", + " \n", + " \n", + " 35857\n", + " ENSG00000276387\n", + " killer cell immunoglobulin like receptor, two ...\n", + " [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...\n", + " Killer cell immunoglobulin-like receptors (KIR...\n", + " KIR2DL1\n", + " protein-coding\n", + " 113\n", " []\n", - " https://jan2024.archive.ensembl.org/Homo_sapie...\n", + " https://oct2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", "\n", - "

37452 rows × 9 columns

\n", + "

35858 rows × 9 columns

\n", "" ], "text/plain": [ " ensembl_gene_id name \\\n", - "0 ENSG00000164972 sperm microtubule inner protein 6 \n", - "1 ENSG00000169105 carbohydrate sulfotransferase 14 \n", - "2 ENSG00000255136 TPBGL antisense RNA 1 \n", - "3 ENSG00000105499 phospholipase A2 group IVC \n", - "4 ENSG00000104611 SH2 domain containing 4A \n", + "0 ENSG00000151650 VENT homeobox \n", + "1 ENSG00000168268 5'-nucleotidase domain containing 2 \n", + "2 ENSG00000186310 nucleosome assembly protein 1 like 3 \n", + "3 ENSG00000204616 tripartite motif containing 31 \n", + "4 ENSG00000158467 adenosylhomocysteinase like 2 \n", "... ... ... \n", - "37447 ENSG00000267206 lipocalin 6 \n", - "37448 ENSG00000276387 killer cell immunoglobulin like receptor, two ... \n", - "37449 ENSG00000276518 putative killer cell immunoglobulin-like recep... \n", - "37450 ENSG00000230373 golgin A6 family like 3, pseudogene \n", - "37451 ENSG00000249738 uncharacterized LOC285626 \n", + "35853 ENSG00000241472 PTPRG antisense RNA 1 \n", + "35854 ENSG00000133106 epithelial stromal interaction 1 \n", + "35855 ENSG00000230373 golgin A6 family like 3, pseudogene \n", + "35856 ENSG00000249738 uncharacterized LOC285626 \n", + "35857 ENSG00000276387 killer cell immunoglobulin like receptor, two ... \n", "\n", " alias \\\n", - "0 [SMRP1, C9orf24, CBE1, bA573M23.4, NYD-SP22] \n", - "1 [ATCS, EDSMC1, HNK1ST, D4ST1] \n", - "2 [] \n", - "3 [CPLA2-gamma] \n", - "4 [PPP1R38, SH2A] \n", + "0 [NA88A, HPX42B, VENTX2] \n", + "1 [] \n", + "2 [MB20, NPL3] \n", + "3 [C6orf13, RNF, HCGI, HCG1] \n", + "4 [IRBIT2, ADOHCYASE3] \n", "... ... \n", - "37447 [hLcn5, LCN5, UNQ643] \n", - "37448 [NKAT1, LOC124900571, KIR2DL3, NKAT, KIR221, C... \n", - "37449 [LOC128966730, LOC128966732, LOC128966731, LOC... 
\n", - "37450 [GOLGA6L21P, GOLGA6L17P, GOLGA6L3] \n", - "37451 [LOC105377683] \n", + "35853 [] \n", + "35854 [BRESI1] \n", + "35855 [GOLGA6L3, GOLGA6L21P, GOLGA6L17P] \n", + "35856 [LOC105377683] \n", + "35857 [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64... \n", "\n", - " summary symbol \\\n", - "0 This gene encodes a nuclear- or perinuclear-lo... SPMIP6 \n", - "1 This gene encodes a member of the HNK-1 family... CHST14 \n", - "2 NaN TPBGL-AS1 \n", - "3 This gene encodes a protein which is a member ... PLA2G4C \n", - "4 Enables phosphatase binding activity. Located ... SH2D4A \n", - "... ... ... \n", - "37447 Predicted to enable small molecule binding act... LCN6 \n", - "37448 Killer cell immunoglobulin-like receptors (KIR... KIR2DL1 \n", - "37449 NaN LOC128966722 \n", - "37450 NaN GOLGA6L3P \n", - "37451 NaN LOC285626 \n", + " summary symbol \\\n", + "0 This gene encodes a member of the Vent family ... VENTX \n", + "1 Predicted to enable 5'-nucleotidase activity. ... NT5DC2 \n", + "2 This gene is intronless and encodes a member o... NAP1L3 \n", + "3 This gene encodes a protein that functions as ... TRIM31 \n", + "4 The protein encoded by this gene acts as a hom... AHCYL2 \n", + "... ... ... \n", + "35853 NaN PTPRG-AS1 \n", + "35854 The protein encoded by this gene has been show... EPSTI1 \n", + "35855 NaN GOLGA6L3P \n", + "35856 NaN LOC285626 \n", + "35857 Killer cell immunoglobulin-like receptors (KIR... KIR2DL1 \n", "\n", " type_of_gene ensembl_release possible_replacement \\\n", - "0 protein-coding 111 [] \n", - "1 protein-coding 111 [] \n", - "2 NaN 111 [] \n", - "3 protein-coding 111 [] \n", - "4 protein-coding 111 [] \n", + "0 protein-coding 113 [] \n", + "1 protein-coding 113 [] \n", + "2 protein-coding 113 [] \n", + "3 protein-coding 113 [] \n", + "4 protein-coding 113 [] \n", "... ... ... ... 
\n", - "37447 protein-coding 111 [] \n", - "37448 protein-coding 111 [] \n", - "37449 protein-coding 111 [] \n", - "37450 pseudo 111 [] \n", - "37451 ncRNA 111 [] \n", + "35853 ncRNA 113 [] \n", + "35854 protein-coding 113 [] \n", + "35855 pseudo 113 [] \n", + "35856 ncRNA 113 [] \n", + "35857 protein-coding 113 [] \n", "\n", " permalink \n", - "0 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://jan2024.archive.ensembl.org/Homo_sapie... \n", + "0 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "1 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "2 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "3 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "4 https://oct2024.archive.ensembl.org/Homo_sapie... \n", "... ... \n", - "37447 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "37448 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "37449 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "37450 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "37451 https://jan2024.archive.ensembl.org/Homo_sapie... \n", + "35853 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "35854 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "35855 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "35856 https://oct2024.archive.ensembl.org/Homo_sapie... \n", + "35857 https://oct2024.archive.ensembl.org/Homo_sapie... 
\n", "\n", - "[37452 rows x 9 columns]" + "[35858 rows x 9 columns]" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "for row in gene_table_merged.loc[\n", - " gene_table_merged[\"possible_replacement\"].isnull(), \"possible_replacement\"\n", - "].index:\n", - " gene_table_merged.at[row, \"possible_replacement\"] = []\n", + "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n", + " \"possible_replacement\"\n", + "].apply(lambda cell: cell if cell is not np.NaN else [])\n", "\n", "gene_table_merged[\"possible_replacement\"] = gene_table_merged.apply(\n", " lambda row: (\n", @@ -2721,7 +2501,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "id": "f2287922", "metadata": {}, "outputs": [], @@ -2736,7 +2516,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "agora-data-tools-ywFp1Gf9", "language": "python", "name": "python3" }, diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py index fbc1a2dc..d5f0bc5f 100644 --- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py +++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py @@ -1,12 +1,27 @@ +""" +This file includes several helper functions that are called from one or more of the pre-processing +notebooks. This helps avoid code duplication and/or keeps the notebooks cleaner and more straightforward. 
+Current public-facing functions: + manual_query_biomart - queries Biomart with a GET request + query_ensembl_version_api - queries the Ensembl API for Ensembl ID version info + r_query_biomart - queries Biomart using rpy2 + filter_hasgs - removes human alternative sequence genes from a data frame + get_all_adt_ensembl_ids - gets the Ensembl IDs in all of the files ingested by ADT +""" + import pandas as pd +import numpy as np import requests import re +import synapseclient from io import StringIO -from typing import Union +from typing import Union, Dict, List, Set, Tuple +import agoradatatools.etl.utils as utils +import agoradatatools.etl.extract as extract def manual_query_biomart( - attributes: list[str], filters: dict[Union[list, set]] + attributes: List[str], filters: Dict[str, Union[List[str], Set[str]]] ) -> pd.DataFrame: """Performs a GET request to the Biomart web service and returns the response. There is no canonical Python library to query Biomart and no Python library at all to query on @@ -49,6 +64,62 @@ def manual_query_biomart( return result +def query_ensembl_version_api(ensembl_ids: List[str]) -> pd.DataFrame: + """ + Queries the Ensembl API via POST to get version information for each Ensembl ID. The API can only + process 1000 IDs at a time so the query is broken into batches of 1000. If a request fails, this + function will try again up to 5 times on that batch before quitting and raising an error. 
+ + Args: + ensembl_ids: a list of Ensembl IDs to query + + Returns: + a pandas data frame with Ensembl IDs, version, and release information + """ + url = "https://rest.ensembl.org/archive/id" + headers = {"Content-Type": "application/json", "Accept": "application/json"} + + # We can only query 1000 genes at a time + batch_ind = range(0, len(ensembl_ids), 1000) + results = [] + + for B in batch_ind: + end = min(len(ensembl_ids), B + 1000) + print("Querying genes " + str(B + 1) + " - " + str(end)) + + request_data = '{ "id" : ' + str(ensembl_ids[B:end]) + " }" + request_data = request_data.replace("'", '"') + + ok = False + tries = 0 + + while tries < 5 and not ok: + try: + res = requests.post(url, headers=headers, data=request_data) + ok = res.ok + except: + ok = False + + tries = tries + 1 + + if not ok and tries == 5: + res.raise_for_status() + elif not ok: + print( + "Error retrieving Ensembl versions for genes " + + str(B + 1) + + " - " + + str(end) + + ". Trying again..." + ) + else: + results = results + res.json() + break + + versions = pd.json_normalize(results) + return versions + + def filter_hasgs(df: pd.DataFrame, chromosome_name_column: str) -> pd.DataFrame: """Filters human alternative sequence genes (HASGs) from a data frame by using a regex to identify them for removal. Valid genes will either have a numerical chromosome name or have @@ -96,7 +167,7 @@ def r_query_biomart() -> pd.DataFrame: r.library("biomaRt") # Sometimes Biomart doesn't respond and the command needs to be sent again. Try up to 5 times. 
- for T in range(5): + for _ in range(5): try: mart = r.useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl") ensembl_ids = r.getBM( @@ -124,3 +195,135 @@ def r_query_biomart() -> pd.DataFrame: } ) return ensembl_ids_df + + +def get_all_adt_ensembl_ids( + config_filename: str, exclude_files: List[str] = [], token: str = None +) -> List[str]: + """ + Loops through an ADT config file, finds all data files that are ingested by ADT, and returns a + list containing all Ensembl IDs present in those files. Specific files can be excluded from the + list with the exclude_files argument. + + Args: + config_filename: full or relative file path to the ADT config.yaml file + exclude_files: list of file names to exclude when searching files for IDs. These names must + match what is in "name" field of the file specification in the config.yaml + file. Typical values are "gene_metadata" and "druggability". + token: a Synapse auth token, or None if the user has Synapse credentials saved. + + Returns: + a list of unique Ensembl IDs that exist in at least one data set ingested by ADT + """ + syn = utils._login_to_synapse(token=token) + config = utils._get_config(config_path=config_filename) + datasets = config["datasets"] + + # Get all unique files in the config since some files are listed multiple times by being + # included in multiple data sets. 
Also fetch all column rename values for standardizing Ensembl + # ID column names + unique_files = {} + column_renames = {} + + for dataset in datasets: + dataset_name = list(dataset.keys())[0] + + for file in dataset[dataset_name]["files"]: + # Make the Synapse ID the key so that "update" will only add a new item if the ID doesn't + # already exist + unique_files.update({file["id"]: file}) + + # Only some data sets have column rename values + if "column_rename" in dataset[dataset_name].keys(): + column_renames.update(dataset[dataset_name]["column_rename"]) + + # Print all the files we found + print("Found " + str(len(unique_files)) + " files:") + [print(x["name"] + ":\t" + x["id"]) for x in unique_files.values()] + print("") + + # Create a list of all Ensembl IDs in all files + file_ensembl_list = [] + + for entity in unique_files.values(): + # Ignore json files, which are post-processed and not what we're interested in. + # Also ignore any other files specified by 'exclude_files', which likely includes + # "gene_metadata" and "druggability". + if entity["format"] == "json" or entity["name"] in exclude_files: + continue + + file_ensembl_ids = _extract_ensembl_ids(syn, entity, column_renames) + file_ensembl_list = file_ensembl_list + file_ensembl_ids + + # Remove duplicate values + return list(set(file_ensembl_list)) + + +def _extract_ensembl_ids( + syn: synapseclient.Synapse, entity: Dict[str, str], column_renames: Dict[str, str] +) -> List[str]: + """ + Internal function used by get_all_adt_ensembl_ids to exctract a list of Ensembl IDs from a file. + The file is downloaded from Synapse and read in as a pandas data frame, column names are renamed + if necessary to ensure that most Ensembl ID columns are renamed to "ensembl_gene_id", and all + Ensembl IDs from relevant columns are put in a list. 
+ + Note that the "networks" data set contains two columns with Ensembl IDs (genea_ensembl_gene_id + and geneb_ensembl_gene_id) which are not renamed, so this function searches for columns named + with any of those two names or with "ensembl_gene_id" when finding Ensembl ID columns. + + Note that this function depends on the column_rename specifications in the config to accurately + convert all Ensembl ID-containing columns in all files except networks to "ensembl_gene_id", so + that we don't have to hard-code a list of all possible column names. This assumption is valid + for the current set of data files and will likely remain valid for future data, but a warning + is printed out if no matching column is found, just in case. + + Args: + syn: a syanpseclient object which has already been initialized and successfully logged in + entity: a dictionary containing keys "id", "name", and "format" + column_renames: a dictionary containing all column rename pairs from the config file, where + key = old column name, and value = new column name + + Returns: + a list of unique Ensembl IDs in the file, or an empty list if no Ensembl ID column found + """ + df = extract.get_entity_as_df(syn_id=entity["id"], source=entity["format"], syn=syn) + + # Use column_renames from the config to convert most Ensembl ID column names to "ensembl_gene_id". 
+ df = utils.standardize_column_names(df=df) + df = utils.rename_columns(df=df, column_map=column_renames) + + # Exception to the above comment: the 'networks' file has two ID columns (genea_ and geneb_ ensembl_gene_id) + # which do not get renamed + possible_col_names = [ + "ensembl_gene_id", + "genea_ensembl_gene_id", + "geneb_ensembl_gene_id", + ] + + file_ensembl_ids = [] + + # The data may have zero, one, or more than one (in the case of 'networks') column of Ensembl IDs + for C in possible_col_names: + if C in df.columns: + file_ensembl_ids = file_ensembl_ids + df[C].tolist() + + # Print any warnings and remove any NA values from the list before returning + if len(file_ensembl_ids) == 0: + print("WARNING: no Ensembl ID column found for " + entity["name"] + "!") + + if "n/A" in file_ensembl_ids: + print(entity["name"] + " has an n/A Ensembl ID") + file_ensembl_ids.remove("n/A") + + if np.NaN in file_ensembl_ids: + print( + entity["name"] + + " has " + + str(file_ensembl_ids.count(np.NaN)) + + " NaN Ensembl IDs" + ) + file_ensembl_ids = [x for x in file_ensembl_ids if x is not np.NaN] + + # Remove duplicate values + return list(set(file_ensembl_ids)) diff --git a/test_config.yaml b/test_config.yaml index 02bd0f18..a4d59ad3 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -144,7 +144,7 @@ datasets: - gene_info: files: - name: gene_metadata - id: syn25953363.13 + id: syn25953363.14 format: feather - name: igap id: syn12514826.5 @@ -187,7 +187,7 @@ datasets: possible_replacement: ensembl_possible_replacements permalink: ensembl_permalink provenance: - - syn25953363.13 + - syn25953363.14 - syn12514826.5 - syn12514912.3 - *agora_proteomics_provenance From 5e2e3ea154bbefcff00d745afdcbd21f70ba99f3 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Thu, 21 Nov 2024 11:43:04 -0800 Subject: [PATCH 2/8] Undid bump in gene_metadata version, it can't be increased until druggability removed from gene_info --- config.yaml | 4 ++-- test_config.yaml | 4 ++-- 2 files 
changed, 4 insertions(+), 4 deletions(-) diff --git a/config.yaml b/config.yaml index 1ace0892..7b8b4f4b 100644 --- a/config.yaml +++ b/config.yaml @@ -144,7 +144,7 @@ datasets: - gene_info: files: - name: gene_metadata - id: syn25953363.14 + id: syn25953363.13 format: feather - name: igap id: syn12514826.5 @@ -187,7 +187,7 @@ datasets: possible_replacement: ensembl_possible_replacements permalink: ensembl_permalink provenance: - - syn25953363.14 + - syn25953363.13 - syn12514826.5 - syn12514912.3 - *agora_proteomics_provenance diff --git a/test_config.yaml b/test_config.yaml index a4d59ad3..02bd0f18 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -144,7 +144,7 @@ datasets: - gene_info: files: - name: gene_metadata - id: syn25953363.14 + id: syn25953363.13 format: feather - name: igap id: syn12514826.5 @@ -187,7 +187,7 @@ datasets: possible_replacement: ensembl_possible_replacements permalink: ensembl_permalink provenance: - - syn25953363.14 + - syn25953363.13 - syn12514826.5 - syn12514912.3 - *agora_proteomics_provenance From bba8fc17c3f930b1c4b10e4c090d8e4832af63db Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Thu, 21 Nov 2024 13:00:38 -0800 Subject: [PATCH 3/8] Addressed SonarCloud issue with exceptions, updated gitignore with a few more local files to ignore --- .gitignore | 3 +++ .../preprocessing/preprocessing_utils.py | 9 ++++++--- tests/test_assets/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 9 insertions(+), 3 deletions(-) delete mode 100644 tests/test_assets/.DS_Store diff --git a/.gitignore b/.gitignore index 20c38245..909837f7 100644 --- a/.gitignore +++ b/.gitignore @@ -133,6 +133,7 @@ dmypy.json # local generated files staging/* +data_analysis/*/output/* #test staging location test_staging_dir/ @@ -141,3 +142,5 @@ test_staging_dir/ dev_config.yaml .vscode/ +.ipynb_checkpoints/ +.Rhistory diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py 
index d5f0bc5f..4ee36b02 100644 --- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py +++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py @@ -15,7 +15,7 @@ import re import synapseclient from io import StringIO -from typing import Union, Dict, List, Set, Tuple +from typing import Union, Dict, List, Set import agoradatatools.etl.utils as utils import agoradatatools.etl.extract as extract @@ -97,7 +97,8 @@ def query_ensembl_version_api(ensembl_ids: List[str]) -> pd.DataFrame: try: res = requests.post(url, headers=headers, data=request_data) ok = res.ok - except: + except requests.RequestException as ex: + print(ex) ok = False tries = tries + 1 @@ -158,6 +159,7 @@ def r_query_biomart() -> pd.DataFrame: "chromosome_name", and "hgnc_symbol" retrived from BioMart """ from rpy2.robjects import r + from rpy2.rinterface_lib.embedded import RRuntimeError r( 'if (!require("BiocManager", character.only = TRUE)) { install.packages("BiocManager") }' @@ -176,7 +178,8 @@ def r_query_biomart() -> pd.DataFrame: useCache=False, ) - except: + except RRuntimeError as ex: + print(ex) print("Trying again...") ensembl_ids = None else: diff --git a/tests/test_assets/.DS_Store b/tests/test_assets/.DS_Store deleted file mode 100644 index 46b71f5c36027abd59a6285a7688b02ddb6a4ed4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKu}T9$5S`H!0THgW+%FLPgC)em+7D0?6@-HnCf3*a+4|njC^;_yD~s?3X5Q`W zyxY51?ChGEFLqBiW}TVM;Y9mjm>TEl6FbWc(c3#7?8RQKw`aSaR3A=|dudxR-Vpcs zYh>Q<`ptUPZ&s-4K7ZL!=JpTF!(XGU6p#W^Knh5K(^mjJn>N1)RFnczKnnaQ!2O}X zi8XNuY+DBckMEqH5ovgCcL`w0aZMZoF$43U0)y&##qgjbU$U+y4uL^8Z!6b3P{}z7a{y!#ZBn70vNh#p7<^6JjU&`J(`8oF50)K>m n8){oQ!srly0bB9aUR~ic>T2Q;*yzYNI#34z>LQZ@|DnJa#>_qE From e5f7597509686ece683f9a20d85cbb8f0bbe9b0d Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Thu, 21 Nov 2024 17:23:43 -0800 Subject: [PATCH 4/8] More code cleanup, moved duplicate ensembl ID handling to preprocessing_utils --- 
.../AG-896_Preprocess_Gene_Annotations.ipynb | 2009 +---------------- .../preprocessing/preprocessing_utils.py | 78 + 2 files changed, 137 insertions(+), 1950 deletions(-) diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb index c0f2ad33..bfbab4b4 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb @@ -113,78 +113,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "a3fdbeec", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "UPGRADE AVAILABLE\n", - "\n", - "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n", - " pip install --upgrade synapseclient\n", - "\n", - "Python Synapse Client version 4.6.0 release notes\n", - "\n", - "https://python-docs.synapse.org/news/\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome, Jaclyn Beck!\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:synapseclient_default:Welcome, Jaclyn Beck!\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 19 files:\n", - "genes_biodomains:\tsyn44151254.5\n", - "neuropath_regression_results:\tsyn22017882.5\n", - "proteomics:\tsyn18689335.4\n", - "proteomics_tmt:\tsyn35221005.2\n", - "proteomics_srm:\tsyn52579640.4\n", - "target_exp_validation_harmonized:\tsyn24184512.9\n", - "metabolomics:\tsyn26064497.1\n", - "gene_metadata:\tsyn25953363.13\n", - "igap:\tsyn12514826.5\n", - "eqtl:\tsyn12514912.3\n", - "diff_exp_data:\tsyn27211942.1\n", - "target_list:\tsyn12540368.51\n", - "median_expression:\tsyn27211878.2\n", - 
"druggability:\tsyn13363443.11\n", - "tep_adi_info:\tsyn51942280.3\n", - "team_info:\tsyn12615624.18\n", - "team_member_info:\tsyn12615633.19\n", - "overall_scores:\tsyn25575156.13\n", - "networks:\tsyn11685347.1\n", - "\n", - "genes_biodomains has 591 NaN Ensembl IDs\n", - "WARNING: no Ensembl ID column found for team_info!\n", - "WARNING: no Ensembl ID column found for team_member_info!\n", - "\n", - "35858 Ensembl IDs found.\n", - "['ENSG00000151650', 'ENSG00000168268', 'ENSG00000186310', 'ENSG00000204616', 'ENSG00000158467']\n" - ] - } - ], + "outputs": [], "source": [ "file_ensembl_list = preprocessing_utils.get_all_adt_ensembl_ids(\n", " config_filename=config_filename,\n", @@ -206,18 +140,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "f1303e5b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "35858\n" - ] - } - ], + "outputs": [], "source": [ "ensembl_ids_df = pd.DataFrame({\"ensembl_gene_id\": file_ensembl_list})\n", "\n", @@ -239,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "4e7a37c8", "metadata": {}, "outputs": [], @@ -260,231 +186,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "7ebd03d4", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:biothings.client:querying 1-1000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 1001-2000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 2001-3000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 3001-4000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 4001-5000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 5001-6000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 6001-7000...\n", - 
"INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 7001-8000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 8001-9000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 9001-10000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 10001-11000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 11001-12000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 12001-13000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 13001-14000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 14001-15000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 15001-16000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 16001-17000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 17001-18000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 18001-19000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 19001-20000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 20001-21000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 21001-22000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 22001-23000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 23001-24000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 24001-25000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 25001-26000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 26001-27000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 27001-28000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 28001-29000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 
29001-30000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 30001-31000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 31001-32000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 32001-33000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 33001-34000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 34001-35000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 35001-35858...\n", - "INFO:biothings.client:done.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
_id_versionaliasnamesummarysymboltype_of_genenotfound
ensembl_gene_id
ENSG00000151650272871.0[HPX42B, NA88A, VENTX2]VENT homeoboxThis gene encodes a member of the Vent family ...VENTXprotein-codingNaN
ENSG00000168268649431.0NaN5'-nucleotidase domain containing 2Predicted to enable 5'-nucleotidase activity. ...NT5DC2protein-codingNaN
ENSG0000018631046751.0[MB20, NPL3]nucleosome assembly protein 1 like 3This gene is intronless and encodes a member o...NAP1L3protein-codingNaN
ENSG00000204616110741.0[C6orf13, HCG1, HCGI, RNF]tripartite motif containing 31This gene encodes a protein that functions as ...TRIM31protein-codingNaN
ENSG00000158467233821.0[ADOHCYASE3, IRBIT2]adenosylhomocysteinase like 2The protein encoded by this gene acts as a hom...AHCYL2protein-codingNaN
\n", - "
" - ], - "text/plain": [ - " _id _version alias \\\n", - "ensembl_gene_id \n", - "ENSG00000151650 27287 1.0 [HPX42B, NA88A, VENTX2] \n", - "ENSG00000168268 64943 1.0 NaN \n", - "ENSG00000186310 4675 1.0 [MB20, NPL3] \n", - "ENSG00000204616 11074 1.0 [C6orf13, HCG1, HCGI, RNF] \n", - "ENSG00000158467 23382 1.0 [ADOHCYASE3, IRBIT2] \n", - "\n", - " name \\\n", - "ensembl_gene_id \n", - "ENSG00000151650 VENT homeobox \n", - "ENSG00000168268 5'-nucleotidase domain containing 2 \n", - "ENSG00000186310 nucleosome assembly protein 1 like 3 \n", - "ENSG00000204616 tripartite motif containing 31 \n", - "ENSG00000158467 adenosylhomocysteinase like 2 \n", - "\n", - " summary symbol \\\n", - "ensembl_gene_id \n", - "ENSG00000151650 This gene encodes a member of the Vent family ... VENTX \n", - "ENSG00000168268 Predicted to enable 5'-nucleotidase activity. ... NT5DC2 \n", - "ENSG00000186310 This gene is intronless and encodes a member o... NAP1L3 \n", - "ENSG00000204616 This gene encodes a protein that functions as ... TRIM31 \n", - "ENSG00000158467 The protein encoded by this gene acts as a hom... 
AHCYL2 \n", - "\n", - " type_of_gene notfound \n", - "ensembl_gene_id \n", - "ENSG00000151650 protein-coding NaN \n", - "ENSG00000168268 protein-coding NaN \n", - "ENSG00000186310 protein-coding NaN \n", - "ENSG00000204616 protein-coding NaN \n", - "ENSG00000158467 protein-coding NaN " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "mg = mygene.MyGeneInfo()\n", "\n", @@ -500,21 +207,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "23bb114e", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Annotations found for 34655 genes.\n", - "No annotations found for 1206 genes.\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Annotations found for \" + str(sum(mygene_output[\"notfound\"].isna())) + \" genes.\")\n", "print(\n", @@ -538,151 +236,12 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "186d8cb8", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(35861, 9)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_id_id_versionaliasnamesummarysymboltype_of_genenotfound
0ENSG00000151650272871.0[HPX42B, NA88A, VENTX2]VENT homeoboxThis gene encodes a member of the Vent family ...VENTXprotein-codingNaN
1ENSG00000168268649431.0NaN5'-nucleotidase domain containing 2Predicted to enable 5'-nucleotidase activity. ...NT5DC2protein-codingNaN
2ENSG0000018631046751.0[MB20, NPL3]nucleosome assembly protein 1 like 3This gene is intronless and encodes a member o...NAP1L3protein-codingNaN
3ENSG00000204616110741.0[C6orf13, HCG1, HCGI, RNF]tripartite motif containing 31This gene encodes a protein that functions as ...TRIM31protein-codingNaN
4ENSG00000158467233821.0[ADOHCYASE3, IRBIT2]adenosylhomocysteinase like 2The protein encoded by this gene acts as a hom...AHCYL2protein-codingNaN
\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id _id _version alias \\\n", - "0 ENSG00000151650 27287 1.0 [HPX42B, NA88A, VENTX2] \n", - "1 ENSG00000168268 64943 1.0 NaN \n", - "2 ENSG00000186310 4675 1.0 [MB20, NPL3] \n", - "3 ENSG00000204616 11074 1.0 [C6orf13, HCG1, HCGI, RNF] \n", - "4 ENSG00000158467 23382 1.0 [ADOHCYASE3, IRBIT2] \n", - "\n", - " name \\\n", - "0 VENT homeobox \n", - "1 5'-nucleotidase domain containing 2 \n", - "2 nucleosome assembly protein 1 like 3 \n", - "3 tripartite motif containing 31 \n", - "4 adenosylhomocysteinase like 2 \n", - "\n", - " summary symbol type_of_gene \\\n", - "0 This gene encodes a member of the Vent family ... VENTX protein-coding \n", - "1 Predicted to enable 5'-nucleotidase activity. ... NT5DC2 protein-coding \n", - "2 This gene is intronless and encodes a member o... NAP1L3 protein-coding \n", - "3 This gene encodes a protein that functions as ... TRIM31 protein-coding \n", - "4 The protein encoded by this gene acts as a hom... AHCYL2 protein-coding \n", - "\n", - " notfound \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "gene_table_merged = pd.merge(\n", " left=ensembl_ids_df,\n", @@ -711,37 +270,15 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "285c10d2", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "# NaN or NULL alias values become empty lists\n", - "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n", - " lambda cell: cell if cell is not np.NaN else []\n", - ")\n", - "\n", - "# Some alias values are a single string, not a list. 
Turn them into lists here.\n", - "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n", - " lambda cell: cell if isinstance(cell, list) else [cell]\n", - ")\n", - "\n", - "\n", - "# Some alias values are lists of lists or have duplicate values\n", - "def flatten(row):\n", - " flattened = []\n", - " for item in row:\n", - " if isinstance(item, list):\n", - " flattened = flattened + item\n", - " else:\n", - " flattened.append(item)\n", - " return flattened\n", - "\n", - "\n", "gene_table_merged[\"alias\"] = gene_table_merged[\"alias\"].apply(\n", - " lambda row: list(set(flatten(row)))\n", + " preprocessing_utils.standardize_list_item\n", ")" ] }, @@ -752,456 +289,42 @@ "source": [ "## Remove duplicate Ensembl IDs from the list. \n", "\n", - "Duplicates in the list typically have the same Ensembl ID but different gene symbols. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. There's not a good way to reconcile this, so we first check for entries whose `symbol` is something other than \"LOC#######\", and designate that entry as the main row. If there are multiple or zero entries meeting that criteria, we just use the first entry in the list for each ensembl ID and discard the rest, which is what the Agora front end does. The gene symbols of duplicate rows are then added as aliases to the matching unique row." + "Duplicates in the list typically have the same Ensembl ID but different gene symbols. This usually happens when a single Ensembl ID maps to multiple Entrez IDs in the NCBI database. For every set of duplicated rows with the same Ensembl ID, we remove all rows but the first row in the set, and the symbols and aliases of the removed rows get added to the \"alias\" field of the first row." ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "bc63cc53", "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_id_id_versionaliasnamesummarysymboltype_of_genenotfound
19626ENSG000002497382856261.0[]uncharacterized LOC285626NaNLOC285626ncRNANaN
19627ENSG000002497381053776831.0[]uncharacterized LOC105377683NaNLOC105377683ncRNANaN
24698ENSG0000027638738021.0[CD158A, NKAT1, KIR2DL3, KIR-K64, NKAT-1, p58....killer cell immunoglobulin like receptor, two ...Killer cell immunoglobulin-like receptors (KIR...KIR2DL1protein-codingNaN
24699ENSG000002763871249005711.0[]killer cell immunoglobulin-like receptor 2DS1NaNLOC124900571protein-codingNaN
29514ENSG000002303731001332201.0[GOLGA6L3]golgin A6 family like 3, pseudogeneNaNGOLGA6L3PpseudoNaN
29515ENSG000002303736424021.0[GOLGA6L21P]golgin A6 family like 17, pseudogeneNaNGOLGA6L17PpseudoNaN
\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id _id _version \\\n", - "19626 ENSG00000249738 285626 1.0 \n", - "19627 ENSG00000249738 105377683 1.0 \n", - "24698 ENSG00000276387 3802 1.0 \n", - "24699 ENSG00000276387 124900571 1.0 \n", - "29514 ENSG00000230373 100133220 1.0 \n", - "29515 ENSG00000230373 642402 1.0 \n", - "\n", - " alias \\\n", - "19626 [] \n", - "19627 [] \n", - "24698 [CD158A, NKAT1, KIR2DL3, KIR-K64, NKAT-1, p58.... \n", - "24699 [] \n", - "29514 [GOLGA6L3] \n", - "29515 [GOLGA6L21P] \n", - "\n", - " name \\\n", - "19626 uncharacterized LOC285626 \n", - "19627 uncharacterized LOC105377683 \n", - "24698 killer cell immunoglobulin like receptor, two ... \n", - "24699 killer cell immunoglobulin-like receptor 2DS1 \n", - "29514 golgin A6 family like 3, pseudogene \n", - "29515 golgin A6 family like 17, pseudogene \n", - "\n", - " summary symbol \\\n", - "19626 NaN LOC285626 \n", - "19627 NaN LOC105377683 \n", - "24698 Killer cell immunoglobulin-like receptors (KIR... KIR2DL1 \n", - "24699 NaN LOC124900571 \n", - "29514 NaN GOLGA6L3P \n", - "29515 NaN GOLGA6L17P \n", - "\n", - " type_of_gene notfound \n", - "19626 ncRNA NaN \n", - "19627 ncRNA NaN \n", - "24698 protein-coding NaN \n", - "24699 protein-coding NaN \n", - "29514 pseudo NaN \n", - "29515 pseudo NaN " - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# duplicated() will return true if the ID is a duplicate and is not the first one to appear the list.\n", + "# For printing only\n", "dupes = gene_table_merged[\"ensembl_gene_id\"].duplicated()\n", - "dupe_vals = gene_table_merged[dupes]\n", + "dupe_ids = gene_table_merged.loc[dupes, \"ensembl_gene_id\"]\n", + "print(\n", + " gene_table_merged.loc[\n", + " gene_table_merged[\"ensembl_gene_id\"].isin(dupe_ids),\n", + " [\"ensembl_gene_id\", \"symbol\", \"alias\"],\n", + " ]\n", + ")\n", "\n", - "# Rows with duplicated Ensembl IDs\n", - "all_duplicated = 
gene_table_merged.loc[\n", - " gene_table_merged[\"ensembl_gene_id\"].isin(dupe_vals[\"ensembl_gene_id\"])\n", - "]\n", - "all_duplicated" + "# Remove duplicates\n", + "gene_table_merged = preprocessing_utils.merge_duplicate_ensembl_ids(gene_table_merged)" ] }, { "cell_type": "code", "execution_count": null, - "id": "093a2e98", + "id": "bc76d96e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3 duplicated genes have been processed.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_id_id_versionaliasnamesummarysymboltype_of_genenotfound
35848ENSG00000085998556241.0[RP76, LGMDR15, LGMD2O, gnT-I.2, GNTI.2, GnT I...protein O-linked mannose N-acetylglucosaminylt...This gene encodes a type II transmembrane prot...POMGNT1protein-codingNaN
35849ENSG00000285081ENSG000002850811.0[]NaNNaNNaNNaNNaN
35850ENSG00000126822260301.0[ARHGEF43, KIAA0599]pleckstrin homology and RhoGEF domain containi...Predicted to enable guanyl-nucleotide exchange...PLEKHG3protein-codingNaN
35851ENSG00000187240796591.0[DHC2, hdhc11, DNCH2, SRTD3, SRPS2B, ATD3, DHC...dynein cytoplasmic 2 heavy chain 1This gene encodes a large cytoplasmic dynein p...DYNC2H1protein-codingNaN
35852ENSG0000010147071251.0[CMYP15, CMYO15, CFAP85, FAP85, MYONRI]troponin C2, fast skeletal typeTroponin (Tn), a key protein complex in the re...TNNC2protein-codingNaN
35853ENSG000002414721005069941.0[]PTPRG antisense RNA 1NaNPTPRG-AS1ncRNANaN
35854ENSG00000133106942401.0[BRESI1]epithelial stromal interaction 1The protein encoded by this gene has been show...EPSTI1protein-codingNaN
35855ENSG000002303731001332201.0[GOLGA6L3, GOLGA6L21P, GOLGA6L17P]golgin A6 family like 3, pseudogeneNaNGOLGA6L3PpseudoNaN
35856ENSG000002497382856261.0[LOC105377683]uncharacterized LOC285626NaNLOC285626ncRNANaN
35857ENSG0000027638738021.0[CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...killer cell immunoglobulin like receptor, two ...Killer cell immunoglobulin-like receptors (KIR...KIR2DL1protein-codingNaN
\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id _id _version \\\n", - "35848 ENSG00000085998 55624 1.0 \n", - "35849 ENSG00000285081 ENSG00000285081 1.0 \n", - "35850 ENSG00000126822 26030 1.0 \n", - "35851 ENSG00000187240 79659 1.0 \n", - "35852 ENSG00000101470 7125 1.0 \n", - "35853 ENSG00000241472 100506994 1.0 \n", - "35854 ENSG00000133106 94240 1.0 \n", - "35855 ENSG00000230373 100133220 1.0 \n", - "35856 ENSG00000249738 285626 1.0 \n", - "35857 ENSG00000276387 3802 1.0 \n", - "\n", - " alias \\\n", - "35848 [RP76, LGMDR15, LGMD2O, gnT-I.2, GNTI.2, GnT I... \n", - "35849 [] \n", - "35850 [ARHGEF43, KIAA0599] \n", - "35851 [DHC2, hdhc11, DNCH2, SRTD3, SRPS2B, ATD3, DHC... \n", - "35852 [CMYP15, CMYO15, CFAP85, FAP85, MYONRI] \n", - "35853 [] \n", - "35854 [BRESI1] \n", - "35855 [GOLGA6L3, GOLGA6L21P, GOLGA6L17P] \n", - "35856 [LOC105377683] \n", - "35857 [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64... \n", - "\n", - " name \\\n", - "35848 protein O-linked mannose N-acetylglucosaminylt... \n", - "35849 NaN \n", - "35850 pleckstrin homology and RhoGEF domain containi... \n", - "35851 dynein cytoplasmic 2 heavy chain 1 \n", - "35852 troponin C2, fast skeletal type \n", - "35853 PTPRG antisense RNA 1 \n", - "35854 epithelial stromal interaction 1 \n", - "35855 golgin A6 family like 3, pseudogene \n", - "35856 uncharacterized LOC285626 \n", - "35857 killer cell immunoglobulin like receptor, two ... \n", - "\n", - " summary symbol \\\n", - "35848 This gene encodes a type II transmembrane prot... POMGNT1 \n", - "35849 NaN NaN \n", - "35850 Predicted to enable guanyl-nucleotide exchange... PLEKHG3 \n", - "35851 This gene encodes a large cytoplasmic dynein p... DYNC2H1 \n", - "35852 Troponin (Tn), a key protein complex in the re... TNNC2 \n", - "35853 NaN PTPRG-AS1 \n", - "35854 The protein encoded by this gene has been show... EPSTI1 \n", - "35855 NaN GOLGA6L3P \n", - "35856 NaN LOC285626 \n", - "35857 Killer cell immunoglobulin-like receptors (KIR... 
KIR2DL1 \n", - "\n", - " type_of_gene notfound \n", - "35848 protein-coding NaN \n", - "35849 NaN NaN \n", - "35850 protein-coding NaN \n", - "35851 protein-coding NaN \n", - "35852 protein-coding NaN \n", - "35853 ncRNA NaN \n", - "35854 protein-coding NaN \n", - "35855 pseudo NaN \n", - "35856 ncRNA NaN \n", - "35857 protein-coding NaN " - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "keep_df = gene_table_merged.drop(all_duplicated.index)\n", - "\n", - "# For each duplicated Ensembl ID, collapse to 1 row and append that row to keep_df\n", - "for ens_id in set(all_duplicated[\"ensembl_gene_id\"]):\n", - " group = all_duplicated.loc[all_duplicated[\"ensembl_gene_id\"] == ens_id].copy(\n", - " deep=True\n", - " )\n", - " # Put any entries with symbols that aren't \"LOC#####\" at the top of the data frame\n", - " matches = group[\"symbol\"].str.startswith(\"LOC\") == False\n", - " group = pd.concat([group.loc[matches], group.loc[matches == False]]).reset_index(\n", - " drop=True\n", - " )\n", - "\n", - " # Add all duplicate symbols and their aliases to the alias field of the first entry\n", - " for row in group.index[1:]:\n", - " group.at[group.index[0], \"alias\"].append(group[\"symbol\"][row])\n", - " if len(group.at[row, \"alias\"]) > 0:\n", - " group.at[group.index[0], \"alias\"] = (\n", - " group.at[group.index[0], \"alias\"] + group[\"alias\"][row]\n", - " )\n", - "\n", - " # Make sure we didn't add duplicate aliases\n", - " group.at[group.index[0], \"alias\"] = list(set(group.at[group.index[0], \"alias\"]))\n", - "\n", - " # Keep the first row only, which now has all the aliases\n", - " keep_df = pd.concat([keep_df, group.iloc[0].to_frame().T], ignore_index=True)\n", - "\n", - "print(\n", - " str(len(all_duplicated.drop_duplicates(\"ensembl_gene_id\")))\n", - " + \" duplicated genes have been processed.\"\n", - ")\n", - "gene_table_merged = keep_df.reset_index(drop=True)\n", - 
"gene_table_merged.tail(n=10)" + "print(str(len(dupe_ids.drop_duplicates())) + \" duplicated genes have been processed.\")\n", + "print(gene_table_merged.shape)\n", + "print(gene_table_merged.loc[gene_table_merged[\"ensembl_gene_id\"].isin(dupe_ids), \"alias\"])" ] }, { @@ -1218,64 +341,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "4a1bbdee", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " name date url version\n", - "1 Ensembl GRCh37 Feb 2014 https://grch37.ensembl.org GRCh37\n", - "2 Ensembl 113 Oct 2024 https://oct2024.archive.ensembl.org 113\n", - "3 Ensembl 112 May 2024 https://may2024.archive.ensembl.org 112\n", - "4 Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org 111\n", - "5 Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org 110\n", - "6 Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org 109\n", - "7 Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org 108\n", - "8 Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org 107\n", - "9 Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org 106\n", - "10 Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org 105\n", - "11 Ensembl 104 May 2021 https://may2021.archive.ensembl.org 104\n", - "12 Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org 103\n", - "13 Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org 102\n", - "14 Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org 101\n", - "15 Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org 100\n", - "16 Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org 99\n", - "17 Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org 98\n", - "18 Ensembl 80 May 2015 https://may2015.archive.ensembl.org 80\n", - "19 Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org 77\n", - "20 Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org 75\n", - "21 Ensembl 54 May 2009 https://may2009.archive.ensembl.org 54\n", 
- " current_release\n", - "1 \n", - "2 *\n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 \n", - "12 \n", - "13 \n", - "14 \n", - "15 \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 \n", - "\n" - ] - } - ], + "outputs": [], "source": [ "archive_df = r.listEnsemblArchives()\n", "archive_df.to_csvfile(path=archive_filename, row_names=False, quote=False)\n", @@ -1295,172 +366,12 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "9a747309", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Querying genes 1 - 1000\n", - "Querying genes 1001 - 2000\n", - "Querying genes 2001 - 3000\n", - "Querying genes 3001 - 4000\n", - "Querying genes 4001 - 5000\n", - "Querying genes 5001 - 6000\n", - "Querying genes 6001 - 7000\n", - "Querying genes 7001 - 8000\n", - "Querying genes 8001 - 9000\n", - "Querying genes 9001 - 10000\n", - "Querying genes 10001 - 11000\n", - "Querying genes 11001 - 12000\n", - "Querying genes 12001 - 13000\n", - "Querying genes 13001 - 14000\n", - "Querying genes 14001 - 15000\n", - "Querying genes 15001 - 16000\n", - "Querying genes 16001 - 17000\n", - "Querying genes 17001 - 18000\n", - "Querying genes 18001 - 19000\n", - "Querying genes 19001 - 20000\n", - "Querying genes 20001 - 21000\n", - "Querying genes 21001 - 22000\n", - "Querying genes 22001 - 23000\n", - "Querying genes 23001 - 24000\n", - "Querying genes 24001 - 25000\n", - "Querying genes 25001 - 26000\n", - "Querying genes 26001 - 27000\n", - "Querying genes 27001 - 28000\n", - "Querying genes 28001 - 29000\n", - "Querying genes 29001 - 30000\n", - "Querying genes 30001 - 31000\n", - "Querying genes 31001 - 32000\n", - "Querying genes 32001 - 33000\n", - "Querying genes 33001 - 34000\n", - "Querying genes 34001 - 35000\n", - "Querying genes 35001 - 35858\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
assemblypeptidepossible_replacementreleaselatesttypeidversionis_current
35853GRCh38None[]113ENSG00000241472.9GeneENSG0000024147291
35854GRCh38None[]113ENSG00000133106.15GeneENSG00000133106151
35855GRCh38None[]113ENSG00000230373.9GeneENSG0000023037391
35856GRCh38None[]113ENSG00000249738.11GeneENSG00000249738111
35857GRCh38None[]113ENSG00000276387.4GeneENSG0000027638741
\n", - "
" - ], - "text/plain": [ - " assembly peptide possible_replacement release latest type \\\n", - "35853 GRCh38 None [] 113 ENSG00000241472.9 Gene \n", - "35854 GRCh38 None [] 113 ENSG00000133106.15 Gene \n", - "35855 GRCh38 None [] 113 ENSG00000230373.9 Gene \n", - "35856 GRCh38 None [] 113 ENSG00000249738.11 Gene \n", - "35857 GRCh38 None [] 113 ENSG00000276387.4 Gene \n", - "\n", - " id version is_current \n", - "35853 ENSG00000241472 9 1 \n", - "35854 ENSG00000133106 15 1 \n", - "35855 ENSG00000230373 9 1 \n", - "35856 ENSG00000249738 11 1 \n", - "35857 ENSG00000276387 4 1 " - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions = preprocessing_utils.query_ensembl_version_api(\n", " ensembl_ids=gene_table_merged[\"ensembl_gene_id\"].tolist()\n", @@ -1471,71 +382,22 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "5c108238", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "release\n", - "100 21\n", - "101 8\n", - "102 16\n", - "103 12\n", - "104 17\n", - "105 10\n", - "106 35\n", - "107 12\n", - "108 4\n", - "109 4\n", - "110 11\n", - "111 52\n", - "112 354\n", - "113 34303\n", - "80 21\n", - "81 2\n", - "82 10\n", - "84 673\n", - "87 61\n", - "89 20\n", - "91 67\n", - "93 50\n", - "95 33\n", - "96 31\n", - "97 17\n", - "98 9\n", - "99 5\n", - "dtype: int64" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions.groupby(\"release\").size()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "bf5aecb1", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "35858\n", - "35858\n", - "True\n" - ] - } - ], + "outputs": [], "source": [ "# Check that all IDs are the same between the result and the gene table\n", "print(len(versions[\"id\"]))\n", @@ -1548,21 +410,10 
@@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "7fc8bbcd", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Make sure everything is GRCh38, not GRCh37\n", "all(versions[\"assembly\"] == \"GRCh38\")" @@ -1580,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "0d5b5652", "metadata": { "scrolled": true @@ -1604,39 +455,10 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "337b2890", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "closest_release\n", - "80 985\n", - "98 9\n", - "99 5\n", - "100 21\n", - "101 8\n", - "102 16\n", - "103 12\n", - "104 17\n", - "105 10\n", - "106 35\n", - "107 12\n", - "108 4\n", - "109 4\n", - "110 11\n", - "111 52\n", - "112 354\n", - "113 34303\n", - "dtype: int64" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions[\"closest_release\"] = 0\n", "\n", @@ -1653,149 +475,12 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "343e5006", "metadata": { "scrolled": false }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
assemblypeptidepossible_replacementreleaselatesttypeidversionis_currentclosest_releasepermalink
0GRCh38None[]113ENSG00000151650.8GeneENSG0000015165081113https://oct2024.archive.ensembl.org/Homo_sapie...
1GRCh38None[]113ENSG00000168268.11GeneENSG00000168268111113https://oct2024.archive.ensembl.org/Homo_sapie...
2GRCh38None[]113ENSG00000186310.10GeneENSG00000186310101113https://oct2024.archive.ensembl.org/Homo_sapie...
3GRCh38None[]113ENSG00000204616.11GeneENSG00000204616111113https://oct2024.archive.ensembl.org/Homo_sapie...
4GRCh38None[]113ENSG00000158467.17GeneENSG00000158467171113https://oct2024.archive.ensembl.org/Homo_sapie...
\n", - "
" - ], - "text/plain": [ - " assembly peptide possible_replacement release latest type \\\n", - "0 GRCh38 None [] 113 ENSG00000151650.8 Gene \n", - "1 GRCh38 None [] 113 ENSG00000168268.11 Gene \n", - "2 GRCh38 None [] 113 ENSG00000186310.10 Gene \n", - "3 GRCh38 None [] 113 ENSG00000204616.11 Gene \n", - "4 GRCh38 None [] 113 ENSG00000158467.17 Gene \n", - "\n", - " id version is_current closest_release \\\n", - "0 ENSG00000151650 8 1 113 \n", - "1 ENSG00000168268 11 1 113 \n", - "2 ENSG00000186310 10 1 113 \n", - "3 ENSG00000204616 11 1 113 \n", - "4 ENSG00000158467 17 1 113 \n", - "\n", - " permalink \n", - "0 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://oct2024.archive.ensembl.org/Homo_sapie... " - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions[\"permalink\"] = \"\"\n", "\n", @@ -1812,166 +497,20 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "4b01719d", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
assemblypeptidepossible_replacementreleaselatesttypeidversionis_currentclosest_releasepermalink
67GRCh38None[]84ENSG00000265108.1GeneENSG00000265108180https://may2015.archive.ensembl.org/Homo_sapie...
68GRCh38None[]80ENSG00000280803.1GeneENSG00000280803180https://may2015.archive.ensembl.org/Homo_sapie...
111GRCh38None[]84ENSG00000281672.1GeneENSG00000281672180https://may2015.archive.ensembl.org/Homo_sapie...
135GRCh38None[]87ENSG00000279857.1GeneENSG00000279857180https://may2015.archive.ensembl.org/Homo_sapie...
141GRCh38None[]84ENSG00000274483.1GeneENSG00000274483180https://may2015.archive.ensembl.org/Homo_sapie...
\n", - "
" - ], - "text/plain": [ - " assembly peptide possible_replacement release latest type \\\n", - "67 GRCh38 None [] 84 ENSG00000265108.1 Gene \n", - "68 GRCh38 None [] 80 ENSG00000280803.1 Gene \n", - "111 GRCh38 None [] 84 ENSG00000281672.1 Gene \n", - "135 GRCh38 None [] 87 ENSG00000279857.1 Gene \n", - "141 GRCh38 None [] 84 ENSG00000274483.1 Gene \n", - "\n", - " id version is_current closest_release \\\n", - "67 ENSG00000265108 1 80 \n", - "68 ENSG00000280803 1 80 \n", - "111 ENSG00000281672 1 80 \n", - "135 ENSG00000279857 1 80 \n", - "141 ENSG00000274483 1 80 \n", - "\n", - " permalink \n", - "67 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "68 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "111 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "135 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "141 https://may2015.archive.ensembl.org/Homo_sapie... " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions[versions[\"closest_release\"] < 100].head()" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "c4128cc9", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "https://oct2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000151650\n", - "https://oct2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000142192\n" - ] - } - ], + "outputs": [], "source": [ "print(versions[\"permalink\"][0])\n", "print(versions[\"permalink\"][25])" @@ -1979,21 +518,10 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "73791e6c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Does every gene have an associated URL?\n", "url_base_len = len(archive_table[\"url\"][0]) + 
1\n", @@ -2010,174 +538,10 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "f3edfd2f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(35858, 12)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_id_id_versionaliasnamesummarysymboltype_of_genenotfoundensembl_releasepossible_replacementpermalink
0ENSG00000151650272871.0[NA88A, HPX42B, VENTX2]VENT homeoboxThis gene encodes a member of the Vent family ...VENTXprotein-codingNaN113[]https://oct2024.archive.ensembl.org/Homo_sapie...
1ENSG00000168268649431.0[]5'-nucleotidase domain containing 2Predicted to enable 5'-nucleotidase activity. ...NT5DC2protein-codingNaN113[]https://oct2024.archive.ensembl.org/Homo_sapie...
2ENSG0000018631046751.0[MB20, NPL3]nucleosome assembly protein 1 like 3This gene is intronless and encodes a member o...NAP1L3protein-codingNaN113[]https://oct2024.archive.ensembl.org/Homo_sapie...
3ENSG00000204616110741.0[C6orf13, RNF, HCGI, HCG1]tripartite motif containing 31This gene encodes a protein that functions as ...TRIM31protein-codingNaN113[]https://oct2024.archive.ensembl.org/Homo_sapie...
4ENSG00000158467233821.0[IRBIT2, ADOHCYASE3]adenosylhomocysteinase like 2The protein encoded by this gene acts as a hom...AHCYL2protein-codingNaN113[]https://oct2024.archive.ensembl.org/Homo_sapie...
\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id _id _version alias \\\n", - "0 ENSG00000151650 27287 1.0 [NA88A, HPX42B, VENTX2] \n", - "1 ENSG00000168268 64943 1.0 [] \n", - "2 ENSG00000186310 4675 1.0 [MB20, NPL3] \n", - "3 ENSG00000204616 11074 1.0 [C6orf13, RNF, HCGI, HCG1] \n", - "4 ENSG00000158467 23382 1.0 [IRBIT2, ADOHCYASE3] \n", - "\n", - " name \\\n", - "0 VENT homeobox \n", - "1 5'-nucleotidase domain containing 2 \n", - "2 nucleosome assembly protein 1 like 3 \n", - "3 tripartite motif containing 31 \n", - "4 adenosylhomocysteinase like 2 \n", - "\n", - " summary symbol type_of_gene \\\n", - "0 This gene encodes a member of the Vent family ... VENTX protein-coding \n", - "1 Predicted to enable 5'-nucleotidase activity. ... NT5DC2 protein-coding \n", - "2 This gene is intronless and encodes a member o... NAP1L3 protein-coding \n", - "3 This gene encodes a protein that functions as ... TRIM31 protein-coding \n", - "4 The protein encoded by this gene acts as a hom... AHCYL2 protein-coding \n", - "\n", - " notfound ensembl_release possible_replacement \\\n", - "0 NaN 113 [] \n", - "1 NaN 113 [] \n", - "2 NaN 113 [] \n", - "3 NaN 113 [] \n", - "4 NaN 113 [] \n", - "\n", - " permalink \n", - "0 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://oct2024.archive.ensembl.org/Homo_sapie... " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "versions = versions[[\"id\", \"release\", \"possible_replacement\", \"permalink\"]]\n", "versions.rename(\n", @@ -2202,276 +566,21 @@ "metadata": {}, "source": [ "### Final cleanup\n", - "Unfilled \"possible_replacement\" entries should be changed from NaN to empty lists. 
\n", - "\n", - "\"possible_replacement\" entries that have data in them exist as a list of dicts, and need to have the Ensembl IDs pulled out of them as a list of strings. \n", + "\"possible_replacement\" entries will either be an empty list or a list of dictionaries. Entries that have data in them need to have the Ensembl IDs pulled out of them as a list of strings.\n", "\n", "Remove unneeded columns. " ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "d0c07b7a", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ensembl_gene_idnamealiassummarysymboltype_of_geneensembl_releasepossible_replacementpermalink
0ENSG00000151650VENT homeobox[NA88A, HPX42B, VENTX2]This gene encodes a member of the Vent family ...VENTXprotein-coding113[]https://oct2024.archive.ensembl.org/Homo_sapie...
1ENSG000001682685'-nucleotidase domain containing 2[]Predicted to enable 5'-nucleotidase activity. ...NT5DC2protein-coding113[]https://oct2024.archive.ensembl.org/Homo_sapie...
2ENSG00000186310nucleosome assembly protein 1 like 3[MB20, NPL3]This gene is intronless and encodes a member o...NAP1L3protein-coding113[]https://oct2024.archive.ensembl.org/Homo_sapie...
3ENSG00000204616tripartite motif containing 31[C6orf13, RNF, HCGI, HCG1]This gene encodes a protein that functions as ...TRIM31protein-coding113[]https://oct2024.archive.ensembl.org/Homo_sapie...
4ENSG00000158467adenosylhomocysteinase like 2[IRBIT2, ADOHCYASE3]The protein encoded by this gene acts as a hom...AHCYL2protein-coding113[]https://oct2024.archive.ensembl.org/Homo_sapie...
..............................
35853ENSG00000241472PTPRG antisense RNA 1[]NaNPTPRG-AS1ncRNA113[]https://oct2024.archive.ensembl.org/Homo_sapie...
35854ENSG00000133106epithelial stromal interaction 1[BRESI1]The protein encoded by this gene has been show...EPSTI1protein-coding113[]https://oct2024.archive.ensembl.org/Homo_sapie...
35855ENSG00000230373golgin A6 family like 3, pseudogene[GOLGA6L3, GOLGA6L21P, GOLGA6L17P]NaNGOLGA6L3Ppseudo113[]https://oct2024.archive.ensembl.org/Homo_sapie...
35856ENSG00000249738uncharacterized LOC285626[LOC105377683]NaNLOC285626ncRNA113[]https://oct2024.archive.ensembl.org/Homo_sapie...
35857ENSG00000276387killer cell immunoglobulin like receptor, two ...[CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64...Killer cell immunoglobulin-like receptors (KIR...KIR2DL1protein-coding113[]https://oct2024.archive.ensembl.org/Homo_sapie...
\n", - "

35858 rows × 9 columns

\n", - "
" - ], - "text/plain": [ - " ensembl_gene_id name \\\n", - "0 ENSG00000151650 VENT homeobox \n", - "1 ENSG00000168268 5'-nucleotidase domain containing 2 \n", - "2 ENSG00000186310 nucleosome assembly protein 1 like 3 \n", - "3 ENSG00000204616 tripartite motif containing 31 \n", - "4 ENSG00000158467 adenosylhomocysteinase like 2 \n", - "... ... ... \n", - "35853 ENSG00000241472 PTPRG antisense RNA 1 \n", - "35854 ENSG00000133106 epithelial stromal interaction 1 \n", - "35855 ENSG00000230373 golgin A6 family like 3, pseudogene \n", - "35856 ENSG00000249738 uncharacterized LOC285626 \n", - "35857 ENSG00000276387 killer cell immunoglobulin like receptor, two ... \n", - "\n", - " alias \\\n", - "0 [NA88A, HPX42B, VENTX2] \n", - "1 [] \n", - "2 [MB20, NPL3] \n", - "3 [C6orf13, RNF, HCGI, HCG1] \n", - "4 [IRBIT2, ADOHCYASE3] \n", - "... ... \n", - "35853 [] \n", - "35854 [BRESI1] \n", - "35855 [GOLGA6L3, GOLGA6L21P, GOLGA6L17P] \n", - "35856 [LOC105377683] \n", - "35857 [CD158A, NKAT1, KIR2DL3, LOC124900571, KIR-K64... \n", - "\n", - " summary symbol \\\n", - "0 This gene encodes a member of the Vent family ... VENTX \n", - "1 Predicted to enable 5'-nucleotidase activity. ... NT5DC2 \n", - "2 This gene is intronless and encodes a member o... NAP1L3 \n", - "3 This gene encodes a protein that functions as ... TRIM31 \n", - "4 The protein encoded by this gene acts as a hom... AHCYL2 \n", - "... ... ... \n", - "35853 NaN PTPRG-AS1 \n", - "35854 The protein encoded by this gene has been show... EPSTI1 \n", - "35855 NaN GOLGA6L3P \n", - "35856 NaN LOC285626 \n", - "35857 Killer cell immunoglobulin-like receptors (KIR... KIR2DL1 \n", - "\n", - " type_of_gene ensembl_release possible_replacement \\\n", - "0 protein-coding 113 [] \n", - "1 protein-coding 113 [] \n", - "2 protein-coding 113 [] \n", - "3 protein-coding 113 [] \n", - "4 protein-coding 113 [] \n", - "... ... ... ... 
\n", - "35853 ncRNA 113 [] \n", - "35854 protein-coding 113 [] \n", - "35855 pseudo 113 [] \n", - "35856 ncRNA 113 [] \n", - "35857 protein-coding 113 [] \n", - "\n", - " permalink \n", - "0 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "1 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "2 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "3 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "4 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "... ... \n", - "35853 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "35854 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "35855 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "35856 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "35857 https://oct2024.archive.ensembl.org/Homo_sapie... \n", - "\n", - "[35858 rows x 9 columns]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n", " \"possible_replacement\"\n", - "].apply(lambda cell: cell if cell is not np.NaN else [])\n", - "\n", - "gene_table_merged[\"possible_replacement\"] = gene_table_merged.apply(\n", - " lambda row: (\n", - " row[\"possible_replacement\"]\n", - " if len(row[\"possible_replacement\"]) == 0\n", - " else [x[\"stable_id\"] for x in row[\"possible_replacement\"]]\n", - " ),\n", - " axis=1,\n", - ")\n", + "].apply(lambda pr: pr if len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n", "\n", "gene_table_merged = gene_table_merged[\n", " [\n", @@ -2501,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "f2287922", "metadata": {}, "outputs": [], diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py index 4ee36b02..c9735fd2 100644 --- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py +++ 
b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py @@ -7,6 +7,10 @@ r_query_biomart - queries Biomart using rpy2 filter_hasgs - removes human alternative sequence genes from a data frame get_all_adt_ensembl_ids - gets the Ensembl IDs in all of the files ingested by ADT + standardize_list_item - turn values of varying types into a list. Used for fixing the "alias" and + "possible_replacement" fields of gene_metadata. + merge_duplicate_ensembl_ids - collapse rows with the same Ensembl ID but different gene symbols + or aliases into one row """ import pandas as pd @@ -330,3 +334,77 @@ def _extract_ensembl_ids( # Remove duplicate values return list(set(file_ensembl_ids)) + + +def standardize_list_item(item: Union[str, List[str]]) -> List[str]: + """ + For the gene_metadata data frame, some queries return columns that are a mixture of None/NaN, + a single string, and a list of strings. This function standardizes the column values so that + everything is a list, either empty (if NaN) or a list of strings. The final list is sorted + alphabetically to make comparison between different versions of the file easier. + + This function is intended to be called as part of an apply() statement on a pandas data frame + column. + + Args: + item: either a list of strings, a list of lists of strings, or np.NaN + + Returns: + A single-level list of strings, which may be empty. The list is sorted alphabetically. + """ + # Convert NaN to an empty list + if item is np.NaN: + return [] + + # Convert plain strings to a list of one string + if isinstance(item, str): + return [item] + + # Get unique values only and sort them + item = list(set(item)) + item.sort() + return item + + +def merge_duplicate_ensembl_ids(gene_table: pd.DataFrame) -> pd.DataFrame: + """ + MyGene queries sometimes return multiple rows with the same Ensembl ID but different symbols + or other information. 
This usually happens when a single Ensembl ID maps to multiple Entrez IDs + in the NCBI database. There's not a good way to reconcile this, so for every set of rows with the + same Ensembl ID, we designate the first entry in the set as the main row. The gene symbols of the + remaining rows in the set are then added as aliases to the "main" row, and all of their aliases + are added to the main row alias field as well. All rows in the set except the main row are then + deleted from the data frame, leaving a single row for that Ensembl ID with all symbols and aliases + from the duplicate rows merged into the alias field. + + Args: + gene_table: a pandas DataFrame containing gene metadata results from MyGene + + Returns: + a data frame with duplicate rows removed + """ + dupes = gene_table["ensembl_gene_id"].duplicated() + dupe_ids = gene_table.loc[dupes, "ensembl_gene_id"].drop_duplicates().tolist() + + for ens_id in dupe_ids: + rows = gene_table.loc[gene_table["ensembl_gene_id"] == ens_id] + + # Add duplicate rows' symbols to the alias field of the first row, then add duplicate rows' + # aliases to the first row's alias field. All other information in the duplicate rows is + # discarded. 
+ new_alias = rows.iloc[0]["alias"] + + for row in rows.index[1:]: + new_alias.append(rows.loc[row, "symbol"]) + new_alias = new_alias + rows.loc[row, "alias"] + + # Remove any duplicate aliases and sort them + new_alias = list(set(new_alias)) + new_alias.sort() + + # Set the new aliases to the first row in this group and remove all duplicate rows from the + # data frame + gene_table.at[rows.index[0], "alias"] = new_alias + gene_table = gene_table.drop(rows.index[1:]) + + return gene_table From 503d3012b7dcaeae967a9c13f55a823b1338ce88 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 22 Nov 2024 11:32:13 -0800 Subject: [PATCH 5/8] Updated uniprot mapping script to use new preprocessing function to get all ADT ids --- .../AG-1388_ENSG_Uniprot_Mapping.ipynb | 160 ++---------------- 1 file changed, 10 insertions(+), 150 deletions(-) diff --git a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb index ba477beb..2b369886 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-1388_ENSG_Uniprot_Mapping.ipynb @@ -20,16 +20,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from unipressed import IdMappingClient\n", "import time\n", "import pandas as pd\n", - "import numpy as np\n", - "import agoradatatools.etl.utils as utils\n", - "import agoradatatools.etl.extract as extract\n", + "import preprocessing_utils\n", "\n", "config_filename = \"../../../../config.yaml\"" ] @@ -43,157 +41,19 @@ "Loop through all data sets in the config file to get all Ensembl IDs used in every data set. NOTE: In the future, it would be simpler to just load the `gene_metadata` data set once druggability genes are removed from it, rather than looping through all of these files. 
" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'genes_biodomains': ('syn44151254.5', 'csv'),\n", - " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n", - " 'proteomics': ('syn18689335.3', 'csv'),\n", - " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n", - " 'proteomics_srm': ('syn52579640.4', 'csv'),\n", - " 'target_exp_validation_harmonized': ('syn24184512.9', 'csv'),\n", - " 'metabolomics': ('syn26064497.1', 'feather'),\n", - " 'igap': ('syn12514826.5', 'csv'),\n", - " 'eqtl': ('syn12514912.3', 'csv'),\n", - " 'diff_exp_data': ('syn27211942.1', 'tsv'),\n", - " 'target_list': ('syn12540368.47', 'csv'),\n", - " 'median_expression': ('syn27211878.2', 'csv'),\n", - " 'tep_adi_info': ('syn51942280.2', 'csv'),\n", - " 'team_info': ('syn12615624.18', 'csv'),\n", - " 'team_member_info': ('syn12615633.18', 'csv'),\n", - " 'overall_scores': ('syn25575156.13', 'table'),\n", - " 'networks': ('syn11685347.1', 'csv')}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config = utils._get_config(config_path=config_filename)\n", - "datasets = config[\"datasets\"]\n", - "\n", - "files = {}\n", - "\n", - "for dataset in datasets:\n", - " dataset_name = list(dataset.keys())[0]\n", - "\n", - " for entity in dataset[dataset_name][\"files\"]:\n", - " entity_id = entity[\"id\"]\n", - " entity_format = entity[\"format\"]\n", - " entity_name = entity[\"name\"]\n", - "\n", - " # Ignore json files, which are post-processed and not what we're interested in.\n", - " # Also ignore \"druggability\" since we want to exclude druggability-only genes, and \n", - " # \"gene_metadata\" which includes druggability genes.\n", - " if entity_format != \"json\" and entity_name not in [\"druggability\", \"gene_metadata\"]:\n", - " files[entity_name] = (entity_id, entity_format)\n", - "\n", - "# There are some duplicate synID's in this list but that doesn't 
really matter\n", - "files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### We should now have a list of all raw data files ingested. Get each one and create a list of IDs." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "UPGRADE AVAILABLE\n", - "\n", - "A more recent version of the Synapse Client (4.6.0) is available. Your version (4.0.0) can be upgraded by typing:\n", - " pip install --upgrade synapseclient\n", - "\n", - "Python Synapse Client version 4.6.0 release notes\n", - "\n", - "https://python-docs.synapse.org/news/\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome, Jaclyn Beck!\n", - "\n", - "INFO: 2024-11-15 11:43:36 | synapseclient_default | Welcome, Jaclyn Beck!\n", - "\n", - "genes_biodomains has an NaN Ensembl ID\n", - "WARNING: no Ensembl ID column found for team_info!\n", - "WARNING: no Ensembl ID column found for team_member_info!\n" - ] - } - ], - "source": [ - "syn = utils._login_to_synapse(token=None) # Assumes you have already logged in with a valid token\n", - "\n", - "# The various column names used to store Ensembl IDs in the files\n", - "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n", - "file_ensembl_list = []\n", - "\n", - "for file in files.keys():\n", - " df = extract.get_entity_as_df(syn_id=files[file][0], source=files[file][1], syn=syn)\n", - "\n", - " file_ensembl_ids = None\n", - "\n", - " for C in col_names:\n", - " if C in df.columns:\n", - " file_ensembl_ids = df[C]\n", - "\n", - " # networks file is a special case\n", - " if file == \"networks\":\n", - " file_ensembl_ids = pd.melt(\n", - " df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n", - " )[\"value\"]\n", - "\n", - " if file_ensembl_ids is not None:\n", - " file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n", - " if \"n/A\" in 
file_ensembl_ids.tolist():\n", - " print(file + \" has an n/A Ensembl ID\")\n", - " file_ensembl_list.remove(\"n/A\")\n", - " if np.NaN in file_ensembl_ids.tolist():\n", - " print(file + \" has an NaN Ensembl ID\")\n", - " else:\n", - " print(\"WARNING: no Ensembl ID column found for \" + file + \"!\")" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "35858\n" - ] - } - ], + "outputs": [], "source": [ - "file_ensembl_list = list(set(file_ensembl_list))\n", - "\n", - "# NaNs will be floats, so this removes them. Using np.isnan() on strings throws an error.\n", - "ensembl_ids = [x for x in file_ensembl_list if isinstance(x, str)]\n", - "\n", - "print(len(ensembl_ids))" + "ensembl_ids = preprocessing_utils.get_all_adt_ensembl_ids(\n", + " config_filename=config_filename,\n", + " exclude_files=[\"gene_metadata\", \"druggability\"],\n", + " token=None,\n", + ")\n", + "print(\"\")\n", + "print(str(len(ensembl_ids)) + \" Ensembl IDs found.\")" ] }, { From ab1bb82407000bdb046180992be108d2f8a5ffac Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 22 Nov 2024 13:45:49 -0800 Subject: [PATCH 6/8] Fixed standardize_list_item to work for possible_replacement --- .../AG-896_Preprocess_Gene_Annotations.ipynb | 4 ++++ .../notebooks/preprocessing/preprocessing_utils.py | 10 +++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb index bfbab4b4..e09882a4 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb @@ -578,6 +578,10 @@ "metadata": {}, "outputs": [], "source": [ + "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n", + " 
\"possible_replacement\"\n", + "].apply(preprocessing_utils.standardize_list_item)\n", + "\n", "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n", " \"possible_replacement\"\n", "].apply(lambda pr: pr if len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n", diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py index c9735fd2..f9119d01 100644 --- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py +++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py @@ -360,9 +360,13 @@ def standardize_list_item(item: Union[str, List[str]]) -> List[str]: if isinstance(item, str): return [item] - # Get unique values only and sort them - item = list(set(item)) - item.sort() + if isinstance(item, list): + # Get unique values only and sort them + item = list(set(item)) + item.sort() + + # No extra handling necessary for other data types + return item From 3a6065544b493e3efc528cb9c8c41a267589b565 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 22 Nov 2024 14:50:50 -0800 Subject: [PATCH 7/8] Fix to possible_replacement so the list field is standardized after it's actually a list of strings --- .../preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb index e09882a4..7550d17c 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb @@ -580,11 +580,11 @@ "source": [ "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n", " \"possible_replacement\"\n", - "].apply(preprocessing_utils.standardize_list_item)\n", + "].apply(lambda pr: pr if pr is np.NaN or 
len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n", "\n", "gene_table_merged[\"possible_replacement\"] = gene_table_merged[\n", " \"possible_replacement\"\n", - "].apply(lambda pr: pr if len(pr) == 0 else [x[\"stable_id\"] for x in pr])\n", + "].apply(preprocessing_utils.standardize_list_item)\n", "\n", "gene_table_merged = gene_table_merged[\n", " [\n", From ce9dc5e4eb3b6b6b14a122168de679a31423adc0 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 22 Nov 2024 14:55:31 -0800 Subject: [PATCH 8/8] Updated comment in the standardize list function --- .../agora/notebooks/preprocessing/preprocessing_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py index f9119d01..e85f441a 100644 --- a/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py +++ b/data_analysis/agora/notebooks/preprocessing/preprocessing_utils.py @@ -347,10 +347,10 @@ def standardize_list_item(item: Union[str, List[str]]) -> List[str]: column. Args: - item: either a list of strings, a list of lists of strings, or np.NaN + item: either a string, a list of strings, or np.NaN Returns: - A single-level list of strings, which may be empty. The list is sorted alphabetically. + A list of strings or an empty list. The list is sorted alphabetically. """ # Convert NaN to an empty list if item is np.NaN: