Merge pull request #281 from thehyve/update_cancer_types

Update cancer types
cBioPortal · Jul 31, 2018 · 43f8c38 · 43f8c38
2 parents 663cae5 + f9e52d3
commit 43f8c38
Show file tree

Hide file tree

Showing 5 changed files with 219 additions and 6 deletions.
diff --git a/.circleci/portalinfo/cancertypes.json b/.circleci/portalinfo/cancertypes.json
diff --git a/seedDB/README.md b/seedDB/README.md
@@ -16,13 +16,14 @@ When using a release version **> 1.14.0**, a migration step to a new database sc
 
 **Schema 2.6.0**: [SQL file with create table statements](https://raw.githubusercontent.com/cBioPortal/cbioportal/v1.13.1/db-scripts/src/main/resources/cgds.sql)<br>
 **Seed database**: [seed-cbioportal_hg19_v2.6.0.sql.gz](https://github.com/cBioPortal/datahub/raw/master/seedDB/seed-cbioportal_hg19_v2.6.0.sql.gz)<br>
-md5sum 01a1db3ae38d160d27af23c2f7db86f7
+md5sum 4f418394148cb4c0af8a4d739df2dfd7
 
 Contents of seed database:
 - Entrez Gene IDs, HGNC symbols and gene aliases updated in April 2018 from [NCBI](ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz)
 - Gene lengths retrieved from [Gencode Release 27 (mapped to GRCh37)](https://www.gencodegenes.org/releases/27lift37.html)
 - Pfam graphics fetched in August 2017
 - Gene Sets from MSigDB 6.1
+- Cancer Types from OncoTree (fetched July 2018 from http://oncotree.mskcc.org)
 
 ## Previous seed databases
 #### Seed database schema 2.4.0

diff --git a/seedDB/Update-Seed-Database.md b/seedDB/Update-Seed-Database.md
@@ -5,7 +5,11 @@ This documentation file is addressed to developers. To update the seed database
 
 2. Run the migration script from a branch that includes the new database schema ([more information](https://github.com/cBioPortal/cbioportal/blob/master/docs/Updating-your-cBioPortal-installation.md#running-the-migration-script)).
 
-3. Move to the folder where you want to save the seed files. Use the following commands (assuming that the database is running on port 8306) to generate the new seed files. Please specify the species and the new schema version in the file name (e.g. for the human version of `v2.1.0`, the file name should be `seed-cbioportal_hg19_v2.1.0.sql`).
+3. Update the gene and gene alias tables by following the instructions in https://github.com/cBioPortal/cbioportal/blob/master/docs/Updating-gene-and-gene_alias-tables.md
+
+4. Update the cancer types by running `./update_cancer_types.py -p <local_dir>/portal.properties`
+
+5. Move to the folder where you want to save the seed files. Use the following commands (assuming that the database is running on port 8306) to generate the new seed files. Please specify the species and the new schema version in the file name (e.g. for the human version of `v2.1.0`, the file name should be `seed-cbioportal_hg19_v2.1.0.sql`).
 
 :warning: Do not confuse the schema version with the cBioPortal version.
 
@@ -14,17 +18,17 @@ mysqldump -u cbio -pP@ssword1 -P 8306 --host 127.0.0.1 --ignore-table cbioportal
 ```
 :warning: The database schema is not included in these dump files.
 
-4. In case gene sets are included in the seed, manually add a line at the end the sql file to update the gene set version.
+6. In case gene sets are included in the seed, manually add a line at the end of the sql file to update the gene set version.
 ```bash
 -- Manually add gene set version
 UPDATE info SET GENESET_VERSION="msigdb_6.1";
 ```
 
-5. Zip the generated mysql dump files:
+7. Zip the generated mysql dump files:
 ```shell
 gzip seed-cbioportal_hg19_v2.1.0.sql
 ```
 
-6. New files are ready to be uploaded to datahub.
+8. New files are ready to be uploaded to datahub.
 
 :warning: The database schema itself is found at: `$PORTAL_HOME/db-scripts/src/main/resources/db/cgds.sql`
diff --git a/seedDB/seed-cbioportal_hg19_v2.6.0.sql.gz b/seedDB/seed-cbioportal_hg19_v2.6.0.sql.gz
diff --git a/seedDB/update_cancer_types.py b/seedDB/update_cancer_types.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python2.7
+
+# Copyright (c) 2018 The Hyve B.V.
+# This code is licensed under the GNU Affero General Public License (AGPL),
+# version 3, or (at your option) any later version.
+#
+# This file is part of cBioPortal.
+#
+# cBioPortal is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+"""Update cancer types script - updates cancer types from OncoTree API to an empty cBioPortal database
+
+http://oncotree.mskcc.org/#/home?tab=api
+
+Run with the command line option --help for usage information.
+"""
+
+from __future__ import print_function
+import requests
+import MySQLdb
+import sys
+import argparse
+
+
+def eprint(*args, **kwargs):
+    """Write a message to stderr and terminate the script with exit status 1.
+
+    Positional and keyword arguments are forwarded to ``print``; the output
+    stream is always ``sys.stderr``.
+    """
+    error_stream = sys.stderr
+    print(*args, file=error_stream, **kwargs)
+    raise SystemExit(1)
+
+
+def retrieve_oncotree_cancer_types():
+    """Retrieve cancer types from OncoTree API
+
+    Returns:
+        The decoded JSON body returned by the ``tumorTypes/tree`` endpoint
+        for the ``oncotree_latest_stable`` version.
+
+    Raises:
+        SystemExit: with status 1 when the API answers with status 404.
+        requests.exceptions.HTTPError: for any other unsuccessful status.
+        requests.exceptions.RequestException: when the request itself fails,
+            including when the server does not answer within the timeout.
+    """
+
+    print('Retrieving cancer types from OncoTree API')
+    request_url = 'http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_latest_stable'
+    request_headers = {'Accept': 'application/json'}
+
+    # Without a timeout a stalled server would make the script hang forever
+    response = requests.get(url=request_url, headers=request_headers, timeout=60)
+
+    if response.status_code == 404:
+        # eprint writes to stderr and exits with status 1
+        eprint('Error 404: OncoTree API endpoint not found: %s' % request_url)
+
+    # Raises HTTPError for every remaining 4xx/5xx status, does nothing otherwise
+    response.raise_for_status()
+    return response.json()
+
+
+def parse_cancer_types_json(cancer_types_json):
+    """Flatten the nested OncoTree JSON into a list of cancer type rows.
+
+    Every node below the 'TISSUE' root becomes one row (a list) holding
+    [type_of_cancer_id, name, clinical_trial_keywords, dedicated_color,
+    short_name, parent]. The key under which a node is stored in its
+    parent's 'children' mapping is used as both id and short name; the
+    keywords are the lower-cased name. A node's row precedes the rows of
+    its descendants.
+    """
+
+    print('Parsing JSON formatted cancer types received from OncoTree API')
+
+    flattened_rows = []
+
+    def collect_rows(code, subtree):
+        """Append the row for this node, then recurse into its children"""
+        flattened_rows.append([
+            code,
+            subtree['name'],
+            subtree['name'].lower(),
+            subtree['color'],
+            code,
+            subtree['parent'],
+        ])
+        for child_code, child_subtree in subtree['children'].items():
+            collect_rows(child_code, child_subtree)
+
+    # The root node, 'TISSUE', is not a cancer type itself and is skipped
+    for top_level_code, top_level_subtree in cancer_types_json['TISSUE']['children'].items():
+        collect_rows(top_level_code, top_level_subtree)
+
+    return flattened_rows
+
+
+def get_portal_properties(portal_properties_filename):
+    """Retrieve database settings from portal.properties
+
+    Blank lines, lines starting with '#' and lines without a '=' separator
+    are ignored. Every other line is split on its first '=' only, so a value
+    (for example a password) may itself contain '='.
+
+    Args:
+        portal_properties_filename: path to the portal.properties file.
+
+    Returns:
+        dict with the keys 'host', 'port' (int), 'database', 'user' and
+        'passwd'. A host of 'localhost' is returned as '127.0.0.1'.
+
+    The script exits with status 1 (through eprint) when db.host is not
+    formatted as host:port with a numeric port, or when one of the required
+    settings is missing from the file.
+    """
+
+    print('Retrieving database settings from portal.properties')
+
+    # portal.properties keys that are copied verbatim -> key in the result
+    direct_properties = {
+        'db.portal_db_name': 'database',
+        'db.user': 'user',
+        'db.password': 'passwd',
+    }
+
+    portal_properties = {}
+    with open(portal_properties_filename, 'r') as portal_properties_file:
+        for line in portal_properties_file:
+
+            # Skip line if it's blank or a comment
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+
+            # Split on the first '=' only; lines without '=' hold no property
+            property_key, separator, property_value = line.partition('=')
+            if not separator:
+                continue
+            property_key = property_key.strip()
+            property_value = property_value.strip()
+
+            if property_key == 'db.host':
+                # Check if host and port can be read.
+                host_and_port = property_value.split(':')
+                if len(host_and_port) != 2:
+                    eprint('Unable to read host from db.host in portal.properties.\n'
+                           'Expected format: host:port\n'
+                           'Found: %s' % property_value)
+                host, port = host_and_port
+
+                # localhost has to be converted to 127.0.0.1
+                if host == 'localhost':
+                    host = '127.0.0.1'
+                portal_properties['host'] = host
+
+                try:
+                    portal_properties['port'] = int(port)
+                except ValueError:
+                    eprint('Unable to read port from db.host in portal.properties.\n'
+                           'Expected format: host:port\n'
+                           'Found: %s' % property_value)
+
+            elif property_key in direct_properties:
+                portal_properties[direct_properties[property_key]] = property_value
+
+    required_properties = ['host', 'user', 'passwd', 'database', 'port']
+    for required_property in required_properties:
+        if required_property not in portal_properties:
+            eprint('Unable to extract %s from portal.properties' % required_property)
+    return portal_properties
+
+
+def insert_cancer_types(cancer_types, portal_properties):
+    """Insert cancer types in cBioPortal database
+
+    Replaces the complete contents of the type_of_cancer table. The script
+    exits with status 1 (through eprint) when the cancer_study table already
+    contains rows.
+
+    Args:
+        cancer_types: sequence of rows, each holding the values for
+            TYPE_OF_CANCER_ID, NAME, CLINICAL_TRIAL_KEYWORDS, DEDICATED_COLOR,
+            SHORT_NAME and PARENT, in that order.
+        portal_properties: dict with the connection settings 'host', 'user',
+            'passwd', 'database' and 'port'.
+    """
+
+    print('Inserting cancer types in cBioPortal database')
+    db = MySQLdb.connect(host=portal_properties['host'],
+                         user=portal_properties['user'],
+                         passwd=portal_properties['passwd'],
+                         db=portal_properties['database'],
+                         port=portal_properties['port'])
+    try:
+        cursor = db.cursor()
+
+        # Check whether the database is empty. The table name is deliberately
+        # not schema-qualified, so the database configured in
+        # portal.properties is the one that is checked; one row is enough to
+        # know the table is not empty.
+        cursor.execute('SELECT 1 FROM cancer_study LIMIT 1')
+        if cursor.fetchone() is not None:
+            eprint('cancer_study table is not empty. Please use an empty cBioPortal database when updating cancer '
+                   'types. Afterwards this empty database can be exported to create a new seed database.\n'
+                   'https://github.com/cBioPortal/datahub/tree/master/seedDB')
+
+        # Remove foreign key restrictions
+        # NOTE(review): presumably needed because MySQL refuses to TRUNCATE a
+        # table that is referenced by a foreign key - confirm.
+        cursor.execute('ALTER TABLE cancer_study DROP FOREIGN KEY cancer_study_ibfk_1')
+        cursor.execute('ALTER TABLE sample DROP FOREIGN KEY sample_ibfk_2')
+
+        try:
+            # Remove previous cancer types
+            cursor.execute('TRUNCATE type_of_cancer')
+
+            # Insert new cancer types
+            sql = 'INSERT INTO type_of_cancer ' \
+                  '(TYPE_OF_CANCER_ID, NAME, CLINICAL_TRIAL_KEYWORDS, DEDICATED_COLOR, SHORT_NAME, PARENT) ' \
+                  'VALUES (%s, %s, %s, %s, %s, %s)'
+            cursor.executemany(sql, cancer_types)
+        finally:
+            # Restore foreign key restrictions, also when the truncate or
+            # insert failed, so the schema is never left without them
+            cursor.execute('ALTER TABLE cancer_study ADD CONSTRAINT cancer_study_ibfk_1 FOREIGN KEY '
+                           '(TYPE_OF_CANCER_ID) REFERENCES type_of_cancer (TYPE_OF_CANCER_ID)')
+            cursor.execute('ALTER TABLE sample ADD CONSTRAINT sample_ibfk_2 FOREIGN KEY (TYPE_OF_CANCER_ID) '
+                           'REFERENCES type_of_cancer (TYPE_OF_CANCER_ID)')
+        db.commit()
+    finally:
+        # Always release the connection, including when eprint exits
+        db.close()
+
+
+def main(portal_properties_filename):
+    """Fetch the OncoTree cancer types and load them into the database.
+
+    portal_properties_filename is the path of the portal.properties file
+    from which the database connection settings are read.
+    """
+    # Download and flatten the cancer types first
+    oncotree_json = retrieve_oncotree_cancer_types()
+    flattened_types = parse_cancer_types_json(oncotree_json)
+
+    # Then read the connection settings and write the cancer types
+    db_settings = get_portal_properties(portal_properties_filename)
+    insert_cancer_types(flattened_types, db_settings)
+
+
+if __name__ == '__main__':
+    # Command line entry point: the only input is the path to portal.properties
+    description_text = ('This code updates the cancer types in an empty cBioPortal database. '
+                        'Afterwards this database can be exported to create a new seed '
+                        'database.')
+    parser = argparse.ArgumentParser(usage='-p <portal_properties_file>',
+                                     description=description_text)
+
+    named_arguments = parser.add_argument_group('Named arguments')
+    named_arguments.add_argument('-p', '--portal_properties_file',
+                                 required=True,
+                                 help='Path to portal.properties file')
+
+    main(parser.parse_args().portal_properties_file)