diff --git a/.github/workflows/validate-python.yml b/.github/workflows/validate-python.yml index ccf67a61..507c4e6e 100644 --- a/.github/workflows/validate-python.yml +++ b/.github/workflows/validate-python.yml @@ -14,7 +14,7 @@ jobs: - name: 'Validate tests' working-directory: ./cbioportal-core run: | - docker run -v ${PWD}:/cbioportal-core python:3.6 /bin/bash -c ' + docker run -v ${PWD}:/cbioportal-core python:3.6 /bin/sh -c ' cd cbioportal-core && pip install -r requirements.txt && - source test_scripts.sh' + ./test_scripts.sh' diff --git a/README.md b/README.md index 60433f9c..f4148fd3 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,59 @@ This repo contains: ## Inclusion in main codebase The `cbioportal-core` code is currently included in the final Docker image during the Docker build process: https://github.com/cBioPortal/cbioportal/blob/master/docker/web-and-data/Dockerfile#L48 +## Running in docker + +Build docker image with: +```bash +docker build -t cbioportal-core . +``` + +### Example of how to load `study_es_0` study + +Import gene panels + +```bash +docker run -it -v $(pwd)/tests/test_data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +perl importGenePanel.pl --data /data/study_es_0/data_gene_panel_testpanel1.txt +docker run -it -v $(pwd)/tests/test_data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +perl importGenePanel.pl --data /data/study_es_0/data_gene_panel_testpanel2.txt +``` + +Import gene sets and supplementary data + +```bash +docker run -it -v $(pwd)/src/test/resources/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +perl importGenesetData.pl --data /data/genesets/study_es_0_genesets.gmt --new-version msigdb_7.5.1 --supp /data/genesets/study_es_0_supp-genesets.txt +``` + +Import gene set hierarchy data + +```bash +docker run -it -v $(pwd)/src/test/resources/:/data/ -v $(pwd)/application.properties:/application.properties 
cbioportal-core \ +perl importGenesetHierarchy.pl --data /data/genesets/study_es_0_tree.yaml +``` + +Import study + +```bash +docker run -it -v $(pwd)/tests/test_data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +python importer/metaImport.py -s /data/study_es_0 -p /data/api_json_system_tests -o +``` + +### Incremental upload of data + +To add or update specific patient, sample, or molecular data in an already loaded study, you can perform an incremental upload. This process is quicker than reloading the entire study. + +To execute an incremental upload, use the `-d` (or `--data_directory`) option instead of `-s` (or `--study_directory`). Here is an example command: +```bash +docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -d /data/study_es_0_inc -p /data/api_json -o +``` +**Note:** +While the directory should adhere to the standard cBioPortal file formats and study structure, incremental uploads are not supported for all data types. +For instance, uploading study metadata, resources, or GSVA data incrementally is currently unsupported. + +This method ensures efficient updates without the need for complete study reuploads, saving time and computational resources. + ## How to run integration tests This section guides you through the process of running integration tests by setting up a cBioPortal MySQL database environment using Docker. Please follow these steps carefully to ensure your testing environment is configured correctly. @@ -78,7 +131,7 @@ After you are done with the setup, you can build and test the project. 1. Execute tests through the provided script: ```bash -source test_scripts.sh +./test_scripts.sh ``` 2. 
Build the loader jar using Maven (includes testing): @@ -119,15 +172,3 @@ The script will search for `core-*.jar` in the root of the project: python scripts/importer/metaImport.py -s tests/test_data/study_es_0 -p tests/test_data/api_json_unit_tests -o ``` -## Running in docker - -Build docker image with: -```bash -docker build -t cbioportal-core . -``` - -Example of how to start the loading: -```bash -docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -s /data/study_es_0 -p /data/api_json -o -``` - diff --git a/pom.xml b/pom.xml index 4e176537..f89513f6 100644 --- a/pom.xml +++ b/pom.xml @@ -252,6 +252,9 @@ org.apache.maven.plugins maven-surefire-plugin 2.21.0 + + false + default-test diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index 97073ff6..c2f65cc0 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -12,6 +12,7 @@ import logging import re from pathlib import Path +from typing import Dict, Tuple # configure relative imports if running as a script; see PEP 366 # it might passed as empty string by certain tooling to mark a top level module @@ -39,6 +40,8 @@ from .cbioportal_common import ADD_CASE_LIST_CLASS from .cbioportal_common import VERSION_UTIL_CLASS from .cbioportal_common import run_java +from .cbioportal_common import UPDATE_CASE_LIST_CLASS +from .cbioportal_common import INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES # ------------------------------------------------------------------------------ @@ -101,8 +104,17 @@ def remove_study_id(jvm_args, study_id): args.append("--noprogress") # don't report memory usage and % progress run_java(*args) +def update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir = None): + args = jvm_args.split(' ') + args.append(UPDATE_CASE_LIST_CLASS) + args.append("--meta") + args.append(meta_filename) + if case_lists_file_or_dir: + 
args.append("--case-lists") + args.append(case_lists_file_or_dir) + run_java(*args) -def import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity = None, meta_file_dictionary = None): +def import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity = None, meta_file_dictionary = None, incremental = False): args = jvm_args.split(' ') # In case the meta file is already parsed in a previous function, it is not @@ -133,6 +145,10 @@ def import_study_data(jvm_args, meta_filename, data_filename, update_generic_ass importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_file_type] args.append(importer) + if incremental: + if meta_file_type not in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: + raise NotImplementedError("This type does not support incremental upload: {}".format(meta_file_type)) + args.append("--overwrite-existing") if IMPORTER_REQUIRES_METADATA[importer]: args.append("--meta") args.append(meta_filename) @@ -212,11 +228,20 @@ def process_command(jvm_args, command, meta_filename, data_filename, study_ids, else: raise RuntimeError('Your command uses both -id and -meta. 
Please, use only one of the two parameters.') elif command == IMPORT_STUDY_DATA: - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity) elif command == IMPORT_CASE_LIST: import_case_list(jvm_args, meta_filename) -def process_directory(jvm_args, study_directory, update_generic_assay_entity = None): +def get_meta_filenames(data_directory): + meta_filenames = [ + os.path.join(data_directory, meta_filename) for + meta_filename in os.listdir(data_directory) if + re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, + flags=re.IGNORECASE) and + not (meta_filename.startswith('.') or meta_filename.endswith('~'))] + return meta_filenames + +def process_study_directory(jvm_args, study_directory, update_generic_assay_entity = None): """ Import an entire study directory based on meta files found. @@ -241,12 +266,7 @@ def process_directory(jvm_args, study_directory, update_generic_assay_entity = N cna_long_filepair = None # Determine meta filenames in study directory - meta_filenames = ( - os.path.join(study_directory, meta_filename) for - meta_filename in os.listdir(study_directory) if - re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, - flags=re.IGNORECASE) and - not (meta_filename.startswith('.') or meta_filename.endswith('~'))) + meta_filenames = get_meta_filenames(study_directory) # Read all meta files (excluding case lists) to determine what to import for meta_filename in meta_filenames: @@ -353,53 +373,53 @@ def process_directory(jvm_args, study_directory, update_generic_assay_entity = N raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Next, we need 
to import resource definitions for resource data if resource_definition_filepair is not None: meta_filename, data_filename = resource_definition_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Next, we need to import sample definitions for resource data if sample_resource_filepair is not None: meta_filename, data_filename = sample_resource_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Next, import everything else except gene panel, structural variant data, GSVA and # z-score expression. If in the future more types refer to each other, (like # in a tree structure) this could be programmed in a recursive fashion. 
for meta_filename, data_filename in regular_filepairs: - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import structural variant data if structural_variant_filepair is not None: meta_filename, data_filename = structural_variant_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import cna data if cna_long_filepair is not None: meta_filename, data_filename = cna_long_filepair - import_study_data(jvm_args=jvm_args, meta_filename=meta_filename, data_filename=data_filename, - meta_file_dictionary=study_meta_dictionary[meta_filename]) + import_data(jvm_args=jvm_args, meta_filename=meta_filename, data_filename=data_filename, + meta_file_dictionary=study_meta_dictionary[meta_filename]) # Import expression z-score (after expression) for meta_filename, data_filename in zscore_filepairs: - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import GSVA genetic profiles (after expression and z-scores) if gsva_score_filepair is not None: # First import the GSVA score data meta_filename, data_filename = gsva_score_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Second import the GSVA p-value data meta_filename, data_filename = gsva_pvalue_filepair - import_study_data(jvm_args, 
meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) if gene_panel_matrix_filepair is not None: meta_filename, data_filename = gene_panel_matrix_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') @@ -412,6 +432,72 @@ def process_directory(jvm_args, study_directory, update_generic_assay_entity = N # enable study update_study_status(jvm_args, study_id) +def get_meta_filenames_by_type(data_directory) -> Dict[str, Tuple[str, Dict]]: + """ + Read all meta files in the data directory and return meta information (filename, content) grouped by type. 
+ """ + meta_file_type_to_meta_files = {} + + # Determine meta filenames in study directory + meta_filenames = get_meta_filenames(data_directory) + + # Read all meta files (excluding case lists) to determine what to import + for meta_filename in meta_filenames: + + # Parse meta file + meta_dictionary = cbioportal_common.parse_metadata_file( + meta_filename, logger=LOGGER) + + # Retrieve meta file type + meta_file_type = meta_dictionary['meta_file_type'] + if meta_file_type is None: + # invalid meta file, let's die + raise RuntimeError('Invalid meta file: ' + meta_filename) + if meta_file_type not in meta_file_type_to_meta_files: + meta_file_type_to_meta_files[meta_file_type] = [] + + meta_file_type_to_meta_files[meta_file_type].append((meta_filename, meta_dictionary)) + return meta_file_type_to_meta_files + +def import_incremental_data(jvm_args, data_directory, update_generic_assay_entity, meta_file_type_to_meta_files): + """ + Load all data types that are available and support incremental upload + """ + for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: + if meta_file_type not in meta_file_type_to_meta_files: + continue + meta_pairs = meta_file_type_to_meta_files[meta_file_type] + for meta_pair in meta_pairs: + meta_filename, meta_dictionary = meta_pair + data_filename = os.path.join(data_directory, meta_dictionary['data_filename']) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_dictionary, incremental=True) + +def update_case_lists_from_folder(jvm_args, data_directory, meta_file_type_to_meta_files): + """ + Updates case lists if clinical sample provided. + The command takes case_list/ folder as optional argument. + If folder exists case lists will be updated accordingly. 
+ """ + if MetaFileTypes.SAMPLE_ATTRIBUTES in meta_file_type_to_meta_files: + case_list_dirname = os.path.join(data_directory, 'case_lists') + sample_attributes_metas = meta_file_type_to_meta_files[MetaFileTypes.SAMPLE_ATTRIBUTES] + for meta_pair in sample_attributes_metas: + meta_filename, meta_dictionary = meta_pair + LOGGER.info('Updating case lists with sample ids', extra={'filename_': meta_filename}) + update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir=case_list_dirname if os.path.isdir(case_list_dirname) else None) + +def process_data_directory(jvm_args, data_directory, update_generic_assay_entity = None): + """ + Incremental import of data directory based on meta files found. + """ + + meta_file_type_to_meta_files = get_meta_filenames_by_type(data_directory) + + not_supported_meta_types = meta_file_type_to_meta_files.keys() - INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES + if not_supported_meta_types: + raise NotImplementedError("These types do not support incremental upload: {}".format(", ".join(not_supported_meta_types))) + import_incremental_data(jvm_args, data_directory, update_generic_assay_entity, meta_file_type_to_meta_files) + update_case_lists_from_folder(jvm_args, data_directory, meta_file_type_to_meta_files) def usage(): # TODO : replace this by usage string from interface() @@ -435,26 +521,27 @@ def check_files(meta_filename, data_filename): print('data-file cannot be found:' + data_filename, file=ERROR_FILE) sys.exit(2) -def check_dir(study_directory): +def check_dir(data_directory): # check existence of directory - if not os.path.exists(study_directory) and study_directory != '': - print('Study cannot be found: ' + study_directory, file=ERROR_FILE) + if not os.path.exists(data_directory) and data_directory != '': + print('Directory cannot be found: ' + data_directory, file=ERROR_FILE) sys.exit(2) def add_parser_args(parser): - parser.add_argument('-s', '--study_directory', type=str, required=False, - help='Path to Study Directory') + 
data_source_group = parser.add_mutually_exclusive_group() + data_source_group.add_argument('-s', '--study_directory', type=str, help='Path to Study Directory') + data_source_group.add_argument('-d', '--data_directory', type=str, help='Path to Data Directory') parser.add_argument('-jvo', '--java_opts', type=str, default=os.environ.get('JAVA_OPTS'), help='Path to specify JAVA_OPTS for the importer. \ - (default: gets the JAVA_OPTS from the environment)') + (default: gets the JAVA_OPTS from the environment)') parser.add_argument('-jar', '--jar_path', type=str, required=False, - help='Path to scripts JAR file') + help='Path to scripts JAR file') parser.add_argument('-meta', '--meta_filename', type=str, required=False, help='Path to meta file') parser.add_argument('-data', '--data_filename', type=str, required=False, help='Path to Data file') -def interface(): +def interface(args=None): parent_parser = argparse.ArgumentParser(description='cBioPortal meta Importer') add_parser_args(parent_parser) parser = argparse.ArgumentParser() @@ -484,7 +571,7 @@ def interface(): # TODO - add same argument to metaimporter # TODO - harmonize on - and _ - parser = parser.parse_args() + parser = parser.parse_args(args) if parser.command is not None and parser.subcommand is not None: print('Cannot call multiple commands') sys.exit(2) @@ -547,14 +634,16 @@ def main(args): # process the options jvm_args = "-Dspring.profiles.active=dbcp " + args.java_opts - study_directory = args.study_directory # check if DB version and application version are in sync check_version(jvm_args) - if study_directory != None: - check_dir(study_directory) - process_directory(jvm_args, study_directory, args.update_generic_assay_entity) + if args.data_directory is not None: + check_dir(args.data_directory) + process_data_directory(jvm_args, args.data_directory, args.update_generic_assay_entity) + elif args.study_directory is not None: + check_dir(args.study_directory) + process_study_directory(jvm_args, 
args.study_directory, args.update_generic_assay_entity) else: check_args(args.command) check_files(args.meta_filename, args.data_filename) diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index 35f71d34..e4bbe041 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -37,6 +37,7 @@ IMPORT_CANCER_TYPE_CLASS = "org.mskcc.cbio.portal.scripts.ImportTypesOfCancers" IMPORT_CASE_LIST_CLASS = "org.mskcc.cbio.portal.scripts.ImportSampleList" ADD_CASE_LIST_CLASS = "org.mskcc.cbio.portal.scripts.AddCaseList" +UPDATE_CASE_LIST_CLASS = "org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds" VERSION_UTIL_CLASS = "org.mskcc.cbio.portal.util.VersionUtil" PORTAL_PROPERTY_DATABASE_USER = 'db.user' @@ -364,6 +365,27 @@ class MetaFileTypes(object): }, } +# order is important! This is the order in which they should be loaded: +INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES = [ + MetaFileTypes.PATIENT_ATTRIBUTES, + MetaFileTypes.SAMPLE_ATTRIBUTES, + MetaFileTypes.MUTATION, + MetaFileTypes.MUTATION_UNCALLED, + MetaFileTypes.EXPRESSION, + MetaFileTypes.CNA_DISCRETE, + MetaFileTypes.CNA_CONTINUOUS, + MetaFileTypes.CNA_LOG2, + MetaFileTypes.METHYLATION, + MetaFileTypes.PROTEIN, + MetaFileTypes.GENERIC_ASSAY_CONTINUOUS, + MetaFileTypes.GENERIC_ASSAY_BINARY, + MetaFileTypes.GENERIC_ASSAY_CATEGORICAL, + MetaFileTypes.TIMELINE, + MetaFileTypes.GENE_PANEL_MATRIX, + MetaFileTypes.STRUCTURAL_VARIANT, + MetaFileTypes.SEG, +] + IMPORTER_CLASSNAME_BY_META_TYPE = { MetaFileTypes.STUDY: IMPORT_STUDY_CLASS, MetaFileTypes.CANCER_TYPE: IMPORT_CANCER_TYPE_CLASS, diff --git a/scripts/importer/metaImport.py b/scripts/importer/metaImport.py index 7fdced9f..7b6a1a35 100755 --- a/scripts/importer/metaImport.py +++ b/scripts/importer/metaImport.py @@ -56,8 +56,11 @@ class Color(object): def interface(): parser = argparse.ArgumentParser(description='cBioPortal meta Importer') - parser.add_argument('-s', '--study_directory', type=str, 
required=True, - help='path to directory.') + data_source_group = parser.add_mutually_exclusive_group() + data_source_group.add_argument('-s', '--study_directory', + type=str, help='path to study directory.') + data_source_group.add_argument('-d', '--data_directory', + type=str, help='path to data directory for incremental upload.') portal_mode_group = parser.add_mutually_exclusive_group() portal_mode_group.add_argument('-u', '--url_server', type=str, @@ -115,7 +118,7 @@ def interface(): # supply parameters that the validation script expects to have parsed args.error_file = False - study_dir = args.study_directory + data_dir = args.data_directory if args.data_directory is not None else args.study_directory # Validate the study directory. print("Starting validation...\n", file=sys.stderr) @@ -139,9 +142,9 @@ def interface(): # Import OncoKB annotations when asked, and there are no validation warnings or warnings are overruled study_is_valid = exitcode == 0 or (exitcode == 3 and args.override_warning) if study_is_valid and args.import_oncokb: - mutation_meta_file_path = libImportOncokb.find_meta_file_by_fields(study_dir, {'genetic_alteration_type': 'MUTATION_EXTENDED'}) + mutation_meta_file_path = libImportOncokb.find_meta_file_by_fields(data_dir, {'genetic_alteration_type': 'MUTATION_EXTENDED'}) mutation_data_file_name = libImportOncokb.find_data_file_from_meta_file(mutation_meta_file_path) - mutation_data_file_path = os.path.join(study_dir, mutation_data_file_name) + mutation_data_file_path = os.path.join(data_dir, mutation_data_file_name) study_is_modified = False print("\n") if os.path.exists(mutation_data_file_path): @@ -163,9 +166,9 @@ def interface(): for log_handler in validator_logger.handlers: log_handler.close() validator_logger.handlers = [] - cna_meta_file_path = libImportOncokb.find_meta_file_by_fields(study_dir, {'genetic_alteration_type': 'COPY_NUMBER_ALTERATION', 'datatype': 'DISCRETE'}) + cna_meta_file_path = 
libImportOncokb.find_meta_file_by_fields(data_dir, {'genetic_alteration_type': 'COPY_NUMBER_ALTERATION', 'datatype': 'DISCRETE'}) cna_data_file_name = libImportOncokb.find_data_file_from_meta_file(cna_meta_file_path) - cna_data_file_path = os.path.join(study_dir, cna_data_file_name) + cna_data_file_path = os.path.join(data_dir, cna_data_file_name) if os.path.exists(cna_data_file_path): print("Starting import of OncoKB annotations for discrete CNA file ...\n", file=sys.stderr) try: diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index 1f473abb..079479ba 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -70,7 +70,6 @@ DEFINED_SAMPLE_IDS = None DEFINED_SAMPLE_ATTRIBUTES = None PATIENTS_WITH_SAMPLES = None -DEFINED_CANCER_TYPES = None mutation_sample_ids = None mutation_file_sample_ids = set() sample_ids_panel_dict = {} @@ -727,7 +726,7 @@ def checkSampleId(self, sample_id, column_number): Return True if the sample id was valid, False otherwise. """ - if sample_id not in DEFINED_SAMPLE_IDS: + if DEFINED_SAMPLE_IDS is not None and sample_id not in DEFINED_SAMPLE_IDS: self.logger.error( 'Sample ID not defined in clinical file', extra={'line_number': self.line_number, @@ -741,7 +740,7 @@ def checkPatientId(self, patient_id, column_number): Return True if the patient id was valid, False otherwise. 
""" - if patient_id not in PATIENTS_WITH_SAMPLES: + if PATIENTS_WITH_SAMPLES is not None and patient_id not in PATIENTS_WITH_SAMPLES: self.logger.error( 'Patient ID not defined in clinical file', extra={'line_number': self.line_number, @@ -2871,7 +2870,7 @@ def checkHeader(self, cols): 'cause': 'SAMPLE_ID'}) # refuse to define attributes also defined in the sample-level file for attribute_id in self.defined_attributes: - if attribute_id in DEFINED_SAMPLE_ATTRIBUTES: + if DEFINED_SAMPLE_ATTRIBUTES is not None and attribute_id in DEFINED_SAMPLE_ATTRIBUTES: # log this as a file-aspecific error, using the base logger self.logger.logger.error( 'Clinical attribute is defined both as sample-level and ' @@ -2912,7 +2911,7 @@ def checkLine(self, data): self.patient_id_lines[value])}) else: self.patient_id_lines[value] = self.line_number - if value not in PATIENTS_WITH_SAMPLES: + if PATIENTS_WITH_SAMPLES is not None and value not in PATIENTS_WITH_SAMPLES: self.logger.warning( 'Clinical data defined for a patient with ' 'no samples', @@ -2979,12 +2978,13 @@ def checkLine(self, data): def onComplete(self): """Perform final validations based on the data parsed.""" - for patient_id in PATIENTS_WITH_SAMPLES: - if patient_id not in self.patient_id_lines: - self.logger.warning( - 'Missing clinical data for a patient associated with ' - 'samples', - extra={'cause': patient_id}) + if PATIENTS_WITH_SAMPLES: + for patient_id in PATIENTS_WITH_SAMPLES: + if patient_id not in self.patient_id_lines: + self.logger.warning( + 'Missing clinical data for a patient associated with ' + 'samples', + extra={'cause': patient_id}) super(PatientClinicalValidator, self).onComplete() @@ -3385,7 +3385,7 @@ def checkLine(self, data): sample_ids_panel_dict[sample_id] = data[self.mutation_stable_id_index - 1] # Sample ID has been removed from list, so subtract 1 position. 
if data[self.mutation_stable_id_index - 1] != 'NA': - if sample_id not in mutation_sample_ids: + if mutation_sample_ids is not None and sample_id not in mutation_sample_ids: self.logger.error('Sample ID has mutation gene panel, but is not in the sequenced case list', extra={'line_number': self.line_number, 'cause': sample_id}) @@ -4717,7 +4717,7 @@ def process_metadata_files(directory, portal_instance, logger, relaxed_mode, str if stable_id in stable_ids: # stable id already used in other meta file, give error: logger.error( - 'stable_id repeated. It should be unique across all files in a study', + 'stable_id repeated. It should be unique across all files in a directory', extra={'filename_': filename, 'cause': stable_id}) else: @@ -4801,11 +4801,6 @@ def process_metadata_files(directory, portal_instance, logger, relaxed_mode, str else: validators_by_type[meta_file_type].append(None) - if study_cancer_type is None: - logger.error( - 'Cancer type needs to be defined for a study. Verify that you have a study file ' - 'and have defined the cancer type correctly.') - # prepend the cancer study id to any case list suffixes defined_case_list_fns = {} if study_id is not None: @@ -4930,7 +4925,7 @@ def processCaseListDirectory(caseListDir, cancerStudyId, logger, for value in seen_sample_ids: # Compare case list sample ids with clinical file - if value not in DEFINED_SAMPLE_IDS: + if DEFINED_SAMPLE_IDS is not None and value not in DEFINED_SAMPLE_IDS: logger.error( 'Sample ID not defined in clinical file', extra={'filename_': case, @@ -5293,8 +5288,11 @@ def load_portal_info(path, logger, offline=False): # ------------------------------------------------------------------------------ def interface(args=None): parser = argparse.ArgumentParser(description='cBioPortal study validator') - parser.add_argument('-s', '--study_directory', - type=str, required=True, help='path to directory.') + data_source_group = parser.add_mutually_exclusive_group() + 
data_source_group.add_argument('-s', '--study_directory', + type=str, help='path to study directory.') + data_source_group.add_argument('-d', '--data_directory', + type=str, help='path to directory.') portal_mode_group = parser.add_mutually_exclusive_group() portal_mode_group.add_argument('-u', '--url_server', type=str, @@ -5341,7 +5339,6 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ attributes are not None. """ - global DEFINED_CANCER_TYPES global DEFINED_SAMPLE_IDS global DEFINED_SAMPLE_ATTRIBUTES global PATIENTS_WITH_SAMPLES @@ -5369,6 +5366,11 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ stable_ids, tags_file_path) = process_metadata_files(study_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) + if study_cancer_type is None: + logger.error( + 'Cancer type needs to be defined for a study. Verify that you have a study file ' + 'and have defined the cancer type correctly.') + # first parse and validate cancer type files studydefined_cancer_types = [] if cbioportal_common.MetaFileTypes.CANCER_TYPE in validators_by_meta_type: @@ -5385,7 +5387,6 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ cancer_type_validators[0].validate() studydefined_cancer_types = ( cancer_type_validators[0].defined_cancer_types) - DEFINED_CANCER_TYPES = studydefined_cancer_types # next check the cancer type of the meta_study file if cbioportal_common.MetaFileTypes.STUDY not in validators_by_meta_type: @@ -5393,7 +5394,7 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ return if portal_instance.cancer_type_dict is not None and not ( study_cancer_type in portal_instance.cancer_type_dict or - study_cancer_type in DEFINED_CANCER_TYPES): + study_cancer_type in studydefined_cancer_types): logger.error( 'Cancer type of study is neither known to the portal nor defined ' 'in a cancer_type file', @@ -5544,7 +5545,19 @@ def 
validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ # additional validation between meta files, after all meta files are processed validate_data_relations(validators_by_meta_type, logger) + logger.info('Validation complete') + +def validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks): + # walk over the meta files in the dir and get properties of the study + validators_by_meta_type, *_ = process_metadata_files(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) + for meta_file_type, validators in validators_by_meta_type.items(): + # if there was no validator for this meta file + if not validators: + continue + logger.info("Validating %s", meta_file_type) + for validator in validators: + validator.validate() logger.info('Validation complete') @@ -5565,7 +5578,15 @@ def main_validate(args): logger.addHandler(exit_status_handler) # process the options - study_dir = args.study_directory + if args.study_directory: + data_dir = args.study_directory + partial_data = False + elif args.data_directory: + data_dir = args.data_directory + partial_data = True + else: + raise RuntimeError("Neither study_directory nor data_directory argument is specified.") + server_url = args.url_server html_output_filename = args.html_table @@ -5578,14 +5599,14 @@ def main_validate(args): output_loglevel = logging.DEBUG # check existence of directory - if not os.path.exists(study_dir): - print('directory cannot be found: ' + study_dir, file=sys.stderr) + if not os.path.exists(data_dir): + print('directory cannot be found: ' + data_dir, file=sys.stderr) return 2 # set default message handler text_handler = logging.StreamHandler(sys.stdout) text_handler.setFormatter( - cbioportal_common.LogfileStyleFormatter(study_dir)) + cbioportal_common.LogfileStyleFormatter(data_dir)) collapsing_text_handler = cbioportal_common.CollapsingLogMessageHandler( capacity=5e5, flushLevel=logging.CRITICAL, @@ -5601,7 +5622,7 @@ def 
main_validate(args): import jinja2 # pylint: disable=import-error html_handler = Jinja2HtmlHandler( - study_dir, + data_dir, html_output_filename, capacity=1e5) # TODO extend CollapsingLogMessageHandler to flush to multiple targets, @@ -5615,7 +5636,7 @@ def main_validate(args): if args.error_file: errfile_handler = logging.FileHandler(args.error_file, 'w') - errfile_handler.setFormatter(ErrorFileFormatter(study_dir)) + errfile_handler.setFormatter(ErrorFileFormatter(data_dir)) # TODO extend CollapsingLogMessageHandler to flush to multiple targets, # and get rid of the duplicated buffering of messages here coll_errfile_handler = cbioportal_common.CollapsingLogMessageHandler( @@ -5644,7 +5665,10 @@ def main_validate(args): # set portal version cbio_version = portal_instance.portal_version - validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) + if partial_data: + validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) + else: + validate_study(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) if html_handler is not None: # flush logger and generate HTML while overriding cbio_version after retrieving it from the API @@ -5670,7 +5694,7 @@ def _get_column_index(parts, name): finally: logging.shutdown() del logging._handlerList[:] # workaround for harmless exceptions on exit - print(('Validation of study {status}.'.format( + print(('Validation of data {status}.'.format( status={0: 'succeeded', 1: 'failed', 2: 'not performed as problems occurred', diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java index 4073dbbb..64e9ca59 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java @@ -32,12 +32,24 @@ package org.mskcc.cbio.portal.dao; -import java.sql.*; -import java.text.*; -import java.util.*; import 
org.apache.commons.lang3.StringUtils; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.CancerStudyTags; +import org.mskcc.cbio.portal.model.ReferenceGenome; +import org.mskcc.cbio.portal.model.TypeOfCancer; + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; /** * Analogous to and replaces the old DaoCancerType. A CancerStudy has a NAME and @@ -61,7 +73,6 @@ public static enum Status { private static final Map byInternalId = new HashMap(); static { - SpringUtil.initDataSource(); reCacheAll(); } diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java index f626d9f4..4d9bfde1 100755 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java @@ -47,12 +47,15 @@ */ public final class DaoClinicalData { - public static final String SAMPLE_TABLE = "clinical_sample"; - public static final String PATIENT_TABLE = "clinical_patient"; + public static final String SAMPLE_ATTRIBUTES_TABLE = "clinical_sample"; + public static final String PATIENT_ATTRIBUTES_TABLE = "clinical_patient"; - private static final String SAMPLE_INSERT = "INSERT INTO " + SAMPLE_TABLE + "(`INTERAL_ID`,`ATTR_ID`,`ATTR_VALUE` VALUES(?,?,?)"; - private static final String PATIENT_INSERT = "INSERT INTO " + PATIENT_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE` VALUES(?,?,?)"; + private static final String SAMPLE_ATTRIBUTES_INSERT = "INSERT INTO " + SAMPLE_ATTRIBUTES_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; + private static 
final String PATIENT_ATTRIBUTES_INSERT = "INSERT INTO " + PATIENT_ATTRIBUTES_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; + private static final String SAMPLE_ATTRIBUTES_DELETE = "DELETE FROM " + SAMPLE_ATTRIBUTES_TABLE + " WHERE `INTERNAL_ID` = ?"; + + private static final String PATIENT_ATTRIBUTES_DELETE = "DELETE FROM " + PATIENT_ATTRIBUTES_TABLE + " WHERE `INTERNAL_ID` = ?"; private static final Map sampleAttributes = new HashMap(); private static final Map patientAttributes = new HashMap(); @@ -61,8 +64,8 @@ private DaoClinicalData() {} public static synchronized void reCache() { clearCache(); - cacheAttributes(SAMPLE_TABLE, sampleAttributes); - cacheAttributes(PATIENT_TABLE, patientAttributes); + cacheAttributes(SAMPLE_ATTRIBUTES_TABLE, sampleAttributes); + cacheAttributes(PATIENT_ATTRIBUTES_TABLE, patientAttributes); } private static void clearCache() @@ -95,13 +98,13 @@ private static void cacheAttributes(String table, Map cache) public static int addSampleDatum(int internalSampleId, String attrId, String attrVal) throws DaoException { sampleAttributes.put(attrId, attrId); - return addDatum(SAMPLE_INSERT, SAMPLE_TABLE, internalSampleId, attrId, attrVal); + return addDatum(SAMPLE_ATTRIBUTES_INSERT, SAMPLE_ATTRIBUTES_TABLE, internalSampleId, attrId, attrVal); } public static int addPatientDatum(int internalPatientId, String attrId, String attrVal) throws DaoException { patientAttributes.put(attrId, attrId); - return addDatum(PATIENT_INSERT, PATIENT_TABLE, internalPatientId, attrId, attrVal); + return addDatum(PATIENT_ATTRIBUTES_INSERT, PATIENT_ATTRIBUTES_TABLE, internalPatientId, attrId, attrVal); } public static int addDatum(String query, String tableName, @@ -126,7 +129,7 @@ public static int addDatum(String query, String tableName, pstmt.setString(3, attrVal); int toReturn = pstmt.executeUpdate(); - if (tableName.equals(PATIENT_TABLE)) { + if (tableName.equals(PATIENT_ATTRIBUTES_TABLE)) { patientAttributes.put(attrId, attrId); } else { 
@@ -165,10 +168,10 @@ private static int getInternalCancerStudyId(String cancerStudyId) throws DaoExce private static String getAttributeTable(String attrId) throws DaoException { if (sampleAttributes.containsKey(attrId)) { - return SAMPLE_TABLE; + return SAMPLE_ATTRIBUTES_TABLE; } else if (patientAttributes.containsKey(attrId)) { - return (PATIENT_TABLE); + return (PATIENT_ATTRIBUTES_TABLE); } else { return null; @@ -209,7 +212,7 @@ public static List getDataByPatientId(int cancerStudyId, String pa { List internalIds = new ArrayList(); internalIds.add(DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudyId, patientId).getInternalId()); - return getDataByInternalIds(cancerStudyId, PATIENT_TABLE, internalIds); + return getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, internalIds); } private static List getDataByInternalIds(int internalCancerStudyId, String table, List internalIds) throws DaoException @@ -247,7 +250,7 @@ public static List getData(String cancerStudyId) throws DaoExcepti public static List getData(int cancerStudyId) throws DaoException { - return getDataByInternalIds(cancerStudyId, PATIENT_TABLE, getPatientIdsByCancerStudy(cancerStudyId)); + return getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, getPatientIdsByCancerStudy(cancerStudyId)); } private static List getPatientIdsByCancerStudy(int cancerStudyId) @@ -281,7 +284,7 @@ public static List getData(int cancerStudyId, Collection p patientIdsInt.add(DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudyId, patientId).getInternalId()); } - return getDataByInternalIds(cancerStudyId, PATIENT_TABLE, patientIdsInt); + return getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, patientIdsInt); } public static List getSampleAndPatientData(int cancerStudyId, Collection sampleIds) throws DaoException @@ -302,9 +305,9 @@ public static List getSampleAndPatientData(int cancerStudyId, Coll } sampleIdsForPatient.add(sampleId); } - List sampleClinicalData = 
getDataByInternalIds(cancerStudyId, SAMPLE_TABLE, sampleIdsInt); + List sampleClinicalData = getDataByInternalIds(cancerStudyId, SAMPLE_ATTRIBUTES_TABLE, sampleIdsInt); - List patientClinicalData = getDataByInternalIds(cancerStudyId, PATIENT_TABLE, patientIdsInt); + List patientClinicalData = getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, patientIdsInt); for (ClinicalData cd : patientClinicalData) { String stablePatientId = cd.getStableId(); Set sampleIdsForPatient = mapPatientIdSampleIds.get(stablePatientId); @@ -336,9 +339,9 @@ public static List getSampleAndPatientData(int cancerStudyId, Coll } sampleIdsForPatient.add(sampleId); } - List sampleClinicalData = getDataByInternalIds(cancerStudyId, SAMPLE_TABLE, sampleIdsInt, Collections.singletonList(attr.getAttrId())); + List sampleClinicalData = getDataByInternalIds(cancerStudyId, SAMPLE_ATTRIBUTES_TABLE, sampleIdsInt, Collections.singletonList(attr.getAttrId())); - List patientClinicalData = getDataByInternalIds(cancerStudyId, PATIENT_TABLE, patientIdsInt, Collections.singletonList(attr.getAttrId())); + List patientClinicalData = getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, patientIdsInt, Collections.singletonList(attr.getAttrId())); for (ClinicalData cd : patientClinicalData) { String stablePatientId = cd.getStableId(); Set sampleIdsForPatient = mapPatientIdSampleIds.get(stablePatientId); @@ -361,7 +364,61 @@ public static List getSampleData(int cancerStudyId, Collection sampleInternalIds, String attrId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + try { + con = JdbcUtil.getDbConnection(DaoClinicalData.class); + pstmt = con.prepareStatement("DELETE FROM " + SAMPLE_ATTRIBUTES_TABLE + + " WHERE `ATTR_ID` = ? 
AND `INTERNAL_ID` IN (" + + String.join(",", Collections.nCopies(sampleInternalIds.size(), "?")) + + ")"); + int parameterIndex = 1; + pstmt.setString(parameterIndex++, attrId); + for (Integer sampleInternalId : sampleInternalIds) { + pstmt.setInt(parameterIndex++, sampleInternalId); + } + pstmt.executeUpdate(); + } + catch (SQLException e) { + throw new DaoException(e); + } + finally { + JdbcUtil.closeAll(DaoClinicalData.class, con, pstmt, null); + } } public static List getSampleData(int cancerStudyId, Collection sampleIds) throws DaoException @@ -370,7 +427,7 @@ public static List getSampleData(int cancerStudyId, Collection getData(String cancerStudyId, Collection patientIds, ClinicalAttribute attr) throws DaoException @@ -381,7 +438,7 @@ public static List getData(String cancerStudyId, Collection getDataByInternalIds(int internalCancerStudyId, String table, List internalIds, Collection attributeIds) throws DaoException @@ -432,7 +489,7 @@ public static List getDataByAttributeIds(int internalCancerStudyId while(rs.next()) { Integer patientId = rs.getInt("INTERNAL_ID"); if (patients.contains(patientId)) { - clinicals.add(extract(PATIENT_TABLE, internalCancerStudyId, rs)); + clinicals.add(extract(PATIENT_ATTRIBUTES_TABLE, internalCancerStudyId, rs)); } } } @@ -457,7 +514,7 @@ private static ClinicalData extract(String table, int internalCancerStudyId, Res private static String getStableIdFromInternalId(String table, int internalId) { - if (table.equals(SAMPLE_TABLE)) { + if (table.equals(SAMPLE_ATTRIBUTES_TABLE)) { return DaoSample.getSampleById(internalId).getStableId(); } else { @@ -585,13 +642,13 @@ public static List getDataByPatientIds(int cancerStudyId, List getPatientsByAttribute(int cancerStudy, String paramName, String paramValue) throws DaoException { - List ids = getIdsByAttribute(cancerStudy, paramName, paramValue, PATIENT_TABLE); + List ids = getIdsByAttribute(cancerStudy, paramName, paramValue, PATIENT_ATTRIBUTES_TABLE); return 
InternalIdUtil.getPatientsById(ids); } public static List getSamplesByAttribute(int cancerStudy, String paramName, String paramValue) throws DaoException { - List ids = getIdsByAttribute(cancerStudy, paramName, paramValue, SAMPLE_TABLE); + List ids = getIdsByAttribute(cancerStudy, paramName, paramValue, SAMPLE_ATTRIBUTES_TABLE); return InternalIdUtil.getSamplesById(ids); } @@ -660,4 +717,22 @@ public static Map> getCancerTypeInfoBySamples(List s JdbcUtil.closeAll(DaoClinicalData.class, con, pstmt, rs); } } + + public static void removePatientAttributesData(int internalPatientId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoClinicalData.class); + pstmt = con.prepareStatement(PATIENT_ATTRIBUTES_DELETE); + pstmt.setInt(1, internalPatientId); + pstmt.executeUpdate(); + } + catch (SQLException e) { + throw new DaoException(e); + } + finally { + JdbcUtil.closeAll(DaoClinicalData.class, con, pstmt, rs); + } + } } \ No newline at end of file diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java index 21722902..714769bd 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java @@ -32,6 +32,9 @@ package org.mskcc.cbio.portal.dao; +import org.apache.commons.lang3.StringUtils; +import org.mskcc.cbio.portal.model.ClinicalEvent; + import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; @@ -40,8 +43,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.mskcc.cbio.portal.model.ClinicalEvent; /** * @@ -52,7 +53,7 @@ private DaoClinicalEvent() {} public static int addClinicalEvent(ClinicalEvent clinicalEvent) { if (!MySQLbulkLoader.isBulkLoad()) { - throw new IllegalStateException("Only buld load mode 
is allowed for importing clinical events"); + throw new IllegalStateException("Only bulk load mode is allowed for importing clinical events"); } MySQLbulkLoader.getMySQLbulkLoader("clinical_event").insertRecord( @@ -202,6 +203,23 @@ public static void deleteByCancerStudyId(int cancerStudyId) throws DaoException JdbcUtil.closeAll(DaoClinicalEvent.class, con, pstmt, rs); } } + + public static void deleteByPatientId(int patientId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoClinicalEvent.class); + + pstmt = con.prepareStatement("DELETE FROM clinical_event WHERE clinical_event.PATIENT_ID = ?"); + pstmt.setInt(1, patientId); + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoClinicalEvent.class, con, pstmt, rs); + } + } public static void deleteAllRecords() throws DaoException { Connection con = null; diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java index e7785d4f..fbdbc6e1 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java @@ -119,7 +119,35 @@ private static long addCnaEventDirectly(CnaEvent cnaEvent) throws DaoException { JdbcUtil.closeAll(DaoCnaEvent.class, con, pstmt, rs); } } - + + public static void removeSampleCnaEvents(int cnaProfileId, List sampleIds) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoCnaEvent.class); + pstmt = con.prepareStatement + ("DELETE sample_cna_event, alteration_driver_annotation" + + " FROM sample_cna_event" + + " LEFT JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + + " AND alteration_driver_annotation.`SAMPLE_ID` = 
sample_cna_event.`SAMPLE_ID`" + + " AND alteration_driver_annotation.`GENETIC_PROFILE_ID` = sample_cna_event.`GENETIC_PROFILE_ID`" + + " WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ? AND sample_cna_event.`SAMPLE_ID` IN (" + + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + + ")"); + int parameterIndex = 1; + pstmt.setInt(parameterIndex++, cnaProfileId); + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); + } + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoCnaEvent.class, con, pstmt, rs); + } + } + public static Map> getSamplesWithAlterations( Collection eventIds) throws DaoException { return getSamplesWithAlterations(StringUtils.join(eventIds, ",")); diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java index a0113a44..bb099498 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java @@ -67,7 +67,16 @@ public static int addCopyNumberSegment(CopyNumberSegment seg) throws DaoExceptio } } - public static void createFractionGenomeAlteredClinicalData(int cancerStudyId) throws DaoException { + /** + * Ensures FRACTION_GENOME_ALTERED clinical sample attribute is created and up to date. + * @param cancerStudyId - id of the study to create the clinical attribute for + * @param sampleIds - specifies for which samples to calculate this attribute. 
+ * if sampleIds=null, the calculation is done for all samples in the study + * @param updateMode - if true, updates the attribute if it exists + * @throws DaoException + */ + + public static void createFractionGenomeAlteredClinicalData(int cancerStudyId, Set sampleIds, boolean updateMode) throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; @@ -80,8 +89,15 @@ public static void createFractionGenomeAlteredClinicalData(int cancerStudyId) th "AS c2 WHERE c2.`CANCER_STUDY_ID` = c1.`CANCER_STUDY_ID` AND c2.`SAMPLE_ID` = c1.`SAMPLE_ID` AND " + "ABS(c2.`SEGMENT_MEAN`) >= 0.2) / SUM(`END`-`START`)) AS `VALUE` FROM `copy_number_seg` AS c1 , `cancer_study` " + "WHERE c1.`CANCER_STUDY_ID` = cancer_study.`CANCER_STUDY_ID` AND cancer_study.`CANCER_STUDY_ID`=? " + - "GROUP BY cancer_study.`CANCER_STUDY_ID` , `SAMPLE_ID` HAVING SUM(`END`-`START`) > 0;"); - pstmt.setInt(1, cancerStudyId); + (sampleIds == null ? "" : ("AND `SAMPLE_ID` IN ("+ String.join(",", Collections.nCopies(sampleIds.size(), "?")) + ") ")) + +"GROUP BY cancer_study.`CANCER_STUDY_ID` , `SAMPLE_ID` HAVING SUM(`END`-`START`) > 0;"); + int parameterIndex = 1; + pstmt.setInt(parameterIndex++, cancerStudyId); + if (sampleIds != null) { + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); + } + } Map fractionGenomeAltereds = new HashMap(); rs = pstmt.executeQuery(); while (rs.next()) { @@ -94,7 +110,10 @@ public static void createFractionGenomeAlteredClinicalData(int cancerStudyId) th false, "20", cancerStudyId); DaoClinicalAttributeMeta.addDatum(attr); } - + + if (updateMode && !fractionGenomeAltereds.isEmpty()) { // NOTE(review): with no computed values there is nothing to replace, and the delete would presumably be built with an empty IN () list + DaoClinicalData.removeSampleAttributesData(fractionGenomeAltereds.keySet(), FRACTION_GENOME_ALTERED_ATTR_ID); + } for (Map.Entry fractionGenomeAltered : fractionGenomeAltereds.entrySet()) { DaoClinicalData.addSampleDatum(fractionGenomeAltered.getKey(), FRACTION_GENOME_ALTERED_ATTR_ID, fractionGenomeAltered.getValue()); } @@ -283,4 +302,37 @@ public static boolean
segmentDataExistForSample(int cancerStudyId, int sampleId) JdbcUtil.closeAll(DaoCopyNumberSegment.class, con, pstmt, rs); } } + + /** + * Deletes the copy_number_seg rows of the given samples within one study. + * @param cancerStudyId - id of the study the segments belong to + * @param sampleIds - ids of the samples (SAMPLE_ID column) whose segments are deleted; + * if empty, nothing is deleted + * @throws DaoException + */ + public static void deleteSegmentDataForSamples(int cancerStudyId, Set sampleIds) throws DaoException { + if (sampleIds.isEmpty()) { + // nothing to delete; an empty id list would also render "IN ()", which is not valid MySQL + return; + } + Connection con = null; + PreparedStatement pstmt = null; + try { + con = JdbcUtil.getDbConnection(DaoCopyNumberSegment.class); + pstmt = con.prepareStatement("DELETE FROM `copy_number_seg`" + + " WHERE `CANCER_STUDY_ID`= ?" + + " AND `SAMPLE_ID` IN (" + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + + ")"); + int parameterIndex = 1; + pstmt.setInt(parameterIndex++, cancerStudyId); + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); + } + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoCopyNumberSegment.class, con, pstmt, null); + } + } } diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegmentFile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegmentFile.java index ef0011a4..cf2332f6 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegmentFile.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegmentFile.java @@ -65,7 +65,7 @@ public static int addCopyNumberSegmentFile(CopyNumberSegmentFile copySegFile) th } catch (SQLException e) { throw new DaoException(e); } finally { - JdbcUtil.closeAll(DaoCopyNumberSegment.class, con, pstmt, rs); + JdbcUtil.closeAll(DaoCopyNumberSegmentFile.class, con, pstmt, rs); } } @@ -86,6 +86,9 @@ public static CopyNumberSegmentFile getCopyNumberSegmentFile(int cancerStudyId) cnsf.referenceGenomeId = CopyNumberSegmentFile.ReferenceGenomeId.valueOf(rs.getString("REFERENCE_GENOME_ID")); cnsf.description = rs.getString("DESCRIPTION"); cnsf.filename = rs.getString("FILENAME"); + if (rs.next()) { + throw new SQLException("More than one row was returned."); + } return cnsf; } return null; diff --git
a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java index 1f58acb7..d625e724 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java @@ -46,7 +46,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.DataValidator; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; /** * A Utility Class that speeds access to Gene Info. @@ -90,7 +92,7 @@ private synchronized void fillCache() { if (line.startsWith("#")) { continue; } - String[] parts = line.trim().split("\t",-1); + String[] parts = TsvUtil.splitTsvLine(line); CanonicalGene gene = getGene(Long.parseLong(parts[1])); if (gene==null) { ProgressMonitor.logWarning(line+" in config file [resources" + GENE_SYMBOL_DISAMBIGUATION_FILE + @@ -322,7 +324,7 @@ public List guessGene(String geneId, String chr) { } CanonicalGene gene; - if (geneId.matches("[0-9]+")) { // likely to be a entrez gene id + if (DataValidator.isValidNumericSequence(geneId)) { // likely to be a entrez gene id gene = getGene(Integer.parseInt(geneId)); if (gene!=null) { return Collections.singletonList(gene); diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index 25bef125..25cd987a 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -65,12 +65,10 @@ private DaoGeneticAlteration() { * Gets Instance of Dao Object. (Singleton pattern). * * @return DaoGeneticAlteration Object. - * @throws DaoException Dao Initialization Error. 
*/ - public static DaoGeneticAlteration getInstance() throws DaoException { + public static DaoGeneticAlteration getInstance() { if (daoGeneticAlteration == null) { daoGeneticAlteration = new DaoGeneticAlteration(); - } return daoGeneticAlteration; @@ -96,7 +94,7 @@ public int addGeneticAlterations(int geneticProfileId, long entrezGeneId, String throws DaoException { return addGeneticAlterationsForGeneticEntity(geneticProfileId, DaoGeneOptimized.getGeneticEntityId(entrezGeneId), values); } - + public int addGeneticAlterationsForGeneticEntity(int geneticProfileId, int geneticEntityId, String[] values) throws DaoException { @@ -238,9 +236,21 @@ public HashMap> getGeneticAlterationMapForEntit HashMap mapSampleValue = new HashMap(); int geneticEntityId = rs.getInt("GENETIC_ENTITY_ID"); String values = rs.getString("VALUES"); - //hm.debug.. - String valueParts[] = values.split(DELIM); - for (int i=0; i getProcessedAlterationData( rs = pstmt.executeQuery(); while (rs.next()) { long entrezGeneId = DaoGeneOptimized.getEntrezGeneId(rs.getInt("GENETIC_ENTITY_ID")); - String[] values = rs.getString("VALUES").split(DELIM); + String valuesString = rs.getString("VALUES"); + if (valuesString.endsWith(DELIM)) { + valuesString = valuesString.substring(0, valuesString.length() - DELIM.length()); + } + String[] values = valuesString.split(DELIM, -1); ObjectNode datum = processor.process( entrezGeneId, values, diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticProfile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticProfile.java index 0326fb2f..baf5f530 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticProfile.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticProfile.java @@ -32,10 +32,17 @@ package org.mskcc.cbio.portal.dao; -import java.sql.*; -import java.util.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.SpringUtil; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import 
org.mskcc.cbio.portal.model.GeneticProfile; + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * Analogous to and replaces the old DaoCancerType. A CancerStudy has a NAME and @@ -52,7 +59,6 @@ private DaoGeneticProfile() {} private static final Map> byStudy = new HashMap>(); static { - SpringUtil.initDataSource(); reCache(); } diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java index 8adbdadd..730a288d 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java @@ -67,6 +67,8 @@ public final class DaoMutation { public static final String NAN = "NaN"; private static final String MUTATION_COUNT_ATTR_ID = "MUTATION_COUNT"; + private static final String DELETE_ALTERATION_DRIVER_ANNOTATION = "DELETE from alteration_driver_annotation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"; + private static final String DELETE_MUTATION = "DELETE from mutation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"; public static int addMutation(ExtendedMutation mutation, boolean newMutationEvent) throws DaoException { if (!MySQLbulkLoader.isBulkLoad()) { @@ -475,18 +477,10 @@ public static ArrayList getMutations (long entrezGeneId, Strin return mutationList; } - /** - * @deprecated We believe that this method is no longer called by any part of the codebase, and it will soon be deleted. - */ - @Deprecated public static ArrayList getMutations (int geneticProfileId, int sampleId) throws DaoException { return getMutations(geneticProfileId, Arrays.asList(Integer.valueOf(sampleId))); } - /** - * @deprecated We believe that this method is no longer called by any part of the codebase, and it will soon be deleted. 
- */ - @Deprecated public static ArrayList getMutations (int geneticProfileId, List sampleIds) throws DaoException { Connection con = null; PreparedStatement pstmt = null; @@ -1501,18 +1495,29 @@ protected static String boolToStr(boolean value) return value ? "1" : "0"; } - /** - * @deprecated We believe that this method is no longer called by any part of the codebase, and it will soon be deleted. - */ - @Deprecated - public static void deleteAllRecordsInGeneticProfile(long geneticProfileId) throws DaoException { + /** + * Deletes the mutation rows of one sample in one genetic profile, after first deleting + * the alteration_driver_annotation rows of the same profile and sample. + * @param geneticProfileId - id of the genetic profile to delete from + * @param internalSampleId - internal id of the sample whose records are deleted + * @throws DaoException if a SQLException occurs while deleting + */ + public static void deleteAllRecordsInGeneticProfileForSample(long geneticProfileId, long internalSampleId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoMutation.class); - pstmt = con.prepareStatement("DELETE from mutation WHERE GENETIC_PROFILE_ID=?"); + pstmt = con.prepareStatement(DELETE_ALTERATION_DRIVER_ANNOTATION); + pstmt.setLong(1, geneticProfileId); + pstmt.setLong(2, internalSampleId); + pstmt.executeUpdate(); + // close the first statement now: pstmt is reused below, after which it can no longer be closed through this variable + pstmt.close(); + + pstmt = con.prepareStatement(DELETE_MUTATION); pstmt.setLong(1, geneticProfileId); + pstmt.setLong(2, internalSampleId); pstmt.executeUpdate(); } catch (SQLException e) { throw new DaoException(e); diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoReferenceGenome.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoReferenceGenome.java index 1d9bb499..9f33d7e4 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoReferenceGenome.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoReferenceGenome.java @@ -17,11 +17,15 @@ package org.mskcc.cbio.portal.dao; -import java.sql.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.SpringUtil; +import org.mskcc.cbio.portal.model.ReferenceGenome; -import java.util.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.HashMap; +import java.util.Map; /** @@
-36,7 +40,6 @@ public final class DaoReferenceGenome { private static final Map genomeInternalIds = new HashMap(); static { - SpringUtil.initDataSource(); reCache(); } diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java index c3200389..8950b7b5 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java @@ -42,31 +42,38 @@ */ public class DaoSampleList { - /** + private static final String DELETE_SAMPLE_LIST_LIST = "DELETE FROM sample_list_list WHERE `LIST_ID` = ?"; + + /** * Adds record to sample_list table. */ public int addSampleList(SampleList sampleList) throws DaoException { Connection con = null; PreparedStatement pstmt = null; - ResultSet rs = null; int rows; try { con = JdbcUtil.getDbConnection(DaoSampleList.class); pstmt = con.prepareStatement("INSERT INTO sample_list (`STABLE_ID`, `CANCER_STUDY_ID`, `NAME`, `CATEGORY`," + - "`DESCRIPTION`)" + " VALUES (?,?,?,?,?)"); + "`DESCRIPTION`)" + " VALUES (?,?,?,?,?)", Statement.RETURN_GENERATED_KEYS); pstmt.setString(1, sampleList.getStableId()); pstmt.setInt(2, sampleList.getCancerStudyId()); pstmt.setString(3, sampleList.getName()); pstmt.setString(4, sampleList.getSampleListCategory().getCategory()); pstmt.setString(5, sampleList.getDescription()); rows = pstmt.executeUpdate(); - int listListRow = addSampleListList(sampleList, con); - rows = (listListRow != -1) ? 
(rows + listListRow) : rows; + try (ResultSet generatedKey = pstmt.getGeneratedKeys()) { + if (generatedKey.next()) { + int listId = generatedKey.getInt(1); + addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con); + } else { + throw new DaoException("Creating sample list failed, no ID obtained."); + } + } } catch (SQLException e) { throw new DaoException(e); } finally { - JdbcUtil.closeAll(DaoSampleList.class, con, pstmt, rs); + JdbcUtil.closeAll(DaoSampleList.class, con, pstmt, null); } return rows; @@ -204,41 +211,12 @@ public void deleteAllRecords() throws DaoException { } } - /** - * Given a patient list, gets list id from sample_list table - */ - private int getSampleListId(SampleList sampleList) throws DaoException { - Connection con = null; - PreparedStatement pstmt = null; - ResultSet rs = null; - try { - con = JdbcUtil.getDbConnection(DaoSampleList.class); - pstmt = con.prepareStatement("SELECT LIST_ID FROM sample_list WHERE STABLE_ID=?"); - pstmt.setString(1, sampleList.getStableId()); - rs = pstmt.executeQuery(); - if (rs.next()) { - return rs.getInt("LIST_ID"); - } - return -1; - } catch (SQLException e) { - throw new DaoException(e); - } finally { - JdbcUtil.closeAll(DaoSampleList.class, con, pstmt, rs); - } - } - /** * Adds record to sample_list_list. 
*/ - private int addSampleListList(SampleList sampleList, Connection con) throws DaoException { + private int addSampleListList(int cancerStudyId, int sampleListId, List sampleList, Connection con) throws DaoException { - // get patient list id - int sampleListId = getSampleListId(sampleList); - if (sampleListId == -1) { - return -1; - } - - if (sampleList.getSampleList().isEmpty()) { + if (sampleList.isEmpty()) { return 0; } @@ -248,23 +226,21 @@ private int addSampleListList(SampleList sampleList, Connection con) throws DaoE try { StringBuilder sql = new StringBuilder("INSERT INTO sample_list_list (`LIST_ID`, `SAMPLE_ID`) VALUES "); // NOTE - as of 12/12/14, patient lists contain sample ids - for (String sampleId : sampleList.getSampleList()) { - Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(sampleList.getCancerStudyId(), sampleId); + for (String sampleId : sampleList) { + Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudyId, sampleId); if (sample == null) { - System.out.println("null sample: " + sampleId + ":" + sampleList.getStableId()); + System.out.println("null sample: " + sampleId); ++skippedPatients; continue; } sql.append("('").append(sampleListId).append("','").append(sample.getInternalId()).append("'),"); } - if (skippedPatients == sampleList.getSampleList().size()) { + if (skippedPatients == sampleList.size()) { return 0; } sql.deleteCharAt(sql.length()-1); pstmt = con.prepareStatement(sql.toString()); return pstmt.executeUpdate(); - } catch (NullPointerException e) { - throw new DaoException(e); } catch (SQLException e) { throw new DaoException(e); } finally { @@ -272,6 +248,24 @@ private int addSampleListList(SampleList sampleList, Connection con) throws DaoE } } + public void updateSampleListList(SampleList sampleList) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + try { + con = JdbcUtil.getDbConnection(DaoSampleList.class); + pstmt = 
con.prepareStatement(DELETE_SAMPLE_LIST_LIST); + pstmt.setInt(1, sampleList.getSampleListId()); + pstmt.executeUpdate(); + + addSampleListList(sampleList.getCancerStudyId(), sampleList.getSampleListId(), sampleList.getSampleList(), con); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoSampleList.class, con, pstmt, null); + } + } + + /** * Given a patient list object (thus patient list id) gets patient list list. */ @@ -287,7 +281,8 @@ private ArrayList getSampleListList(SampleList sampleList, Connection co ArrayList patientIds = new ArrayList(); while (rs.next()) { // NOTE - as of 12/12/14, patient lists contain sample ids - Sample sample = DaoSample.getSampleById(rs.getInt("SAMPLE_ID")); + int sample_id = rs.getInt("SAMPLE_ID"); + Sample sample = DaoSample.getSampleById(sample_id); patientIds.add(sample.getStableId()); } return patientIds; diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java index acfb299d..5a6a9e5c 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java @@ -32,12 +32,22 @@ package org.mskcc.cbio.portal.dao; -import org.mskcc.cbio.portal.model.*; - import org.apache.commons.lang3.StringUtils; - -import java.sql.*; -import java.util.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; /** * Data access object for sample_profile table @@ -50,98 +60,62 @@ public final class DaoSampleProfile { private 
DaoSampleProfile() {} private static final int NO_SUCH_PROFILE_ID = -1; - private static final String TABLE_NAME = "sample_profile"; - - public static int addSampleProfile(Integer sampleId, Integer geneticProfileId, Integer panelId) throws DaoException { - if (MySQLbulkLoader.isBulkLoad()) { - - // Add new record using bulk loader. Order of fields is: - // 1. sample ID - // 2. genetic Profile ID - // 3. gene panel ID - if (panelId != null) { - MySQLbulkLoader.getMySQLbulkLoader(TABLE_NAME).insertRecord( - Integer.toString(sampleId), - Integer.toString(geneticProfileId), - Integer.toString(panelId)); - } else { - MySQLbulkLoader.getMySQLbulkLoader(TABLE_NAME).insertRecord( - Integer.toString(sampleId), - Integer.toString(geneticProfileId), - null); - } - return 1; - } + public static void upsertSampleToProfileMapping(Collection sampleIds, Integer geneticProfileId, Integer panelId) throws DaoException { + upsertSampleToProfileMapping( + sampleIds.stream() + .map(sampleId -> new SampleProfileTuple(geneticProfileId, sampleId, panelId)).toList()); + } - // Add new record without using bulk loader + public record SampleProfileTuple(int geneticProfileId, int sampleId, Integer panelId) {} + + public static void upsertSampleToProfileMapping(Collection idTuples) throws DaoException { + if (idTuples.isEmpty()) { + return; + } Connection con = null; PreparedStatement pstmt = null; - ResultSet rs = null; - try { - if (!sampleExistsInGeneticProfile(sampleId, geneticProfileId)) { - con = JdbcUtil.getDbConnection(DaoSampleProfile.class); - pstmt = con.prepareStatement - ("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`) " - + "VALUES (?,?,?)"); - pstmt.setInt(1, sampleId); - pstmt.setInt(2, geneticProfileId); - if (panelId != null) { - pstmt.setInt(3, panelId); - } - else { - pstmt.setNull(3, java.sql.Types.INTEGER); + con = JdbcUtil.getDbConnection(DaoSampleProfile.class); + + pstmt = con.prepareStatement + ("INSERT INTO sample_profile (`SAMPLE_ID`, 
`GENETIC_PROFILE_ID`, `PANEL_ID`)" + + " VALUES" + + String.join(",", Collections.nCopies(idTuples.size(), " (?,?,?)")) + + " ON DUPLICATE KEY UPDATE `PANEL_ID` = VALUES(`PANEL_ID`);"); + int parameterIndex = 1; + for (SampleProfileTuple idTuple : idTuples) { + pstmt.setInt(parameterIndex++, idTuple.sampleId()); + pstmt.setInt(parameterIndex++, idTuple.geneticProfileId()); + if (idTuple.panelId() != null) { + pstmt.setInt(parameterIndex, idTuple.panelId()); + } else { + pstmt.setNull(parameterIndex, java.sql.Types.INTEGER); } - return pstmt.executeUpdate(); - } else { - // This should be an error, because the record already exists. - return 0; + parameterIndex++; } - } catch (NullPointerException e) { - throw new DaoException(e); + pstmt.executeUpdate(); } catch (SQLException e) { throw new DaoException(e); } finally { - JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs); + JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, null); } } - public static void updateSampleProfile(Integer sampleId, Integer geneticProfileId, Integer panelId) throws DaoException { - /** - * Update a record in the sample_profile table when adding gene panel field from the sample profile matrix. - * Can not use the bulk loader, because the sample might already be added, which requires an UPDATE of the - * record. 
- */ + public static boolean sampleExistsInGeneticProfile(int sampleId, int geneticProfileId) + throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoSampleProfile.class); - if (!sampleExistsInGeneticProfile(sampleId, geneticProfileId)) { - - pstmt = con.prepareStatement - ("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`) VALUES (?,?,?)"); - pstmt.setInt(1, sampleId); - pstmt.setInt(2, geneticProfileId); - if (panelId != null) { - pstmt.setInt(3, panelId); - } else { - pstmt.setNull(3, java.sql.Types.INTEGER); - } - } else { - pstmt = con.prepareStatement - ("UPDATE `sample_profile` SET `PANEL_ID` = ? WHERE (`SAMPLE_ID` = ? AND `GENETIC_PROFILE_ID` = ?)"); - if (panelId != null) { - pstmt.setInt(1, panelId); - } else { - pstmt.setNull(1, java.sql.Types.INTEGER); - } - pstmt.setInt(2, sampleId); - pstmt.setInt(3, geneticProfileId); - } - pstmt.executeUpdate(); + pstmt = con.prepareStatement + ("SELECT * FROM sample_profile WHERE SAMPLE_ID = ? AND GENETIC_PROFILE_ID = ?"); + pstmt.setInt(1, sampleId); + pstmt.setInt(2, geneticProfileId); + rs = pstmt.executeQuery(); + return (rs.next()); } catch (NullPointerException e) { throw new DaoException(e); } catch (SQLException e) { @@ -150,8 +124,8 @@ public static void updateSampleProfile(Integer sampleId, Integer geneticProfileI JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs); } } - - public static boolean sampleExistsInGeneticProfile(int sampleId, int geneticProfileId) + + public static Integer getPanelId(int sampleId, int geneticProfileId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; @@ -160,14 +134,20 @@ public static boolean sampleExistsInGeneticProfile(int sampleId, int geneticProf try { con = JdbcUtil.getDbConnection(DaoSampleProfile.class); pstmt = con.prepareStatement - ("SELECT * FROM sample_profile WHERE SAMPLE_ID = ? 
AND GENETIC_PROFILE_ID = ?"); + ("SELECT PANEL_ID FROM sample_profile WHERE SAMPLE_ID = ? AND GENETIC_PROFILE_ID = ?"); pstmt.setInt(1, sampleId); pstmt.setInt(2, geneticProfileId); rs = pstmt.executeQuery(); - return (rs.next()); - } catch (NullPointerException e) { - throw new DaoException(e); - } catch (SQLException e) { + if (rs.next()) { + int panelId = rs.getInt(1); + if (rs.wasNull()) { + return null; + } + return panelId; + } else { + throw new NoSuchElementException("No sample_profile with SAMPLE_ID=" + sampleId + " and GENETIC_PROFILE_ID=" + geneticProfileId); + } + } catch (NoSuchElementException | SQLException e) { throw new DaoException(e); } finally { JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs); diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java index a11026a8..8940e06c 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java @@ -29,7 +29,9 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Set; public class DaoStructuralVariant { @@ -151,6 +153,33 @@ public static void addStructuralVariantToBulkLoader(StructuralVariant structural } } + public static void deleteStructuralVariants(int geneticProfileId, Set sampleIds) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoGene.class); + pstmt = con.prepareStatement("DELETE structural_variant, alteration_driver_annotation" + + " FROM structural_variant" + + " LEFT JOIN alteration_driver_annotation" + + " ON alteration_driver_annotation.GENETIC_PROFILE_ID = structural_variant.GENETIC_PROFILE_ID" + + " AND alteration_driver_annotation.SAMPLE_ID = structural_variant.SAMPLE_ID" + + " WHERE 
structural_variant.GENETIC_PROFILE_ID=? AND structural_variant.SAMPLE_ID IN (" + + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + + ")"); + int parameterIndex = 1; + pstmt.setInt(parameterIndex++, geneticProfileId); + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); + } + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoGene.class, con, pstmt, rs); + } + } + public static long getLargestInternalId() throws DaoException { Connection con = null; PreparedStatement pstmt = null; diff --git a/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java b/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java index 40f9e9ed..e75931b2 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java @@ -35,10 +35,12 @@ import java.sql.*; import java.util.*; import javax.sql.DataSource; -import org.apache.commons.dbcp2.BasicDataSource; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.mskcc.cbio.portal.util.*; +import org.springframework.jdbc.datasource.DataSourceTransactionManager; +import org.springframework.jdbc.datasource.TransactionAwareDataSourceProxy; +import org.springframework.transaction.support.TransactionTemplate; /** * Connection Utility for JDBC. 
@@ -50,6 +52,8 @@ public class JdbcUtil { private static DataSource dataSource; private static Map activeConnectionCount = new HashMap(); // keep track of the number of active connection per class/requester private static final Logger LOG = LoggerFactory.getLogger(JdbcUtil.class); + private static DataSourceTransactionManager transactionManager; + private static TransactionTemplate transactionTemplate; /** * Gets the data source @@ -57,17 +61,28 @@ public class JdbcUtil { */ public static DataSource getDataSource() { if (dataSource == null) { - dataSource = new JdbcDataSource(); + dataSource = new TransactionAwareDataSourceProxy(new JdbcDataSource()); + setupTransactionManagement(); } return dataSource; } + private static void setupTransactionManagement() { + transactionManager = new DataSourceTransactionManager(dataSource); + transactionTemplate = new TransactionTemplate(transactionManager); + } + /** * Sets the data source * @param value the data source */ public static void setDataSource(DataSource value) { dataSource = value; + setupTransactionManagement(); + } + + public static TransactionTemplate getTransactionTemplate() { + return transactionTemplate; } /** diff --git a/src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java b/src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java index 3414fc4b..84d8984e 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java @@ -180,7 +180,7 @@ public void insertRecord( String... 
fieldValues) { tempFileWriter.write( "\t" ); tempFileWriter.write( escapeValue(fieldValues[i]) ); } - tempFileWriter.write("\n");; + tempFileWriter.write("\n"); if( rows++ < numDebuggingRowsToPrint ){ StringBuffer sb = new StringBuffer( escapeValue(fieldValues[0]) ); diff --git a/src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java b/src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java index 89c8352d..60cf8399 100644 --- a/src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java +++ b/src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java @@ -468,4 +468,58 @@ public String getAnnotationJson() { public void setAnnotationJson(String annotationJson) { this.annotationJson = annotationJson; } + + @Override + public String toString() { + return "StructuralVariant{" + + "internalId=" + internalId + + ", geneticProfileId=" + geneticProfileId + + ", structuralVariantId=" + structuralVariantId + + ", sampleIdInternal=" + sampleIdInternal + + ", sampleId='" + sampleId + '\'' + + ", site1EntrezGeneId=" + site1EntrezGeneId + + ", site1HugoSymbol='" + site1HugoSymbol + '\'' + + ", site1EnsemblTranscriptId='" + site1EnsemblTranscriptId + '\'' + + ", site1Chromosome='" + site1Chromosome + '\'' + + ", site1Position=" + site1Position + + ", site1Contig='" + site1Contig + '\'' + + ", site1Region='" + site1Region + '\'' + + ", site1RegionNumber=" + site1RegionNumber + + ", site1Description='" + site1Description + '\'' + + ", site2EntrezGeneId=" + site2EntrezGeneId + + ", site2HugoSymbol='" + site2HugoSymbol + '\'' + + ", site2EnsemblTranscriptId='" + site2EnsemblTranscriptId + '\'' + + ", site2Chromosome='" + site2Chromosome + '\'' + + ", site2Position=" + site2Position + + ", site2Contig='" + site2Contig + '\'' + + ", site2Region='" + site2Region + '\'' + + ", site2RegionNumber=" + site2RegionNumber + + ", site2Description='" + site2Description + '\'' + + ", site2EffectOnFrame='" + site2EffectOnFrame + '\'' + + ", ncbiBuild='" + ncbiBuild + 
'\'' + + ", dnaSupport='" + dnaSupport + '\'' + + ", rnaSupport='" + rnaSupport + '\'' + + ", normalReadCount=" + normalReadCount + + ", tumorReadCount=" + tumorReadCount + + ", normalVariantCount=" + normalVariantCount + + ", tumorVariantCount=" + tumorVariantCount + + ", normalPairedEndReadCount=" + normalPairedEndReadCount + + ", tumorPairedEndReadCount=" + tumorPairedEndReadCount + + ", normalSplitReadCount=" + normalSplitReadCount + + ", tumorSplitReadCount=" + tumorSplitReadCount + + ", annotation='" + annotation + '\'' + + ", breakpointType='" + breakpointType + '\'' + + ", connectionType='" + connectionType + '\'' + + ", eventInfo='" + eventInfo + '\'' + + ", variantClass='" + variantClass + '\'' + + ", length=" + length + + ", comments='" + comments + '\'' + + ", svStatus='" + svStatus + '\'' + + ", driverFilter='" + driverFilter + '\'' + + ", driverFilterAnn='" + driverFilterAnn + '\'' + + ", driverTiersFilter='" + driverTiersFilter + '\'' + + ", driverTiersFilterAnn='" + driverTiersFilterAnn + '\'' + + ", annotationJson='" + annotationJson + '\'' + + '}'; + } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/AddCaseList.java b/src/main/java/org/mskcc/cbio/portal/scripts/AddCaseList.java index acc067c6..e91ebef4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/AddCaseList.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/AddCaseList.java @@ -23,9 +23,6 @@ package org.mskcc.cbio.portal.scripts; -import java.util.ArrayList; -import java.util.List; - import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoSample; import org.mskcc.cbio.portal.dao.DaoSampleList; @@ -33,9 +30,10 @@ import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.model.SampleList; import org.mskcc.cbio.portal.model.SampleListCategory; -import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; -import org.mskcc.cbio.portal.util.SpringUtil; + +import java.util.ArrayList; +import 
java.util.List; /** * Command Line tool to Add new case lists by generating them based on some rules. @@ -131,7 +129,6 @@ public void run() { throw new UsageException(progName, null, argSpec, "cancer_study_identifier is not specified."); } - SpringUtil.initDataSource(); CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyIdentifier); if (theCancerStudy == null) { throw new IllegalArgumentException("cancer study identified by cancer_study_identifier '" diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java b/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java index 470a06e9..a36fffa4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java @@ -35,6 +35,7 @@ import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -68,7 +69,7 @@ public void convert() throws IOException { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); if (!line.startsWith("#")) { - String parts[] = line.split("\t",-1); + String parts[] = TsvUtil.splitTsvLine(line); if (parts.length<8) { System.err.println("Wrong line in cosmic: "+line); continue; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java b/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java index df1a1ce3..d9cfad9d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java @@ -36,6 +36,7 @@ import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.File; import java.io.IOException; @@ -109,7 +110,7 @@ 
private HashSet getExcludedCases() throws IOException { HashSet excludedCaseSet = new HashSet(); while (line != null) { - if (!line.startsWith("#") && line.trim().length() > 0) { + if (TsvUtil.isDataLine(line)) { String parts[] = line.split("\t"); excludedCaseSet.add(parts[0]); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index 32aa43f2..e1d9a9f8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -2,25 +2,39 @@ import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; import org.mskcc.cbio.portal.model.CanonicalGene; import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.util.*; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import static java.lang.String.format; public class GeneticAlterationImporter { - private final int geneticProfileId; - private Set importSetOfGenes = new HashSet<>(); - private DaoGeneticAlteration daoGeneticAlteration; + protected int geneticProfileId; + protected List orderedSampleList; + private final Set importSetOfGenes = new HashSet<>(); + private final Set importSetOfGeneticEntityIds = new HashSet<>(); + private final DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); + + protected GeneticAlterationImporter() {} public GeneticAlterationImporter( - int geneticProfileId, - DaoGeneticAlteration daoGeneticAlteration + int geneticProfileId, + List orderedSampleList ) { this.geneticProfileId = geneticProfileId; - this.daoGeneticAlteration = daoGeneticAlteration; + this.orderedSampleList = orderedSampleList; + } + + protected void storeOrderedSampleList() throws DaoException { + int rowCount = 
DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + if (rowCount < 1) { + throw new IllegalStateException("Failed to store the ordered sample list."); + } } /** @@ -29,31 +43,62 @@ public GeneticAlterationImporter( * multiple rows for the same gene, and we only want to import the first row. */ public boolean store( - String[] values, - CanonicalGene gene, - String geneSymbol + String[] values, + CanonicalGene gene, + String geneSymbol ) throws DaoException { - try { - if (importSetOfGenes.add(gene.getEntrezGeneId())) { - daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values); - return true; - } else { - String geneSymbolMessage = ""; - if (geneSymbol != null && !geneSymbol.equalsIgnoreCase(gene.getHugoGeneSymbolAllCaps())) { - geneSymbolMessage = " (given as alias in your file as: " + geneSymbol + ")"; - } - ProgressMonitor.logWarning(format( - "Gene %s (%d)%s found to be duplicated in your file. Duplicated row will be ignored!", - gene.getHugoGeneSymbolAllCaps(), - gene.getEntrezGeneId(), - geneSymbolMessage) - ); - return false; - } - } catch (Exception e) { - throw new RuntimeException("Aborted: Error found for row starting with " + geneSymbol + ": " + e.getMessage()); + ensureNumberOfValuesIsCorrect(values.length); + if (importSetOfGenes.add(gene.getEntrezGeneId())) { + daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values); + return true; + } + String geneSymbolMessage = ""; + if (geneSymbol != null && !geneSymbol.equalsIgnoreCase(gene.getHugoGeneSymbolAllCaps())) { + geneSymbolMessage = " (given as alias in your file as: " + geneSymbol + ")"; + } + ProgressMonitor.logWarning(format( + "Gene %s (%d)%s found to be duplicated in your file. 
Duplicated row will be ignored!", + gene.getHugoGeneSymbolAllCaps(), + gene.getEntrezGeneId(), + geneSymbolMessage) + ); + return false; + } + + + /** + * Universal method that stores values for different genetic entities + * @param geneticEntityId + * @param values + * @return true if entity has been stored, false - if entity already existed + * @throws DaoException + */ + public boolean store( + int geneticEntityId, + String[] values + ) throws DaoException { + ensureNumberOfValuesIsCorrect(values.length); + if (importSetOfGeneticEntityIds.add(geneticEntityId)) { + daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfileId, geneticEntityId, values); + return true; + } + ProgressMonitor.logWarning("Data for genetic entity with id " + geneticEntityId + " already imported from file. Record will be skipped."); + return false; + } + + private void ensureNumberOfValuesIsCorrect(int valuesNumber) { + if (valuesNumber != orderedSampleList.size()) { + throw new IllegalArgumentException("There has to be " + orderedSampleList.size() + " values, but only " + valuesNumber+ " has passed."); } } + public void initialize() { + try { + storeOrderedSampleList(); + } catch (DaoException e) { + throw new RuntimeException(e); + } + } + public void finalize() { } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java new file mode 100644 index 00000000..4177420a --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -0,0 +1,114 @@ +package org.mskcc.cbio.portal.scripts; + +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ArrayUtil; + +import java.util.ArrayList; +import java.util.Arrays; 
+import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class GeneticAlterationIncrementalImporter extends GeneticAlterationImporter { + + private final List fileOrderedSampleList; + private final DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); + private HashMap> geneticAlterationMap; + + public GeneticAlterationIncrementalImporter( + int geneticProfileId, + List fileOrderedSampleList + ) { + + this.geneticProfileId = geneticProfileId; + this.fileOrderedSampleList = fileOrderedSampleList; + } + + @Override + public boolean store(String[] values, CanonicalGene gene, String geneSymbol) throws DaoException { + int geneticEntityId = gene.getGeneticEntityId(); + String[] expandedValues = extendValues(geneticEntityId, values); + return super.store(expandedValues, gene, geneSymbol); + } + + @Override + public boolean store(int geneticEntityId, String[] values) throws DaoException { + String[] expandedValues = extendValues(geneticEntityId, values); + return super.store(geneticEntityId, expandedValues); + } + + @Override + public void initialize() { + try { + this.geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, null); + ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(this.geneticProfileId); + int initialOrderSampleListSize = savedOrderedSampleList.size(); + geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { + if (sampleToValue.size() != initialOrderSampleListSize) { + throw new IllegalStateException("Number of samples (" + + sampleToValue.size() + ") for genetic entity with id " + + geneticEntityId + " does not match with the number in the preexisting sample list (" + + initialOrderSampleListSize + ")."); + } + }); + // add all new sample ids at the end + this.orderedSampleList = new ArrayList<>(savedOrderedSampleList); + Set savedSampleSet = new 
HashSet<>(savedOrderedSampleList); + List newSampleIds = this.fileOrderedSampleList.stream().filter(sampleId -> !savedSampleSet.contains(sampleId)).toList(); + this.orderedSampleList.addAll(newSampleIds); + DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(this.geneticProfileId); + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(this.geneticProfileId); + } catch (DaoException e) { + throw new RuntimeException(e); + } + super.initialize(); + } + + @Override + public void finalize() { + expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); + super.finalize(); + } + + private String[] extendValues(int geneticEntityId, String[] values) { + Map sampleIdToValue = mapWithFileOrderedSampleList(values); + String[] updatedSampleValues = new String[orderedSampleList.size()]; + for (int i = 0; i < orderedSampleList.size(); i++) { + updatedSampleValues[i] = ""; + int sampleId = orderedSampleList.get(i); + if (geneticAlterationMap.containsKey(geneticEntityId)) { + HashMap savedSampleIdToValue = geneticAlterationMap.get(geneticEntityId); + updatedSampleValues[i] = savedSampleIdToValue.containsKey(sampleId) ? 
savedSampleIdToValue.remove(sampleId): ""; + if (savedSampleIdToValue.isEmpty()) { + geneticAlterationMap.remove(geneticEntityId); + } + } + if (sampleIdToValue.containsKey(sampleId)) { + updatedSampleValues[i] = sampleIdToValue.get(sampleId); + } + } + return updatedSampleValues; + } + + private Map mapWithFileOrderedSampleList(String[] values) { + return ArrayUtil.zip(fileOrderedSampleList.toArray(Integer[]::new), values); + } + + private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { + // Expand remaining genetic entity id rows that were not mentioned in the file + new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { + try { + String[] values = new String[fileOrderedSampleList.size()]; + Arrays.fill(values, ""); + this.store(geneticEntityId, values); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }); + } +} diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCancerStudy.java index 03372dbb..75ab1f91 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCancerStudy.java @@ -32,8 +32,11 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.CancerStudyTags; +import org.mskcc.cbio.portal.util.CancerStudyReader; +import org.mskcc.cbio.portal.util.CancerStudyTagsReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; import java.io.File; @@ -53,7 +56,6 @@ public void run() { } File file = new File(args[0]); - SpringUtil.initDataSource(); CancerStudy cancerStudy = CancerStudyReader.loadCancerStudy(file); CancerStudyTags cancerStudyTags = CancerStudyTagsReader.loadCancerStudyTags(file, cancerStudy); String message = "Loaded the following cancer study:" + diff --git 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java index 11eeedbc..d15beed6 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java @@ -32,16 +32,40 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; +import org.apache.commons.collections4.map.MultiKeyMap; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoClinicalAttributeMeta; +import org.mskcc.cbio.portal.dao.DaoClinicalData; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ClinicalAttribute; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.SurvivalAttributeUtil; import org.mskcc.cbio.portal.util.SurvivalAttributeUtil.SurvivalStatusAttributes; -import java.io.*; -import joptsimple.*; -import java.util.*; -import java.util.regex.*; -import org.apache.commons.collections4.map.MultiKeyMap; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.regex.Matcher; public 
class ImportClinicalData extends ConsoleRunnable { @@ -61,6 +85,7 @@ public class ImportClinicalData extends ConsoleRunnable { private CancerStudy cancerStudy; private AttributeTypes attributesType; private boolean relaxed; + private boolean overwriteExisting; private Set patientIds = new HashSet(); public static enum MissingAttributeValues @@ -102,6 +127,11 @@ public static enum AttributeTypes { PATIENT_ATTRIBUTES("PATIENT"), SAMPLE_ATTRIBUTES("SAMPLE"), + /** + * We want to encourage use patient or sample files instead, not mixed ones. + * See https://github.com/cBioPortal/cbioportal-core/issues/31 + */ + @Deprecated MIXED_ATTRIBUTES("MIXED"); private String attributeType; @@ -332,25 +362,32 @@ private boolean addDatum(String[] fields, List columnAttrs, M //check if sample is not already added: Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), stableSampleId, false); if (sample != null) { - //this should be a WARNING in case of TCGA studies (see https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415) - //and an ERROR in other studies. I.e. a sample should occur only once in clinical file! - if (stableSampleId.startsWith("TCGA-")) { - ProgressMonitor.logWarning("Sample " + stableSampleId + " found to be duplicated in your file. Only data of the first sample will be processed."); - return false; - } - //give error or warning if sample is already in DB and this is NOT expected (i.e. 
not supplemental data): - if (!this.isSupplementalData()) { - throw new RuntimeException("Error: Sample " + stableSampleId + " found to be duplicated in your file."); - } - else { - internalSampleId = sample.getInternalId(); - } + internalSampleId = sample.getInternalId(); + if (overwriteExisting && this.attributesType == AttributeTypes.SAMPLE_ATTRIBUTES) { + DaoClinicalData.removeSampleAttributesData(internalSampleId); + } else { + //this should be a WARNING in case of TCGA studies (see https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415) + //and an ERROR in other studies. I.e. a sample should occur only once in clinical file! + if (stableSampleId.startsWith("TCGA-")) { + ProgressMonitor.logWarning("Sample " + stableSampleId + " found to be duplicated in your file. Only data of the first sample will be processed."); + return false; + } + if (this.isSupplementalData()) { + internalSampleId = sample.getInternalId(); + } else { + //give error or warning if sample is already in DB and this is NOT expected (i.e. 
not supplemental data): + throw new RuntimeException("Error: Sample " + stableSampleId + " found to be duplicated in your file."); + } + } } else { Patient patient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), stablePatientId); if (patient != null) { //patient exists, get internal id: internalPatientId = patient.getInternalId(); + if (overwriteExisting && this.attributesType == AttributeTypes.PATIENT_ATTRIBUTES) { + DaoClinicalData.removePatientAttributesData(internalPatientId); + } } else { //add patient: @@ -616,12 +653,14 @@ public void run() { "cancer study id").withOptionalArg().describedAs("study").ofType(String.class); OptionSpec attributeFlag = parser.accepts("a", "(deprecated) Flag for using MIXED_ATTRIBUTES").withOptionalArg().describedAs("a").ofType(String.class); - OptionSpec relaxedFlag = parser.accepts("r", + OptionSpec relaxedFlag = parser.accepts("r", "(not recommended) Flag for relaxed mode, determining how to handle detected data harmonization problems in the same study").withOptionalArg().describedAs("r").ofType(String.class); parser.accepts( "loadMode", "direct (per record) or bulk load of data" ) .withOptionalArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); parser.accepts("noprogress", "this option can be given to avoid the messages regarding memory usage and % complete"); - + OptionSpec overWriteExistingFlag = parser.accepts("overwrite-existing", + "Flag that enables re-uploading data for the patient/sample entries that already exist in the database").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); + OptionSet options = null; try { options = parser.parse( args ); @@ -652,16 +691,16 @@ public void run() { attributesDatatype = properties.getProperty("datatype"); cancerStudyStableId = properties.getProperty("cancer_study_identifier"); } - if( options.has ( attributeFlag ) ) - { - attributesDatatype = "MIXED_ATTRIBUTES"; - } - if( options.has ( relaxedFlag ) ) 
- { - relaxed = true; + if (options.has(attributeFlag)) { + attributesDatatype = "MIXED_ATTRIBUTES"; + } + if (options.has(relaxedFlag)) { + relaxed = true; + } + if (options.has(overWriteExistingFlag)) { + overwriteExisting = true; - } - SpringUtil.initDataSource(); + } CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); if (cancerStudy == null) { throw new IllegalArgumentException("Unknown cancer study: " + cancerStudyStableId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index f03e5b45..e158f2d5 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -22,63 +22,116 @@ */ package org.mskcc.cbio.portal.scripts; -import com.google.common.base.*; -import com.google.common.collect.*; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; +import com.google.common.base.Strings; +import com.google.common.collect.HashBasedTable; +import com.google.common.collect.Table; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.JdbcUtil; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.CnaUtil; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import 
org.mskcc.cbio.portal.util.GeneticProfileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.TsvUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; import java.util.Optional; -import java.util.*; -import java.util.stream.*; +import java.util.Set; +import java.util.stream.Collectors; -import static com.google.common.collect.Lists.*; -import static java.lang.String.*; +import static com.google.common.collect.Lists.newArrayList; +import static java.lang.String.format; import static org.cbioportal.model.MolecularProfile.DataType.DISCRETE; +import static org.cbioportal.model.MolecularProfile.ImportType.DISCRETE_LONG; public class ImportCnaDiscreteLongData { private final File cnaFile; private final int geneticProfileId; - private final GeneticAlterationImporter geneticAlterationGeneImporter; - private String genePanel; + private GeneticAlterationImporter geneticAlterationGeneImporter; private final DaoGeneOptimized daoGene; private CnaUtil cnaUtil; private Set existingCnaEvents = new HashSet<>(); private int samplesSkipped = 0; private Set namespaces; - private final ArrayList sampleIdGeneticProfileIds = new ArrayList<>(); + private boolean isIncrementalUpdateMode; + + private GeneticProfile geneticProfile; + + private ArrayList orderedSampleList; + private final Integer genePanelId; + + public ImportCnaDiscreteLongData( + File cnaFile, + int geneticProfileId, + String genePanel, + DaoGeneOptimized daoGene, + Set namespaces, + boolean isIncrementalUpdateMode + ) { + this.namespaces = namespaces; + this.cnaFile = cnaFile; + this.geneticProfileId = geneticProfileId; + this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); + if (!Set.of(DISCRETE.name(), DISCRETE_LONG.name()).contains(geneticProfile.getDatatype())) { + throw new 
IllegalStateException("Platform " + + geneticProfileId + + " has not supported datatype: " + + geneticProfile.getDatatype()); + } + this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); + this.daoGene = daoGene; + this.isIncrementalUpdateMode = isIncrementalUpdateMode; + } public ImportCnaDiscreteLongData( File cnaFile, int geneticProfileId, String genePanel, DaoGeneOptimized daoGene, - DaoGeneticAlteration daoGeneticAlteration, Set namespaces ) { - this.namespaces = namespaces; - this.cnaFile = cnaFile; - this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; - this.daoGene = daoGene; - this.geneticAlterationGeneImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); + this(cnaFile, geneticProfileId, genePanel, daoGene, namespaces, false); + } + public void importData() { + JdbcUtil.getTransactionTemplate().execute(status -> { + try { + doImportData(); + } catch (Throwable e) { + status.setRollbackOnly(); + throw new RuntimeException(e); + } + return null; + }); } - public void importData() throws Exception { + private void doImportData() throws Exception { FileReader reader = new FileReader(this.cnaFile); BufferedReader buf = new BufferedReader(reader); // Pass first line with headers to util: String line = buf.readLine(); int lineIndex = 1; - String[] headerParts = line.split("\t", -1); + String[] headerParts = TsvUtil.splitTsvLine(line); this.cnaUtil = new CnaUtil(headerParts, this.namespaces); - GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); - boolean isDiscretizedCnaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION && geneticProfile.showProfileInAnalysisTab(); @@ -97,10 +150,11 @@ public void importData() throws Exception { this.extractDataToImport(geneticProfile, line, lineIndex, toImport); } - DaoGeneticProfileSamples.addGeneticProfileSamples( - 
geneticProfileId, - newArrayList(toImport.eventsTable.columnKeySet()) - ); + orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); + this.geneticAlterationGeneImporter = isIncrementalUpdateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) + : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); + geneticAlterationGeneImporter.initialize(); + DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelId); for (Long entrezId : toImport.eventsTable.rowKeySet()) { boolean added = storeGeneticAlterations(toImport, entrezId); @@ -117,6 +171,7 @@ public void importData() throws Exception { ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + getSamplesSkipped()); buf.close(); + geneticAlterationGeneImporter.finalize(); MySQLbulkLoader.flushAll(); } @@ -130,11 +185,10 @@ public void extractDataToImport( int lineIndex, CnaImportData importContainer ) throws Exception { - boolean hasData = !line.startsWith("#") && line.trim().length() > 0; - if (!hasData) { + if (!TsvUtil.isDataLine(line)) { return; } - String[] lineParts = line.split("\t", -1); + String[] lineParts = TsvUtil.splitTsvLine(line); CanonicalGene gene = this.getGene(cnaUtil.getEntrezSymbol(lineParts), lineParts, cnaUtil); importContainer.genes.add(gene); @@ -147,7 +201,12 @@ public void extractDataToImport( String sampleIdStr = cnaUtil.getSampleIdStr(lineParts); Sample sample = findSample(sampleIdStr, cancerStudyId); - createSampleProfile(sample); + if (sample == null) { + if (StableIdUtil.isNormal(sampleIdStr)) { + return; + } + throw new RuntimeException("Sample with stable id " + sampleIdStr + " is not found in the database."); + } long entrezId = gene.getEntrezGeneId(); int sampleId = sample.getInternalId(); @@ -175,6 +234,9 @@ private void storeCnaEvents(CnaImportData toImport, Long entrezId) throws DaoExc .filter(v -> v.cnaEvent != null) .map(v -> v.cnaEvent) 
.collect(Collectors.toList()); + if (isIncrementalUpdateMode) { + DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedSampleList); + } CnaUtil.storeCnaEvents(existingCnaEvents, events); } @@ -213,7 +275,7 @@ private boolean storeGeneticAlterations(CnaImportData toImport, Long entrezId) t ? gene.get().getHugoGeneSymbolAllCaps() : "" + entrezId; - return this.geneticAlterationGeneImporter.store(values, gene.get(), geneSymbol); + return geneticAlterationGeneImporter.store(values, gene.get(), geneSymbol); } /** @@ -271,49 +333,6 @@ private CanonicalGene getGene( return null; } - /** - * Find sample and create sample profile when needed - * - * @return boolean created or not - */ - public boolean createSampleProfile( - Sample sample - ) throws Exception { - boolean inDatabase = DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId); - Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); - SampleIdGeneticProfileId toCreate = new SampleIdGeneticProfileId(sample.getInternalId(), geneticProfileId); - boolean isQueued = this.sampleIdGeneticProfileIds.contains(toCreate); - if (!inDatabase && !isQueued) { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); - this.sampleIdGeneticProfileIds.add(toCreate); - return true; - } - return false; - } - - - private static class SampleIdGeneticProfileId { - public int sampleId; - public int geneticProfileId; - - public SampleIdGeneticProfileId(int sampleId, int geneticProfileId) { - this.sampleId = sampleId; - this.geneticProfileId = geneticProfileId; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - - SampleIdGeneticProfileId that = (SampleIdGeneticProfileId) o; - return sampleId == that.sampleId - && geneticProfileId == that.geneticProfileId; - } - } - /** * Find sample and create sample profile when needed */ diff 
--git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java index 1c876a75..69efd93d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java @@ -32,15 +32,32 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.model.*; - -import joptsimple.*; - -import java.io.*; +import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegment; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegmentFile; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.CopyNumberSegment; +import org.mskcc.cbio.portal.model.CopyNumberSegmentFile; +import org.mskcc.cbio.portal.model.ReferenceGenome; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; import java.math.BigDecimal; -import java.util.*; +import java.util.HashSet; +import java.util.Properties; +import java.util.Set; /** * Import Segment data into database. 
@@ -49,14 +66,16 @@ public class ImportCopyNumberSegmentData extends ConsoleRunnable { private int entriesSkipped; - + private boolean isIncrementalUpdateMode; + private Set processedSampleIds; + private void importData(File file, int cancerStudyId) throws IOException, DaoException { - MySQLbulkLoader.bulkLoadOn(); FileReader reader = new FileReader(file); BufferedReader buf = new BufferedReader(reader); try { String line = buf.readLine(); // skip header line long segId = DaoCopyNumberSegment.getLargestId(); + processedSampleIds = new HashSet<>(); while ((line=buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); @@ -66,8 +85,7 @@ private void importData(File file, int cancerStudyId) throws IOException, DaoExc System.err.println("wrong format: "+line); } - CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByInternalId(cancerStudyId); - String chrom = strs[1].trim(); + String chrom = strs[1].trim(); //validate in same way as GistitReader: ValidationUtils.validateChromosome(chrom); @@ -97,8 +115,11 @@ private void importData(File file, int cancerStudyId) throws IOException, DaoExc CopyNumberSegment cns = new CopyNumberSegment(cancerStudyId, s.getInternalId(), chrom, start, end, numProbes, segMean); cns.setSegId(++segId); DaoCopyNumberSegment.addCopyNumberSegment(cns); + processedSampleIds.add(s.getInternalId()); + } + if (isIncrementalUpdateMode) { + DaoCopyNumberSegment.deleteSegmentDataForSamples(cancerStudyId, processedSampleIds); } - MySQLbulkLoader.flushAll(); } finally { buf.close(); @@ -110,27 +131,31 @@ public void run() { String description = "Import 'segment data' files"; OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); + if (options.has("loadMode") && !"bulkLoad".equalsIgnoreCase((String) options.valueOf("loadMode"))) { + throw new UnsupportedOperationException("This loader supports bulkLoad load mode only, but " + + options.valueOf("loadMode") + + " has been supplied."); + 
} String dataFile = (String) options.valueOf("data"); File descriptorFile = new File((String) options.valueOf("meta")); + isIncrementalUpdateMode = options.has("overwrite-existing"); Properties properties = new Properties(); properties.load(new FileInputStream(descriptorFile)); ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile); - SpringUtil.initDataSource(); CancerStudy cancerStudy = getCancerStudy(properties); - if (segmentDataExistsForCancerStudy(cancerStudy)) { + if (!isIncrementalUpdateMode && segmentDataExistsForCancerStudy(cancerStudy)) { throw new IllegalArgumentException("Seg data for cancer study " + cancerStudy.getCancerStudyStableId() + " has already been imported: " + dataFile); } - + MySQLbulkLoader.bulkLoadOn(); importCopyNumberSegmentFileMetadata(cancerStudy, properties); importCopyNumberSegmentFileData(cancerStudy, dataFile); - DaoCopyNumberSegment.createFractionGenomeAlteredClinicalData(cancerStudy.getInternalId()); - if( MySQLbulkLoader.isBulkLoad()) { - MySQLbulkLoader.flushAll(); - } + MySQLbulkLoader.flushAll(); + MySQLbulkLoader.bulkLoadOff(); + DaoCopyNumberSegment.createFractionGenomeAlteredClinicalData(cancerStudy.getInternalId(), processedSampleIds, isIncrementalUpdateMode); } catch (RuntimeException e) { throw e; } catch (IOException|DaoException e) { @@ -150,7 +175,7 @@ private static boolean segmentDataExistsForCancerStudy(CancerStudy cancerStudy) return (DaoCopyNumberSegment.segmentDataExistForCancerStudy(cancerStudy.getInternalId())); } - private static void importCopyNumberSegmentFileMetadata(CancerStudy cancerStudy, Properties properties) throws DaoException { + private void importCopyNumberSegmentFileMetadata(CancerStudy cancerStudy, Properties properties) throws DaoException { CopyNumberSegmentFile copyNumSegFile = new CopyNumberSegmentFile(); copyNumSegFile.cancerStudyId = cancerStudy.getInternalId(); String referenceGenomeId = properties.getProperty("reference_genome_id").trim(); @@ -165,7 +190,18 @@ private 
static void importCopyNumberSegmentFileMetadata(CancerStudy cancerStudy, copyNumSegFile.referenceGenomeId = getRefGenId(referenceGenomeId); copyNumSegFile.description = properties.getProperty("description").trim(); copyNumSegFile.filename = properties.getProperty("data_filename").trim(); - DaoCopyNumberSegmentFile.addCopyNumberSegmentFile(copyNumSegFile); + CopyNumberSegmentFile storedCopyNumSegFile = DaoCopyNumberSegmentFile.getCopyNumberSegmentFile(cancerStudy.getInternalId()); + if (isIncrementalUpdateMode && storedCopyNumSegFile != null) { + if (storedCopyNumSegFile.referenceGenomeId != copyNumSegFile.referenceGenomeId) { + throw new IllegalStateException("You are trying to upload " + + copyNumSegFile.referenceGenomeId + + " reference genome data into " + + storedCopyNumSegFile.referenceGenomeId + + " reference genome data."); + } + } else { + DaoCopyNumberSegmentFile.addCopyNumberSegmentFile(copyNumSegFile); + } } private void importCopyNumberSegmentFileData(CancerStudy cancerStudy, String dataFilename) throws IOException, DaoException { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java index cea48b84..25eb5e1e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java @@ -32,16 +32,27 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.dao.DaoCosmicData; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.CosmicMutationFrequency; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import 
org.mskcc.cbio.portal.util.MutationKeywordUtils; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; import org.springframework.util.Assert; -import java.io.*; - +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; import java.util.HashMap; import java.util.Map; -import java.util.regex.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.IntStream; public class ImportCosmicData { @@ -70,7 +81,7 @@ public void importData() throws IOException, DaoException { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); if (!line.startsWith("#")) { - String parts[] = line.split("\t",-1); + String parts[] = TsvUtil.splitTsvLine(line); if (parts.length<8) { System.err.println("Wrong line in cosmic: "+line); continue; @@ -180,7 +191,6 @@ public static void main(String[] args) throws Exception { System.out.println("command line usage: importCosmicData.pl "); return; } - SpringUtil.initDataSource(); DaoCosmicData.deleteAllRecords(); ProgressMonitor.setConsoleMode(true); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java index a819b0de..f05362ff 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java @@ -32,11 +32,18 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; -import java.io.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; 
+import java.io.IOException; /** * Command Line tool to import background drug information. @@ -58,7 +65,7 @@ public void importData() throws IOException, DaoException { while (line != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if (!line.startsWith("#") && line.trim().length() > 0) { + if (TsvUtil.isDataLine(line)) { line = line.trim(); String parts[] = line.split("\t"); String geneSymbol = parts[0]; @@ -79,7 +86,6 @@ public static void main(String[] args) throws Exception { return; } ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); File file = new File(args[0]); System.out.println("Reading drug data from: " + file.getAbsolutePath()); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 39dd97c3..4cc8fb20 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -32,22 +32,49 @@ package org.mskcc.cbio.portal.scripts; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.maf.*; - import org.apache.commons.lang3.StringUtils; +import org.mskcc.cbio.maf.MafRecord; +import org.mskcc.cbio.maf.MafUtil; +import org.mskcc.cbio.portal.dao.DaoAlleleSpecificCopyNumber; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoMutation; +import org.mskcc.cbio.portal.dao.DaoReferenceGenome; +import org.mskcc.cbio.portal.dao.DaoSample; +import 
org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.AlleleSpecificCopyNumber; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.ExtendedMutation; +import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.ExtendedMutationUtil; +import org.mskcc.cbio.portal.util.GeneticProfileUtil; +import org.mskcc.cbio.portal.util.GlobalProperties; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.*; -import java.util.regex.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Import an extended mutation file. @@ -59,7 +86,7 @@ *
* @author Selcuk Onur Sumer */ -public class ImportExtendedMutationData{ +public class ImportExtendedMutationData { private File mutationFile; private int geneticProfileId; @@ -68,27 +95,37 @@ public class ImportExtendedMutationData{ private int entriesSkipped = 0; private int samplesSkipped = 0; private Set sampleSet = new HashSet(); + private Set internalSampleIds = new HashSet(); + private Set geneSet = new HashSet(); - private String genePanel; private Set filteredMutations = new HashSet(); private Set namespaces = new HashSet(); private Pattern SEQUENCE_SAMPLES_REGEX = Pattern.compile("^.*sequenced_samples:(.*)$"); private final String ASCN_NAMESPACE = "ASCN"; + private final Integer genePanelId; + + private final boolean overwriteExisting; + /** * construct an ImportExtendedMutationData. * Filter mutations according to the no argument MutationFilter(). */ public ImportExtendedMutationData(File mutationFile, int geneticProfileId, String genePanel, Set filteredMutations, Set namespaces) { + this(mutationFile, geneticProfileId, genePanel, filteredMutations, namespaces, false); + } + + public ImportExtendedMutationData(File mutationFile, int geneticProfileId, String genePanel, Set filteredMutations, Set namespaces, boolean overwriteExisting) { this.mutationFile = mutationFile; this.geneticProfileId = geneticProfileId; this.swissprotIsAccession = false; - this.genePanel = genePanel; + this.genePanelId = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); this.filteredMutations = filteredMutations; // create default MutationFilter myMutationFilter = new MutationFilter( ); this.namespaces = namespaces; + this.overwriteExisting = overwriteExisting; } public ImportExtendedMutationData(File mutationFile, int geneticProfileId, String genePanel) { @@ -150,15 +187,15 @@ public void importData() throws IOException, DaoException { referenceGenome = GlobalProperties.getReferenceGenomeName(); } String genomeBuildName = DaoReferenceGenome.getReferenceGenomeByGenomeName(referenceGenome).getBuildName(); - + Set processedSamples = new HashSet<>(); while((line=buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if( !line.startsWith("#") && line.trim().length() > 0) + if(TsvUtil.isDataLine(line)) { - String[] parts = line.split("\t", -1 ); // the -1 keeps trailing empty strings; see JavaDoc for String + String[] parts = TsvUtil.splitTsvLine(line); MafRecord record = mafUtil.parseRecord(line); if (!record.getNcbiBuild().equalsIgnoreCase(genomeBuildName)) { @@ -180,6 +217,9 @@ public void importData() throws IOException, DaoException { else { throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(barCode) + "' found in MAF file: " + this.mutationFile.getCanonicalPath()); } + } else if (overwriteExisting && !processedSamples.contains(sample.getInternalId())) { + DaoMutation.deleteAllRecordsInGeneticProfileForSample(geneticProfileId, sample.getInternalId()); + processedSamples.add(sample.getInternalId()); } String validationStatus = record.getValidationStatus(); @@ -417,9 +457,6 @@ public void importData() throws IOException, DaoException { } else { mutations.put(mutation,mutation); } - if(!sampleSet.contains(sample.getStableId())) { - addSampleProfileRecord(sample); - } // update ascn object with mutation unique key details if (ascn != null){ ascn.updateAscnUniqueKeyDetails(mutation); @@ -428,6 +465,7 @@ public void 
importData() throws IOException, DaoException { //keep track: sampleSet.add(sample.getStableId()); + internalSampleIds.add(sample.getInternalId()); geneSet.add(mutation.getEntrezGeneId()+""); } else { @@ -436,6 +474,7 @@ public void importData() throws IOException, DaoException { } } } + DaoSampleProfile.upsertSampleToProfileMapping(internalSampleIds, geneticProfileId, genePanelId); for (MutationEvent event : newEvents) { try { @@ -563,19 +602,21 @@ private String transformOMAScore( String omaScore) { private String processMAFHeader(BufferedReader buffer) throws IOException, DaoException { GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); String line = buffer.readLine().trim(); + Set internalSampleIds = new HashSet<>(); while (line.startsWith("#")) { Matcher seqSamplesMatcher = SEQUENCE_SAMPLES_REGEX.matcher(line); // line is of format #sequenced_samples: STABLE_ID STABLE_ID STABLE_ID STABLE_ID if (seqSamplesMatcher.find()) { - addSampleProfileRecords(getSequencedSamples(seqSamplesMatcher.group(1), geneticProfile)); + internalSampleIds.addAll(getSequencedInternalSampleId(seqSamplesMatcher.group(1), geneticProfile)); } line = buffer.readLine().trim(); } + DaoSampleProfile.upsertSampleToProfileMapping(internalSampleIds, geneticProfileId, genePanelId); return line; } - private List getSequencedSamples(String sequencedSamplesIDList, GeneticProfile geneticProfile) { - ArrayList toReturn = new ArrayList(); + private Set getSequencedInternalSampleId(String sequencedSamplesIDList, GeneticProfile geneticProfile) { + Set toReturn = new HashSet<>(); for (String stableSampleID : sequencedSamplesIDList.trim().split("\\s")) { Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), StableIdUtil.getSampleId(stableSampleID)); @@ -583,33 +624,13 @@ private List getSequencedSamples(String sequencedSamplesIDList, GeneticP if (sample == null) { missingSample(stableSampleID); } - toReturn.add(sample); + 
toReturn.add(sample.getInternalId()); } return toReturn; } - private void addSampleProfileRecords(List sequencedSamples) throws DaoException { - for (Sample sample : sequencedSamples) { - addSampleProfileRecord(sample); - } - if( MySQLbulkLoader.isBulkLoad()) { - MySQLbulkLoader.flushAll(); - } - } - - private void addSampleProfileRecord(Sample sample) throws DaoException { - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { - Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); - } - } - private void missingSample(String stableSampleID) { throw new NullPointerException("Sample is not found in database (is it missing from clinical data file?): " + stableSampleID); } - private String convertMapToJsonString(Map> map) throws JsonProcessingException { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(map); - } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java index 0ab8bd88..d640bc3c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java @@ -32,19 +32,37 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.model.ReferenceGenome; -import org.mskcc.cbio.portal.model.ReferenceGenomeGene; -import org.mskcc.cbio.portal.util.*; - import joptsimple.OptionException; import joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; - -import java.io.*; -import java.util.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoReferenceGenome; +import 
org.mskcc.cbio.portal.dao.DaoReferenceGenomeGene; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.ReferenceGenome; +import org.mskcc.cbio.portal.model.ReferenceGenomeGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.DataValidator; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.GlobalProperties; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; /** * Command Line Tool to Import Background Gene Data. @@ -187,8 +205,8 @@ public static void importHGNCData(File geneFile, String genomeBuild) throws IOEx while ((line = buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - String parts[] = line.split("\t", -1); // include trailing empty strings - if (!parts[0].matches("[0-9]+")) { + String parts[] = TsvUtil.splitTsvLine(line); // include trailing empty strings + if (!DataValidator.isValidNumericSequence(parts[0])) { ProgressMonitor.logWarning("Skipping gene with invalid entrez gene id '" + parts[1] + "'"); continue; } @@ -684,8 +702,6 @@ static void importSuppGeneData(File suppGeneFile, String referenceGenomeBuild) t @Override public void run() { try { - SpringUtil.initDataSource(); - String description = "Update gene / gene alias tables "; // using a real options parser, helps avoid bugs diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java index 8e0d77c4..e48df328 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java @@ -32,13 +32,32 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; -import joptsimple.*; -import java.util.*; +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoGenePanel; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.GenePanel; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Properties; +import java.util.Set; /** * @@ -68,6 +87,10 @@ public void run() { "gene panel file" ).withRequiredArg().describedAs( "meta_file.txt" ).ofType( String.class ); parser.accepts("noprogress", "this option can be given to avoid the messages regarding memory usage and % complete"); + // supported by the uploader already. 
Added for uniformity, so that no error is raised when upstream software uses this flag + parser.accepts("overwrite-existing", + "Enables re-uploading gene panel profile map data that already exists.") + .withOptionalArg().describedAs("overwrite-existing").ofType(String.class); OptionSet options; try { options = parser.parse( args ); @@ -96,7 +119,6 @@ public void run() { } setFile(genePanel_f); - SpringUtil.initDataSource(); importData(); } catch (RuntimeException e) { throw e; @@ -153,7 +175,9 @@ public void importData() throws Exception { Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), sampleId); row_data.remove((int)sampleIdIndex); - + + + Set sampleProfileTuples = new HashSet<>(); // Loop over the values in the row for (int i = 0; i < row_data.size(); i++) { String genePanelName = row_data.get(i); @@ -166,12 +190,13 @@ public void importData() throws Exception { } Integer genePanelId = determineGenePanelId(genePanelName); - // Add gene panel information to database - DaoSampleProfile.updateSampleProfile( - sample.getInternalId(), - profileIds.get(i), - genePanelId); + Integer geneticProfileId = profileIds.get(i); + int sampleInternalId = sample.getInternalId(); + + sampleProfileTuples.add(new DaoSampleProfile.SampleProfileTuple(geneticProfileId, sampleInternalId, genePanelId)); } + + DaoSampleProfile.upsertSampleToProfileMapping(sampleProfileTuples); } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java index 2da0ebd2..f102c420 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java @@ -55,6 +55,7 @@ import joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; +import org.mskcc.cbio.portal.util.TsvUtil; /** * Note; Imports genetic entities from generic assay files. 
Has been written for treatment response data @@ -160,7 +161,6 @@ public static void startImport(OptionSet options, OptionSpec data, Optio * @throws Exception */ public static void importData(File dataFile, GeneticAlterationType geneticAlterationType, String additionalProperties, boolean updateInfo) throws Exception { - ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getCanonicalPath()); // read generic assay data file @@ -186,6 +186,10 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera currentLine = buf.readLine(); while (currentLine != null) { + if (!TsvUtil.isDataLine(currentLine)) { + currentLine = buf.readLine(); + continue; + } String[] parts = currentLine.split("\t"); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index a7dda2ca..ad10fe01 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -79,7 +79,8 @@ public ImportGenericAssayPatientLevelData(File dataFile, String targetLine, int * @throws IOException IO Error. * @throws DaoException Database Error. 
*/ - public void importData(int numLines) throws IOException, DaoException { + public void importData() throws IOException, DaoException { + int numLines = FileUtil.getNumLines(dataFile); geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); @@ -115,18 +116,13 @@ public void importData(int numLines) throws IOException, DaoException { throw new RuntimeException("Unknown patient id '" + StableIdUtil.getPatientId(patientIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath()); } else { List samples = DaoSample.getSamplesByPatientId(patient.getInternalId()); - List sampleInternalIds = samples.stream().map(sample -> sample.getInternalId()).collect(Collectors.toList()); - for (int j = 0; j < sampleInternalIds.size(); j++) { - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sampleInternalIds.get(j), geneticProfileId)) { - Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); - DaoSampleProfile.addSampleProfile(sampleInternalIds.get(j), geneticProfileId, genePanelID); - } - orderedSampleList.add(sampleInternalIds.get(j)); - } + samples.forEach(sample -> orderedSampleList.add(sample.getInternalId())); numSamplesInPatient[i][0] = samples.size(); sampleCount += samples.size(); } } + Integer genePanelID = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); + DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelID); ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines-1)); @@ -189,8 +185,8 @@ private boolean parseGenericAssayLine(String line, int nrColumns, int patientSta boolean recordIsStored = false; - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); + if (TsvUtil.isDataLine(line)) { + String[] parts = TsvUtil.splitTsvLine(line); if (parts.length > nrColumns) { if (line.split("\t").length > nrColumns) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGisticData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGisticData.java index e35a9ed6..f575202e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGisticData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGisticData.java @@ -32,15 +32,19 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.ArrayList; - +import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGistic; import org.mskcc.cbio.portal.model.Gistic; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.GisticReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; import org.mskcc.cbio.portal.validate.validationException; -import joptsimple.OptionSet; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; /** @@ -57,8 +61,6 @@ public void run () { String dataFile = (String) options.valueOf("data"); String studyId = (String) options.valueOf("study"); - SpringUtil.initDataSource(); - File gistic_f = new File(dataFile); int cancerStudyInternalId = ValidationUtils.getInternalStudyId(studyId); diff --git 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportHprd.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportHprd.java index 1cbe26aa..1102d618 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportHprd.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportHprd.java @@ -32,11 +32,18 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.io.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; /** * Command Line to Import HPRD Interactions. @@ -122,7 +129,6 @@ public static void main(String[] args) { return; } ProgressMonitor.setConsoleModeAndParseShowProgress(args); - SpringUtil.initDataSource(); try { File geneFile = new File(args[0]); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportMicroRNAIDs.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportMicroRNAIDs.java index 929565fe..66b639c1 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportMicroRNAIDs.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportMicroRNAIDs.java @@ -32,11 +32,21 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.io.BufferedReader; +import 
java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; /** * Command Line Tool to Import Background Gene Data. @@ -48,7 +58,6 @@ public static void importData(File geneFile) throws IOException, DaoException { FileReader reader = new FileReader(geneFile); BufferedReader buf = new BufferedReader(reader); String line = buf.readLine(); // skip first line - SpringUtil.initDataSource(); DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); List mirnas = new ArrayList(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportMutSigData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportMutSigData.java index f03f751f..7c0f80ac 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportMutSigData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportMutSigData.java @@ -32,10 +32,12 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.util.*; - import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.MutSigReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; import java.io.File; import java.io.IOException; @@ -54,8 +56,7 @@ public void run() { OptionSet options = ConsoleUtil.parseStandardDataAndStudyOptions(args, description); String dataFile = (String) options.valueOf("data"); String studyId = (String) options.valueOf("study"); - SpringUtil.initDataSource(); - + File mutSigFile = new File(dataFile); ProgressMonitor.setCurrentMessage( "Reading data from: " + mutSigFile.getAbsolutePath()); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportPathwayCommonsExtSif.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportPathwayCommonsExtSif.java index a17f783b..73a864c8 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportPathwayCommonsExtSif.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportPathwayCommonsExtSif.java @@ -32,11 +32,18 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.io.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; /** * Command Line to Import HPRD Interactions. @@ -119,7 +126,6 @@ public static void main(String[] args) { return; } ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); try { File sifFile = new File(args[0]); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index 54ce204e..a35e8c29 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -32,14 +32,17 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.Set; - -import joptsimple.*; +import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.GeneticProfileReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; +import java.io.File; +import 
java.util.Set; import static org.cbioportal.model.MolecularProfile.ImportType.DISCRETE_LONG; @@ -54,12 +57,7 @@ public class ImportProfileData extends ConsoleRunnable { public void run() { DaoGeneOptimized daoGene; DaoGeneticAlteration daoGeneticAlteration; - try { - daoGene = DaoGeneOptimized.getInstance(); - daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - } catch (DaoException e) { - throw new RuntimeException("Could not create dao instances", e); - } + daoGene = DaoGeneOptimized.getInstance(); try { // Parse arguments @@ -73,7 +71,7 @@ public void run() { if (options.has("update-info") && (((String) options.valueOf("update-info")).equalsIgnoreCase("true") || options.valueOf("update-info").equals("1"))) { updateInfo = true; } - SpringUtil.initDataSource(); + boolean overwriteExisting = options.has("overwrite-existing"); ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getAbsolutePath()); // Load genetic profile and gene panel GeneticProfile geneticProfile = null; @@ -86,19 +84,17 @@ public void run() { } // Print profile report - int numLines = FileUtil.getNumLines(dataFile); ProgressMonitor.setCurrentMessage( " --> profile id: " + geneticProfile.getGeneticProfileId() + "\n --> profile name: " + geneticProfile.getProfileName() + "\n --> genetic alteration type: " + geneticProfile.getGeneticAlterationType().name()); - ProgressMonitor.setMaxValue(numLines); - + // Check genetic alteration type if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.MUTATION_EXTENDED || geneticProfile.getGeneticAlterationType() == GeneticAlterationType.MUTATION_UNCALLED) { Set filteredMutations = GeneticProfileReader.getVariantClassificationFilter( descriptorFile ); Set namespaces = GeneticProfileReader.getNamespaces( descriptorFile ); - ImportExtendedMutationData importer = new ImportExtendedMutationData(dataFile, geneticProfile.getGeneticProfileId(), genePanel, filteredMutations, namespaces); + ImportExtendedMutationData importer = new 
ImportExtendedMutationData(dataFile, geneticProfile.getGeneticProfileId(), genePanel, filteredMutations, namespaces, overwriteExisting); String swissprotIdType = geneticProfile.getOtherMetaDataField("swissprot_identifier"); if (swissprotIdType != null && swissprotIdType.equals("accession")) { importer.setSwissprotIsAccession(true); @@ -112,7 +108,8 @@ public void run() { dataFile, geneticProfile.getGeneticProfileId(), genePanel, - namespaces + namespaces, + overwriteExisting ); importer.importData(); } else if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY) { @@ -121,8 +118,11 @@ public void run() { // use a different importer for patient level data String patientLevel = geneticProfile.getOtherMetaDataField("patient_level"); if (patientLevel != null && patientLevel.trim().toLowerCase().equals("true")) { + if (overwriteExisting) { + throw new UnsupportedOperationException("Incremental upload for generic assay patient_level data is not supported. Please use sample level instead."); + } ImportGenericAssayPatientLevelData genericAssayProfileImporter = new ImportGenericAssayPatientLevelData(dataFile, geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties")); - genericAssayProfileImporter.importData(numLines); + genericAssayProfileImporter.importData(); } else { // use ImportTabDelimData importer for non-patient level data ImportTabDelimData genericAssayProfileImporter = new ImportTabDelimData( @@ -131,22 +131,23 @@ public void run() { geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), - daoGeneticAlteration + overwriteExisting, + daoGene ); - genericAssayProfileImporter.importData(numLines); + genericAssayProfileImporter.importData(); } } else if( geneticProfile.getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION - && 
DISCRETE_LONG.name().equals(geneticProfile.getDatatype()) + && DISCRETE_LONG.name().equals(geneticProfile.getOtherMetaDataField("datatype")) ) { Set namespaces = GeneticProfileReader.getNamespaces(descriptorFile); ImportCnaDiscreteLongData importer = new ImportCnaDiscreteLongData( - dataFile, - geneticProfile.getGeneticProfileId(), + dataFile, + geneticProfile.getGeneticProfileId(), genePanel, daoGene, - daoGeneticAlteration, - namespaces + namespaces, + overwriteExisting ); importer.importData(); } else { @@ -155,13 +156,14 @@ public void run() { geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, - daoGeneticAlteration + overwriteExisting, + daoGene ); String pdAnnotationsFilename = geneticProfile.getOtherMetaDataField("pd_annotations_filename"); if (pdAnnotationsFilename != null && !"".equals(pdAnnotationsFilename)) { importer.setPdAnnotationsFile(new File(dataFile.getParent(), pdAnnotationsFilename)); } - importer.importData(numLines); + importer.importData(); } } catch (Exception e) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportReferenceGenome.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportReferenceGenome.java index 1230608a..0e3b032e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportReferenceGenome.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportReferenceGenome.java @@ -17,20 +17,27 @@ package org.mskcc.cbio.portal.scripts; -import org.apache.commons.lang3.StringUtils; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.ReferenceGenome; -import org.mskcc.cbio.portal.util.*; - import joptsimple.OptionException; import joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; - -import java.io.*; -import java.util.*; -import java.text.SimpleDateFormat; +import org.apache.commons.lang3.StringUtils; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoReferenceGenome; +import 
org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.ReferenceGenome; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.HashSet; +import java.util.Set; /** * Command Line Tool to Import Reference Genome Used by Molecular Profiling. @@ -108,8 +115,6 @@ private static void addReferenceGenomesToDB(Set referenceGenome @Override public void run() { try { - SpringUtil.initDataSource(); - String description = "Update reference_genome table "; // using a real options parser, helps avoid bugs diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java index d04124ba..ea1e3730 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java @@ -1,17 +1,37 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; -import joptsimple.*; -import java.util.*; -import java.util.regex.*; -import java.util.stream.Collectors; - +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; import org.apache.commons.collections4.map.MultiKeyMap; import org.cbioportal.model.ResourceType; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.DaoResourceData; +import org.mskcc.cbio.portal.dao.DaoResourceDefinition; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import 
org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.model.ResourceDefinition; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.MissingValues; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; public class ImportResourceData extends ConsoleRunnable { @@ -438,7 +458,6 @@ public void run() { relaxed = true; } - SpringUtil.initDataSource(); CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); if (cancerStudy == null) { throw new IllegalArgumentException("Unknown cancer study: " + cancerStudyStableId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java index 5811ee44..a4b7b7ec 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java @@ -1,13 +1,28 @@ package org.mskcc.cbio.portal.scripts; +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; import org.cbioportal.model.ResourceType; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; -import joptsimple.*; -import java.util.*; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoResourceDefinition; +import 
org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ResourceDefinition; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.MissingValues; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; public class ImportResourceDefinition extends ConsoleRunnable { @@ -290,7 +305,6 @@ public void run() { relaxed = true; } - SpringUtil.initDataSource(); CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); if (cancerStudy == null) { throw new IllegalArgumentException("Unknown cancer study: " + cancerStudyStableId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java index 411de87f..ea85fe03 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java @@ -32,12 +32,26 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; -import java.util.*; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.model.SampleList; +import org.mskcc.cbio.portal.model.SampleListCategory; +import org.mskcc.cbio.portal.util.CaseList; +import org.mskcc.cbio.portal.util.CaseListReader; +import 
org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.validate.CaseListValidator; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; /** * Command Line tool to Import Sample Lists. @@ -46,51 +60,25 @@ public class ImportSampleList extends ConsoleRunnable { public static void importSampleList(File dataFile) throws IOException, DaoException { ProgressMonitor.setCurrentMessage("Read data from: " + dataFile.getAbsolutePath()); - Properties properties = new TrimmedProperties(); - properties.load(new FileInputStream(dataFile)); - - String stableId = properties.getProperty("stable_id"); - - if (stableId.contains(" ")) { - throw new IllegalArgumentException("stable_id cannot contain spaces: " + stableId); - } - - if (stableId == null || stableId.length() == 0) { - throw new IllegalArgumentException("stable_id is not specified."); - } + CaseList caseList = CaseListReader.readFile(dataFile); + CaseListValidator.validateAll(caseList); - String cancerStudyIdentifier = properties.getProperty("cancer_study_identifier"); - if (cancerStudyIdentifier == null) { - throw new IllegalArgumentException("cancer_study_identifier is not specified."); - } - SpringUtil.initDataSource(); - CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyIdentifier); + CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(caseList.getCancerStudyIdentifier()); if (theCancerStudy == null) { throw new IllegalArgumentException("cancer study identified by cancer_study_identifier '" - + cancerStudyIdentifier + "' not found in dbms or inaccessible to user."); + + caseList.getCancerStudyIdentifier() + "' not found in dbms or inaccessible to user."); } - String sampleListName = properties.getProperty("case_list_name"); - - String sampleListCategoryStr = properties.getProperty("case_list_category"); + String sampleListCategoryStr = 
caseList.getCategory(); if (sampleListCategoryStr == null || sampleListCategoryStr.length() == 0) { sampleListCategoryStr = "other"; } SampleListCategory sampleListCategory = SampleListCategory.get(sampleListCategoryStr); - String sampleListDescription = properties.getProperty("case_list_description"); - String sampleListStr = properties.getProperty("case_list_ids"); - if (sampleListName == null) { - throw new IllegalArgumentException("case_list_name is not specified."); - } else if (sampleListDescription == null) { - throw new IllegalArgumentException("case_list_description is not specified."); - } - boolean itemsAddedViaPatientLink = false; // construct sample id list ArrayList sampleIDsList = new ArrayList(); - String[] sampleIds = sampleListStr.split("\t"); - for (String sampleId : sampleIds) { + for (String sampleId : caseList.getSampleIds()) { sampleId = StableIdUtil.getSampleId(sampleId); Sample s = DaoSample.getSampleByCancerStudyAndSampleId(theCancerStudy.getInternalId(), sampleId); if (s==null) { @@ -110,31 +98,31 @@ public static void importSampleList(File dataFile) throws IOException, DaoExcept } else if (!sampleIDsList.contains(s.getStableId())) { sampleIDsList.add(s.getStableId()); } else { - ProgressMonitor.logWarning("Warning: duplicated sample ID "+s.getStableId()+" in case list "+stableId); + ProgressMonitor.logWarning("Warning: duplicated sample ID " + s.getStableId() + " in case list " + caseList.getStableId()); } } DaoSampleList daoSampleList = new DaoSampleList(); - SampleList sampleList = daoSampleList.getSampleListByStableId(stableId); + SampleList sampleList = daoSampleList.getSampleListByStableId(caseList.getStableId()); if (sampleList != null) { - throw new IllegalArgumentException("Patient list with this stable Id already exists: " + stableId); + throw new IllegalArgumentException("Patient list with this stable Id already exists: " + caseList.getStableId()); } sampleList = new SampleList(); - sampleList.setStableId(stableId); + 
sampleList.setStableId(caseList.getStableId()); int cancerStudyId = theCancerStudy.getInternalId(); sampleList.setCancerStudyId(cancerStudyId); sampleList.setSampleListCategory(sampleListCategory); - sampleList.setName(sampleListName); - sampleList.setDescription(sampleListDescription); + sampleList.setName(caseList.getName()); + sampleList.setDescription(caseList.getDescription()); sampleList.setSampleList(sampleIDsList); daoSampleList.addSampleList(sampleList); - sampleList = daoSampleList.getSampleListByStableId(stableId); + sampleList = daoSampleList.getSampleListByStableId(caseList.getStableId()); ProgressMonitor.setCurrentMessage(" --> stable ID: " + sampleList.getStableId()); ProgressMonitor.setCurrentMessage(" --> sample list name: " + sampleList.getName()); - ProgressMonitor.setCurrentMessage(" --> number of samples in file: " + sampleIds.length); + ProgressMonitor.setCurrentMessage(" --> number of samples in file: " + caseList.getSampleIds().size()); String warningSamplesViaPatientLink = (itemsAddedViaPatientLink? 
"(nb: can be higher if samples were added via patient link)" : ""); ProgressMonitor.setCurrentMessage(" --> number of samples stored in final sample list " + warningSamplesViaPatientLink + ": " + sampleIDsList.size()); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSif.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSif.java index 75589416..6763e41c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSif.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSif.java @@ -32,12 +32,20 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.io.*; -import java.util.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; /** * Command Line to Import SIF Interactions. 
@@ -151,8 +159,6 @@ public static void main(String[] args) { } ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); - try { File geneFile = new File(args[0]); String dataSource = args[1]; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index 50ebd329..d7e4b66a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -34,7 +34,6 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.ArrayList; import java.util.HashSet; import java.util.Set; @@ -46,22 +45,25 @@ public class ImportStructuralVariantData { // Initialize variables - private File structuralVariantFile; - private int geneticProfileId; - private String genePanel; - private Set namespaces; - private Set sampleSet = new HashSet<>(); + private final File structuralVariantFile; + private final int geneticProfileId; + private final Integer genePanelId; + private final Set namespaces; + + private final boolean isIncrementalUpdateMode; public ImportStructuralVariantData( File structuralVariantFile, int geneticProfileId, String genePanel, - Set namespaces + Set namespaces, + boolean isIncrementalUpdateMode ) throws DaoException { this.structuralVariantFile = structuralVariantFile; this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; + this.genePanelId = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); this.namespaces = namespaces; + this.isIncrementalUpdateMode = isIncrementalUpdateMode; } public void importData() throws IOException, DaoException { @@ -75,15 +77,15 @@ public void importData() throws IOException, DaoException { int recordCount = 0; // Genetic profile is read in first GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); - ArrayList orderedSampleList = new ArrayList(); + Set sampleIds = new HashSet<>(); long id = DaoStructuralVariant.getLargestInternalId(); Set uniqueSVs = new HashSet<>(); while ((line = buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if( !line.startsWith("#") && line.trim().length() > 0) { + if(TsvUtil.isDataLine(line)) { recordCount++; - String parts[] = line.split("\t", -1); + String parts[] = TsvUtil.splitTsvLine(line); StructuralVariant structuralVariant = structuralVariantUtil.parseStructuralVariantRecord(parts); structuralVariant.setInternalId(++id); structuralVariant.setGeneticProfileId(geneticProfileId); @@ -175,21 +177,16 @@ public void importData() throws IOException, DaoException { // Add structural variant DaoStructuralVariant.addStructuralVariantToBulkLoader(structuralVariant); - // Add sample to sample profile list, which is important for gene panels - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId) && !sampleSet.contains(sample.getStableId())) { - if (genePanel != null) { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, GeneticProfileUtil.getGenePanelId(genePanel)); - } else { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, null); - } - } - sampleSet.add(sample.getStableId()); - orderedSampleList.add(sample.getInternalId()); + sampleIds.add(sample.getInternalId()); } } } } - DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + + 
DaoSampleProfile.upsertSampleToProfileMapping(sampleIds, geneticProfileId, genePanelId); + if (isIncrementalUpdateMode) { + DaoStructuralVariant.deleteStructuralVariants(geneticProfileId, sampleIds); + } buf.close(); MySQLbulkLoader.flushAll(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index b984abf4..75143ba3 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -32,19 +32,50 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.*; +import org.apache.commons.lang3.ArrayUtils; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneset; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.JdbcUtil; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.Geneset; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.CnaUtil; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.DataValidator; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.GeneticProfileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.TsvUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.AbstractMap; +import 
java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.IntStream; import java.util.stream.Stream; -import org.apache.commons.lang3.ArrayUtils; -import org.cbioportal.model.EntityType; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - /** * Code to Import Copy Number Alteration, MRNA Expression Data, Methylation, or protein RPPA data @@ -53,12 +84,8 @@ */ public class ImportTabDelimData { public static final String CNA_VALUE_AMPLIFICATION = "2"; - public static final String CNA_VALUE_GAIN = "1"; - public static final String CNA_VALUE_HEMIZYGOUS_DELETION = "-1"; public static final String CNA_VALUE_HOMOZYGOUS_DELETION = "-2"; public static final String CNA_VALUE_PARTIAL_DELETION = "-1.5"; - public static final String CNA_VALUE_ZERO = "0"; - private HashSet importedGeneticEntitySet = new HashSet<>(); private File dataFile; private String targetLine; private int geneticProfileId; @@ -66,11 +93,18 @@ public class ImportTabDelimData { private int entriesSkipped = 0; private int nrExtraRecords = 0; private Set arrayIdSet = new HashSet(); - private String genePanel; private String genericEntityProperties; private File pdAnnotationsFile; private Map, Map> pdAnnotations; - private final GeneticAlterationImporter geneticAlterationImporter; + private GeneticAlterationImporter geneticAlterationImporter; + + private int numLines; + + private DaoGeneOptimized daoGene; + + private boolean isIncrementalUpdateMode; + private ArrayList orderedSampleList; + private final Integer genePanelId; /** * Constructor. @@ -81,7 +115,8 @@ public class ImportTabDelimData { * @param geneticProfileId GeneticProfile ID. 
* @param genePanel GenePanel * @param genericEntityProperties Generic Assay Entities. - * + * @param isIncrementalUpdateMode if true, update/append data to the existing one + * * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? */ public ImportTabDelimData( @@ -90,17 +125,11 @@ public ImportTabDelimData( int geneticProfileId, String genePanel, String genericEntityProperties, - DaoGeneticAlteration daoGeneticAlteration + boolean isIncrementalUpdateMode, + DaoGeneOptimized daoGene ) { - this.dataFile = dataFile; - this.targetLine = targetLine; - this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; - this.genericEntityProperties = genericEntityProperties; - this.geneticAlterationImporter = new GeneticAlterationImporter( - geneticProfileId, - daoGeneticAlteration - ); + this(dataFile, targetLine, geneticProfileId, genePanel, isIncrementalUpdateMode, daoGene); + this.genericEntityProperties = genericEntityProperties; } /** @@ -110,7 +139,8 @@ public ImportTabDelimData( * @param targetLine The line we want to import. * If null, all lines are imported. * @param geneticProfileId GeneticProfile ID. - * + * @param isIncrementalUpdateMode if true, update/append data to the existing one + * * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? */ public ImportTabDelimData( @@ -118,18 +148,17 @@ public ImportTabDelimData( String targetLine, int geneticProfileId, String genePanel, - DaoGeneticAlteration daoGeneticAlteration + boolean isIncrementalUpdateMode, + DaoGeneOptimized daoGene ) { - this.dataFile = dataFile; + this(dataFile, geneticProfileId, genePanel, isIncrementalUpdateMode, daoGene); this.targetLine = targetLine; - this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; - this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); } /** * Constructor. 
* + * @param isIncrementalUpdateMode if true, update/append data to the existing one * @param dataFile Data File containing Copy Number Alteration, MRNA Expression Data, or protein RPPA data * @param geneticProfileId GeneticProfile ID. */ @@ -137,28 +166,48 @@ public ImportTabDelimData( File dataFile, int geneticProfileId, String genePanel, - DaoGeneticAlteration daoGeneticAlteration + boolean isIncrementalUpdateMode, + DaoGeneOptimized daoGene ) { this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; - this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); + this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); + this.isIncrementalUpdateMode = isIncrementalUpdateMode; + this.daoGene = daoGene; + this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); + if (this.isIncrementalUpdateMode + && geneticProfile != null + && this.geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENESET_SCORE) { + throw new UnsupportedOperationException("Incremental upload of geneset scores is not supported."); + } } /** * Import the Copy Number Alteration, mRNA Expression, protein RPPA, GSVA or generic_assay data * - * @throws IOException IO Error. - * @throws DaoException Database Error. 
*/ - public void importData(int numLines) throws IOException, DaoException { - - geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); - + public void importData() { + JdbcUtil.getTransactionTemplate().execute(status -> { + try { + doImportData(); + } catch (Throwable e) { + status.setRollbackOnly(); + throw new RuntimeException(e); + } + return null; + }); + } + private void doImportData() throws IOException, DaoException { + try { + this.numLines = FileUtil.getNumLines(dataFile); + } catch (IOException e) { + throw new RuntimeException(e); + } + ProgressMonitor.setMaxValue(numLines); FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); String headerLine = buf.readLine(); - String parts[] = headerLine.split("\t"); + String[] headerParts = TsvUtil.splitTsvLine(headerLine); //Whether data regards CNA or RPPA: boolean isDiscretizedCnaProfile = geneticProfile != null @@ -166,23 +215,28 @@ public void importData(int numLines) throws IOException, DaoException { && geneticProfile.showProfileInAnalysisTab(); boolean isRppaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.PROTEIN_LEVEL - && "Composite.Element.Ref".equalsIgnoreCase(parts[0]); + && "Composite.Element.Ref".equalsIgnoreCase(headerParts[0]); boolean isGsvaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENESET_SCORE - && parts[0].equalsIgnoreCase("geneset_id"); + && headerParts[0].equalsIgnoreCase("geneset_id"); boolean isGenericAssayProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY - && parts[0].equalsIgnoreCase("ENTITY_STABLE_ID"); + && headerParts[0].equalsIgnoreCase("ENTITY_STABLE_ID"); + + long typesDetected = List.of(isDiscretizedCnaProfile, isRppaProfile, isGsvaProfile, isGenericAssayProfile).stream().filter(Boolean::booleanValue).count(); + if (typesDetected > 1) { + 
throw new IllegalStateException("More then one data type is detected."); + } int numRecordsToAdd = 0; int samplesSkipped = 0; try { - int hugoSymbolIndex = getHugoSymbolIndex(parts); - int entrezGeneIdIndex = getEntrezGeneIdIndex(parts); - int rppaGeneRefIndex = getRppaGeneRefIndex(parts); - int genesetIdIndex = getGenesetIdIndex(parts); - int sampleStartIndex = getStartIndex(parts, hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, genesetIdIndex); - int genericAssayIdIndex = getGenericAssayIdIndex(parts); + int hugoSymbolIndex = getHugoSymbolIndex(headerParts); + int entrezGeneIdIndex = getEntrezGeneIdIndex(headerParts); + int rppaGeneRefIndex = getRppaGeneRefIndex(headerParts); + int genesetIdIndex = getGenesetIdIndex(headerParts); + int sampleStartIndex = getStartIndex(headerParts, hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, genesetIdIndex); + int genericAssayIdIndex = getGenericAssayIdIndex(headerParts); if (isRppaProfile) { if (rppaGeneRefIndex == -1) { throw new RuntimeException("Error: the following column should be present for RPPA data: Composite.Element.Ref"); @@ -200,10 +254,9 @@ public void importData(int numLines) throws IOException, DaoException { } String sampleIds[]; - sampleIds = new String[parts.length - sampleStartIndex]; - System.arraycopy(parts, sampleStartIndex, sampleIds, 0, parts.length - sampleStartIndex); + sampleIds = new String[headerParts.length - sampleStartIndex]; + System.arraycopy(headerParts, sampleStartIndex, sampleIds, 0, headerParts.length - sampleStartIndex); - int nrUnknownSamplesAdded = 0; ProgressMonitor.setCurrentMessage(" --> total number of samples: " + sampleIds.length); Map, Map> pdAnnotationsForStableSampleIds = null; @@ -211,9 +264,9 @@ public void importData(int numLines) throws IOException, DaoException { pdAnnotationsForStableSampleIds = readPdAnnotations(this.pdAnnotationsFile); } // link Samples to the genetic profile - ArrayList orderedSampleList = new ArrayList(); ArrayList filteredSampleIndices = 
new ArrayList(); this.pdAnnotations = new HashMap<>(); + this.orderedSampleList = new ArrayList<>(); for (int i = 0; i < sampleIds.length; i++) { Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), StableIdUtil.getSampleId(sampleIds[i])); @@ -228,10 +281,6 @@ public void importData(int numLines) throws IOException, DaoException { throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(sampleIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath()); } } - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { - Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); - } orderedSampleList.add(sample.getInternalId()); if (pdAnnotationsForStableSampleIds != null) { Set> keys = new HashSet<>(pdAnnotationsForStableSampleIds.keySet()); @@ -248,21 +297,15 @@ public void importData(int numLines) throws IOException, DaoException { if (pdAnnotationsForStableSampleIds != null && !pdAnnotationsForStableSampleIds.keySet().isEmpty()) { ProgressMonitor.logWarning("WARNING: Following pd annotation sample-entrezId pairs newer used in the data file: " + pdAnnotationsForStableSampleIds.keySet()); } - if (nrUnknownSamplesAdded > 0) { - ProgressMonitor.logWarning("WARNING: Number of samples added on the fly because they were missing in clinical data: " + nrUnknownSamplesAdded); - } if (samplesSkipped > 0) { ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + samplesSkipped); } ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines - 1)); - DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); - - //Gene cache: - DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); + this.geneticAlterationImporter = isIncrementalUpdateMode ? 
new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) + : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); - //Object to insert records in the generic 'genetic_alteration' table: - DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); + geneticAlterationImporter.initialize(); //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); @@ -277,28 +320,62 @@ public void importData(int numLines) throws IOException, DaoException { genericAssayStableIdToEntityIdMap = GenericAssayMetaUtils.buildGenericAssayStableIdToEntityIdMap(); } - int lenParts = parts.length; String line = buf.readLine(); while (line != null) { + ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); boolean recordAdded = false; - // either parse line as geneset or gene for importing into 'genetic_alteration' table - if (isGsvaProfile) { - recordAdded = parseGenesetLine(line, lenParts, sampleStartIndex, genesetIdIndex, - filteredSampleIndices, daoGeneticAlteration); - } else if (isGenericAssayProfile) { - recordAdded = parseGenericAssayLine(line, lenParts, sampleStartIndex, genericAssayIdIndex, - filteredSampleIndices, daoGeneticAlteration, genericAssayStableIdToEntityIdMap); - } else { - recordAdded = parseLine(line, lenParts, sampleStartIndex, - hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, - isRppaProfile, isDiscretizedCnaProfile, - daoGene, - filteredSampleIndices, orderedSampleList, - existingCnaEvents); + if (TsvUtil.isDataLine(line)) { + String[] rowParts = TsvUtil.splitTsvLine(line); + + TsvUtil.ensureHeaderAndRowMatch(headerParts, rowParts); + String[] sampleValues = ArrayUtils.subarray(rowParts, sampleStartIndex, rowParts.length); + + // trim whitespace from values + sampleValues = Stream.of(sampleValues).map(String::trim).toArray(String[]::new); + sampleValues = filterOutNormalValues(filteredSampleIndices, sampleValues); + + // either parse line as geneset or gene for importing 
into 'genetic_alteration' table + if (isGsvaProfile) { + String genesetId = rowParts[genesetIdIndex]; + recordAdded = saveGenesetLine(sampleValues, genesetId); + } else if (isGenericAssayProfile) { + String genericAssayId = rowParts[genericAssayIdIndex]; + recordAdded = saveGenericAssayLine(sampleValues, genericAssayId, genericAssayStableIdToEntityIdMap); + } else { + String geneSymbol = null; + if (hugoSymbolIndex != -1) { + geneSymbol = rowParts[hugoSymbolIndex]; + } + if (rppaGeneRefIndex != -1) { + geneSymbol = rowParts[rppaGeneRefIndex]; + } + if (geneSymbol != null && geneSymbol.isEmpty()) { + geneSymbol = null; + } + //get entrez + String entrez = null; + if (entrezGeneIdIndex != -1) { + entrez = rowParts[entrezGeneIdIndex]; + } + if (entrez != null && entrez.isEmpty()) { + entrez = null; + } + if (entrez != null && !DataValidator.isValidNumericSequence(entrez)) { + ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); + } else { + String firstCellValue = rowParts[0]; + if (targetLine == null || firstCellValue.equals(targetLine)) { + recordAdded = saveLine(sampleValues, + entrez, geneSymbol, + isRppaProfile, isDiscretizedCnaProfile, + existingCnaEvents); + } + } + } } // increment number of records added or entries skipped @@ -311,6 +388,8 @@ public void importData(int numLines) throws IOException, DaoException { line = buf.readLine(); } + DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelId); + geneticAlterationImporter.finalize(); if (MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); } @@ -365,7 +444,7 @@ private Map, Map> readPdAnnotations(File String line = reader.readLine(); while (line != null) { - String[] row = line.split("\t", -1); + String[] row = TsvUtil.splitTsvLine(line); if (row.length < 6) { throw new RuntimeException("Mis-formatted row: " + String.join(", ", row)); } @@ -483,363 +562,231 @@ private Map, Map> readPdAnnotations(File * AMIXED0... 
* * - * @param line the line from the profile data file to be parsed - * @param nrColumns the number of columns, defined by the header line - * @param sampleStartIndex the index of the first column with a sample name in the header field - * @param hugoSymbolIndex the index of the column Hugo_Symbol - * @param entrezGeneIdIndex the index of the column Entrez_Gene_Id - * @param rppaGeneRefIndex the index of the column Composite.Element.Ref * @param isRppaProfile true if this is an rppa profile (i.e. alteration type is PROTEIN_LEVEL and the first column is Composite.Element.Ref) * @param isDiscretizedCnaProfile true if this is a discretized CNA profile (i.e. alteration type COPY_NUMBER_ALTERATION and showProfileInAnalysisTab is true) - * @param daoGene an instance of DaoGeneOptimized ... for use in resolving gene symbols - * @param orderedSampleList a list of the internal sample ids corresponding to the sample names in the header line * @param existingCnaEvents a collection of CnaEvents, to be added to or updated during parsing of individual lines * @return true if any record was stored in genetic_alteration, else false * @throws DaoException if any DaoException is thrown while using daoGene or daoGeneticAlteration */ - private boolean parseLine(String line, int nrColumns, int sampleStartIndex, - int hugoSymbolIndex, int entrezGeneIdIndex, int rppaGeneRefIndex, - boolean isRppaProfile, boolean isDiscretizedCnaProfile, - DaoGeneOptimized daoGene, - List filteredSampleIndices, List orderedSampleList, - Set existingCnaEvents + private boolean saveLine(String[] values, + String entrez, + String geneSymbol, + boolean isRppaProfile, + boolean isDiscretizedCnaProfile, + Set existingCnaEvents ) throws DaoException { - //TODO: refactor this entire function - split functionality into smaller units / subroutines - boolean recordStored = false; - // Ignore lines starting with # - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); + 
if (isRppaProfile && geneSymbol == null) { + ProgressMonitor.logWarning("Ignoring line with no Composite.Element.REF value"); + return false; + } - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - values = filterOutNormalValues(filteredSampleIndices, values); + //If all are empty, skip line: + boolean noGeneSpecified = geneSymbol == null && entrez == null; + if (noGeneSpecified) { + ProgressMonitor.logWarning("Ignoring line with no Hugo_Symbol and no Entrez_Id"); + return false; + } - String geneSymbol = null; - if (hugoSymbolIndex != -1) { - geneSymbol = parts[hugoSymbolIndex]; - } - //RPPA: //TODO - we should split up the RPPA scenario from this code...too many if/else because of this - if (rppaGeneRefIndex != -1) { - geneSymbol = parts[rppaGeneRefIndex]; - } - if (geneSymbol != null && geneSymbol.isEmpty()) { - geneSymbol = null; + if (geneSymbol != null) { + boolean multipleGenesLine = geneSymbol.contains("///"); + if (multipleGenesLine) { + ProgressMonitor.logWarning("Ignoring gene symbol: " + geneSymbol + + " It is separated by ///. This indicates that the line contains information regarding multiple genes, and we cannot currently handle this"); + return false; } - if (isRppaProfile && geneSymbol == null) { - ProgressMonitor.logWarning("Ignoring line with no Composite.Element.REF value"); + boolean unknownGene = geneSymbol.contains("---"); + if (unknownGene) { + ProgressMonitor.logWarning("Ignoring gene symbol: " + geneSymbol + + " It is specified as ---. 
This indicates that the line contains information regarding an unknown gene, and we cannot currently handle this"); return false; } - //get entrez - String entrez = null; - if (entrezGeneIdIndex != -1) { - entrez = parts[entrezGeneIdIndex]; + } + + List genes; + //If rppa, parse genes from "Composite.Element.REF" column: + if (isRppaProfile) { + genes = parseRPPAGenes(geneSymbol); + } else { + genes = parseGenes(entrez, geneSymbol); + } + + //if genes still null, skip current record + if (genes == null || genes.isEmpty()) { + ProgressMonitor.logWarning("Gene with Entrez_Id " + entrez + " and gene symbol" + geneSymbol +" not found. Record will be skipped for this gene."); + return false; + } + + List genesMatchingAnAlias = Collections.emptyList(); + if (geneSymbol != null) { + genesMatchingAnAlias = daoGene.getGenesForAlias(geneSymbol); + } + + Set microRNAGenes = new HashSet<>(); + Set nonMicroRNAGenes = new HashSet<>(); + Iterator geneIterator = Stream.concat(genes.stream(), genesMatchingAnAlias.stream()).iterator(); + while (geneIterator.hasNext()) { + CanonicalGene g = geneIterator.next(); + if ("miRNA".equals(g.getType())) { + microRNAGenes.add(g); + } else { + nonMicroRNAGenes.add(g); } - if (entrez != null) { - if (entrez.isEmpty()) { - entrez = null; - } - else if (!entrez.matches("[0-9]+")) { - //TODO - would be better to give an exception in some cases, like negative Entrez values - ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); - return false; + } + if (!microRNAGenes.isEmpty()) { + // for micro rna, duplicate the data + for (CanonicalGene gene : microRNAGenes) { + if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + recordStored = true; } } - - //If all are empty, skip line: - if (geneSymbol == null && entrez == null) { - ProgressMonitor.logWarning("Ignoring line with no Hugo_Symbol and no Entrez_Id"); + if (!recordStored) { + if (nonMicroRNAGenes.isEmpty()) { + // this means that no microRNA records 
could not be stored + ProgressMonitor.logWarning("Could not store microRNA data"); + } else { + // this case : + // - at least one of the entrez-gene-ids was not a microRNA + // - all of the matched microRNA ids (if any) failed to be imported (presumably already imported on a prior line) + ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous (a mixture of microRNA and other types). Record will be skipped for this gene."); + } return false; + } + } else { + // none of the matched genes are type "miRNA" + if (genes.size() == 1) { + // Store all values per gene: + recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); + //only add extra CNA related records if the step above worked, otherwise skip: + if (recordStored && isDiscretizedCnaProfile) { + if (isIncrementalUpdateMode) { + DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedSampleList); + } + long entrezGeneId = genes.get(0).getEntrezGeneId(); + CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, entrezGeneId)); + } } else { - if (geneSymbol != null && (geneSymbol.contains("///") || geneSymbol.contains("---"))) { - // Ignore gene IDs separated by ///. This indicates that - // the line contains information regarding multiple genes, and - // we cannot currently handle this. - // Also, ignore gene IDs that are specified as ---. This indicates - // the line contains information regarding an unknown gene, and - // we cannot currently handle this. 
- ProgressMonitor.logWarning("Ignoring gene ID: " + geneSymbol); - return false; + if (isRppaProfile) { // for protein data, duplicate the data + recordStored = saveRppaValues(values, recordStored, genes, geneSymbol); } else { - List genes = null; - //If rppa, parse genes from "Composite.Element.REF" column: - if (isRppaProfile) { - genes = parseRPPAGenes(geneSymbol); - if (genes == null) { - //will be null when there is a parse error in this case, so we - //can return here and avoid duplicated messages: - return false; - } - if (genes.isEmpty()) { - String gene = (geneSymbol != null) ? geneSymbol : entrez; - ProgressMonitor.logWarning("Gene not found for: [" + gene - + "]. Ignoring it " - + "and all tab-delimited data associated with it!"); - return false; - } - } else { - //try entrez: - if (entrez != null) { - CanonicalGene gene = daoGene.getGene(Long.parseLong(entrez)); - if (gene != null) { - genes = Arrays.asList(gene); - } - } - //no entrez or could not resolve by entrez, try hugo: - if ((genes == null || genes.isEmpty()) && geneSymbol != null) { - // deal with multiple symbols separate by |, use the first one - int ix = geneSymbol.indexOf("|"); - if (ix > 0) { - geneSymbol = geneSymbol.substring(0, ix); - } - genes = daoGene.getGene(geneSymbol, true); - } - //if genes still null, skip current record - if (genes == null || genes.isEmpty()) { - ProgressMonitor.logWarning("Entrez_Id " + entrez + " not found. Record will be skipped for this gene."); - return false; - } + if (!recordStored) { + // this case : + // - the hugo gene symbol was ambiguous (matched multiple entrez-gene-ids) + ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous. Record will be skipped for this gene."); } + } + } + } + return recordStored; + } - // If targetLine is specified and does not match the current line, skip the current line. 
- if (targetLine != null && !(parts[0].equals(targetLine))) { - return false; - } + private boolean saveRppaValues(String[] values, boolean recordStored, List genes, String geneSymbol) throws DaoException { + for (CanonicalGene gene : genes) { + if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + recordStored = true; + nrExtraRecords++; + } + } + if (recordStored) { + //skip one, to avoid double counting: + nrExtraRecords--; + } else { + // this means that RPPA could not be stored + ProgressMonitor.logWarning("Could not store RPPA data"); + } + return recordStored; + } - List genesMatchingAnAlias = Collections.emptyList(); - if (geneSymbol != null) { - genesMatchingAnAlias = daoGene.getGenesForAlias(geneSymbol); - } + private List parseGenes(String entrez, String geneSymbol) { + //try entrez: + if (entrez != null) { + CanonicalGene gene = daoGene.getGene(Long.parseLong(entrez)); + if (gene != null) { + return Arrays.asList(gene); + } + } + //no entrez or could not resolve by entrez, try hugo: + if (geneSymbol != null) { + // deal with multiple symbols separate by |, use the first one + int ix = geneSymbol.indexOf("|"); + if (ix > 0) { + geneSymbol = geneSymbol.substring(0, ix); + } + return daoGene.getGene(geneSymbol, true); + } + return List.of(); + } - Set microRNAGenes = new HashSet<>(); - Set nonMicroRNAGenes = new HashSet<>(); - Iterator geneIterator = Stream.concat(genes.stream(), genesMatchingAnAlias.stream()).iterator(); - while (geneIterator.hasNext()) { - CanonicalGene g = geneIterator.next(); - if ("miRNA".equals(g.getType())) { - microRNAGenes.add(g); - } else { - nonMicroRNAGenes.add(g); - } - } - if (!microRNAGenes.isEmpty()) { - // for micro rna, duplicate the data - for (CanonicalGene gene : microRNAGenes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { - recordStored = true; - } - } - if (!recordStored) { - if (nonMicroRNAGenes.isEmpty()) { - // this means that no microRNA records could not be stored - 
ProgressMonitor.logWarning("Could not store microRNA data"); - } else { - // this case : - // - at least one of the entrez-gene-ids was not a microRNA - // - all of the matched microRNA ids (if any) failed to be imported (presumably already imported on a prior line) - ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous (a mixture of microRNA and other types). Record will be skipped for this gene."); - } - return false; - } - } else { - // none of the matched genes are type "miRNA" - if (genes.size() == 1) { - List cnaEventsToAdd = new ArrayList(); - - if (isDiscretizedCnaProfile) { - long entrezGeneId = genes.get(0).getEntrezGeneId(); - for (int i = 0; i < values.length; i++) { - - // temporary solution -- change partial deletion back to full deletion. - if (values[i].equals(CNA_VALUE_PARTIAL_DELETION)) { - values[i] = CNA_VALUE_HOMOZYGOUS_DELETION; - } - if (values[i].equals(CNA_VALUE_AMPLIFICATION) - // || values[i].equals(CNA_VALUE_GAIN) >> skipping GAIN, ZERO, HEMIZYGOUS_DELETION to minimize size of dataset in DB - // || values[i].equals(CNA_VALUE_ZERO) - // || values[i].equals(CNA_VALUE_HEMIZYGOUS_DELETION) - || values[i].equals(CNA_VALUE_HOMOZYGOUS_DELETION) - ) { - Integer sampleId = orderedSampleList.get(i); - CnaEvent cnaEvent = new CnaEvent(sampleId, geneticProfileId, entrezGeneId, Short.parseShort(values[i])); - //delayed add: - AbstractMap.SimpleEntry sampleGenePair = new AbstractMap.SimpleEntry<>(sampleId, entrezGeneId); - Map pdAnnotationDetails = this.pdAnnotations.get(sampleGenePair); - if (pdAnnotationDetails != null) { - cnaEvent.setDriverFilter(pdAnnotationDetails.get("DRIVER_FILTER")); - cnaEvent.setDriverFilterAnnotation(pdAnnotationDetails.get("DRIVER_FILTER_ANNOTATION")); - cnaEvent.setDriverTiersFilter(pdAnnotationDetails.get("DRIVER_TIERS_FILTER")); - cnaEvent.setDriverTiersFilterAnnotation(pdAnnotationDetails.get("DRIVER_TIERS_FILTER_ANNOTATION")); - } - cnaEventsToAdd.add(cnaEvent); - } - } - } - // Store all 
values per gene: - recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); - //only add extra CNA related records if the step above worked, otherwise skip: - if (recordStored) { - CnaUtil.storeCnaEvents(existingCnaEvents, cnaEventsToAdd); - } - } else { - if (isRppaProfile) { // for protein data, duplicate the data - for (CanonicalGene gene : genes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { - recordStored = true; - nrExtraRecords++; - } - } - if (recordStored) { - //skip one, to avoid double counting: - nrExtraRecords--; - } else { - // this means that RPPA could not be stored - ProgressMonitor.logWarning("Could not store RPPA data"); - } - } else { - if (!recordStored) { - // this case : - // - the hugo gene symbol was ambiguous (matched multiple entrez-gene-ids) - ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous. Record will be skipped for this gene."); - } - } - } - } + private List composeCnaEventsToAdd(String[] values, long entrezGeneId) { + List cnaEventsToAdd = new ArrayList(); + for (int i = 0; i < values.length; i++) { + + // temporary solution -- change partial deletion back to full deletion. 
+ if (values[i].equals(CNA_VALUE_PARTIAL_DELETION)) { + values[i] = CNA_VALUE_HOMOZYGOUS_DELETION; + } + if (values[i].equals(CNA_VALUE_AMPLIFICATION) + // || values[i].equals(CNA_VALUE_GAIN) >> skipping GAIN, ZERO, HEMIZYGOUS_DELETION to minimize size of dataset in DB + // || values[i].equals(CNA_VALUE_ZERO) + // || values[i].equals(CNA_VALUE_HEMIZYGOUS_DELETION) + || values[i].equals(CNA_VALUE_HOMOZYGOUS_DELETION) + ) { + Integer sampleId = orderedSampleList.get(i); + CnaEvent cnaEvent = new CnaEvent(sampleId, geneticProfileId, entrezGeneId, Short.parseShort(values[i])); + //delayed add: + AbstractMap.SimpleEntry sampleGenePair = new AbstractMap.SimpleEntry<>(sampleId, entrezGeneId); + Map pdAnnotationDetails = this.pdAnnotations.get(sampleGenePair); + if (pdAnnotationDetails != null) { + cnaEvent.setDriverFilter(pdAnnotationDetails.get("DRIVER_FILTER")); + cnaEvent.setDriverFilterAnnotation(pdAnnotationDetails.get("DRIVER_FILTER_ANNOTATION")); + cnaEvent.setDriverTiersFilter(pdAnnotationDetails.get("DRIVER_TIERS_FILTER")); + cnaEvent.setDriverTiersFilterAnnotation(pdAnnotationDetails.get("DRIVER_TIERS_FILTER_ANNOTATION")); } + cnaEventsToAdd.add(cnaEvent); } } - return recordStored; + return cnaEventsToAdd; } /** * Parses line for gene set record and stores record in 'genetic_alteration' table. 
- * @param line - * @param nrColumns - * @param sampleStartIndex - * @param genesetIdIndex - * @param filteredSampleIndices - * @param daoGeneticAlteration + * @param genesetId * @return * @throws DaoException */ - private boolean parseGenesetLine(String line, int nrColumns, int sampleStartIndex, int genesetIdIndex, - List filteredSampleIndices, DaoGeneticAlteration daoGeneticAlteration) throws DaoException { + private boolean saveGenesetLine(String[] values, String genesetId) throws DaoException { boolean storedRecord = false; - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); - - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } - - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - - // trim whitespace from values - values = Stream.of(values).map(String::trim).toArray(String[]::new); - values = filterOutNormalValues(filteredSampleIndices, values); - - Geneset geneset = DaoGeneset.getGenesetByExternalId(parts[genesetIdIndex]); - if (geneset != null) { - storedRecord = storeGeneticEntityGeneticAlterations(values, daoGeneticAlteration, geneset.getGeneticEntityId(), - EntityType.GENESET, geneset.getExternalId()); - } - else { - ProgressMonitor.logWarning("Geneset " + parts[genesetIdIndex] + " not found in DB. Record will be skipped."); - } + Geneset geneset = DaoGeneset.getGenesetByExternalId(genesetId); + if (geneset != null) { + storedRecord = this.geneticAlterationImporter.store(geneset.getGeneticEntityId(), values); + } + else { + ProgressMonitor.logWarning("Geneset " + genesetId + " not found in DB. 
Record will be skipped."); } return storedRecord; } /** * Parses line for generic assay profile record and stores record in 'genetic_alteration' table. - * @param line row from the separated-text that contains one or more values on a single sample - * @param nrColumns - * @param sampleStartIndex index of the first sample column - * @param genericAssayIdIndex index of the column that uniquely identifies a sample - * @param filteredSampleIndices - * @param daoGeneticAlteration - * @return - * @throws DaoException */ - - private boolean parseGenericAssayLine(String line, int nrColumns, int sampleStartIndex, int genericAssayIdIndex, - List filteredSampleIndices, DaoGeneticAlteration daoGeneticAlteration, Map genericAssayStableIdToEntityIdMap) throws DaoException { + private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map genericAssayStableIdToEntityIdMap) throws DaoException { boolean recordIsStored = false; - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); - - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } - - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - - // trim whitespace from values - values = Stream.of(values).map(String::trim).toArray(String[]::new); - values = filterOutNormalValues(filteredSampleIndices, values); - - String stableId = parts[genericAssayIdIndex]; - Integer entityId = genericAssayStableIdToEntityIdMap.getOrDefault(stableId, null); + Integer entityId = genericAssayStableIdToEntityIdMap.getOrDefault(genericAssayId, null); - if (entityId == null) { - ProgressMonitor.logWarning("Generic Assay entity " + parts[genericAssayIdIndex] + " not found in DB. 
Record will be skipped."); - } else { - recordIsStored = storeGeneticEntityGeneticAlterations(values, daoGeneticAlteration, entityId, - EntityType.GENERIC_ASSAY, stableId); - } - - return recordIsStored; + if (entityId == null) { + ProgressMonitor.logWarning("Generic Assay entity " + genericAssayId + " not found in DB. Record will be skipped."); + } else { + recordIsStored = this.geneticAlterationImporter.store(entityId, values); } return recordIsStored; } - /** - * Stores genetic alteration data for a genetic entity. - * @param values - * @param daoGeneticAlteration - * @param geneticEntityId - internal id for genetic entity - * @param geneticEntityType - "GENE", "GENESET", "PHOSPHOPROTEIN" - * @param geneticEntityName - hugo symbol for "GENE", external id for "GENESET", phospho gene name for "PHOSPHOPROTEIN" - * @return boolean indicating if record was stored successfully or not - */ - private boolean storeGeneticEntityGeneticAlterations(String[] values, DaoGeneticAlteration daoGeneticAlteration, - Integer geneticEntityId, EntityType geneticEntityType, String geneticEntityName) { - try { - if (importedGeneticEntitySet.add(geneticEntityId)) { - daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values); - return true; - } - else { - ProgressMonitor.logWarning("Data for genetic entity " + geneticEntityName - + " [" + geneticEntityType + "] already imported from file. 
Record will be skipped."); - return false; - } - } - catch (Exception ex) { - throw new RuntimeException("Aborted: Error found for row starting with " + geneticEntityName + ": " + ex.getMessage()); - } - } - /** * Tries to parse the genes and look them up in DaoGeneOptimized * diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java index 0b5b182b..24903d17 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java @@ -32,14 +32,26 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.*; -import joptsimple.*; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; +import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoClinicalEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.ClinicalEvent; +import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; -import org.mskcc.cbio.portal.util.SpringUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Properties; +import java.util.Set; /** * Imports timeline data for display in patient view @@ -48,9 +60,8 @@ */ public class ImportTimelineData extends ConsoleRunnable { - private static void importData(String dataFile, int cancerStudyId) throws IOException, DaoException { + private static void importData(String dataFile, int cancerStudyId, boolean overwriteExisting) throws IOException, DaoException { MySQLbulkLoader.bulkLoadOn(); - SpringUtil.initDataSource(); 
ProgressMonitor.setCurrentMessage("Reading file " + dataFile); FileReader reader = new FileReader(dataFile); @@ -72,9 +83,10 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce throw new RuntimeException("The first line must start with\n'PATIENT_ID\tSTART_DATE\tEVENT_TYPE'\nor\n" + "PATIENT_ID\tSTART_DATE\tSTOP_DATE\tEVENT_TYPE"); } - + long clinicalEventId = DaoClinicalEvent.getLargestClinicalEventId(); - + Set processedPatientIds = new HashSet<>(); + while ((line = buff.readLine()) != null) { line = line.trim(); @@ -90,6 +102,9 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce ProgressMonitor.logWarning("Patient " + patientId + " not found in study " + cancerStudyId + ". Skipping entry."); continue; } + if (overwriteExisting && processedPatientIds.add(patient.getInternalId())) { + DaoClinicalEvent.deleteByPatientId(patient.getInternalId()); + } ClinicalEvent event = new ClinicalEvent(); event.setClinicalEventId(++clinicalEventId); event.setPatientId(patient.getInternalId()); @@ -119,17 +134,23 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce public void run() { try { String description = "Import 'timeline' data"; - + OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); - String dataFile = (String) options.valueOf("data"); + if (options.has("loadMode") && !"bulkLoad".equalsIgnoreCase((String) options.valueOf("loadMode"))) { + throw new UnsupportedOperationException("This loader supports bulkLoad load mode only, but " + + options.valueOf("loadMode") + + " has been supplied."); + } + String dataFile = (String) options.valueOf("data"); File descriptorFile = new File((String) options.valueOf("meta")); + boolean overwriteExisting = options.has("overwrite-existing"); Properties properties = new TrimmedProperties(); properties.load(new FileInputStream(descriptorFile)); int cancerStudyInternalId = 
ValidationUtils.getInternalStudyId(properties.getProperty("cancer_study_identifier")); - importData(dataFile, cancerStudyInternalId); + importData(dataFile, cancerStudyInternalId, overwriteExisting); } catch (RuntimeException e) { throw e; } catch (IOException|DaoException e) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java index 9e119f03..10bf9159 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java @@ -37,10 +37,9 @@ import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoTypeOfCancer; import org.mskcc.cbio.portal.model.TypeOfCancer; -import org.mskcc.cbio.portal.scripts.ConsoleRunnable; import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; -import org.mskcc.cbio.portal.util.SpringUtil; +import org.mskcc.cbio.portal.util.TsvUtil; /** * Load all the types of cancer and their names from a file. 
@@ -76,7 +75,6 @@ public void run() { public static void load(File file, boolean clobber) throws IOException, DaoException { ProgressMonitor.setCurrentMessage("Loading cancer types..."); List typeOfCancerList = parseCancerTypesFromFile(file); - SpringUtil.initDataSource(); if (clobber) { ProgressMonitor.setCurrentMessage("Deleting all previous cancer types..."); DaoTypeOfCancer.deleteAllRecords(); //TODO - remove this option - foreign key constraints may mean large cascade effects (possibly the deletion of all studies) - instead, change the option to 'deleteTypeOfCancerIfNotPresent' and add a loop through existing typeOfCancer records, removing those which are not in the parsed typeOfCancerList @@ -92,7 +90,7 @@ private static List parseCancerTypesFromFile(File file) throws IOE Scanner scanner = new Scanner(file); while (scanner.hasNextLine()) { String nextLine = scanner.nextLine(); - String[] fields = nextLine.split("\t", -1); + String[] fields = TsvUtil.splitTsvLine(nextLine); throwExceptionIfColumnCountIsWrong(file, nextLine, fields, EXPECTED_DATAFILE_COLUMN_COUNT); TypeOfCancer typeOfCancer = new TypeOfCancer(); String typeOfCancerId = fields[0].trim(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java index c886c8b8..fbaa030f 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java @@ -33,12 +33,20 @@ package org.mskcc.cbio.portal.scripts; // imports -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; -import java.io.*; -import java.util.*; +import org.mskcc.cbio.portal.dao.DaoUser; +import org.mskcc.cbio.portal.dao.DaoUserAuthorities; +import org.mskcc.cbio.portal.model.User; +import org.mskcc.cbio.portal.model.UserAuthorities; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; 
+import org.mskcc.cbio.portal.util.TsvUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.util.Arrays; +import java.util.List; /** * Import a file of users and their authorities. @@ -62,8 +70,6 @@ public static void main(String[] args) throws Exception { ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); - File file = new File(args[0]); FileReader reader = new FileReader(file); BufferedReader buf = new BufferedReader(reader); @@ -72,7 +78,7 @@ public static void main(String[] args) throws Exception { while (line != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if (!line.startsWith("#") && line.trim().length() > 0) { + if (TsvUtil.isDataLine(line)) { try { addUser(line); count++; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java b/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java index a5c8d642..d2214c93 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java @@ -32,14 +32,25 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.*; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.util.SpringUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; + /** * * Given expression and CNV data for a set of samples generate normalized expression values. 
@@ -110,7 +121,6 @@ public class NormalizeExpressionLevels{ public static void main (String[]args) { try { - SpringUtil.initDataSource(); // init dao gene daoGeneOptimized = DaoGeneOptimized.getInstance(); driver(args); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/RemoveCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/scripts/RemoveCancerStudy.java index ad515683..8ededd3a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/RemoveCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/RemoveCancerStudy.java @@ -32,9 +32,9 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.util.ProgressMonitor; /** * Command Line Tool to Remove a Single Cancer Study. @@ -51,8 +51,7 @@ public void run() { ""); } String cancerStudyIdentifier = args[0]; - - SpringUtil.initDataSource(); + ProgressMonitor.setCurrentMessage( "Checking if Cancer study with identifier " + cancerStudyIdentifier + diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ResetDatabase.java b/src/main/java/org/mskcc/cbio/portal/scripts/ResetDatabase.java index 82cad899..43c0b95d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ResetDatabase.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ResetDatabase.java @@ -32,8 +32,26 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoClinicalData; +import org.mskcc.cbio.portal.dao.DaoClinicalEvent; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegmentFile; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneset; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import 
org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; +import org.mskcc.cbio.portal.dao.DaoInfo; +import org.mskcc.cbio.portal.dao.DaoMutSig; +import org.mskcc.cbio.portal.dao.DaoMutation; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.DaoTypeOfCancer; +import org.mskcc.cbio.portal.dao.DaoUser; +import org.mskcc.cbio.portal.dao.DaoUserAuthorities; /** * Empty the database. @@ -91,7 +109,6 @@ public static void resetDatabase() throws DaoException { } public static void main(String[] args) throws DaoException { - SpringUtil.initDataSource(); StatDatabase.statDb(); ResetDatabase.resetDatabase(); System.err.println("Database Cleared and Reset."); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/TransactionalScriptRunner.java b/src/main/java/org/mskcc/cbio/portal/scripts/TransactionalScriptRunner.java index 7ee8d94a..b7124de0 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/TransactionalScriptRunner.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/TransactionalScriptRunner.java @@ -1,14 +1,13 @@ package org.mskcc.cbio.portal.scripts; -import java.io.File; - -import org.mskcc.cbio.portal.util.SpringUtil; import org.mskcc.cbio.portal.util.TransactionalScripts; import org.springframework.context.support.FileSystemXmlApplicationContext; import org.springframework.transaction.TransactionStatus; import org.springframework.transaction.support.TransactionCallback; import org.springframework.transaction.support.TransactionTemplate; +import java.io.File; + /** * A high-level script runner than can be used to run a batch of scripts within a * transactional context. It's handy loading a batch of data of different types. @@ -69,8 +68,7 @@ public void run () { // Inject the context into SpringUtil, so we don't need to initialize again. 
// This ensures that the XML files from the command line provide a complete // context and we don't get data sources later from anywhere else. - SpringUtil.initDataSource(context); - + // Set up the transaction template transactionTemplate = (TransactionTemplate) context.getBean("scriptTransactionTemplate"); if (transactionTemplate == null) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCancerStudy.java index af8a5ba9..00bcbf69 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCancerStudy.java @@ -23,10 +23,10 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.util.ProgressMonitor; /** @@ -58,7 +58,6 @@ public void run() { "Invalid study status parameter: " + cancerStudyStatus); } - SpringUtil.initDataSource(); CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyIdentifier); if (theCancerStudy == null) { throw new IllegalArgumentException("cancer study identified by cancer_study_identifier '" diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java new file mode 100644 index 00000000..865cc660 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -0,0 +1,226 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.scripts; + +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.SampleList; +import org.mskcc.cbio.portal.util.CaseList; +import org.mskcc.cbio.portal.util.CaseListReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.validate.CaseListValidator; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +public class UpdateCaseListsSampleIds extends ConsoleRunnable { + + private File metaFile; + private File dataFile; + private List caseListFiles = List.of(); + private String cancerStudyStableId; + private final Map> caseListSampleIdToSampleIds = new LinkedHashMap<>(); + private final DaoSampleList daoSampleList = new DaoSampleList(); + private LinkedHashSet allSampleIds; + + public UpdateCaseListsSampleIds(String[] args) { + super(args); + } + + /** + * Updates case list sample ids from clinical sample and case list files + */ + public void run() { + parseArguments(); + 
readStudyIdAndDataFileFromMetaFile(); + this.allSampleIds = readSampleIdsFromDataFile(this.dataFile); + this.caseListSampleIdToSampleIds.put(cancerStudyStableId + "_all", this.allSampleIds); + Map> readCaseListSampleIds = readCaseListFiles(); + this.caseListSampleIdToSampleIds.putAll(readCaseListSampleIds); + updateCaseListsForTheStudy(this.caseListSampleIdToSampleIds); + } + + private Map> readCaseListFiles() { + LinkedHashMap> result = new LinkedHashMap<>(); + for (File caseListFile : this.caseListFiles) { + CaseList caseList = CaseListReader.readFile(caseListFile); + CaseListValidator.validateIdFields(caseList); + String cancerStudyIdentifier = caseList.getCancerStudyIdentifier(); + if (!cancerStudyIdentifier.equals(this.cancerStudyStableId)) { + ProgressMonitor.logWarning( + String.format( + "Skipping %s case list file as it belongs to %s study and we uploading %s study.", + caseListFile, cancerStudyIdentifier, this.cancerStudyStableId)); + continue; + } + LinkedHashSet extraSampleIds = new LinkedHashSet<>(caseList.getSampleIds()); + extraSampleIds.removeAll(this.allSampleIds); + if (!extraSampleIds.isEmpty()) { + throw new RuntimeException(caseListFile.getAbsolutePath() + ": The following sample ids present in the case list file, but not specified in the clinical sample file: " + String.join(", ", extraSampleIds)); + } + result.put(caseList.getStableId(), new LinkedHashSet<>(caseList.getSampleIds())); + } + return result; + } + + /** + * Updates the sample lists according to the steps below: + * + * 1. New sample IDs provided in the `caseListSampleIdToSampleIds` map are added to their corresponding case lists. + * 2. These sample IDs are removed from any other case lists within the same study. + * + * @param caseListSampleIdToSampleIds A map where the key is the case list stable ID and the value is a set of sample IDs + * to be added to the corresponding case list. + * Note: This map only includes the case lists that need to be updated with new sample IDs. 
+ * Existing case lists in the study that are not in the map will not be dropped, + * but the provided sample IDs will be removed from these lists if present. + * @throws RuntimeException if any DAO operations fail or if a case list with a specified stable ID is not found. + */ + private void updateCaseListsForTheStudy(Map> caseListSampleIdToSampleIds) { + DaoCancerStudy.reCacheAll(); + try { + for (Map.Entry> caseListStableIdToSampleIds : caseListSampleIdToSampleIds.entrySet()) { + String caseListStableId = caseListStableIdToSampleIds.getKey(); + Set uploadedSampleIds = caseListStableIdToSampleIds.getValue(); + SampleList sampleList = daoSampleList.getSampleListByStableId(caseListStableId); + if (sampleList == null) { + throw new RuntimeException("No case list with " + caseListStableId + " stable id is found"); + } + LinkedHashSet newCaseListSampleIds = new LinkedHashSet<>(sampleList.getSampleList()); + if (newCaseListSampleIds.addAll(uploadedSampleIds)) { + sampleList.setSampleList(new ArrayList<>(newCaseListSampleIds)); + daoSampleList.updateSampleListList(sampleList); + } + } + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(this.cancerStudyStableId); + List sampleLists = daoSampleList.getAllSampleLists(cancerStudy.getInternalId()); + List remainingLists = sampleLists.stream().filter(sl -> + !caseListSampleIdToSampleIds.containsKey(sl.getStableId()) && sl.getSampleList().stream().anyMatch(this.allSampleIds::contains) + ).toList(); + for (SampleList remainingList : remainingLists) { + ArrayList newSampleList = new ArrayList<>(remainingList.getSampleList()); + if (newSampleList.removeAll(this.allSampleIds)) { + remainingList.setSampleList(newSampleList); + daoSampleList.updateSampleListList(remainingList); + } + } + } catch (DaoException e) { + throw new RuntimeException(e); + } + } + + private LinkedHashSet readSampleIdsFromDataFile(File dataFile) { + LinkedHashSet allSampleIds = new LinkedHashSet<>(); + try (FileReader reader = new 
FileReader(dataFile); + BufferedReader buff = new BufferedReader(reader)) { + String line; + int sampleIdPosition = -1; + while ((line = buff.readLine()) != null) { + String trimmedLine = line.trim(); + if (trimmedLine.isEmpty() || trimmedLine.startsWith("#")) { + continue; + } + + String[] fieldValues = line.split("\t"); + if (sampleIdPosition == -1) { + sampleIdPosition = List.of(fieldValues).indexOf("SAMPLE_ID"); + if (sampleIdPosition == -1) { + throw new RuntimeException("No SAMPLE_ID header is found"); + } + } else { + allSampleIds.add(fieldValues[sampleIdPosition].trim()); + } + } + return allSampleIds; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void readStudyIdAndDataFileFromMetaFile() { + TrimmedProperties properties = new TrimmedProperties(); + try { + FileInputStream inStream = new FileInputStream(this.metaFile); + properties.load(inStream); + this.cancerStudyStableId = properties.getProperty("cancer_study_identifier"); + String dataFilename = properties.getProperty("data_filename"); + this.dataFile = new File(metaFile.getParent(), dataFilename); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private void parseArguments() { + String progName = getClass().getName(); + String description = "Updates (adds/removes) sample ids in specified case lists."; + + OptionParser parser = new OptionParser(); + OptionSpec metaOpt = parser.accepts("meta", + "clinical sample (genetic_alteration_type=CLINICAL and datatype=SAMPLE_ATTRIBUTES or datatype=MIXED_ATTRIBUTES) meta data file. 
All sample ids found in the file will be added to the _all case list.").withRequiredArg().required().describedAs("meta_clinical_sample.txt").ofType(String.class); + OptionSpec caseListDirOrFileOpt = parser.accepts("case-lists", + "case list file or a directory with case list files").withRequiredArg().describedAs("case_lists/").ofType(String.class); + + try { + OptionSet options = parser.parse(args); + this.metaFile = new File(options.valueOf(metaOpt)); + if (options.has(caseListDirOrFileOpt)) { + File caseListDirOrFile = new File(options.valueOf(caseListDirOrFileOpt)); + if (caseListDirOrFile.isDirectory()) { + this.caseListFiles = Arrays.stream(Objects.requireNonNull(caseListDirOrFile.listFiles())) + .filter(file -> !file.getName().startsWith(".") && !file.getName().endsWith("~")).collect(Collectors.toList()); + } else if (caseListDirOrFile.isFile()) { + this.caseListFiles = List.of(caseListDirOrFile); + } else { + throw new RuntimeException("No file " + caseListDirOrFile.getAbsolutePath() + " exists"); + } + } + } catch (OptionException e) { + throw new UsageException( + progName, description, parser, + e.getMessage()); + } + } + + /** + * Runs the command as a script and exits with an appropriate exit code. 
+ * + * @param args the arguments given on the command line + */ + public static void main(String[] args) { + ConsoleRunnable runner = new UpdateCaseListsSampleIds(args); + runner.runInConsole(); + } +} diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateMetaData.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateMetaData.java index d9669ed5..8235ed3a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateMetaData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateMetaData.java @@ -32,9 +32,11 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.GeneticProfileReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; import java.io.File; @@ -51,7 +53,6 @@ public static void main(String[] args) throws Exception { } ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); File descriptorFile = new File(args[0]); GeneticProfile geneticProfile = GeneticProfileReader.loadGeneticProfileFromMeta(descriptorFile); diff --git a/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java new file mode 100644 index 00000000..3235d33e --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java @@ -0,0 +1,21 @@ +package org.mskcc.cbio.portal.util; + +import java.util.HashMap; +import java.util.Map; + +public class ArrayUtil { + public static Map zip(K[] keys, V[] values) { + Map map = new HashMap<>(); + + // Check if both arrays have the same length + if (keys.length == values.length) { + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + } else { + throw new IllegalArgumentException("Arrays must be of the same length"); + } + return map; + + } +} \ No newline at end of file diff --git 
a/src/main/java/org/mskcc/cbio/portal/util/CaseList.java b/src/main/java/org/mskcc/cbio/portal/util/CaseList.java new file mode 100644 index 00000000..5e01c984 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/CaseList.java @@ -0,0 +1,48 @@ +package org.mskcc.cbio.portal.util; + +import java.util.List; + +public class CaseList { + + private final String stableId; + private final String cancerStudyIdentifier; + private final String name; + private final String description; + + private final String category; + private final List sampleIds; + + CaseList(String stableId, String cancerStudyIdentifier, String name, String description, String category, List sampleIds) { + this.stableId = stableId; + this.cancerStudyIdentifier = cancerStudyIdentifier; + this.name = name; + this.description = description; + this.category = category; + this.sampleIds = sampleIds; + } + + public String getStableId() { + return stableId; + } + + public String getCancerStudyIdentifier() { + return cancerStudyIdentifier; + } + + public String getName() { + return name; + } + + public String getCategory() { + return category; + } + + public String getDescription() { + return description; + } + + public List getSampleIds() { + return sampleIds; + } + +} diff --git a/src/main/java/org/mskcc/cbio/portal/util/CaseListReader.java b/src/main/java/org/mskcc/cbio/portal/util/CaseListReader.java new file mode 100644 index 00000000..02c15763 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/CaseListReader.java @@ -0,0 +1,43 @@ +package org.mskcc.cbio.portal.util; + +import org.mskcc.cbio.portal.scripts.TrimmedProperties; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; + +public class CaseListReader { + + public static CaseList readFile(File caseListFile) { + Properties properties = new TrimmedProperties(); + try { + properties.load(new 
FileReader(caseListFile)); + } catch (IOException e) { + throw new RuntimeException(e); + } + + String stableId = properties.getProperty("stable_id"); + String cancerStudyIdentifier = properties.getProperty("cancer_study_identifier"); + String caseListName = properties.getProperty("case_list_name"); + String caseListDescription = properties.getProperty("case_list_description"); + String caseListCategory = properties.getProperty("case_list_category"); + String caseListIds = properties.getProperty("case_list_ids"); + List sampleIds = caseListIds == null ? List.of() + : Arrays.stream(caseListIds.split("\t")).toList(); + + return new CaseList( + stableId, + cancerStudyIdentifier, + caseListName, + caseListDescription, + caseListCategory, + sampleIds + ); + } + + +} diff --git a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java index 3cc6fd71..b5aa293e 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java @@ -53,7 +53,6 @@ public static void storeCnaEvents( if (!CNA.AMP.equals(cnaEvent.getAlteration()) && !CNA.HOMDEL.equals(cnaEvent.getAlteration())) { continue; } - // Revert PR https://github.com/cBioPortal/cbioportal-core/pull/1 breaks importer Optional existingCnaEvent = existingCnaEvents .stream() diff --git a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java index b5b36227..0d2b6a23 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java @@ -138,8 +138,10 @@ public static OptionSet parseStandardDataAndMetaOptions(String[] args, String de parser.accepts( "loadMode", "direct (per record) or bulk load of data" ) .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); } + parser.accepts("overwrite-existing", + "Enables overwriting data if it turns out it 
already exists in DB.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); String progName = "importScript"; - + OptionSet options = null; try { options = parser.parse( args ); @@ -176,10 +178,6 @@ public static OptionSet parseStandardDataAndMetaOptions(String[] args, String de "Error: unknown loadMode action: " + actionArg); } } - else { - throw new UsageException(progName, description, parser, - "Error: 'loadMode' argument required."); - } } return options; } @@ -251,6 +249,9 @@ public static OptionSet parseStandardDataAndMetaUpdateOptions(String[] args, Str parser.accepts( "loadMode", "direct (per record) or bulk load of data" ) .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); } + parser.accepts("overwrite-existing", + "Enables overwriting data if it turns out it already exists in DB.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); + String progName = "importScript"; OptionSet options = null; diff --git a/src/main/java/org/mskcc/cbio/portal/util/DataValidator.java b/src/main/java/org/mskcc/cbio/portal/util/DataValidator.java new file mode 100644 index 00000000..1878f063 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/DataValidator.java @@ -0,0 +1,7 @@ +package org.mskcc.cbio.portal.util; + +public class DataValidator { + public static boolean isValidNumericSequence(String str) { + return str.matches("[0-9]+"); + } +} diff --git a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java index 744ca565..4f0958ee 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java @@ -43,30 +43,6 @@ * @author Ethan Cerami. */ public class FileUtil { - /** - * BioPAX File Type. - */ - public static final int BIOPAX = 0; - - /** - * PSI_MI File Type. - */ - public static final int PSI_MI = 1; - - /** - * External DBs File Type. 
- */ - public static final int EXTERNAL_DBS = 2; - - /** - * Identifiers File Type. - */ - public static final int IDENTIFIERS = 3; - - /** - * Unknown File Type. - */ - public static final int UNKNOWN = 4; /** * Gets Number of Lines in Specified File. @@ -77,32 +53,16 @@ public class FileUtil { */ public static int getNumLines(File file) throws IOException { int numLines = 0; - FileReader reader = new FileReader(file); - BufferedReader buffered = new BufferedReader(reader); - String line = buffered.readLine(); - while (line != null) { - if (!line.startsWith("#") && line.trim().length() > 0) { - numLines++; + try (FileReader reader = new FileReader(file); BufferedReader buffered = new BufferedReader(reader)) { + String line = buffered.readLine(); + while (line != null) { + if (TsvUtil.isDataLine(line)) { + numLines++; + } + line = buffered.readLine(); } - line = buffered.readLine(); + return numLines; } - reader.close(); - return numLines; } - /** - * Gets Next Line of Input. Filters out Empty Lines and Comments. - * - * @param buf BufferedReader Object. - * @return next line of input. - * @throws IOException Error reading input stream. 
- */ - public static String getNextLine(BufferedReader buf) throws IOException { - String line = buf.readLine(); - while (line != null && (line.trim().length() == 0 - || line.trim().startsWith("#"))) { - line = buf.readLine(); - } - return line; - } } \ No newline at end of file diff --git a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java index af686a72..7d3bb6cc 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java +++ b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java @@ -48,6 +48,9 @@ import org.mskcc.cbio.portal.model.GeneticProfileLink; import org.mskcc.cbio.portal.scripts.TrimmedProperties; +import static org.cbioportal.model.MolecularProfile.DataType.DISCRETE; +import static org.cbioportal.model.MolecularProfile.ImportType.DISCRETE_LONG; + /** * Prepare a GeneticProfile for having its data loaded. * @@ -76,22 +79,33 @@ public static GeneticProfile loadGeneticProfile(File file) throws IOException, D GeneticProfile geneticProfile = loadGeneticProfileFromMeta(file); GeneticProfile existingGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId(geneticProfile.getStableId()); if (existingGeneticProfile != null) { - if (!existingGeneticProfile.getDatatype().equals("MAF")) { - // the dbms already contains a GeneticProfile with the file's stable_id. This scenario is not supported - // anymore, so throw error telling user to remove existing profile first: - throw new RuntimeException("Error: genetic_profile record found with same Stable ID as the one used in your data: " - + existingGeneticProfile.getStableId() + ". Remove the existing genetic_profile record first."); + ProgressMonitor.setCurrentMessage("genetic_profile record found with same Stable ID (" + geneticProfile.getStableId() + + "). 
Using it instead."); + if (geneticProfile.getGeneticAlterationType() != existingGeneticProfile.getGeneticAlterationType()) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different genetic alteration type: " + + existingGeneticProfile.getGeneticProfileId()); + } + if (DISCRETE_LONG.name().equals(geneticProfile.getDatatype())) { + if (!Set.of(DISCRETE_LONG.name(), DISCRETE.name()).contains(existingGeneticProfile.getDatatype())) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but unsupported data type: " + + existingGeneticProfile.getDatatype()); + } } else { - // For mutation data only we can have multiple files with the same genetic_profile. - // There is a constraint in the mutation database table to prevent duplicated data - // If this constraint is hit (mistakenly importing the same maf twice) MySqlBulkLoader will throw an exception - // - // make an object combining the pre-existing profile with the file-specific properties of the current file - GeneticProfile gp = new GeneticProfile(existingGeneticProfile); - gp.setTargetLine(gp.getTargetLine()); - gp.setOtherMetadataFields(gp.getAllOtherMetadataFields()); - return gp; + if (!existingGeneticProfile.getDatatype().equals(geneticProfile.getDatatype())) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different data type: " + + existingGeneticProfile.getDatatype()); + } + } + if (geneticProfile.getCancerStudyId() != existingGeneticProfile.getCancerStudyId()) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different cancer study (id=" + + existingGeneticProfile.getCancerStudyId() + ")"); } + 
existingGeneticProfile.setOtherMetadataFields(geneticProfile.getAllOtherMetadataFields()); + return existingGeneticProfile; } // For GSVA profiles, we want to create a geneticProfileLink from source_stable_id for: diff --git a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java index 16ab5098..748ffd54 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java @@ -87,6 +87,9 @@ public static boolean outlierExpressionSelected(HashSet geneticProfileId public static int getGenePanelId(String panelId) { GenePanel genePanel = DaoGenePanel.getGenePanelByStableId(panelId); + if (genePanel == null) { + throw new NoSuchElementException("Gene panel with id " + panelId + " not found."); + } return genePanel.getInternalId(); } diff --git a/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java b/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java index e1c035e1..a5244050 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java @@ -33,23 +33,12 @@ import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; import java.io.InputStreamReader; -import java.net.URL; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.logging.Level; -import java.util.logging.Logger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import javax.net.ssl.HttpsURLConnection; -import org.apache.commons.text.StringEscapeUtils; + import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.model.CanonicalGene; @@ -107,7 +96,7 @@ private static void 
setMyCancerGenomeLinkFromLocal() { while ((line=in.readLine())!=null && line.startsWith("#")) {} for (; line!=null; line=in.readLine()) { - String[] parts = line.trim().split("\t",-1); + String[] parts = TsvUtil.splitTsvLine(line); if (parts.length<4) { continue; } diff --git a/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java new file mode 100644 index 00000000..0c2e61a2 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java @@ -0,0 +1,43 @@ +package org.mskcc.cbio.portal.util; + +/** + * Utils to parse and validate TSV lines + * @author Ruslan Forostianov + */ +public class TsvUtil { + /** + * Detects if the line has some data + * e.g. blank line and comments are not considered as data rows + * @param line the line to evaluate + * @return true if the line contains data, false otherwise + */ + public static boolean isDataLine(String line) { + return !line.startsWith("#") && line.trim().length() > 0; + } + + /** + * Splits tsv line and does not trim empty values at the end. 
+ * @param line + * @return + */ + public static String[] splitTsvLine(String line) { + return line.split("\t", -1); + } + + /** + * Makes sure header and row length match + * @param headerParts + * @param rowParts + */ + public static void ensureHeaderAndRowMatch(String[] headerParts, String[] rowParts) { + int headerColumns = headerParts.length; + if (rowParts.length > headerColumns) { + throw new IllegalArgumentException("Found line with more fields (" + rowParts.length + + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); + } + if (rowParts.length < headerColumns) { + throw new IllegalArgumentException("Found line with less fields (" + rowParts.length + + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); + } + } +} diff --git a/src/main/java/org/mskcc/cbio/portal/validate/CaseListValidator.java b/src/main/java/org/mskcc/cbio/portal/validate/CaseListValidator.java new file mode 100644 index 00000000..d6a2494e --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/validate/CaseListValidator.java @@ -0,0 +1,48 @@ +package org.mskcc.cbio.portal.validate; + +import org.mskcc.cbio.portal.util.CaseList; + +public class CaseListValidator { + + /** + * Fields that are used during case list update + * @param caseList + */ + public static void validateIdFields(CaseList caseList) { + if (caseList.getStableId() == null) { + throw new IllegalArgumentException("stable id is not specified."); + } + if (caseList.getStableId().matches(".*\\s.*")) { + throw new IllegalArgumentException(String.format("stable id cannot contain white space(s): '%s'", caseList.getStableId())); + } + if (caseList.getCancerStudyIdentifier() == null) { + throw new IllegalArgumentException("cancer study identifier is not specified."); + } + if (caseList.getCancerStudyIdentifier().matches(".*\\s.*")) { + throw new IllegalArgumentException(String.format("cancer study identifier cannot contain white space(s): '%s'", caseList.getStableId())); + } + if 
(caseList.getSampleIds() == null || caseList.getSampleIds().isEmpty()) { + throw new IllegalArgumentException("sample ids are not specified."); + } + } + + /** + * Fields that are used during case list creation + * @param caseList + */ + public static void validateDescriptionFields(CaseList caseList) { + if (caseList.getName() == null) { + throw new IllegalArgumentException("case list name is not specified."); + } + if (caseList.getDescription() == null) { + throw new IllegalArgumentException("case list description is not specified."); + } + } + + public static void validateAll(CaseList caseList) { + validateIdFields(caseList); + validateDescriptionFields(caseList); + } + + +} diff --git a/src/test/java/org/cbioportal/model/util/TsvUtilTest.java b/src/test/java/org/cbioportal/model/util/TsvUtilTest.java new file mode 100644 index 00000000..c49de40b --- /dev/null +++ b/src/test/java/org/cbioportal/model/util/TsvUtilTest.java @@ -0,0 +1,29 @@ +package org.cbioportal.model.util; + +import org.junit.Test; + +import static org.junit.Assert.assertTrue; +import static org.mskcc.cbio.portal.util.TsvUtil.ensureHeaderAndRowMatch; +import static org.junit.Assert.assertThrows; + +public class TsvUtilTest { + + @Test + public void testEnsureHeaderAndRowMatch_headerHasGreaterLength() { + IllegalArgumentException illegalArgumentException = assertThrows(IllegalArgumentException.class, + () -> ensureHeaderAndRowMatch(new String[] {"header1", "header2"}, new String[] {"row1"})); + assertTrue(illegalArgumentException.getMessage().contains("Found line with less fields")); + } + + @Test + public void testEnsureHeaderAndRowMatch_headerHasSmallerLength() { + IllegalArgumentException illegalArgumentException = assertThrows(IllegalArgumentException.class, + () -> ensureHeaderAndRowMatch(new String[] {"header1"}, new String[] {"row1", "row2"})); + assertTrue(illegalArgumentException.getMessage().contains("Found line with more fields")); + } + + @Test + public void 
testEnsureHeaderAndRowMatch_headerHasSameLength() { + ensureHeaderAndRowMatch(new String[] {"header1", "header2"}, new String[] {"row1", "row2"}); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java index 8c1afdcc..83e04144 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java @@ -72,7 +72,7 @@ public void setUp() throws DaoException public void testDaoGetAllGeneticProfiles() throws DaoException { ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(7, list.size()); + assertEquals(9, list.size()); } @Test @@ -134,12 +134,12 @@ public void testDaoDeleteGeneticProfile() throws DaoException { GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(2); - assertEquals(7, DaoGeneticProfile.getCount()); + assertEquals(9, DaoGeneticProfile.getCount()); DaoGeneticProfile.deleteGeneticProfile(geneticProfile); - assertEquals(6, DaoGeneticProfile.getCount()); + assertEquals(8, DaoGeneticProfile.getCount()); ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(6, list.size()); + assertEquals(8, list.size()); geneticProfile = list.get(0); assertEquals(studyId, geneticProfile.getCancerStudyId()); assertEquals("mRNA expression (microarray)", geneticProfile.getProfileName()); @@ -155,7 +155,7 @@ public void testDaoUpdateGeneticProfile() throws DaoException { geneticProfile.getGeneticProfileId(), "Updated Name", "Updated Description")); ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(7, list.size()); + assertEquals(9, list.size()); geneticProfile = list.get(0); assertEquals(studyId, geneticProfile.getCancerStudyId()); assertEquals("Updated Name", geneticProfile.getProfileName()); diff --git 
a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java index 705f46b6..d4c80a8e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java @@ -55,6 +55,7 @@ import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -105,8 +106,8 @@ public void testDaoSampleProfile() throws DaoException { Patient patient = DaoPatient.getPatientByCancerStudyAndPatientId(study.getInternalId(), "TCGA-12345"); Sample sample = DaoSample.getSampleByPatientAndSampleId(patient.getInternalId(), "TCGA-12345-01"); - int num = DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, null); - assertEquals(1, num); + DaoSampleProfile.upsertSampleToProfileMapping(List.of( + new DaoSampleProfile.SampleProfileTuple(geneticProfileId, sample.getInternalId(), null))); boolean exists = DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId); assertTrue(exists); @@ -114,8 +115,8 @@ public void testDaoSampleProfile() throws DaoException { assertEquals(geneticProfileId, DaoSampleProfile.getProfileIdForSample(sample.getInternalId())); sample = DaoSample.getSampleByPatientAndSampleId(patient.getInternalId(), "TCGA-123456-01"); - num = DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanel.getInternalId()); - assertEquals(1, num); + DaoSampleProfile.upsertSampleToProfileMapping(List.of( + new DaoSampleProfile.SampleProfileTuple(geneticProfileId, sample.getInternalId(), genePanel.getInternalId()))); boolean existsByPanelId = DaoSampleProfile.sampleProfileMappingExistsByPanel(genePanel.getInternalId()); assertTrue(existsByPanelId); diff --git 
a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java new file mode 100644 index 00000000..48ca7b4e --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java @@ -0,0 +1,53 @@ +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; + +import java.util.HashMap; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class GeneticAlterationsTestHelper { + public static Set geneStableIdsToEntityIds(Set beforeStableIds) { + return beforeStableIds.stream().map(stableId -> { + try { + return geneStableIdToEntityId(stableId); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toSet()); + } + + public static int geneStableIdToEntityId(String stableId) throws DaoException { + return DaoGeneticEntity.getGeneticEntityByStableId(stableId).getId(); + } + + public static void assertPriorDataState(HashMap> beforeResult, Set expectedEntityIds, Set expectedSampleIds) { + assertEquals(expectedEntityIds, beforeResult.keySet()); + beforeResult.forEach((entityId, sampleIdToValue) -> { + assertEquals("Samples for gene with entityId = " + entityId + " have to match expected ones", + expectedSampleIds, beforeResult.get(entityId).keySet()); + }); + } + + public static void assertNoChange(HashMap> beforeResult, + HashMap> afterResult, + Set entityIds, + Set sampleIds) { + entityIds.forEach(entityId -> { + assertTrue("After result is expected to contain entityId=" + entityId, + afterResult.containsKey(entityId)); + sampleIds.forEach(sampleId -> { + assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entityId=" + entityId, 
+ afterResult.get(entityId).containsKey(sampleId)); + assertEquals("The values for sample_id=" + sampleId + + " and entityId=" + entityId + " before and after upload have to match.", + beforeResult.get(entityId).get(sampleId), afterResult.get(entityId).get(sampleId)); + }); + }); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java new file mode 100644 index 00000000..0e1d8a68 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java @@ -0,0 +1,244 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.cbioportal.model.CNA; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGenePanel; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GenePanel; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.TestContextManager; +import org.springframework.transaction.PlatformTransactionManager; +import org.springframework.transaction.TransactionStatus; +import org.springframework.transaction.support.DefaultTransactionDefinition; + +import java.io.File; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of PROTEIN_LEVEL Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(Parameterized.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +public class TestIncrementalCopyNumberAlterationImport { + + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ATM + final long absentGeneEntrezId = 472l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + private final String metaFile; + private final String dataFile; + + { beforeEntrezIds.add(absentGeneEntrezId); } + + // stable_id: TCGA-XX-0800 + final int newSampleId = 15; + // stable_id: TCGA-A1-A0SO + final int updateSampleId = 12; + final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + { beforeSampleIds.add(updateSampleId); } + + final Set afterSampleIds = new HashSet<>(beforeSampleIds); + { afterSampleIds.add(newSampleId); } + + @Parameterized.Parameters(name = "{0}") + public static Collection primeNumbers() { + return Arrays.asList(new Object[][] { + { "meta_cna_discrete.txt", "data_cna_discrete.txt" }, + { "meta_cna_discrete_long.txt", "data_cna_discrete_long.txt" }, + }); + } + + public TestIncrementalCopyNumberAlterationImport(String metaFile, String dataFile) { + this.metaFile = metaFile; + this.dataFile = dataFile; + } + + /** + * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE + */ + @Test + public void testDiscreteCNA() throws DaoException { + GeneticProfile discreteCNAProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); + assertNotNull(discreteCNAProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, 
beforeEntrezIds, beforeSampleIds); + + Map beforeSampleIdToPanelId = new HashMap<>(); + for (int sampleId : noChangeSampleIds) { + try { + beforeSampleIdToPanelId.put(sampleId, + DaoSampleProfile.getPanelId(sampleId, discreteCNAProfile.getGeneticProfileId())); + } catch (DaoException e) { + throw new RuntimeException(e); + } + } + + List allCnaLevels = Arrays.stream(CNA.values()).map(CNA::getCode).toList(); + Set beforeCnaEventsSampleIds = Set.of(4, 13, 14, updateSampleId); + List beforeSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + null, + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> beforeSampleIdToSampleCnaEvents = beforeSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals(beforeCnaEventsSampleIds, beforeSampleIdToSampleCnaEvents.keySet()); + + File dataFolder = new File("src/test/resources/incremental/copy_number_alteration/"); + File metaFile = new File(dataFolder, this.metaFile); + File dataFile = new File(dataFolder, this.dataFile); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); + assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", 
afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + + List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + afterResult.keySet(), + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> afterSampleIdToSampleCnaEvents = afterSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals("There is only one new sample that has to gain cna events", beforeCnaEventsSampleIds.size() + 1, afterSampleIdToSampleCnaEvents.size()); + beforeCnaEventsSampleIds.forEach(sampleId -> { + if (sampleId == updateSampleId) { + return; + } + Set beforeCnaEvents = beforeSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + Set afterCnaEvents = afterSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + assertEquals("CNA events for sample_id=" + sampleId + " must not change.", beforeCnaEvents, afterCnaEvents); + }); + Map newSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(newSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 208l, CNA.HOMDEL, + 3265l, CNA.AMP, + 4893l, CNA.HOMDEL, + 672l, CNA.AMP, + 673l, CNA.AMP, + 675l, CNA.HOMDEL, + newGeneEntrezId, CNA.HOMDEL + ), + newSampleEntrezGeneIdToCnaAlteration); + Map updatedSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(updateSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 10000l, CNA.HOMDEL, + 207l, CNA.AMP, + 3845l, CNA.AMP, + 673l, CNA.HOMDEL, + newGeneEntrezId, CNA.AMP + ), + updatedSampleEntrezGeneIdToCnaAlteration); + + Map afterSampleIdToPanelId = new HashMap<>(); + for (int 
sampleId : noChangeSampleIds) { + try { + afterSampleIdToPanelId.put(sampleId, + DaoSampleProfile.getPanelId(sampleId, discreteCNAProfile.getGeneticProfileId())); + } catch (DaoException e) { + throw new RuntimeException(e); + } + } + assertEquals(beforeSampleIdToPanelId, afterSampleIdToPanelId); + + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLCNADS"); + for (int sampleId : Set.of(updateSampleId, newSampleId)) { + assertEquals("Sample profile has to point to TSTGNPNLCNADS panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(sampleId, discreteCNAProfile.getGeneticProfileId())); + } + } + + private TestContextManager testContextManager; + + private PlatformTransactionManager transactionManager; + + private TransactionStatus transactionStatus; + @Before + public void before() throws Exception { + this.testContextManager = new TestContextManager(getClass()); + this.testContextManager.prepareTestInstance(this); + this.transactionManager = this.testContextManager.getTestContext().getApplicationContext().getBean(PlatformTransactionManager.class); + this.transactionStatus = transactionManager.getTransaction(new DefaultTransactionDefinition()); + DaoCancerStudy.reCacheAll(); + } + + @After + public void after() { + this.transactionManager.rollback(transactionStatus); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberSegmentDataImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberSegmentDataImport.java new file mode 100644 index 00000000..db2ee519 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberSegmentDataImport.java @@ -0,0 +1,121 @@ +/* + * This file is part of cBioPortal. 
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoClinicalData; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegment; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegmentFile; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ClinicalData; +import org.mskcc.cbio.portal.model.CopyNumberSegment; +import org.mskcc.cbio.portal.model.CopyNumberSegmentFile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.List; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +/** + * Tests Incremental Import of CNA segmented data.
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalCopyNumberSegmentDataImport { + + /** + * Test incremental upload of CNA SEG data + */ + @Test + public void testIncrementalUpload() throws DaoException { + String segSampleId = "TCGA-A1-A0SE-01"; + Sample segDataSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), segSampleId); + + CopyNumberSegmentFile copyNumberSegmentFile = new CopyNumberSegmentFile(); + copyNumberSegmentFile.cancerStudyId = cancerStudy.getInternalId(); + copyNumberSegmentFile.referenceGenomeId = CopyNumberSegmentFile.ReferenceGenomeId.hg19; + copyNumberSegmentFile.segFileId = 1; + copyNumberSegmentFile.filename = "test_file.seg"; + copyNumberSegmentFile.description = "test seg file description"; + DaoCopyNumberSegmentFile.addCopyNumberSegmentFile(copyNumberSegmentFile); + DaoClinicalData.addSampleDatum(segDataSample.getInternalId(), "FRACTION_GENOME_ALTERED", "TEST"); + MySQLbulkLoader.bulkLoadOn(); + CopyNumberSegment copyNumberSegment = new CopyNumberSegment( + cancerStudy.getInternalId(), + segDataSample.getInternalId(), + "1", + 3218610, + 95674710, + 100, + 0.01); + copyNumberSegment.setSegId(1L); + DaoCopyNumberSegment.addCopyNumberSegment(copyNumberSegment); + MySQLbulkLoader.flushAll(); + + File dataFolder = new File("src/test/resources/incremental/copy_number_alteration/"); + File metaFile = new File(dataFolder, "meta_cna_seg.txt"); + File dataFile = new File(dataFolder, "data_cna.seg"); + + ImportCopyNumberSegmentData importCnaSegData = new ImportCopyNumberSegmentData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importCnaSegData.run(); + + CopyNumberSegmentFile fetchedCopyNumberSegmentFile = 
DaoCopyNumberSegmentFile.getCopyNumberSegmentFile(cancerStudy.getInternalId()); + assertNotNull(fetchedCopyNumberSegmentFile); + assertEquals("test_file.seg", fetchedCopyNumberSegmentFile.filename); + List cnaSegments = DaoCopyNumberSegment + .getSegmentForASample(segDataSample.getInternalId(), cancerStudy.getInternalId()); + assertEquals(9, cnaSegments.size()); + List clinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), Set.of(segSampleId)); + ClinicalData fractionGenomeAltered = clinicalData.stream() + .filter(cd -> "FRACTION_GENOME_ALTERED".equals(cd.getAttrId())).findFirst().get(); + assertEquals("0.0000", fractionGenomeAltered.getAttrVal()); + } + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenePanelMatrixImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenePanelMatrixImport.java new file mode 100644 index 00000000..e1df66a6 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenePanelMatrixImport.java @@ -0,0 +1,93 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGenePanel; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ExtendedMutation; +import org.mskcc.cbio.portal.model.GenePanel; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.ArrayList; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.mskcc.cbio.portal.dao.DaoMutation.getMutations; + +/** + * Tests Incremental Import of Gene Panel Matrix Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalGenePanelMatrixImport { + + /** + * Test incremental upload + */ + @Test + public void testIncrementalUpload() throws DaoException { + File dataFolder = new File("src/test/resources/incremental/gene_panel_matrix/"); + File metaFile = new File(dataFolder, "meta_gene_panel_matrix.txt"); + File dataFile = new File(dataFolder, "data_gene_panel_matrix.txt"); + + ImportGenePanelProfileMap importGenePanelProfileMap = new ImportGenePanelProfileMap(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importGenePanelProfileMap.run(); + + GenePanel mutationGenePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLMUTEXT"); + GeneticProfile mutationsProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mutations"); + GenePanel longGenePanel = DaoGenePanel.getGenePanelByStableId("TESTPANEL_CNA_DISCRETE_LONG_FORMAT"); + GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); + GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub"); + Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), "TCGA-A1-A0SB-01"); + assertEquals(mutationGenePanel.getInternalId(), + DaoSampleProfile.getPanelId(sample.getInternalId(), mutationsProfile.getGeneticProfileId())); + assertEquals(longGenePanel.getInternalId(), + DaoSampleProfile.getPanelId(sample.getInternalId(), geneticProfile.getGeneticProfileId())); + assertNull(DaoSampleProfile.getPanelId(sample.getInternalId(), ic50Profile.getGeneticProfileId())); + } + +} diff --git 
a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java new file mode 100644 index 00000000..da681e5b --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java @@ -0,0 +1,174 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGenePanel; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.model.GenePanel; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdToEntityId; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdsToEntityIds; + +/** + * Tests Incremental Import of Generic Assay data + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { 
"classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalGenericAssayImporter { + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + // stable_id: TCGA-A1-A0SE-01 + final int noChangeSampleId = 3; + final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); + + // Stable id that is part of the platform, but absent during the incremental upload + final String absentStableId = "L-685458"; + final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); + final Set beforeStableIds = new HashSet<>(noChangeStableIds); + { beforeStableIds.add(absentStableId); } + + private GeneticProfile ic50Profile; + private HashMap> beforeResult; + + /** + * Test incremental upload of GENERIC_ASSAY + */ + @Test + public void testGenericAssay() throws DaoException { + + File dataFolder = new File("src/test/resources/incremental/generic_assay/"); + File metaFile = new File(dataFolder, "meta_treatment_ic50.txt"); + File dataFile = new File(dataFolder, "data_treatment_ic50.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + assertEquals("After result should have +1 amount of entries", beforeResult.size() + 1, afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, geneStableIdsToEntityIds(noChangeStableIds), Set.of(noChangeSampleId)); + int erlotinibEntityId = geneStableIdToEntityId("Erlotinib"); + assertEquals(">8", 
afterResult.get(erlotinibEntityId).get(newSampleId)); + assertEquals("7.5", afterResult.get(erlotinibEntityId).get(updateSampleId)); + int irinotecanEntityId = geneStableIdToEntityId("Irinotecan"); + assertEquals("", afterResult.get(irinotecanEntityId).get(newSampleId)); + assertEquals("0.081", afterResult.get(irinotecanEntityId).get(updateSampleId)); + int absentEntityId = geneStableIdToEntityId(absentStableId); + assertEquals("", afterResult.get(absentEntityId).get(newSampleId)); + assertEquals("", afterResult.get(absentEntityId).get(updateSampleId)); + int lapatinibEntityId = geneStableIdToEntityId("Lapatinib"); + assertEquals("6.2", afterResult.get(lapatinibEntityId).get(newSampleId)); + assertEquals("7.848", afterResult.get(lapatinibEntityId).get(updateSampleId)); + int lbw242EntityId = geneStableIdToEntityId("LBW242"); + assertEquals("0.1", afterResult.get(lbw242EntityId).get(newSampleId)); + assertEquals(">~8", afterResult.get(lbw242EntityId).get(updateSampleId)); + assertNotNull("New generic entity has to be added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); + assertFalse("This sample should not get sample_profile", DaoSampleProfile.sampleExistsInGeneticProfile(noChangeSampleId, ic50Profile.getGeneticProfileId())); + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLGENASS"); + for (int sampleId : Set.of(updateSampleId, newSampleId)) { + assertEquals("Sample profile has to point to TSTGNPNLGENASS panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(sampleId, ic50Profile.getGeneticProfileId())); + } + } + + /** + * Test that incremental upload of GENERIC_ASSAY (patient level) is not supported + */ + @Test + public void testGenericAssayPatientLevel() throws DaoException { + + File dataFolder = new File("src/test/resources/incremental/generic_assay/"); + File metaFile = new File(dataFolder, "meta_treatment_ic50_patient_level.txt"); + File dataFile = new File(dataFolder, "data_treatment_ic50_patient_level.txt"); + + 
/** + * Test + */ + assertThrows("Incremental upload for generic assay patient_level data is not supported. Please use sample level instead.", + RuntimeException.class, () -> { + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + }); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + + ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + assertNotNull(ic50Profile); + + beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); + assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java new file mode 100644 index 00000000..c6fff1fa --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java @@ -0,0 +1,80 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportTabDelimData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdToEntityId; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdsToEntityIds; + +/** + * Tests Incremental Import is not supported for GSVA data type + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalGsvaImporter { + @Test + public void 
testGsvaIsNotSupported() throws DaoException, IOException { + GeneticProfile gsvaProfile = new GeneticProfile(); + gsvaProfile.setCancerStudyId(DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub").getInternalId()); + gsvaProfile.setStableId("gsva_scores"); + gsvaProfile.setDatatype("GENESET_SCORE"); + gsvaProfile.setGeneticAlterationType(GeneticAlterationType.GENESET_SCORE); + gsvaProfile.setProfileName("gsva test platform"); + DaoGeneticProfile.addGeneticProfile(gsvaProfile); + + assertThrows(UnsupportedOperationException.class, () -> + new ImportTabDelimData(File.createTempFile("gsva", "test"), + DaoGeneticProfile.getGeneticProfileByStableId("gsva_scores").getGeneticProfileId(), + null, + true, + DaoGeneOptimized.getInstance())); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java new file mode 100644 index 00000000..d44ccee5 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java @@ -0,0 +1,119 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of MRNA_EXPRESSION Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +@Rollback +@Transactional +public class TestIncrementalMrnaExpressionImport { + + /** + * Test incremental upload of MRNA_EXPRESSION + */ + @Test + public void testMrnaExpression() throws DaoException, IOException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); + assertNotNull(mrnaProfile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/mrna_expression/"); + File metaFile = new File(dataFolder, "meta_expression_Zscores.txt"); + File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample", beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + HashMap newGeneRow = afterResult.get(newGeneEntrezId); + assertEquals("-0.1735", newGeneRow.get(newSampleId)); + assertEquals("-0.6412", newGeneRow.get(updateSampleId)); + HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); + assertEquals("", absentGeneRow.get(newSampleId)); + assertEquals("", absentGeneRow.get(updateSampleId)); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java new file mode 100644 index 00000000..2a63df78 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java @@ -0,0 +1,137 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.ArrayList; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.dao.DaoMutation.getMutations; + +/** + * Tests Incremental Import of Mutation Molecular Data. + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalMutationsImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + /** + * Test inserting new mutation profile data for existing sample and genetic profile + */ + @Test + public void testInsertNewMutationProfileDataForExistingSampleAndProfile() throws DaoException { + GeneticProfile mutationGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mutations"); + assertNotNull(mutationGeneticProfile); + String mutationDataSampleId = "TCGA-A1-A0SE-01"; + /** + * this sample does not have mutation data attached + */ + Sample mutationDataSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), mutationDataSampleId); + + File singleTcgaSampleFolder = 
new File("src/test/resources/incremental/insert_mutation_data/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_mutations.txt"); + File dataFile = new File(singleTcgaSampleFolder, "data_mutations_extended.txt"); + + ImportProfileData importProfileData = new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importProfileData.run(); + + ArrayList insertedMutations = getMutations( + mutationGeneticProfile.getGeneticProfileId(), + mutationDataSample.getInternalId()); + assertEquals(3, insertedMutations.size()); + assertNotNull(insertedMutations.get(0).getEvent()); + assertNotNull(insertedMutations.get(1).getEvent()); + assertNotNull(insertedMutations.get(2).getEvent()); + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLMUTEXT"); + assertEquals("Sample profile has to point to TSTGNPNLMUTEXT panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(mutationDataSample.getInternalId(), mutationGeneticProfile.getGeneticProfileId())); + } + /** + * Test updating mutation profile data for existing sample. The mutation genetic profile exists. + */ + @Test + public void testUpdateMutationProfileDataForExistingSampleAndProfile() throws DaoException { + GeneticProfile mutationGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mutations"); + assertNotNull(mutationGeneticProfile); + String mutationDataSampleId = "TCGA-A1-A0SH-01"; + /** + * this sample does have 2 mutation data rows attached. 
See seed_mini.sql + */ + Sample mutationDataSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), mutationDataSampleId); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_mutation_data/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_mutations.txt"); + File dataFile = new File(singleTcgaSampleFolder, "data_mutations_extended.txt"); + + ImportProfileData importProfileData = new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importProfileData.run(); + + ArrayList insertedMutations = getMutations( + mutationGeneticProfile.getGeneticProfileId(), + mutationDataSample.getInternalId()); + assertEquals(3, insertedMutations.size()); + assertNotNull(insertedMutations.get(0).getEvent()); + assertNotNull(insertedMutations.get(1).getEvent()); + assertNotNull(insertedMutations.get(2).getEvent()); + Set entrezIds = insertedMutations.stream().map(m -> m.getEntrezGeneId()).collect(Collectors.toSet()); + Set expected = Set.of(207L, 208L, 672L); + assertEquals(expected, entrezIds); + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLMUTEXT"); + assertEquals("Sample profile has to point to TSTGNPNLMUTEXT panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(mutationDataSample.getInternalId(), mutationGeneticProfile.getGeneticProfileId())); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java new file mode 100644 index 00000000..7dcf4d3c --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java @@ -0,0 +1,117 @@ +/* + * This file is part of cBioPortal. 
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.scripts.ImportClinicalData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +/** + * Tests Incremental Import of Patient Clinical Data.
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalPatientsImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + + @Test + public void testInsertNewPatient() throws DaoException { + String newPatientId = "TEST-INC-TCGA-P2"; + File singleTcgaSampleFolder = new File("src/test/resources/incremental/insert_single_tcga_patient/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_patient.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_PATIENT.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importClinicalData.run(); + + Patient newPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), newPatientId); + assertNotNull("Patient with id " + newPatientId + " has to be injected to the DB.", newPatient); + + List clinicalData = DaoClinicalData.getData(cancerStudy.getInternalId(), List.of(newPatientId)); + Map patientAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "SUBTYPE", "basal-like", + "OS_STATUS", "0:LIVING", + "OS_MONTHS", "45.6", + "DFS_STATUS", "1:Recurred/Progressed"), patientAttrs); + } + + @Test + public void testUpdatePatientAttributes() throws DaoException { + String updatedPatientId = "TCGA-A1-A0SB"; + + Patient tcgaPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), + updatedPatientId); + DaoClinicalData.addPatientDatum(tcgaPatient.getInternalId(), 
"SUBTYPE", "Luminal A"); + DaoClinicalData.addPatientDatum(tcgaPatient.getInternalId(), "OS_STATUS", "0:LIVING"); + DaoClinicalData.addPatientDatum(tcgaPatient.getInternalId(), "OS_MONTHS", "34.56"); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_single_tcga_patient/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_patient.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_PATIENT.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importClinicalData.run(); + + Patient newPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), updatedPatientId); + assertNotNull("Patient with id " + updatedPatientId + " has to be injected to the DB.", newPatient); + + List clinicalData = DaoClinicalData.getData(cancerStudy.getInternalId(), List.of(updatedPatientId)); + Map patientAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "SUBTYPE", "basal-like", + "OS_MONTHS", "56.7", + "DFS_STATUS", "1:Recurred/Progressed", + "DFS_MONTHS", "100"), patientAttrs); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java new file mode 100644 index 00000000..f3933b27 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java @@ -0,0 +1,122 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of PROTEIN_LEVEL Data.
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +@Rollback +@Transactional +public class TestIncrementalProteinLevelImport { + + /** + * Test incremental upload of PROTEIN_LEVEL + */ + @Test + public void testRppa() throws DaoException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); + assertNotNull(rppaProfile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/protein_level/"); + File metaFile = new File(dataFolder, "meta_rppa.txt"); + File dataFile = new File(dataFolder, "data_rppa.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); + assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java new file mode 100644 index 00000000..93fea8ce --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java @@ -0,0 +1,185 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.scripts.ImportClinicalData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.Assert.*; + +/** + * Tests Incremental Import of Sample Clinical Data. + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalSamplesImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + private final String UPDATE_TCGA_SAMPLE_ID = "TCGA-A1-A0SH-01"; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + /** + * Test inserting new sample for existing patient + */ + @Test + public void testInsertNewSampleForExistingPatient() throws DaoException { + /** + * prepare a new patient without samples + */ + String patientId = "TEST-INC-TCGA-P1"; + Patient patient = new Patient(cancerStudy, patientId); + int internalPatientId = DaoPatient.addPatient(patient); + DaoClinicalData.addPatientDatum(internalPatientId, "OS_STATUS", "0:LIVING"); + + String newSampleId = "TEST-INC-TCGA-P1-S1"; + File singleTcgaSampleFolder = new File("src/test/resources/incremental/insert_single_tcga_sample/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File 
dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_SAMPLE.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importClinicalData.run(); + + List samples = DaoSample.getSamplesByPatientId(internalPatientId); + assertEquals("A new sample has to be attached to the patient", 1, samples.size()); + Sample sample = samples.get(0); + assertEquals(newSampleId, sample.getStableId()); + + List sampleClinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), List.of(newSampleId)); + Map sampleAttrs = sampleClinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "SUBTYPE", "basal-like", + "OS_STATUS", "1:DECEASED", + "OS_MONTHS", "12.34", + "DFS_STATUS", "1:Recurred/Progressed"), sampleAttrs); + + // Patient attributes get SAMPLE_COUNT + List patientClinicalData = DaoClinicalData.getData(cancerStudy.getInternalId(), List.of(patientId)); + Map patientAttrs = patientClinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "OS_STATUS", "0:LIVING", + "SAMPLE_COUNT", "1"), patientAttrs); + } + + /** + * Test inserting new sample for nonexistent patient. + * EXPECTED RESULTS: + * 1. The new patient entry has to be inserted + * 2. 
Sample and all its clinical attributes have to be inserted + */ + @Test + public void testInsertNewSampleForNonexistentPatient() throws DaoException { + String newPatientId = "TEST-INC-TCGA-P2"; + String newSampleId = "TEST-INC-TCGA-P2-S1"; + File singleTcgaSampleFolder = new File("src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_SAMPLE.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importClinicalData.run(); + + Patient newPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), newPatientId); + assertNotNull("The new patient has to be created.", newPatient); + + List samples = DaoSample.getSamplesByPatientId(newPatient.getInternalId()); + assertEquals("A new sample has to be attached to the patient", 1, samples.size()); + Sample sample = samples.get(0); + assertEquals(newSampleId, sample.getStableId()); + + List clinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), List.of(newSampleId)); + Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "SUBTYPE", "Luminal A", + "OS_STATUS", "0:LIVING", + "OS_MONTHS", "23.45", + "DFS_STATUS", "1:Recurred/Progressed", + "DFS_MONTHS", "100"), sampleAttrs); + } + + /** + * Test reloading sample clinical attributes + */ + @Test + public void testReloadSampleClinicalAttributes() throws DaoException { + /** + * Add to a tcga sample some clinical attributes (test data sets doesn't have any) + */ + Sample tcgaSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), + UPDATE_TCGA_SAMPLE_ID); + 
DaoClinicalData.addSampleDatum(tcgaSample.getInternalId(), "SUBTYPE", "Luminal A"); + DaoClinicalData.addSampleDatum(tcgaSample.getInternalId(), "OS_STATUS", "0:LIVING"); + DaoClinicalData.addSampleDatum(tcgaSample.getInternalId(), "OS_MONTHS", "34.56"); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_single_tcga_sample/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_SAMPLE.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importClinicalData.run(); + + List clinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), List.of(UPDATE_TCGA_SAMPLE_ID)); + Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "OS_STATUS", "1:DECEASED", + "OS_MONTHS", "45.67", + "DFS_STATUS", "1:Recurred/Progressed", + "DFS_MONTHS", "123"), sampleAttrs); + + /** + * Sub-entries stayed as they were, not removed. + */ + GeneticProfile mutationsProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mutations"); + assertNotNull(mutationsProfile); + ArrayList mutations = DaoMutation.getMutations(mutationsProfile.getGeneticProfileId(), tcgaSample.getInternalId()); + assertEquals(2, mutations.size()); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java new file mode 100644 index 00000000..e477d91f --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java @@ -0,0 +1,125 @@ +/* + * This file is part of cBioPortal. 
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGenePanel; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.DaoStructuralVariant; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.GenePanel; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.model.StructuralVariant; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static
org.mskcc.cbio.portal.dao.DaoMutation.getMutations; + +/** + * Tests Incremental Import of Structural Variants Data. + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalStructuralVariantsImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + /** + * Test incremental upload of SV data + */ + @Test + public void testIncrementalUpload() throws DaoException { + GeneticProfile svGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_structural_variants"); + assertNotNull(svGeneticProfile); + String svDataSampleId = "TCGA-A1-A0SE-01"; + /** + * this sample does not have SV data attached + */ + Sample svDataSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), svDataSampleId); + + StructuralVariant structuralVariant = new StructuralVariant(); + structuralVariant.setSampleIdInternal(svDataSample.getInternalId()); + structuralVariant.setGeneticProfileId(svGeneticProfile.getGeneticProfileId()); + structuralVariant.setAnnotation("TESTANNOT"); + structuralVariant.setDriverFilter("DRVFILTER"); + structuralVariant.setSite1RegionNumber(1); + structuralVariant.setSite2RegionNumber(2); + structuralVariant.setComments("This record has to be overwritten"); + DaoStructuralVariant.addStructuralVariantToBulkLoader(structuralVariant); + MySQLbulkLoader.flushAll(); + DaoSampleProfile.upsertSampleToProfileMapping(List.of( + new DaoSampleProfile.SampleProfileTuple(svGeneticProfile.getGeneticProfileId(), svDataSample.getInternalId(), null))); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/structural_variants/"); + File metaFile = new 
File(singleTcgaSampleFolder, "meta_structural_variants.txt"); + File dataFile = new File(singleTcgaSampleFolder, "data_structural_variants.txt"); + + ImportProfileData importProfileData = new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importProfileData.run(); + + List structuralVariants = DaoStructuralVariant.getAllStructuralVariants(); + assertEquals(3, structuralVariants.size()); + Set.of("site1_test_desc_1", "site1_test_desc_2", "site1_test_desc_3").forEach(site1Desc -> { + Optional osv = structuralVariants.stream() + .filter(sv -> site1Desc.equals(sv.getSite1Description()) + && sv.getSampleIdInternal() == svDataSample.getInternalId() + && sv.getGeneticProfileId() == svGeneticProfile.getGeneticProfileId()).findFirst(); + assertTrue(osv.isPresent()); + assertNotNull(osv.get().getDriverFilter()); + }); + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLSV"); + assertEquals("Sample profile has to point to TSTGNPNLSV panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(svDataSample.getInternalId(), svGeneticProfile.getGeneticProfileId())); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java new file mode 100644 index 00000000..d909015c --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java @@ -0,0 +1,103 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.runner.RunWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportTabDelimData; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Propagation; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.HashMap; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** + * Tests Transaction for Incremental Import of Tab Delimited Data.
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +public class TestIncrementalTabDelimDataTransaction { + + /** + * Test transaction + */ + @Test + @ExtendWith(MockitoExtension.class) + //Mysql does not support nested transactions. That's why we disable the outer transaction. + @Transactional(propagation = Propagation.NOT_SUPPORTED) + public void testTransaction() throws Exception { + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); + + File dataFolder = new File("src/test/resources/incremental/mrna_expression/"); + File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + + DaoGeneOptimized mockedDao = mock(DaoGeneOptimized.class); + + when(mockedDao.getGene(anyLong())) + .thenThrow(new RuntimeException("Simulated error")); + /** + * Test + */ + try { + new ImportTabDelimData(dataFile, + mrnaProfile.getGeneticProfileId(), + null, + true, + mockedDao).importData(); + fail("Import has to fail"); + } catch (RuntimeException runtimeException) { + assertTrue(runtimeException.getMessage(), runtimeException.getMessage().contains("Simulated error")); + assertTrue(true); + } + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals(beforeResult, afterResult); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTimelineImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTimelineImport.java new file mode 100644 index 00000000..c077c58b --- /dev/null +++ 
b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTimelineImport.java @@ -0,0 +1,115 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoClinicalEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ClinicalEvent; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.scripts.ImportTimelineData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +/** + * Tests Incremental Import of Timeline Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalTimelineImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + + @Test + public void testTimelineDataReloading() throws DaoException { + MySQLbulkLoader.bulkLoadOn(); + ClinicalEvent event = new ClinicalEvent(); + event.setClinicalEventId(1L); + Patient sbPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "TCGA-A1-A0SB"); + event.setPatientId(sbPatient.getInternalId()); + event.setStartDate(5L); + event.setEventType("SPECIMEN"); + event.setEventData(Map.of("SPECIMEN_SITE", "specimen_site_to_erase")); + DaoClinicalEvent.addClinicalEvent(event); + MySQLbulkLoader.flushAll(); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/clinical/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_timeline.txt"); + File dataFile = new File(singleTcgaSampleFolder, "data_timeline.txt"); + + ImportTimelineData importTimelineData = new ImportTimelineData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importTimelineData.run(); + + List sbClinicalEvents = DaoClinicalEvent.getClinicalEvent(sbPatient.getInternalId()); + assertEquals(2, sbClinicalEvents.size()); + ClinicalEvent sbSpecimen = sbClinicalEvents.stream().filter(ce -> ce.getEventType().equals("SPECIMEN")).findFirst().get(); + assertEquals(20L, sbSpecimen.getStartDate()); + assertEquals(60L, sbSpecimen.getStopDate()); + assertEquals(Map.of( + "SPECIMEN_SITE", "test_specimen_site_1", + "SPECIMEN_TYPE", "test_specimen_type", + "SOURCE", "test_source_3" + ), 
sbSpecimen.getEventData()); + ClinicalEvent sbStatus = sbClinicalEvents.stream().filter(ce -> ce.getEventType().equals("STATUS")).findFirst().get(); + assertEquals(10L, sbStatus.getStartDate()); + assertEquals(20L, sbStatus.getStopDate()); + assertEquals(Map.of("SOURCE", "test_source_4"), sbStatus.getEventData()); + + Patient sdPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "TCGA-A1-A0SD"); + List sdClinicalEvents = DaoClinicalEvent.getClinicalEvent(sdPatient.getInternalId()); + assertEquals(1, sdClinicalEvents.size()); + ClinicalEvent sdStatus = sdClinicalEvents.stream().filter(ce -> ce.getEventType().equals("STATUS")).findFirst().get(); + assertEquals(45L, sdStatus.getStartDate()); + assertNull(sdStatus.getStopDate()); + assertEquals(Map.of("SOURCE", "test_source_2"), sdStatus.getEventData()); + + Patient nonexistentPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "NONEXISTENT_PATIENT"); + assertNull(nonexistentPatient); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java new file mode 100644 index 00000000..e473955f --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java @@ -0,0 +1,172 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; + +import static org.junit.Assert.*; + +/** + * Tests Incremental Import of Case Lists. + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestUpdateCaseListsSampleIds { + + DaoSampleList daoSampleList = new DaoSampleList(); + /** + * Test adding sample id to the all case list. It is the default behaviour of the command. + */ + @Test + public void testAddSampleIdToAllCaseList() throws DaoException { + String sampleIdToAdd = "TCGA-XX-0800-01"; + File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/add_sample_to_case_list/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + + assertSampleIdNotInCaseLists(sampleIdToAdd, "study_tcga_pub_all"); + + UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { + "--meta", metaFile.getAbsolutePath(), + }); + importClinicalData.run(); + + assertSampleIdInCaseLists(sampleIdToAdd, "study_tcga_pub_all"); + } + + /** + * Test adding sample id to a MRNA case list. + * Sample has to be added to the all case list as well. 
+ */ + @Test + public void testAddSampleIdToMrnaCaseList() throws DaoException { + String sampleIdToAdd = "TCGA-XX-0800-01"; + File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/add_sample_to_case_list/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File caseListsDir = new File(singleTcgaSampleFolder, "case_lists/"); + + assertSampleIdNotInCaseLists(sampleIdToAdd, "study_tcga_pub_all", "study_tcga_pub_mrna"); + + UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--case-lists", caseListsDir.getAbsolutePath() + }); + importClinicalData.run(); + + assertSampleIdInCaseLists(sampleIdToAdd, "study_tcga_pub_all", "study_tcga_pub_mrna"); + } + + /** + * Test re-adding sample to very same case list (efficiently no-op) should not complain. + */ + @Test + public void testReAddingSampleToTheSameListShouldWork() throws DaoException { + String sampleIdToAdd = "TCGA-A1-A0SH-01"; + String[] caseListsSampleIsPartOf = new String[] { + "study_tcga_pub_all", + "study_tcga_pub_acgh", + "study_tcga_pub_cnaseq", + "study_tcga_pub_complete", + "study_tcga_pub_log2CNA", + "study_tcga_pub_mrna", + "study_tcga_pub_sequenced"}; + String[] caseListsSampleIsNotPartOf = new String[] { + "study_tcga_pub_methylation_hm27", + }; + + File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/update_tcga_samples/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File caseListsDir = new File(singleTcgaSampleFolder, "case_lists/"); + + assertSampleIdInCaseLists(sampleIdToAdd, caseListsSampleIsPartOf); + assertSampleIdNotInCaseLists(sampleIdToAdd, caseListsSampleIsNotPartOf); + + UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--case-lists", caseListsDir.getAbsolutePath() + }); + importClinicalData.run(); + + 
assertSampleIdInCaseLists(sampleIdToAdd, caseListsSampleIsPartOf); + assertSampleIdNotInCaseLists(sampleIdToAdd, caseListsSampleIsNotPartOf); + } + + /** + * Test removing sample ids from not specified case lists + */ + @Test + public void testRemovingSampleIdsFromNotSpecifiedCaseLists() throws DaoException { + String sampleIdToAdd = "TCGA-A1-A0SH-01"; + + File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/update_tcga_samples/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File caseListsDir = new File(singleTcgaSampleFolder, "case_lists/"); + File caseAcghFile = new File(caseListsDir, "case_acgh.txt"); + + UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--case-lists", caseAcghFile.getAbsolutePath() + }); + importClinicalData.run(); + + assertSampleIdInCaseLists(sampleIdToAdd, "study_tcga_pub_all", "study_tcga_pub_acgh"); + assertSampleIdNotInCaseLists(sampleIdToAdd, "study_tcga_pub_cnaseq", + "study_tcga_pub_complete", + "study_tcga_pub_log2CNA", + "study_tcga_pub_methylation_hm27", + "study_tcga_pub_mrna", + "study_tcga_pub_sequenced"); + } + @Before + public void init() { + // FIXME How we can remove this re-caching and keep tests to work? + // pre conditions (asserts before the testee operation is called) are relying on it + DaoCancerStudy.reCacheAll(); + } + + private void assertSampleIdInCaseLists(String sampleId, String... caseListStableIds) throws DaoException { + for (String caseListStableId : caseListStableIds) { + SampleList sampleList = daoSampleList.getSampleListByStableId(caseListStableId); + assertNotNull(caseListStableId + " case list has to exist", sampleList); + assertTrue(sampleId + " has to be in the " + caseListStableId + " case list", sampleList.getSampleList().contains(sampleId)); + }; + } + + private void assertSampleIdNotInCaseLists(String sampleId, String... 
caseListStableIds) throws DaoException { + for (String caseListStableId : caseListStableIds) { + SampleList sampleList = daoSampleList.getSampleListByStableId(caseListStableId); + assertNotNull(caseListStableId + " case list has to exist", sampleList); + assertTrue(sampleId + " has not to be in the " + caseListStableId + " case list", !sampleList.getSampleList().contains(sampleId)); + }; + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java index d317aa03..27eb111e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java @@ -108,7 +108,6 @@ public void testImportCnaDiscreteLongDataAddsSamples() throws Exception { geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test new samples are added: @@ -134,7 +133,6 @@ public void testImportCnaDiscreteLongDataAddsCnaEvents() throws Exception { geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces ).importData(); @@ -180,7 +178,7 @@ public void testImportCnaDiscreteLongDataAddsCnaEvents() throws Exception { @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterations() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -188,7 +186,6 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterations() throws Excepti geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - 
DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test genetic alterations are added for all genes: @@ -205,7 +202,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterations() throws Excepti @Test public void testImportCnaDiscreteLongDataAddsMissingGeneticAlterations() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_with_cna_events_missing.txt"); new ImportCnaDiscreteLongData( @@ -213,7 +210,6 @@ public void testImportCnaDiscreteLongDataAddsMissingGeneticAlterations() throws geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test genetic alteration are added of non-cna event: @@ -233,7 +229,7 @@ public void testImportCnaDiscreteLongDataAddsMissingGeneticAlterations() throws @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterationsAndProfileSamplesInCorrectOrder() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -241,7 +237,6 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsAndProfileSamples geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test order of genetic alteration values: @@ -260,7 +255,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsAndProfileSamples @Test public void testImportCnaDiscreteLongDataHandlesEntriesWithoutEntrezButWithHugo() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - 
assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_without_entrez_with_hugo.txt"); new ImportCnaDiscreteLongData( @@ -268,7 +263,6 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithoutEntrezButWithHugo( geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test order of genetic alteration values: @@ -283,7 +277,7 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithoutEntrezButWithHugo( @Test public void testImportCnaDiscreteLongDataHandlesEntriesWithWrongEntrezAndCorrectHugo() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_with_wrong_entrez_and_correct_hugo.txt"); new ImportCnaDiscreteLongData( @@ -291,7 +285,6 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithWrongEntrezAndCorrect geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test order of genetic alteration values: @@ -306,7 +299,7 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithWrongEntrezAndCorrect @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterationsFromNonCnaEvents() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -314,7 +307,6 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsFromNonCnaEvents( geneticProfile.getGeneticProfileId(), genePanel, 
DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test genetic alteration are added of non-cna event: @@ -334,7 +326,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsFromNonCnaEvents( @Test public void testImportCnaDiscreteLongDataIgnoresLineWithDuplicateGene() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -342,7 +334,6 @@ public void testImportCnaDiscreteLongDataIgnoresLineWithDuplicateGene() throws E geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test genetic alteration are deduplicated: @@ -364,7 +355,6 @@ public void testImportCnaDiscreteLongDataAddsPdAnnotations() throws Exception { geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces ).importData(); List genes = newArrayList(3983L, 27334L, 2115L); @@ -394,7 +384,6 @@ public void testImportCnaDiscreteLongData_changesProfileDatatypeFromDiscreteLong geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces ).importData(); @@ -418,7 +407,6 @@ public void testImportCnaDiscreteLongDataOnlyAddsSpecifiedCustomNamespaceColumns geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), namespacesToImport ).importData(); @@ -458,7 +446,6 @@ public void testImportCnaDiscreteLongDataImportsMissingNamespacesAsNull() throws geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), namespacesToImport ).importData(); @@ -507,7 +494,6 @@ 
public void testImportCnaDiscreteLongDataAddsCustomNamespaceColumnsForEachSample geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), namespaces ).importData(); @@ -549,7 +535,6 @@ public void testImportCnaDiscreteLongDataImportsCustomNamespaceColumnsAsNullWhen geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), namespaces ).importData(); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java index a0a33c6d..fa7e0449 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java @@ -95,10 +95,11 @@ public void testImportGenericAssayData() throws Exception { // Open mutational signature test data file File file = new File("src/test/resources/data_mutational_signature.txt"); - + int numRecordsForGenericAssayBefore = getNumRecordsForGenericAssay(); + // import data and test all mutational signatures were added ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "name,description", false); - assertEquals(60, getNumRecordsForGenericAssay()); + assertEquals(numRecordsForGenericAssayBefore + 60, getNumRecordsForGenericAssay()); // test wether a record can be retrieved via stable id GenericAssayMeta genericAssayMeta1 = DaoGenericAssay.getGenericAssayMetaByStableId("mean_1"); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java index 123715f8..480e9a61 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java +++ 
b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java @@ -53,7 +53,6 @@ import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.scripts.ImportGenericAssayPatientLevelData; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; @@ -151,8 +150,7 @@ private void runImportGenericAssayPatientLevelData() throws DaoException, IOExce File file = new File("src/test/resources/tabDelimitedData/data_patient_generic_assay.txt"); ImportGenericAssayPatientLevelData parser = new ImportGenericAssayPatientLevelData(file, null, geneticProfileId, null, "name,description"); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + parser.importData(); HashMap> geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, Arrays.asList(geneticEntity1.getId(), geneticEntity2.getId())); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportStructuralVariantData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportStructuralVariantData.java index 76a58f85..2f91a779 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportStructuralVariantData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportStructuralVariantData.java @@ -91,7 +91,7 @@ public void testImportStructuralVariantData() throws DaoException, IOException { // Load test structural variants File file = new File("src/test/resources/data_structural_variants.txt"); - ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, noNamespaces); + ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, noNamespaces, false); 
importer.importData(); MySQLbulkLoader.flushAll(); @@ -133,7 +133,7 @@ public void testImportStructuralVariantDataImportsCustomNamespacesFromTwoSamples // Load test structural variants File file = new File("src/test/resources/data_structural_variants.txt"); Set namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2"); - ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport); + ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false); importer.importData(); MySQLbulkLoader.flushAll(); @@ -159,7 +159,7 @@ public void testImportStructuralVariantDataIgnoresUnspecifiedNamespaces() throws // Load test structural variants File file = new File("src/test/resources/data_structural_variants_with_unspecified_namespace.txt"); Set namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2"); - ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport); + ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false); importer.importData(); MySQLbulkLoader.flushAll(); @@ -182,7 +182,7 @@ public void testImportStructuralVariantDataWithNoNamespaceData() throws DaoExcep // Load test structural variants File file = new File("src/test/resources/data_structural_variants_with_no_namespace_data.txt"); Set namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2"); - ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport); + ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false); importer.importData(); MySQLbulkLoader.flushAll(); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java 
b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java index 33779cd3..68f8940d 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java @@ -38,7 +38,6 @@ import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; -import org.mskcc.cbio.portal.dao.DaoGeneset; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.dao.DaoPatient; @@ -48,15 +47,12 @@ import org.mskcc.cbio.portal.model.CancerStudy; import org.mskcc.cbio.portal.model.CanonicalGene; import org.mskcc.cbio.portal.model.CopyNumberStatus; -import org.mskcc.cbio.portal.model.Geneset; import org.mskcc.cbio.portal.model.GeneticAlterationType; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.model.Sample; -import org.mskcc.cbio.portal.scripts.ImportGenesetData; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.mskcc.cbio.portal.util.ConsoleUtil; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; @@ -171,9 +167,8 @@ private void runImportCnaData() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, false, 
DaoGeneOptimized.getInstance()); + parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 999999207); assertEquals ("0", value); @@ -236,9 +231,8 @@ private void runImportCnaData2() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test2.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, false, DaoGeneOptimized.getInstance()); + parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 207); assertEquals (value, "0"); @@ -321,9 +315,8 @@ private void runImportRnaData1() throws DaoException, IOException{ // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/mrna_test.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneOptimized.getInstance()); + parser.importData(); ConsoleUtil.showMessages(); int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "DD639").getInternalId(); @@ -375,9 +368,8 @@ public void testImportmRnaData2() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_expression2.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + 
ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneOptimized.getInstance()); + parser.importData(); // check if expected warnings are given: ArrayList warnings = ProgressMonitor.getWarnings(); @@ -468,9 +460,8 @@ public void testImportRppaData() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_rppa.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneOptimized.getInstance()); + parser.importData(); ConsoleUtil.showMessages(); int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "SAMPLE1").getInternalId(); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestIntegrationTest.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestIntegrationTest.java index bc3bbac1..dc76865f 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestIntegrationTest.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestIntegrationTest.java @@ -28,16 +28,8 @@ import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; -import org.cbioportal.model.GenericAssayData; -import org.cbioportal.model.GenesetMolecularData; -import org.cbioportal.model.StructuralVariant; -import org.cbioportal.model.StructuralVariantQuery; -import org.cbioportal.persistence.PersistenceConstants; -import org.cbioportal.service.GenericAssayService; -import org.cbioportal.service.GenesetDataService; -import org.cbioportal.service.StructuralVariantService; +import org.cbioportal.model.GeneticEntity; import 
org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.mskcc.cbio.portal.dao.DaoCancerStudy; @@ -51,7 +43,9 @@ import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.dao.DaoGistic; import org.mskcc.cbio.portal.dao.DaoMutation; +import org.mskcc.cbio.portal.dao.DaoSample; import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.dao.DaoStructuralVariant; import org.mskcc.cbio.portal.dao.DaoTypeOfCancer; import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CancerStudy; @@ -59,9 +53,12 @@ import org.mskcc.cbio.portal.model.ClinicalAttribute; import org.mskcc.cbio.portal.model.ClinicalData; import org.mskcc.cbio.portal.model.ExtendedMutation; +import org.mskcc.cbio.portal.model.Geneset; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.model.Gistic; +import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.model.SampleList; +import org.mskcc.cbio.portal.model.StructuralVariant; import org.mskcc.cbio.portal.model.TypeOfCancer; import org.mskcc.cbio.portal.scripts.ImportGenePanel; import org.mskcc.cbio.portal.util.ConsoleUtil; @@ -81,12 +78,12 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -129,17 +126,15 @@ public void setUp() throws DaoException, JsonParseException, JsonMappingExceptio * * @throws Throwable */ - @Ignore("Skip TestIntegrationTest.testLoadStudyEs0 due to NullPointerException") @Test public void testLoadStudyEs0() throws Throwable { try { // === assumptions that we rely upon in the checks later on: ==== - // assumption 1: there are no clinical attributes at the start 
of the test: - assertEquals(0, DaoClinicalAttributeMeta.getAllMap().size()); - // use this to get progress info/troubleshoot: // ProgressMonitor.setConsoleMode(true); + int numberOfMutationsInDb = DaoMutation.getAllMutations().size(); + // ==== Load the data ==== TransactionalScripts scripts = applicationContext.getBean(TransactionalScripts.class); scripts.run(); @@ -167,29 +162,25 @@ public void testLoadStudyEs0() throws Throwable { List mutations = DaoMutation.getAllMutations(); // check number of mutation records in the database // 3 in seed_mini.sql + 33 study_es_0/data_mutations_extended.maf (2 silent ignored)) - // so we expect 34 records in DB: - assertEquals(34, mutations.size()); + // so we expect +34 records in DB: + assertEquals(numberOfMutationsInDb + 34, mutations.size()); //===== Check STRUCTURAL VARIANT data ======== - // 45 structural variant events are imported, using 31 unique genes, using 39 samples - // Not all 31 genes have to be queried. BRAF is fused to many of the test genes. 
- List entrezGeneIds = new ArrayList(Arrays.asList(57670, 673, 8031, 5979, 27436, 238, 7113, 2078, 1956, 238, 5774, 2115, 7273)); - // Add samples and molecular profile IDs - List sampleIds = new ArrayList(Arrays.asList("TCGA-A2-A04P-01", "TCGA-A1-A0SB-01", "TCGA-A1-A0SB-01", "TCGA-A2-A04P-01", "TCGA-A2-A04P-01", "TCGA-A1-A0SK-01", "TCGA-A2-A0CM-01", "TCGA-AR-A1AR-01", "TCGA-B6-A0WX-01", "TCGA-BH-A1F0-01", "TCGA-B6-A0I6-01", "TCGA-BH-A18V-01", "TCGA-BH-A18Q-01", "TCGA-BH-A18K-01", "TCGA-BH-A0HL-01", "TCGA-BH-A0E0-01", "TCGA-BH-A0RX-01", "TCGA-A7-A13D-01", "TCGA-BH-A0E6-01", "TCGA-AO-A0J4-01", "TCGA-A7-A0CE-01", "TCGA-A7-A13E-01", "TCGA-A7-A0DA-01", "TCGA-D8-A142-01", "TCGA-D8-A143-01", "TCGA-AQ-A04J-01", "TCGA-BH-A0HN-01", "TCGA-A2-A0T0-01", "TCGA-A2-A0YE-01", "TCGA-A2-A0YJ-01", "TCGA-A2-A0D0-01", "TCGA-A2-A04U-01", "TCGA-AO-A0J6-01", "TCGA-A2-A0YM-01", "TCGA-A2-A0D2-01", "TCGA-BH-A0B3-01", "TCGA-A2-A04Q-01", "TCGA-A2-A0SX-01", "TCGA-AO-A0JL-01")); - List geneticProfileStableIds = new ArrayList(); - geneticProfileStableIds = Collections.nCopies(sampleIds.size(), "study_es_0_structural_variants"); + GeneticProfile svGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_es_0_structural_variants"); - StructuralVariantService structuralVariantService = applicationContext.getBean(StructuralVariantService.class); - List noStructVars = Collections.emptyList(); - List structuralVariants = structuralVariantService.fetchStructuralVariants(geneticProfileStableIds, sampleIds, entrezGeneIds, noStructVars); + List structuralVariants = DaoStructuralVariant.getAllStructuralVariants() + .stream() + .filter(sv -> + sv.getGeneticProfileId() == svGeneticProfile.getGeneticProfileId() + ) + .collect(Collectors.toList()); - // Check if all 45 structural variants are imported - assertEquals(45, structuralVariants.size()); + // Check if all 48 structural variants are imported + assertEquals(48, structuralVariants.size()); //===== Check CNA data ======== DaoGeneticAlteration 
daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - ArrayList hugoGeneSymbols = new ArrayList(Arrays.asList("ACAP3","AGRN","ATAD3A","ATAD3B","ATAD3C","AURKAIP1","ERCC5")); ArrayList entrezIds = new ArrayList(Arrays.asList(116983L, 375790L, 55210L, 83858L, 219293L, 54998L, 2073L)); GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_es_0_gistic"); int countAMP_DEL = 0; @@ -286,7 +277,7 @@ public void testLoadStudyEs0() throws Throwable { // ===== check gistic data // servlet uses this query: ArrayList gistics = DaoGistic.getAllGisticByCancerStudyId(cancerStudy.getInternalId()); - assertEquals(12, gistics.size()); + assertEquals(11, gistics.size()); Gistic gisticChr10 = null, gisticChr20 = null; for (Gistic gistic : gistics) { if (gistic.getChromosome() == 20) { @@ -340,53 +331,70 @@ public void testLoadStudyEs0() throws Throwable { // ===== check mutational signature String testMutationalSignatureStableIds = "mean_1"; + GeneticEntity mutationSignatureGeneticEntity = DaoGeneticEntity.getGeneticEntityByStableId(testMutationalSignatureStableIds); + assertNotNull(mutationSignatureGeneticEntity); + String testMutationalSignatureMolecularProfileIds = "study_es_0_mutational_signature"; - assertNotNull(DaoGeneticEntity.getGeneticEntityByStableId(testMutationalSignatureStableIds)); + GeneticProfile mutationSignatureProfile = DaoGeneticProfile.getGeneticProfileByStableId(testMutationalSignatureMolecularProfileIds); + assertNotNull(mutationSignatureProfile); // ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 // TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 // TCGA-BH-A0HP-01 TCGA-BH-A18P-01 // mean_1 ... ... ... 
0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723 - GenericAssayService genericAssayService = applicationContext.getBean(GenericAssayService.class); - List mutationalSignatureData = genericAssayService.fetchGenericAssayData(Arrays.asList(testMutationalSignatureMolecularProfileIds), - Arrays.asList("TCGA-A1-A0SB-01", "TCGA-A1-A0SH-01"), Arrays.asList(testMutationalSignatureStableIds), PersistenceConstants.SUMMARY_PROJECTION); - assertEquals(2, mutationalSignatureData.size()); - assertEquals("0.370266873", mutationalSignatureData.get(0).getValue()); - assertEquals("0.022753384", mutationalSignatureData.get(1).getValue()); + HashMap mutationalSignatureData = DaoGeneticAlteration + .getInstance() + .getGeneticAlterationMapForEntityIds( + mutationSignatureProfile.getGeneticProfileId(), + List.of(mutationSignatureGeneticEntity.getId())).get(mutationSignatureGeneticEntity.getId()); + Sample sbSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), "TCGA-A1-A0SB-01"); + Sample shSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), "TCGA-A1-A0SH-01"); + assertEquals("0.370266873", mutationalSignatureData.get(sbSample.getInternalId())); + assertEquals("0.022753384", mutationalSignatureData.get(shSample.getInternalId())); // ===== check GSVA data // ... 
- String testGeneset = "GO_ATP_DEPENDENT_CHROMATIN_REMODELING"; - assertEquals(4, DaoGeneset.getGenesetByExternalId(testGeneset).getGenesetGeneIds().size()); + String testGenesetExternalId = "GO_ATP_DEPENDENT_CHROMATIN_REMODELING"; + Geneset testGeneset = DaoGeneset.getGenesetByExternalId(testGenesetExternalId); + assertEquals(4, testGeneset.getGenesetGeneIds().size()); // scores: TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 // TCGA-A2-A04U-01 // GO_ATP_DEPENDENT_CHROMATIN_REMODELING -0.293861251463613 -0.226227563676626 // -0.546556962547473 -0.0811115513543749 0.56919171543422 // using new api: - GenesetDataService genesetDataService = applicationContext.getBean(GenesetDataService.class); - List genesetData = genesetDataService.fetchGenesetData("study_es_0_gsva_scores", - "study_es_0_all", Arrays.asList(testGeneset)); + GeneticProfile gsvaScoresProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_es_0_gsva_scores"); + HashMap genesetData = DaoGeneticAlteration + .getInstance() + .getGeneticAlterationMapForEntityIds( + gsvaScoresProfile.getGeneticProfileId(), + List.of(testGeneset.getGeneticEntityId())).get(testGeneset.getGeneticEntityId()); assertEquals(5, genesetData.size()); - genesetData = genesetDataService.fetchGenesetData("study_es_0_gsva_scores", - Arrays.asList("TCGA-A1-A0SB-01", "TCGA-A1-A0SH-01"), Arrays.asList(testGeneset)); - assertEquals(2, genesetData.size()); - assertEquals(-0.293861251463613, Double.parseDouble(genesetData.get(0).getValue()), 0.00001); - assertEquals(-0.0811115513543749, Double.parseDouble(genesetData.get(1).getValue()), 0.00001); + String sbSampleGenesetValueString = genesetData.get(sbSample.getInternalId()); + String shSampleGenesetValuesString = genesetData.get(shSample.getInternalId()); + assertEquals(-0.293861251463613, Double.parseDouble(sbSampleGenesetValueString), 0.00001); + assertEquals(-0.0811115513543749, Double.parseDouble(shSampleGenesetValuesString), 0.00001); // ===== check treatment 
(profile) data // ... - String testTreatment = "Irinotecan"; - assertNotNull(DaoGeneticEntity.getGeneticEntityByStableId(testTreatment)); + String testTreatmentStableId = "Irinotecan"; + GeneticEntity testTreatmentGeneticEntity = DaoGeneticEntity.getGeneticEntityByStableId(testTreatmentStableId); + assertNotNull(testTreatmentGeneticEntity); // ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 // TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 // TCGA-BH-A0HP-01 TCGA-BH-A18P-01 // Irinotecan ... ... ... NA 0.080764666 NA 0.06704437 0.069568723 0.034992039 // 0.740817904 0.209220141 - GenericAssayService treatmentDataService = applicationContext.getBean(GenericAssayService.class); - List treatmentData = treatmentDataService.getGenericAssayData("study_es_0_treatment_ic50", "study_es_0_all", Arrays.asList(testTreatment), PersistenceConstants.SUMMARY_PROJECTION); + GeneticProfile treatmentIc50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_es_0_treatment_ic50"); + HashMap treatmentData = DaoGeneticAlteration + .getInstance() + .getGeneticAlterationMapForEntityIds( + treatmentIc50Profile.getGeneticProfileId(), + List.of(testTreatmentGeneticEntity.getId())).get(testTreatmentGeneticEntity.getId()); assertEquals(8, treatmentData.size()); - assertEquals("NA", treatmentData.get(0).getValue()); - assertEquals(0.080764666, Double.parseDouble(treatmentData.get(1).getValue()), 0.00001); + String sbSampleIrinotecanTraetmentValuesString = treatmentData.get(sbSample.getInternalId()); + assertEquals("NA", sbSampleIrinotecanTraetmentValuesString); + String shSampleIrinotecanTraetmentValuesString = treatmentData.get(shSample.getInternalId()); + assertEquals(0.06704437, Double.parseDouble(shSampleIrinotecanTraetmentValuesString), 0.00001); // ===== check study status assertEquals(DaoCancerStudy.Status.AVAILABLE, DaoCancerStudy.getStatus("study_es_0")); @@ -419,7 +427,7 @@ private void loadGenes() throws DaoException, JsonParseException, 
JsonMappingExc Map> aliasesMap = new HashMap>(); InputStream inputStream = new FileInputStream( - "src/test/scripts/test_data/api_json_system_tests/genesaliases.json"); + "tests/test_data/api_json_system_tests/genesaliases.json"); // parse json file: ObjectMapper mapper = new ObjectMapper(); TestGeneAlias[] genesAliases = mapper.readValue(inputStream, TestGeneAlias[].class); @@ -434,7 +442,7 @@ private void loadGenes() throws DaoException, JsonParseException, JsonMappingExc aliases.add(testGeneAlias.geneAlias); } - inputStream = new FileInputStream("src/test/scripts/test_data/api_json_system_tests/genes.json"); + inputStream = new FileInputStream("tests/test_data/api_json_system_tests/genes.json"); // parse json file: mapper = new ObjectMapper(); TestGene[] genes = mapper.readValue(inputStream, TestGene[].class); @@ -457,9 +465,9 @@ private void loadGenes() throws DaoException, JsonParseException, JsonMappingExc */ private void loadGenePanel() throws Exception { ImportGenePanel gp = new ImportGenePanel(null); - gp.setFile(new File("src/test/scripts/test_data/study_es_0/data_gene_panel_testpanel1.txt")); + gp.setFile(new File("tests/test_data/study_es_0/data_gene_panel_testpanel1.txt")); gp.importData(); - gp.setFile(new File("src/test/scripts/test_data/study_es_0/data_gene_panel_testpanel2.txt")); + gp.setFile(new File("tests/test_data/study_es_0/data_gene_panel_testpanel2.txt")); gp.importData(); } @@ -473,7 +481,7 @@ static class TestGene { @JsonIgnoreProperties(ignoreUnknown = true) static class TestGeneAlias { - @JsonProperty("gene_alias") + @JsonProperty("alias") String geneAlias; @JsonProperty("entrezGeneId") int entrezGeneId; diff --git a/src/test/java/org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java b/src/test/java/org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java index 29079cd6..409db2bd 100644 --- a/src/test/java/org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java +++ 
b/src/test/java/org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java @@ -235,7 +235,7 @@ public String getContentType() { // properties /////////////////////////////////////////////////////////////// - private ServletOutputStream servletOutputStream = null; // new NullServletOutputStream();; + private ServletOutputStream servletOutputStream = null; // new NullServletOutputStream(); private StringWriter myStringWriter = new StringWriter(); public String getOutput(){ diff --git a/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java b/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java index b7ef75e0..d5229172 100644 --- a/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java +++ b/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java @@ -34,20 +34,17 @@ import java.io.BufferedReader; import java.io.FileReader; -import java.io.IOException; + import org.mskcc.cbio.portal.model.ExtendedMutation; import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent; import java.util.*; -import java.util.regex.Matcher; + import org.junit.Assert; import org.junit.Test; import org.mskcc.cbio.maf.MafRecord; import org.mskcc.cbio.maf.MafUtil; -import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.AlleleSpecificCopyNumber; -import org.mskcc.cbio.portal.model.GeneticProfile; /** * @@ -273,7 +270,7 @@ public void testResolveAscnAnnotationNamespace() throws Exception { List ascnRecords = new ArrayList<>(); while((line=buf.readLine()) != null) { - if (!line.startsWith("#") && line.trim().length() > 0) { + if (TsvUtil.isDataLine(line)) { MafRecord record = mafUtil.parseRecord(line); // every record in test MAF should have ASCN data Assert.assertTrue(record.getNamespacesMap().containsKey(ASCN_NAMESPACE)); diff --git a/src/test/resources/data_CNA_sample.txt b/src/test/resources/data_CNA_sample.txt index f225740b..ec8b13fc 100644 --- a/src/test/resources/data_CNA_sample.txt +++ 
b/src/test/resources/data_CNA_sample.txt @@ -1,3 +1,3 @@ -GeneId Hugo_Symbol TCGA-02-0001-01 TCGA-02-0003-01 TCGA-02-0004-01 TCGA-02-0006-01 +GeneId Hugo_Symbol TCGA-02-0001-01 TCGA-02-0003-01 TCGA-02-0004-01 TCGA-02-0006-01 999999672 TESTBRCA1 -2 0 1 0 -999999675 TESTBRCA2 0 2 0 -1 \ No newline at end of file +999999675 TESTBRCA2 0 2 0 -1 diff --git a/src/test/resources/incremental/clinical/data_timeline.txt b/src/test/resources/incremental/clinical/data_timeline.txt new file mode 100644 index 00000000..679a9da5 --- /dev/null +++ b/src/test/resources/incremental/clinical/data_timeline.txt @@ -0,0 +1,5 @@ +PATIENT_ID START_DATE STOP_DATE EVENT_TYPE SPECIMEN_SITE SPECIMEN_TYPE SOURCE +TCGA-A1-A0SB 20 60 SPECIMEN test_specimen_site_1 test_specimen_type test_source_3 +TCGA-A1-A0SB 10 20 STATUS test_source_4 +TCGA-A1-A0SD 45 STATUS test_source_2 +NONEXISTENT_PATIENT 100 200 STATUS test_source_1 diff --git a/src/test/resources/incremental/clinical/meta_timeline.txt b/src/test/resources/incremental/clinical/meta_timeline.txt new file mode 100644 index 00000000..bacded8c --- /dev/null +++ b/src/test/resources/incremental/clinical/meta_timeline.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: TIMELINE +data_filename: data_timeline.txt diff --git a/src/test/resources/incremental/copy_number_alteration/data_cna.seg b/src/test/resources/incremental/copy_number_alteration/data_cna.seg new file mode 100644 index 00000000..fd1be197 --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/data_cna.seg @@ -0,0 +1,10 @@ +ID chrom loc.start loc.end num.mark seg.mean +TCGA-A1-A0SE-01 1 3218610 95674710 53225 0.0055 +TCGA-A1-A0SE-01 1 95676511 95676518 2 -1.6636 +TCGA-A1-A0SE-01 1 95680124 167057183 24886 0.0053 +TCGA-A1-A0SE-01 1 167057495 167059336 3 -1.0999 +TCGA-A1-A0SE-01 1 167059760 181602002 9213 -8e-04 +TCGA-A1-A0SE-01 1 181603120 181609567 6 -1.2009 +TCGA-A1-A0SE-01 1 181610685 201473647 12002 0.0055 
+TCGA-A1-A0SE-01 1 201474400 201474544 2 -1.4235 +TCGA-A1-A0SE-01 1 201475220 247813706 29781 -4e-04 diff --git a/src/test/resources/incremental/copy_number_alteration/data_cna_discrete.txt b/src/test/resources/incremental/copy_number_alteration/data_cna_discrete.txt new file mode 100644 index 00000000..7664e868 --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/data_cna_discrete.txt @@ -0,0 +1,17 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-XX-0800-01 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL TCGA-A1-A0SO-01 +AKT3 10000 0 -2 -2 +AKT1 207 -1 2 2 +# All after the pipe has to be removed +AKT2|TEST 208 -2 2 -1 +HRAS 3265 2 2 0 +KRAS 3845 0 -2 2 +# This gene absent in this file, but it's still part of the profile and has to be updated +#ATM 472 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 -2 -2 -1 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 2 2 0 +BRAF 673 2 -2 -2 +BRCA2 675 -1.5 2 0 +# This gene is new! 
the empty values should be set for the already existing samples in the database +CDK1 983 -2 -2 2 diff --git a/src/test/resources/incremental/copy_number_alteration/data_cna_discrete_long.txt b/src/test/resources/incremental/copy_number_alteration/data_cna_discrete_long.txt new file mode 100644 index 00000000..88e406c4 --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/data_cna_discrete_long.txt @@ -0,0 +1,37 @@ +Hugo_Symbol Entrez_Gene_Id Sample_Id Value cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +AKT3 10000 TCGA-XX-0800-01 0 +AKT3 10000 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 +AKT3 10000 TCGA-A1-A0SO-01 -2 +AKT1 207 TCGA-XX-0800-01 -1 +AKT1 207 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +AKT1 207 TCGA-A1-A0SO-01 2 +# All after the pipe has to be removed +AKT2|TEST 208 TCGA-XX-0800-01 -2 +AKT2|TEST 208 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +AKT2|TEST 208 TCGA-A1-A0SO-01 -1 Putative_Driver Test driver Class 1 Class annotation +HRAS 3265 TCGA-XX-0800-01 2 +HRAS 3265 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +HRAS 3265 TCGA-A1-A0SO-01 0 +KRAS 3845 TCGA-XX-0800-01 0 Class 2 Class annotation +KRAS 3845 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 +KRAS 3845 TCGA-A1-A0SO-01 2 Putative_Passenger Test passenger Class 2 Class annotation +# This gene absent in this file, but it's still part of the profile and has to be updated +#ATM 472 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 TCGA-XX-0800-01 -2 + 4893 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 + 4893 TCGA-A1-A0SO-01 -1 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 TCGA-XX-0800-01 2 +BRCA1 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +BRCA1 TCGA-A1-A0SO-01 0 +BRAF 673 TCGA-XX-0800-01 2 +BRAF 673 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 +BRAF 673 TCGA-A1-A0SO-01 -2 +BRCA2 675 TCGA-XX-0800-01 -1.5 +BRCA2 675 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +BRCA2 675 TCGA-A1-A0SO-01 0 +# This gene is new! 
the empty values should be set for the already existing samples in the database +CDK1 983 TCGA-XX-0800-01 -2 Putative_Driver +CDK1 983 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 +CDK1 983 TCGA-A1-A0SO-01 2 Putative_Passenger Test passenger diff --git a/src/test/resources/incremental/copy_number_alteration/data_cna_pd_annotations.txt b/src/test/resources/incremental/copy_number_alteration/data_cna_pd_annotations.txt new file mode 100644 index 00000000..3fbcfc58 --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/data_cna_pd_annotations.txt @@ -0,0 +1,7 @@ +SAMPLE_ID Entrez_Gene_Id cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +TCGA-A1-A0SO-01 3845 Putative_Passenger Test passenger Class 2 Class annotation +TCGA-A1-A0SO-01 208 Putative_Driver Test driver Class 1 Class annotation +TCGA-A1-A0SO-01 983 Putative_Passenger Test passenger +TCGA-XX-0800-01 3845 Class 2 Class annotation +TCGA-XX-0800-01 208 Class 1 Class annotation +TCGA-XX-0800-01 983 Putative_Driver diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt new file mode 100644 index 00000000..2cdb4613 --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt @@ -0,0 +1,11 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. 
+profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete.txt +pd_annotations_filename: data_cna_pd_annotations.txt +namespaces: CustomNamespace +gene_panel: TSTGNPNLCNADS \ No newline at end of file diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt new file mode 100644 index 00000000..c3172961 --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE_LONG +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. +profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete_long.txt +namespaces: CustomNamespace +gene_panel: TSTGNPNLCNADS diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_seg.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_seg.txt new file mode 100644 index 00000000..61d86a9a --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_seg.txt @@ -0,0 +1,6 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: SEG +reference_genome_id: hg19 +description: Test somatic CNA data +data_filename: data_cna.seg diff --git a/src/test/resources/incremental/gene_panel_matrix/data_gene_panel_matrix.txt b/src/test/resources/incremental/gene_panel_matrix/data_gene_panel_matrix.txt new file mode 100644 index 00000000..ca2bafda --- /dev/null +++ b/src/test/resources/incremental/gene_panel_matrix/data_gene_panel_matrix.txt @@ -0,0 +1,2 @@ +SAMPLE_ID mutations gistic treatment_ic50 +TCGA-A1-A0SB-01 TSTGNPNLMUTEXT 
TESTPANEL_CNA_DISCRETE_LONG_FORMAT WXS/WGS diff --git a/src/test/resources/incremental/gene_panel_matrix/meta_gene_panel_matrix.txt b/src/test/resources/incremental/gene_panel_matrix/meta_gene_panel_matrix.txt new file mode 100644 index 00000000..f0a7385c --- /dev/null +++ b/src/test/resources/incremental/gene_panel_matrix/meta_gene_panel_matrix.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: GENE_PANEL_MATRIX +datatype: GENE_PANEL_MATRIX +data_filename: data_gene_panel_matrix.txt diff --git a/src/test/resources/incremental/generic_assay/data_treatment_ic50.txt b/src/test/resources/incremental/generic_assay/data_treatment_ic50.txt new file mode 100644 index 00000000..79606fbf --- /dev/null +++ b/src/test/resources/incremental/generic_assay/data_treatment_ic50.txt @@ -0,0 +1,8 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 7.5 +Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan 0.081 +# The database has this entity, but not the file +#L-685458 +Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib 6.2 7.848 +#The entity will be added +LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 0.1 >~8 diff --git a/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt b/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt new file mode 100644 index 00000000..34753bba --- /dev/null +++ b/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt @@ -0,0 +1,8 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB TCGA-A1-A0SD +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 7.5 +Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan 0.081 +# The database has this entity, but not the file +#L-685458 +Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib 6.2 7.848 +#The entity will be added +LBW242 Name 
of LBW242 Desc of LBW242 Url of LBW242 0.1 >~8 diff --git a/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt b/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt new file mode 100644 index 00000000..477d01fd --- /dev/null +++ b/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt @@ -0,0 +1,13 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. +data_filename: data_treatment_ic50.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL +gene_panel: TSTGNPNLGENASS diff --git a/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt b/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt new file mode 100644 index 00000000..181899f5 --- /dev/null +++ b/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt @@ -0,0 +1,13 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. 
+data_filename: data_treatment_ic50_patient_level.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL +patient_level: true diff --git a/src/test/resources/incremental/insert_mutation_data/data_mutations_extended.txt b/src/test/resources/incremental/insert_mutation_data/data_mutations_extended.txt new file mode 100644 index 00000000..1eec7202 --- /dev/null +++ b/src/test/resources/incremental/insert_mutation_data/data_mutations_extended.txt @@ -0,0 +1,4 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Tumor_Sample_Barcode Verification_Status Validation_Status Mutation_Status Sequencer Chromosome Start_position End_position Variant_Classification HGVSp_Short MA:FImpact MA:link.MSA MA:link.PDB +AKT1 207 broad.mit.edu GRCh37 TCGA-A1-A0SE-01 Unknown valid Unknown Illumina GAIIx chr1 22078087 22078087 Missense_Mutation D820N neutral mutationassessor.org/?cm=msa&ty=f&p=PGBM_HUMAN&rb=814&re=869&var=D820N +AKT2 208 broad.mit.edu GRCh37 TCGA-A1-A0SE-01 Unknown valid Unknown Illumina GAIIx chr1 34085156 34085156 Missense_Mutation V277I low mutationassessor.org/?cm=msa&ty=f&p=CSMD2_HUMAN&rb=202&re=303&var=V277I mutationassessor.org/pdb.php?prot=CSMD2_HUMAN&from=202&to=303&var=V277I +AKT3 10000 broad.mit.edu GRCh37 TCGA-A1-A0SE-01 Unknown valid Unknown Illumina GAIIx chr1 35989584 35989584 Missense_Mutation F628L mutationassessor.org/?cm=msa&ty=f&p=CLSPN_HUMAN&rb=601&re=800&var=F628L diff --git a/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt b/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt new file mode 100644 index 00000000..2282fbab --- /dev/null +++ b/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt @@ -0,0 +1,9 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: MUTATION_EXTENDED +stable_id: mutations +datatype: MAF +show_profile_in_analysis_tab: true +profile_name: Test Mutations 
+profile_description: Mutation data for testing. +data_filename: data_mutations_extended.txt +gene_panel: TSTGNPNLMUTEXT diff --git a/src/test/resources/incremental/insert_single_tcga_patient/clinical_data_single_PATIENT.txt b/src/test/resources/incremental/insert_single_tcga_patient/clinical_data_single_PATIENT.txt new file mode 100644 index 00000000..38b9ef9f --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_patient/clinical_data_single_PATIENT.txt @@ -0,0 +1,6 @@ +#Patient Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TEST-INC-TCGA-P2 basal-like 0:LIVING 45.6 1:Recurred/Progressed NA diff --git a/src/test/resources/incremental/insert_single_tcga_patient/meta_clinical_patient.txt b/src/test/resources/incremental/insert_single_tcga_patient/meta_clinical_patient.txt new file mode 100644 index 00000000..9e418c43 --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_patient/meta_clinical_patient.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: PATIENT_ATTRIBUTES +data_filename: clinical_data_single_PATIENT.txt diff --git a/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt new file mode 100644 index 00000000..1feeebbc --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt @@ -0,0 +1,6 @@ +#Sample Identifier Patient Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Sample identifier 
Patient Identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TEST-INC-TCGA-P1-S1 TEST-INC-TCGA-P1 basal-like 1:DECEASED 12.34 1:Recurred/Progressed NA diff --git a/src/test/resources/incremental/insert_single_tcga_sample/meta_clinical_sample.txt b/src/test/resources/incremental/insert_single_tcga_sample/meta_clinical_sample.txt new file mode 100644 index 00000000..b0b4753e --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_sample/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_single_SAMPLE.txt diff --git a/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/clinical_data_single_SAMPLE.txt new file mode 100644 index 00000000..1252404b --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/clinical_data_single_SAMPLE.txt @@ -0,0 +1,6 @@ +#Patient Identifier Sample Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient Identifier Sample identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TEST-INC-TCGA-P2-S1 TEST-INC-TCGA-P2 Luminal A 0:LIVING 23.45 1:Recurred/Progressed 100 diff --git a/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/meta_clinical_sample.txt 
b/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/meta_clinical_sample.txt new file mode 100644 index 00000000..b0b4753e --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_single_SAMPLE.txt diff --git a/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt b/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt new file mode 100644 index 00000000..a96eabd7 --- /dev/null +++ b/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt @@ -0,0 +1,29 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL TCGA-A1-A0SD-01 +AKT3 10000 0.6393 0.1 0.5377 +AKT1 207 0.785 0.1 0.0426 +# All after the pipe has to be removed +AKT2|TEST 208 1.0741 0.1 0.718 +HRAS 3265 -0.1735 0.1 -0.6412 +# This gene absent in this file, but it's still part of the profile and has to be updated +#ARAF 369 +KRAS 3845 0.785 0.1 0.0426 +ATM 472 1.0741 0.1 0.718 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 -0.1735 0.1 -0.6412 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 0.6393 0.1 0.5377 +BRAF 673 0.785 0.1 0.0426 +# Duplicate lines should be ignored +BRAF 673 0.7851 0.1 0.0427 +BRCA2 675 1.0741 0.1 0.718 +# This gene is new! 
the empty values should be set for the already existing samples in the database +CDK1 983 -0.1735 0.1 -0.6412 +# These lines have to be skipped +# invalid entrez id +P2RY10 -1 0.741 0.1 0.685 +# Multigene sign +/// 369 0.6393 0.1 0.5377 +# Unknown gene sign +--- 3845 0.785 0.1 0.0426 +# Empty gene info + 1.0741 0.1 0.718 diff --git a/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt b/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt new file mode 100644 index 00000000..e761fed3 --- /dev/null +++ b/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: MRNA_EXPRESSION +datatype: Z-SCORE +stable_id: mrna +profile_description: Expression levels (Agilent microarray). +show_profile_in_analysis_tab: false +profile_name: mRNA expression (microarray) +data_filename: data_expression_Zscores.txt diff --git a/src/test/resources/incremental/protein_level/data_rppa.txt b/src/test/resources/incremental/protein_level/data_rppa.txt new file mode 100644 index 00000000..0953ce99 --- /dev/null +++ b/src/test/resources/incremental/protein_level/data_rppa.txt @@ -0,0 +1,24 @@ +Composite.Element.REF TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 +AKT3|akt3 1.26122710480548 0.037186254715365 +# Multiple gene symbols joined by space +AKT1 AKT2 AKT3|akt1 1.61253243664957 -0.141077088398489 +# All after the pipe has to be removed +AKT2|TEST 5.4424238579025E-05 0.062264661774981 +HRAS|hras 0.37624053370992 0.270399126328659 +# This gene absent in this file, but it's still part of the profile and has to be updated 0.407622077164699 -0.326522823583974 +#ARAF +KRAS|kras -0.335040546938807 0.00730643372831408 +ATM|atm 0.037186254715365 1.26122710480548 +# This line missing the entrez id and the gene has to be detected by hugo symbol 0.062264661774981 5.4424238579025E-05 +BRCA1|brca1 0.270399126328659 0.37624053370992 +BRAF|braf -0.326522823583974 
0.407622077164699 +# Duplicate lines should be ignored 0.218650367364756 0.383702820778609 +BRAF|braf 0.00730643372831408 -0.335040546938807 +BRCA2|brca2 -0.141077088398489 1.61253243664957 +# This gene is new! the empty values should be set for the already existing samples in the database +CDK1|cdk1 -0.141047088398489 1.61253243564957 +# These lines have to be skipped +/// -0.335040546938807 0.00730643372831408 +--- 0.037186254715365 1.26122710480548 + 0.064 0.644 +NA|K-Ras 0.062264661774981 5.4424238579025E-05 diff --git a/src/test/resources/incremental/protein_level/meta_rppa.txt b/src/test/resources/incremental/protein_level/meta_rppa.txt new file mode 100644 index 00000000..f6481c7d --- /dev/null +++ b/src/test/resources/incremental/protein_level/meta_rppa.txt @@ -0,0 +1,7 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: PROTEIN_LEVEL +datatype: LOG2-VALUE +stable_id: rppa +profile_name: Test RPPA +profile_description: Test protein level data +data_filename: data_rppa.txt diff --git a/src/test/resources/incremental/structural_variants/data_structural_variants.txt b/src/test/resources/incremental/structural_variants/data_structural_variants.txt new file mode 100644 index 00000000..7514bce6 --- /dev/null +++ b/src/test/resources/incremental/structural_variants/data_structural_variants.txt @@ -0,0 +1,4 @@ +Sample_Id Site1_Entrez_Gene_Id Site1_Hugo_Symbol Site1_Ensembl_Transcript_Id Site1_Region_Number Site1_Chromosome Site1_Position Site1_Region Site1_Description Site2_Entrez_Gene_Id Site2_Hugo_Symbol Site2_Ensembl_Transcript_Id Site2_Region_Number Site2_Chromosome Site2_Position Site2_Contig Site2_Region Site2_Description Site2_Effect_On_Frame NCBI_Build DNA_Support RNA_Support Normal_Read_Count Tumor_Read_Count Normal_Variant_Count Tumor_Variant_Count Normal_Paired_End_Read_Count Tumor_Paired_End_Read_Count Normal_Split_Read_Count Tumor_Split_Read_Count Annotation Breakpoint_Type Center Connection_Type Event_Info Class SV_Length Comments 
External_Annotation cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation SV_Status StructVarNs.column1 StructVarNs2.lorem StructVarNs.column2 +TCGA-A1-A0SE-01 NA AKT1 ENST00000242365 15 7 138536968 EXON site1_test_desc_1 NA BRCA1 ENST00000288602 10 7 140482957 EXON PIEZO1-NCOA4.PIEZO1.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA PIEZO1-NCOA4.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Foo Class 4 Class annotation SOMATIC value1 ipsum value2 +TCGA-A1-A0SE-01 NA AKT2 ENST00000242365 15 7 138536968 EXON site1_test_desc_2 NA BRAF ENST00000288602 10 7 140482957 EXON KIAA1549-BRAF.K16B10.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA KIAA1549-BRAF.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Class 4 Class annotation SOMATIC value1 ipsum value2 +TCGA-A1-A0SE-01 NA AKT3 ENST00000344348 7 10 51582939 EXON site1_test_desc_3 NA BRCA2 ENST00000340058 12 10 43612031 EXON NCOA4-RET.N7R12_2 NA GRCh37 no yes NA 1001 NA 800 NA NA NA NA NCOA4-RET.N7R1 NA NA NA Fusion NA NA Gain-of-Function NA Putative_Passenger Test driver Class 3 Class annotation SOMATIC NA NA NA diff --git a/src/test/resources/incremental/structural_variants/meta_structural_variants.txt b/src/test/resources/incremental/structural_variants/meta_structural_variants.txt new file mode 100644 index 00000000..0998ac6e --- /dev/null +++ b/src/test/resources/incremental/structural_variants/meta_structural_variants.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: STRUCTURAL_VARIANT +datatype: SV +data_filename: data_structural_variants.txt +stable_id: structural_variants +profile_name: Test Targeted Fusion Assay data +profile_description: Test Targeted Fusion Assay data description +show_profile_in_analysis_tab: true +gene_panel: TSTGNPNLSV +namespaces: StructVarNs,StructVarNs2 diff --git 
a/src/test/resources/incremental/update_mutation_data/data_mutations_extended.txt b/src/test/resources/incremental/update_mutation_data/data_mutations_extended.txt new file mode 100644 index 00000000..e9703211 --- /dev/null +++ b/src/test/resources/incremental/update_mutation_data/data_mutations_extended.txt @@ -0,0 +1,4 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Tumor_Sample_Barcode Verification_Status Validation_Status Mutation_Status Sequencer Chromosome Start_position End_position Variant_Classification HGVSp_Short MA:FImpact MA:link.MSA MA:link.PDB +AKT1 207 broad.mit.edu GRCh37 TCGA-A1-A0SH-01 Unknown valid Unknown Illumina GAIIx chr1 22078087 22078087 Missense_Mutation D820N neutral mutationassessor.org/?cm=msa&ty=f&p=PGBM_HUMAN&rb=814&re=869&var=D820N +AKT2 208 broad.mit.edu GRCh37 TCGA-A1-A0SH-01 Unknown valid Unknown Illumina GAIIx chr1 34085156 34085156 Missense_Mutation V277I low mutationassessor.org/?cm=msa&ty=f&p=CSMD2_HUMAN&rb=202&re=303&var=V277I mutationassessor.org/pdb.php?prot=CSMD2_HUMAN&from=202&to=303&var=V277I +BRCA1 672 broad.mit.edu GRCh37 TCGA-A1-A0SH-01 Unknown valid Unknown Illumina GAIIx chr17 35989584 35989584 Missense_Mutation F628L mutationassessor.org/?cm=msa&ty=f&p=CLSPN_HUMAN&rb=601&re=800&var=F628L diff --git a/src/test/resources/incremental/update_mutation_data/meta_mutations.txt b/src/test/resources/incremental/update_mutation_data/meta_mutations.txt new file mode 100644 index 00000000..2282fbab --- /dev/null +++ b/src/test/resources/incremental/update_mutation_data/meta_mutations.txt @@ -0,0 +1,9 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: MUTATION_EXTENDED +stable_id: mutations +datatype: MAF +show_profile_in_analysis_tab: true +profile_name: Test Mutations +profile_description: Mutation data for testing. 
+data_filename: data_mutations_extended.txt +gene_panel: TSTGNPNLMUTEXT diff --git a/src/test/resources/incremental/update_single_tcga_patient/clinical_data_single_PATIENT.txt b/src/test/resources/incremental/update_single_tcga_patient/clinical_data_single_PATIENT.txt new file mode 100644 index 00000000..37421482 --- /dev/null +++ b/src/test/resources/incremental/update_single_tcga_patient/clinical_data_single_PATIENT.txt @@ -0,0 +1,6 @@ +#Patient Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-A1-A0SB basal-like NA 56.7 1:Recurred/Progressed 100 diff --git a/src/test/resources/incremental/update_single_tcga_patient/meta_clinical_patient.txt b/src/test/resources/incremental/update_single_tcga_patient/meta_clinical_patient.txt new file mode 100644 index 00000000..9e418c43 --- /dev/null +++ b/src/test/resources/incremental/update_single_tcga_patient/meta_clinical_patient.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: PATIENT_ATTRIBUTES +data_filename: clinical_data_single_PATIENT.txt diff --git a/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt new file mode 100644 index 00000000..12d0b7c0 --- /dev/null +++ b/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt @@ -0,0 +1,6 @@ +#Sample Identifier Patient Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Sample identifier Patient Identifier Overall survival status Overall survival in months 
since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-A1-A0SH-01 TCGA-A1-A0SH 1:DECEASED 45.67 1:Recurred/Progressed 123 diff --git a/src/test/resources/incremental/update_single_tcga_sample/meta_clinical_sample.txt b/src/test/resources/incremental/update_single_tcga_sample/meta_clinical_sample.txt new file mode 100644 index 00000000..b0b4753e --- /dev/null +++ b/src/test/resources/incremental/update_single_tcga_sample/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_single_SAMPLE.txt diff --git a/src/test/resources/integrationTestScript.xml b/src/test/resources/integrationTestScript.xml index 395272de..87ed6e8f 100644 --- a/src/test/resources/integrationTestScript.xml +++ b/src/test/resources/integrationTestScript.xml @@ -25,7 +25,7 @@ org.mskcc.cbio.portal.scripts.ImportTypesOfCancers - src/test/scripts/test_data/study_es_0/data_cancer_type.txt + tests/test_data/study_es_0/data_cancer_type.txt false --noprogress @@ -47,25 +47,25 @@ org.mskcc.cbio.portal.scripts.ImportCancerStudy - src/test/scripts/test_data/study_es_0/meta_study.txt + tests/test_data/study_es_0/meta_study.txt org.mskcc.cbio.portal.scripts.ImportClinicalData --data - src/test/scripts/test_data/study_es_0/data_clinical_samples.txt + tests/test_data/study_es_0/data_clinical_samples.txt --meta - src/test/scripts/test_data/study_es_0/meta_clinical_samples.txt + tests/test_data/study_es_0/meta_clinical_samples.txt org.mskcc.cbio.portal.scripts.ImportClinicalData --data - src/test/scripts/test_data/study_es_0/data_clinical_patients.txt + tests/test_data/study_es_0/data_clinical_patients.txt --meta - src/test/scripts/test_data/study_es_0/meta_clinical_patients.txt + tests/test_data/study_es_0/meta_clinical_patients.txt @@ -74,9 +74,9 
@@ org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData --data - src/test/scripts/test_data/study_es_0/data_cna_hg19.seg + tests/test_data/study_es_0/data_cna_hg19.seg --meta - src/test/scripts/test_data/study_es_0/meta_cna_hg19_seg.txt + tests/test_data/study_es_0/meta_cna_hg19_seg.txt --loadMode bulkload --noprogress @@ -86,9 +86,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_mutations_extended.maf + tests/test_data/study_es_0/data_mutations_extended.maf --meta - src/test/scripts/test_data/study_es_0/meta_mutations_extended.txt + tests/test_data/study_es_0/meta_mutations_extended.txt --loadMode bulkload --noprogress @@ -98,9 +98,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_cna_discrete.txt + tests/test_data/study_es_0/data_cna_discrete.txt --meta - src/test/scripts/test_data/study_es_0/meta_cna_discrete.txt + tests/test_data/study_es_0/meta_cna_discrete.txt --loadMode bulkload --noprogress @@ -110,9 +110,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_expression_median.txt + tests/test_data/study_es_0/data_expression_median.txt --meta - src/test/scripts/test_data/study_es_0/meta_expression_median.txt + tests/test_data/study_es_0/meta_expression_median.txt --loadMode bulkload --noprogress @@ -122,9 +122,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_structural_variants.txt + tests/test_data/study_es_0/data_structural_variants.txt --meta - src/test/scripts/test_data/study_es_0/meta_structural_variants.txt + tests/test_data/study_es_0/meta_structural_variants.txt --loadMode bulkload --noprogress @@ -134,7 +134,7 @@ org.mskcc.cbio.portal.scripts.ImportGisticData --data - src/test/scripts/test_data/study_es_0/data_gistic_genes_amp.txt + tests/test_data/study_es_0/data_gistic_genes_amp.txt --study study_es_0 --noprogress @@ -144,9 
+144,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_cna_log2.txt + tests/test_data/study_es_0/data_cna_log2.txt --meta - src/test/scripts/test_data/study_es_0/meta_cna_log2.txt + tests/test_data/study_es_0/meta_cna_log2.txt --loadMode bulkload --noprogress @@ -156,9 +156,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_methylation_hm27.txt + tests/test_data/study_es_0/data_methylation_hm27.txt --meta - src/test/scripts/test_data/study_es_0/meta_methylation_hm27.txt + tests/test_data/study_es_0/meta_methylation_hm27.txt --loadMode bulkload --noprogress @@ -168,9 +168,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_gsva_scores.txt + tests/test_data/study_es_0/data_gsva_scores.txt --meta - src/test/scripts/test_data/study_es_0/meta_gsva_scores.txt + tests/test_data/study_es_0/meta_gsva_scores.txt --loadMode bulkload --noprogress @@ -178,9 +178,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_gsva_pvalues.txt + tests/test_data/study_es_0/data_gsva_pvalues.txt --meta - src/test/scripts/test_data/study_es_0/meta_gsva_pvalues.txt + tests/test_data/study_es_0/meta_gsva_pvalues.txt --loadMode bulkload --noprogress @@ -190,9 +190,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_treatment_ic50.txt + tests/test_data/study_es_0/data_treatment_ic50.txt --meta - src/test/scripts/test_data/study_es_0/meta_treatment_ic50.txt + tests/test_data/study_es_0/meta_treatment_ic50.txt --loadMode bulkload --noprogress @@ -200,9 +200,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_treatment_ec50.txt + tests/test_data/study_es_0/data_treatment_ec50.txt --meta - src/test/scripts/test_data/study_es_0/meta_treatment_ec50.txt + 
tests/test_data/study_es_0/meta_treatment_ec50.txt --loadMode bulkload --noprogress @@ -212,9 +212,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_mutational_signature.txt + tests/test_data/study_es_0/data_mutational_signature.txt --meta - src/test/scripts/test_data/study_es_0/meta_mutational_signature.txt + tests/test_data/study_es_0/meta_mutational_signature.txt --loadMode bulkload --noprogress @@ -223,7 +223,7 @@ org.mskcc.cbio.portal.scripts.ImportSampleList - src/test/scripts/test_data/study_es_0/case_lists/cases_custom.txt + tests/test_data/study_es_0/case_lists/cases_custom.txt diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index e17819cf..5ffe18da 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -90,6 +90,15 @@ INSERT INTO `reference_genome` VALUES (2,'human','hg38','GRCh38',3049315783,'htt INSERT INTO "cancer_study" ("CANCER_STUDY_ID","CANCER_STUDY_IDENTIFIER","TYPE_OF_CANCER_ID","NAME","DESCRIPTION","PUBLIC","PMID","CITATION","GROUPS","REFERENCE_GENOME_ID") VALUES (1,'study_tcga_pub','brca','Breast Invasive Carcinoma (TCGA,Nature 2012)','The Cancer Genome Atlas (TCGA) Breast Invasive Carcinoma project. 825 cases.
Nature 2012. Raw data via the TCGA Data Portal.',1,'23000897,26451490','TCGA,Nature 2012,...','SU2C-PI3K;PUBLIC;GDAC',1); +-- clinical_attribute_meta +INSERT INTO "clinical_attribute_meta" +VALUES ('DFS_MONTHS','Disease Free (Months)','Disease free in months since treatment','NUMBER',0,'1',1), +('DFS_STATUS','Disease Free Status','Disease free status','STRING',0,'1',1), +('OS_MONTHS','Overall Survival (Months)','Overall survival in months since diagnosis','NUMBER',0,'1',1), +('OS_STATUS','Overall Survival Status','Overall survival status','STRING',0,'1',1), +('SAMPLE_COUNT','Number of Samples Per Patient','Number of Samples Per Patient','STRING',1,'1',1), +('SUBTYPE','Subtype','Subtype description','STRING',0,'1',1); + -- gene as genetic_entity INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); SET @max_entity_id = (Select MAX(ID) from genetic_entity); @@ -191,6 +200,74 @@ INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); SET @max_entity_id = (Select MAX(ID) from genetic_entity); INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,2261,'FGFR3','protein-coding'); +-- missing genes for study_es_0 +-- additional genes for CNA data +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,116983,'ACAP3','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,2073,'ERCC5','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES 
(@max_entity_id,219293,'ATAD3C','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,375790,'AGRN','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,54998,'AURKAIP1','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,55210,'ATAD3A','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,83858,'ATAD3B','protein-coding'); +-- genes for data_methylation_hm27.txt +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,24145,'PANX1','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,283234,'CCDC88B','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,3232,'HOXD3','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES 
('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,3613,'IMPA2','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,389,'RHOC','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,487,'ATP2A1','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,7871,'SLMAP','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,8148,'TAF15','protein-coding'); +-- gene panels +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,55,'ACP3','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,81061,'OR11H1','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" 
("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,388946,'TMEM247','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,7157,'TP53','protein-coding'); + +-- Generic genetic entities +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Erlotinib'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Irinotecan'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'L-685458'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Lapatinib'); + -- cna_event INSERT INTO "cna_event" ("CNA_EVENT_ID","ENTREZ_GENE_ID","ALTERATION") VALUES (20093,207,-2); INSERT INTO "cna_event" ("CNA_EVENT_ID","ENTREZ_GENE_ID","ALTERATION") VALUES (20092,207,2); @@ -319,9 +396,15 @@ INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (6,'study_tcga_pub_mutations',1,'MUTATION_EXTENDED','MAF','Mutations','Mutation data from whole exome sequencing.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (7,'study_tcga_pub_structural_variants',1,'STRUCTURAL_VARIANT','SV','Structural Variants','Structural Variants test data.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (8,'study_tcga_pub_cna_long',1,'COPY_NUMBER_ALTERATION','DISCRETE_LONG','CNA values','CNA values 
dummy data','1'); +INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (9,'study_tcga_pub_rppa',1,'PROTEIN_LEVEL','LOG2-VALUE','RPPA values','RPPA values dummy data','0'); +INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (10,'study_tcga_pub_treatment_ic50',1,'GENERIC_ASSAY','LIMIT-VALUE','test treatment values','treatment values dummy data','0'); -- gene_panel INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_CNA_DISCRETE_LONG_FORMAT','Some test panel'); +INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (2,'TSTGNPNLCNADS','The CNA Discrete test panel'); +INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (3,'TSTGNPNLMUTEXT','The mutation extended test panel'); +INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (4,'TSTGNPNLGENASS','The generic assay test panel'); +INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (5,'TSTGNPNLSV','The structural variance test panel'); -- genetic_alteration INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (2,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 10000),'0,0,1,2,0,1,1,1,0,1,1,1,0,1,'); @@ -366,12 +449,31 @@ INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALU INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 672),'0.066638638,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 673),'0.020369562,'); INSERT INTO "genetic_alteration" 
("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 675),'0.793930197,'); +-- RPPA +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 10000),'-0.472,1.514,0.145,-0.183,0.913,-0.665,-1.700,0.976,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 207),'-1.102,-0.243,0.018,-0.154,0.330,1.005,0.681,-0.664,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 208),'-1.221,-0.592,-0.176,-0.310,-1.198,-0.670,0.077,-0.302,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 3265),'0.061,-0.055,-0.165,0.517,2.021,0.381,-0.728,0.944,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 369),'-1.129,-0.306,0.180,-0.601,0.166,0.402,0.243,-0.999,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 3845),'0.177,0.404,0.188,0.428,1.676,0.238,0.469,2.161,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 472),'-1.503,-1.925,-1.755,-1.576,-1.029,-1.401,-1.514,-2.074,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 4893),'-1.914,-2.059,-1.228,-1.322,-4.166,-1.187,0.284,-0.130,'); +INSERT INTO "genetic_alteration" 
("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 672),'-1.661,-1.392,-1.924,-1.656,-0.361,-1.998,-0.136,-0.709,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 673),'0.233,0.561,-0.106,-0.085,-0.012,0.143,0.141,0.609,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 675),'-0.570,-1.340,-1.544,-0.404,0.632,-1.231,0.771,-0.036,'); +-- Generic assay data +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Erlotinib'),'5.2,>8,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Irinotecan'),'>8,7.1,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'L-685458'),'>4.6,7.2,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Lapatinib'),'6.9,>~8,'); -- genetic_profile_samples INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (2,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (3,'2,3,6,8,9,10,12,13,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (4,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (5,'2,'); +INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (9,'2,3,6,8,9,10,12,13,'); 
+INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (10,'2,3,'); -- patient INSERT INTO "patient" ("INTERNAL_ID","STABLE_ID","CANCER_STUDY_ID") VALUES (1,'TCGA-A1-A0SB',1); diff --git a/src/test/resources/update_case_lists/add_sample_to_case_list/case_lists/case_mrna.txt b/src/test/resources/update_case_lists/add_sample_to_case_list/case_lists/case_mrna.txt new file mode 100644 index 00000000..32619f58 --- /dev/null +++ b/src/test/resources/update_case_lists/add_sample_to_case_list/case_lists/case_mrna.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_mrna +case_list_ids: TCGA-XX-0800-01 \ No newline at end of file diff --git a/src/test/resources/update_case_lists/add_sample_to_case_list/clinical_data_single_SAMPLE.txt b/src/test/resources/update_case_lists/add_sample_to_case_list/clinical_data_single_SAMPLE.txt new file mode 100644 index 00000000..600f753e --- /dev/null +++ b/src/test/resources/update_case_lists/add_sample_to_case_list/clinical_data_single_SAMPLE.txt @@ -0,0 +1,6 @@ +#Sample Identifier Patient Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Sample identifier Patient identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-XX-0800-01 TCGA-XX-0800 1:DECEASED 45.67 1:Recurred/Progressed 123 diff --git a/src/test/resources/update_case_lists/add_sample_to_case_list/meta_clinical_sample.txt b/src/test/resources/update_case_lists/add_sample_to_case_list/meta_clinical_sample.txt new file mode 100644 index 00000000..b0b4753e --- /dev/null +++ b/src/test/resources/update_case_lists/add_sample_to_case_list/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL 
+datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_single_SAMPLE.txt diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_acgh.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_acgh.txt new file mode 100644 index 00000000..95f18659 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_acgh.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_acgh +case_list_ids: TCGA-A1-A0SH-01 TCGA-XX-0800-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_cnaseq.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_cnaseq.txt new file mode 100644 index 00000000..0101d819 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_cnaseq.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_cnaseq +case_list_ids: TCGA-A1-A0SH-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_complete.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_complete.txt new file mode 100644 index 00000000..d2bb08b4 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_complete.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_complete +case_list_ids: TCGA-A1-A0SH-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_log2CNA.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_log2CNA.txt new file mode 100644 index 00000000..e8c3baec --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_log2CNA.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_log2CNA +case_list_ids: TCGA-A1-A0SH-01 diff --git 
a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_mrna.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_mrna.txt new file mode 100644 index 00000000..093ed0fe --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_mrna.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_mrna +case_list_ids: TCGA-A1-A0SH-01 TCGA-XX-0800-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_sequenced.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_sequenced.txt new file mode 100644 index 00000000..03c493c9 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_sequenced.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_sequenced +case_list_ids: TCGA-A1-A0SH-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/clinical_data_sample.txt b/src/test/resources/update_case_lists/update_tcga_samples/clinical_data_sample.txt new file mode 100644 index 00000000..1e7552e2 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/clinical_data_sample.txt @@ -0,0 +1,7 @@ +#Sample Identifier Patient Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Sample identifier Patient identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-A1-A0SH-01 TCGA-A1-A0SH 1:DECEASED 45.67 1:Recurred/Progressed 123 +TCGA-XX-0800-01 TCGA-XX-0800 0:LIVING 56.78 1:Recurred/Progressed 234 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/meta_clinical_sample.txt 
b/src/test/resources/update_case_lists/update_tcga_samples/meta_clinical_sample.txt new file mode 100644 index 00000000..09c7ba99 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_sample.txt diff --git a/test_scripts.sh b/test_scripts.sh old mode 100644 new mode 100755 index 807f536e..279192a3 --- a/test_scripts.sh +++ b/test_scripts.sh @@ -1 +1,3 @@ -pushd tests/; PYTHONPATH=../scripts:$PYTHONPATH python -m unittest *.py; popd +#!/bin/bash + +pushd tests/ && PYTHONPATH=../scripts:$PYTHONPATH python -m unittest *.py; exit_stat=$?; popd; exit $exit_stat diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py new file mode 100755 index 00000000..9c30cdef --- /dev/null +++ b/tests/system_tests_import_data.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +''' +This code is licensed under the GNU Affero General Public License (AGPL), +version 3, or (at your option) any later version. 
+''' + +import unittest +from unittest import mock +from unittest.mock import call +from importer import cbioportalImporter + +common_part = ('-Dspring.profiles.active=dbcp', '-cp', 'test.jar') + +class DataImporterTests(unittest.TestCase): + ''' + Tests of commands produced by scripts + ''' + + def setUp(self): + self.maxDiff = None + + @mock.patch('importer.cbioportalImporter.locate_jar') + @mock.patch('importer.cbioportalImporter.run_java') + def test_full_study_load(self, run_java, locate_jar): + ''' + Tests java commands full study load produces + ''' + locate_jar.return_value = "test.jar" + + study_directory = 'test_data/study_es_0' + args = ['--study_directory', study_directory] + parsed_args = cbioportalImporter.interface(args) + cbioportalImporter.main(parsed_args) + + remove_study_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.RemoveCancerStudy', + 'study_es_0', '--noprogress') + create_study_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCancerStudy', + f'{study_directory}/meta_study.txt', '--noprogress') + clinical_sample_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', + '--meta', f'{study_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_clinical_samples.txt', '--noprogress') + make_study_available_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCancerStudy', + 'study_es_0', 'AVAILABLE', '--noprogress') + mol_profile_calls = [ + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_cna_log2.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_expression_median.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_expression_median.txt', '--noprogress'), + 
call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_generic_assay_patient_test.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_generic_assay_patient_test.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_methylation_hm27.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_mutational_signature.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_mutational_signature.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_mutations_extended.maf', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', f'{study_directory}/meta_resource_patient.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_patient.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', f'{study_directory}/meta_resource_study.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_study.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_treatment_ec50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_treatment_ec50.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', 
f'{study_directory}/data_treatment_ic50.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_structural_variants.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_structural_variants.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_cna_discrete.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_expression_median_Zscores.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_expression_median_Zscores.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_gsva_scores.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_gsva_scores.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_gsva_pvalues.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_gsva_pvalues.txt', '--noprogress'), + + ] + self.assertCountEqual(run_java.call_args_list, [ + call(*common_part, 'org.mskcc.cbio.portal.util.VersionUtil',), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportTypesOfCancers', f'{study_directory}/data_cancer_type.txt', 'false', '--noprogress'), + remove_study_call, + create_study_call, + clinical_sample_call, + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceDefinition', '--meta', f'{study_directory}/meta_resource_definition.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_definition.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', 
'--meta', f'{study_directory}/meta_resource_sample.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_sample.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--meta', f'{study_directory}/meta_clinical_patients.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_clinical_patients.txt', '--noprogress'), + *mol_profile_calls, + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData', '--meta', f'{study_directory}/meta_cna_hg19_seg.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_cna_hg19.seg', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportGisticData', '--data', f'{study_directory}/data_gistic_genes_amp.txt', '--study', 'study_es_0', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap', '--meta', f'{study_directory}/meta_gene_panel_matrix.txt', '--data', f'{study_directory}/data_gene_panel_matrix.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_cna.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_cnaseq.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_custom.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_sequenced.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_test.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.AddCaseList', 'study_es_0', 'all', '--noprogress'), + make_study_available_call, + ]) + + self.assertTrue(run_java.call_args_list.index(remove_study_call) < run_java.call_args_list.index(create_study_call)) + 
self.assertTrue(run_java.call_args_list.index(create_study_call) < run_java.call_args_list.index(clinical_sample_call)) + self.assertTrue(all(run_java.call_args_list.index(clinical_sample_call) < run_java.call_args_list.index(mol_profile_call) + for mol_profile_call in mol_profile_calls)) + self.assertEqual(run_java.call_args_list[-1], make_study_available_call) + + + @mock.patch('importer.cbioportalImporter.locate_jar') + @mock.patch('importer.cbioportalImporter.run_java') + def test_incremental_load(self, run_java, locate_jar): + ''' + Tests java commands incremental load produces + ''' + locate_jar.return_value = "test.jar" + + data_directory = 'test_data/study_es_0_inc' + args = ['--data_directory', data_directory] + parsed_args = cbioportalImporter.interface(args) + cbioportalImporter.main(parsed_args) + + clinical_patient_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_clinical_patients.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_patients.txt', '--noprogress') + clinical_sample_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_samples.txt', '--noprogress') + mutation_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_mutations_extended.maf', '--noprogress') + cna_discrete_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_discrete.txt', '--noprogress') + cna_log2_call = 
call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_log2.txt', '--noprogress') + expression_median_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_expression_median.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_expression_median.txt', '--noprogress') + methylation_hm27_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_methylation_hm27.txt', '--noprogress') + treatment_ic50_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_treatment_ic50.txt', '--noprogress') + sv_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_structural_variants.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_structural_variants.txt', '--noprogress') + timeline_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportTimelineData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_timeline.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_timeline.txt', '--noprogress') + case_list_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', + '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists') + gene_panel_matrix_call = call(*common_part, 
'org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap', '--overwrite-existing', + '--meta', f'{data_directory}/meta_gene_panel_matrix.txt', '--data', f'{data_directory}/data_gene_panel_matrix.txt', '--noprogress') + seg_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_hg19_seg.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_cna_hg19.seg', '--noprogress') + + self.assertCountEqual(run_java.call_args_list, [ + call(*common_part, 'org.mskcc.cbio.portal.util.VersionUtil',), + clinical_patient_call, + clinical_sample_call, + mutation_call, + cna_discrete_call, + cna_log2_call, + expression_median_call, + methylation_hm27_call, + treatment_ic50_call, + sv_call, + timeline_call, + gene_panel_matrix_call, + seg_call, + case_list_call, + ]) + + self.assertTrue(run_java.call_args_list.index(clinical_sample_call) < run_java.call_args_list.index(mutation_call)) + self.assertTrue(run_java.call_args_list.index(clinical_sample_call) < run_java.call_args_list.index(case_list_call)) + + +if __name__ == '__main__': + unittest.main(buffer=True) diff --git a/tests/test_data/study_es_0_inc/case_lists/cases_sequenced.txt b/tests/test_data/study_es_0_inc/case_lists/cases_sequenced.txt new file mode 100644 index 00000000..c05c77f7 --- /dev/null +++ b/tests/test_data/study_es_0_inc/case_lists/cases_sequenced.txt @@ -0,0 +1,5 @@ +cancer_study_identifier: study_es_0 +stable_id: study_es_0_sequenced +case_list_name: Samples profiled for mutations +case_list_description: This is this case list that contains all samples that are profiled for mutations. 
+case_list_ids: TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW-01 diff --git a/tests/test_data/study_es_0_inc/cna_long/data_cna_discrete_long.txt b/tests/test_data/study_es_0_inc/cna_long/data_cna_discrete_long.txt new file mode 100644 index 00000000..9c3b1427 --- /dev/null +++ b/tests/test_data/study_es_0_inc/cna_long/data_cna_discrete_long.txt @@ -0,0 +1,31 @@ +Hugo_Symbol Entrez_Gene_Id Sample_Id Value cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +AKT3 10000 TCGA-C8-A12K-01 0 +AKT3 10000 TCGA-BH-NON-EXIST -2 +AKT3 10000 TCGA-AO-A129-01 -2 +AKT1 207 TCGA-C8-A12K-01 -1 +AKT1 207 TCGA-BH-NON-EXIST 2 +AKT1 207 TCGA-AO-A129-01 2 +AKT2|TEST 208 TCGA-C8-A12K-01 -2 +AKT2|TEST 208 TCGA-BH-NON-EXIST 2 +AKT2|TEST 208 TCGA-AO-A129-01 -1 Putative_Driver Test driver Class 1 Class annotation +HRAS 3265 TCGA-C8-A12K-01 2 +HRAS 3265 TCGA-BH-NON-EXIST 2 +HRAS 3265 TCGA-AO-A129-01 0 +KRAS 3845 TCGA-C8-A12K-01 0 Class 2 Class annotation +KRAS 3845 TCGA-BH-NON-EXIST -2 +KRAS 3845 TCGA-AO-A129-01 2 Putative_Passenger Test passenger Class 2 Class annotation + 4893 TCGA-C8-A12K-01 -2 + 4893 TCGA-BH-NON-EXIST -2 + 4893 TCGA-AO-A129-01 -1 +BRCA1 TCGA-C8-A12K-01 2 +BRCA1 TCGA-BH-NON-EXIST 2 +BRCA1 TCGA-AO-A129-01 0 +BRAF 673 TCGA-C8-A12K-01 2 +BRAF 673 TCGA-BH-NON-EXIST -2 +BRAF 673 TCGA-AO-A129-01 -2 +BRCA2 675 TCGA-C8-A12K-01 -1.5 +BRCA2 675 TCGA-BH-NON-EXIST 2 +BRCA2 675 TCGA-AO-A129-01 0 +CDK1 983 TCGA-C8-A12K-01 -2 Putative_Driver +CDK1 983 TCGA-BH-NON-EXIST -2 +CDK1 983 TCGA-AO-A129-01 2 Putative_Passenger Test passenger diff --git a/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt b/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt new file mode 100644 index 00000000..d1ce3813 --- /dev/null +++ b/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE_LONG +stable_id: gistic 
+show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. +profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete_long.txt diff --git a/tests/test_data/study_es_0_inc/data_clinical_patients.txt b/tests/test_data/study_es_0_inc/data_clinical_patients.txt new file mode 100644 index 00000000..105e2fa4 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_clinical_patients.txt @@ -0,0 +1,7 @@ +#Patient Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 +PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-BH-A18K 1:DECEASED 96.74 NA [Not Available] +TCGA-BH-NEW 0:LIVING 2.37 0:DiseaseFree 2.37 diff --git a/tests/test_data/study_es_0_inc/data_clinical_samples.txt b/tests/test_data/study_es_0_inc/data_clinical_samples.txt new file mode 100644 index 00000000..40dbf1ea --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_clinical_samples.txt @@ -0,0 +1,8 @@ +#Patient Identifier Sample Identifier Subtype +#Patient identifier Sample identifier Subtype description +#STRING STRING STRING +#1 1 1 +PATIENT_ID SAMPLE_ID SUBTYPE +TCGA-A1-A0SB TCGA-A1-A0SB-01 Luminal A +TCGA-A1-A0SB TCGA-A1-A0SB-03 basal-like +TCGA-BH-NEW TCGA-BH-NEW-01 NA diff --git a/tests/test_data/study_es_0_inc/data_cna_discrete.txt b/tests/test_data/study_es_0_inc/data_cna_discrete.txt new file mode 100644 index 00000000..518b727c --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_discrete.txt @@ -0,0 +1,9 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0 0 -1 + 375790 -1 -1 0 +ATAD3A 55210 0 0 -2 +ATAD3B 
83858 -2 -1 0 +ATAD3C 219293 0 0 0 +ERCC5 2073 0 -1 -2 +ACP3 55 0 0 0 +TP53 -1 0 -2 diff --git a/tests/test_data/study_es_0_inc/data_cna_hg19.seg b/tests/test_data/study_es_0_inc/data_cna_hg19.seg new file mode 100644 index 00000000..4c149a9c --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_hg19.seg @@ -0,0 +1,10 @@ +ID chrom loc.start loc.end num.mark seg.mean +TCGA-A2-A04P-01 1 3218610 95674710 53225 0.0055 +TCGA-A2-A04P-01 1 95676511 95676518 2 -1.6636 +TCGA-A2-A04P-01 1 95680124 167057183 24886 0.0053 +TCGA-A1-A0SB-01 1 167057495 167059336 3 -1.0999 +TCGA-A1-A0SB-01 1 167059760 181602002 9213 -8e-04 +TCGA-A1-A0SB-03 1 181603120 181609567 6 -1.2009 +TCGA-A1-A0SB-03 1 181610685 201473647 12002 0.0055 +TCGA-BH-NEW-01 1 201474400 201474544 2 -1.4235 +TCGA-BH-NEW-01 1 201475220 247813706 29781 -4e-04 diff --git a/tests/test_data/study_es_0_inc/data_cna_log2.txt b/tests/test_data/study_es_0_inc/data_cna_log2.txt new file mode 100644 index 00000000..bb0fdb32 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_log2.txt @@ -0,0 +1,9 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.751 0.533 0.114 + 375790 0.062 0.071 0.948 +ATAD3A 55210 0.487 0.695 0.364 +ATAD3B 83858 0.150 0.492 0.300 +ATAD3C 219293 0.995 0.170 0.654 +ERCC5 2073 0.816 0.514 0.165 +ACP3 55 0.252 0.713 0.513 +TP53 0.360 0.538 0.891 diff --git a/tests/test_data/study_es_0_inc/data_cna_pd_annotations.txt b/tests/test_data/study_es_0_inc/data_cna_pd_annotations.txt new file mode 100644 index 00000000..53d372d2 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_pd_annotations.txt @@ -0,0 +1,5 @@ +SAMPLE_ID Entrez_Gene_Id cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +TCGA-A1-A0SB-01 116983 Putative_Passenger Test passenger Class 2 Class annotation +TCGA-A1-A0SB-01 375790 Putative_Driver Test driver Class 1 Class annotation +TCGA-A1-A0SB-03 219293 Putative_Passenger Test passenger +TCGA-BH-NEW 2073 
Putative_Driver Test driver diff --git a/tests/test_data/study_es_0_inc/data_expression_median.txt b/tests/test_data/study_es_0_inc/data_expression_median.txt new file mode 100644 index 00000000..7e1f5a4b --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_expression_median.txt @@ -0,0 +1,9 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.096 0.826 0.032 + 375790 0.309 0.399 0.680 +ATAD3A 55210 0.569 0.189 0.266 +ATAD3B 83858 0.829 0.473 0.611 +ATAD3C 219293 0.307 0.445 0.045 +ERCC5 2073 0.171 0.766 0.590 +ACP3 55 0.422 0.870 0.745 +TP53 0.179 0.694 0.808 diff --git a/tests/test_data/study_es_0_inc/data_gene_panel_matrix.txt b/tests/test_data/study_es_0_inc/data_gene_panel_matrix.txt new file mode 100644 index 00000000..344837b7 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_gene_panel_matrix.txt @@ -0,0 +1,6 @@ +SAMPLE_ID mutations gistic +TEST-A2B8-01 TESTPANEL1 NA +TEST_SAMPLE_3 NA TESTPANEL1 +TCGA-BH-NEW-01 TESTPANEL1 TESTPANEL1 +TCGA-A1-A0SK-01 TESTPANEL2 TESTPANEL1 +TCGA-A1-A0SB-01 TESTPANEL2 TESTPANEL1 diff --git a/tests/test_data/study_es_0_inc/data_methylation_hm27.txt b/tests/test_data/study_es_0_inc/data_methylation_hm27.txt new file mode 100644 index 00000000..3db35409 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_methylation_hm27.txt @@ -0,0 +1,9 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.022 0.681 0.790 + 375790 0.435 0.340 0.321 +ATAD3A 55210 0.229 0.946 0.439 +ATAD3B 83858 0.885 0.707 0.664 +ATAD3C 219293 0.660 0.315 0.694 +ERCC5 2073 0.436 0.749 0.345 +ACP3 55 0.622 0.396 0.029 +TP53 0.563 0.686 0.607 diff --git a/tests/test_data/study_es_0_inc/data_mutations_extended.maf b/tests/test_data/study_es_0_inc/data_mutations_extended.maf new file mode 100644 index 00000000..42bd026a --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_mutations_extended.maf @@ -0,0 +1,20 @@ +#version 2.4 +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build 
Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer MA:FImpact MA:FIS Amino_Acid_Change MA:link.MSA MA:link.PDB MA:link.var Tumor_Sample_UUID Matched_Norm_Sample_UUID HGVSc HGVSp HGVSp_Short Transcript_ID Exon_Number t_depth t_ref_count t_alt_count n_depth n_ref_count n_alt_count all_effects Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation ALLELE_NUM DISTANCE SYMBOL SYMBOL_SOURCE HGNC_ID BIOTYPE CANONICAL CCDS ENSP SWISSPROT TREMBL UNIPARC RefSeq SIFT PolyPhen EXON INTRON DOMAINS GMAF AFR_MAF AMR_MAF ASN_MAF EAS_MAF EUR_MAF SAS_MAF AA_MAF EA_MAF CLIN_SIG SOMATIC PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE IMPACT PICK VARIANT_CLASS TSL HGVS_OFFSET PHENO chromosome_name_wu start_wu stop_wu reference_wu variant_wu type_wu gene_name_wu transcript_name_wu transcript_species_wu transcript_source_wu transcript_version_wu strand_wu transcript_status_wu trv_type_wu c_position_wu amino_acid_change_wu ucsc_cons_wu domain_wu all_domains_wu deletion_substructures_wu transcript_error_wu default_gene_name_wu gene_name_source_wu ensembl_gene_id normal_ref_reads normal_var_reads normal_vaf tumor_ref_reads tumors_var_reads tumor_vaf evs_ea evs_aa evs_all chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU c_position_WU amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU 
transcript_error_WU default_gene_name_WU gene_name_source_WU EVS_EA EVS_AA EVS_All cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation Zygosity.name Zygosity.code +OR11H1 genome.wustl.edu GRCh37 22 16449539 16449539 -1 Missense_Mutation SNP A A G TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A A Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx low 1.49 V89A getma.org/?cm=msa&ty=f&p=O11H1_HUMAN&rb=1&re=154&var=V89A getma.org/?cm=var&var=hg19,22,16449539,A,G&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.266T>C p.Val89Ala p.V89A ENST00000252835 1/1 0 0 OR11H1,missense_variant,p.Val89Ala,ENST00000252835,NM_001005239.1; G ENSG00000130538 ENST00000252835 Transcript missense_variant 267/982 266/981 89/326 V/A gTc/gCc rs199856986,COSM1484040 1 OR11H1 HGNC 15404 protein_coding YES CCDS33594.1 ENSP00000252835 O11H1_HUMAN UPI000004B1CF NM_001005239.1 deleterious(0.02) possibly_damaging(0.589) 1/1 Transmembrane_helices:TMhelix,PROSITE_profiles:PS50262,hmmpanther:PTHR24242:SF201,hmmpanther:PTHR24242,Gene3D:1.20.1070.10,Superfamily_domains:SSF81321 0,1 MODERATE 1 SNV 0,1 22 16449539 16449539 A G SNP OR11H1 ENST00000252835 human ensembl 69_37n -1 known missense c.266 p.V89A 234 pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam,prints_Olfact_rcpt,prints_7TM_GPCR_Rhodpsn - no_errors OR11H1 HGNC ENSG00000130538 65 0 0 38 6 13.64 - - - 22 16449539 16449539 A G SNP OR11H1 ENST00000252835 human ensembl 69_37n -1 known missense c.266 p.V89A 234 pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam,prints_Olfact_rcpt,prints_7TM_GPCR_Rhodpsn - no_errors OR11H1 HGNC - - - Putative_Driver Test driver +TMEM247 genome.wustl.edu GRCh37 2 46707888 46707888 1 Frame_Shift_Del DEL G G - TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 G G Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 
a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.463delG p.Ala155ArgfsTer59 p.A155Rfs*59 ENST00000434431 2/3 0 0 TMEM247,frameshift_variant,p.Ala155ArgfsTer59,ENST00000434431,NM_001145051.2;TMEM247,intron_variant,,ENST00000432241,; - ENSG00000187600 ENST00000434431 Transcript frameshift_variant 462/659 462/659 154/219 E/X gaG/ga COSM1408208,~rs70940616 1 TMEM247 HGNC 42967 protein_coding YES CCDS56117.1 ENSP00000388684 TM247_HUMAN UPI0000366EF8 NM_001145051.2 2/3 Coiled-coils_(Ncoils):Coil,Pfam_domain:PF15444 -:0.0202 -:0.0439 1 HIGH 1 deletion 1 1 2 46707888 46707888 G - DEL TMEM247 ENST00000434431 human ensembl 69_37n 1 known frame_shift_del c.462 p.A155fs 83 - no_stop_codon:bad_bp_length_for_coding_region TMEM247 HGNC ENSG00000187600 20 0 0 7 3 30 - - - 2 46707888 46707888 G - DEL TMEM247 ENST00000434431 human ensembl 69_37n 1 known frame_shift_del c.462 p.A155fs 83 - no_stop_codon:bad_bp_length_for_coding_region TMEM247 HGNC - - - Putative_Passenger Test passenger Class 2 Class annotation +ABLIM1 genome.wustl.edu GRCh37 10 116247760 116247760 -1 Missense_Mutation SNP T C C TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 3.39 H333R getma.org/?cm=msa&ty=f&p=ABLM1_HUMAN&rb=285&re=339&var=H333R getma.org/pdb.php?prot=ABLM1_HUMAN&from=285&to=339&var=H333R getma.org/?cm=var&var=hg19,10,116247760,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.998A>G p.His333Arg p.H333R ENST00000277895 8/23 0 0 
ABLIM1,missense_variant,p.His273Arg,ENST00000533213,;ABLIM1,missense_variant,p.His273Arg,ENST00000369252,NM_001003408.1,NM_001003407.1;ABLIM1,missense_variant,p.His17Arg,ENST00000392952,NM_006720.3;ABLIM1,missense_variant,p.His17Arg,ENST00000369266,;ABLIM1,missense_variant,p.His333Arg,ENST00000277895,NM_002313.5;ABLIM1,missense_variant,p.His17Arg,ENST00000369253,;ABLIM1,missense_variant,p.His17Arg,ENST00000428430,;ABLIM1,upstream_gene_variant,,ENST00000440467,;ABLIM1,missense_variant,p.His273Arg,ENST00000392955,;ABLIM1,missense_variant,p.His273Arg,ENST00000369256,; C ENSG00000099204 ENST00000277895 Transcript missense_variant 1096/2657 998/2337 333/778 H/R cAt/cGt COSM1474374,COSM1474373,COSM1474375 1 ABLIM1 HGNC 78 protein_coding YES CCDS7590.1 ENSP00000277895 ABLM1_HUMAN UPI0000418D06 NM_002313.5 deleterious(0) probably_damaging(0.988) 8/23 PROSITE_profiles:PS50023,hmmpanther:PTHR24213:SF18,hmmpanther:PTHR24213,Gene3D:2.10.110.10,SMART_domains:SM00132,Superfamily_domains:SSF57716 1,1,1 MODERATE 1 SNV 1,1,1 10 116247760 116247760 T C SNP ABLIM1 ENST00000277895 human ensembl 69_37n -1 known missense c.998 p.H333R 1000 smart_Znf_LIM pfam_Znf_LIM,pfam_Villin_headpiece,superfamily_Villin_headpiece,smart_Znf_LIM,smart_Villin_headpiece,pfscan_Villin_headpiece,pfscan_Znf_LIM - no_errors ABLIM1 HGNC ENSG00000099204 77 0 0 36 13 26.53 - - - 10 116247760 116247760 T C SNP ABLIM1 ENST00000277895 human ensembl 69_37n -1 known missense c.998 p.H333R 1000 smart_Znf_LIM pfam_Znf_LIM,pfam_Villin_headpiece,superfamily_Villin_headpiece,smart_Znf_LIM,smart_Villin_headpiece,pfscan_Villin_headpiece,pfscan_Znf_LIM - no_errors ABLIM1 HGNC - - - Putative_Driver Test driver Class 1 Class annotation +ADAMTS20 genome.wustl.edu GRCh37 12 43944926 43944926 -1 Missense_Mutation SNP T T C TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 2.85 Y80C getma.org/?cm=msa&ty=f&p=ATS20_HUMAN&rb=40&re=186&var=Y80C 
getma.org/?cm=var&var=hg19,12,43944926,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 p.Tyr80Cys p.Y80C ENST00000389420 2/39 0 0 ADAMTS20,missense_variant,p.Tyr80Cys,ENST00000389420,NM_025003.3;ADAMTS20,missense_variant,p.Tyr80Cys,ENST00000553158,; C ENSG00000173157 ENST00000389420 Transcript missense_variant 239/6076 239/5733 80/1910 Y/C tAt/tGt COSM1476552,COSM1476551 1 ADAMTS20 HGNC 17178 protein_coding YES CCDS31778.2 ENSP00000374071 ATS20_HUMAN UPI00004565F4 NM_025003.3 deleterious(0) probably_damaging(1) 2/39 hmmpanther:PTHR13723,hmmpanther:PTHR13723:SF165,Pfam_domain:PF01562 1,1 MODERATE 1 SNV 1,1 12 43944926 43944926 T C SNP ADAMTS20 ENST00000389420 human ensembl 69_37n -1 known missense c.239 p.Y80C 1000 pfam_Peptidase_M12B_N pfam_Pept_M12B_GON-ADAMTSs,pfam_Thrombospondin_1_rpt,pfam_Peptidase_M12B_N,pfam_ADAM_spacer1,pfam_Peptidase_M12B,superfamily_Thrombospondin_1_rpt,smart_Thrombospondin_1_rpt,prints_Peptidase_M12B_ADAM-TS,pfscan_Pept_M12B_GON-ADAMTSs,pfscan_Thrombospondin_1_rpt,pfscan_Peptidase_M12B - no_errors ADAMTS20 HGNC ENSG00000173157 50 0 0 19 17 45.95 - - - 12 43944926 43944926 T C SNP ADAMTS20 ENST00000389420 human ensembl 69_37n -1 known missense c.239 p.Y80C 1000 pfam_Peptidase_M12B_N pfam_Pept_M12B_GON-ADAMTSs,pfam_Thrombospondin_1_rpt,pfam_Peptidase_M12B_N,pfam_ADAM_spacer1,pfam_Peptidase_M12B,superfamily_Thrombospondin_1_rpt,smart_Thrombospondin_1_rpt,prints_Peptidase_M12B_ADAM-TS,pfscan_Pept_M12B_GON-ADAMTSs,pfscan_Thrombospondin_1_rpt,pfscan_Peptidase_M12B - no_errors ADAMTS20 HGNC - - - Unknown Class 4 Class annotation +DTNB genome.wustl.edu GRCh37 2 25678299 25678299 -1 Missense_Mutation SNP C G T TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 C C Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 2125 V382M getma.org/?cm=msa&ty=f&p=DTNB_HUMAN&rb=283&re=473&var=V382M getma.org/?cm=var&var=hg19,2,25678299,C,T&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 
c.1144C>A p.Val382Met p.V382M ENST00000406818 11/21 0 0 DTNB,missense_variant,p.Val382Met,ENST00000406818,NM_001256303.1,NM_021907.4;DTNB,missense_variant,p.Val382Met,ENST00000407661,NM_183360.2,NM_001256304.1;DTNB,missense_variant,p.Val382Met,ENST00000404103,NM_033147.3;DTNB,missense_variant,p.Val382Met,ENST00000288642,;DTNB,missense_variant,p.Val325Met,ENST00000496972,NM_001256308.1;DTNB,missense_variant,p.Val178Met,ENST00000545439,;DTNB,intron_variant,,ENST00000407038,NM_033148.3;DTNB,intron_variant,,ENST00000407186,;DTNB,intron_variant,,ENST00000405222,NM_183361.2;DTNB,intron_variant,,ENST00000489756,;DTNB,intron_variant,,ENST00000481841,;DTNB,intron_variant,,ENST00000486555,;DTNB,3_prime_UTR_variant,,ENST00000398951,;DTNB,non_coding_transcript_exon_variant,,ENST00000485845,;DTNB,non_coding_transcript_exon_variant,,ENST00000479898,;DTNB,intron_variant,,ENST00000356599,;DTNB,intron_variant,,ENST00000482145,; T ENSG00000138101 ENST00000406818 Transcript missense_variant 1394/2474 1144/1884 382/627 V/M Gtg/Atg COSM3839175,COSM3839176 1 DTNB HGNC 3058 protein_coding YES CCDS46237.1 ENSP00000384084 DTNB_HUMAN Q53TC8_HUMAN,Q53T51_HUMAN,Q53SF9_HUMAN,Q53QV1_HUMAN,F8W9U0_HUMAN,E9PE76_HUMAN,E7ES64_HUMAN UPI0000129949 NM_001256303.1,NM_021907.4 deleterious(0.03) benign(0.379) 11/21 hmmpanther:PTHR11915:SF227,hmmpanther:PTHR11915,PIRSF_domain:PIRSF038204 1,1 MODERATE 1 SNV 1,1 2 25678299 25678299 C T SNP DTNB ENST00000406818 human ensembl 69_37n -1 known missense c.1144 p.V382M 1000 pirsf_Distrobrevin pfam_EF-hand_dom_typ1,pfam_EF-hand_dom_typ2,pfam_Znf_ZZ,smart_Znf_ZZ,pirsf_Distrobrevin,pfscan_Znf_ZZ - no_errors DTNB HGNC ENSG00000138101 35 0 0 9 9 50 - - - 2 25678299 25678299 C T SNP DTNB ENST00000406818 human ensembl 69_37n -1 known missense c.1144 p.V382M 1000 pirsf_Distrobrevin pfam_EF-hand_dom_typ1,pfam_EF-hand_dom_typ2,pfam_Znf_ZZ,smart_Znf_ZZ,pirsf_Distrobrevin,pfscan_Znf_ZZ - no_errors DTNB HGNC - - - Putative_Passenger Test passenger Class 1 Class annotation 
+TP53 genome.wustl.edu GRCh37 17 7578253 7578253 0 Missense_Mutation SNP C C A TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A C Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx medium 3.005 getma.org/?cm=msa&ty=f&p=P53_HUMAN&rb=95&re=289&var=G199V getma.org/pdb.php?prot=P53_HUMAN&from=95&to=289&var=G199V getma.org/?cm=var&var=hg19,17,7578253,C,A&fts=all ENST00000269305.4:c.596G>T p.Gly199Val p.G199V ENST00000269305 11-Jun 0 0 TP53,missense_variant,p.Gly199Val,ENST00000420246,NM_001126114.2,NM_001276696.1;TP53,missense_variant,p.Gly199Val,ENST00000455263,NM_001276695.1,NM_001126113.2;TP53,missense_variant,p.Gly199Val,ENST00000269305,NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_001126118.1;TP53,missense_variant,p.Gly199Val,ENST00000445888,;TP53,missense_variant,p.Gly199Val,ENST00000359597,;TP53,missense_variant,p.Gly199Val,ENST00000413465,;TP53,missense_variant,p.Gly67Val,ENST00000509690,;TP53,missense_variant,p.Gly106Val,ENST00000514944,;TP53,downstream_gene_variant,,ENST00000508793,;TP53,downstream_gene_variant,,ENST00000604348,;TP53,downstream_gene_variant,,ENST00000503591,;TP53,upstream_gene_variant,,ENST00000576024,;TP53,intron_variant,,ENST00000574684,;TP53,non_coding_transcript_exon_variant,,ENST00000510385,;TP53,non_coding_transcript_exon_variant,,ENST00000504290,;TP53,non_coding_transcript_exon_variant,,ENST00000504937,;TP53,non_coding_transcript_exon_variant,,ENST00000505014,; A ENSG00000141510 ENST00000269305 Transcript missense_variant 786/2579 596/1182 199 G/V gGa/gTa TP53_g.12665G>T,COSM44140,COSM255788,COSM255787,COSM255789,COSM3675525,COSM3675524,COSM255790 1 TP53 HGNC 11998 protein_coding YES CCDS11118.1 ENSP00000269305 P53_HUMAN 
S5LQU8_HUMAN,Q761V2_HUMAN,Q6IT77_HUMAN,Q1HGV1_HUMAN,Q0PKT5_HUMAN,L0ES54_HUMAN,L0EQ05_HUMAN,K7PPA8_HUMAN,H2EHT1_HUMAN,G4Y083_HUMAN,E9PCY9_HUMAN,E7ESS1_HUMAN,E7EMR6_HUMAN,B5AKF6_HUMAN,B4DNI2_HUMAN,A4GWD0_HUMAN,A4GWB8_HUMAN,A4GWB5_HUMAN,A4GW97_HUMAN,A4GW76_HUMAN,A4GW75_HUMAN,A4GW74_HUMAN,A4GW67_HUMAN,A2I9Z1_HUMAN,A2I9Z0_HUMAN UPI000002ED67 NM_001126112.2 deleterious(0) probably_damaging(1) 11-Jun Gene3D:2.60.40.720,Pfam_domain:PF00870,hmmpanther:PTHR11447,hmmpanther:PTHR11447:SF6,Superfamily_domains:SSF49417 0,1,1,1,1,1,1,1 MODERATE 1 SNV 0,1,1,1,1,1,1,1 17 7578253 7578253 C A SNP TP53 NM_000546.4 human genbank 58_37c -1 reviewed missense c.596 p.G199V 1 HMMPfam_P53|7Csuperfamily_p53-like transcription factors HMMPfam_P53_TAD|7CHMMPfam_P53|7Csuperfamily_p53-like transcription factors|7CPatternScan_P53|7CHMMPfam_P53_tetramer|7Csuperfamily_p53 tetramerization domain - 17 7578253 7578253 C A SNP TP53 NM_000546.4 human genbank 58_37c -1 reviewed missense c.596 p.G199V 1 HMMPfam_P53|7Csuperfamily_p53-like transcription factors HMMPfam_P53_TAD|7CHMMPfam_P53|7Csuperfamily_p53-like transcription factors|7CPatternScan_P53|7CHMMPfam_P53_tetramer|7Csuperfamily_p53 tetramerization domain - +TP53 genome.wustl.edu GRCh37 17 7576851 7576851 0 Splice_Site SNP A A C novel unknown TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A A Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx ENST00000269305.4:c.993+2T>G p.X331_splice ENST00000269305 0 0 
TP53,splice_donor_variant,,ENST00000420246,NM_001126114.2,NM_001276696.1;TP53,splice_donor_variant,,ENST00000455263,NM_001276695.1,NM_001126113.2;TP53,splice_donor_variant,,ENST00000269305,NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_001126118.1;TP53,splice_donor_variant,,ENST00000445888,;TP53,splice_donor_variant,,ENST00000359597,;TP53,splice_donor_variant,,ENST00000576024,;TP53,intron_variant,,ENST00000413465,;TP53,downstream_gene_variant,,ENST00000509690,;TP53,downstream_gene_variant,,ENST00000508793,;TP53,downstream_gene_variant,,ENST00000604348,;TP53,downstream_gene_variant,,ENST00000503591,;TP53,downstream_gene_variant,,ENST00000514944,;TP53,downstream_gene_variant,,ENST00000574684,;TP53,splice_donor_variant,,ENST00000510385,;TP53,splice_donor_variant,,ENST00000504290,;TP53,splice_donor_variant,,ENST00000504937,;TP53,downstream_gene_variant,,ENST00000505014,; C ENSG00000141510 ENST00000269305 Transcript splice_donor_variant -/2579 993/1182 TP53_g.14067T>G,COSM29774,COSM146229 1 TP53 HGNC 11998 protein_coding YES CCDS11118.1 ENSP00000269305 P53_HUMAN S5LQU8_HUMAN,Q761V2_HUMAN,Q6IT77_HUMAN,Q1HGV1_HUMAN,Q0PKT5_HUMAN,L0ES54_HUMAN,L0EQ05_HUMAN,K7PPA8_HUMAN,H2EHT1_HUMAN,G4Y083_HUMAN,E9PCY9_HUMAN,E7ESS1_HUMAN,E7EMR6_HUMAN,B5AKF6_HUMAN,B4DNI2_HUMAN,A4GWD0_HUMAN,A4GWB8_HUMAN,A4GWB5_HUMAN,A4GW97_HUMAN,A4GW76_HUMAN,A4GW75_HUMAN,A4GW74_HUMAN,A4GW67_HUMAN,A2I9Z1_HUMAN,A2I9Z0_HUMAN UPI000002ED67 NM_001126112.2 10-Sep 0,1,1 HIGH 1 SNV 0,1,1 17 7576851 7576851 A C SNP TP53 NM_000546 human genbank 57_37b -1 reviewed splice_site c.993+2 e8+2 1 - - - 17 7576851 7576851 A C SNP TP53 NM_000546 human genbank 57_37b -1 reviewed splice_site c.993+2 e8+2 1 - - - +BRCA1 genome.wustl.edu GRCh37 17 41243581 41243581 0 Nonsense_Mutation SNP G G A rs80357262 TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 A G Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx 0 getma.org/?cm=var&var=hg19,17,41243581,G,A&fts=all ENST00000357654.3:c.3967C>T p.Gln1323Ter p.Q1323* ENST00000357654 23-Oct 0 
0 BRCA1,stop_gained,p.Gln1027Ter,ENST00000309486,NM_007297.3;BRCA1,stop_gained,p.Gln1323Ter,ENST00000357654,NM_007294.3;BRCA1,stop_gained,p.Gln1323Ter,ENST00000346315,;BRCA1,stop_gained,p.Gln1323Ter,ENST00000354071,;BRCA1,stop_gained,p.Gln1323Ter,ENST00000471181,NM_007300.3;BRCA1,stop_gained,p.Gln1276Ter,ENST00000493795,;BRCA1,stop_gained,p.Gln88Ter,ENST00000461574,;BRCA1,intron_variant,,ENST00000352993,;BRCA1,intron_variant,,ENST00000351666,;BRCA1,intron_variant,,ENST00000468300,NM_007299.3;BRCA1,intron_variant,,ENST00000491747,NM_007298.3;BRCA1,intron_variant,,ENST00000478531,;BRCA1,intron_variant,,ENST00000493919,;BRCA1,intron_variant,,ENST00000484087,;BRCA1,intron_variant,,ENST00000591534,;BRCA1,intron_variant,,ENST00000487825,;BRCA1,intron_variant,,ENST00000586385,;BRCA1,intron_variant,,ENST00000591849,;BRCA1,downstream_gene_variant,,ENST00000470026,;BRCA1,downstream_gene_variant,,ENST00000477152,;BRCA1,downstream_gene_variant,,ENST00000494123,;BRCA1,downstream_gene_variant,,ENST00000473961,;BRCA1,downstream_gene_variant,,ENST00000497488,;BRCA1,downstream_gene_variant,,ENST00000476777,;BRCA1,3_prime_UTR_variant,,ENST00000461221,;BRCA1,non_coding_transcript_exon_variant,,ENST00000467274,;BRCA1,downstream_gene_variant,,ENST00000492859,;BRCA1,downstream_gene_variant,,ENST00000412061,; A ENSG00000012048 ENST00000357654 Transcript stop_gained 4086/7094 3967/5592 1323 Q/* Caa/Taa rs80357262 1 BRCA1 HGNC 1100 protein_coding CCDS11453.1 ENSP00000350283 BRCA1_HUMAN Q9UE29_HUMAN,Q9NQR3_HUMAN,Q92897_HUMAN,Q7KYU6_HUMAN,Q4EW25_HUMAN,Q3YB53_HUMAN,Q3YB50_HUMAN,Q3YB49_HUMAN,Q3LRH8_HUMAN,Q3B891_HUMAN,K7EPC7_HUMAN,K4K7V3_HUMAN,K4JXS7_HUMAN,K4JUB1_HUMAN,G4V503_HUMAN,G4V502_HUMAN,G4V500_HUMAN,G4V4Z8_HUMAN,G4V4Z7_HUMAN,G1UI37_HUMAN,E9PFZ0_HUMAN,E7EWN5_HUMAN,E7EP70_HUMAN,C9IZW4_HUMAN,C4PFY7_HUMAN UPI0000126AC8 NM_007294.3 23-Oct PIRSF_domain:PIRSF001734,hmmpanther:PTHR13763,hmmpanther:PTHR13763:SF0 not_provided,pathogenic HIGH SNV 1 17 41243581 41243581 G A SNP BRCA1 NM_007294.3 
human genbank 58_37c -1 reviewed nonsense c.3967 p.Q1323* 0.949 HMMPfam_BRCT|7CHMMSmart_SM00292|7Csuperfamily_BRCT domain|7CHMMSmart_SM00184|7CPatternScan_ZF_RING_1|7CHMMPfam_zf-C3HC4|7Csuperfamily_RING/U-box - 17 41243581 41243581 G A SNP BRCA1 NM_007294.3 human genbank 58_37c -1 reviewed nonsense c.3967 p.Q1323* 0.949 HMMPfam_BRCT|7CHMMSmart_SM00292|7Csuperfamily_BRCT domain|7CHMMSmart_SM00184|7CPatternScan_ZF_RING_1|7CHMMPfam_zf-C3HC4|7Csuperfamily_RING/U-box - +BRCA1 genome.wustl.edu GRCh37 17 41201181 41201181 0 Missense_Mutation SNP C C A rs80357069 byCluster TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 C C Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx medium 2.25 getma.org/?cm=msa&ty=f&p=BRCA1_HUMAN&rb=1756&re=1842&var=G1788V getma.org/pdb.php?prot=BRCA1_HUMAN&from=1756&to=1842&var=G1788V getma.org/?cm=var&var=hg19,17,41201181,C,A&fts=all ENST00000357654.3:c.5363G>T p.Gly1788Val p.G1788V ENST00000357654 21/23 0 0 BRCA1,missense_variant,p.Gly1492Val,ENST00000309486,NM_007297.3;BRCA1,missense_variant,p.Gly1788Val,ENST00000357654,NM_007294.3;BRCA1,missense_variant,p.Gly1549Val,ENST00000346315,;BRCA1,missense_variant,p.Gly1523Val,ENST00000354071,;BRCA1,missense_variant,p.Gly1809Val,ENST00000471181,NM_007300.3;BRCA1,missense_variant,p.Gly1741Val,ENST00000493795,;BRCA1,missense_variant,p.Gly646Val,ENST00000352993,;BRCA1,missense_variant,p.Gly605Val,ENST00000351666,;BRCA1,missense_variant,p.Gly684Val,ENST00000491747,NM_007298.3;BRCA1,missense_variant,p.Gly279Val,ENST00000591534,;BRCA1,missense_variant,p.Gly98Val,ENST00000586385,;BRCA1,missense_variant,p.Gly21Val,ENST00000591849,;BRCA1,intron_variant,,ENST00000468300,NM_007299.3;BRCA1,3_prime_UTR_variant,,ENST00000461221,; A ENSG00000012048 ENST00000357654 Transcript missense_variant 5482/7094 5363/5592 1788 G/V gGt/gTt rs80357069,COSM436662 1 BRCA1 HGNC 1100 protein_coding CCDS11453.1 ENSP00000350283 BRCA1_HUMAN 
Q9UE29_HUMAN,Q9NQR3_HUMAN,Q92897_HUMAN,Q7KYU6_HUMAN,Q4EW25_HUMAN,Q3YB53_HUMAN,Q3YB50_HUMAN,Q3YB49_HUMAN,Q3LRH8_HUMAN,Q3B891_HUMAN,K7EPC7_HUMAN,K4K7V3_HUMAN,K4JXS7_HUMAN,K4JUB1_HUMAN,G4V503_HUMAN,G4V502_HUMAN,G4V500_HUMAN,G4V4Z8_HUMAN,G4V4Z7_HUMAN,G1UI37_HUMAN,E9PFZ0_HUMAN,E7EWN5_HUMAN,E7EP70_HUMAN,C9IZW4_HUMAN,C4PFY7_HUMAN UPI0000126AC8 NM_007294.3 deleterious(0) benign(0.031) 21/23 Gene3D:3.40.50.10190,Pfam_domain:PF00533,PIRSF_domain:PIRSF001734,Prints_domain:PR00493,PROSITE_profiles:PS50172,hmmpanther:PTHR13763,hmmpanther:PTHR13763:SF0,SMART_domains:SM00292,Superfamily_domains:SSF52113 not_provided,pathogenic 0,1 MODERATE SNV 1,1 17 41201181 41201181 C A SNP BRCA1 NM_007294 human genbank 57_37b -1 reviewed missense c.5363 p.G1788V 1 HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain superfamily_RING/U-box,HMMSmart_SM00184,HMMPfam_zf-C3HC4,PatternScan_ZF_RING_1,HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain - 17 41201181 41201181 C A SNP BRCA1 NM_007294 human genbank 57_37b -1 reviewed missense c.5363 p.G1788V 1 HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain superfamily_RING/U-box,HMMSmart_SM00184,HMMPfam_zf-C3HC4,PatternScan_ZF_RING_1,HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain - +ATM genome.wustl.edu GRCh37 11 108173702 108173702 0 Frame_Shift_Del DEL G G - TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 - G Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx ENST00000278616.4:c.5443delG p.Asp1815ThrfsTer13 p.D1815Tfs*13 ENST00000278616 36/63 0 0 ATM,frameshift_variant,p.Asp1815ThrfsTer13,ENST00000278616,NM_000051.3;ATM,frameshift_variant,p.Asp1815ThrfsTer13,ENST00000452508,;ATM,non_coding_transcript_exon_variant,,ENST00000524792,;ATM,non_coding_transcript_exon_variant,,ENST00000533690,;ATM,non_coding_transcript_exon_variant,,ENST00000534625,;ATM,upstream_gene_variant,,ENST00000529588,; - ENSG00000149311 ENST00000278616 Transcript frameshift_variant 5827/13147 5442/9171 1814 L/X ttG/tt rs772138812 1 ATM HGNC 795 protein_coding YES CCDS31669.1 
ENSP00000278616 ATM_HUMAN M0QXY8_HUMAN,E9PRG7_HUMAN,E9PIN0_HUMAN UPI0000DBEF44 NM_000051.3 36/63 hmmpanther:PTHR11139,hmmpanther:PTHR11139:SF66 HIGH 1 deletion 1 11 108173702 108173702 G - DEL ATM NM_000051.3 human genbank 58_37c 1 reviewed frame_shift_del c.5442 p.D1815fs 1 superfamily_ARM repeat superfamily_ARM repeat|7CHMMPfam_FAT|7Csuperfamily_Protein kinase-like (PK-like)|7CHMMPfam_PI3_PI4_kinase|7CHMMSmart_SM00146|7CPatternScan_PI3_4_KINASE_1|7CPatternScan_PI3_4_KINASE_2|7CHMMPfam_FATC - 11 108173702 108173702 G - DEL ATM NM_000051.3 human genbank 58_37c 1 reviewed frame_shift_del c.5442 p.D1815fs 1 superfamily_ARM repeat superfamily_ARM repeat|7CHMMPfam_FAT|7Csuperfamily_Protein kinase-like (PK-like)|7CHMMPfam_PI3_PI4_kinase|7CHMMSmart_SM00146|7CPatternScan_PI3_4_KINASE_1|7CPatternScan_PI3_4_KINASE_2|7CHMMPfam_FATC - +ATM genome.wustl.edu GRCh37 11 108106472 108106472 0 Frame_Shift_Del DEL T T - novel unknown TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx ENST00000278616.4:c.409delT p.Tyr137ThrfsTer16 p.Y137Tfs*16 ENST00000278616 May-63 0 0 ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000278616,NM_000051.3;ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000452508,;ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000527805,;ATM,intron_variant,,ENST00000527891,;ATM,downstream_gene_variant,,ENST00000601453,;ATM,non_coding_transcript_exon_variant,,ENST00000530958,; - ENSG00000149311 ENST00000278616 Transcript frameshift_variant 792/13147 407/9171 136 I/X aTt/at COSM428356,COSM1474979 1 ATM HGNC 795 protein_coding YES CCDS31669.1 ENSP00000278616 ATM_HUMAN M0QXY8_HUMAN,E9PRG7_HUMAN,E9PIN0_HUMAN UPI0000DBEF44 NM_000051.3 May-63 Pfam_domain:PF11640,hmmpanther:PTHR11139,hmmpanther:PTHR11139:SF66 1,1 HIGH 1 deletion 2 1,1 11 108106472 108106472 T - DEL ATM NM_000051 human genbank 57_37b 1 reviewed frame_shift_del c.407 p.Y137fs 0.013 superfamily_ARM repeat,HMMPfam_FAT,superfamily_Protein kinase-like 
(PK-like),HMMPfam_PI3_PI4_kinase,HMMSmart_SM00146,PatternScan_PI3_4_KINASE_1,PatternScan_PI3_4_KINASE_2,HMMPfam_FATC (deletion:cds_exon[108106397,108106561]) 11 108106472 108106472 T - DEL ATM NM_000051 human genbank 57_37b 1 reviewed frame_shift_del c.407 p.Y137fs 0.013 superfamily_ARM repeat,HMMPfam_FAT,superfamily_Protein kinase-like (PK-like),HMMPfam_PI3_PI4_kinase,HMMSmart_SM00146,PatternScan_PI3_4_KINASE_1,PatternScan_PI3_4_KINASE_2,HMMPfam_FATC (deletion:cds_exon[108106397,108106561]) +KAT2A genome.wustl.edu GRCh37 17 40272381 40272381 -1 Silent SNP G G A TCGA-BH-NEW-01 TCGA-BH-NEW-10 G G Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.471C>T p.= p.H157H ENST00000225916 3/18 0 0 KAT2A,synonymous_variant,p.=,ENST00000225916,NM_021078.2;CTD-2132N18.3,synonymous_variant,p.=,ENST00000592574,;RAB5C,downstream_gene_variant,,ENST00000393860,NM_201434.2;RAB5C,downstream_gene_variant,,ENST00000346213,NM_004583.3;HSPB9,upstream_gene_variant,,ENST00000355067,NM_033194.2;CTD-2132N18.3,missense_variant,p.Thr150Met,ENST00000592248,;KAT2A,synonymous_variant,p.=,ENST00000465682,;CTD-2132N18.3,3_prime_UTR_variant,,ENST00000585562,;KAT2A,upstream_gene_variant,,ENST00000592310,;KAT2A,upstream_gene_variant,,ENST00000588759,; A ENSG00000108773 ENST00000225916 Transcript synonymous_variant 525/3109 471/2514 157/837 H caC/caT rs536716483,COSM1479581 1 KAT2A HGNC 4201 protein_coding YES CCDS11417.1 ENSP00000225916 KAT2A_HUMAN K7ERS6_HUMAN UPI000000D978 NM_021078.2 3/18 hmmpanther:PTHR22880:SF124,hmmpanther:PTHR22880,Pfam_domain:PF06466,PIRSF_domain:PIRSF003048 A:0.0002 A:0 A:0 A:0.001 A:0 A:0 0,1 LOW 1 SNV 0,1 17 40272381 40272381 G A SNP ENSG00000267261 ENST00000592248 human ensembl 69_37n -1 known missense c.449 p.T150M 486 smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho 
pfam_Small_GTPase,pfam_MIRO-like,pfam_Small_GTPase_ARF/SAR,pfam_Gtr1_RagA,smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho,prints_Small_GTPase,tigrfam_Small_GTP-bd_dom - no_errors CTD-2132N18.3 Clone_based_vega_gene ENSG00000267261 40 0 0 30 36 54.55 - - - 17 40272381 40272381 G A SNP ENSG00000267261 ENST00000592248 human ensembl 69_37n -1 known missense c.449 p.T150M 486 smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho pfam_Small_GTPase,pfam_MIRO-like,pfam_Small_GTPase_ARF/SAR,pfam_Gtr1_RagA,smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho,prints_Small_GTPase,tigrfam_Small_GTP-bd_dom - no_errors CTD-2132N18.3 Clone_based_vega_gene - - - Putative_Driver Test driver Class 1 Class annotation +MSH3 genome.wustl.edu GRCh37 5 80024722 80024722 1 Frame_Shift_Del DEL T T - TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.1508delT p.Leu503TrpfsTer5 p.L503Wfs*5 ENST00000265081 10/24 0 0 MSH3,frameshift_variant,p.Leu503TrpfsTer5,ENST00000265081,NM_002439.4;MSH3,non_coding_transcript_exon_variant,,ENST00000512258,; - ENSG00000113318 ENST00000265081 Transcript frameshift_variant 1586/4092 1506/3414 502/1137 S/X tcT/tc 1 MSH3 HGNC 7326 protein_coding YES CCDS34195.1 ENSP00000265081 MSH3_HUMAN UPI0000DBEE85 NM_002439.4 10/24 Superfamily_domains:SSF53150,Gene3D:3.30.420.110,Pfam_domain:PF05188,hmmpanther:PTHR11361,hmmpanther:PTHR11361:SF34 HIGH 1 deletion 2 5 80024722 80024722 T - DEL MSH3 ENST00000265081 human ensembl 69_37n 1 known frame_shift_del c.1506 p.L503fs 998 pfam_DNA_mismatch_repair_MutS_connt,superfamily_DNA_mismatch_repair_MutS_connt 
pfam_DNA_mismatch_repair_MutS_C,pfam_DNA_mismatch_repair_MutS_core,pfam_DNA_mismatch_repair_MutS_connt,pfam_DNA_mismatch_repair_MutS-lik_N,pfam_DNA_mismatch_repair_MutS_clamp,superfamily_DNA_mismatch_repair_MutS_core,superfamily_DNA_mismatch_repair_MutS_N,superfamily_DNA_mismatch_repair_MutS_connt,smart_DNA_mismatch_repair_MutS_core,smart_DNA_mismatch_repair_MutS_C - no_errors MSH3 HGNC ENSG00000113318 83 0 0 12 2 14.29 - - - 5 80024722 80024722 T - DEL MSH3 ENST00000265081 human ensembl 69_37n 1 known frame_shift_del c.1506 p.L503fs 998 pfam_DNA_mismatch_repair_MutS_connt,superfamily_DNA_mismatch_repair_MutS_connt pfam_DNA_mismatch_repair_MutS_C,pfam_DNA_mismatch_repair_MutS_core,pfam_DNA_mismatch_repair_MutS_connt,pfam_DNA_mismatch_repair_MutS-lik_N,pfam_DNA_mismatch_repair_MutS_clamp,superfamily_DNA_mismatch_repair_MutS_core,superfamily_DNA_mismatch_repair_MutS_N,superfamily_DNA_mismatch_repair_MutS_connt,smart_DNA_mismatch_repair_MutS_core,smart_DNA_mismatch_repair_MutS_C - no_errors MSH3 HGNC - - - Putative_Passenger Test passenger Class 3 Class annotation +MYB genome.wustl.edu GRCh37 6 135507043 135507044 1 Frame_Shift_Ins INS - - A TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 - - Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.27dupA p.Tyr10IlefsTer2 p.Y10Ifs*2 ENST00000367814 2/15 0 0 
MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000341911,NM_001130173.1,NM_001161658.1,NM_001161656.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000316528,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000442647,NM_001161660.1,NM_001130172.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000367814,NM_001161659.1,NM_005375.2;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525369,NM_001161657.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000527615,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528774,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000534121,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533624,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000534044,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000420123,;MYB,upstream_gene_variant,,ENST00000430686,;MYB,non_coding_transcript_exon_variant,,ENST00000531845,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000367812,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533837,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000438901,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525477,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000463282,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000339290,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533808,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525514,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000529586,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526889,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526320,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000531519,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533384,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000531737,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000529262,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526565,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528015,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526187,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525002,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528343,;MYB,frameshift_vari
ant,p.Tyr10IlefsTer2,ENST00000528140,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528345,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525940,;MYB,frameshift_variant,p.Tyr10Ter,ENST00000531634,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000524588,; A ENSG00000118513 ENST00000367814 Transcript frameshift_variant 212-213/3302 26-27/1923 9/640 I/IX ata/atAa COSM1487247,COSM1487248 1 MYB HGNC 7545 protein_coding CCDS5174.1 ENSP00000356788 MYB_HUMAN Q9UMI7_HUMAN,Q708J0_HUMAN,Q708E9_HUMAN,Q708E3_HUMAN UPI000012FAEA NM_001161659.1,NM_005375.2 2/15 hmmpanther:PTHR10641,hmmpanther:PTHR10641:SF454 1,1 HIGH insertion 1 1,1 6 135507043 135507044 - A INS MYB ENST00000341911 human ensembl 69_37n 1 known frame_shift_ins c.26_27 p.Y10fs 1.000:0.997 pfam_C-myb_C,pfam_SANT/Myb,pfam_Tscrpt_reg_Wos2-domain,superfamily_Homeodomain-like,smart_SANT/Myb,pfscan_Myb-like_dom - no_errors MYB HGNC ENSG00000118513 50 0 0 36 4 10 - - - 6 135507043 135507044 - A INS MYB ENST00000341911 human ensembl 69_37n 1 known frame_shift_ins c.26_27 p.Y10fs 1.000:0.997 pfam_C-myb_C,pfam_SANT/Myb,pfam_Tscrpt_reg_Wos2-domain,superfamily_Homeodomain-like,smart_SANT/Myb,pfscan_Myb-like_dom - no_errors MYB HGNC - - - Putative_Passenger Test passenger +PIEZO1 genome.wustl.edu GRCh37 16 88790292 88790292 -1 Missense_Mutation SNP T T C TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx low 1.18 Q1441R getma.org/?cm=msa&ty=f&p=PIEZ1_HUMAN&rb=58&re=1627&var=Q1441R getma.org/?cm=var&var=hg19,16,88790292,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.4322A>G p.Gln1441Arg p.Q1441R ENST00000301015 31/51 0 0 
PIEZO1,missense_variant,p.Gln1441Arg,ENST00000301015,NM_001142864.2;PIEZO1,missense_variant,p.Gln115Arg,ENST00000474606,;PIEZO1,upstream_gene_variant,,ENST00000327397,;PIEZO1,upstream_gene_variant,,ENST00000466823,;RP5-1142A6.9,downstream_gene_variant,,ENST00000564984,;PIEZO1,non_coding_transcript_exon_variant,,ENST00000566414,;PIEZO1,upstream_gene_variant,,ENST00000419505,;PIEZO1,upstream_gene_variant,,ENST00000497793,;PIEZO1,upstream_gene_variant,,ENST00000495568,;PIEZO1,downstream_gene_variant,,ENST00000475586,;PIEZO1,downstream_gene_variant,,ENST00000491917,; C ENSG00000103335 ENST00000301015 Transcript missense_variant 4569/8072 4322/7566 1441/2521 Q/R cAg/cGg COSM1479166 1 PIEZO1 HGNC 28993 protein_coding YES CCDS54058.1 ENSP00000301015 PIEZ1_HUMAN UPI0001B300F3 NM_001142864.2 tolerated(0.25) possibly_damaging(0.78) 31/51 hmmpanther:PTHR13167,hmmpanther:PTHR13167:SF40 1 MODERATE 1 SNV 1 16 88790292 88790292 T C SNP PIEZO1 ENST00000301015 human ensembl 69_37n -1 novel missense c.4322 p.Q1441R 1000 pfam_DUF3595 - no_errors PIEZO1 HGNC ENSG00000103335 37 0 0 20 8 28.57 - - - 16 88790292 88790292 T C SNP PIEZO1 ENST00000301015 human ensembl 69_37n -1 novel missense c.4322 p.Q1441R 1000 pfam_DUF3595 - no_errors PIEZO1 HGNC - - - Putative_Passenger Test passenger Class 3 Class annotation +BRCA2 genome.wustl.edu GRCh37 13 108106473 108106473 0 Nonsense_Mutation SNP T T C TCGA-BH-NEW-01 TCGA-BH-NEW-10 Germline p.D191G BRCA2_HUMAN +BRCA2 genome.wustl.edu GRCh37 13 108106474 108106474 0 Nonsense_Mutation SNP T T C TCGA-BH-NEW-01 TCGA-BH-NEW-10 Somatic p.D191G BRCA2_HUMAN +BRCA2 genome.wustl.edu GRCh37 13 108106475 108106475 0 In_Frame_Del DEL T T - TCGA-A1-A0SK-01 TCGA-A1-A0SK-10 Germline p.R2659T BRCA2_HUMAN diff --git a/tests/test_data/study_es_0_inc/data_structural_variants.txt b/tests/test_data/study_es_0_inc/data_structural_variants.txt new file mode 100644 index 00000000..db82553f --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_structural_variants.txt @@ 
-0,0 +1,10 @@ +Sample_Id Site1_Entrez_Gene_Id Site1_Hugo_Symbol Site1_Ensembl_Transcript_Id Site1_Region_Number Site1_Chromosome Site1_Position Site1_Region Site1_Description Site2_Entrez_Gene_Id Site2_Hugo_Symbol Site2_Ensembl_Transcript_Id Site2_Region_Number Site2_Chromosome Site2_Position Site2_Contig Site2_Region Site2_Description Site2_Effect_On_Frame NCBI_Build DNA_Support RNA_Support Normal_Read_Count Tumor_Read_Count Normal_Variant_Count Tumor_Variant_Count Normal_Paired_End_Read_Count Tumor_Paired_End_Read_Count Normal_Split_Read_Count Tumor_Split_Read_Count Annotation Breakpoint_Type Center Connection_Type Event_Info Class SV_Length Comments External_Annotation cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation SV_Status StructVarNs.column1 StructVarNs2.lorem StructVarNs.column2 +TCGA-BH-NEW NA PIEZO1 ENST00000242365 15 7 138536968 EXON PIEZO1-NCOA4.K16B10.COSF509_1 NA NCOA4 ENST00000288602 10 7 140482957 EXON PIEZO1-NCOA4.PIEZO1.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA PIEZO1-NCOA4.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Foo Class 4 Class annotation SOMATIC value1 ipsum value2 +TCGA-BH-NEW NA KIAA1549 ENST00000242365 15 7 138536968 EXON KIAA1549-BRAF.K16B10.COSF509_1 NA BRAF ENST00000288602 10 7 140482957 EXON KIAA1549-BRAF.K16B10.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA KIAA1549-BRAF.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Class 4 Class annotation SOMATIC value1 ipsum value2 +TCGA-A1-A0SB-03 NA NCOA4 ENST00000344348 7 10 51582939 EXON NCOA4-RET.N7R12_1 NA RET ENST00000340058 12 10 43612031 EXON NCOA4-RET.N7R12_2 NA GRCh37 no yes NA 1001 NA 800 NA NA NA NA NCOA4-RET.N7R1 NA NA NA Fusion NA NA Gain-of-Function NA Putative_Passenger Test driver Class 3 Class annotation SOMATIC +TCGA-BH-NEW NA EML4 ENST00000318522 6 2 42492091 EXON EML4-ALK.E6bA20.AB374362_1 NA ALK ENST00000389048 20 2 
29446394 EXON EML4-ALK.E6bA20.AB374362_2 NA GRCh37 no yes NA 1002 NA 700 NA NA NA NA EML4-ALK.E6bA20.AB374362 NA NA NA Fusion NA NA Gain-of-Function GENBANK:AB374362 Putative_Driver Test driver Class 2 Class annotation SOMATIC +TCGA-BH-NEW NA TMPRSS2 ENST00000332149 1 21 42880007 EXON TMPRSS2-ERG.T1E2.COSF23.1_1 NA ERG ENST00000442448 2 21 39956869 EXON TMPRSS2-ERG.T1E2.COSF23.1_2 NA GRCh37 no yes NA 1003 NA 600 NA NA NA NA TMPRSS2-ERG.T1E2.COSF23.1 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF23 Unknown Test driver Class 1 Class annotation SOMATIC +TCGA-A1-A0SB-01 NA EGFR ENST00000275493 1 7 55087058 EXON EGFR-EGFR.E1E8.DelPositive.1_1 NA EGFR ENST00000275493 8 7 55223522 EXON EGFR-EGFR.E1E8.DelPositive.1_2 NA GRCh37 no yes NA 1004 NA 500 NA NA NA NA EGFR-EGFR.E1E8.DelPositive NA NA NA Fusion NA NA NA NA Putative_Driver Test driver Unknown Class annotation SOMATIC +TCGA-BH-NEW NA ALK ENST00000389048 11 2 29497964 EXON ALK-PTPN3.A11P3_1 NA PTPN3 ENST00000374541 3 9 112219679 EXON ALK-PTPN3.A11P3_2 NA GRCh37 no yes NA 1005 NA 400 NA NA NA NA ALK-PTPN3.A11P3 NA NA NA Fusion NA NA NA NA NA NA NA NA SOMATIC +TCGA-A1-A0SB-01 NA EML4 ENST00000318522 13 2 42522656 EXON EML4-ALK.E13A20.AB462411_1 NA ALK ENST00000389048 20 2 29446335 EXON EML4-ALK.E13A20.AB462411_2 NA GRCh37 no yes NA 1006 NA 300 NA NA NA NA EML4-ALK.E13A20 NA NA NA Fusion NA NA Gain-of-Function GENBANK:AB462411 NA NA NA NA SOMATIC +TCGA-A1-A0SB-03 NA TMPRSS2 ENST00000455813 1 21 42870045 EXON TMPRSS2-ETV1.T1bE4_1 NA ETV1 ENST00000405358 4 7 14017105 EXON TMPRSS2-ETV1.T1bE4_2 NA GRCh37 no yes NA 1007 NA 200 NA NA NA NA TMPRSS2-ETV1.T1bE4 NA NA NA Fusion NA NA NA NA NA NA NA NA SOMATIC diff --git a/tests/test_data/study_es_0_inc/data_timeline.txt b/tests/test_data/study_es_0_inc/data_timeline.txt new file mode 100644 index 00000000..e950603c --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_timeline.txt @@ -0,0 +1,4 @@ +PATIENT_ID START_DATE STOP_DATE EVENT_TYPE SPECIMEN_SITE SPECIMEN_TYPE SOURCE 
+TCGA-BH-A18K 20 60 SPECIMEN test_specimen_site_1 test_specimen_type test_source_3 +TCGA-BH-A18K 10 20 STATUS test_source_4 +TCGA-BH-NEW 100 200 STATUS test_source_1 diff --git a/tests/test_data/study_es_0_inc/data_treatment_ic50.txt b/tests/test_data/study_es_0_inc/data_treatment_ic50.txt new file mode 100644 index 00000000..806799de --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_treatment_ic50.txt @@ -0,0 +1,10 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +17-AAG Name of 17-AAG Desc of 17-AAG Url of 17-AAG 0.315 0.329701692 0.053038094 +AEW541 Name of AEW541 Desc of AEW541 Url of AEW541 >8 2.353 2.68212986 +AZD0530 Name of AZD0530 Desc of AZD0530 Url of AZD0530 0.234 >8 4.597949505 +AZD6244 Name of AZD6244 Desc of AZD6244 Url of AZD6244 >8 >8 >8 +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 >8 >8 +Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan NA 0.083 NA +L-685458 Name of L-685458 Desc of L-685458 Url of L-685458 >8 >8 3.267752409 +LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 NA >8 >8 +Nilotinib Name of Nilotinib Desc of Nilotinib Url of Nilotinib >8 >8 NA diff --git a/tests/test_data/study_es_0_inc/meta_clinical_patients.txt b/tests/test_data/study_es_0_inc/meta_clinical_patients.txt new file mode 100644 index 00000000..5ff93a44 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_clinical_patients.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: CLINICAL +datatype: PATIENT_ATTRIBUTES +data_filename: data_clinical_patients.txt diff --git a/tests/test_data/study_es_0_inc/meta_clinical_samples.txt b/tests/test_data/study_es_0_inc/meta_clinical_samples.txt new file mode 100644 index 00000000..7e4f6741 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_clinical_samples.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: data_clinical_samples.txt 
diff --git a/tests/test_data/study_es_0_inc/meta_cna_discrete.txt b/tests/test_data/study_es_0_inc/meta_cna_discrete.txt new file mode 100644 index 00000000..f6ea8bea --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_discrete.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. +profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete.txt +pd_annotations_filename: data_cna_pd_annotations.txt +namespaces: CustomNamespace diff --git a/tests/test_data/study_es_0_inc/meta_cna_hg19_seg.txt b/tests/test_data/study_es_0_inc/meta_cna_hg19_seg.txt new file mode 100644 index 00000000..f17e1657 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_hg19_seg.txt @@ -0,0 +1,6 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: SEG +reference_genome_id: hg19 +description: Somatic CNA data (copy number ratio from tumor samples minus ratio from matched normals) from TCGA. +data_filename: data_cna_hg19.seg diff --git a/tests/test_data/study_es_0_inc/meta_cna_log2.txt b/tests/test_data/study_es_0_inc/meta_cna_log2.txt new file mode 100644 index 00000000..74a07b8e --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_log2.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: LOG2-VALUE +stable_id: log2CNA +show_profile_in_analysis_tab: false +profile_description: Log2 copy-number values for each gene (from Affymetrix SNP6). 
+profile_name: Log2 copy-number values +data_filename: data_cna_log2.txt diff --git a/tests/test_data/study_es_0_inc/meta_expression_median.txt b/tests/test_data/study_es_0_inc/meta_expression_median.txt new file mode 100644 index 00000000..1e2fc6a7 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_expression_median.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: MRNA_EXPRESSION +datatype: CONTINUOUS +stable_id: mrna +profile_description: Expression levels (Agilent microarray). +show_profile_in_analysis_tab: false +profile_name: mRNA expression (microarray) +data_filename: data_expression_median.txt diff --git a/tests/test_data/study_es_0_inc/meta_gene_panel_matrix.txt b/tests/test_data/study_es_0_inc/meta_gene_panel_matrix.txt new file mode 100644 index 00000000..440f19c3 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_gene_panel_matrix.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: GENE_PANEL_MATRIX +datatype: GENE_PANEL_MATRIX +data_filename: data_gene_panel_matrix.txt diff --git a/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt b/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt new file mode 100644 index 00000000..582b12e9 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: METHYLATION +datatype: CONTINUOUS +stable_id: methylation_hm27 +profile_description: Methylation beta-values (HM27 platform). For genes with multiple methylation probes, the probe least correlated with expression is selected. 
+show_profile_in_analysis_tab: false +profile_name: Methylation (HM27) +data_filename: data_methylation_hm27.txt diff --git a/tests/test_data/study_es_0_inc/meta_mutations_extended.txt b/tests/test_data/study_es_0_inc/meta_mutations_extended.txt new file mode 100644 index 00000000..94df92aa --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_mutations_extended.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: MUTATION_EXTENDED +datatype: MAF +stable_id: mutations +show_profile_in_analysis_tab: true +profile_description: Mutation data from whole exome sequencing. +profile_name: Mutations +data_filename: data_mutations_extended.maf +swissprot_identifier: name +namespaces: Zygosity diff --git a/tests/test_data/study_es_0_inc/meta_structural_variants.txt b/tests/test_data/study_es_0_inc/meta_structural_variants.txt new file mode 100644 index 00000000..b62d3cbd --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_structural_variants.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: STRUCTURAL_VARIANT +datatype: SV +data_filename: data_structural_variants.txt +stable_id: structural_variants +profile_name: Targeted Fusion Assay data (Fake data) +profile_description: Targeted Fusion Assay data +show_profile_in_analysis_tab: true +gene_panel: TESTPANEL1 +namespaces: StructVarNs,StructVarNs2 diff --git a/tests/test_data/study_es_0_inc/meta_timeline.txt b/tests/test_data/study_es_0_inc/meta_timeline.txt new file mode 100644 index 00000000..51a46508 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_timeline.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: CLINICAL +datatype: TIMELINE +data_filename: data_timeline.txt diff --git a/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt new file mode 100644 index 00000000..edc5ef22 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt @@ 
-0,0 +1,12 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. +data_filename: data_treatment_ic50.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL diff --git a/tests/unit_tests_validate_data.py b/tests/unit_tests_validate_data.py index 3048a821..3089d33b 100755 --- a/tests/unit_tests_validate_data.py +++ b/tests/unit_tests_validate_data.py @@ -2297,12 +2297,10 @@ class StudyCompositionTestCase(LogBufferTestCase): def setUp(self): """Store validateData globals changed by running validate_study().""" super(StudyCompositionTestCase, self).setUp() - self.orig_defined_cancer_types = validateData.DEFINED_CANCER_TYPES self.orig_defined_sample_ids = validateData.DEFINED_SAMPLE_IDS def tearDown(self): """Restore the environment to before setUp() was called.""" - validateData.DEFINED_CANCER_TYPES = self.orig_defined_cancer_types validateData.DEFINED_SAMPLE_IDS = self.orig_defined_sample_ids super(StudyCompositionTestCase, self).tearDown() @@ -3087,5 +3085,13 @@ def test_required_field_permutations(self): self.assertEqual(logging.ERROR, record.levelno) self.assertIn('This line has no value for cbp_driver_tiers and a value for cbp_driver_tiers_annotation. Please, fill the cbp_driver_tiers column.', record.getMessage()) + def test_incremental_data_validation(self): + validateData.validate_data_dir('test_data/study_es_0_inc', + PORTAL_INSTANCE, + self.logger, False, False) + record_list = self.get_log_records() + self.assertEqual('Validation complete', record_list[-1].getMessage()) + + if __name__ == '__main__': unittest.main(buffer=True)