From 7e42f947fbbd1c030f166459d082ae7fa70c1592 Mon Sep 17 00:00:00 2001 From: Daniel Suveges Date: Mon, 16 Jan 2023 16:08:12 +0000 Subject: [PATCH] Do pics (#57) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: experimental LD using hail * feat: handling of the lower triangle * docs: more comments explaining what's going on * feat: non-working implementation of ld information * feat: ld information based on precomputed index * fix: no longer necessary * feat: missing ld.py added * docs: intervals functions examples * fix: typo * refactor: larger scale update in the GWAS Catalog data ingestion * feat: precommit updated * feat: first pics implementation derived from gnomAD LD information * feat: modularise iterating over populations * feat: finalizing GWAS Catalog ingestion steps * fix: test fixed, due to changes in logic * feat: ignoring .coverage file * feat: modularised pics method * feat: integrating pics * chore: smoothing out bits * feat: cleanup of the pics logic No select statements, more concise functions and carrying over all required information * fix: slight updates * feat: map gnomad positions to ensembl positions LD * fix: use cumsum instead of postprob * feat: update studies schemas * feat: working on integrating ingestion with pics * feat: support for hail doctests * test: _query_block_matrix example * feat: ignore hail logs for testing * feat: ignore TYPE_CHECKING blocks from testing * feat: pics benchmark notebook (co-authored by Irene) * feat: new finding on r > 1 * docs: Explanation about the coordinate shift * test: several pics doctests * test: fixed test for log neg pval * style: remove variables only used for return * feat: parametrise liftover chain * refactor: consolidating some code * feat: finishing pics * chore: adding tests to effect harmonization * chore: adding more tests * feat: benchmarking new dataset * fix: resolving minor bugs around pics * Apply suggestions from code review Co-authored-by: Irene López <45119610+ireneisdoomed@users.noreply.github.com> * fix: applying review comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix: addressing some more review comments * fix: update GnomAD join * fix: bug sorted out in pics filtering * feat: abstracting QC flagging * feat: solving clumping * fix: clumping is now complete Co-authored-by: David Co-authored-by: David Ochoa Co-authored-by: Irene López <45119610+ireneisdoomed@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .coveragerc | 4 + Makefile | 33 +- configs/etl/reference.yaml | 50 +- notebooks/pics_benchmark.ipynb | 904 ++++++++ poetry.lock | 252 +++ pyproject.toml | 1 + src/conftest.py | 30 +- src/etl/common/ETLSession.py | 18 +- src/etl/common/spark_helpers.py | 62 + src/etl/common/utils.py | 1 + src/etl/gwas_ingest/association_mapping.py | 300 +++ src/etl/gwas_ingest/clumping.py | 168 ++ src/etl/gwas_ingest/effect_harmonization.py | 406 ++-- src/etl/gwas_ingest/ld.py | 404 ++++ src/etl/gwas_ingest/pics.py | 500 +++++ src/etl/gwas_ingest/process_associations.py | 544 ++--- src/etl/gwas_ingest/study_ingestion.py | 202 +- src/etl/json/data/__init__.py | 3 + .../json/data/gwas_pValueText_resolve.json | 1870 +++++++++++++++++ .../gwascat_2_gnomad_superpopulation.json | 70 + src/etl/json/schemas/studies.json | 74 +- src/etl/v2g/intervals/Liftover.py | 2 +- src/run_gwas_ingest.py | 98 +- src/run_precompute_ld_indexes.py | 32 +
tests/conftest.py | 7 +- tests/test_effect_harmonization.py | 169 +- tests/test_gwas_process_assoc.py | 319 +-- 27 files changed, 5716 insertions(+), 807 deletions(-) create mode 100644 .coveragerc create mode 100644 notebooks/pics_benchmark.ipynb create mode 100644 src/etl/gwas_ingest/association_mapping.py create mode 100644 src/etl/gwas_ingest/clumping.py create mode 100644 src/etl/gwas_ingest/ld.py create mode 100644 src/etl/gwas_ingest/pics.py create mode 100644 src/etl/json/data/__init__.py create mode 100644 src/etl/json/data/gwas_pValueText_resolve.json create mode 100644 src/etl/json/data/gwascat_2_gnomad_superpopulation.json create mode 100644 src/run_precompute_ld_indexes.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..06e4886d8 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[report] +exclude_lines = + pragma: no cover + if TYPE_CHECKING: diff --git a/Makefile b/Makefile index 8ebf7583d..02ded64ef 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ PROJECT_ID ?= open-targets-genetics-dev REGION ?= europe-west1 -CLUSTER_NAME ?= il-coloc +CLUSTER_NAME ?= ${USER}-genetics-etl PROJECT_NUMBER ?= $$(gcloud projects list --filter=${PROJECT_ID} --format="value(PROJECT_NUMBER)") APP_NAME ?= $$(cat pyproject.toml| grep name | cut -d" " -f3 | sed 's/"//g') VERSION_NO ?= $$(poetry version --short) @@ -24,6 +24,7 @@ setup-dev: ## Setup dev environment build: clean ## Build Python Package with Dependencies @echo "Packaging Code and Dependencies for ${APP_NAME}-${VERSION_NO}" + @rm -rf ./dist @poetry build @cp ./src/*.py ./dist @poetry run python ./utils/configure.py --cfg job > ./dist/config.yaml @@ -31,6 +32,19 @@ build: clean ## Build Python Package with Dependencies @gsutil cp ./dist/${APP_NAME}-${VERSION_NO}-py3-none-any.whl gs://genetics_etl_python_playground/initialisation/ @gsutil cp ./utils/initialise_cluster.sh gs://genetics_etl_python_playground/initialisation/ +prepare_pics: ## Create cluster for variant annotation + gcloud dataproc clusters create ${CLUSTER_NAME} \ + --image-version=2.0 \ + --project=${PROJECT_ID} \ + --region=${REGION} \ + --master-machine-type=n1-highmem-96 \ + --enable-component-gateway \ + --num-master-local-ssds=1 \ + --master-local-ssd-interface=NVME \ + --metadata="PACKAGE=gs://genetics_etl_python_playground/initialisation/${APP_NAME}-${VERSION_NO}-py3-none-any.whl" \ + --initialization-actions=gs://genetics_etl_python_playground/initialisation/initialise_cluster.sh \ + --single-node \ + --max-idle=10m prepare_variant_annotation: ## Create cluster for variant annotation gcloud dataproc clusters create ${CLUSTER_NAME} \ @@ -150,6 +164,23 @@ run_gwas: ## Ingest gwas dataset on a dataproc cluster gcloud dataproc jobs submit pyspark ./dist/run_gwas_ingest.py \ --cluster=${CLUSTER_NAME} \ --files=./dist/config.yaml \ + --properties='spark.jars=/opt/conda/miniconda3/lib/python3.8/site-packages/hail/backend/hail-all-spark.jar,spark.driver.extraClassPath=/opt/conda/miniconda3/lib/python3.8/site-packages/hail/backend/hail-all-spark.jar,spark.executor.extraClassPath=./hail-all-spark.jar,spark.serializer=org.apache.spark.serializer.KryoSerializer,spark.kryo.registrator=is.hail.kryo.HailKryoRegistrator' \ --py-files=gs://genetics_etl_python_playground/initialisation/${APP_NAME}-${VERSION_NO}-py3-none-any.whl \ --project=${PROJECT_ID} \ --region=${REGION} + +run_pics: ## Run pics method + gcloud dataproc jobs submit pyspark ./dist/pics_experiment.py \ + --cluster=${CLUSTER_NAME} \ + --files=./dist/config.yaml \ + 
--properties='spark.jars=/opt/conda/miniconda3/lib/python3.8/site-packages/hail/backend/hail-all-spark.jar,spark.driver.extraClassPath=/opt/conda/miniconda3/lib/python3.8/site-packages/hail/backend/hail-all-spark.jar,spark.executor.extraClassPath=./hail-all-spark.jar,spark.serializer=org.apache.spark.serializer.KryoSerializer,spark.kryo.registrator=is.hail.kryo.HailKryoRegistrator' \ + --project=${PROJECT_ID} \ + --region=${REGION} + +run_precompute_ld_index: ## Precompute ld-index information + gcloud dataproc jobs submit pyspark ./dist/run_precompute_ld_indexes.py \ + --cluster=${CLUSTER_NAME} \ + --files=./dist/config.yaml \ + --properties='spark.jars=/opt/conda/miniconda3/lib/python3.8/site-packages/hail/backend/hail-all-spark.jar,spark.driver.extraClassPath=/opt/conda/miniconda3/lib/python3.8/site-packages/hail/backend/hail-all-spark.jar,spark.executor.extraClassPath=./hail-all-spark.jar,spark.serializer=org.apache.spark.serializer.KryoSerializer,spark.kryo.registrator=is.hail.kryo.HailKryoRegistrator' \ + --project=${PROJECT_ID} \ + --region=${REGION} diff --git a/configs/etl/reference.yaml b/configs/etl/reference.yaml index 1db05f6b1..018f03c3b 100644 --- a/configs/etl/reference.yaml +++ b/configs/etl/reference.yaml @@ -83,15 +83,57 @@ variant_index: gwas_ingest: inputs: - variant_annotation: gs://ot-team/dsuveges/variant_annotation/2022-08-11 - gwas_catalog_associations: ${etl.inputs}/v2d/gwas_catalog_v1.0.2-associations_e107_r2022-09-14.tsv - gwas_catalog_studies: ${etl.inputs}/v2d/gwas-catalog-v1.0.3-studies-r2022-09-14.tsv - gwas_catalog_ancestries: ${etl.inputs}/v2d/gwas-catalog-v1.0.3-ancestries-r2022-09-14.tsv + variant_annotation: ${etl.outputs}/variant_annotation + gnomad_populations: + - id: afr + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.afr.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.afr.common.adj.ld.bm + parsed_index: ${etl.inputs}/ld/gnomad_r2.1.1.afr.common.ld.variant_indices_2mb.parquet + - id: amr + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.amr.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.amr.common.adj.ld.bm + parsed_index: ${etl.inputs}/ld/gnomad_r2.1.1.amr.common.ld.variant_indices_2mb.parquet + - id: asj + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.asj.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.asj.common.adj.ld.bm + parsed_index: ${etl.inputs}/ld/gnomad_r2.1.1.asj.common.ld.variant_indices_2mb.parquet + - id: eas + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.eas.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.eas.common.adj.ld.bm + parsed_index: ${etl.inputs}/ld/gnomad_r2.1.1.eas.common.ld.variant_indices_2mb.parquet + - id: fin + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.fin.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.fin.common.adj.ld.bm + parsed_index: ${etl.inputs}/ld/gnomad_r2.1.1.fin.common.ld.variant_indices_2mb.parquet + - id: nfe + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nfe.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nfe.common.adj.ld.bm + parsed_index: 
${etl.inputs}/ld/gnomad_r2.1.1.nfe.common.ld.variant_indices_2mb.parquet + - id: est + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.est.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.est.common.adj.ld.bm + parsed_index: ${etl.inputs}/ld/gnomad_r2.1.1.est.common.ld.variant_indices_2mb.parquet + - id: nwe + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nwe.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nwe.common.adj.ld.bm + parsed_index: ${etl.inputs}/ld/gnomad_r2.1.1.nwe.common.ld.variant_indices_2mb.parquet + - id: seu + index: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.seu.common.ld.variant_indices.ht + matrix: gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.seu.common.adj.ld.bm + parsed_index: ${etl.inputs}/ld/gnomad_r2.1.1.seu.common.ld.variant_indices_2mb.parquet + gwas_catalog_associations: ${etl.inputs}/v2d/gwas_catalog_v1.0.2-associations_e107_r2022-11-29.tsv + gwas_catalog_studies: ${etl.inputs}/v2d/gwas-catalog-v1.0.3-studies-r2022-11-29.tsv + gwas_catalog_ancestries: ${etl.inputs}/v2d/gwas-catalog-v1.0.3-ancestries-r2022-11-29.tsv summary_stats_list: ${etl.inputs}/v2d/harmonised_list.txt + grch37_to_grch38_chain: gs://hail-common/references/grch37_to_grch38.over.chain.gz outputs: gwas_catalog_associations: ${etl.outputs}/gwas_catalog_associations gwas_catalog_studies: ${etl.outputs}/gwas_catalog_studies + pics_credible_set: ${etl.outputs}/pics_credible_set parameters: ingest_unpublished_studies: False p_value_cutoff: 5e-8 + ld_window: 2_000_000 + min_r2: 0.5 + k: 6.4 # Empirical constant that can be adjusted to fit the curve, 6.4 recommended. machine: ${machine.default} diff --git a/notebooks/pics_benchmark.ipynb b/notebooks/pics_benchmark.ipynb new file mode 100644 index 000000000..47c788923 --- /dev/null +++ b/notebooks/pics_benchmark.ipynb @@ -0,0 +1,904 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmarking new PICS implementation\n", + "\n", + "The objective of this notebook is to compare the new implementation of PICS estimated on GWAS Catalog associations using gnomAD LD reference, against the previous implementation using 1000 genomes phase III LD reference. \n", + "\n", + "1. Describe the new dataset\n", + " - Number of signals covered.\n", + " - Number of signals dropped.\n", + "2. Compare with old PICS Dataset.\n", + " - Δ number of covered studies (not particularly relevant given updates in GWAS Catalog)\n", + " - Δ number of covered peaks from studies found in the old release - might see increased coverage.\n", + " - Δ in the recovered credible set: number of variants, change in posterior probability.\n", + " - Δ in the average number of credible sets.\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pyspark.sql.functions as f\n", + "import pyspark.sql.types as t\n", + "from pyspark.sql import SparkSession, DataFrame\n", + "from pyspark.sql.window import Window\n", + "\n", + "spark = SparkSession.builder.getOrCreate()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 Describing the new dataset\n", + "\n", + "1. Study count.\n", + "2. Association count.\n", + "3. Studies split.\n", + "4. Associations not resolved in LD set."
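Point 4 refers to lead variants that cannot be matched against the gnomAD LD reference. For orientation, each `gnomad_populations` entry in the configuration above pairs a Hail table of variant indices with an LD BlockMatrix, and `ld_window` / `min_r2` bound how much of that matrix is queried. The snippet below is only a minimal sketch of such a lookup, not the repository's `ld.py`; the `idx` field name, the triangular storage of the matrix and the example coordinates are assumptions based on the public gnomAD 2.1.1 LD release.

```python
# Illustrative sketch: query a per-population gnomAD LD matrix around a lead variant.
# Paths come from the gnomad_populations config; field names are assumptions.
import hail as hl
import numpy as np
from hail.linalg import BlockMatrix

hl.init()

pop = "nfe"
index_ht = hl.read_table(
    f"gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.variant_indices.ht"
)
bm = BlockMatrix.read(
    f"gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.bm"
)

# 2 Mb window (ld_window in the config) around a hypothetical GRCh37 lead position:
window = hl.parse_locus_interval("17:45000000-47000000", reference_genome="GRCh37")
in_window = index_ht.filter(window.contains(index_ht.locus))

# Variants are index-sorted by position, so the window maps to a contiguous idx range:
bounds = in_window.aggregate(
    hl.struct(lo=hl.agg.min(in_window.idx), hi=hl.agg.max(in_window.idx))
)
idx_range = list(range(bounds.lo, bounds.hi + 1))
sub = bm.filter(idx_range, idx_range).to_numpy()

# Only one triangle of the matrix is populated, so symmetrise before thresholding
# on min_r2 = 0.5 (which half is stored is an assumption here):
r = sub + sub.T - np.diag(np.diag(sub))
keep = r**2 >= 0.5
print(keep.sum(), "variant pairs above the r2 threshold")
```

Reading a contiguous `idx` slice keeps the BlockMatrix access to a single square submatrix, which is presumably what the precomputed indexes produced by `run_precompute_ld_indexes.py` are for.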
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- chromosome: string (nullable = true)\n", + " |-- variantId: string (nullable = true)\n", + " |-- studyId: string (nullable = true)\n", + " |-- position: string (nullable = true)\n", + " |-- referenceAllele: string (nullable = true)\n", + " |-- alternateAllele: string (nullable = true)\n", + " |-- pValueMantissa: float (nullable = true)\n", + " |-- pValueExponent: integer (nullable = true)\n", + " |-- beta: string (nullable = true)\n", + " |-- beta_ci_lower: double (nullable = true)\n", + " |-- beta_ci_upper: double (nullable = true)\n", + " |-- odds_ratio: string (nullable = true)\n", + " |-- odds_ratio_ci_lower: double (nullable = true)\n", + " |-- odds_ratio_ci_upper: double (nullable = true)\n", + " |-- qualityControl: array (nullable = true)\n", + " | |-- element: string (containsNull = true)\n", + " |-- sampleSize: double (nullable = true)\n", + " |-- tagVariantId: string (nullable = true)\n", + " |-- R_overall: double (nullable = true)\n", + " |-- pics_mu: double (nullable = true)\n", + " |-- pics_std: double (nullable = true)\n", + " |-- pics_postprob: double (nullable = true)\n", + " |-- pics_95_perc_credset: boolean (nullable = true)\n", + " |-- pics_99_perc_credset: boolean (nullable = true)\n", + " |-- hasResolvedCredibleSet: boolean (nullable = false)\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 82:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-RECORD 0------------------------------------------\n", + " chromosome | 6 \n", + " variantId | 6_13215826_A_G \n", + " studyId | GCST000101_1 \n", + " position | 13215826 \n", + " referenceAllele | A \n", + " alternateAllele | G \n", + " pValueMantissa | 3.0 \n", + " pValueExponent | -6 \n", + " beta | null \n", + " beta_ci_lower | null \n", + " beta_ci_upper | null \n", + " odds_ratio | null \n", + " odds_ratio_ci_lower | null \n", + " odds_ratio_ci_upper | null \n", + " qualityControl | [Subsignificant p-value] \n", + " sampleSize | 1094.0 \n", + " tagVariantId | 6_13215826_A_G \n", + " R_overall | 1.0 \n", + " pics_mu | 5.522878745280337 \n", + " pics_std | 0.0 \n", + " pics_postprob | 0.12718888994626093 \n", + " pics_95_perc_credset | true \n", + " pics_99_perc_credset | true \n", + " hasResolvedCredibleSet | true \n", + "only showing top 1 row\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "new_study_locus = (\n", + " spark.read.parquet(\"gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/pics_credible_set/\")\n", + " .withColumn('pics_99_perc_credset', f.when(f.col('tagVariantId').isNull(), False).otherwise(f.col('pics_99_perc_credset')))\n", + " .withColumn(\n", + " 'hasResolvedCredibleSet', \n", + " f.when(\n", + " f.array_contains(\n", + " f.collect_set(f.col('pics_99_perc_credset')).over(Window.partitionBy('studyId', 'variantId')), \n", + " True\n", + " ),\n", + " True\n", + " ).otherwise(False)\n", + " )\n", + " .persist()\n", + ")\n", + "\n", + "\n", + "new_study_locus.printSchema()\n", + "new_study_locus.show(1, False, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/12/19 11:45:49 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache 
already cached data.\n", + "[Stage 224:============================================> (173 + 16) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Study count: 35956\n", + "Association (unique study/variant pairs) count: 433108\n", + "Associations with resolved credible set: 381056 (88.0%)\n", + "Number of good (non-flagged) associations without resolved credible set: 39763 (9.2%)\n", + "Number of good (non-flagged) associations with resolved credible set: 260736 (60.2%)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "study_count = new_study_locus.select('studyId').distinct().count()\n", + "association_count = new_study_locus.select('studyId', 'variantId').distinct().count()\n", + "association_w_credible_set = new_study_locus.filter(f.col('hasResolvedCredibleSet')).persist()\n", + "credible_set_count = association_w_credible_set.select('studyId', 'variantId').distinct().count()\n", + "failed_w_ld = (\n", + " new_study_locus\n", + " # Selecting good associations without credible sets:\n", + " .filter(\n", + " (~f.col('hasResolvedCredibleSet')) & \n", + " (f.size(f.col('qualityControl'))>0)\n", + " )\n", + " # Get associations:\n", + " .select('studyId', 'variantId')\n", + " .distinct()\n", + " .count()\n", + ")\n", + "good_association_count = (\n", + " association_w_credible_set\n", + " # Drop failed associations:\n", + " .filter(f.size(f.col('qualityControl')) == 0)\n", + " .select('studyId', 'variantId')\n", + " .distinct()\n", + " .count()\n", + ")\n", + "\n", + "print(f'Study count: {study_count}')\n", + "print(f'Association (unique study/variant pairs) count: {association_count}')\n", + "print(f'Associations with resolved credible set: {credible_set_count} ({round(credible_set_count/association_count*100, 1)}%)')\n", + "print(f'Number of good (non-flagged) associations without resolved credible set: {failed_w_ld} ({round(failed_w_ld/association_count*100, 1)}%)')\n", + "print(f'Number of good (non-flagged) associations with resolved credible set: {good_association_count} ({round(good_association_count/association_count*100, 1)}%)')\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Focusing only on the actual credible sets." 
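Before looking at the credible sets themselves, a note on the `pics_mu`, `pics_std`, `pics_postprob` and `pics_*_perc_credset` columns in the schema above: they follow the PICS approach (Farh et al., 2015), with the `k: 6.4` parameter from the configuration as the empirical exponent. The sketch below is illustrative only; the names and the exact credible-set cut-off rule are assumptions, not the code in `src/etl/gwas_ingest/pics.py`.

```python
# Illustrative sketch of the PICS calculation (Farh et al. 2015), using k = 6.4.
import numpy as np
from scipy.stats import norm


def pics_posteriors(neglog_p_lead: float, r2: np.ndarray, k: float = 6.4) -> np.ndarray:
    """Posterior probability that each tag variant is the causal one."""
    r = np.sqrt(r2)
    mu = r2 * neglog_p_lead                                # expected -log10(p) at the tag
    std = np.sqrt(1.0 - r**k) * np.sqrt(neglog_p_lead) / 2.0
    std = np.clip(std, 1e-4, None)                         # the lead itself (r2 = 1) has std = 0
    density = norm(mu, std).pdf(neglog_p_lead)             # likelihood of the observed lead signal
    return density / density.sum()


def credible_set_mask(postprob: np.ndarray, coverage: float = 0.99) -> np.ndarray:
    """Smallest set of tags whose cumulative posterior probability reaches `coverage`."""
    order = np.argsort(-postprob)
    cumulative = np.cumsum(postprob[order])
    mask = np.empty_like(postprob, dtype=bool)
    # keep every tag up to and including the first one that reaches the coverage:
    mask[order] = np.concatenate(([True], cumulative[:-1] < coverage))
    return mask


# Example: lead with p = 1e-8 and four tag variants in decreasing LD with it.
post = pics_posteriors(8.0, np.array([0.95, 0.85, 0.6, 0.3]))
print(post.round(3), credible_set_mask(post))
```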
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/12/19 11:57:56 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of resolved credible sets: 381056\n", + "Studies with resolved credible sets: 33723\n", + "Number of lead/tag pairs: 18722043\n" + ] + } + ], + "source": [ + "# Thu\n", + "credible_sets = new_study_locus.filter(f.col('pics_99_perc_credset')).persist()\n", + "resolved_assoc_count = credible_sets.select('studyId', 'variantId').distinct().count()\n", + "resolved_study_count = credible_sets.select('studyId').distinct().count()\n", + "lead_tag_pair_count = credible_sets.select('studyId', 'variantId', 'tagVariantId').distinct().count()\n", + "\n", + "grouped_credset_pdf = credible_sets.groupBy('studyId', 'variantId').count().toPandas()\n", + "\n", + "print(f'Number of resolved credible sets: {resolved_assoc_count}')\n", + "print(f'Studies with resolved credible sets: {resolved_study_count}')\n", + "print(f'Number of lead/tag pairs: {lead_tag_pair_count}')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAD4CAYAAADsKpHdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAARlUlEQVR4nO3dbYxc5XnG8f9VmxAHAuElrBBGXSKstLw0TbAoLVW0qtPihijmA0iWkuBWriwhkpIWKTKN1KgfLEFVQgIqSFZIMZQGKElkK4g2yGRVVQITE0iNcVycQMHBxaEQglEhmN79MM+S8bJej9dr7+7M/yeN5sw95zl77pHg2uc5Z8epKiRJ+rWZPgFJ0uxgIEiSAANBktQYCJIkwECQJDXzZ/oEpurkk0+u4eHhKY197bXXOOaYY6b3hGY5ex4M9jwYDqXnRx999MWqev9E783ZQBgeHmbz5s1TGjs6OsrIyMj0ntAsZ8+DwZ4Hw6H0nOS/9veeS0aSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkYEADYctPX2F49X0Mr75vpk9FkmaNgQwESdI7GQiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNT0FQpK/SLI1yRNJvpHk3UlOTPJAkqfa8wld+1+TZEeS7Uku6qqfl2RLe+/GJGn1o5Pc3eqbkgxPe6eSpEkdMBCSnAb8ObC4qs4B5gHLgdXAxqpaBGxsr0lyVnv/bGApcHOSee1wtwCrgEXtsbTVVwIvV9WZwA3AddPSnSSpZ70uGc0HFiSZD7wHeB5YBqxr768DLmnby4C7quqNqnoa2AGcn+RU4LiqeqiqCrh93JixY90LLBmbPUiSjoz5B9qhqn6a5O+AZ4H/Bb5bVd9NMlRVu9o+u5Kc0oacBjzcdYidrfZm2x5fHxvzXDvW3iSvACcBL3afS5JVdGYYDA0NMTo6ehCt/srQArj63L0AUz7GXLNnz56B6XWMPQ8Ge54+BwyEdm1gGXAG8HPgn5N8erIhE9RqkvpkY/YtVK0F1gIsXry4RkZGJjmN/bvpzvVcv6XT+jOfmtox5prR0VGm+nnNVfY8GOx5+vSyZPQx4Omq+llVvQl8C/g94IW2DER73t323wmc3jV+IZ0lpp1te3x9nzFtWep44KWpNCRJmppeAuFZ4IIk72nr+kuAbcAGYEXbZwWwvm1vAJa3O4fOoHPx+JG2vPRqkgvacS4fN2bsWJcCD7brDJKkI6SXawibktwL/ADYCzxGZ9nmWOCeJCvphMZlbf+tSe4Bnmz7X1lVb7XDXQHcBiwA7m8PgFuBO5LsoDMzWD4t3UmSenbAQACoqi8BXxpXfoPObGGi/dcAayaobwbOmaD+Oi1QJEkzw79UliQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEtBjICR5X5J7k/woybYkv5vkxCQPJHmqPZ/Qtf81SXYk2Z7koq76eUm2tPduTJJWPzrJ3a2+KcnwtHcqSZpUrzOErwL/UlW/AXwI2AasBjZW1SJgY3tNkrOA5cDZwFLg5iTz2nFuAVYBi9pjaauvBF6uqjOBG4DrDrEvSdJBOmAgJDkO+ChwK0BV/bKqfg4sA9a13dYBl7TtZcBdVfVGVT0N7ADOT3IqcFxVPVRVBdw+bszYse4FlozNHiRJR8b8Hvb5APAz4B+SfAh4FLgKGKqqXQBVtSvJKW3/04CHu8bvbLU32/b4+tiY59qx9iZ5BTgJeLH7RJKsojPDYGhoiNHR0d66HGdoAVx97l6AKR9jrtmzZ8/A9DrGngeDPU+fXgJhPvAR
4HNVtSnJV2nLQ/sx0W/2NUl9sjH7FqrWAmsBFi9eXCMjI5Ocxv7ddOd6rt/Saf2ZT03tGHPN6OgoU/285ip7Hgz2PH16uYawE9hZVZva63vpBMQLbRmI9ry7a//Tu8YvBJ5v9YUT1PcZk2Q+cDzw0sE2I0maugMGQlX9N/Bckg+20hLgSWADsKLVVgDr2/YGYHm7c+gMOhePH2nLS68muaBdH7h83JixY10KPNiuM0iSjpBelowAPgfcmeRdwE+AP6UTJvckWQk8C1wGUFVbk9xDJzT2AldW1VvtOFcAtwELgPvbAzoXrO9IsoPOzGD5IfYlSTpIPQVCVT0OLJ7grSX72X8NsGaC+mbgnAnqr9MCRZI0M/xLZUkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSmvkzfQIzbXj1fW9vP3PtxTN4JpI0s5whSJIAA0GS1PQcCEnmJXksyXfa6xOTPJDkqfZ8Qte+1yTZkWR7kou66ucl2dLeuzFJWv3oJHe3+qYkw9PYoySpBwczQ7gK2Nb1ejWwsaoWARvba5KcBSwHzgaWAjcnmdfG3AKsAha1x9JWXwm8XFVnAjcA102pG0nSlPUUCEkWAhcDX+sqLwPWte11wCVd9buq6o2qehrYAZyf5FTguKp6qKoKuH3cmLFj3QssGZs9SJKOjF7vMvoK8AXgvV21oaraBVBVu5Kc0uqnAQ937bez1d5s2+PrY2Oea8fam+QV4CTgxe6TSLKKzgyDoaEhRkdHezz9fQ0tgKvP3fuO+lSPNxfs2bOnr/ubiD0PBnuePgcMhCSfAHZX1aNJRno45kS/2dck9cnG7FuoWgusBVi8eHGNjPRyOu90053ruX7LO1t/5lNTO95cMDo6ylQ/r7nKngeDPU+fXmYIFwKfTPJx4N3AcUn+EXghyaltdnAqsLvtvxM4vWv8QuD5Vl84Qb17zM4k84HjgZem2JMkaQoOeA2hqq6pqoVVNUznYvGDVfVpYAOwou22AljftjcAy9udQ2fQuXj8SFteejXJBe36wOXjxowd69L2M94xQ5AkHT6H8pfK1wL3JFkJPAtcBlBVW5PcAzwJ7AWurKq32pgrgNuABcD97QFwK3BHkh10ZgbLD+G8JElTcFCBUFWjwGjb/h9gyX72WwOsmaC+GThngvrrtECRJM0M/1JZkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpOZQ/k3lvjO8+r63t5+59uIZPBNJOvKcIUiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUuMfpu2Hf6QmadA4Q5AkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBPQRCktOTfC/JtiRbk1zV6icmeSDJU+35hK4x1yTZkWR7kou66ucl2dLeuzFJWv3oJHe3+qYkw4ehV0nSJHqZIewFrq6q3wQuAK5MchawGthYVYuAje017b3lwNnAUuDmJPPasW4BVgGL2mNpq68EXq6qM4EbgOumoTdJ0kE4YCBU1a6q+kHbfhXYBpwGLAPWtd3WAZe07WXAXVX1RlU9DewAzk9yKnBcVT1UVQXcPm7M2LHuBZaMzR4kSUfGQX25XVvK+TCwCRiqql3QCY0kp7TdTgMe7hq2s9XebNvj62NjnmvH2pvkFeAk4MVxP38VnRkGQ0NDjI6OHszpv21oAVx97t6e95/qz5lN9uzZ0xd9HAx7Hgz2PH16DoQkxwLfBD5fVb+Y5Bf4id6oSeqTjdm3ULUWWAuwePHiGhkZOcBZT+ymO9dz/ZaDyMItr+3zci5+++no6ChT/bzmKnseDPY8fXq6yyjJUXTC4M6q+lYrv9CWgWjPu1t9J3B61/CFwPOtvnCC+j5jkswHjgdeOthmJElT18tdRgFuBbZV1Ze73toArGjbK4D1XfXl7c6hM+hcPH6kLS+9muSCdszLx40ZO9alwIPtOoMk6QjpZd3kQuAzwJYkj7faXwHXAvckWQk8C1wGUFVbk9wDPEnnDqUrq+qtNu4K4DZgAXB/e0AncO5IsoPOzGD5obUlSTpYBwyEqvp3Jl7jB1iynzFrgDUT1DcD50xQf50WKJKkmeE/oTkF/vOakvqRX10hSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ13nZ6iLwFVVK/cIYgSQIMBElSYyBIkgADQZLUeFF5GnmBWdJc5gxBkgQYCJKkxkCQJAFeQzhsvJ4gaa4xEI4Aw0HSXOCSkSQJMBAkSY1LRkeYy0eSZitnCJIkwBnCrOHMQdJMMxBmUHcISNJMc8lIkgQ4Q5iVXD6SNBOcIUiSAANBktS4ZDTLuXwk6UgxEOaQ/d2VZFBImg4GQh9wFiFpOhgIfcZwkDRVBkIf6w6H25YeM4NnImkuMBAGxJafvsKfTHANwlmEpDEGwoDzQrWkMQaCJtTL9ywZGlJ/MRA0Zb1+OZ/BIc0NsyYQkiwFvgrMA75WVdfO8ClpmhyOb3U1ZKTpNysCIck84O+BPwR2At9PsqGqnpzZM9Ns1UvIXH3u3gkvpB8Kg0j9bFYEAnA+sKOqfgKQ5C5gGWAgaFaZ7f+GxeEIwdmoO5h7uYPOmyd6k6qa6XMgyaXA0qr6s/b6M8DvVNVnx+23CljVXn4Q2D7FH3ky8OIUx85V9jwY7HkwHErPv15V75/ojdkyQ8gEtXckVVWtBdYe8g9LNlfV4kM9zlxiz4PBngfD4ep5tnz99U7g9K7XC4HnZ+hcJGkgzZZA+D6wKMkZSd4FLAc2zPA5SdJAmRVLRlW1N8lngX+lc9vp16tq62H8kYe87DQH2fNgsOfBcFh6nhUXlSVJM2+2LBlJkmaYgSBJAgYsEJIsTbI9yY4kq2f6fKZLkq8n2Z3kia7aiUkeSPJUez6h671r2mewPclFM3PWhybJ6Um+l2Rbkq1Jrmr1vu07ybuTPJLkh63nv2n1vu15TJJ5SR5L8p32ehB6fibJliSPJ9ncaoe376oaiAedi9U/Bj4AvAv4IXDWTJ/XNPX2UeAjwBNdtb8FVrft1cB1bfus1vvRwBntM5k30z1MoedTgY+07fcC/9l669u+6fy9zrFt+yhgE3BBP/fc1ftfAv8EfKe9HoSenwFOHlc7rH0P0gzh7a/HqKpfAmNfjzHnVdW/AS+NKy8D1rXtdcAlXfW7quqNqnoa2EHns5lTqmpXVf2gbb8KbANOo4/7ro497eVR7VH0cc8ASRYCFwNf6yr3dc+TOKx9D1IgnAY81/V6Z6v1q6Gq2gWd/3kCp7R6330OSYaBD9P5jbmv+25LJ48Du4EHqqrvewa+AnwB+L+uWr/3DJ2
w/26SR9vX9sBh7ntW/B3CEdLT12MMgL76HJIcC3wT+HxV/SKZqL3OrhPU5lzfVfUW8NtJ3gd8O8k5k+w+53tO8glgd1U9mmSklyET1OZUz10urKrnk5wCPJDkR5PsOy19D9IMYdC+HuOFJKcCtOfdrd43n0OSo+iEwZ1V9a1W7vu+Aarq58AosJT+7vlC4JNJnqGzzPsHSf6R/u4ZgKp6vj3vBr5NZwnosPY9SIEwaF+PsQFY0bZXAOu76suTHJ3kDGAR8MgMnN8hSWcqcCuwraq+3PVW3/ad5P1tZkCSBcDHgB/Rxz1X1TVVtbCqhun8N/tgVX2aPu4ZIMkxSd47tg38EfAEh7vvmb6SfoSv2n+czt0oPwa+ONPnM419fQPYBbxJ5zeFlcBJwEbgqfZ8Ytf+X2yfwXbgj2f6/KfY8+/TmRL/B/B4e3y8n/sGfgt4rPX8BPDXrd63PY/rf4Rf3WXU1z3TuRvyh+2xdez/V4e7b7+6QpIEDNaSkSRpEgaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLU/D99c2F/4EPbtQAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "grouped_credset_pdf.query('count < 500')['count'].hist(bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Credible sets with only one variant: 29536 (7.8)%\n", + "Median size of credible sets: 21.0\n" + ] + } + ], + "source": [ + "median_credset_size = grouped_credset_pdf['count'].median()\n", + "credsets_with_single = len(grouped_credset_pdf.query('count == 1'))\n", + "\n", + "print(f'Credible sets with only one variant: {credsets_with_single} ({round(credsets_with_single/len(grouped_credset_pdf)*100, 1)})%')\n", + "print(f'Median size of credible sets: {median_credset_size}')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 285:====================================> (139 + 8) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+---------------+-----+\n", + "| studyId| variantId|count|\n", + "+------------+---------------+-----+\n", + "|GCST90095125|17_46142465_T_A|11930|\n", + "|GCST90095124|17_46142465_T_A|11221|\n", + "|GCST006483_1|17_45608332_A_G| 9666|\n", + "| GCST011766|17_45846834_C_G| 9566|\n", + "|GCST006481_2|17_45608332_A_G| 9445|\n", + "|GCST006483_1|17_45605039_C_G| 8912|\n", + "|GCST006483_3|17_45605039_C_G| 8602|\n", + "|GCST006481_4|17_45605039_C_G| 8545|\n", + "|GCST006483_1|17_46770468_T_G| 7465|\n", + "|GCST006481_2|17_46770468_T_G| 7396|\n", + "|GCST006483_3|17_46770468_T_G| 7374|\n", + "|GCST006481_4|17_46770468_T_G| 7327|\n", + "|GCST001651_9|17_46257341_G_A| 6926|\n", + "|GCST90134596|17_45707983_T_C| 6748|\n", + "| GCST012099|17_45610951_A_G| 6603|\n", + "|GCST90104034|17_46152620_T_C| 6545|\n", + "| GCST012101|17_45610951_A_G| 6374|\n", + "|GCST90134597|17_45707983_T_C| 6372|\n", + "| GCST007692|17_45846834_C_G| 6331|\n", + "|GCST90013445|17_45996523_A_G| 5668|\n", + "|GCST008675_1|17_45733530_C_T| 5196|\n", + "|GCST004008_1|17_45749271_G_A| 5101|\n", + "|GCST006483_1|17_46785767_T_C| 4913|\n", + "|GCST006481_2|17_46785767_T_C| 4880|\n", + "| GCST007065|11_55736589_G_A| 4071|\n", + "|GCST90100220|10_73256607_T_A| 3897|\n", + "|GCST90095190|17_45913906_A_G| 3858|\n", + "|GCST90095190|17_46055092_G_A| 3855|\n", + "|GCST90095190|17_45609706_G_A| 3764|\n", + "|GCST000996_1|11_55368743_C_T| 3727|\n", + "+------------+---------------+-----+\n", + "only showing top 30 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "credible_sets.groupBy('studyId', 'variantId').count().filter(f.col('count') > 1000).orderBy('count', ascending=False).show(30)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing with old dataset\n", + "\n", + "- Data: `gs://genetics-portal-dev-staging/v2d/220210/ld.parquet`" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 336:===================================================> (16 + 1) / 17]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of lead/tag count: 19406519\n", + "NUmber of studies covered: 18349\n", + "Number of associations covered: 265715\n" + ] + }, + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "old_study_locus = (\n", + " spark.read.parquet(\"gs://genetics-portal-dev-staging/v2d/220210/ld.parquet\")\n", + " .select(\n", + " f.col('study_id').alias(\"studyId\"), \n", + " f.concat_ws(\"_\", f.col(\"lead_chrom\"), f.col(\"lead_pos\"), f.col(\"lead_ref\"), f.col(\"lead_alt\")).alias(\"variantId\"), \n", + " f.concat_ws(\"_\", f.col(\"tag_chrom\"), f.col(\"tag_pos\"), f.col(\"tag_ref\"), f.col(\"tag_alt\")).alias(\"tagVariantId\"),\n", + " 'pics_postprob',\n", + " 'pics_95perc_credset',\n", + " 'pics_99perc_credset'\n", + " )\n", + " .distinct()\n", + ")\n", + "lead_tag_pair_count = old_study_locus.count()\n", + "study_count = old_study_locus.select('studyId').distinct().count()\n", + "association_count = old_study_locus.select('studyId', 'variantId').distinct().count()\n", + "\n", + "print(f'Number of lead/tag count: {lead_tag_pair_count}')\n", + "print(f'NUmber of studies covered: {study_count}')\n", + "print(f'Number of associations covered: {association_count}')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/12/19 13:53:16 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The median number of tag size: 21.0\n", + "Number of associations with single credible set: 9231\n", + "Number of associations with more than 1000 tags set: 441\n", + "+-------------+---------------+-----+\n", + "| studyId| variantId|count|\n", + "+-------------+---------------+-----+\n", + "| GCST001482|17_45900461_C_T| 3685|\n", + "| GCST90018953|17_45856424_G_T| 3684|\n", + "| GCST007692|17_45846834_C_G| 3649|\n", + "| GCST90018960|17_45761354_C_T| 3360|\n", + "| GCST90018996|17_46112544_A_G| 3348|\n", + "| GCST90091060|17_45873075_C_A| 3295|\n", + "| GCST002970|17_45846317_A_G| 3294|\n", + "| GCST001548|17_45846853_T_C| 3294|\n", + "| GCST007328|17_45887201_A_C| 3294|\n", + "| GCST007430|17_45887201_A_C| 3294|\n", + "| GCST010701|17_45855805_C_T| 3294|\n", + "| GCST001126|17_45846317_A_G| 3294|\n", + "| GCST012009|17_45862033_A_C| 3294|\n", + "| GCST006941|17_45841739_C_T| 3293|\n", + "| GCST004601|17_45841730_A_G| 3293|\n", + "| GCST010002|17_45895867_C_T| 3293|\n", + "| GCST90025948|17_45834077_T_C| 3293|\n", + "| GCST008733|17_45834077_T_C| 3293|\n", + "| GCST008734|17_45834077_T_C| 3293|\n", + "|GCST009518_66|17_45841730_A_G| 3293|\n", + "+-------------+---------------+-----+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "tag_count = old_study_locus.groupBy('studyId', 'variantId').count().persist()\n", + "median_tag_count = tag_count.toPandas()['count'].median()\n", + "single_count = tag_count.filter(f.col('count') == 1).count()\n", + "over_1000 = tag_count.filter(f.col('count') >= 1000).count()\n", + "\n", + "print(f'The median number of tag size: {median_credset_size}')\n", + "print(f'Number of associations with single credible set: {single_count}')\n", + "print(f'Number of associations with more than 1000 tags set: {over_1000}')\n", + "\n", + "tag_count.orderBy('count',ascending=False).show(20)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare credible sets\n", + "\n", + "To make datasets comparable, both datasets need to updated with `studyAccession`: getting the GWAS Catalog study identifier by 
removing the suffix." + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 414:> (0 + 1) / 1][Stage 1072:========> (9 + 7) / 16]\r" + ] + } + ], + "source": [ + "processed_new = (\n", + " credible_sets\n", + " # Dropping leads with sub-significant p-values:\n", + " .filter(f.size(f.col('qualityControl')) == 0)\n", + " .select(\n", + " f.split(f.col('studyId'), '_').getItem(0).alias('studyAccession'),\n", + " 'variantId', \n", + " 'tagVariantId', \n", + " 'pics_mu', \n", + " 'pics_postprob', \n", + " 'pics_95_perc_credset', \n", + " 'pics_99_perc_credset'\n", + " )\n", + " .persist()\n", + ")\n", + "\n", + "processed_old = (\n", + " old_study_locus\n", + " .select(\n", + " f.split(f.col('studyId'), '_').getItem(0).alias('studyAccession'),\n", + " 'variantId', \n", + " 'tagVariantId',\n", + " 'pics_postprob',\n", + " 'pics_95perc_credset',\n", + " 'pics_99perc_credset' \n", + " )\n", + " .persist()\n", + ")\n", + "\n", + "processed_old.show(1, False, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/12/19 14:19:35 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.\n", + "22/12/19 14:19:35 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.\n", + "[Stage 414:> (0 + 1) / 1][Stage 431:============>(187 + 7) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------+---------------+---------------------+---------------------+\n", + "|studyAccession| variantId|new_credible_set_size|old_credible_set_size|\n", + "+--------------+---------------+---------------------+---------------------+\n", + "| GCST000114|15_48099968_A_G| 13| 38|\n", + "| GCST000172|3_190632672_A_G| 12| null|\n", + "| GCST000184|18_60217517_G_A| 233| 214|\n", + "| GCST000189|16_81270154_T_C| 6| null|\n", + "| GCST000189|9_105892815_G_T| 26| null|\n", + "| GCST000282|19_11100236_C_T| 34| 69|\n", + "| GCST000425|16_23055939_T_G| 227| null|\n", + "| GCST000452|2_156696348_A_C| 19| null|\n", + "| GCST000679| 10_6056986_C_T| 12| null|\n", + "| GCST000817|9_136220024_G_T| 23| 27|\n", + "| GCST000876|11_18349351_G_C| 2| 6|\n", + "| GCST000943| 20_1960525_G_A| null| 2|\n", + "| GCST000957|22_49692725_G_A| 16| null|\n", + "| GCST000964|13_77957479_G_A| null| 47|\n", + "| GCST000998|10_44280376_C_T| 193| null|\n", + "| GCST000998|21_34226827_C_T| 29| 32|\n", + "| GCST001010| 6_32689801_T_C| 1| 112|\n", + "| GCST001040|17_37738049_G_A| null| 21|\n", + "| GCST001057|13_66393490_A_G| 7| null|\n", + "| GCST001059|2_198123211_C_A| 8| null|\n", + "+--------------+---------------+---------------------+---------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 414:> (0 + 1) / 1]\r" + ] + } + ], + "source": [ + "aggregated_new = (\n", + " processed_new\n", + " .join(processed_old.select('studyAccession').distinct(), on='studyAccession', how='right')\n", + " .groupBy('studyAccession', 'variantId')\n", + " .agg(f.size(f.collect_list(f.col('tagVariantId'))).alias('new_credible_set_size'))\n", + " .persist()\n", + ")\n", + "\n", + "aggregated_old = (\n", + " processed_old\n", + " .groupBy('studyAccession', 'variantId')\n", + " 
.agg(f.size(f.collect_list(f.col('tagVariantId'))).alias('old_credible_set_size'))\n", + " .persist()\n", + ")\n", + "\n", + "credset_compare = (\n", + " aggregated_new\n", + " .join(aggregated_old.filter(f.col('studyAccession').startswith('GCST')), on=['studyAccession', 'variantId'], how='outer')\n", + " .persist()\n", + ")\n", + "\n", + "credset_compare.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The number of extra credible sets covered by the new dataset: 104508 (53.0%)\n", + "Number of lost credible sets in the new datasets: 49292 (25.0%)\n", + "The number of extra credible sets with more than 1 tags covered by the new dataset: 94745 (48.1%)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 414:> (0 + 1) / 1]\r" + ] + } + ], + "source": [ + "extra_coverage = credset_compare.filter(f.col('old_credible_set_size').isNull()).count()\n", + "lost_coverage = credset_compare.filter(f.col('new_credible_set_size').isNull()).count()\n", + "old_full_count = aggregated_old.filter(f.col('studyAccession').startswith('GCST')).count()\n", + "\n", + "print(f'The number of extra credible sets covered by the new dataset: {extra_coverage} ({round(extra_coverage/old_full_count * 100, 1)}%)')\n", + "print(f'Number of lost credible sets in the new datasets: {lost_coverage} ({round(lost_coverage/old_full_count*100, 1)}%)')\n", + "\n", + "extra_coverage_more = credset_compare.filter(f.col('old_credible_set_size').isNull() & (f.col('new_credible_set_size')>1)).count()\n", + "\n", + "print(f'The number of extra credible sets with more than 1 tags covered by the new dataset: {extra_coverage_more} ({round(extra_coverage_more/old_full_count * 100, 1)}%)')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------+--------------------+---------------------+---------------------+\n", + "|studyAccession| variantId|new_credible_set_size|old_credible_set_size|\n", + "+--------------+--------------------+---------------------+---------------------+\n", + "| GCST000943| 20_1960525_G_A| null| 2|\n", + "| GCST000964| 13_77957479_G_A| null| 47|\n", + "| GCST001040| 17_37738049_G_A| null| 21|\n", + "| GCST002216| 7_73450539_A_G| null| 94|\n", + "| GCST002221| 9_133372523_G_C| null| 20|\n", + "| GCST002223| 8_19973410_C_T| null| 106|\n", + "| GCST002223| 8_20009083_C_T| null| 98|\n", + "| GCST003043| 16_11271643_C_T| null| 14|\n", + "| GCST003191| 20_22824423_G_A| null| 30|\n", + "| GCST003879| 22_23030688_C_G| null| 104|\n", + "| GCST004132| 16_10871740_T_C| null| 74|\n", + "| GCST004365| 3_186755027_C_T| null| 12|\n", + "| GCST004600| 6_35756341_T_C| null| 17|\n", + "| GCST004601| 11_8721318_TC_T| null| 171|\n", + "| GCST004601| 6_27878966_G_C| null| 267|\n", + "| GCST004603|16_88730362_G_GGG...| null| 10|\n", + "| GCST004603| 4_17777672_A_T| null| 9|\n", + "| GCST004605| 6_28489735_CT_C| null| 1|\n", + "| GCST004607| 20_56413821_A_G| null| 15|\n", + "| GCST004607| 2_218258320_T_A| null| 235|\n", + "+--------------+--------------------+---------------------+---------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"[Stage 414:> (0 + 1) / 1]\r" + ] + } + ], + "source": [ + "credset_compare.filter(f.col('new_credible_set_size').isNull()).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Conclusion:**\n", + "- The reason of the disagreement is the fact that the old dataset contains data from summary stats finemapping.\n", + "- To resolve this problem, we exclude those studies which have summary stats. These credible sets should be in a better agreement." + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "141" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 414:> (0 + 1) / 1]\r" + ] + } + ], + "source": [ + "(\n", + " spark.read.parquet(\"gs://genetics-portal-dev-staging/v2d/220401/ld.parquet\")\n", + " .filter(f.col('study_id') == 'GCST002223')\n", + " .select('lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt')\n", + " .distinct()\n", + " .count()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "22/12/19 15:57:00 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of credible sets in the selected studies (27054): 105658\n", + "The number of extra credible sets covered by the new dataset in the same studies: 73250 (69.3%)\n", + "Number of lost credible sets in the new datasets: 8454 (8.0%)\n", + "The number of extra credible sets with more than 1 tags covered by the new dataset: 67183 (63.6%)\n" + ] + } + ], + "source": [ + "studies_with_no_sumstats = (\n", + " spark.read.parquet('gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/gwas_catalog_studies/')\n", + " .filter(~f.col('hasSumstats'))\n", + " .select(f.split(f.col('studyId'), '_').getItem(0).alias('studyAccession'))\n", + " .distinct()\n", + ")\n", + "\n", + "# Dropping studies with summary statistics:\n", + "credset_compare_update = credset_compare.join(studies_with_no_sumstats, on='studyAccession', how='inner').distinct().persist()\n", + "\n", + "old_full_count = credset_compare_update.filter(f.col('old_credible_set_size').isNotNull()).count()\n", + "extra_coverage = credset_compare_update.filter(f.col('old_credible_set_size').isNull()).count()\n", + "lost_coverage = credset_compare_update.filter(f.col('new_credible_set_size').isNull()).count()\n", + "\n", + "print(f'Number of credible sets in the selected studies ({studies_with_no_sumstats.count()}): {old_full_count}')\n", + "print(f'The number of extra credible sets covered by the new dataset in the same studies: {extra_coverage} ({round(extra_coverage/old_full_count * 100, 1)}%)')\n", + "print(f'Number of lost credible sets in the new datasets: {lost_coverage} ({round(lost_coverage/old_full_count*100, 1)}%)')\n", + "\n", + "extra_coverage_more = credset_compare_update.filter(f.col('old_credible_set_size').isNull() & (f.col('new_credible_set_size')>1)).count()\n", + "\n", + "print(f'The number of extra credible sets with more than 1 tags covered by the new dataset: {extra_coverage_more} ({round(extra_coverage_more/old_full_count * 100, 1)}%)')" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "1+1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "otgenetics-Z1loiStc-py3.8", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16 (default, Dec 7 2022, 01:39:17) \n[Clang 14.0.0 (clang-1400.0.29.202)]" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5a448d06c31dd563cc2d2f896cd972f1626bb3e0fbcfc3d2f2ab4cc41131eab9" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index e229f7a31..3bbf4cb52 100644 --- a/poetry.lock +++ b/poetry.lock @@ -357,6 +357,20 @@ python-versions = "*" [package.extras] test = ["flake8 (==3.7.8)", "hypothesis (==3.55.3)"] +[[package]] +name = "comm" +version = "0.1.1" +description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +traitlets = ">5.3" + +[package.extras] +test = ["pytest"] + [[package]] name = "coverage" version = "6.5.0" @@ -406,6 +420,14 @@ category = "dev" optional = false python-versions = ">=3.6,<4.0" +[[package]] +name = "debugpy" +version = "1.6.4" +description = "An implementation of the Debug Adapter Protocol for Python" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "decorator" version = "4.4.2" @@ -447,6 +469,14 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "entrypoints" +version = "0.4" +description = "Discover and load entry points from installed packages." 
+category = "dev" +optional = false +python-versions = ">=3.6" + [[package]] name = "eradicate" version = "2.1.0" @@ -1041,6 +1071,35 @@ docs = ["sphinx", "sphinx-autobuild"] png = ["cairosvg"] tests = ["pytest", "pytest-cov", "pytest-mock"] +[[package]] +name = "ipykernel" +version = "6.19.0" +description = "IPython Kernel for Jupyter" +category = "dev" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +appnope = {version = "*", markers = "platform_system == \"Darwin\""} +comm = ">=0.1.1" +debugpy = ">=1.0" +ipython = ">=7.23.1" +jupyter-client = ">=6.1.12" +matplotlib-inline = ">=0.1" +nest-asyncio = "*" +packaging = "*" +psutil = "*" +pyzmq = ">=17" +tornado = ">=6.1" +traitlets = ">=5.4.0" + +[package.extras] +cov = ["coverage[toml]", "curio", "matplotlib", "pytest-cov", "trio"] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx", "sphinxcontrib-github-alt"] +lint = ["black (>=22.6.0)", "mdformat (>0.7)", "ruff (>=0.0.156)"] +test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio", "pytest-cov", "pytest-timeout"] +typing = ["mypy (>=0.990)"] + [[package]] name = "ipython" version = "8.7.0" @@ -1158,6 +1217,44 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "jupyter-client" +version = "7.4.8" +description = "Jupyter protocol implementation and client libraries" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +entrypoints = "*" +jupyter-core = ">=4.9.2" +nest-asyncio = ">=1.5.4" +python-dateutil = ">=2.8.2" +pyzmq = ">=23.0" +tornado = ">=6.2" +traitlets = "*" + +[package.extras] +doc = ["ipykernel", "myst-parser", "sphinx (>=1.3.6)", "sphinx-rtd-theme", "sphinxcontrib-github-alt"] +test = ["codecov", "coverage", "ipykernel (>=6.12)", "ipython", "mypy", "pre-commit", "pytest", "pytest-asyncio (>=0.18)", "pytest-cov", "pytest-timeout"] + +[[package]] +name = "jupyter-core" +version = "5.1.0" +description = "Jupyter core package. A base package on which Jupyter projects rely." +category = "dev" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +platformdirs = ">=2.5" +pywin32 = {version = ">=1.0", markers = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\""} +traitlets = ">=5.3" + +[package.extras] +docs = ["myst-parser", "sphinxcontrib-github-alt", "traitlets"] +test = ["ipykernel", "pre-commit", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "Markdown" version = "3.3.7" @@ -1706,6 +1803,17 @@ category = "main" optional = false python-versions = ">=3.7" +[[package]] +name = "psutil" +version = "5.9.4" +description = "Cross-platform lib for process and system monitoring in Python." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -1963,6 +2071,18 @@ python-versions = ">=3.6" [package.dependencies] pyyaml = "*" +[[package]] +name = "pyzmq" +version = "24.0.1" +description = "Python bindings for 0MQ" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +cffi = {version = "*", markers = "implementation_name == \"pypy\""} +py = {version = "*", markers = "implementation_name == \"pypy\""} + [[package]] name = "requests" version = "2.28.1" @@ -2578,6 +2698,10 @@ commonmark = [ {file = "commonmark-0.9.1-py2.py3-none-any.whl", hash = "sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9"}, {file = "commonmark-0.9.1.tar.gz", hash = "sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60"}, ] +comm = [ + {file = "comm-0.1.1-py3-none-any.whl", hash = "sha256:788a4ec961956c1cb2b0ba3c21f2458ff5757bb2f552032b140787af88d670a3"}, + {file = "comm-0.1.1.tar.gz", hash = "sha256:f395ea64f4f261f35ffc2fbf80a62ec071375dac48cd3ea56092711e74dd063e"}, +] coverage = [ {file = "coverage-6.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef8674b0ee8cc11e2d574e3e2998aea5df5ab242e012286824ea3c6970580e53"}, {file = "coverage-6.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:784f53ebc9f3fd0e2a3f6a78b2be1bd1f5575d7863e10c6e12504f240fd06660"}, @@ -2665,6 +2789,26 @@ darglint = [ {file = "darglint-1.8.1-py3-none-any.whl", hash = "sha256:5ae11c259c17b0701618a20c3da343a3eb98b3bc4b5a83d31cdd94f5ebdced8d"}, {file = "darglint-1.8.1.tar.gz", hash = "sha256:080d5106df149b199822e7ee7deb9c012b49891538f14a11be681044f0bb20da"}, ] +debugpy = [ + {file = "debugpy-1.6.4-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:6ae238943482c78867ac707c09122688efb700372b617ffd364261e5e41f7a2f"}, + {file = "debugpy-1.6.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a39e7da178e1f22f4bc04b57f085e785ed1bcf424aaf318835a1a7129eefe35"}, + {file = "debugpy-1.6.4-cp310-cp310-win32.whl", hash = "sha256:143f79d0798a9acea21cd1d111badb789f19d414aec95fa6389cfea9485ddfb1"}, + {file = "debugpy-1.6.4-cp310-cp310-win_amd64.whl", hash = "sha256:563f148f94434365ec0ce94739c749aabf60bf67339e68a9446499f3582d62f3"}, + {file = "debugpy-1.6.4-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:1caee68f7e254267df908576c0d0938f8f88af16383f172cb9f0602e24c30c01"}, + {file = "debugpy-1.6.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40e2a83d31a16b83666f19fa06d97b2cc311af88e6266590579737949971a17e"}, + {file = "debugpy-1.6.4-cp37-cp37m-win32.whl", hash = "sha256:82229790442856962aec4767b98ba2559fe0998f897e9f21fb10b4fd24b6c436"}, + {file = "debugpy-1.6.4-cp37-cp37m-win_amd64.whl", hash = "sha256:67edf033f9e512958f7b472975ff9d9b7ff64bf4440f6f6ae44afdc66b89e6b6"}, + {file = "debugpy-1.6.4-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:4ab5e938925e5d973f567d6ef32751b17d10f3be3a8c4d73c52f53e727f69bf1"}, + {file = "debugpy-1.6.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8df268e9f72fc06efc2e75e8dc8e2b881d6a397356faec26efb2ee70b6863b7"}, + {file = "debugpy-1.6.4-cp38-cp38-win32.whl", hash = "sha256:86bd25f38f8b6c5d430a5e2931eebbd5f580c640f4819fcd236d0498790c7204"}, + {file = "debugpy-1.6.4-cp38-cp38-win_amd64.whl", hash = "sha256:62ba4179b372a62abf9c89b56997d70a4100c6dea6c2a4e0e4be5f45920b3253"}, + 
{file = "debugpy-1.6.4-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:d2968e589bda4e485a9c61f113754a28e48d88c5152ed8e0b2564a1fadbe50a5"}, + {file = "debugpy-1.6.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e62b8034ede98932b92268669318848a0d42133d857087a3b9cec03bb844c615"}, + {file = "debugpy-1.6.4-cp39-cp39-win32.whl", hash = "sha256:3d9c31baf64bf959a593996c108e911c5a9aa1693a296840e5469473f064bcec"}, + {file = "debugpy-1.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:ea4bf208054e6d41749f17612066da861dff10102729d32c85b47f155223cf2b"}, + {file = "debugpy-1.6.4-py2.py3-none-any.whl", hash = "sha256:e886a1296cd20a10172e94788009ce74b759e54229ebd64a43fa5c2b4e62cd76"}, + {file = "debugpy-1.6.4.zip", hash = "sha256:d5ab9bd3f4e7faf3765fd52c7c43c074104ab1e109621dc73219099ed1a5399d"}, +] decorator = [ {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"}, {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"}, @@ -2681,6 +2825,10 @@ distlib = [ {file = "distlib-0.3.6-py2.py3-none-any.whl", hash = "sha256:f35c4b692542ca110de7ef0bea44d73981caeb34ca0b9b6b2e6d7790dda8f80e"}, {file = "distlib-0.3.6.tar.gz", hash = "sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46"}, ] +entrypoints = [ + {file = "entrypoints-0.4-py3-none-any.whl", hash = "sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f"}, + {file = "entrypoints-0.4.tar.gz", hash = "sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4"}, +] eradicate = [ {file = "eradicate-2.1.0-py3-none-any.whl", hash = "sha256:8bfaca181db9227dc88bdbce4d051a9627604c2243e7d85324f6d6ce0fd08bb2"}, {file = "eradicate-2.1.0.tar.gz", hash = "sha256:aac7384ab25b1bf21c4c012de9b4bf8398945a14c98c911545b2ea50ab558014"}, @@ -2916,6 +3064,10 @@ interrogate = [ {file = "interrogate-1.5.0-py3-none-any.whl", hash = "sha256:a4ccc5cbd727c74acc98dee6f5e79ef264c0bcfa66b68d4e123069b2af89091a"}, {file = "interrogate-1.5.0.tar.gz", hash = "sha256:b6f325f0aa84ac3ac6779d8708264d366102226c5af7d69058cecffcff7a6d6c"}, ] +ipykernel = [ + {file = "ipykernel-6.19.0-py3-none-any.whl", hash = "sha256:851aa3f9cbbec6918136ada733069a6709934b8106a743495070cf46917eb9a9"}, + {file = "ipykernel-6.19.0.tar.gz", hash = "sha256:7aabde9e201c4a8f43000f4be0d057f91df13b906ea646acd9047fcb85600b9b"}, +] ipython = [ {file = "ipython-8.7.0-py3-none-any.whl", hash = "sha256:352042ddcb019f7c04e48171b4dd78e4c4bb67bf97030d170e154aac42b656d9"}, {file = "ipython-8.7.0.tar.gz", hash = "sha256:882899fe78d5417a0aa07f995db298fa28b58faeba2112d2e3a4c95fe14bb738"}, @@ -2947,6 +3099,14 @@ jmespath = [ jsmin = [ {file = "jsmin-3.0.1.tar.gz", hash = "sha256:c0959a121ef94542e807a674142606f7e90214a2b3d1eb17300244bbb5cc2bfc"}, ] +jupyter-client = [ + {file = "jupyter_client-7.4.8-py3-none-any.whl", hash = "sha256:d4a67ae86ee014bcb96bd8190714f6af921f2b0f52f4208b086aa5acfd9f8d65"}, + {file = "jupyter_client-7.4.8.tar.gz", hash = "sha256:109a3c33b62a9cf65aa8325850a0999a795fac155d9de4f7555aef5f310ee35a"}, +] +jupyter-core = [ + {file = "jupyter_core-5.1.0-py3-none-any.whl", hash = "sha256:f5740d99606958544396914b08e67b668f45e7eff99ab47a7f4bcead419c02f4"}, + {file = "jupyter_core-5.1.0.tar.gz", hash = "sha256:a5ae7c09c55c0b26f692ec69323ba2b62e8d7295354d20f6cd57b749de4a05bf"}, +] Markdown = [ {file = "Markdown-3.3.7-py3-none-any.whl", hash = "sha256:f5da449a6e1c989a4cea2631aa8ee67caa5a2ef855d551c88f9e309f4634c621"}, 
{file = "Markdown-3.3.7.tar.gz", hash = "sha256:cbb516f16218e643d8e0a95b309f77eb118cb138d39a4f27851e6a63581db874"}, @@ -3423,6 +3583,22 @@ protobuf = [ {file = "protobuf-3.20.2-py2.py3-none-any.whl", hash = "sha256:c9cdf251c582c16fd6a9f5e95836c90828d51b0069ad22f463761d27c6c19019"}, {file = "protobuf-3.20.2.tar.gz", hash = "sha256:712dca319eee507a1e7df3591e639a2b112a2f4a62d40fe7832a16fd19151750"}, ] +psutil = [ + {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, + {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, + {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549"}, + {file = "psutil-5.9.4-cp27-cp27m-win32.whl", hash = "sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad"}, + {file = "psutil-5.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7"}, + {file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"}, + {file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"}, + {file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"}, + {file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"}, + {file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"}, +] ptyprocess = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, @@ -3567,6 +3743,82 @@ pyyaml_env_tag = [ {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"}, {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"}, ] +pyzmq = [ + {file = "pyzmq-24.0.1-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:28b119ba97129d3001673a697b7cce47fe6de1f7255d104c2f01108a5179a066"}, + {file = "pyzmq-24.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bcbebd369493d68162cddb74a9c1fcebd139dfbb7ddb23d8f8e43e6c87bac3a6"}, + {file = "pyzmq-24.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae61446166983c663cee42c852ed63899e43e484abf080089f771df4b9d272ef"}, + {file = 
"pyzmq-24.0.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87f7ac99b15270db8d53f28c3c7b968612993a90a5cf359da354efe96f5372b4"}, + {file = "pyzmq-24.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dca7c3956b03b7663fac4d150f5e6d4f6f38b2462c1e9afd83bcf7019f17913"}, + {file = "pyzmq-24.0.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:8c78bfe20d4c890cb5580a3b9290f700c570e167d4cdcc55feec07030297a5e3"}, + {file = "pyzmq-24.0.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:48f721f070726cd2a6e44f3c33f8ee4b24188e4b816e6dd8ba542c8c3bb5b246"}, + {file = "pyzmq-24.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:afe1f3bc486d0ce40abb0a0c9adb39aed3bbac36ebdc596487b0cceba55c21c1"}, + {file = "pyzmq-24.0.1-cp310-cp310-win32.whl", hash = "sha256:3e6192dbcefaaa52ed81be88525a54a445f4b4fe2fffcae7fe40ebb58bd06bfd"}, + {file = "pyzmq-24.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:86de64468cad9c6d269f32a6390e210ca5ada568c7a55de8e681ca3b897bb340"}, + {file = "pyzmq-24.0.1-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:838812c65ed5f7c2bd11f7b098d2e5d01685a3f6d1f82849423b570bae698c00"}, + {file = "pyzmq-24.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dfb992dbcd88d8254471760879d48fb20836d91baa90f181c957122f9592b3dc"}, + {file = "pyzmq-24.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7abddb2bd5489d30ffeb4b93a428130886c171b4d355ccd226e83254fcb6b9ef"}, + {file = "pyzmq-24.0.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:94010bd61bc168c103a5b3b0f56ed3b616688192db7cd5b1d626e49f28ff51b3"}, + {file = "pyzmq-24.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8242543c522d84d033fe79be04cb559b80d7eb98ad81b137ff7e0a9020f00ace"}, + {file = "pyzmq-24.0.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ccb94342d13e3bf3ffa6e62f95b5e3f0bc6bfa94558cb37f4b3d09d6feb536ff"}, + {file = "pyzmq-24.0.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6640f83df0ae4ae1104d4c62b77e9ef39be85ebe53f636388707d532bee2b7b8"}, + {file = "pyzmq-24.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a180dbd5ea5d47c2d3b716d5c19cc3fb162d1c8db93b21a1295d69585bfddac1"}, + {file = "pyzmq-24.0.1-cp311-cp311-win32.whl", hash = "sha256:624321120f7e60336be8ec74a172ae7fba5c3ed5bf787cc85f7e9986c9e0ebc2"}, + {file = "pyzmq-24.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:1724117bae69e091309ffb8255412c4651d3f6355560d9af312d547f6c5bc8b8"}, + {file = "pyzmq-24.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:15975747462ec49fdc863af906bab87c43b2491403ab37a6d88410635786b0f4"}, + {file = "pyzmq-24.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b947e264f0e77d30dcbccbb00f49f900b204b922eb0c3a9f0afd61aaa1cedc3d"}, + {file = "pyzmq-24.0.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0ec91f1bad66f3ee8c6deb65fa1fe418e8ad803efedd69c35f3b5502f43bd1dc"}, + {file = "pyzmq-24.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:db03704b3506455d86ec72c3358a779e9b1d07b61220dfb43702b7b668edcd0d"}, + {file = "pyzmq-24.0.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:e7e66b4e403c2836ac74f26c4b65d8ac0ca1eef41dfcac2d013b7482befaad83"}, + {file = "pyzmq-24.0.1-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:7a23ccc1083c260fa9685c93e3b170baba45aeed4b524deb3f426b0c40c11639"}, + {file = "pyzmq-24.0.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = 
"sha256:fa0ae3275ef706c0309556061185dd0e4c4cd3b7d6f67ae617e4e677c7a41e2e"}, + {file = "pyzmq-24.0.1-cp36-cp36m-win32.whl", hash = "sha256:f01de4ec083daebf210531e2cca3bdb1608dbbbe00a9723e261d92087a1f6ebc"}, + {file = "pyzmq-24.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:de4217b9eb8b541cf2b7fde4401ce9d9a411cc0af85d410f9d6f4333f43640be"}, + {file = "pyzmq-24.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:78068e8678ca023594e4a0ab558905c1033b2d3e806a0ad9e3094e231e115a33"}, + {file = "pyzmq-24.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77c2713faf25a953c69cf0f723d1b7dd83827b0834e6c41e3fb3bbc6765914a1"}, + {file = "pyzmq-24.0.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8bb4af15f305056e95ca1bd086239b9ebc6ad55e9f49076d27d80027f72752f6"}, + {file = "pyzmq-24.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:0f14cffd32e9c4c73da66db97853a6aeceaac34acdc0fae9e5bbc9370281864c"}, + {file = "pyzmq-24.0.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:0108358dab8c6b27ff6b985c2af4b12665c1bc659648284153ee501000f5c107"}, + {file = "pyzmq-24.0.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d66689e840e75221b0b290b0befa86f059fb35e1ee6443bce51516d4d61b6b99"}, + {file = "pyzmq-24.0.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ae08ac90aa8fa14caafc7a6251bd218bf6dac518b7bff09caaa5e781119ba3f2"}, + {file = "pyzmq-24.0.1-cp37-cp37m-win32.whl", hash = "sha256:8421aa8c9b45ea608c205db9e1c0c855c7e54d0e9c2c2f337ce024f6843cab3b"}, + {file = "pyzmq-24.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:54d8b9c5e288362ec8595c1d98666d36f2070fd0c2f76e2b3c60fbad9bd76227"}, + {file = "pyzmq-24.0.1-cp38-cp38-macosx_10_15_universal2.whl", hash = "sha256:acbd0a6d61cc954b9f535daaa9ec26b0a60a0d4353c5f7c1438ebc88a359a47e"}, + {file = "pyzmq-24.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:47b11a729d61a47df56346283a4a800fa379ae6a85870d5a2e1e4956c828eedc"}, + {file = "pyzmq-24.0.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:abe6eb10122f0d746a0d510c2039ae8edb27bc9af29f6d1b05a66cc2401353ff"}, + {file = "pyzmq-24.0.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:07bec1a1b22dacf718f2c0e71b49600bb6a31a88f06527dfd0b5aababe3fa3f7"}, + {file = "pyzmq-24.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0d945a85b70da97ae86113faf9f1b9294efe66bd4a5d6f82f2676d567338b66"}, + {file = "pyzmq-24.0.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1b7928bb7580736ffac5baf814097be342ba08d3cfdfb48e52773ec959572287"}, + {file = "pyzmq-24.0.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b946da90dc2799bcafa682692c1d2139b2a96ec3c24fa9fc6f5b0da782675330"}, + {file = "pyzmq-24.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c8840f064b1fb377cffd3efeaad2b190c14d4c8da02316dae07571252d20b31f"}, + {file = "pyzmq-24.0.1-cp38-cp38-win32.whl", hash = "sha256:4854f9edc5208f63f0841c0c667260ae8d6846cfa233c479e29fdc85d42ebd58"}, + {file = "pyzmq-24.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:42d4f97b9795a7aafa152a36fe2ad44549b83a743fd3e77011136def512e6c2a"}, + {file = "pyzmq-24.0.1-cp39-cp39-macosx_10_15_universal2.whl", hash = "sha256:52afb0ac962963fff30cf1be775bc51ae083ef4c1e354266ab20e5382057dd62"}, + {file = "pyzmq-24.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8bad8210ad4df68c44ff3685cca3cda448ee46e20d13edcff8909eba6ec01ca4"}, + {file = "pyzmq-24.0.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:dabf1a05318d95b1537fd61d9330ef4313ea1216eea128a17615038859da3b3b"}, + {file = "pyzmq-24.0.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5bd3d7dfd9cd058eb68d9a905dec854f86649f64d4ddf21f3ec289341386c44b"}, + {file = "pyzmq-24.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8012bce6836d3f20a6c9599f81dfa945f433dab4dbd0c4917a6fb1f998ab33d"}, + {file = "pyzmq-24.0.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c31805d2c8ade9b11feca4674eee2b9cce1fec3e8ddb7bbdd961a09dc76a80ea"}, + {file = "pyzmq-24.0.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:3104f4b084ad5d9c0cb87445cc8cfd96bba710bef4a66c2674910127044df209"}, + {file = "pyzmq-24.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:df0841f94928f8af9c7a1f0aaaffba1fb74607af023a152f59379c01c53aee58"}, + {file = "pyzmq-24.0.1-cp39-cp39-win32.whl", hash = "sha256:a435ef8a3bd95c8a2d316d6e0ff70d0db524f6037411652803e118871d703333"}, + {file = "pyzmq-24.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:2032d9cb994ce3b4cba2b8dfae08c7e25bc14ba484c770d4d3be33c27de8c45b"}, + {file = "pyzmq-24.0.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bb5635c851eef3a7a54becde6da99485eecf7d068bd885ac8e6d173c4ecd68b0"}, + {file = "pyzmq-24.0.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:83ea1a398f192957cb986d9206ce229efe0ee75e3c6635baff53ddf39bd718d5"}, + {file = "pyzmq-24.0.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:941fab0073f0a54dc33d1a0460cb04e0d85893cb0c5e1476c785000f8b359409"}, + {file = "pyzmq-24.0.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e8f482c44ccb5884bf3f638f29bea0f8dc68c97e38b2061769c4cb697f6140d"}, + {file = "pyzmq-24.0.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:613010b5d17906c4367609e6f52e9a2595e35d5cc27d36ff3f1b6fa6e954d944"}, + {file = "pyzmq-24.0.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:65c94410b5a8355cfcf12fd600a313efee46ce96a09e911ea92cf2acf6708804"}, + {file = "pyzmq-24.0.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:20e7eeb1166087db636c06cae04a1ef59298627f56fb17da10528ab52a14c87f"}, + {file = "pyzmq-24.0.1-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a2712aee7b3834ace51738c15d9ee152cc5a98dc7d57dd93300461b792ab7b43"}, + {file = "pyzmq-24.0.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a7c280185c4da99e0cc06c63bdf91f5b0b71deb70d8717f0ab870a43e376db8"}, + {file = "pyzmq-24.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:858375573c9225cc8e5b49bfac846a77b696b8d5e815711b8d4ba3141e6e8879"}, + {file = "pyzmq-24.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:80093b595921eed1a2cead546a683b9e2ae7f4a4592bb2ab22f70d30174f003a"}, + {file = "pyzmq-24.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f3f3154fde2b1ff3aa7b4f9326347ebc89c8ef425ca1db8f665175e6d3bd42f"}, + {file = "pyzmq-24.0.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abb756147314430bee5d10919b8493c0ccb109ddb7f5dfd2fcd7441266a25b75"}, + {file = "pyzmq-24.0.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44e706bac34e9f50779cb8c39f10b53a4d15aebb97235643d3112ac20bd577b4"}, + {file = "pyzmq-24.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:687700f8371643916a1d2c61f3fdaa630407dd205c38afff936545d7b7466066"}, + {file = "pyzmq-24.0.1.tar.gz", 
hash = "sha256:216f5d7dbb67166759e59b0479bca82b8acf9bed6015b526b8eb10143fb08e77"}, +] requests = [ {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"}, {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"}, diff --git a/pyproject.toml b/pyproject.toml index 93da2b38e..7d7babcee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ pandas = "<1.5.0" [tool.poetry.group.dev.dependencies] ipython = "^8.5.0" +ipykernel = "^6.19.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/src/conftest.py b/src/conftest.py index 53cb42cd3..20e2fb5bf 100644 --- a/src/conftest.py +++ b/src/conftest.py @@ -1,9 +1,12 @@ """Test configuration within src dir (doctests).""" from __future__ import annotations +import subprocess from typing import Any +import hail as hl import pytest +from pyspark import SparkConf from pyspark.sql import SparkSession @@ -20,7 +23,30 @@ def spark(doctest_namespace: dict[str, Any]) -> SparkSession: Returns: SparkSession: local spark session """ - spark = SparkSession.builder.master("local").appName("test").getOrCreate() + # configure spark using hail backend + hail_home = subprocess.check_output( + "poetry run pip show hail | grep Location | awk -F' ' '{print $2 \"/hail\"}'", + shell=True, + text=True, + ).strip() + conf = ( + SparkConf() + .set( + "spark.jars", + hail_home + "/backend/hail-all-spark.jar", + ) + .set( + "spark.driver.extraClassPath", + hail_home + "/backend/hail-all-spark.jar", + ) + .set("spark.executor.extraClassPath", "./hail-all-spark.jar") + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .set("spark.kryo.registrator", "is.hail.kryo.HailKryoRegistrator") + # .set("spark.kryoserializer.buffer", "512m") + ) + # init spark session + spark = SparkSession.builder.config(conf=conf).getOrCreate() doctest_namespace["spark"] = spark - + # init hail session + hl.init(sc=spark.sparkContext, log="/dev/null") return spark diff --git a/src/etl/common/ETLSession.py b/src/etl/common/ETLSession.py index 051e98231..2a2c9447e 100644 --- a/src/etl/common/ETLSession.py +++ b/src/etl/common/ETLSession.py @@ -6,6 +6,7 @@ import json from typing import TYPE_CHECKING +from pyspark.conf import SparkConf from pyspark.sql import SparkSession from pyspark.sql.types import StructType @@ -19,15 +20,24 @@ class ETLSession: """Spark session class.""" - def __init__(self: ETLSession, cfg: DictConfig) -> None: - """Creates spark session and logger.""" + spark_config = SparkConf() + + def __init__( + self: ETLSession, cfg: DictConfig, spark_config: SparkConf = spark_config + ) -> None: + """Initialises spark session and logger. + + Args: + cfg (DictConfig): configuration file + spark_config (SparkConf): Optional spark config. Defaults to spark_config. 
+ """ # create session and retrieve Spark logger object self.spark = ( - SparkSession.builder.master(cfg.environment.sparkUri) + SparkSession.builder.config(conf=spark_config) + .master(cfg.environment.sparkUri) .appName(cfg.etl.name) .getOrCreate() ) - self.logger = Log4j(self.spark) def read_parquet(self: ETLSession, path: str, schema_json: str) -> DataFrame: diff --git a/src/etl/common/spark_helpers.py b/src/etl/common/spark_helpers.py index 2e42819d9..ef73c303d 100644 --- a/src/etl/common/spark_helpers.py +++ b/src/etl/common/spark_helpers.py @@ -78,3 +78,65 @@ def get_record_with_maximum_value( """ w = Window.partitionBy(grouping_col).orderBy(f.col(sorting_col).desc()) return get_top_ranked_in_window(df, w) + + +def _neglog_p(p_value_mantissa: Column, p_value_exponent: Column) -> Column: + """Compute the negative log p-value. + + Args: + p_value_mantissa (Column): P-value mantissa + p_value_exponent (Column): P-value exponent + + Returns: + Column: Negative log p-value + + Examples: + >>> d = [(1, 1), (5, -2), (1, -1000)] + >>> df = spark.createDataFrame(d).toDF("p_value_mantissa", "p_value_exponent") + >>> df.withColumn("neg_log_p", _neglog_p(f.col("p_value_mantissa"), f.col("p_value_exponent"))).show() + +----------------+----------------+------------------+ + |p_value_mantissa|p_value_exponent| neg_log_p| + +----------------+----------------+------------------+ + | 1| 1| -1.0| + | 5| -2|1.3010299956639813| + | 1| -1000| 1000.0| + +----------------+----------------+------------------+ + + """ + return -1 * (f.log10(p_value_mantissa) + p_value_exponent) + + +def adding_quality_flag( + qc_column: Column, flag_condition: Column, flag_text: str +) -> Column: + """Update the provided quality control list with a new flag if condition is met. + + Args: + qc_column (Column): Array column with existing QC flags. + flag_condition (Column): This is a column of booleans, signing which row should be flagged + flag_text (str): Text for the new quality control flag + + Returns: + Column: Array column with the updated list of qc flags. + + Examples: + >>> data = [(True, ['Existing flag']),(True, []),(False, [])] + >>> new_flag = 'This is a new flag' + >>> ( + ... spark.createDataFrame(data, ['flag', 'qualityControl']) + ... .withColumn('qualityControl', adding_quality_flag(f.col('qualityControl'), f.col('flag'), new_flag)) + ... .show(truncate=False) + ... ) + +-----+-----------------------------------+ + |flag |qualityControl | + +-----+-----------------------------------+ + |true |[Existing flag, This is a new flag]| + |true |[This is a new flag] | + |false|[] | + +-----+-----------------------------------+ + + """ + return f.when( + flag_condition, + f.array_union(qc_column, f.array(f.lit(flag_text))), + ).otherwise(qc_column) diff --git a/src/etl/common/utils.py b/src/etl/common/utils.py index a3946e931..209c4613b 100644 --- a/src/etl/common/utils.py +++ b/src/etl/common/utils.py @@ -29,6 +29,7 @@ def convert_gnomad_position_to_ensembl( """Converting GnomAD variant position to Ensembl variant position. For indels (the reference or alternate allele is longer than 1), then adding 1 to the position, for SNPs, the position is unchanged. 
+ More info about the problem: https://www.biostars.org/p/84686/ Args: position (Column): Column diff --git a/src/etl/gwas_ingest/association_mapping.py b/src/etl/gwas_ingest/association_mapping.py new file mode 100644 index 000000000..c2a148fea --- /dev/null +++ b/src/etl/gwas_ingest/association_mapping.py @@ -0,0 +1,300 @@ +"""Functions to reliably map GWAS Catalog associations to GnomAD3.1 variants.""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyspark.sql import functions as f +from pyspark.sql.window import Window + +from etl.common.spark_helpers import adding_quality_flag, get_record_with_maximum_value + +if TYPE_CHECKING: + from pyspark.sql import Column, DataFrame + + from etl.common.ETLSession import ETLSession + +# Quality control flags: +NON_MAPPED_VARIANT = "No mapping in GnomAd" + + +def map_variants( + parsed_associations: DataFrame, variant_annotation_path: str, etl: ETLSession +) -> DataFrame: + """Add variant metadata to associations. + + Args: + parsed_associations (DataFrame): associations + variant_annotation_path (str): variant annotation path + etl (ETLSession): current ETL session + + Returns: + DataFrame: associations with variant metadata + """ + variants = etl.spark.read.parquet(variant_annotation_path).select( + f.col("id").alias("variantId"), + "chromosome", + "position", + f.col("rsIds").alias("rsIdsGnomad"), + "referenceAllele", + "alternateAllele", + "alleleFrequencies", + ) + + # Filtering the GnomAD dataset: + overlapping_variants = variants.join( + f.broadcast(parsed_associations.select("chromosome", "position").distinct()), + on=["chromosome", "position"], + how="inner", + ) + + mapped_associations = ( + # Left-join to rescue unmapped associations: + parsed_associations.join( + f.broadcast(overlapping_variants), on=["chromosome", "position"], how="left" + ) + # Even if there's no variant mapping in GnomAD, make sure we are not losing any associations: + .withColumn("variantId", f.coalesce(f.col("variantId"), f.col("gwasVariant"))) + # Flagging variants that could not be mapped to gnomad: + .withColumn( + "qualityControl", + adding_quality_flag( + f.col("qualityControl"), + f.col("alternateAllele").isNull(), + NON_MAPPED_VARIANT, + ), + ).persist() + ) + assoc_without_variant = mapped_associations.filter( + f.col("referenceAllele").isNull() + ).count() + # etl.logger.info( + print( + f"Loading variant annotation and joining with associations... {assoc_without_variant} associations outside gnomAD" + ) + return mapped_associations + + +def _check_rsids(gnomad: Column, gwas: Column) -> Column: + """If the intersection of the two arrays is greater than 0, return True, otherwise return False. + + Args: + gnomad (Column): rsids from gnomad + gwas (Column): rsids from the GWAS Catalog + + Returns: + A boolean column that is true if the GnomAD rsIDs can be found in the GWAS rsIDs. + + Examples: + >>> d = [ + ... (1, ["rs123", "rs523"], ["rs123"]), + ... (2, [], ["rs123"]), + ... (3, ["rs123", "rs523"], []), + ... (4, [], []), + ...
] + >>> df = spark.createDataFrame(d, ['associationId', 'gnomad', 'gwas']) + >>> df.withColumn("rsid_matches", _check_rsids(f.col("gnomad"),f.col('gwas'))).show() + +-------------+--------------+-------+------------+ + |associationId| gnomad| gwas|rsid_matches| + +-------------+--------------+-------+------------+ + | 1|[rs123, rs523]|[rs123]| true| + | 2| []|[rs123]| false| + | 3|[rs123, rs523]| []| false| + | 4| []| []| false| + +-------------+--------------+-------+------------+ + + + """ + return f.when(f.size(f.array_intersect(gnomad, gwas)) > 0, True).otherwise(False) + + +def _flag_mappings_to_retain(association_id: Column, filter_column: Column) -> Column: + """Flagging mappings to drop for each association. + + Some associations have multiple mappings. Some has matching rsId others don't. We only + want to drop the non-matching mappings, when a matching is available for the given association. + This logic can be generalised for other measures eg. allele concordance. + + Args: + association_id (Column): association identifier column + filter_column (Column): boolean col indicating to keep a mapping + + Returns: + A column with a boolean value. + + Examples: + >>> d = [ + ... (1, False), + ... (1, False), + ... (2, False), + ... (2, True), + ... (3, True), + ... (3, True), + ... ] + >>> df = spark.createDataFrame(d, ['associationId', 'filter']) + >>> df.withColumn("isConcordant", _flag_mappings_to_retain(f.col("associationId"),f.col('filter'))).show() + +-------------+------+------------+ + |associationId|filter|isConcordant| + +-------------+------+------------+ + | 1| false| true| + | 1| false| true| + | 3| true| true| + | 3| true| true| + | 2| false| false| + | 2| true| true| + +-------------+------+------------+ + + + """ + w = Window.partitionBy(association_id) + + # Generating a boolean column informing if the filter column contains true anywhere for the association: + aggregated_filter = f.when( + f.array_contains(f.collect_set(filter_column).over(w), True), True + ).otherwise(False) + + # Generate a filter column: + return f.when(aggregated_filter & (~filter_column), False).otherwise(True) + + +def _check_concordance( + risk_allele: Column, reference_allele: Column, alternate_allele: Column +) -> Column: + """A function to check if the risk allele is concordant with the alt or ref allele. + + If the risk allele is the same as the reference or alternate allele, or if the reverse complement of + the risk allele is the same as the reference or alternate allele, then the allele is concordant. + If no mapping is available (ref/alt is null), the function returns True. + + Args: + risk_allele (Column): The allele that is associated with the risk of the disease. + reference_allele (Column): The reference allele from the GWAS catalog + alternate_allele (Column): The alternate allele of the variant. + + Returns: + A boolean column that is True if the risk allele is the same as the reference or alternate allele, + or if the reverse complement of the risk allele is the same as the reference or alternate allele. + + Examples: + >>> d = [ + ... ('A', 'A', 'G'), + ... ('A', 'T', 'G'), + ... ('A', 'C', 'G'), + ... ('A', 'A', '?'), + ... (None, None, 'A'), + ... 
] + >>> df = spark.createDataFrame(d, ['riskAllele', 'referenceAllele', 'alternateAllele']) + >>> df.withColumn("isConcordant", _check_concordance(f.col("riskAllele"),f.col('referenceAllele'), f.col('alternateAllele'))).show() + +----------+---------------+---------------+------------+ + |riskAllele|referenceAllele|alternateAllele|isConcordant| + +----------+---------------+---------------+------------+ + | A| A| G| true| + | A| T| G| true| + | A| C| G| false| + | A| A| ?| true| + | null| null| A| true| + +----------+---------------+---------------+------------+ + + + """ + # Calculating the reverse complement of the risk allele: + risk_allele_reverse_complement = f.when( + risk_allele.rlike(r"^[ACTG]+$"), + f.reverse(f.translate(risk_allele, "ACTG", "TGAC")), + ).otherwise(risk_allele) + + # Checking if the risk allele or its reverse complement matches the mapped alleles: + return ( + f.when( + (risk_allele == reference_allele) | (risk_allele == alternate_allele), + True, + ) + # If risk allele is found on the negative strand: + .when( + (risk_allele_reverse_complement == reference_allele) + | (risk_allele_reverse_complement == alternate_allele), + True, + ) + # If risk allele is ambiguous, still accepted: < This condition could be reconsidered + .when(risk_allele == "?", True) + # If the association could not be mapped we keep it: + .when(reference_allele.isNull(), True) + # Allele is discordant: + .otherwise(False) + ) + + +def clean_mappings(df: DataFrame) -> DataFrame: + """A function to sort out multiple mappings for associations. + + We keep mappings with a matching rsId when one is available for the association, then mappings with + concordant alleles, and finally, among the remaining mappings, the one with the highest MAF. + + Args: + df (DataFrame): DataFrame + + Returns: + A dataframe with the following columns: + - associationId + - variantId + - rsIdsGnomad + - rsIdsGwasCatalog + - riskAllele + - referenceAllele + - alternateAllele + - alleleFrequencies + - isRsIdMatched + - isConcordant + """ + all_mappings = df.withColumn( + "isRsIdMatched", _check_rsids(f.col("rsIdsGnomad"), f.col("rsIdsGwasCatalog")) + ).withColumn( + "isConcordant", + _check_concordance( + f.col("riskAllele"), f.col("referenceAllele"), f.col("alternateAllele") + ), + ) + + mafs = ( + df.select("variantId", f.explode(f.col("alleleFrequencies")).alias("af")) + .distinct() + .withColumn( + "maf", + f.when( + f.col("af.alleleFrequency") > 0.5, 1 - f.col("af.alleleFrequency") + ).otherwise(f.col("af.alleleFrequency")), + ) + .groupBy("variantId") + .agg(f.max("maf").alias("maxMaf")) + ) + + return ( + all_mappings.join(mafs, on="variantId", how="left") + # Keep rows, where GnomAD rsId matches with GWAS rsId if present: + .withColumn( + "rsidFilter", + _flag_mappings_to_retain(f.col("associationId"), f.col("isRsIdMatched")), + ) + .filter("rsidFilter") + # Dropping rows, where alleles aren't concordant, but concordant alleles available: + .withColumn( + "concordanceFilter", + _flag_mappings_to_retain(f.col("associationId"), f.col("isConcordant")), + ) + .filter(f.col("concordanceFilter")) + # Out of the remaining mappings, keeping the one with the highest MAF: + .transform( + lambda df: get_record_with_maximum_value( + df, grouping_col="associationId", sorting_col="maxMaf" + ) + ) + .drop( + "rsidFilter", + "concordanceFilter", + "isRsIdMatched", + "maxMaf", + "isRsIdMatched", + "isConcordant", + ) + .persist() + ) diff --git a/src/etl/gwas_ingest/clumping.py
b/src/etl/gwas_ingest/clumping.py new file mode 100644 index 000000000..bd1507033 --- /dev/null +++ b/src/etl/gwas_ingest/clumping.py @@ -0,0 +1,168 @@ +"""This module provides a toolset to clump together signals with overlapping LD sets.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pyspark.sql.functions as f +from pyspark.sql import Window + +from etl.common.spark_helpers import adding_quality_flag +from etl.gwas_ingest.pics import _neglog_p + +if TYPE_CHECKING: + from pyspark.sql import DataFrame + + +def _get_resolver(df: DataFrame) -> DataFrame: + """Getting a lead to more significant lead mapping for joining. + + Args: + df (DataFrame): LD sets with some level of resolution + + Returns: + DataFrame + """ + return df.select( + "studyId", + f.col("variantId").alias("explained"), + f.col("explained").alias("new_explained"), + ).distinct() + + +def _clean_dataframe(df: DataFrame) -> DataFrame: + """As part of the iterative lead variant resolution, some columns need to be dropped. + + Args: + df (DataFrame): Association dataset with resolved LD and R_overall calculated. + + Returns: + DataFrame: Non-independent associations are assigned to the most significant linked lead + """ + if "is_resolved" in df.columns: + return df.drop("explained", "is_resolved").withColumnRenamed( + "new_explained", "explained" + ) + else: + return df + + +def clumping(df: DataFrame) -> DataFrame: + """Clump non-independent credible sets. + + Args: + df (DataFrame): The LD expanded dataset, before PICS calculation + + Returns: + DataFrame: Clumped signals are resolved by: + - removing tagging variants of non-independent leads. + - removing overall R from non-independent leads. + - adding a QC flag to non-independent leads pointing to the relevant lead. + """ + w = Window.partitionBy("studyId", "variantPair").orderBy(f.col("negLogPVal").desc()) + + # This dataframe contains all the resolved and independent leads. However not all linked signals are properly assigned to a more significant lead: + resolved_independent = ( + df + # lead/tag pairs irrespective of the order: + .withColumn( + "variantPair", + f.concat_ws( + "_", f.array_sort(f.array(f.col("variantId"), f.col("tagVariantId"))) + ), + ) + # Generating the negLogPval: + .withColumn( + "negLogPVal", _neglog_p(f.col("pValueMantissa"), f.col("pValueExponent")) + ) + # Counting the number of occurrences for each pair - for one study, each pair should occur only once: + .withColumn("rank", f.count(f.col("tagVariantId")).over(w)) + # Only the most significant lead is kept if credible sets are overlapping + keeping lead if credible set is not resolved: + .withColumn( + "keep_lead", + f.when( + f.max(f.col("rank")).over(Window.partitionBy("studyId", "variantId")) + <= 1, + True, + ).otherwise(False), + ) + # Adding reference lead that explains: + .withColumn( + "reference_lead", + f.when(f.col("rank") > 1, f.col("tagVariantId")).when( + f.col("keep_lead"), f.col("variantId") + ), + ) + # One credible set might contain multiple tags that are themselves leads.
We need to collect them into an array: + # At this point we move from lead/tag to study/lead level: + .withColumn( + "all_explained", + f.collect_set(f.col("reference_lead")).over( + Window.partitionBy("studyId", "variantId") + ), + ) + .withColumn("explained", f.explode("all_explained")) + .drop("reference_lead", "all_explained") + .persist() + ) + + # We need to keep iterating until all linked leads are resolved: + while True: + resolved_independent = ( + resolved_independent.transform(_clean_dataframe) + .join( + _get_resolver(resolved_independent), + on=["studyId", "explained"], + how="left", + ) + .withColumn( + "is_resolved", + f.when(f.col("new_explained") == f.col("explained"), True).otherwise( + False + ), + ) + .persist() + ) + + if resolved_independent.select("is_resolved").distinct().count() == 1: + break + + # At this point all linked leads are resolved. Now the dataframe needs to be consolidated: + return ( + resolved_independent.drop("is_resolved", "new_explained", "neglog_pval") + .withColumn( + "qualityControl", + adding_quality_flag( + f.col("qualityControl"), + ~f.col("keep_lead"), + f.concat_ws( + " ", + f.lit("Association explained by:"), + f.concat_ws( + ", ", + f.collect_set(f.col("explained")).over( + Window.partitionBy("studyId", "variantId") + ), + ), + ), + ), + ) + # Remove tag information if lead is explained by other variant: + .withColumn( + "tagVariantId", + f.when(f.col("explained") == f.col("variantId"), f.col("tagVariantId")), + ) + .withColumn( + "R_overall", f.when(f.col("tagVariantId").isNotNull(), f.col("R_overall")) + ) + # Drop unused column: + .drop( + "variantPair", + "explained", + "negLogPVal", + "rank", + "keep_lead", + ) + .distinct() + .orderBy("studyId", "variantId") + ) diff --git a/src/etl/gwas_ingest/effect_harmonization.py b/src/etl/gwas_ingest/effect_harmonization.py index 1f349eb3f..d3d59a1f7 100644 --- a/src/etl/gwas_ingest/effect_harmonization.py +++ b/src/etl/gwas_ingest/effect_harmonization.py @@ -8,23 +8,28 @@ from pyspark.sql import types as t from scipy.stats import norm +from etl.common.spark_helpers import adding_quality_flag + if TYPE_CHECKING: from pyspark.sql import Column, DataFrame +# Quality control flags: +PALINDROME_FLAG = "Palindrome alleles - cannot harmonize" + -def pval_to_zscore(pvalcol: Column) -> Column: +def _pval_to_zscore(pval_col: Column) -> Column: """Convert p-value column to z-score column. Args: - pvalcol (Column): pvalues to be casted to floats. + pval_col (Column): pvalues to be casted to floats. 
Returns: Column: p-values transformed to z-scores Examples: - >>> d = d = [{"id": "t1", "pval": "1"}, {"id": "t2", "pval": "0.9"}, {"id": "t3", "pval": "0.05"}, {"id": "t4", "pval": "1e-300"}, {"id": "t5", "pval": "1e-1000"}, {"id": "t6", "pval": "NA"}] + >>> d = [{"id": "t1", "pval": "1"}, {"id": "t2", "pval": "0.9"}, {"id": "t3", "pval": "0.05"}, {"id": "t4", "pval": "1e-300"}, {"id": "t5", "pval": "1e-1000"}, {"id": "t6", "pval": "NA"}] >>> df = spark.createDataFrame(d) - >>> df.withColumn("zscore", pval_to_zscore(f.col("pval"))).show() + >>> df.withColumn("zscore", _pval_to_zscore(f.col("pval"))).show() +---+-------+----------+ | id| pval| zscore| +---+-------+----------+ @@ -38,7 +43,7 @@ def pval_to_zscore(pvalcol: Column) -> Column: """ - pvalue_float = pvalcol.cast(t.FloatType()) + pvalue_float = pval_col.cast(t.FloatType()) pvalue_nozero = f.when(pvalue_float == 0, sys.float_info.min).otherwise( pvalue_float ) @@ -48,134 +53,215 @@ def pval_to_zscore(pvalcol: Column) -> Column: )(pvalue_nozero) -def get_reverse_complement(df: DataFrame, allele_col: str) -> DataFrame: - """Get reverse complement allele of a specified allele column. +def _get_reverse_complement(allele_col: Column) -> Column: + """A function to return the reverse complement of an allele column. + + It takes a string and returns the reverse complement of that string if it's a DNA sequence, + otherwise it returns the original string. Assumes alleles in upper case. Args: - df (DataFrame): input DataFrame - allele_col (str): the name of the column containing the allele + allele_col (Column): The column containing the allele to reverse complement. Returns: - DataFrame: A dataframe with a new column called revcomp_{allele_col} + A column that is the reverse complement of the allele column. + + Examples: + >>> d = [{"allele": 'A'}, {"allele": 'T'},{"allele": 'G'}, {"allele": 'C'},{"allele": 'AC'}, {"allele": 'GTaatc'},{"allele": '?'}, {"allele": None}] + >>> df = spark.createDataFrame(d) + >>> df.withColumn("revcom_allele", _get_reverse_complement(f.col("allele"))).show() + +------+-------------+ + |allele|revcom_allele| + +------+-------------+ + | A| T| + | T| A| + | G| C| + | C| G| + | AC| GT| + |GTaatc| GATTAC| + | ?| ?| + | null| null| + +------+-------------+ + + """ - return df.withColumn( - f"revcomp_{allele_col}", - f.when( - f.col(allele_col).rlike("[ACTG]+"), - f.reverse(f.translate(f.col(allele_col), "ACTG", "TGAC")), - ), - ) + allele_col = f.upper(allele_col) + return f.when( + allele_col.rlike("[ACTG]+"), + f.reverse(f.translate(allele_col, "ACTG", "TGAC")), + ).otherwise(allele_col) -def harmonise_beta(df: DataFrame) -> DataFrame: - """Harmonise betas. +def _harmonize_beta( + effect_size: Column, confidence_interval: Column, needs_harmonization: Column +) -> Column: + """A function to harmonize beta. - The harmonization of the beta follows the logic: - - The beta is flipped (multiplied by -1) if: - 1) the effect needs harmonization and - 2) the annotation of the effect is annotated as decrease - - The 95% confidence interval of the effect is calculated using the z-score - - Irrelevant columns are dropped. + If the confidence interval contains the word "increase" or "decrease" it indicates, we are dealing with betas. + If it's "increase" and the effect size needs to be harmonized, + then multiply the effect size by -1 Args: - df (DataFrame): summary stat DataFrame + effect_size (Column): The effect size column from the dataframe + confidence_interval (Column): The confidence interval of the effect size. 
+ needs_harmonization (Column): a boolean column that indicates whether the effect size needs to be flipped Returns: - DataFrame: input DataFrame with harmonised beta columns: - - beta - - beta_ci_lower - - beta_ci_upper - - beta_direction + A column of the harmonized beta values. + + Examples: + >>> d = [ + ... {"effect_size": 0.6, 'confidence_interval': '[0.1-0.9] unit decrease', 'needs_harmonization': True}, + ... {"effect_size": 0.6, 'confidence_interval': '[0.1-0.9] unit decrease', 'needs_harmonization': False}, + ... {"effect_size": 0.6, 'confidence_interval': '[0.1-0.9] unit increase', 'needs_harmonization': True}, + ... {"effect_size": 0.6, 'confidence_interval': '[0.1-0.9] unit increase', 'needs_harmonization': False}, + ... {"effect_size": 0.6, 'confidence_interval': '[0.1-0.9]', 'needs_harmonization': False}, + ... ] + >>> df = spark.createDataFrame(d) + >>> df.withColumn("beta", _harmonize_beta(f.col("effect_size"), f.col('confidence_interval'), f.col('needs_harmonization'))).show() + +--------------------+-----------+-------------------+----+ + | confidence_interval|effect_size|needs_harmonization|beta| + +--------------------+-----------+-------------------+----+ + |[0.1-0.9] unit de...| 0.6| true| 0.6| + |[0.1-0.9] unit de...| 0.6| false|-0.6| + |[0.1-0.9] unit in...| 0.6| true|-0.6| + |[0.1-0.9] unit in...| 0.6| false| 0.6| + | [0.1-0.9]| 0.6| false|null| + +--------------------+-----------+-------------------+----+ + + """ - # The z-score corresponding to p-value: 0.05 - zscore_95 = 1.96 + # The effect is given as beta, if the confidence interval contains 'increase' or 'decrease' + beta = f.when( + confidence_interval.rlike("|".join(["decrease", "increase"])), + effect_size, + ) + # Flipping beta if harmonization is required or effect negated by saying 'decrease' + return f.when( + (confidence_interval.contains("increase") & needs_harmonization) + | (confidence_interval.contains("decrease") & ~needs_harmonization), + beta * -1, + ).otherwise(beta) + + +def _calculate_beta_ci(beta: Column, zscore: Column, direction: Column) -> Column: + """Calculating confidence intervals for beta values. + + Args: + beta (Column): The beta value for the given row + zscore (Column): The z-score of the beta coefficient. + direction (Column): This is the direction of the confidence interval. It can be either "upper" or "lower". + + Returns: + The upper and lower bounds of the confidence interval for the beta coefficient. + + Examples: + >>> d = [ + ... {"beta": 0.6, 'zscore': 3, 'direction': 'upper'}, + ... {"beta": 0.6, 'zscore': 3, 'direction': 'lower'}, + ... {"beta": 0.6, 'zscore': 3, 'direction': 'something'}, + ... {"beta": None, 'zscore': 3, 'direction': 'lower'}, + ... 
] + >>> df = spark.createDataFrame(d) + >>> df.withColumn("beta_confidence_interval", _calculate_beta_ci(f.col("beta"), f.col('zscore'), f.col('direction'))).show() + +----+---------+------+------------------------+ + |beta|direction|zscore|beta_confidence_interval| + +----+---------+------+------------------------+ + | 0.6| upper| 3| 0.992| + | 0.6| lower| 3| 0.20800000000000002| + | 0.6|something| 3| null| + |null| lower| 3| null| + +----+---------+------+------------------------+ + + """ + zscore_95 = f.lit(1.96) return ( - df.withColumn( - "beta", - f.when( - ( - f.col("confidence_interval").contains("increase") - & f.col("needs_harmonization") - ) - | ( - f.col("confidence_interval").contains("decrease") - & ~f.col("needs_harmonization") - ), - f.col("beta") * -1, - ).otherwise(f.col("beta")), - ) - .withColumn( - "beta_conf_intervals", - f.array( - f.col("beta") - f.lit(zscore_95) * f.col("beta") / f.col("zscore"), - f.col("beta") + f.lit(zscore_95) * f.col("beta") / f.col("zscore"), - ), - ) - .withColumn("beta_ci_lower", f.array_min(f.col("beta_conf_intervals"))) - .withColumn("beta_ci_upper", f.array_max(f.col("beta_conf_intervals"))) - .withColumn( - "beta_direction", - f.when(f.col("beta") >= 0, "+").when(f.col("beta") < 0, "-"), - ) - .drop("beta_conf_intervals") + f.when(direction == "upper", beta + f.abs(zscore_95 * beta) / zscore) + .when(direction == "lower", beta - f.abs(zscore_95 * beta) / zscore) + .otherwise(None) ) -def harmonise_odds_ratio(df: DataFrame) -> DataFrame: - """Harmonise odds ratio. - - The harmonization of the odds ratios follows the logic: - - The effect is flipped (reciprocal value is calculated) if the effect needs harmonization - - The 95% confidence interval is calculated using the z-score - - Irrelevant columns are dropped. +def _harmonize_odds_ratio( + effect_size: Column, confidence_interval: Column, needs_harmonization: Column +) -> Column: + """Harmonizing odds ratio. Args: - df (DataFrame): summary stat DataFrame + effect_size (Column): The effect size column from the dataframe + confidence_interval (Column): The confidence interval of the effect size. + needs_harmonization (Column): a boolean column that indicates whether the odds ratio needs to be flipped Returns: - DataFrame: odds ratio with harmonised OR in columns: - - odds_ratio - - odds_ratio_ci_lower - - odds_ratio_ci_upper - - odds_ratio_direction + A column with the odds ratio, or 1/odds_ratio if harmonization required. + + Examples: + >>> d = [ + ... {"effect_size": 0.6, 'confidence_interval': '[0.1-0.9] unit decrease', 'needs_harmonization': True}, + ... {"effect_size": 0.6, 'confidence_interval': '[0.1-0.9]', 'needs_harmonization': False}, + ... {"effect_size": 0.6, 'confidence_interval': '[0.1-0.9]', 'needs_harmonization': True}, + ... 
] + >>> df = spark.createDataFrame(d) + >>> df.withColumn("odds_ratio", _harmonize_odds_ratio(f.col("effect_size"), f.col('confidence_interval'), f.col('needs_harmonization'))).show() + +--------------------+-----------+-------------------+------------------+ + | confidence_interval|effect_size|needs_harmonization| odds_ratio| + +--------------------+-----------+-------------------+------------------+ + |[0.1-0.9] unit de...| 0.6| true| null| + | [0.1-0.9]| 0.6| false| 0.6| + | [0.1-0.9]| 0.6| true|1.6666666666666667| + +--------------------+-----------+-------------------+------------------+ + + """ - # The z-score corresponding to p-value: 0.05 - zscore_95 = 1.96 + # The confidence interval tells if we are not dealing with betas -> OR + odds_ratio = f.when( + ~confidence_interval.rlike("|".join(["decrease", "increase"])), + effect_size, + ) - return ( - df.withColumn( - "odds_ratio", - f.when(f.col("needs_harmonization"), 1 / f.col("odds_ratio")).otherwise( - f.col("odds_ratio") - ), - ) - .withColumn("odds_ratio_estimate", f.log(f.col("odds_ratio"))) - .withColumn("odds_ratio_se", f.col("odds_ratio_estimate") / f.col("zscore")) - .withColumn( - "odds_ratio_direction", - f.when(f.col("odds_ratio") >= 1, "+").when(f.col("odds_ratio") < 1, "-"), - ) - .withColumn( - "odds_ratio_conf_intervals", - f.array( - f.exp( - f.col("odds_ratio_estimate") - - f.lit(zscore_95) * f.col("odds_ratio_se") - ), - f.exp( - f.col("odds_ratio_estimate") - + f.lit(zscore_95) * f.col("odds_ratio_se") - ), - ), - ) - .withColumn( - "odds_ratio_ci_lower", f.array_min(f.col("odds_ratio_conf_intervals")) - ) - .withColumn( - "odds_ratio_ci_upper", f.array_max(f.col("odds_ratio_conf_intervals")) - ) - .drop("odds_ratio_conf_intervals", "odds_ratio_se", "odds_ratio_estimate") + return f.when(needs_harmonization, 1 / odds_ratio).otherwise(odds_ratio) + + +def _calculate_or_ci(odds_ratio: Column, zscore: Column, direction: Column) -> Column: + """Calculating confidence intervals for odds-ratio values. + + Args: + odds_ratio (Column): The odds ratio of the association between the exposure and outcome. + zscore (Column): The z-score of the confidence interval. + direction (Column): This is either "upper" or "lower" and determines whether the upper or lower confidence interval is calculated. + + Returns: + The upper and lower bounds of the 95% confidence interval for the odds ratio. + + Examples: + >>> d = [ + ... {"odds_ratio": 0.6, 'zscore': 3, 'direction': 'upper'}, + ... {"odds_ratio": 1.6, 'zscore': 3, 'direction': 'lower'}, + ... {"odds_ratio": 0.6, 'zscore': 3, 'direction': 'something'}, + ... {"odds_ratio": None, 'zscore': 3, 'direction': 'lower'}, + ... 
] + >>> df = spark.createDataFrame(d) + >>> df.withColumn("or_confidence_interval", _calculate_or_ci(f.col("odds_ratio"), f.col('zscore'), f.col('direction'))).show() + +---------+----------+------+----------------------+ + |direction|odds_ratio|zscore|or_confidence_interval| + +---------+----------+------+----------------------+ + | upper| 0.6| 3| 0.8377075574145849| + | lower| 1.6| 3| 1.1769597039688107| + |something| 0.6| 3| null| + | lower| null| 3| null| + +---------+----------+------+----------------------+ + + + """ + zscore_95 = f.lit(1.96) + odds_ratio_estimate = f.log(odds_ratio) + odds_ratio_se = odds_ratio_estimate / zscore + return f.when( + direction == "upper", + f.exp(odds_ratio_estimate + f.abs(zscore_95 * odds_ratio_se)), + ).when( + direction == "lower", + f.exp(odds_ratio_estimate - f.abs(zscore_95 * odds_ratio_se)), ) @@ -190,54 +276,94 @@ def harmonize_effect(df: DataFrame) -> DataFrame: """ return ( df - # Get reverse complement of the alleles of the mapped variants: - .transform(lambda df: get_reverse_complement(df, "alt")) - .transform(lambda df: get_reverse_complement(df, "ref")) - # A variant is palindromic if the reference and alt alleles are reverse complement of each other: - # eg. T -> A: in such cases we cannot disambigate the effect, which means we cannot be sure if + # If the alleles are palindromic - the reference and alt alleles are reverse complements of each other: + # eg. T -> A: in such cases we cannot disambiguate the effect, which means we cannot be sure if # the effect is given to the alt allele on the positive strand or the ref allele on - # The negative strand. + # The negative strand. We assume we don't need to harmonise. + # Adding a flag indicating if effect harmonization is required: + .withColumn( + "isPalindrome", + f.when( + f.col("referenceAllele") + == _get_reverse_complement(f.col("alternateAllele")), + True, + ).otherwise(False), + ) + # If a variant is palindromic, we drop the effect size, so no harmonization can happen: .withColumn( - "is_palindrome", - f.when(f.col("ref") == f.col("revcomp_alt"), True).otherwise(False), + "effectSize", + f.when(f.col("isPalindrome"), None).otherwise(f.col("effectSize")), ) - # We are harmonizing the effect on the alternative allele: - # Adding a flag to trigger harmonization if: risk == ref or risk == revcomp(ref): + # As we are calculating the effect on the alternate allele, we have to harmonise the effect + # if the risk allele is the reference allele or the reverse complement of the reference allele. .withColumn( - "needs_harmonization", + "needsHarmonisation", f.when( - (f.col("risk_allele") == f.col("ref")) - | (f.col("risk_allele") == f.col("revcomp_ref")), + (f.col("riskAllele") == f.col("referenceAllele")) + | ( + f.col("riskAllele") + == _get_reverse_complement(f.col("referenceAllele")) + ), True, ).otherwise(False), ) # Z-score is needed to calculate 95% confidence interval: - .withColumn("zscore", pval_to_zscore(f.col("p_value"))) - # Annotation provides information if the effect is odds-ratio or beta: - # Effect is lost for variants with palindromic alleles.
+ .withColumn( + "zscore", + _pval_to_zscore( + f.concat_ws("E", f.col("pValueMantissa"), f.col("pValueExponent")) + ), + ) + # Harmonizing betas + calculate the corresponding confidence interval: .withColumn( "beta", - f.when( - f.col("confidence_interval").rlike(r"[increase|decrease]") - & (~f.col("is_palindrome")), - f.col("or_beta"), + _harmonize_beta( + f.col("effectSize"), + f.col("confidenceInterval"), + f.col("needsHarmonisation"), ), ) + .withColumn( + "beta_ci_upper", + _calculate_beta_ci(f.col("beta"), f.col("zscore"), f.lit("upper")), + ) + .withColumn( + "beta_ci_lower", + _calculate_beta_ci(f.col("beta"), f.col("zscore"), f.lit("lower")), + ) + # Harmonizing odds-ratios + calculate the corresponding confidence interval: .withColumn( "odds_ratio", - f.when( - (~f.col("confidence_interval").rlike(r"[increase|decrease]")) - & (~f.col("is_palindrome")), - f.col("or_beta"), + _harmonize_odds_ratio( + f.col("effectSize"), + f.col("confidenceInterval"), + f.col("needsHarmonisation"), ), ) - # Harmonize beta: - .transform(harmonise_beta) - # Harmonize odds-ratio: - .transform(harmonise_odds_ratio) - # Coalesce effect direction: .withColumn( - "direction", - f.coalesce(f.col("beta_direction"), f.col("odds_ratio_direction")), + "odds_ratio_ci_upper", + _calculate_or_ci(f.col("odds_ratio"), f.col("zscore"), f.lit("upper")), + ) + .withColumn( + "odds_ratio_ci_lower", + _calculate_or_ci(f.col("odds_ratio"), f.col("zscore"), f.lit("lower")), + ) + # Adding QC flag to variants with palindrome alleles: + .withColumn( + "qualityControl", + adding_quality_flag( + f.col("qualityControl"), f.col("isPalindrome"), PALINDROME_FLAG + ), + ) + # Dropping unused columns: + .drop( + "zscore", + "effectSize", + "confidenceInterval", + "needsHarmonisation", + "isPalindrome", + "zscore", + "riskAllele", ) + .persist() ) diff --git a/src/etl/gwas_ingest/ld.py b/src/etl/gwas_ingest/ld.py new file mode 100644 index 000000000..2e25472c8 --- /dev/null +++ b/src/etl/gwas_ingest/ld.py @@ -0,0 +1,404 @@ +"""Performing linkage disequilibrium (LD) operations.""" +from __future__ import annotations + +from functools import reduce +from typing import TYPE_CHECKING + +from pyspark.sql import DataFrame, Window +from pyspark.sql import functions as f + +from etl.common.spark_helpers import ( + get_record_with_maximum_value, + get_record_with_minimum_value, +) +from etl.common.utils import convert_gnomad_position_to_ensembl + +if TYPE_CHECKING: + from omegaconf.listconfig import ListConfig + from hail.table import Table + from pyspark.sql import Column + from etl.common.ETLSession import ETLSession + +import hail as hl +from hail.linalg import BlockMatrix + + +def _liftover_loci(variant_index: Table, grch37_to_grch38_chain_path: str) -> Table: + """Liftover hail table with LD variant index. + + Args: + variant_index (Table): LD variant indexes + grch37_to_grch38_chain_path (str): Path to chain file + + Returns: + Table: LD variant index with locus 38 coordinates + """ + if not hl.get_reference("GRCh37").has_liftover("GRCh38"): + rg37 = hl.get_reference("GRCh37") + rg38 = hl.get_reference("GRCh38") + rg37.add_liftover(grch37_to_grch38_chain_path, rg38) + + return variant_index.annotate(locus38=hl.liftover(variant_index.locus, "GRCh38")) + + +def _interval_start(contig: Column, position: Column, ld_radius: int) -> Column: + """Start position of the interval based on available positions. 
+ + Args: + contig (Column): genomic contigs + position (Column): genomic positions + ld_radius (int): bp around locus + + Returns: + Column: Position of the locus starting the interval + + Examples: + >>> d = [ + ... {"contig": "21", "pos": 100}, + ... {"contig": "21", "pos": 200}, + ... {"contig": "21", "pos": 300}, + ... ] + >>> df = spark.createDataFrame(d) + >>> df.withColumn("start", _interval_start(f.col("contig"), f.col("pos"), 100)).show() + +------+---+-----+ + |contig|pos|start| + +------+---+-----+ + | 21|100| 100| + | 21|200| 100| + | 21|300| 200| + +------+---+-----+ + + + """ + w = Window.partitionBy(contig).orderBy(position).rangeBetween(-ld_radius, ld_radius) + return f.min(position).over(w) + + +def _interval_stop(contig: Column, position: Column, ld_radius: int) -> Column: + """Stop position of the interval based on available positions. + + Args: + contig (Column): genomic contigs + position (Column): genomic positions + ld_radius (int): bp around locus + + Returns: + Column: Position of the locus at the end of the interval + + Examples: + >>> d = [ + ... {"contig": "21", "pos": 100}, + ... {"contig": "21", "pos": 200}, + ... {"contig": "21", "pos": 300}, + ... ] + >>> df = spark.createDataFrame(d) + >>> df.withColumn("start", _interval_stop(f.col("contig"), f.col("pos"), 100)).show() + +------+---+-----+ + |contig|pos|start| + +------+---+-----+ + | 21|100| 200| + | 21|200| 300| + | 21|300| 300| + +------+---+-----+ + + + """ + w = Window.partitionBy(contig).orderBy(position).rangeBetween(-ld_radius, ld_radius) + return f.max(position).over(w) + + +def _annotate_index_intervals(index: DataFrame, ld_radius: int) -> DataFrame: + """Annotate LD index with indexes starting and stopping a given interval. + + Args: + index (DataFrame): LD index + ld_radius (int): radius around each position + + Returns: + DataFrame: including `start_idx` and `stop_idx` + """ + index_with_positions = index.select( + "*", + _interval_start( + contig=f.col("chromosome"), + position=f.col("position"), + ld_radius=ld_radius, + ).alias("start_pos"), + _interval_stop( + contig=f.col("chromosome"), + position=f.col("position"), + ld_radius=ld_radius, + ).alias("stop_pos"), + ) + + return ( + index_with_positions.join( + ( + index_with_positions + # Given the multiple variants with the same chromosome/position can have different indexes, filter for the lowest index: + .transform( + lambda df: get_record_with_minimum_value( + df, ["chromosome", "position"], "idx" + ) + ).select( + "chromosome", + f.col("position").alias("start_pos"), + f.col("idx").alias("start_idx"), + ) + ), + on=["chromosome", "start_pos"], + ) + .join( + ( + index_with_positions + # Given the multiple variants with the same chromosome/position can have different indexes, filter for the highest index: + .transform( + lambda df: get_record_with_maximum_value( + df, ["chromosome", "position"], "idx" + ) + ).select( + "chromosome", + f.col("position").alias("stop_pos"), + f.col("idx").alias("stop_idx"), + ) + ), + on=["chromosome", "stop_pos"], + ) + .drop("start_pos", "stop_pos") + ) + + +def _query_block_matrix( + bm: BlockMatrix, idxs: list[int], starts: list[int], stops: list[int], min_r2: float +) -> DataFrame: + """Query block matrix for idxs rows sparsified by start/stop columns. 
+ + Args: + bm (BlockMatrix): LD matrix containing r values + idxs (List[int]): Row indexes to query (distinct and incremenetal) + starts (List[int]): Interval start column indexes (same size as idxs) + stops (List[int]): Interval start column indexes (same size as idxs) + min_r2 (float): Minimum r2 to keep + + Returns: + DataFrame: i,j,r where i and j are the row and column indexes and r is the LD + + Examples: + >>> import numpy as np + >>> r = np.array([[1, 0.8, 0.7, 0.2], + ... [0.8, 1, 0.6, 0.1], + ... [0.7, 0.6, 1, 0.3], + ... [0.2, 0.1, 0.3, 1]]) + >>> bm_r = BlockMatrix.from_numpy(r) + >>> _query_block_matrix(bm_r, [1, 2], [0, 1], [3, 4], 0.5).show() + +---+---+---+ + | i| j| r| + +---+---+---+ + | 0| 0|0.8| + | 0| 1|1.0| + | 1| 2|1.0| + +---+---+---+ + + """ + bm_sparsified = bm.filter_rows(idxs).sparsify_row_intervals( + starts, stops, blocks_only=True + ) + entries = bm_sparsified.entries(keyed=False) + + return ( + entries.rename({"entry": "r"}) + .to_spark() + .filter(f.col("r") ** 2 >= min_r2) + .withColumn("r", f.when(f.col("r") >= 1, f.lit(1)).otherwise(f.col("r"))) + ) + + +def lead_coordinates_in_ld( + leads_df: DataFrame, +) -> tuple[list[int], list[int], list[int]]: + """Idxs for lead, first variant in the region and last variant in the region. + + Args: + leads_df (DataFrame): Lead variants from `_annotate_index_intervals` output + + Returns: + Tuple[List[int], List[int], List[int]]: Lead, start and stop indexes + """ + intervals = ( + leads_df + # start idx > stop idx in rare occasions due to liftover + .filter(f.col("start_idx") < f.col("stop_idx")) + .groupBy("chromosome", "idx") + .agg(f.min("start_idx").alias("start_idx"), f.max("stop_idx").alias("stop_idx")) + .sort(f.col("idx")) + .persist() + ) + + idxs = intervals.select("idx").rdd.flatMap(lambda x: x).collect() + starts = intervals.select("start_idx").rdd.flatMap(lambda x: x).collect() + stops = intervals.select("stop_idx").rdd.flatMap(lambda x: x).collect() + + return idxs, starts, stops + + +def precompute_ld_index( + pop_ldindex_path: str, + ld_radius: int, + grch37_to_grch38_chain_path: str, +) -> DataFrame: + """Parse LD index and annotate with interval start and stop. + + Args: + pop_ldindex_path (str): path to gnomAD LD index + ld_radius (int): radius + grch37_to_grch38_chain_path (str): path to chain file for liftover + + Returns: + DataFrame: Parsed LD iindex + """ + ld_index = hl.read_table(pop_ldindex_path).naive_coalesce(400) + ld_index_38 = _liftover_loci(ld_index, grch37_to_grch38_chain_path) + + ld_index_spark = ( + ld_index_38.to_spark() + .filter(f.col("`locus38.position`").isNotNull()) + .select( + f.regexp_replace("`locus38.contig`", "chr", "").alias("chromosome"), + f.col("`locus38.position`").alias("position"), + f.col("`alleles`").getItem(0).alias("referenceAllele"), + f.col("`alleles`").getItem(1).alias("alternateAllele"), + f.col("idx"), + ) + # Convert gnomad position to Ensembl position (1-based for indels) + .withColumn( + "position", + convert_gnomad_position_to_ensembl( + f.col("position"), f.col("referenceAllele"), f.col("alternateAllele") + ), + ) + .repartition(400, "chromosome") + .sortWithinPartitions("position") + .persist() + ) + + return _annotate_index_intervals(ld_index_spark, ld_radius) + + +def variants_in_ld_in_gnomad_pop( + etl: ETLSession, + variants_df: DataFrame, + ld_path: str, + parsed_ld_index_path: str, + min_r2: float, +) -> DataFrame: + """Return lead variants with LD annotation. 
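+
+    Descriptive note (added for clarity): the gnomAD block matrix stores only
+    one triangle of the correlation matrix, so it is symmetrised
+    (``bm + bm.T``) before querying. Lead variants are joined to the parsed LD
+    index to obtain their row indexes, the matrix is queried over each lead's
+    interval, and the returned column indexes are mapped back to tag variant
+    identifiers through the same index.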
+ + Args: + etl (ETLSession): Session + variants_df (DataFrame): variants to annotate + ld_path (str): path to LD matrix + parsed_ld_index_path (str): path to LD index + min_r2 (float): minimum r2 to keep + + Returns: + DataFrame: lead variants with LD annotation + """ + # LD blockmatrix and indexes from gnomAD + bm = BlockMatrix.read(ld_path) + bm = bm + bm.T + + parsed_ld_index = etl.spark.read.parquet(parsed_ld_index_path).persist() + + leads_with_idxs = variants_df.join( + parsed_ld_index, + on=["chromosome", "position", "referenceAllele", "alternateAllele"], + ) + + # idxs for lead, first variant in the region and last variant in the region + idxs, starts, stops = lead_coordinates_in_ld(leads_with_idxs) + + etl.logger.info("Querying block matrix...") + entries = _query_block_matrix(bm, idxs, starts, stops, min_r2) + + i_position_lut = etl.spark.createDataFrame(list(enumerate(idxs))).toDF("i", "idx") + + return ( + entries.join( + f.broadcast(leads_with_idxs.join(f.broadcast(i_position_lut), on="idx")), + on="i", + how="inner", + ) + .drop("i", "idx", "start_idx", "stop_idx") + .alias("leads") + .join( + parsed_ld_index.select( + f.col("chromosome"), + f.concat_ws( + "_", + f.col("chromosome"), + f.col("position"), + f.col("referenceAllele"), + f.col("alternateAllele"), + ).alias("tagVariantId"), + f.col("idx").alias("tag_idx"), + ).alias("tags"), + on=[ + f.col("leads.chromosome") == f.col("tags.chromosome"), + f.col("leads.j") == f.col("tags.tag_idx"), + ], + ) + .select( + "variantId", "leads.chromosome", "gnomadPopulation", "tagVariantId", "r" + ) + ) + + +def ld_annotation_by_locus_ancestry( + etl: ETLSession, + association_ancestry: DataFrame, + ld_populations: ListConfig, + min_r2: float, +) -> DataFrame: + """LD information for all locus and ancestries. 
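+
+    Annotation is carried out one population at a time: for every gnomAD
+    population present in the associations, the matching LD matrix and parsed
+    index paths are looked up in the configuration, the lead variants assigned
+    to that population are expanded, and the per-population results are
+    unioned into a single dataframe.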
+ + Args: + etl (ETLSession): Session + association_ancestry (DataFrame): variant-ancestry information + ld_populations (ListConfig): list of populations to annotate + min_r2 (float): minimum r2 to keep + + Returns: + DataFrame: lead variants with LD annotation by relevant ancestry + """ + # All gnomad populations captured in associations: + assoc_populations = ( + association_ancestry.select("gnomadPopulation") + .distinct() + .rdd.flatMap(lambda x: x) + .collect() + ) + + # Retrieve LD information from gnomAD + ld_annotated_assocs = [] + for pop in assoc_populations: + pop_parsed_ldindex_path = "" + pop_matrix_path = "" + for popobj in ld_populations: + if popobj.id == pop: + pop_parsed_ldindex_path = popobj.parsed_index + pop_matrix_path = popobj.matrix + variants_in_pop = association_ancestry.filter( + f.col("gnomadPopulation") == pop + ).distinct() + etl.logger.info(f"[{pop}] - Annotating LD information...") + ld_annotated_assocs.append( + variants_in_ld_in_gnomad_pop( + etl=etl, + variants_df=variants_in_pop, + ld_path=pop_matrix_path, + parsed_ld_index_path=pop_parsed_ldindex_path, + min_r2=min_r2, + ) + ) + + return reduce(DataFrame.unionByName, ld_annotated_assocs) diff --git a/src/etl/gwas_ingest/pics.py b/src/etl/gwas_ingest/pics.py new file mode 100644 index 000000000..2b006a69e --- /dev/null +++ b/src/etl/gwas_ingest/pics.py @@ -0,0 +1,500 @@ +"""Calculate PICS for a given study and locus.""" + +from __future__ import annotations + +import importlib.resources as pkg_resources +import json +from typing import TYPE_CHECKING + +import pyspark.sql.functions as f +import pyspark.sql.types as t +from pyspark.sql import DataFrame, Window +from scipy.stats import norm + +from etl.common.spark_helpers import _neglog_p, adding_quality_flag +from etl.gwas_ingest.clumping import clumping +from etl.gwas_ingest.ld import ld_annotation_by_locus_ancestry +from etl.json import data + +if TYPE_CHECKING: + from omegaconf.listconfig import ListConfig + from pyspark.sql import Column + + from etl.common.ETLSession import ETLSession + + +# Quality control flags: +UNRESOLVED_LEAD_FLAG = "Credible set not resolved" + + +@f.udf(t.DoubleType()) +def _norm_sf(mu: float, std: float, neglog_p: float) -> float | None: + """Returns the survival function of the normal distribution for the p-value. + + Args: + mu (float): mean + std (float): standard deviation + neglog_p (float): negative log p-value + + Returns: + float: survival function + + Examples: + >>> d = [{"mu": 0, "neglog_p": 0, "std": 1}, {"mu": 1, "neglog_p": 10, "std": 10}] + >>> spark.createDataFrame(d).withColumn("norm_sf", _norm_sf(f.col("mu"), f.col("std"), f.col("neglog_p"))).show() + +---+--------+---+-------------------+ + | mu|neglog_p|std| norm_sf| + +---+--------+---+-------------------+ + | 0| 0| 1| 1.0| + | 1| 10| 10|0.36812025069351895| + +---+--------+---+-------------------+ + + """ + try: + return float(norm(mu, std).sf(neglog_p) * 2) + except TypeError: + return None + + +def _get_study_gnomad_ancestries(etl: ETLSession, study_df: DataFrame) -> DataFrame: + """Get all studies and their ancestries. 
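+
+    Discovery samples are exploded, each reported ancestry is mapped to a
+    gnomAD superpopulation via the packaged
+    ``gwascat_2_gnomad_superpopulation.json`` lookup, sample sizes are split
+    evenly across the ancestries listed for a sample, and the aggregated
+    counts are normalised into a ``relativeSampleSize`` per study and
+    population.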
+ + Args: + etl (ETLSession): session + study_df (DataFrame): studies from GWAS Catalog + + Returns: + DataFrame: studies mapped to gnomAD ancestries and their frequencies + """ + # GWAS Catalog to gnomAD superpopulation mapping + gwascat_2_gnomad_pop = etl.spark.createDataFrame( + json.loads( + pkg_resources.read_text( + data, "gwascat_2_gnomad_superpopulation.json", encoding="utf-8" + ) + ) + ) + + # Study ancestries + w_study = Window.partitionBy("studyId") + return ( + study_df + # Excluding studies where no sample discription is provided: + .filter(f.col("discoverySamples").isNotNull()) + # Exploding sample description and study identifier: + .withColumn("discoverySample", f.explode(f.col("discoverySamples"))) + # Splitting sample descriptions further: + .withColumn( + "ancestries", f.split(f.col("discoverySample.ancestry"), r",\s(?![^()]*\))") + ) + # Dividing sample sizes assuming even distribution + .withColumn( + "adjustedSampleSize", + f.col("discoverySample.sampleSize") / f.size(f.col("ancestries")), + ) + # Exploding ancestries + .withColumn("gwas_catalog_ancestry", f.explode(f.col("ancestries"))) + # map gwas population to gnomad superpopulation + .join(gwascat_2_gnomad_pop, on="gwas_catalog_ancestry", how="left") + # Group by sutdies and aggregate for major population: + .groupBy("studyId", "gnomadPopulation") + .agg(f.sum(f.col("adjustedSampleSize")).alias("sampleSize")) + # Calculate proportions for each study + .withColumn( + "relativeSampleSize", + f.col("sampleSize") / f.sum("sampleSize").over(w_study), + ) + .drop("sampleSize") + ) + + +def _weighted_r_overall( + chromosome: Column, + study_id: Column, + variant_id: Column, + tag_variant_id: Column, + relative_sample_size: Column, + r: Column, +) -> Column: + """Aggregation of weighted R information using ancestry proportions. + + Args: + chromosome (Column): Chromosome + study_id (Column): Study identifier + variant_id (Column): Variant identifier + tag_variant_id (Column): Tag variant identifier + relative_sample_size (Column): Relative sample size + r (Column): Correlation + + Returns: + Column: Estimates weighted R information + """ + pseudo_r = f.when(r >= 1, 0.9999995).otherwise(r) + zscore_overall = f.sum(f.atan(pseudo_r) * relative_sample_size).over( + Window.partitionBy(chromosome, study_id, variant_id, tag_variant_id) + ) + return f.round(f.tan(zscore_overall), 6) + + +def _is_in_credset( + chromosome: Column, + study_id: Column, + variant_id: Column, + pics_postprob: Column, + credset_probability: float, +) -> Column: + """Check whether a variant is in the XX% credible set. 
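+
+    Tags are ranked by decreasing posterior probability within each
+    (chromosome, study, lead variant) group and a cumulative sum is taken; a
+    tag is kept while the cumulative probability of the tags ranked before it
+    is still below ``credset_probability``, so the first tag that pushes the
+    total over the threshold is included in the set.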
+ + Args: + chromosome (Column): Chromosome column + study_id (Column): Study ID column + variant_id (Column): Variant ID column + pics_postprob (Column): PICS posterior probability column + credset_probability (float): Credible set probability + + Returns: + Column: Whether the variant is in the credible set + """ + w_cumlead = ( + Window.partitionBy(chromosome, study_id, variant_id) + .orderBy(f.desc(pics_postprob)) + .rowsBetween(Window.unboundedPreceding, Window.currentRow) + ) + pics_postprob_cumsum = f.sum(pics_postprob).over(w_cumlead) + w_credset = Window.partitionBy(chromosome, study_id, variant_id).orderBy( + pics_postprob_cumsum + ) + return ( + # If posterior probability is null, credible set flag is False: + f.when(pics_postprob.isNull(), False) + # If the posterior probability meets the criteria the flag is True: + .when( + f.lag(pics_postprob_cumsum, 1).over(w_credset) >= credset_probability, False + ) + # IF criteria is not met, flag is False: + .otherwise(True) + ) + + +def _pics_posterior_probability( + pics_mu: Column, + pics_std: Column, + neglog_p: Column, + chromosome: Column, + study_id: Column, + variant_id: Column, +) -> Column: + """Compute the PICS posterior probability. + + Args: + pics_mu (Column): PICS mu + pics_std (Column): PICS standard deviation + neglog_p (Column): Negative log p-value + chromosome (Column): Chromosome column + study_id (Column): Study ID column + variant_id (Column): Variant ID column + + Returns: + Column: PICS posterior probability + """ + pics_relative_prob = f.when(pics_std == 0, 1.0).otherwise( + _norm_sf(pics_mu, pics_std, neglog_p) + ) + w_lead = Window.partitionBy(chromosome, study_id, variant_id) + pics_relative_prob_sum = f.sum(pics_relative_prob).over(w_lead) + return pics_relative_prob / pics_relative_prob_sum + + +def _pics_standard_deviation(neglog_p: Column, r: Column, k: float) -> Column: + """Compute the PICS standard deviation. + + Args: + neglog_p (Column): Negative log p-value + r (Column): R-squared + k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended. + + Returns: + Column: PICS standard deviation + + Examples: + >>> k = 6.4 + >>> d = [(1.0, 1.0), (10.0, 1.0), (10.0, 0.5), (100.0, 0.5), (1.0, 0.0)] + >>> spark.createDataFrame(d).toDF("neglog_p", "r").withColumn("std", _pics_standard_deviation(f.col("neglog_p"), f.col("r"), k)).show() + +--------+---+-----------------+ + |neglog_p| r| std| + +--------+---+-----------------+ + | 1.0|1.0| 0.0| + | 10.0|1.0| 0.0| + | 10.0|0.5|1.571749395040553| + | 100.0|0.5|4.970307999319905| + | 1.0|0.0| 0.5| + +--------+---+-----------------+ + + """ + return f.sqrt(1 - f.abs(r) ** k) * f.sqrt(neglog_p) / 2 + + +def _pics_mu(neglog_p: Column, r: Column) -> Column: + """Compute the PICS mu. + + Args: + neglog_p (Column): Negative log p-value + r (Column): R + + Returns: + Column: PICS mu + + Examples: + >>> d = [(1.0, 1.0), (10.0, 1.0), (10.0, 0.5), (100.0, 0.5), (1.0, 0.0)] + >>> spark.createDataFrame(d).toDF("neglog_p", "r").withColumn("mu", _pics_mu(f.col("neglog_p"), f.col("r"))).show() + +--------+---+----+ + |neglog_p| r| mu| + +--------+---+----+ + | 1.0|1.0| 1.0| + | 10.0|1.0|10.0| + | 10.0|0.5| 2.5| + | 100.0|0.5|25.0| + | 1.0|0.0| 0.0| + +--------+---+----+ + + """ + return neglog_p * (r**2) + + +def _flag_partial_mapped( + study_id: Column, variant_id: Column, tag_variant_id: Column +) -> Column: + """Generate flag for lead/tag pairs. + + Some lead variants can be resolved in one population but not in other. 
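+    For such leads, the left join against the LD expansion produces rows with
+    a null ``tagVariantId`` for the unresolved population.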
Those rows interfere with PICS calculation, so they needs to be dropped. + + Args: + study_id (Column): Study identifier column + variant_id (Column): Identifier of the lead variant + tag_variant_id (Column): Identifier of the tag variant + + Returns: + Column: Boolean + + Examples: + >>> data = [ + ... ('study_1', 'lead_1', 'tag_1'), # <- keep row as tag available. + ... ('study_1', 'lead_1', 'tag_2'), # <- keep row as tag available. + ... ('study_1', 'lead_2', 'tag_3'), # <- keep row as tag available + ... ('study_1', 'lead_2', None), # <- drop row as lead 2 is resolved. + ... ('study_1', 'lead_3', None) # <- keep row as lead 3 is not resolved. + ... ] + >>> ( + ... spark.createDataFrame(data, ['studyId', 'variantId', 'tagVariantId']) + ... .withColumn("flag_to_keep_tag", _flag_partial_mapped(f.col('studyId'), f.col('variantId'), f.col('tagVariantId'))) + ... .show() + ... ) + +-------+---------+------------+----------------+ + |studyId|variantId|tagVariantId|flag_to_keep_tag| + +-------+---------+------------+----------------+ + |study_1| lead_3| null| true| + |study_1| lead_1| tag_1| true| + |study_1| lead_1| tag_2| true| + |study_1| lead_2| tag_3| true| + |study_1| lead_2| null| false| + +-------+---------+------------+----------------+ + + """ + has_resolved_tag = f.when( + f.array_contains( + f.collect_set(tag_variant_id.isNotNull()).over( + Window.partitionBy(study_id, variant_id) + ), + True, + ), + True, + ).otherwise(False) + + return f.when(tag_variant_id.isNotNull() | ~has_resolved_tag, True).otherwise(False) + + +def calculate_pics( + associations_ld_allancestries: DataFrame, + k: float, +) -> DataFrame: + """It calculates the PICS statistics for each variant in the input DataFrame. + + Args: + associations_ld_allancestries (DataFrame): DataFrame + k (float): float + + Returns: + A dataframe with the columns: + - pics_mu + - pics_std + - pics_postprob + - pics_95_perc_credset + - pics_99_perc_credset + """ + # Calculate and return PICS statistics + return ( + associations_ld_allancestries.withColumn( + "pics_mu", + _pics_mu( + _neglog_p(f.col("pValueMantissa"), f.col("pValueExponent")), + f.col("R_overall"), + ), + ) + .withColumn( + "pics_std", + _pics_standard_deviation( + _neglog_p(f.col("pValueMantissa"), f.col("pValueExponent")), + f.col("R_overall"), + k, + ), + ) + # Some overall_R are suspiciously high. For such cases, pics_std fails. Drop them: + .filter(~f.isnan(f.col("pics_std"))) + .withColumn( + "pics_postprob", + _pics_posterior_probability( + f.col("pics_mu"), + f.col("pics_std"), + _neglog_p(f.col("pValueMantissa"), f.col("pValueExponent")), + f.col("chromosome"), + f.col("studyId"), + f.col("variantId"), + ), + ) + .withColumn( + "pics_95_perc_credset", + _is_in_credset( + f.col("chromosome"), + f.col("studyId"), + f.col("variantId"), + f.col("pics_postprob"), + 0.95, + ), + ) + .withColumn( + "pics_99_perc_credset", + _is_in_credset( + f.col("chromosome"), + f.col("studyId"), + f.col("variantId"), + f.col("pics_postprob"), + 0.99, + ), + ) + ) + + +def pics_all_study_locus( + etl: ETLSession, + associations: DataFrame, + studies: DataFrame, + ld_populations: ListConfig, + min_r2: float, + k: float, +) -> DataFrame: + """Calculates study-locus based on PICS. 
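+
+    Summary of the processing steps (added for clarity): map each study to its
+    gnomAD ancestries, LD expand every lead variant in the relevant
+    ancestries, aggregate the per-ancestry r values into an ancestry weighted
+    R, clump non-independent associations together, and finally compute the
+    PICS statistics.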
+ + It takes in a dataframe of associations, a dataframe of studies, a list of LD populations, a minimum + R^2, and a constant k, and returns a dataframe of PICS results + + Args: + etl (ETLSession): ETLSession + associations (DataFrame): DataFrame + studies (DataFrame): DataFrame + ld_populations (ListConfig): ListConfig = ListConfig( + min_r2 (float): Minimum R^2 + k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended. + + Returns: + DataFrame: _description_ + """ + # Extracting ancestry information from study table, then map to gnomad population: + gnomad_mapped_studies = _get_study_gnomad_ancestries( + etl, studies.withColumnRenamed("id", "studyId") + ) + print("gnomad_mapped_studies") + print(gnomad_mapped_studies.show()) + # Joining to associations: + association_gnomad = associations.join( + gnomad_mapped_studies, + on="studyId", + how="left", + ).persist() + print("association_gnomad") + print(association_gnomad.show()) + # Extracting mapped variants for LD expansion: + variant_population = ( + association_gnomad.filter(f.col("position").isNotNull()) + .select( + "variantId", + "gnomadPopulation", + "chromosome", + "position", + "referenceAllele", + "alternateAllele", + ) + .distinct() + ) + print(variant_population.show()) + # Number of distinct variants/population pairs to map: + etl.logger.info(f"Number of variant/ancestry pairs: {variant_population.count()}") + etl.logger.info( + f'Number of unique variants: {variant_population.select("variantId").distinct().count()}' + ) + + # LD information for all locus and ancestries + ld_r = ld_annotation_by_locus_ancestry( + etl, variant_population, ld_populations, min_r2 + ).persist() + + # Joining association with linked variants (while keeping unresolved associations). + association_ancestry_ld = ( + association_gnomad.join( + ld_r, on=["chromosome", "variantId", "gnomadPopulation"], how="left" + ) + # It is possible that a lead variant is resolved in the LD panel of a population, but not in the other. + # Once the linked variants are joined this leads to unnecessary nulls. 
To avoid this, adding + # a flag indicating if a lead variant is resolved in the LD panel for at least one population: + .withColumn( + "flag_to_keep_tag", + _flag_partial_mapped( + f.col("studyId"), f.col("variantId"), f.col("tagVariantId") + ), + ) + # Dropping rows where no tag is available, however the lead is resolved: + .filter(f.col("flag_to_keep_tag")) + # Dropping unused column: + .drop("flag_to_keep_tag") + # Updating qualityControl for unresolved associations: + .withColumn( + "qualityControl", + adding_quality_flag( + f.col("qualityControl"), + f.col("tagVariantId").isNull(), + UNRESOLVED_LEAD_FLAG, + ), + ).distinct() + ) + + # Aggregation of weighted R information using ancestry proportions + associations_ld_allancestries = ( + association_ancestry_ld.withColumn( + "R_overall", + _weighted_r_overall( + f.col("chromosome"), + f.col("studyId"), + f.col("variantId"), + f.col("tagVariantId"), + f.col("relativeSampleSize"), + f.col("r"), + ), + ) + # Collapse the data by study, lead, tag + .drop("relativeSampleSize", "r", "gnomadPopulation").distinct() + # Clumping non-independent associations together: + .transform(clumping) + ) + + pics_results = calculate_pics(associations_ld_allancestries, k) + + return pics_results diff --git a/src/etl/gwas_ingest/process_associations.py b/src/etl/gwas_ingest/process_associations.py index 9a07810f3..504784039 100644 --- a/src/etl/gwas_ingest/process_associations.py +++ b/src/etl/gwas_ingest/process_associations.py @@ -1,20 +1,32 @@ """Process GWAS catalog associations.""" from __future__ import annotations +import importlib.resources as pkg_resources +import json from typing import TYPE_CHECKING import numpy as np from pyspark.sql import functions as f from pyspark.sql import types as t -from pyspark.sql.window import Window +from etl.common.spark_helpers import adding_quality_flag +from etl.gwas_ingest.association_mapping import clean_mappings, map_variants +from etl.gwas_ingest.effect_harmonization import harmonize_effect from etl.gwas_ingest.study_ingestion import parse_efos +from etl.json import data if TYPE_CHECKING: from pyspark.sql import Column, DataFrame from etl.common.ETLSession import ETLSession + +# Quality control flags: +SUBSIGNIFICANT_FLAG = "Subsignificant p-value" +INCOMPLETE_MAPPING_FLAG = "Incomplete genomic mapping" +COMPOSITE_FLAG = "Composite association" +INCONSISTENCY_FLAG = "Variant inconsistency" + ASSOCIATION_COLUMNS_MAP = { # Variant related columns: "STRONGEST SNP-RISK ALLELE": "strongestSnpRiskAllele", # variant id and the allele is extracted (; separated list) @@ -27,116 +39,106 @@ # Study related columns: "STUDY ACCESSION": "studyAccession", # Disease/Trait related columns: - "DISEASE/TRAIT": "diseaseTrait", # Reported trait of the study + "DISEASE/TRAIT": "associationDiseaseTrait", # Reported trait of the study "MAPPED_TRAIT_URI": "mappedTraitUri", # Mapped trait URIs of the study - "MERGED": "merged", - "P-VALUE (TEXT)": "pvalueText", # Extra information on the association. + # "MERGED": "merged", + "P-VALUE (TEXT)": "pValueText", # Extra information on the association. # Association details: - "P-VALUE": "pvalue", # p-value of the association, string: split into exponent and mantissa. - "PVALUE_MLOG": "pvalueMlog", # -log10(p-value) of the association, float + "P-VALUE": "pValue", # p-value of the association, string: split into exponent and mantissa. + "PVALUE_MLOG": "pValueNeglog", # -log10(p-value) of the association, float # No longer needed. Can be calculated. 
"OR or BETA": "effectSize", # Effect size of the association, Odds ratio or beta "95% CI (TEXT)": "confidenceInterval", # Confidence interval of the association, string: split into lower and upper bound. - "CONTEXT": "context", + # "CONTEXT": "context", } -def filter_assoc_by_maf(associations: DataFrame) -> DataFrame: - """Filter associations by Minor Allele Frequency. - - If an association is mapped to multiple concordant (or at least not discordant) variants, - we only keep the one that has the highest minor allele frequency. - - This filter is based on the assumption that GWAS Studies are focusing on common variants mostly. - This filter is especially designed to filter out rare alleles of multiallelics with matching rsIDs. +@f.udf(t.ArrayType(t.StringType(), containsNull=True)) +def _clean_pvaluetext(s: str) -> list[str] | None: + """User defined function for PySpark to clean comma separated texts. Args: - associations (DataFrame): associations + s (str): str Returns: - DataFrame: associations filtered by allelic frequency + A list of strings. """ - # parsing population names from schema: - for field in associations.schema.fields: - if field.name == "alleleFrequencies" and isinstance( - field.dataType, t.StructType - ): - pop_names = field.dataType.fieldNames() - break + if s: + return [w.strip() for w in s.split(",")] + else: + return [] - def af2maf(c: Column) -> Column: - """Column function to calculate minor allele frequency from allele frequency.""" - return f.when(c > 0.5, 1 - c).otherwise(c) - # Windowing through all associations. Within an association, rows are ordered by the maximum MAF: - w = Window.partitionBy("associationId").orderBy(f.desc("maxMAF")) +def _parse_pvaluetext_ancestry(pvaluetext: Column) -> Column: + """Parsing ancestry information captured in p-value text. - return ( - associations.withColumn( - "maxMAF", - f.array_max( - f.array( - *[af2maf(f.col(f"alleleFrequencies.{pop}")) for pop in pop_names] - ) - ), - ) - .withColumn("row_number", f.row_number().over(w)) - .filter(f.col("row_number") == 1) - .drop("row_number", "alleleFrequencies") - .persist() + If the pValueText column contains the word "uropean", then return "nfe". If it contains the word + "frican", then return "afr" + + Args: + pvaluetext (Column): The column that contains the parsed p-value text + + Returns: + A column with the values "nfe" or "afr" + """ + return f.when(pvaluetext.contains("uropean"), "nfe").when( + pvaluetext.contains("frican"), "afr" ) -def concordance_filter(df: DataFrame) -> DataFrame: - """Concordance filter. +def _pvalue_text_resolve(df: DataFrame, etl: ETLSession) -> DataFrame: + """Parsind p-value text. - This function filters for variants with concordant alleles with the association's reported risk allele. - A risk allele is considered concordant if: - - equal to the alt allele - - equal to the ref allele - - equal to the revese complement of the alt allele. - - equal to the revese complement of the ref allele. - - Or the risk allele is ambigious, noted by '?' + Parsing `pValueText` column + mapping abbreviations reference list. + `pValueText` that has been cleaned and resolved to a standardised format. + `pValueAncestry` column with parsed ancestry information Args: - df (DataFrame): associations + df (DataFrame): DataFrame + etl (ETLSession): ETLSession Returns: - DataFrame: associations filtered for variants with concordant alleles with the risk allele. + A dataframe with the p-value text resolved. 
""" + columns = df.columns + + # GWAS Catalog to p-value mapping + pval_mapping = etl.spark.createDataFrame( + json.loads( + pkg_resources.read_text( + data, "gwas_pValueText_resolve.json", encoding="utf-8" + ) + ) + ).withColumn("full_description", f.lower(f.col("full_description"))) + + # Explode pvalue-mappings: return ( df - # Adding column with the reverse-complement of the risk allele: + # Cleaning and fixing p-value text: .withColumn( - "riskAlleleReverseComplement", - f.when( - f.col("riskAllele").rlike(r"^[ACTG]+$"), - f.reverse(f.translate(f.col("riskAllele"), "ACTG", "TGAC")), - ).otherwise(f.col("riskAllele")), + "pValueText", + _clean_pvaluetext(f.regexp_replace(f.col("pValueText"), r"[\(\)]", "")), ) - # Adding columns flagging concordance: - .withColumn( - "isConcordant", - # If risk allele is found on the positive strand: - f.when( - (f.col("riskAllele") == f.col("referenceAllele")) - | (f.col("riskAllele") == f.col("alternateAllele")), - True, - ) - # If risk allele is found on the negative strand: - .when( - (f.col("riskAlleleReverseComplement") == f.col("referenceAllele")) - | (f.col("riskAlleleReverseComplement") == f.col("alternateAllele")), - True, - ) - # If risk allele is ambiguous, still accepted: < This condition could be reconsidered - .when(f.col("riskAllele") == "?", True) - # Allele is discordant: - .otherwise(False), + .select( + # Exploding p-value text: + *[ + f.explode_outer(col).alias(col) if col == "pValueText" else col + for col in columns + ] ) - # Dropping discordant associations: - .filter(f.col("isConcordant")) - .drop("isConcordant", "riskAlleleReverseComplement") - .persist() + # Join with mapping: + .join(pval_mapping, on="pValueText", how="left") + # Coalesce mappings: + .withColumn("pValueText", f.coalesce("full_description", "pValueText")) + # Resolve ancestries: + .withColumn("pValueAncestry", _parse_pvaluetext_ancestry(f.col("pValueText"))) + # Aggregate data: + .groupBy(*[col for col in columns if col != "pValueText"]) + .agg( + f.collect_set(f.col("pValueAncestry")).alias("pValueAncestry"), + f.collect_set(f.col("pValueText")).alias("pValueText"), + ) + .withColumn("pValueText", f.concat_ws(", ", f.col("pValueText"))) + .withColumn("pValueAncestry", f.concat_ws(", ", f.col("pValueAncestry"))) ) @@ -146,7 +148,7 @@ def read_associations_data( """Read GWASCatalog associations. It reads the GWAS Catalog association dataset, selects and renames columns, casts columns, and - applies some pre-defined filters on the data + flags associations that do not reach GWAS significance level. Args: etl (ETLSession): current ETL session @@ -166,19 +168,39 @@ def read_associations_data( *[ f.col(old_name).alias(new_name) for old_name, new_name in ASSOCIATION_COLUMNS_MAP.items() - ] + ], + f.monotonically_increasing_id().alias("associationId"), + ) + # Based on pre-defined filters set flag for failing associations: + # 1. Flagging associations based on variant x variant interactions + .withColumn( + "qualityControl", + adding_quality_flag( + f.array(), + f.col("pValueNegLog") < -np.log10(pvalue_cutoff), + SUBSIGNIFICANT_FLAG, + ), + ) + # 2. Flagging sub-significant associations + .withColumn( + "qualityControl", + adding_quality_flag( + f.col("qualityControl"), + f.col("position").isNull() & f.col("chromosome").isNull(), + INCOMPLETE_MAPPING_FLAG, + ), ) - # Cast minus log p-value as float: - .withColumn("pvalueMlog", f.col("pvalueMlog").cast(t.FloatType())) - # Apply some pre-defined filters on the data: - # 1. 
Dropping associations based on variant x variant interactions - # 2. Dropping sub-significant associations - # 3. Dropping associations without genomic location - .filter( - ~f.col("chrId").contains(" x ") - & (f.col("pvalueMlog") >= -np.log10(pvalue_cutoff)) - & (f.col("chrPos").isNotNull() & f.col("chrId").isNotNull()) - ).persist() + # 3. Flagging associations without genomic location + .withColumn( + "qualityControl", + adding_quality_flag( + f.col("qualityControl"), + f.col("chromosome").contains(" x "), + COMPOSITE_FLAG, + ), + ) + # Parsing association EFO: + .withColumn("associationEfos", parse_efos("mappedTraitUri")).persist() ) # Providing stats on the filtered association dataset: @@ -189,194 +211,158 @@ def read_associations_data( etl.logger.info( f'Number of variants: {association_df.select("snpIds").distinct().count()}' ) + etl.logger.info( + f'Number of failed associations: {association_df.filter(f.size(f.col("qualityControl")) != 0).count()}' + ) return association_df -def process_associations(association_df: DataFrame, etl: ETLSession) -> DataFrame: - """Post-process associations DataFrame. +def deconvolute_variants(associations: DataFrame) -> DataFrame: + """Deconvoluting the list of variants attached to GWAS associations. - - The function does the following: - - Adds a unique identifier to each association. - - Processes the variant related columns. - - Processes the EFO terms. - - Splits the p-value into exponent and mantissa. - - Drops some columns. - - Provides some stats on the filtered association dataset. + We split the variant notation (chr, pos, snp id) into arrays, and flag associations where the + number of genomic location is not consistent with the number of rsids Args: - association_df (DataFrame): associations - etl (ETLSession): current ETL session + associations (DataFrame): DataFrame Returns: - DataFrame: associations including the next columns: - - associationId - - snpIdCurrent - - chrId - - chrPos - - snpIds - - riskAllele - - rsidGwasCatalog - - efo - - exponent - - mantissa + DataFrame: Flagged associations with inconsistent mappings. """ - # Processing associations: - parsed_associations = ( - association_df.select( - # Adding association identifier for future deduplication: - f.monotonically_increasing_id().alias("associationId"), - # Processing variant related columns: - # - Sorting out current rsID field: <- why do we need this? rs identifiers should always come from the GnomAD dataset. - # - Removing variants with no genomic mappings -> losing ~3% of all associations - # - Multiple variants can correspond to a single association. - # - Variant identifiers are stored in the SNPS column, while the mapped coordinates are stored in the CHR_ID and CHR_POS columns. - # - All these fields are split into arrays, then they are paired with the same index eg. first ID is paired with first coordinate, and so on - # - Then the association is exploded to all variants. - # - The risk allele is extracted from the 'STRONGEST SNP-RISK ALLELE' column. - # The current snp id field is just a number at the moment (stored as a string). Adding 'rs' prefix if looks good. 
- f.when( - f.col("snpIdCurrent").rlike("^[0-9]*$"), - f.format_string("rs%s", f.col("snpIdCurrent")), - ) - .otherwise(f.col("snpIdCurrent")) - .alias("snpIdCurrent"), - # Variant fields are joined together in a matching list, then extracted into a separate rows again: - f.explode( - f.arrays_zip( - f.split(f.col("chromosome"), ";"), - f.split(f.col("position"), ";"), - f.split(f.col("strongestSnpRiskAllele"), "; "), - f.split(f.col("snpIds"), "; "), - ) - ).alias("VARIANT"), - # Extracting variant fields: - f.col("VARIANT.chromosome").alias("chromosome"), - f.col("VARIANT.position").alias("position"), - f.col("VARIANT.snpIds").alias("snpIds"), - f.col("VARIANT.strongestSnpRiskAllele").alias("strongestSnpRiskAllele"), + return ( + associations + # For further tracking, we save the variant of the association before splitting: + .withColumn("gwasVariant", f.col("snpIds")) + # Variant notations (chr, pos, snp id) are split into arrays: + .withColumn("chromosome", f.split(f.col("chromosome"), ";")) + .withColumn("position", f.split(f.col("position"), ";")) + .withColumn( + "strongestSnpRiskAllele", + f.split(f.col("strongestSnpRiskAllele"), "; "), ) - .select( - "*", - f.split(f.col("strongestSnpRiskAllele"), "-") - .getItem(1) - .alias("riskAllele"), - # Create a unique set of SNPs linked to the assocition: - f.array_distinct( - f.array( - f.split(f.col("strongestSnpRiskAllele"), "-").getItem(0), - f.col("snpIdCurrent"), - f.col("snpIds"), - ) - ).alias("rsIdsGwasCatalog"), - # Processing EFO terms: - # - Multiple EFO terms can correspond to a single association. - # - EFO terms are stored as full URIS, separated by semicolons. - # - Associations are exploded to all EFO terms. - # - EFO terms in the study table is not considered as association level EFO annotation has priority (via p-value text) - # Process EFO URIs: -> why do we explode? - parse_efos("mappedTraitUri").alias("efos"), - f.split(f.col("pvalue"), "E").getItem(1).cast("integer").alias("exponent"), - f.split(f.col("pvalue"), "E").getItem(0).cast("float").alias("mantissa"), + .withColumn("snpIds", f.split(f.col("snpIds"), "; ")) + # Flagging associations where the genomic location is not consistent with rsids: + .withColumn( + "qualityControl", + adding_quality_flag( + f.col("qualityControl"), + # Number of chromosome values different from position values: + (f.size(f.col("chromosome")) != f.size(f.col("position"))) | + # NUmber of chromosome values different from riskAllele values: + ( + f.size(f.col("chromosome")) + != f.size(f.col("strongestSnpRiskAllele")) + ), + INCONSISTENCY_FLAG, + ), ) - # Cleaning up: - .drop("mappedTraitUri", "strongestSnpRiskAllele", "VARIANT") - .persist() ) - # Providing stats on the filtered association dataset: - etl.logger.info(f"Number of associations: {parsed_associations.count()}") - etl.logger.info( - f'Number of studies: {parsed_associations.select("studyAccession").distinct().count()}' - ) - etl.logger.info( - f'Number of variants: {parsed_associations.select("snpIds").distinct().count()}' - ) - return parsed_associations +def _collect_rsids( + snp_id: Column, snp_id_current: Column, risk_allele: Column +) -> Column: + """It takes three columns, and returns an array of distinct values from those columns. + Args: + snp_id (Column): The original snp id from the GWAS catalog. + snp_id_current (Column): The current snp id field is just a number at the moment (stored as a string). Adding 'rs' prefix if looks good. The + risk_allele (Column): The risk allele for the SNP. 
+ + Returns: + An array of distinct values. + """ + # The current snp id field is just a number at the moment (stored as a string). Adding 'rs' prefix if looks good. + snp_id_current = f.when( + snp_id_current.rlike("^[0-9]*$"), + f.format_string("rs%s", snp_id_current), + ) + # Cleaning risk allele: + risk_allele = f.split(risk_allele, "-").getItem(0) -def deduplicate(df: DataFrame) -> DataFrame: - """Deduplicate DataFrame (not implemented).""" - raise NotImplementedError + # Collecting all values: + return f.array_distinct(f.array(snp_id, snp_id_current, risk_allele)) -def filter_assoc_by_rsid(df: DataFrame) -> DataFrame: - """Filter associations by rsid. +def splitting_association(association: DataFrame) -> DataFrame: + """Splitting associations based on the list of parseable variants from the GWAS Catalog. Args: - df (DataFrame): associations requiring: - - associationId - - rsIdsGwasCatalog - - rsIdsGnomad + association (DataFrame): DataFrame Returns: - DataFrame: filtered associations + A dataframe with the same columns as the input dataframe, but with the variants exploded. """ - w = Window.partitionBy("associationId") + cols = association.columns + variant_cols = [ + "chromosome", + "position", + "strongestSnpRiskAllele", + "snpIds", + ] return ( - df - # See if the GnomAD variant that was mapped to a given association has a matching rsId: + association + # Pairing together matching chr:pos:rsid in a list of structs: .withColumn( - "matchingRsId", + "variants", f.when( - f.size( - f.array_intersect(f.col("rsIdsGwasCatalog"), f.col("rsIdsGnomad")) - ) - > 0, - True, - ).otherwise(False), + ~f.array_contains(f.col("qualityControl"), "Variant inconsistency"), + f.arrays_zip( + f.col("chromosome"), + f.col("position"), + f.col("strongestSnpRiskAllele"), + f.col("snpIds"), + ), + ).otherwise(None), + ) + # Exploding all variants: + .select( + *cols, + f.explode_outer("variants").alias("variant"), ) + # Updating chr, pos, rsid, risk-snp-allele columns: + .select( + *[ + f.col(f"variant.{col}").alias(col) if col in variant_cols else col + for col in cols + ], + # Extract p-value exponent: + f.split(f.col("pvalue"), "E") + .getItem(1) + .cast("integer") + .alias("pValueExponent"), + # Extract p-value mantissa: + f.split(f.col("pvalue"), "E") + .getItem(0) + .cast("float") + .alias("pValueMantissa"), + ) + # Extracting risk allele: .withColumn( - "successfulMappingExists", - f.when( - f.array_contains(f.collect_set(f.col("matchingRsId")).over(w), True), - True, - ).otherwise(False), + "riskAllele", f.split(f.col("strongestSnpRiskAllele"), "-").getItem(1) ) - .filter( - (f.col("matchingRsId") & f.col("successfulMappingExists")) - | (~f.col("matchingRsId") & ~f.col("successfulMappingExists")) + # Creating list of rsIds from gwas catalog dataset: + .withColumn( + "rsIdsGwasCatalog", + _collect_rsids( + f.col("snpIds"), f.col("snpIdCurrent"), f.col("strongestSnpRiskAllele") + ), + ) + # Dropping some un-used column: + .drop( + "pValue", + "riskAlleleFrequency", + "cnv", + "strongestSnpRiskAllele", + "snpIdCurrent", + "snpIds", + "pValue", + "mappedTraitUri", + "pValueNegLog", ) - .drop("successfulMappingExists", "matchingRsId") - .persist() - ) - - -def map_variants( - parsed_associations: DataFrame, variant_annotation_path: str, etl: ETLSession -) -> DataFrame: - """Add variant metadata in associations. 
- - Args: - parsed_associations (DataFrame): associations - etl (ETLSession): current ETL session - variant_annotation_path (str): variant annotation path - - Returns: - DataFrame: associations with variant metadata - """ - variants = etl.spark.read.parquet(variant_annotation_path).select( - f.col("id").alias("variantId"), - "chromosome", - "position", - "rsIdsGnomad", - "referenceAllele", - "alternateAllele", - "alleleFrequencies", - ) - - mapped_associations = variants.join( - f.broadcast(parsed_associations), on=["chromosome", "position"], how="right" - ).persist() - assoc_without_variant = mapped_associations.filter( - f.col("variantId").isNull() - ).count() - etl.logger.info( - f"Loading variant annotation and joining with associations... {assoc_without_variant} associations outside gnomAD" ) - return mapped_associations def ingest_gwas_catalog_associations( @@ -391,22 +377,58 @@ def ingest_gwas_catalog_associations( etl (ETLSession): current ETL session gwas_association_path (str): GWAS catalogue dataset path variant_annotation_path (str): variant annotation dataset path - pvalue_cutoff (float): GWAS significance threshold + pvalue_cutoff (float): GWAS significance threshold, flagging sub-significant associations Returns: DataFrame: Post-processed GWASCatalog associations """ return ( - # 1. Read associations: + # Read associations: read_associations_data(etl, gwas_association_path, pvalue_cutoff) - # 2. Process -> apply filter: - .transform(lambda df: process_associations(df, etl)) - # 3. Map variants to GnomAD3: + # Cleaning p-value text: + .transform(lambda df: _pvalue_text_resolve(df, etl)) + # Parsing variants: + .transform(deconvolute_variants) + # Splitting associations by GWAS Catalog variants: + .transform(splitting_association) + # Mapping variants to GnomAD3: .transform(lambda df: map_variants(df, variant_annotation_path, etl)) - # 4. Remove discordants: - .transform(concordance_filter) - # 5. deduplicate associations by matching rsIDs: - .transform(filter_assoc_by_rsid) - # 6. deduplication by MAF: - .transform(filter_assoc_by_maf) + # Cleaning GnomAD variant mappings to ensure no duplication happens: + .transform(clean_mappings) + # Harmonizing association effect: + .transform(harmonize_effect) + ) + + +def generate_association_table(df: DataFrame) -> DataFrame: + """Generating top-loci table. + + Args: + df (DataFrame): DataFrame + + Returns: + A DataFrame with the columns specified in the filter_columns list. 
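+
+    Note:
+        Only associations with a resolved ``variantId`` are kept and the
+        selected rows are de-duplicated.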
+ """ + filter_columns = [ + "chromosome", + "position", + "referenceAllele", + "alternateAllele", + "variantId", + "studyId", + "pValueMantissa", + "pValueExponent", + "beta", + "beta_ci_lower", + "beta_ci_upper", + "odds_ratio", + "odds_ratio_ci_lower", + "odds_ratio_ci_upper", + "qualityControl", + ] + return ( + df.select(*filter_columns) + .filter(f.col("variantId").isNotNull()) + .distinct() + .persist() ) diff --git a/src/etl/gwas_ingest/study_ingestion.py b/src/etl/gwas_ingest/study_ingestion.py index b8a3ee7eb..89b3a14a8 100644 --- a/src/etl/gwas_ingest/study_ingestion.py +++ b/src/etl/gwas_ingest/study_ingestion.py @@ -6,23 +6,28 @@ from pyspark.sql import functions as f from pyspark.sql import types as t +from pyspark.sql.window import Window -from etl.json import validate_df_schema +from etl.common.spark_helpers import adding_quality_flag if TYPE_CHECKING: from pyspark.sql import Column, DataFrame from etl.common.ETLSession import ETLSession + +# Quality control flags: +AMBIGUOUS_ASSOCIATION = "Ambiguous association" + STUDY_COLUMNS_MAP = { # 'DATE ADDED TO CATALOG': 'date_added_to_catalog', "PUBMED ID": "pubmedId", "FIRST AUTHOR": "firstAuthor", "DATE": "publicationDate", "JOURNAL": "journal", - "LINK": "link", + # "LINK": "link", # Link not read: links are generated on the front end "STUDY": "study", - "DISEASE/TRAIT": "diseaseTrait", + "DISEASE/TRAIT": "studyDiseaseTrait", "INITIAL SAMPLE SIZE": "initialSampleSize", # 'REPLICATION SAMPLE SIZE': 'replication_sample_size', # 'PLATFORM [SNPS PASSING QC]': 'platform', @@ -139,46 +144,135 @@ def extract_discovery_sample_sizes(df: DataFrame) -> DataFrame: ) -def parse_efos(c: str) -> Column: +def spliting_gwas_studies(study_association: DataFrame) -> DataFrame: + """Splitting studies and consolidating disease annotation. + + Processing disease annotation of the joined study/association table. If assigned disease + of the study and the association don't agree, we assume the study needs to be split. + Then disease EFOs, trait names and study ID are consolidated + + Args: + study_association (DataFrame): DataFrame + + Returns: + A dataframe with the studyAccession, studyId, diseaseTrait, and efos columns. 
+ """ + # A window to aid study splitting: + study_split_window = Window.partitionBy("studyAccession").orderBy("splitField") + + # A window to detect ambiguous associations: + assoc_ambiguity_window = Window.partitionBy("studyId", "variantId") + return ( + study_association + # As some studies needs to be split by not only the p-value text, but the EFO as well, we need to do this thing: + .withColumn( + "splitField", + f.concat_ws( + "_", + f.col("pValueText"), + f.array_join(f.col("associationEfos"), "_"), + ), + ) + # Windowing over the groups: + .withColumn("row_number", f.dense_rank().over(study_split_window) - 1) + # Study identifiers are split when there are more than one type of associationEfos: + .withColumn( + "studyId", + f.when(f.col("row_number") == 0, f.col("studyAccession")).otherwise( + f.concat_ws("_", "studyAccession", "row_number") + ), + ) + # Disese traits are generated based on p-value text when splitting study: + .withColumn( + "diseaseTrait", + # When study is split: + f.when( + f.col("row_number") != 0, + f.when( + f.col("pValueText").isNotNull(), + f.concat( + "associationDiseaseTrait", f.lit(" ["), "pValueText", f.lit("]") + ), + ).otherwise("associationDiseaseTrait"), + ) + # When there's association disease trait: + .when( + f.col("associationDiseaseTrait").isNotNull(), + f.col("associationDiseaseTrait"), + ) + # When no association disease trait is present we get from study: + .otherwise(f.col("studyDiseaseTrait")), + ) + # The EFO field is also consolidated: + .withColumn( + "efos", + # When available, EFOs are pulled from associations: + f.when(f.col("associationEfos").isNotNull(), f.col("associationEfos")) + # When no association is given, the study level EFOs are used: + .otherwise(f.col("studyEfos")), + ) + # Flagging ambiguous associations: + .withColumn( + "qualityControl", + adding_quality_flag( + f.col("qualityControl"), + # There are more than one variant ID in one study: + f.count(f.col("variantId")).over(assoc_ambiguity_window) > 1, + AMBIGUOUS_ASSOCIATION, + ), + ).drop( + "row_number", + "studyAccession", + "studyEfos", + "studyDiseaseTrait", + "associationEfos", + "associationDiseaseTrait", + "pValueText", + # 'full_description' + ) + ) + + +def parse_efos(col_name: str) -> Column: """Extracting EFO identifiers. This function parses EFO identifiers from a comma-separated list of EFO URIs. Args: - c (str): name of column with a list of EFO IDs + col_name (str): name of column with a list of EFO IDs Returns: - Column: column with a list of parsed EFO IDs + Column: column with a sorted list of parsed EFO IDs """ - return f.expr(f"regexp_extract_all({c}, '([A-Z]+_[0-9]+)')") + return f.array_sort(f.expr(f"regexp_extract_all({col_name}, '([A-Z]+_[0-9]+)')")) -def column2camel_case(s: str) -> str: +def column2camel_case(col_name: str) -> str: """A helper function to convert column names to camel cases. Args: - s (str): a single column name + col_name (str): a single column name Returns: str: spark expression to select and rename the column """ - def string2camelcase(s: str) -> str: + def string2camelcase(col_name: str) -> str: """Converting a string to camelcase. 
Args: - s (str): a random string + col_name (str): a random string Returns: str: Camel cased string """ # Removing a bunch of unwanted characters from the column names: - s = re.sub(r"[\/\(\)\-]+", " ", s) + col_name = re.sub(r"[\/\(\)\-]+", " ", col_name) - first, *rest = s.split(" ") + first, *rest = col_name.split(" ") return "".join([first.lower(), *map(str.capitalize, rest)]) - return f"`{s}` as {string2camelcase(s)}" + return f"`{col_name}` as {string2camelcase(col_name)}" def parse_ancestries(etl: ETLSession, ancestry_file: str) -> DataFrame: @@ -217,6 +311,7 @@ def parse_ancestries(etl: ETLSession, ancestry_file: str) -> DataFrame: ) .withColumnRenamed("initial", "discoverySamples") .withColumnRenamed("replication", "replicationSamples") + .persist() ) # Generate information on the ancestry composition of the discovery stage, and calculate @@ -294,30 +389,85 @@ def ingest_gwas_catalog_studies( # Read GWAS Catalogue raw data gwas_studies = read_study_table(etl, study_file) gwas_ancestries = parse_ancestries(etl, ancestry_file) + ss_studies = get_sumstats_location(etl, summary_stats_list) + # Sample size extraction is a separate process: study_size_df = extract_discovery_sample_sizes(gwas_studies) - ss_studies = get_sumstats_location(etl, summary_stats_list) - studies = ( + return ( gwas_studies # Add study sizes: .join(study_size_df, on="studyAccession", how="left") # Adding summary stats location: - .join( - ss_studies, - on="studyAccession", - how="left", - ) + .join(ss_studies, on="studyAccession", how="left") .withColumn("hasSumstats", f.coalesce(f.col("hasSumstats"), f.lit(False))) .join(gwas_ancestries, on="studyAccession", how="left") + # Select relevant columns: .select( - "*", - parse_efos("mappedTraitUri").alias("efos"), + "studyAccession", + # Publication related fields: + "pubmedId", + "firstAuthor", + "publicationDate", + "journal", + "study", + # Disease related fields: + "studyDiseaseTrait", + parse_efos("mappedTraitUri").alias("studyEfos"), parse_efos("mappedBackgroundTraitUri").alias("backgroundEfos"), + # Sample related fields: + "initialSampleSize", + "nCases", + "nControls", + "nSamples", + # Ancestry related labels: + "discoverySamples", + "replicationSamples", + # "gnomadSamples", + # Summary stats fields: + "summarystatsLocation", + "hasSumstats", ) - .drop("initialSampleSize", "mappedBackgroundTraitUri", "mappedTraitUri") + .persist() ) - validate_df_schema(studies, "studies.json") - return studies +def generate_study_table(association_study: DataFrame) -> DataFrame: + """Extracting studies from the joined study/association table. + + Args: + association_study (DataFrame): DataFrame with both associations and studies. + + Returns: + A dataframe with the columns specified in the study_columns list. 
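+
+    Note:
+        Study level columns are selected from the joined table and
+        de-duplicated, so the output is expected to contain one row per
+        (split) study.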
+ """ + study_columns = [ + # Study id and type: + f.col("studyId").alias("id"), + f.lit("gwas").alias("type"), + # Publication level information: + "pubmedId", + f.col("firstAuthor").alias("publicationFirstAuthor"), + f.col("journal").alias("publicationJournal"), + "publicationDate", + f.col("study").alias("publicationTitle"), + # Trait level information: + f.col("diseaseTrait").alias("traitFromSource"), + f.col("efos").alias("traitFromSourceMappedIds"), + f.col("backgroundEfos").alias("backgroundTraitFromSourceMappedIds"), + # Sample descriptions: + # ancestryInitial + # ancestryReplication + "initialSampleSize", + "discoverySamples", + "replicationSamples", + # Sample counts: + "nCases", + "nControls", + "nSamples", + # Summary stats related info: + "hasSumstats", + "summarystatsLocation", + ] + + return association_study.select(*study_columns).distinct().persist() diff --git a/src/etl/json/data/__init__.py b/src/etl/json/data/__init__.py new file mode 100644 index 000000000..dbc834ccf --- /dev/null +++ b/src/etl/json/data/__init__.py @@ -0,0 +1,3 @@ +"""JSON dataframes.""" + +from __future__ import annotations diff --git a/src/etl/json/data/gwas_pValueText_resolve.json b/src/etl/json/data/gwas_pValueText_resolve.json new file mode 100644 index 000000000..1070eb416 --- /dev/null +++ b/src/etl/json/data/gwas_pValueText_resolve.json @@ -0,0 +1,1870 @@ +[ + { + "pValueText": "A risk allele not reported", + "full_description": "?" + }, + { + "pValueText": "5-hydroxyindoleacetic acid", + "full_description": "5-HIAA" + }, + { + "pValueText": "Biantennary nogalactosylated glycans", + "full_description": "A2" + }, + { + "pValueText": "African ancestry", + "full_description": "AA" + }, + { + "pValueText": "Age at menarche and buckling ratio", + "full_description": "AAM-BR" + }, + { + "pValueText": "Age at menarche and cortical thickness", + "full_description": "AAM-CT" + }, + { + "pValueText": "Age at menarche and periosteal diameter", + "full_description": "AAM-W" + }, + { + "pValueText": "alanine aminotransferase", + "full_description": "AAT" + }, + { + "pValueText": "abdominal aortic calcium", + "full_description": "AAC" + }, + { + "pValueText": "Aggregatibacter actinomycetemcomitans", + "full_description": "A action" + }, + { + "pValueText": "Amyloid-beta", + "full_description": "Ab" + }, + { + "pValueText": "ankle brachial index", + "full_description": "ABI" + }, + { + "pValueText": "anti-centromere antibodies", + "full_description": "ACA" + }, + { + "pValueText": "Angiotensin-converting enzyme", + "full_description": "ACE" + }, + { + "pValueText": "antibodies to citrullinated peptide antigens", + "full_description": "ACPA" + }, + { + "pValueText": "normalised agkistrodon contortrix venom ratio", + "full_description": "ACVn" + }, + { + "pValueText": "Alzheimer’s Disease", + "full_description": "AD" + }, + { + "pValueText": "Attention deficit hyperactivity disorder", + "full_description": "ADHD" + }, + { + "pValueText": "Adrenomedullin", + "full_description": "ADM" + }, + { + "pValueText": "atrial fibrillation", + "full_description": "AF" + }, + { + "pValueText": "Frontal Brain volume", + "full_description": "AFBV" + }, + { + "pValueText": "? 
fetoprotein", + "full_description": "AFP" + }, + { + "pValueText": "multivariable-adjusted frontal brain volume", + "full_description": "AFVB" + }, + { + "pValueText": "Abnormal Involuntary Movements Scale", + "full_description": "AIMS" + }, + { + "pValueText": "Acute insulin response", + "full_description": "AIRg" + }, + { + "pValueText": "Alaine", + "full_description": "Ala" + }, + { + "pValueText": "albumin", + "full_description": "ALB" + }, + { + "pValueText": "serum albumin:globulin ratio", + "full_description": "ALB/GLB" + }, + { + "pValueText": "all ancestries", + "full_description": "ALL" + }, + { + "pValueText": "combined tests of verbal delayed recall", + "full_description": "ALL-dr" + }, + { + "pValueText": "adjusted log lateral ventricular volume", + "full_description": "ALLV" + }, + { + "pValueText": "alkaline phosphatase", + "full_description": "ALP" + }, + { + "pValueText": "alpha-tocopherol", + "full_description": "Alpha-TOH" + }, + { + "pValueText": "amyotrophic lateral sclerosis", + "full_description": "ALS" + }, + { + "pValueText": "alanine aminotransferase", + "full_description": "ALT" + }, + { + "pValueText": "multivariable-adjusted temporal horn volume", + "full_description": "ALTHBV" + }, + { + "pValueText": "Alkaline phosphatase", + "full_description": "AlkPhos" + }, + { + "pValueText": "Age-related macular degeneration", + "full_description": "AMD" + }, + { + "pValueText": "Absolute neutrophil count", + "full_description": "ANC" + }, + { + "pValueText": "N-terminal pro-atrial natriuretic peptide", + "full_description": "ANP6" + }, + { + "pValueText": "angiographic coronary disease", + "full_description": "AngCAD" + }, + { + "pValueText": "age of smoking initiation", + "full_description": "AOI" + }, + { + "pValueText": "apolipoprotein A-1", + "full_description": "apoA-1" + }, + { + "pValueText": "activated partial thromboplastin time", + "full_description": "aPTT" + }, + { + "pValueText": "pathogenic water channel aquaporin 4", + "full_description": "AQP4" + }, + { + "pValueText": "AR-C124910XX", + "full_description": "ARC" + }, + { + "pValueText": "ostium secundum atrial septal defect", + "full_description": "ASD" + }, + { + "pValueText": "aspartate aminotransferase", + "full_description": "AST" + }, + { + "pValueText": "antithrombin", + "full_description": "At" + }, + { + "pValueText": "anti-topoisomerase antibodies", + "full_description": "ATA" + }, + { + "pValueText": "temporal brain volume", + "full_description": "ATBV" + }, + { + "pValueText": "total cerebral brain volume", + "full_description": "ATCBV" + }, + { + "pValueText": "multivariable-adjusted temporal brain volume", + "full_description": "ATVB" + }, + { + "pValueText": "area under the curve", + "full_description": "AUC" + }, + { + "pValueText": "baseline brachial artery flow velocity", + "full_description": "BABF" + }, + { + "pValueText": "Barnes Akathisia Scale", + "full_description": "BARS" + }, + { + "pValueText": "beta blockers", + "full_description": "BB" + }, + { + "pValueText": "bipolar disorder", + "full_description": "BD" + }, + { + "pValueText": "bone density estimated by T score at distal radius", + "full_description": "BD-RT" + }, + { + "pValueText": "bone density estimated by T score at midshaft tibia", + "full_description": "BD-TT" + }, + { + "pValueText": "bone mineral density", + "full_description": "BMD" + }, + { + "pValueText": "body mass index", + "full_description": "BMI" + }, + { + "pValueText": "basal metabolic rate", + "full_description": "BMR" + }, + { + "pValueText": 
"respiratory quotient during basal metabolic rate measurement", + "full_description": "BMR RQ" + }, + { + "pValueText": "brain parenchymal volume", + "full_description": "BPV" + }, + { + "pValueText": "breast cancer 1 gene and breast cancer 2 gene", + "full_description": "BRCA1/2" + }, + { + "pValueText": "Broadband ultrasound attenuation", + "full_description": "BUA" + }, + { + "pValueText": "blood urea nitrogen", + "full_description": "BUN" + }, + { + "pValueText": "cholesterol", + "full_description": "C" + }, + { + "pValueText": "Propionylcarnitine", + "full_description": "C3" + }, + { + "pValueText": "Butyrylcarnitine", + "full_description": "C4" + }, + { + "pValueText": "Nonaylcarnitine", + "full_description": "C9" + }, + { + "pValueText": "Decanoylcarnitine", + "full_description": "C10" + }, + { + "pValueText": "Decadienylcarnitine", + "full_description": "C10:2" + }, + { + "pValueText": "Dodecanoylcarnitine", + "full_description": "C12" + }, + { + "pValueText": "Hydroxytetradecenoylcarnitine", + "full_description": "C14:1-OH" + }, + { + "pValueText": "cancer antigen 19-9", + "full_description": "CA19-9" + }, + { + "pValueText": "coronary artery calcification", + "full_description": "CAC" + }, + { + "pValueText": "coronary artery lesions", + "full_description": "CAL" + }, + { + "pValueText": "carotid brachial pulse wave velocity", + "full_description": "CB-PWV" + }, + { + "pValueText": "Cortical thickness of the tibia", + "full_description": "CBT" + }, + { + "pValueText": "Clara cell secretory protein", + "full_description": "CC16" + }, + { + "pValueText": "common carotid artery", + "full_description": "CCA" + }, + { + "pValueText": "common carotid artery intimal medial thickness", + "full_description": "CCA IMT" + }, + { + "pValueText": "calcium channel blockers", + "full_description": "CCB" + }, + { + "pValueText": "central corneal thickness", + "full_description": "CCT" + }, + { + "pValueText": "Ligand, serum & plasma", + "full_description": "CD40L" + }, + { + "pValueText": "Disease Activity Score", + "full_description": "cDAS28" + }, + { + "pValueText": "Complicated disease course", + "full_description": "CDC" + }, + { + "pValueText": "cholesterol ester", + "full_description": "CE" + }, + { + "pValueText": "carcinoembryonic antigen", + "full_description": "CEA" + }, + { + "pValueText": "ceramide", + "full_description": "Cer" + }, + { + "pValueText": "Consortium to Establish a Registry for Alzheimer’s Disease delayed recall", + "full_description": "CERAD-dr" + }, + { + "pValueText": "CEPH (Centre d’Etude du Polymorphisme Humain) from Utah", + "full_description": "CEU" + }, + { + "pValueText": "carotid-femoral pulse wave velocity, long-term average", + "full_description": "CF-PWVLTA" + }, + { + "pValueText": "Clinical Global Impressions-Severity", + "full_description": "CGI" + }, + { + "pValueText": "Coronary heart disease", + "full_description": "CHD" + }, + { + "pValueText": "1st principal component on transformed hue and saturation values", + "full_description": "CHS1" + }, + { + "pValueText": "95% confidence interval", + "full_description": "CI" + }, + { + "pValueText": "creatinine kinase", + "full_description": "CK" + }, + { + "pValueText": "chronic kidney disease", + "full_description": "CKD" + }, + { + "pValueText": "health assessment questionnaire score", + "full_description": "cHAQ" + }, + { + "pValueText": "Classical Hodgkin lymphoma", + "full_description": "cHL" + }, + { + "pValueText": "former/current smokers", + "full_description": "CIGSTAT" + }, + { + "pValueText": 
"carotid intima media thickness", + "full_description": "cIMT" + }, + { + "pValueText": "cleft lip without cleft palate", + "full_description": "CL" + }, + { + "pValueText": "cleft lip with or without cleft palate", + "full_description": "CL/P" + }, + { + "pValueText": "total number of correct words across three letters", + "full_description": "COWA" + }, + { + "pValueText": "cleft palate", + "full_description": "CP" + }, + { + "pValueText": "Chronic periodontitis", + "full_description": "CPd" + }, + { + "pValueText": "cigarettes per day", + "full_description": "CPD" + }, + { + "pValueText": "10 or more cigarettes per day", + "full_description": "CPDBI" + }, + { + "pValueText": "C-reactive protein", + "full_description": "CRP" + }, + { + "pValueText": "C-reactive protein (CRP) averaged from 3 examinations (over about 20 years)", + "full_description": "CRP average 2,6,7" + }, + { + "pValueText": "C-reactive protein, offspring exam 2", + "full_description": "CRP2" + }, + { + "pValueText": "C-reactive protein exam 6", + "full_description": "CRP6" + }, + { + "pValueText": "cardioembolic stroke", + "full_description": "CS" + }, + { + "pValueText": "Swollen joint count", + "full_description": "cSJC" + }, + { + "pValueText": "Tender joint count", + "full_description": "cTJC" + }, + { + "pValueText": "cardiovascular disease", + "full_description": "CVD" + }, + { + "pValueText": "California Verbal Learning Test delayed recall (belongs to WL-dr category)", + "full_description": "CVLT-dr" + }, + { + "pValueText": "particle diameter", + "full_description": "D" + }, + { + "pValueText": "diastolic blood pressure", + "full_description": "DBP" + }, + { + "pValueText": "diastolic blood pressure, long-term average", + "full_description": "DBPLTA" + }, + { + "pValueText": "Dermatophagoides farina", + "full_description": "D.f." + }, + { + "pValueText": "Desialylated 2AB-labelled human plasma N-glycans groups", + "full_description": "DG" + }, + { + "pValueText": "combined results from the DGI, FUSION, WTCCC analyses", + "full_description": "DGI+FUSION+WTCCC" + }, + { + "pValueText": "dehydroisoandrosterone sulfate", + "full_description": "DHEA-S" + }, + { + "pValueText": "Disposition index", + "full_description": "DI" + }, + { + "pValueText": "diabetes mellitus", + "full_description": "DM" + }, + { + "pValueText": "Dermatophagoides pteronyssinus", + "full_description": "D.p." 
+ }, + { + "pValueText": "Delayed Word Recall Test (belongs to WL-dr category)", + "full_description": "DWRT-dr" + }, + { + "pValueText": "dual energy X-ray absorptiometry", + "full_description": "DXA" + }, + { + "pValueText": "European Ancestry", + "full_description": "EA" + }, + { + "pValueText": "Epstein-Barr virus", + "full_description": "EBV" + }, + { + "pValueText": "excessive daytime sleepiness", + "full_description": "EDS" + }, + { + "pValueText": "electroencephalography", + "full_description": "EEG" + }, + { + "pValueText": "estimated energy requirement", + "full_description": "EER" + }, + { + "pValueText": "glomerular filtration rate", + "full_description": "eGFR" + }, + { + "pValueText": "estimated glomerular filtration rate based on serum creatinine", + "full_description": "eGFRcrea" + }, + { + "pValueText": "serum cystatin C", + "full_description": "eGFRcys" + }, + { + "pValueText": "Extraintestinal manifestations", + "full_description": "EIM" + }, + { + "pValueText": "Elated mania", + "full_description": "EM" + }, + { + "pValueText": "Estrogen receptor positive", + "full_description": "ER +ve" + }, + { + "pValueText": "Estrogen receptor negative", + "full_description": "ER -ve" + }, + { + "pValueText": "Endothelin-1", + "full_description": "ET-1" + }, + { + "pValueText": "endocrine treatment", + "full_description": "ET" + }, + { + "pValueText": "esophageal squamous cell carcinoma", + "full_description": "ESCC" + }, + { + "pValueText": "end-stage renal disease", + "full_description": "ESRD" + }, + { + "pValueText": "Epworth Sleepiness Scale", + "full_description": "ESS" + }, + { + "pValueText": "esterified cholesterol", + "full_description": "Est-C" + }, + { + "pValueText": "ever smokers, never smokers", + "full_description": "EVNV" + }, + { + "pValueText": "Factor 2 (visual memory and organization)", + "full_description": "F2" + }, + { + "pValueText": "Factor 3 (measure of attention and executive function - Trails A and B)", + "full_description": "F3" + }, + { + "pValueText": "female athletes", + "full_description": "FA" + }, + { + "pValueText": "free cholesterol", + "full_description": "FC" + }, + { + "pValueText": "forced expiratory flow", + "full_description": "FEF" + }, + { + "pValueText": "forced expiratory volume in 1 second", + "full_description": "FEV1" + }, + { + "pValueText": "longitudinal slope of forced expiratory volume in one second", + "full_description": "fev1slope" + }, + { + "pValueText": "fibrinogen", + "full_description": "FG" + }, + { + "pValueText": "fasting insulin", + "full_description": "FI" + }, + { + "pValueText": "female long endurance athletes", + "full_description": "FLE" + }, + { + "pValueText": "femoral neck", + "full_description": "FN" + }, + { + "pValueText": "femoral bone mineral density in males", + "full_description": "FNBMDm" + }, + { + "pValueText": "fasting plasma glucose", + "full_description": "FPG" + }, + { + "pValueText": "free Protein S", + "full_description": "fPS" + }, + { + "pValueText": "fasting serum free triiodothyronine", + "full_description": "Free T3" + }, + { + "pValueText": "female-only stroke", + "full_description": "FS" + }, + { + "pValueText": "female sexual dysfunction", + "full_description": "FSD" + }, + { + "pValueText": "fasting serum glucose", + "full_description": "FSG" + }, + { + "pValueText": "follicle-stimulating hormone", + "full_description": "FSH" + }, + { + "pValueText": "frequently sampled intravenous glucose tolerance test", + "full_description": "FSIGT" + }, + { + "pValueText": "free thyroxine 3", + 
"full_description": "Ft3" + }, + { + "pValueText": "free thyroxine", + "full_description": "Ft4" + }, + { + "pValueText": "frontotemporal dementia", + "full_description": "FTD" + }, + { + "pValueText": "Antennary fucosylated glycans", + "full_description": "FUC-A" + }, + { + "pValueText": "Core fucosylated glycans", + "full_description": "FUC-C" + }, + { + "pValueText": "functional Protein S", + "full_description": "funcPS" + }, + { + "pValueText": "forced vital capacity", + "full_description": "FVC" + }, + { + "pValueText": "Coagulation factors VII", + "full_description": "FVII" + }, + { + "pValueText": "forward wave amplitude, long-term average", + "full_description": "FWLTA" + }, + { + "pValueText": "grade 3 diarrhea", + "full_description": "G3D" + }, + { + "pValueText": "glucocerebrosiadase", + "full_description": "GBA" + }, + { + "pValueText": "glomerular filtration rate", + "full_description": "GFR" + }, + { + "pValueText": "glutamyltranspeptidase", + "full_description": "GGT" + }, + { + "pValueText": "Glucose", + "full_description": "Glc" + }, + { + "pValueText": "Glutamine", + "full_description": "Gln" + }, + { + "pValueText": "glucose", + "full_description": "GLU" + }, + { + "pValueText": "glucosylceramide", + "full_description": "GluCer" + }, + { + "pValueText": "Glutamyl oxaloacetic transaminase, Aspartate aminotransferase", + "full_description": "GOT (AST)" + }, + { + "pValueText": "Glycan peak", + "full_description": "GP" + }, + { + "pValueText": "glutamate pyruvate transaminase, alanine aminotransferase", + "full_description": "GPT (ALT)" + }, + { + "pValueText": "glycoprotein 130", + "full_description": "GP130" + }, + { + "pValueText": "general side effect burden", + "full_description": "GSE" + }, + { + "pValueText": "Hamilton Anxiety Scale", + "full_description": "HAM-A" + }, + { + "pValueText": "hemoglobin A1c", + "full_description": "HbA1C" + }, + { + "pValueText": "fetal hemoglobin", + "full_description": "HbF" + }, + { + "pValueText": "homocysteine", + "full_description": "Hcy" + }, + { + "pValueText": "Total cholesterol in HDL", + "full_description": "HDL-C" + }, + { + "pValueText": "human epidermal growth factor receptor 2", + "full_description": "HER2" + }, + { + "pValueText": "Hemoglobin", + "full_description": "Hgb" + }, + { + "pValueText": "Histidine", + "full_description": "His" + }, + { + "pValueText": "homeostasis model assessment of insulin resistance", + "full_description": "HOMA-IR" + }, + { + "pValueText": "human immunodeficiency virus", + "full_description": "HIV" + }, + { + "pValueText": "high density lipoprotein", + "full_description": "HDL" + }, + { + "pValueText": "homeostasis model assessment of beta-cell function", + "full_description": "HOMA-B" + }, + { + "pValueText": "hormone receptor", + "full_description": "HR" + }, + { + "pValueText": "maximum heart rate during treadmill fitness test", + "full_description": "HRmax" + }, + { + "pValueText": "Herpes simplex virus type 2", + "full_description": "HSV-2" + }, + { + "pValueText": "hematocrit", + "full_description": "Ht" + }, + { + "pValueText": "Hounsfield units", + "full_description": "HU" + }, + { + "pValueText": "homovanillic acid", + "full_description": "HVA" + }, + { + "pValueText": "Hopkins Verbal Learning Test delayed recall (belongs to WL-dr category)", + "full_description": "HVLT-dr" + }, + { + "pValueText": "islet autoantibody", + "full_description": "IA" + }, + { + "pValueText": "internal cartotid artery internal and common carotid intimal medial thickness", + "full_description": "ICAIMT" 
+ }, + { + "pValueText": "Intercellular adhesion molecule", + "full_description": "ICAM" + }, + { + "pValueText": "intra-extradimensional set shifting", + "full_description": "IED" + }, + { + "pValueText": "insulin-like growth factor I precursor", + "full_description": "IGF1" + }, + { + "pValueText": "fasting serum insulin-like growth factor binding protein-1", + "full_description": "IGFBP-1" + }, + { + "pValueText": "fasting serum insulin-like growth factor binding protein-3", + "full_description": "IGFBP-3" + }, + { + "pValueText": "Interleukin-6 precursor", + "full_description": "IL6" + }, + { + "pValueText": "Interleukin-8 precursor", + "full_description": "IL8" + }, + { + "pValueText": "Interleukin-10 precursor", + "full_description": "IL10" + }, + { + "pValueText": "interleukin-12 precursor", + "full_description": "IL12" + }, + { + "pValueText": "Interleukin-18 precursor", + "full_description": "IL18" + }, + { + "pValueText": "Interleukin-1, beta", + "full_description": "IL1B" + }, + { + "pValueText": "interleukin-1 receptor antagonist protein precursor", + "full_description": "IL1RA" + }, + { + "pValueText": "irritable mania", + "full_description": "IM" + }, + { + "pValueText": "Carotid intimal medial thickness", + "full_description": "IMT" + }, + { + "pValueText": "insulin", + "full_description": "INS" + }, + { + "pValueText": "interaction", + "full_description": "int" + }, + { + "pValueText": "insulin resistance", + "full_description": "IR" + }, + { + "pValueText": "all ischemic stroke", + "full_description": "IS" + }, + { + "pValueText": "0-120 min insulin sensitivity index", + "full_description": "ISI_0-120" + }, + { + "pValueText": "Kawasaki disease", + "full_description": "KD" + }, + { + "pValueText": "linoleic acid", + "full_description": "LA" + }, + { + "pValueText": "large artery atherosclerosis", + "full_description": "LAA" + }, + { + "pValueText": "left atrial diameter", + "full_description": "LAD" + }, + { + "pValueText": "lung cancer", + "full_description": "LC" + }, + { + "pValueText": "low density lipoprotein", + "full_description": "LDL" + }, + { + "pValueText": "Total cholesterol in LDL", + "full_description": "LDL-C" + }, + { + "pValueText": "ratio of low frequency to high frequency power", + "full_description": "LF/HF" + }, + { + "pValueText": "The free cholesterol content of large LDL", + "full_description": "L-LDL-FC" + }, + { + "pValueText": "Total lipids in large HDL", + "full_description": "L-HDL-L" + }, + { + "pValueText": "lipoprotein (a)", + "full_description": "Lp(a)" + }, + { + "pValueText": "lumbar spine", + "full_description": "LS" + }, + { + "pValueText": "lamotrigine-induced hypersensitivity", + "full_description": "LTG" + }, + { + "pValueText": "leukocyte telomere length", + "full_description": "LTL" + }, + { + "pValueText": "Left ventricular ejection fraction", + "full_description": "LVEF" + }, + { + "pValueText": "large-vessel disease", + "full_description": "LVD" + }, + { + "pValueText": "left ventricular diastolic diameter", + "full_description": "LVDD" + }, + { + "pValueText": "left ventricular fractional shortening", + "full_description": "LVFS" + }, + { + "pValueText": "left ventricular mass", + "full_description": "LVM" + }, + { + "pValueText": "left ventricular mass index", + "full_description": "LVMI" + }, + { + "pValueText": "left ventricular systolic dimension", + "full_description": "LVSD" + }, + { + "pValueText": "Lymphoma subtypes", + "full_description": "LYM" + }, + { + "pValueText": "from clamp", + "full_description": "M" + }, + { + 
"pValueText": "male athletes", + "full_description": "MA" + }, + { + "pValueText": "mean arterial pressure", + "full_description": "MAP" + }, + { + "pValueText": "mean arterial pressure, long-term average", + "full_description": "MAPLTA" + }, + { + "pValueText": "maximum L* (reflectance)", + "full_description": "maxL*" + }, + { + "pValueText": "mother’s criticism", + "full_description": "MC" + }, + { + "pValueText": "mean corpuscular hemoglobin", + "full_description": "MCH" + }, + { + "pValueText": "mean corpuscular hemoglobin concentration", + "full_description": "MCHC" + }, + { + "pValueText": "mild cognitive impairment", + "full_description": "MCI" + }, + { + "pValueText": "Mental Component Summary", + "full_description": "MCS" + }, + { + "pValueText": "mean corpuscular volume", + "full_description": "MCV" + }, + { + "pValueText": "monocyte chemoattractant protein-1", + "full_description": "MCP1" + }, + { + "pValueText": "Mild disease course", + "full_description": "MDC" + }, + { + "pValueText": "mean forced vital capacity from 2 exams", + "full_description": "meanFVC" + }, + { + "pValueText": "mean FEV1/FVC from 2 exams", + "full_description": "meanratio" + }, + { + "pValueText": "Methamphetamine", + "full_description": "METH" + }, + { + "pValueText": "2-(N-acetyl-L-cystein-S-yl)-1-hydroxybut-3-ene and 1-(N-acetyl-L-cystein-S-yl)-1-hydroxybut-3-ene", + "full_description": "MHBMA" + }, + { + "pValueText": "Total lipids in medium HDL", + "full_description": "M-HDL-L" + }, + { + "pValueText": "3-methoxy-4-hydroxyphenlglycol", + "full_description": "MHPG" + }, + { + "pValueText": "myocardial infarction", + "full_description": "MI" + }, + { + "pValueText": "macrophage inflammatory protein beta", + "full_description": "MIP-1b" + }, + { + "pValueText": "Total cholesterol in medium LDL", + "full_description": "M-LDL-C" + }, + { + "pValueText": "Phospholipids in medium LDL", + "full_description": "M-LDL-PL" + }, + { + "pValueText": "male long endurance athletes", + "full_description": "MLE" + }, + { + "pValueText": "double-bond protons of mobile lipids", + "full_description": "MobCH" + }, + { + "pValueText": "mismatch negativity (300-710 ms)", + "full_description": "MMnb" + }, + { + "pValueText": "measles, mumps and rubella vaccination", + "full_description": "MMR" + }, + { + "pValueText": "Mini-mental state examination", + "full_description": "MMSE" + }, + { + "pValueText": "middle and short endurance athletes", + "full_description": "MSE" + }, + { + "pValueText": "Multiple Sclerosis Severity Scale", + "full_description": "MSSS" + }, + { + "pValueText": "Multi-Trait Analysis of GWAS", + "full_description": "MTAG" + }, + { + "pValueText": "Phospholipids in medium VLDL", + "full_description": "M-VLDL-PL" + }, + { + "pValueText": "mother’s warmth", + "full_description": "MW" + }, + { + "pValueText": "not applicable", + "full_description": "NA" + }, + { + "pValueText": "non-albumin protein", + "full_description": "NAP" + }, + { + "pValueText": "Boston Naming Test", + "full_description": "Nam" + }, + { + "pValueText": "neurocognitive impairment", + "full_description": "NCI" + }, + { + "pValueText": "Neck section modulus", + "full_description": "NeckZ1" + }, + { + "pValueText": "neck section modulus in females", + "full_description": "NeckZ1rf" + }, + { + "pValueText": "neck width in females", + "full_description": "NeckW1rf" + }, + { + "pValueText": "neck section modulus in males", + "full_description": "NeckZ1rm" + }, + { + "pValueText": "fasting serum nonesterified fatty acids", + 
"full_description": "NEFA" + }, + { + "pValueText": "neurofibrillary tangles", + "full_description": "NFT" + }, + { + "pValueText": "Non-Hodgkin’s Lymphoma", + "full_description": "NHL" + }, + { + "pValueText": "neck length", + "full_description": "NL" + }, + { + "pValueText": "nasopharyngeal carcinoma", + "full_description": "NPC" + }, + { + "pValueText": "normal-pressure glaucoma", + "full_description": "NPG" + }, + { + "pValueText": "not reported", + "full_description": "NR" + }, + { + "pValueText": "none significant", + "full_description": "NS" + }, + { + "pValueText": "neck shaft angle", + "full_description": "NSA" + }, + { + "pValueText": "neck-shaft angle in males", + "full_description": "NSAm" + }, + { + "pValueText": "nonsyndromic cleft lip with or without cleft palate", + "full_description": "NSCL/P" + }, + { + "pValueText": "neck width", + "full_description": "NW" + }, + { + "pValueText": "Non Verbal", + "full_description": "Nvrb" + }, + { + "pValueText": "Childhood Obsessive-Compulsive Personality Disorder", + "full_description": "OCPD" + }, + { + "pValueText": "odds ratio", + "full_description": "OR" + }, + { + "pValueText": "particle concentration", + "full_description": "P" + }, + { + "pValueText": "post exercise 3 minute recovery systolic blood pressure", + "full_description": "P3MRSBP" + }, + { + "pValueText": "peripheral artery disease", + "full_description": "PAD" + }, + { + "pValueText": "plasminogen activator inhibitor", + "full_description": "PAI-1" + }, + { + "pValueText": "paired associates learning", + "full_description": "PAL" + }, + { + "pValueText": "paragraph delayed recall", + "full_description": "PAR-dr" + }, + { + "pValueText": "Protein C", + "full_description": "PC" + }, + { + "pValueText": "principal component axis 1, CANTAB measures", + "full_description": "PC1" + }, + { + "pValueText": "principal component analysis 2", + "full_description": "PC2" + }, + { + "pValueText": "principal component analysis 3", + "full_description": "PC3" + }, + { + "pValueText": "Phosphatidylcholine diacyl C36:3", + "full_description": "PC aa C36:3" + }, + { + "pValueText": "Phosphatidylcholine diacyl C34:4", + "full_description": "PC aa C36:4" + }, + { + "pValueText": "Physical Component Summary", + "full_description": "PCS" + }, + { + "pValueText": "periodontal complex trait", + "full_description": "PCT" + }, + { + "pValueText": "packed cell volume", + "full_description": "PCV" + }, + { + "pValueText": "Parkinson’s disease", + "full_description": "PD" + }, + { + "pValueText": "Porphyromonas gingivalis", + "full_description": "P gingi" + }, + { + "pValueText": "Phenylalanine", + "full_description": "Phe" + }, + { + "pValueText": "phenytoin-induced hypersensitivity", + "full_description": "PHT" + }, + { + "pValueText": "pack-years", + "full_description": "PKYRS" + }, + { + "pValueText": "phospholipid", + "full_description": "PL" + }, + { + "pValueText": "platelets", + "full_description": "PLT" + }, + { + "pValueText": "platelet aggregation (ADP-induced)", + "full_description": "pltadp" + }, + { + "pValueText": "platelet aggregation (collagen-induced)", + "full_description": "pltcoll" + }, + { + "pValueText": "pulse pressure", + "full_description": "PP" + }, + { + "pValueText": "percent predicted FEF25-75 for latest exam", + "full_description": "ppfef" + }, + { + "pValueText": "percent predicted FEF25-75/FVC for latest exam", + "full_description": "ppfefrat" + }, + { + "pValueText": "percent predicted FVC for latest exam", + "full_description": "ppfvc" + }, + { + "pValueText": 
"percent predicted FEV1 for latest exam", + "full_description": "ppfev1" + }, + { + "pValueText": "percent predicted FEV1/FVC/FEF", + "full_description": "ppFEV1/FEC/FEE" + }, + { + "pValueText": "percent predicted FEV1/FVC for latest exam", + "full_description": "ppratio" + }, + { + "pValueText": "pattern recognition memory", + "full_description": "PRM" + }, + { + "pValueText": "propylthiouracil solution", + "full_description": "PROP" + }, + { + "pValueText": "protein S", + "full_description": "PS" + }, + { + "pValueText": "primary sclerosing cholangitis", + "full_description": "PSC" + }, + { + "pValueText": "prothrombin time", + "full_description": "PT" + }, + { + "pValueText": "polyunsaturated fatty acids", + "full_description": "PUFA" + }, + { + "pValueText": "peak-valley respiratory sinus arrhythmia or high frequency power", + "full_description": "pvRSA/HF" + }, + { + "pValueText": "quality control", + "full_description": "QC" + }, + { + "pValueText": "fasting serum quantitative insulin sensitivity check index", + "full_description": "QUICKI" + }, + { + "pValueText": "Rheumatoid arthritis", + "full_description": "RA" + }, + { + "pValueText": "risk allele frequency", + "full_description": "RAF" + }, + { + "pValueText": "fasting serum regulated upon activation, normal T-cell expressed and secreted", + "full_description": "RANTES" + }, + { + "pValueText": "Rey’s Auditory Verbal Learning Test delayed recall (belongs to WL-dr category)", + "full_description": "RAVLT-dr" + }, + { + "pValueText": "red blood cell", + "full_description": "RBC" + }, + { + "pValueText": "red blood cell count", + "full_description": "RBCC" + }, + { + "pValueText": "red cell distribution width", + "full_description": "RDW" + }, + { + "pValueText": "root mean square of the successive differences of inter beat intervals", + "full_description": "RMSSD" + }, + { + "pValueText": "maximum respiratory quotient during treadmill fitness test", + "full_description": "RQmax" + }, + { + "pValueText": "rapid visual processing", + "full_description": "RVP" + }, + { + "pValueText": "reflected wave amplitude", + "full_description": "RW" + }, + { + "pValueText": "reflected wave amplitude, long-term average", + "full_description": "RWLTA" + }, + { + "pValueText": "Stage 2 exercise heart rate", + "full_description": "S2EHR" + }, + { + "pValueText": "stage 2 exercise systolic blood pressure", + "full_description": "S2ESBP" + }, + { + "pValueText": "subset-based meta-analysis approach", + "full_description": "SBM" + }, + { + "pValueText": "standard deviation of the normal-to-normal inter beat intervals", + "full_description": "SDNN" + }, + { + "pValueText": "serum total triglyceride content", + "full_description": "serum TG" + }, + { + "pValueText": "Simpson-Angus Scale", + "full_description": "SAS" + }, + { + "pValueText": "systolic blood pressure", + "full_description": "SBP" + }, + { + "pValueText": "systolic blood pressure, long-term average", + "full_description": "SBPLTA" + }, + { + "pValueText": "serum creatinine", + "full_description": "sCR" + }, + { + "pValueText": "Schizophrenia and Bipolar disorder", + "full_description": "SCZ and BD" + }, + { + "pValueText": "standard deviation", + "full_description": "s.d." 
+ }, + { + "pValueText": "sleep efficiency", + "full_description": "SE" + }, + { + "pValueText": "Glucose effectiveness", + "full_description": "SG" + }, + { + "pValueText": "Shaft width combined", + "full_description": "ShaftW1" + }, + { + "pValueText": "shaft width in females", + "full_description": "ShaftW1f" + }, + { + "pValueText": "shaft section modulus in females", + "full_description": "ShaftZ1rf" + }, + { + "pValueText": "shaft section modulus", + "full_description": "ShaftZ1R" + }, + { + "pValueText": "sex hormone binding globulin", + "full_description": "SHBG" + }, + { + "pValueText": "SI from FSIGT", + "full_description": "SI" + }, + { + "pValueText": "fasting serum soluble intercellular adhesion molecule-1", + "full_description": "sICAM-1" + }, + { + "pValueText": "soluble interleukin", + "full_description": "sIL-6R" + }, + { + "pValueText": "Similarities", + "full_description": "Sim" + }, + { + "pValueText": "respiratory quotient during sleep", + "full_description": "Sleep RQ" + }, + { + "pValueText": "systemic lupus erythematosus", + "full_description": "SLE" + }, + { + "pValueText": "butyrylcarnitine / propionylcarnitine", + "full_description": "SM-1" + }, + { + "pValueText": "N-acetylornithine", + "full_description": "SM-2" + }, + { + "pValueText": "1-arachidonoylglycero phosphoethanolamine / 1-linoleoylglycerophospho-ethanolamine", + "full_description": "SM-3" + }, + { + "pValueText": "bilirubin (E,E) / oleoylcarnitine", + "full_description": "SM-4" + }, + { + "pValueText": "hexanoylcarnitine / oleate (18:1n9)", + "full_description": "SM-5" + }, + { + "pValueText": "myristate (14:0) / myristoleate (14:1n5)", + "full_description": "SM-6" + }, + { + "pValueText": "1-methylxanthine / 4-acetamidobutanoate", + "full_description": "SM-7" + }, + { + "pValueText": "ADpSGEGDFXAEGGGVR / ADSGEGDFXAEGGGVR", + "full_description": "SM-8" + }, + { + "pValueText": "10-nonadecenoate (19:1n9) / 10-undecenoate (11:1n1)", + "full_description": "SM-9" + }, + { + "pValueText": "eicosenoate (20:1n9 or 11) / tetradecanedioate", + "full_description": "SM-10" + }, + { + "pValueText": "ADpSGEGDFXAEGGGVR / ADSGEGDFXAEGGGVR", + "full_description": "SM-11" + }, + { + "pValueText": "ADSGEGDFXAEGGGVR / DSGEGDFXAEGGGVR", + "full_description": "SM-12" + }, + { + "pValueText": "androsterone sulfate / epiandrosterone sulfate", + "full_description": "SM-13" + }, + { + "pValueText": "ADpSGEGDFXAEGGGVR / DSGEGDFXAEGGGVR", + "full_description": "SM-14" + }, + { + "pValueText": "1-eicosatrienoylglycero-phosphocholine / 1-linoleoylglycero phosphocholine", + "full_description": "SM-15" + }, + { + "pValueText": "docosahexaenoate (DHA; 22:6n3) / eicosapentaenoate (EPA; 20:5n3)", + "full_description": "SM-16" + }, + { + "pValueText": "3-(4-hydroxyphenyl)lactate / isovalerylcarnitine", + "full_description": "SM-17" + }, + { + "pValueText": "age of initiation (years)", + "full_description": "SMKAGE" + }, + { + "pValueText": "duration (years)", + "full_description": "SMKDU" + }, + { + "pValueText": "sphingomyelin", + "full_description": "SM" + }, + { + "pValueText": "spectrum", + "full_description": "Spc" + }, + { + "pValueText": "surfactant protein D", + "full_description": "SP-D" + }, + { + "pValueText": "processing speed", + "full_description": "SPEED" + }, + { + "pValueText": "spatial recognition memory", + "full_description": "SRM" + }, + { + "pValueText": "spatial span", + "full_description": "SSP" + }, + { + "pValueText": "selective serotonin reuptake inhibitor", + "full_description": "SSRI" + }, + { + 
"pValueText": "Soluble Transferrin Receptor", + "full_description": "sTfR" + }, + { + "pValueText": "soluble receptor Tie-2", + "full_description": "sTie-2" + }, + { + "pValueText": "strict", + "full_description": "Str" + }, + { + "pValueText": "small-vessel disease", + "full_description": "SVD" + }, + { + "pValueText": "spatial working memory", + "full_description": "SWM" + }, + { + "pValueText": "Tetra-antennary glycans", + "full_description": "TA" + }, + { + "pValueText": "Total adipose tissue area", + "full_description": "TAT" + }, + { + "pValueText": "total-body less head bone mineral density", + "full_description": "TBLH-BMD" + }, + { + "pValueText": "total-body lean mass", + "full_description": "TB-LM" + }, + { + "pValueText": "total cholesterol", + "full_description": "TC" + }, + { + "pValueText": "24-h total energy expenditure", + "full_description": "TEE" + }, + { + "pValueText": "28 year time averaged fasting plasma glucose (FPG)", + "full_description": "tFPG" + }, + { + "pValueText": "triglycerides", + "full_description": "TG" + }, + { + "pValueText": "transforming growth factor", + "full_description": "TGF-b1" + }, + { + "pValueText": "fasting serum triglycerides/high density lipoprotein cholesterol", + "full_description": "TG/HDLC" + }, + { + "pValueText": "Type 1 diabetes diabetic nephropathy", + "full_description": "TIDN" + }, + { + "pValueText": "Total Protein S", + "full_description": "Total PS" + }, + { + "pValueText": "fasting serum triiodothyronine", + "full_description": "Total T3" + }, + { + "pValueText": "fasting serum thyroxine", + "full_description": "Total T4" + }, + { + "pValueText": "total protein", + "full_description": "TP" + }, + { + "pValueText": "tissue plasminogen activator", + "full_description": "tPA" + }, + { + "pValueText": "tumor necrosis factor alpha", + "full_description": "TNFA" + }, + { + "pValueText": "Trochanter bone mineral density", + "full_description": "TRBMD" + }, + { + "pValueText": "Trochanter bone mineral density males", + "full_description": "TRBMDm" + }, + { + "pValueText": "thyroid stimulating hormone", + "full_description": "TSH" + }, + { + "pValueText": "Tyrosine", + "full_description": "Tyr" + }, + { + "pValueText": "urinary albumin excretion", + "full_description": "UAE" + }, + { + "pValueText": "Valine", + "full_description": "Val" + }, + { + "pValueText": "volumetric bone mineral density", + "full_description": "vBMD" + }, + { + "pValueText": "Vitamin D plasma 25(OH)-D", + "full_description": "VitD250H" + }, + { + "pValueText": "Vitamin K plasma phylloquinone", + "full_description": "VitkPhylloq" + }, + { + "pValueText": "very-low-density lipoprotein", + "full_description": "VLDL" + }, + { + "pValueText": "Mean diameter for VLDL particles", + "full_description": "VLDL-D" + }, + { + "pValueText": "maximum oxygen consumption during treadmill fitness test", + "full_description": "VO2max" + }, + { + "pValueText": "velocity of sound", + "full_description": "VOS" + }, + { + "pValueText": "delayed recall for visually presented word list", + "full_description": "VPWL-dr" + }, + { + "pValueText": "verbal", + "full_description": "Vrb" + }, + { + "pValueText": "verbal recall", + "full_description": "VRM" + }, + { + "pValueText": "Willebrand factor", + "full_description": "vWF" + }, + { + "pValueText": "white blood cell", + "full_description": "WBC" + }, + { + "pValueText": "waist circumference", + "full_description": "WC" + }, + { + "pValueText": "WC adjustment for BMI", + "full_description": "WCadjBMI" + }, + { + "pValueText": "weight 
fluctuation", + "full_description": "WF" + }, + { + "pValueText": "Women’s Genome Health Study", + "full_description": "WGHS" + }, + { + "pValueText": "waist hip ratio", + "full_description": "WHR" + }, + { + "pValueText": "WHR adjustment for BMI", + "full_description": "WHRadjBMI" + }, + { + "pValueText": "word interference test", + "full_description": "WIT" + }, + { + "pValueText": "word list delayed recall", + "full_description": "WL-dr" + }, + { + "pValueText": "Wide-Range Achievement Test", + "full_description": "WRAT" + }, + { + "pValueText": "The cholesterol ester content of extra large HD", + "full_description": "XL-HDL-CE" + }, + { + "pValueText": "Triglycerides in very large HDL", + "full_description": "XL-HDL-TG" + }, + { + "pValueText": "extremely large VLDL particles", + "full_description": "XXL-VLDL-P" + }, + { + "pValueText": "(Chitinase 3-like 1) protein levels", + "full_description": "YKL-40" + } +] diff --git a/src/etl/json/data/gwascat_2_gnomad_superpopulation.json b/src/etl/json/data/gwascat_2_gnomad_superpopulation.json new file mode 100644 index 000000000..728b92551 --- /dev/null +++ b/src/etl/json/data/gwascat_2_gnomad_superpopulation.json @@ -0,0 +1,70 @@ +[ + { + "gwas_catalog_ancestry": "European", + "gnomadPopulation": "nfe" + }, + { + "gwas_catalog_ancestry": "African American or Afro-Caribbean", + "gnomadPopulation": "afr" + }, + { + "gwas_catalog_ancestry": "Native American", + "gnomadPopulation": "amr" + }, + { + "gwas_catalog_ancestry": "Asian unspecified", + "gnomadPopulation": "eas" + }, + { + "gwas_catalog_ancestry": "Hispanic or Latin American", + "gnomadPopulation": "amr" + }, + { + "gwas_catalog_ancestry": "East Asian", + "gnomadPopulation": "eas" + }, + { + "gwas_catalog_ancestry": "Central Asian", + "gnomadPopulation": "seu" + }, + { + "gwas_catalog_ancestry": "Oceanian", + "gnomadPopulation": "eas" + }, + { + "gwas_catalog_ancestry": "South East Asian", + "gnomadPopulation": "eas" + }, + { + "gwas_catalog_ancestry": "Other admixed ancestry", + "gnomadPopulation": "nfe" + }, + { + "gwas_catalog_ancestry": "African unspecified", + "gnomadPopulation": "afr" + }, + { + "gwas_catalog_ancestry": "Sub-Saharan African", + "gnomadPopulation": "afr" + }, + { + "gwas_catalog_ancestry": "Greater Middle Eastern (Middle Eastern, North African or Persian)", + "gnomadPopulation": "seu" + }, + { + "gwas_catalog_ancestry": "Aboriginal Australian", + "gnomadPopulation": "eas" + }, + { + "gwas_catalog_ancestry": "Other", + "gnomadPopulation": "nfe" + }, + { + "gwas_catalog_ancestry": "South Asian", + "gnomadPopulation": "eas" + }, + { + "gwas_catalog_ancestry": "NR", + "gnomadPopulation": "nfe" + } +] diff --git a/src/etl/json/schemas/studies.json b/src/etl/json/schemas/studies.json index 24cd11e46..232eb66db 100644 --- a/src/etl/json/schemas/studies.json +++ b/src/etl/json/schemas/studies.json @@ -2,37 +2,47 @@ "type": "struct", "fields": [ { - "name": "studyAccession", + "name": "studyId", "type": "string", "nullable": true, "metadata": {} }, { - "name": "pubmedId", + "name": "diseaseTrait", "type": "string", "nullable": true, "metadata": {} }, { - "name": "firstAuthor", + "name": "efos", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "pubmedId", "type": "string", "nullable": true, "metadata": {} }, { - "name": "publicationDate", + "name": "firstAuthor", "type": "string", "nullable": true, "metadata": {} }, { - "name": "journal", + "name": "publicationDate", "type": 
"string", "nullable": true, "metadata": {} }, { - "name": "link", + "name": "journal", "type": "string", "nullable": true, "metadata": {} @@ -44,8 +54,12 @@ "metadata": {} }, { - "name": "diseaseTrait", - "type": "string", + "name": "backgroundEfos", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, "nullable": true, "metadata": {} }, @@ -73,38 +87,6 @@ "nullable": true, "metadata": {} }, - { - "name": "summarystatsLocation", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "hasSumstats", - "type": "boolean", - "nullable": false, - "metadata": {} - }, - { - "name": "efos", - "type": { - "type": "array", - "elementType": "string", - "containsNull": true - }, - "nullable": true, - "metadata": {} - }, - { - "name": "backgroundEfos", - "type": { - "type": "array", - "elementType": "string", - "containsNull": true - }, - "nullable": true, - "metadata": {} - }, { "name": "discoverySamples", "type": { @@ -126,7 +108,7 @@ } ] }, - "containsNull": false + "containsNull": true }, "nullable": true, "metadata": {} @@ -152,20 +134,20 @@ } ] }, - "containsNull": false + "containsNull": true }, "nullable": true, "metadata": {} }, { - "name": "initialSampleCountEuropean", - "type": "double", + "name": "summarystatsLocation", + "type": "string", "nullable": true, "metadata": {} }, { - "name": "initialSampleCount", - "type": "double", + "name": "hasSumstats", + "type": "boolean", "nullable": true, "metadata": {} } diff --git a/src/etl/v2g/intervals/Liftover.py b/src/etl/v2g/intervals/Liftover.py index 3101127e0..c304c34fe 100644 --- a/src/etl/v2g/intervals/Liftover.py +++ b/src/etl/v2g/intervals/Liftover.py @@ -34,7 +34,7 @@ def __init__( Args: chain_file (str): Path to the chain file - max_difference (int): Maximum difference between the length of the mapped region and the original region. Defaults to 0. + max_difference (int): Maximum difference between the length of the mapped region and the original region. Defaults to 100. 
""" self.chain_file = chain_file self.max_difference = max_difference diff --git a/src/run_gwas_ingest.py b/src/run_gwas_ingest.py index 1bb7e0788..7be213e37 100644 --- a/src/run_gwas_ingest.py +++ b/src/run_gwas_ingest.py @@ -3,13 +3,23 @@ from typing import TYPE_CHECKING +import hail as hl import hydra if TYPE_CHECKING: from omegaconf import DictConfig from etl.common.ETLSession import ETLSession -from etl.gwas_ingest.study_ingestion import ingest_gwas_catalog_studies +from etl.gwas_ingest.pics import pics_all_study_locus +from etl.gwas_ingest.process_associations import ( + generate_association_table, + ingest_gwas_catalog_associations, +) +from etl.gwas_ingest.study_ingestion import ( + generate_study_table, + ingest_gwas_catalog_studies, + spliting_gwas_studies, +) @hydra.main(version_base=None, config_path=".", config_name="config") @@ -17,37 +27,67 @@ def main(cfg: DictConfig) -> None: """Run GWASCatalog ingestion.""" etl = ETLSession(cfg) - etl.logger.info("Ingesting GWAS Catalog data...") - - # This section is commented out for testing study ingestion: - # assoc = ingest_gwas_catalog_associations( - # etl, - # cfg.etl.gwas_ingest.inputs.gwas_catalog_associations, - # cfg.etl.gwas_ingest.inputs.variant_annotation, - # cfg.etl.gwas_ingest.parameters.p_value_cutoff, - # ).transform(harmonize_effect) - # - # etl.logger.info( - # f"Writing data to: {cfg.etl.gwas_ingest.outputs.gwas_catalog_associations}" - # ) - # - # ( - # assoc.write.mode(cfg.environment.sparkWriteMode).parquet( - # cfg.etl.gwas_ingest.outputs.gwas_catalog_associations - # ) - # ) - # - # Ingest GWAS Catalog studies: - gwas_studies = ingest_gwas_catalog_studies( + hl.init(sc=etl.spark.sparkContext, default_reference="GRCh38") + etl.logger.info("Ingesting GWAS Catalog association data...") + + # Ingesting GWAS Catalog associations: + raw_associations = ingest_gwas_catalog_associations( etl, - cfg.etl.gwas_ingest.inputs.gwas_catalog_studies, - cfg.etl.gwas_ingest.inputs.gwas_catalog_ancestries, - cfg.etl.gwas_ingest.inputs.summary_stats_list, + cfg.etl.gwas_ingest.inputs.gwas_catalog_associations, + cfg.etl.variant_annotation.outputs.variant_annotation, + cfg.etl.gwas_ingest.parameters.p_value_cutoff, + ) + + # Joining study and association + study_assoc = ( + # Ingesting GWAS Catalog studies: + ingest_gwas_catalog_studies( + etl, + cfg.etl.gwas_ingest.inputs.gwas_catalog_studies, + cfg.etl.gwas_ingest.inputs.gwas_catalog_ancestries, + cfg.etl.gwas_ingest.inputs.summary_stats_list, + ) + # Joining with associations: + .join(raw_associations, on="studyAccession", how="left") + # Splitting studies: + .transform(spliting_gwas_studies) + ) + + # Extracting study table and save: + studies = generate_study_table(study_assoc) + etl.logger.info( + f"Writing studies data to: {cfg.etl.gwas_ingest.outputs.gwas_catalog_studies}" + ) + ( + studies.write.mode(cfg.environment.sparkWriteMode).parquet( + cfg.etl.gwas_ingest.outputs.gwas_catalog_studies + ) ) - # Saving temporary output: - gwas_studies.write.mode(cfg.environment.sparkWriteMode).parquet( - cfg.etl.gwas_ingest.outputs.gwas_catalog_studies + # Extracting associations from the combined study/assoc table: + associations = generate_association_table(study_assoc) + + # Saved association data: + etl.logger.info( + f"Writing associations data to: {cfg.etl.gwas_ingest.outputs.gwas_catalog_associations}" + ) + ( + associations.write.mode(cfg.environment.sparkWriteMode).parquet( + cfg.etl.gwas_ingest.outputs.gwas_catalog_associations + ) + ) + + # Running PICS: + pics_data = 
pics_all_study_locus( + etl, + associations, + studies, + cfg.etl.gwas_ingest.inputs.gnomad_populations, + cfg.etl.gwas_ingest.parameters.min_r2, + cfg.etl.gwas_ingest.parameters.k, + ) + pics_data.write.mode("overwrite").parquet( + cfg.etl.gwas_ingest.outputs.pics_credible_set ) diff --git a/src/run_precompute_ld_indexes.py b/src/run_precompute_ld_indexes.py new file mode 100644 index 000000000..129def1a3 --- /dev/null +++ b/src/run_precompute_ld_indexes.py @@ -0,0 +1,32 @@ +"""Precompute LD indexes.""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import hydra + +from etl.gwas_ingest.ld import precompute_ld_index + +if TYPE_CHECKING: + from omegaconf import DictConfig + + +@hydra.main(version_base=None, config_path=".", config_name="config") +def main(cfg: DictConfig) -> None: + """Precompute LD indexes for all populations in gnomAD.""" + for population in cfg.etl.gwas_ingest.inputs.gnomad_populations: + parsed_index = precompute_ld_index( + population.index, + cfg.etl.gwas_ingest.parameters.ld_window, + cfg.etl.gwas_ingest.inputs.grch37_to_grch38_chain, + ) + + parsed_index.write.mode(cfg.environment.sparkWriteMode).parquet( + population.parsed_index + ) + + return None + + +if __name__ == "__main__": + main() diff --git a/tests/conftest.py b/tests/conftest.py index 2176c993d..7bc59d75f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,4 +12,9 @@ def spark() -> SparkSession: Returns: SparkSession: local spark session """ - return SparkSession.builder.master("local").appName("test").getOrCreate() + return ( + SparkSession.builder.config("spark.driver.bindAddress", "127.0.0.1") + .master("local") + .appName("test") + .getOrCreate() + ) diff --git a/tests/test_effect_harmonization.py b/tests/test_effect_harmonization.py index f83507130..0b958c916 100644 --- a/tests/test_effect_harmonization.py +++ b/tests/test_effect_harmonization.py @@ -6,42 +6,153 @@ import pyspark.sql.functions as f import pytest -from etl.gwas_ingest.effect_harmonization import get_reverse_complement +from etl.gwas_ingest.effect_harmonization import harmonize_effect if TYPE_CHECKING: - from pyspark.sql import DataFrame, SparkSession + from pyspark.sql import DataFrame + from pyspark.sql.session import SparkSession @pytest.fixture -def mock_allele_columns(spark: SparkSession) -> DataFrame: - """Mock Dataframe to test harmonisation.""" - return spark.createDataFrame( - [ - {"allele": "A", "reverseComp": "T", "isPalindrome": False}, - {"allele": "C", "reverseComp": "G", "isPalindrome": False}, - {"allele": "T", "reverseComp": "A", "isPalindrome": False}, - {"allele": "G", "reverseComp": "C", "isPalindrome": False}, - {"allele": "AT", "reverseComp": "AT", "isPalindrome": True}, - {"allele": "TTGA", "reverseComp": "TCAA", "isPalindrome": False}, - {"allele": "-", "reverseComp": None, "isPalindrome": False}, - {"allele": None, "reverseComp": None, "isPalindrome": False}, - {"allele": "CATATG", "reverseComp": "CATATG", "isPalindrome": True}, - ] - ).persist() +def mock_assoc_table(spark: SparkSession) -> DataFrame: + """Mock Dataframe to test effect harmonisation. 
+ Args: + spark (SparkSession): SparkSession -@pytest.fixture -def call_get_reverse_complement(mock_allele_columns: DataFrame) -> DataFrame: - """Test reverse complement on mock data.""" - return mock_allele_columns.transform( - lambda df: get_reverse_complement(df, "allele") + Returns: + A dataframe with columns required for effect harmonization + """ + data = [ + # Beta, no need to harmonization: + ( + "A", + "C", + "C", + 5, + -11, + 0.6, + "[0.013-0.024] unit increase", + True, + True, + False, + False, + False, + ), + # Beta, harmonization required: + ( + "A", + "C", + "T", + 5, + -11, + 0.6, + "[0.013-0.024] unit increase", + True, + False, + False, + False, + False, + ), + # Beta, effect dropped due to palindromic alleles: + ( + "A", + "T", + "A", + 5, + -11, + 0.6, + "[0.013-0.024] unit increase", + False, + False, + False, + False, + True, + ), + ] + mock_df = ( + spark.createDataFrame( + data, + [ + "referenceAllele", + "alternateAllele", + "riskAllele", + "pValueMantissa", + "pValueExponent", + "effectSize", + "confidenceInterval", + "isBeta", + "isBetaPositive", + "isOR", + "isORPositive", + "isTestPalindrome", + ], + ) + .withColumn("qualityControl", f.array()) + .persist() ) + return mock_df + + +@pytest.fixture +def call_effec_harmonized(mock_assoc_table: DataFrame) -> DataFrame: + """Test filter association by rsid based on mock DataFrame.""" + return mock_assoc_table.transform(harmonize_effect) + + +def test_harmonize_effect__betas(call_effec_harmonized: DataFrame) -> None: + """Testing if the betas are found as expected.""" + betas_count = call_effec_harmonized.filter( + ~f.col("isBeta") + .cast("int") + .bitwiseXOR(f.col("beta").isNotNull().cast("int")) + .cast("boolean") + ).count() + + assert betas_count == call_effec_harmonized.count() + + +def test_harmonize_effect__schema( + call_effec_harmonized: DataFrame, mock_assoc_table: DataFrame +) -> None: + """Testing if the expected columns are added and removed.""" + columns_to_drop = [ + "effectSize", + "confidenceInterval", + "riskAllele", + ] + columns_to_add = [ + "odds_ratio_ci_lower", + "odds_ratio_ci_upper", + "odds_ratio", + "beta", + "beta_ci_upper", + "beta_ci_lower", + ] + + # Generating the new set of columns from the input table: + expected_columns = [ + col for col in mock_assoc_table.columns if col not in columns_to_drop + ] + columns_to_add + expected_columns.sort() + + # Observed columns: + observed_columns = call_effec_harmonized.columns + observed_columns.sort() + + assert expected_columns == observed_columns + + +def test_harmonize_effect__palindrome_flag(call_effec_harmonized: DataFrame) -> None: + """Test to ensure all palindromic alleles are flagged as such.""" + palindrome_flagged = ( + # Keeping + call_effec_harmonized.filter( + f.col("isTestPalindrome") | (f.size(f.col("qualityControl")) > 0) + ) + .filter(~f.col("isTestPalindrome") | (f.size(f.col("qualityControl")) == 0)) + .count() + ) -def test_reverse_complement(call_get_reverse_complement: DataFrame) -> None: - """Test reverse complement.""" - assert ( - call_get_reverse_complement.filter( - f.col("reverseComp") != f.col("revcomp_allele") - ).count() - ) == 0 + assert palindrome_flagged == 0 diff --git a/tests/test_gwas_process_assoc.py b/tests/test_gwas_process_assoc.py index a4f6f1a59..b3a90e1ea 100644 --- a/tests/test_gwas_process_assoc.py +++ b/tests/test_gwas_process_assoc.py @@ -5,282 +5,75 @@ import pytest from pyspark.sql import DataFrame, SparkSession from pyspark.sql import functions as f +from pyspark.sql import types as t -from 
etl.gwas_ingest.process_associations import ( - concordance_filter, - filter_assoc_by_maf, - filter_assoc_by_rsid, -) +from etl.gwas_ingest.association_mapping import clean_mappings @pytest.fixture -def mock_maf_filter_data(spark: SparkSession) -> DataFrame: - """Mock minor allele frequency DataFrame for filtering. - - Args: - spark (SparkSession): Spark session - - Returns: - DataFrame: Mock minor allele frequency DataFrame - """ - return ( - spark.createDataFrame( - [ - # Simple case: - {"aid": 1, "pop1": 0.1, "pop2": 0.4, "keep": True}, - {"aid": 1, "pop1": 0.1, "pop2": 0.2, "keep": False}, - # Flip AF -> MAF required: - {"aid": 2, "pop1": 0.9, "pop2": 0.6, "keep": True}, - {"aid": 2, "pop1": 0.1, "pop2": 0.8, "keep": False}, - # Missing values handled properly: - {"aid": 3, "pop1": None, "pop2": 0.1, "keep": False}, - {"aid": 3, "pop1": 0.1, "pop2": 0.2, "keep": True}, - {"aid": 4, "pop1": None, "pop2": 0.6, "keep": True}, - {"aid": 4, "pop1": 0.1, "pop2": 0.3, "keep": False}, - ] - ) - .select( - f.col("aid").alias("associationId"), - f.struct(f.col("pop1").alias("pop1"), f.col("pop2").alias("pop2")).alias( - "alleleFrequencies" - ), - "keep", - f.monotonically_increasing_id().alias("id"), - ) - .persist() - ) - - -@pytest.fixture -def call_maf_filter(mock_maf_filter_data: DataFrame) -> DataFrame: - """Test filter association by MAF based on mock DataFrame.""" - return mock_maf_filter_data.transform(filter_assoc_by_maf) - - -@pytest.fixture -def mock_concordance_filter_data(spark: SparkSession) -> DataFrame: - """Mock DataFrame to assess allele concordance. - - Args: - spark (SparkSession): Spark session - - Returns: - DataFrame: Mock allele concordances - """ +def mock_mapping_dataset(spark: SparkSession) -> DataFrame: + """Mock association dataset with arbitrary mappings.""" data = [ - ( - 0, - "A", - "A", - "T", - True, - ), # Concordant positive, ref. - ( - 1, - "A", - "G", - "A", - True, - ), # Concordant positive, alt. - ( - 2, - "A", - "T", - "G", - True, - ), # Concordant negative, ref. - ( - 3, - "A", - "G", - "T", - True, - ), # Concordant negative, alt. - ( - 4, - "?", - "G", - "T", - True, - ), # Concordant ambigious. - ( - 5, - "ATCG", - "C", - "T", - False, - ), # discordant. 
+ # Association No1: Although both mappings are concordant, only v2 has matching rsid: + (1, "A", ["rs123"], "v1", [], "A", "G", [[0.9], [0.5]], False), + (1, "A", ["rs123"], "v2", ["rs123"], "T", "C", [[0.3], [0.4]], True), + # Association No2: no mapping, we keep the row: + (2, "A", ["rs123"], None, None, None, None, None, True), + # Association No3: No matching rsid, keep concordant variant V3: + (3, "A", ["rs123"], "v3", [], "A", "G", [[0.9], [0.5]], True), + (3, "A", ["rs123"], "v4", [], "G", "C", [[0.3], [0.4]], False), + # Association No4: No matching rsid, all concordant, keep variant with highest MAF- V6: + (4, "A", ["rs123"], "v5", [], "A", "G", [[0.9], [0.3]], False), + (4, "A", ["rs123"], "v6", [], "G", "T", [[0.6], [0.2]], True), ] - return spark.createDataFrame( - data, ["id", "riskAllele", "referenceAllele", "alternateAllele", "concordant"] + schema = t.StructType( + [ + t.StructField("associationId", t.IntegerType(), True), + t.StructField("riskAllele", t.StringType(), True), + t.StructField("rsIdsGwasCatalog", t.ArrayType(t.StringType()), True), + t.StructField("variantId", t.StringType(), True), + t.StructField("rsIdsGnomad", t.ArrayType(t.StringType()), True), + t.StructField("referenceAllele", t.StringType(), True), + t.StructField("alternateAllele", t.StringType(), True), + t.StructField( + "alleleFrequencies", + t.ArrayType( + t.StructType( + [t.StructField("alleleFrequency", t.FloatType(), True)] + ) + ), + ), + t.StructField("keptMapping", t.BooleanType(), True), + ] ) + return spark.createDataFrame(data, schema=schema).persist() @pytest.fixture -def call_concordance_filter(mock_concordance_filter_data: DataFrame) -> DataFrame: - """Test allele concordance filter based on mock DataFrame.""" - return mock_concordance_filter_data.transform(concordance_filter) - - -@pytest.fixture -def call_rsid_filter(mock_rsid_filter: DataFrame) -> DataFrame: - """Test filter association by rsid based on mock DataFrame.""" - return mock_rsid_filter.transform(filter_assoc_by_rsid) - - -@pytest.fixture -def mock_rsid_filter(spark: SparkSession) -> DataFrame: - """Mock DataFrame to evaluate rsids. 
- - Args: - spark (SparkSession): Spark session - - Returns: - DataFrame: Configurations of rsids in resources and expected outcomes - """ - data = [ - # Assoc id 1: matching rsId exist: - ( - 1, - ["rs123", "rs523"], - ["rs123"], - True, - False, - ), - ( - 1, - ["rs123", "rs523"], - ["rs12"], - False, - True, - ), - # Assoc id 2: matching rsId exist: - ( - 2, - ["rs523"], - [], - False, - True, - ), - ( - 2, - ["rs523"], - ["rs12", "rs523"], - True, - False, - ), - # Assoc id 3: matching rsId doesn't exists, so keep all: - ( - 3, - ["rs123"], - [], - True, - False, - ), - ( - 3, - ["rs123"], - ["rs643", "rs523"], - True, - False, - ), - # Assoc id 4: two matching rsids exist, keep both: - ( - 4, - ["rs123", "rs523"], - ["rs123"], - True, - False, - ), - ( - 4, - ["rs123", "rs523"], - ["rs523"], - True, - False, - ), - ( - 4, - ["rs123", "rs523"], - ["rs666"], - False, - True, - ), - ] - return spark.createDataFrame( - data, ["associationId", "rsIdsGwasCatalog", "rsIdsGnomad", "retain", "drop"] - ) - - -def test_filter_assoc_by_rsid__all_columns_are_there( - mock_rsid_filter: DataFrame, call_rsid_filter: DataFrame -) -> None: - """Testing if the returned dataframe contains all columns from the source.""" - source_columns = mock_rsid_filter.columns - processed_columns = call_rsid_filter.columns - assert any(column in processed_columns for column in source_columns) - - -def test_filter_assoc_by_rsid__right_rows_are_dropped( - call_rsid_filter: DataFrame, -) -> None: - """Testing if all the retained columns should not be dropped.""" - dropped = call_rsid_filter.transform(filter_assoc_by_rsid).select("drop").collect() - assert not any(d["drop"] for d in dropped) - - -def test_filter_assoc_by_rsid__right_rows_are_kept( - call_rsid_filter: DataFrame, -) -> None: - """Testing if all the retained columns should be kept.""" - kept = call_rsid_filter.transform(filter_assoc_by_rsid).select("retain").collect() - assert all(d["retain"] for d in kept) - - -def test_concordance_filter__type(call_concordance_filter: DataFrame) -> None: - """Testing if the function returns the right type.""" - assert isinstance(call_concordance_filter, DataFrame) +def cleaned_mock(mock_mapping_dataset: DataFrame) -> DataFrame: + """Function to return the cleaned mappings.""" + return clean_mappings(mock_mapping_dataset).persist() -def test_concordance_filter__all_columns_returned( - call_concordance_filter: DataFrame, mock_concordance_filter_data: DataFrame +def test_clean_mappings__schema( + cleaned_mock: DataFrame, mock_mapping_dataset: DataFrame ) -> None: - """Testing if the function returns the right type.""" - source_columns = mock_concordance_filter_data.columns - processed_columns = call_concordance_filter.columns - - assert any(column in processed_columns for column in source_columns) - - -def test_concordance_filter__right_rows_retained( - call_concordance_filter: DataFrame, mock_concordance_filter_data: DataFrame -) -> None: - """Testing if the filter generated the expected output.""" - target_ids = [ - row["id"] - for row in ( - mock_concordance_filter_data.filter(f.col("concordant")) - .select("id") - .orderBy("id") - .collect() - ) - ] - filtered_ids = [ - row["id"] - for row in (call_concordance_filter.select("id").orderBy("id").collect()) - ] - - assert filtered_ids == target_ids + """Test to make sure the right columns are returned from the function.""" + assert any( + column in cleaned_mock.columns for column in mock_mapping_dataset.columns + ) -def test_maf_filter__right_rows_retained( - 
call_maf_filter: DataFrame, mock_maf_filter_data: DataFrame
+def test_clean_mappings__rows(
+    cleaned_mock: DataFrame, mock_mapping_dataset: DataFrame
 ) -> None:
-    """Testing if the filter generated the expected output."""
-    target_ids = [
-        row["id"]
-        for row in (mock_maf_filter_data.filter(f.col("keep")).orderBy("id").collect())
-    ]
-    filtered_ids = [row["id"] for row in (call_maf_filter.orderBy("id").collect())]
-
-    assert filtered_ids == target_ids
+    """Test to make sure the right gnomAD mappings are kept."""
+    columns = cleaned_mock.columns
+    expected = (
+        mock_mapping_dataset.select(*columns)
+        .filter(f.col("keptMapping"))
+        .orderBy("variantId")
+        .collect()
+    )
+    observed = cleaned_mock.select(*columns).orderBy("variantId").collect()
+    assert expected == observed
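
Note on gwascat_2_gnomad_superpopulation.json: this lookup collapses GWAS Catalog broad ancestry labels into gnomAD population codes so that per-study sample counts can be apportioned across the gnomAD LD panels used downstream. The snippet below is a minimal sketch of how such a lookup could be joined onto an ancestry table to derive relative population weights; the ancestry table, its column names (ancestry, numberOfIndividuals) and the loading mechanism are assumptions for illustration, not the actual study-ingestion code.

import json
from importlib import resources

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()

# Load the packaged lookup (loading mechanism assumed for illustration):
mapping = json.loads(
    resources.read_text("etl.json.data", "gwascat_2_gnomad_superpopulation.json")
)
mapping_df = spark.createDataFrame(mapping)

# Hypothetical per-study ancestry table with sample counts:
ancestries = spark.createDataFrame(
    [("GCST000001", "European", 12000), ("GCST000001", "East Asian", 3000)],
    ["studyId", "ancestry", "numberOfIndividuals"],
)

# Collapse ancestries to gnomAD populations and compute relative sample weights per study:
weights = (
    ancestries.join(
        mapping_df, ancestries.ancestry == mapping_df.gwas_catalog_ancestry, "left"
    )
    .groupBy("studyId", "gnomadPopulation")
    .agg(f.sum("numberOfIndividuals").alias("sampleSize"))
    .withColumn(
        "relativeSampleSize",
        f.col("sampleSize") / f.sum("sampleSize").over(Window.partitionBy("studyId")),
    )
)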
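
Note on run_precompute_ld_indexes.py: the gnomAD v2.1.1 LD matrices and their variant index tables are on GRCh37, while the rest of the pipeline works on GRCh38, which is why precompute_ld_index takes the GRCh37-to-GRCh38 chain file. A rough sketch of the underlying Hail liftover step is shown below; the paths are placeholders and the real function in etl/gwas_ingest/ld.py may window, filter and reshape the index differently.

import hail as hl

hl.init(default_reference="GRCh37")

# Placeholder paths; the real values come from the hydra config:
ld_index_path = "gs://<bucket>/gnomad.genomes.r2.1.1.nfe.common.adj.ld.variant_indices.ht"
chain_file = "gs://<bucket>/references/grch37_to_grch38.over.chain.gz"

# Register the GRCh37 -> GRCh38 liftover on the reference genome:
rg37 = hl.get_reference("GRCh37")
rg38 = hl.get_reference("GRCh38")
rg37.add_liftover(chain_file, rg38)

# Annotate each indexed variant with its GRCh38 locus and drop variants that do not lift over:
ld_index = hl.read_table(ld_index_path)
ld_index = ld_index.annotate(locus_grch38=hl.liftover(ld_index.locus, "GRCh38"))
ld_index = ld_index.filter(hl.is_defined(ld_index.locus_grch38))

# Convert to a Spark dataframe so the driver script can write it out as parquet:
parsed_index = ld_index.to_spark()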
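
Note on the min_r2 and k parameters passed to pics_all_study_locus: PICS (Farh et al., 2015) scores every variant in LD with a lead variant by modelling its expected -log10 p-value as a normal distribution derived from the lead's -log10 p-value and the LD r2; the normalised scores are then accumulated (cumulative sum) to define the credible set. The snippet below is a plain-Python sketch of the formulas as described in the publication, with k defaulting to 6.4; it is illustrative only and does not reproduce the Spark implementation in etl/gwas_ingest/pics.py.

from scipy.stats import norm


def pics_relative_posterior(lead_neglog_p: float, r2: float, k: float = 6.4) -> float:
    """Relative probability that a variant with LD r2 to the lead variant is the causal one."""
    mu = r2 * lead_neglog_p
    std = ((1 - r2 ** (k / 2)) ** 0.5) * (lead_neglog_p ** 0.5) / 2
    if std == 0:
        # The lead variant itself (r2 == 1) carries the full weight of the observed signal.
        return 1.0
    return 2 * norm(mu, std).sf(lead_neglog_p)


# Toy locus: lead variant with p = 1e-12 and two tag variants; variants below the
# min_r2 cut-off would already have been removed before this step.
lead_neglog_p = 12.0
r2_by_variant = {"lead": 1.0, "tag1": 0.9, "tag2": 0.6}
raw = {v: pics_relative_posterior(lead_neglog_p, r2) for v, r2 in r2_by_variant.items()}
total = sum(raw.values())
posteriors = {v: p / total for v, p in raw.items()}  # normalised; cumulative sum defines the credible set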