diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 10dc9c10d..5abb30c8e 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -113,7 +113,7 @@ class StudyLocusQualityCheck(Enum): EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set" OUT_OF_SAMPLE_LD = "Study locus finemapped without in-sample LD reference" ABNORMAL_PIPS = ( - "Study locus with a sum of PIPs that not in the expected range [0.99,1]" + "Study locus with a sum of PIPs that not in the expected range [0.95,1]" ) INVALID_CHROMOSOME = "Chromosome not in 1:22, X, Y, XY or MT" TOP_HIT_AND_SUMMARY_STATS = ( diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py index 4c0c8c4c5..cf36a4389 100644 --- a/src/gentropy/study_locus_validation.py +++ b/src/gentropy/study_locus_validation.py @@ -46,24 +46,24 @@ def __init__( .annotate_study_type(study_index) # Add study type to study locus .qc_redundant_top_hits_from_PICS() # Flagging top hits from studies with PICS summary statistics .qc_explained_by_SuSiE() # Flagging credible sets in regions explained by SuSiE - # Flagging credible sets with PIP > 1 or PIP < 0.99 + # Annotate credible intervals and filter to keep only 95% credible sets + .filter_credible_set(credible_interval=CredibleInterval.IS95) + # Flagging credible sets whose sum of PIPs is > 1 or < 0.95 .qc_abnormal_pips( - sum_pips_lower_threshold=0.99, sum_pips_upper_threshold=1.0001 + sum_pips_lower_threshold=0.95, sum_pips_upper_threshold=1.0001 ) - # Annotates credible intervals and filter to only keep 99% credible sets - .filter_credible_set(credible_interval=CredibleInterval.IS99) # Annotate credible set confidence: .assign_confidence() ).persist() # we will need this for 2 types of outputs # Valid study locus partitioned to simplify the finding of overlaps - study_locus_with_qc.valid_rows( - invalid_qc_reasons, invalid=True - ).df.repartitionByRange("chromosome", 
"position").sortWithinPartitions( + study_locus_with_qc.valid_rows(invalid_qc_reasons).df.repartitionByRange( "chromosome", "position" - ).write.mode(session.write_mode).parquet(invalid_study_locus_path) - - # Infalid study locus - study_locus_with_qc.valid_rows(invalid_qc_reasons).df.write.mode( + ).sortWithinPartitions("chromosome", "position").write.mode( session.write_mode ).parquet(valid_study_locus_path) + + # Invalid study locus + study_locus_with_qc.valid_rows(invalid_qc_reasons, invalid=True).df.write.mode( + session.write_mode + ).parquet(invalid_study_locus_path)