# merge_haplotypePandas.py

#!/home/bin/python


# Printed before the imports below run; presumably so that a failing import
# is easy to locate in the console output.
print("\nChecking required modules \n")
import argparse
from functools import reduce
import time
import resource
import re
import sys
import os
import shutil
import pandas as pd


""" Purpose of the program: merge the haplotype files generated by phase-Extender
and phase-Stitcher into a single file. """

# Wall-clock time at module load; main() subtracts it from time.time()
# to report the elapsed run time.
start01 = time.time()


""" Call input and output argument variable names"""
# inFile = sys.argv[1]
# outFile = sys.argv[2]

# hapList = hapList.txt
# f1List= f1_list.txt
# outFile = mergedHaplotype

# hap_paths="/media/everestial007/SeagateBackup4.0TB2/RNAseq_Data_Analyses/" \
#        "04_B-GenomeWidePhasing/Step02_A-PhaseExtension/hapList.txt"
# f1_paths="/media/everestial007/SeagateBackup4.0TB2/RNAseq_Data_Analyses/" \
#        "04_B-GenomeWidePhasing/Step02_A-PhaseExtension/f1List.txt"


def main():
    """Merge the haplotype files named on the command line into one table.

    Command-line arguments (parsed with argparse):
      --hapList      file listing, one per line, the paths of haplotype files
                     obtained from phase-Extender (optional)
      --f1List       file listing, one per line, the paths of F1 hybrid
                     haplotype files obtained from phase-Stitcher (optional)
      --f1ParentIDs  comma separated "paternal,maternal" names; "_hap" is
                     appended to each to form the haplotype column headers.
                     Defaults to the headers "pat_hap" and "mat_hap".
      --output       directory that receives "merged_haplotype.txt"; an
                     existing directory of that name is removed first.

    Side effects:
      * sets the module globals ``args``, ``pat_hap`` and ``mat_hap``
        (the latter two are read by ``extract_haplotype``),
      * prints progress, memory usage and elapsed time to stdout,
      * re-creates the output directory and writes the merged table
        as a tab separated file.

    Exits with a non-zero status when neither list is given, when
    ``--f1ParentIDs`` does not contain two names, or when the lists name
    no haplotype file at all.
    """
    global args
    global pat_hap
    global mat_hap

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--hapList",
        help="name of the file that contains list of path to haplotype files obtained from phase-Extender.",
        required=False,
    )

    parser.add_argument(
        "--f1List",
        help="name of the file that contains list of the path to F1 hybrid haplotype files obtained "
        "from phase-Stitcher.",
        required=False,
    )

    parser.add_argument(
        "--f1ParentIDs",
        help="comma separated names of the column header that indicates paternal haplotype "
        "vs. maternal haplotype in F1 hybrids haplotypes. "
        "Only required if 'f1List' is reported. ",
        required=False,
    )

    parser.add_argument(
        "--output",
        help="Directory name to store the merged haplotype file.",
        required=True,
    )

    args = parser.parse_args()

    ## Assign the argument variables
    hap_paths = args.hapList
    f1_paths = args.f1List
    output = args.output

    # Validate the inputs *before* the output directory is wiped, so that a
    # bad invocation never destroys the results of a previous run.
    if hap_paths is None and f1_paths is None:
        print(
            "Haplotype files from both phase-Stitcher and phase-Extender are missing."
        )
        print("Provide HAPLOTYPE files from at least one category.")
        print()
        sys.exit(1)  # non-zero: this is an error, not a normal termination

    if args.f1ParentIDs is not None:
        parent_ids = [pid.strip() for pid in args.f1ParentIDs.split(",")]
        # Only the first two names are used (paternal, maternal); fewer than
        # two is reported as a usage error instead of a bare IndexError.
        if len(parent_ids) < 2 or not (parent_ids[0] and parent_ids[1]):
            parser.error(
                "--f1ParentIDs expects two comma separated names "
                "(paternal,maternal); got '%s'" % args.f1ParentIDs
            )
        pat_hap = parent_ids[0] + "_hap"
        mat_hap = parent_ids[1] + "_hap"
    else:
        pat_hap = "pat_hap"
        mat_hap = "mat_hap"

    # Read the path lists up front; the 'with' blocks make sure the list
    # files are closed again (they used to be left open).
    hap_list = []
    f1_list = []
    if hap_paths is not None:
        with open(hap_paths, "r") as list_file:
            hap_list = list_file.readlines()
        print("Reading HAPLOTYPE file names obtained from phase-Extender")

    if f1_paths is not None:
        with open(f1_paths, "r") as list_file:
            f1_list = list_file.readlines()
        print("Reading HAPLOTYPE file names obtained from phase-Stitcher")

    # Create a fresh output directory
    if os.path.exists(output):
        shutil.rmtree(output)
    os.makedirs(output, exist_ok=True)

    pd_df_list = []  # to store dataframes

    print()
    print("## Loading the haplotype files")
    print()

    """Step 01: read each haplotype file into a pandas dataframe. """

    ## from regular haplotype obtained from phase-Extender
    if hap_paths is not None:
        pd_df_list = extract_haplotype(hap_list, pd_df_list, list_type="phaseExt")

    ## from F1 hybrid haplotype obtained from phase-Stitcher
    if f1_paths is not None:
        pd_df_list = extract_haplotype(f1_list, pd_df_list, list_type="phaseStc")

    # reduce() without an initial value raises TypeError on an empty list;
    # report the real problem instead.
    if not pd_df_list:
        print("No haplotype file was found in the provided list file(s).")
        print()
        sys.exit(1)

    print()
    """ Step 02: Merging the dataframes """
    print("Merging all the haplotype files together")
    merge_keys = ["CHROM", "POS", "REF", "all-alleles"]
    # 'sort' is a boolean flag in pd.merge: True orders the result by the
    # join keys. (A list was passed before, which only worked by being truthy.)
    pd_df_merged = reduce(
        lambda left, right: pd.merge(
            left, right, on=merge_keys, how="outer", sort=True
        ),
        pd_df_list,
    ).fillna(".")

    # add column with empty/default value
    # - this is done to maintain required data structure downstream
    if "all-freq" not in pd_df_merged.columns:
        pd_df_merged.insert(4, "all-freq", ".")

    """ Step 03: Write data to a single file """
    column_names = list(pd_df_merged.columns)

    with open(os.path.join(output, "merged_haplotype.txt"), "w") as fileout:
        # the header is written by hand, the rows by pandas
        fileout.write("\t".join(column_names) + "\n")
        pd_df_merged.to_csv(fileout, sep="\t", index=False, header=False)

        print("  - Worker maximum memory usage : %.2f (mb)" % (current_mem_usage()))

    print("Global maximum memory usage: %.2f (mb)" % current_mem_usage())
    print("elapsed time: ", time.time() - start01)


def extract_haplotype(list_names, pd_df_list, list_type):
    """Load every haplotype file named in ``list_names`` into ``pd_df_list``.

    Parameters:
        list_names: iterable of lines, each holding the path of one tab
            separated haplotype file (an open file handle or a list of
            lines). Blank lines are skipped.
        pd_df_list: list that the loaded dataframes are appended to.
        list_type: "phaseStc" marks F1 hybrid haplotypes from
            phase-Stitcher, which get reformatted (see below); any other
            value (e.g. "phaseExt") loads the file as is.

    For every file the "CHROM" and "POS" columns are coerced to numbers
    (unparsable values become NaN), columns whose header contains ":" are
    cast to object, and the "all-freq", "log2odds" and "lods" columns are
    dropped when present.

    For "phaseStc" files the two parental haplotype columns named by the
    module globals ``pat_hap`` and ``mat_hap`` are combined into
    "<sample>:PG_al" and "<sample>:PI" and then dropped.
    NOTE(review): this assumes both parental columns hold strings and that
    the file carries a single "<sample>:PI" column - confirm against the
    phase-Stitcher output format.

    Returns:
        The same ``pd_df_list`` object, with one dataframe appended per file.

    Raises:
        KeyError: if a "phaseStc" file lacks one of the parental columns.
    """
    for names in list_names:
        # strip() also removes '\r' from Windows line endings
        names = names.strip()
        if not names:
            # a blank line (e.g. at the end of the list) is not a path
            continue
        print("names :", names)
        print()

        ## Reading dataframe
        df_by_pd = pd.read_csv(names, sep="\t")
        # coerce whole columns at once; values that are not numbers become NaN
        df_by_pd["CHROM"] = pd.to_numeric(df_by_pd["CHROM"], errors="coerce")
        df_by_pd["POS"] = pd.to_numeric(df_by_pd["POS"], errors="coerce")

        for cols in list(df_by_pd.columns):
            if ":" in cols:
                df_by_pd[cols] = df_by_pd[cols].astype(object)

        ### find the sample name (prefix of the ":PI" column header)
        sample_name = ",".join(
            x.split(":")[0] for x in df_by_pd.columns if x.endswith(":PI")
        )
        print("Sample name: %s " % sample_name)

        print("Dropping columns and appending the pandas to a list")
        df_by_pd.drop(
            columns=["all-freq", "log2odds", "lods"], errors="ignore", inplace=True
        )

        """ for the haplotypes that are from "F1-Hybrids" we need to change the
        format a little bit. It should have GT,former PI-PG, new PI-PG. """
        if list_type == "phaseStc":
            print("Using F1 haplotype")

            missing = [c for c in (pat_hap, mat_hap) if c not in df_by_pd.columns]
            if missing:
                raise KeyError(
                    "F1 haplotype file '%s' has no column(s) %s; "
                    "check the names given to --f1ParentIDs"
                    % (names, ", ".join(missing))
                )

            """ Update the column values """
            # ** Note: the column names assignment i.e "mat_hap" might change in the future.
            pg_col = sample_name + ":PG_al"
            df_by_pd[pg_col] = df_by_pd[pat_hap] + "|" + df_by_pd[mat_hap]

            # ":PI" must be derived while ":PG_al" still holds the raw
            # "pat|mat" string, i.e. before the "N" masking below.
            df_by_pd[sample_name + ":PI"] = df_by_pd.apply(
                lambda row: update_cols(row, sample_name), axis=1
            )

            ## further update the ":PG_al" values
            df_by_pd[pg_col] = df_by_pd[pg_col].apply(
                lambda x: "." if "N" in x else x
            )

            # Drop the non-required columns
            df_by_pd.drop([pat_hap, mat_hap], axis=1, inplace=True)

        ## append the dataframe to a list
        pd_df_list.append(df_by_pd)

        # measuring memory
        print("  - Worker maximum memory usage : %.2f (mb)" % (current_mem_usage()))
        print()

    return pd_df_list


def update_cols(df_row, sample_name):
    """Pick the value for the "<sample_name>:PI" field of one row.

    Looks at the row's "<sample_name>:PG_al" entry: when it contains the
    substring "N|N" the result is ".", otherwise the row's "CHROM" value
    is returned unchanged.
    """
    phased_alleles = df_row[sample_name + ":PG_al"]
    return "." if "N|N" in phased_alleles else df_row["CHROM"]


""" to monitor memory """


def current_mem_usage():
    """Return the peak resident set size of this process in megabytes.

    The value comes from ``resource.getrusage(RUSAGE_SELF).ru_maxrss``,
    whose unit is platform dependent: kilobytes on Linux, bytes on macOS.
    The divisor is chosen accordingly so the result is megabytes on both.
    """
    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform == "darwin":
        # macOS reports bytes
        return peak_rss / (1024.0 * 1024.0)
    # Linux (and the other supported Unixes) report kilobytes
    return peak_rss / 1024.0


## ** currently not called in this script - kept for alpha-numeric sorting in the future.
"""Function for name sorting while reading files.
   - This function helps to read the files in alpha-numerical order when multiprocessing. """
# Matches one run of digits; the capturing group makes split() keep the digits.
numbers = re.compile(r"(\d+)")


def numericalSort(value):
    """Build a sort key that orders embedded numbers by their numeric value.

    ``value`` is split on runs of digits; the text pieces stay strings and
    the digit runs (which land on the odd positions of the split result)
    are converted to ``int``. The resulting list is returned.
    """
    return [
        int(piece) if position % 2 else piece
        for position, piece in enumerate(numbers.split(value))
    ]


# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()