remove_relateds.py

#!/usr/bin/env python

"""This program outputs a set of unrelateds, with parameters specified by the user"""
# Usage: pyhton3.6 remove_relateds.py ponderosa_pairs_file.txt king.seg fam_file.fam <5th, 4th, 3rd, 2nd>
# the last parameter is the maximum degree of relatedness to allow
# Author: CW

import sys
import os.path
try:
	import pandas as pd
	import numpy as np
	from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
except:
	print('Unable to find either pandas, numpy or sklearn. If using augrabies, rerun the command but replace "python" with "/software/anaconda3/4.5.12/lssc0-linux/bin/python3.6"')
	print("Alternatively, run with a virtual environment and pip install pandas, numpy, and sklearn.")
	sys.exit()

__author__ =  "Cole Williams"
__email__ = "cole_williams@brown.edu"

try:
	degree = sys.argv[-1]
	fam_file = sys.argv[-2]
	king_file = sys.argv[-3]
	pairs_file = sys.argv[-4]
except:
	print("Please format command as follows: python remove_relateds.py [pairs file] [king file] [fam file] [degree threshold]")
	print("The pairs file is the .pairs file generated from PONDEROSA.")
	print("The pairs file should be included to most accurately remove relateds.")
	print("If you do not wish to include the pairs file, replace it with None and the script will use KING's degree inference.")
	print("Alternatively, you can replace this argument with a float indicating the max pi hat value to be considered unrelated.")
	print("The king file is the .seg file generated by KING.")
	print("The fam file is the .fam file of individuals to consider. Assumes all individuals in the 2nd column (IID col) are genotyped.")
	print("The degree threshold is the maximum degree of relatedness to include as unrelated, either 5th, 4th, 3rd, or 2nd")
	print("For instance, using 3rd will include all pairs who are 3rd degree related or less")
	sys.exit()

def find_unrelated(degree,king_file,fam_file,pairs_file):
	ibd_data = {(i.split()[1],i.split()[3]):[float(i.split()[6]),float(i.split()[7])] for i in open(king_file).readlines()[1:]}
	fam_list = [i.split()[1] for i in open(fam_file).readlines()]

	sd = 2
	degree_list = ["5th","4th","3rd","2nd","FS","PO"]

	def lda_classif(degree,pairs_df):
		def in_sd(ibd1_val,data):
			return abs(ibd1_val-data[0]) < (sd*data[1])
		degrees = degree_list[degree_list.index(degree):]
		pairs_df = pairs_df[pairs_df["DEGREE"].isin(degrees)]
		mean_stdev = {}
		for i in degrees:
			mean_stdev[i] = [pairs_df[pairs_df["DEGREE"]==i]["IBD1"].mean(),pairs_df[pairs_df["DEGREE"]==i]["IBD1"].std()]
		pairs_df["KEEP"] = pairs_df.apply(lambda x: in_sd(x.IBD1,mean_stdev[x.DEGREE]),axis=1)
		pairs_df = pairs_df[pairs_df["KEEP"]].copy()
		train_val,train_lab = pairs_df[["IBD1","IBD2"]].values.tolist(),pairs_df["DEGREE"].values.tolist()
		classif = LinearDiscriminantAnalysis().fit(train_val,train_lab)
		return classif

	def fifth_classif(pairs_df):
		pairs_df = pairs_df[pairs_df["DEGREE"]=="4th"].copy()
		mean,stdev = pairs_df["PIHAT"].mean(),pairs_df["PIHAT"].std()
		return mean - (sd*stdev)

	def king_classif(degree):
		related = degree_list[degree_list.index(degree):][1:]
		return [(i.split()[1],i.split()[3]) for i in open(king_file).readlines()[1:] if i.split()[-1] in related]

	def get_coeff(iid1,iid2):
		if (iid1,iid2) in ibd_data:
			return ibd_data[(iid1,iid2)]
		if (iid2,iid1) in ibd_data:
			return ibd_data[(iid2,iid1)]
		return [0,0]

	def unrelated(iid1,iid2,degree,classif,mode):
		ibd1,ibd2 = get_coeff(iid1,iid2)
		pihat = (0.5*ibd1) + ibd2
		if mode == "float":
			return pihat <= classif
		if mode == "king":
			return not((iid1,iid2) in classif or (iid2,iid1) in classif)
		if mode == "fifth":
			return pihat < classif
		if mode == "lda":
			pred_deg = classif.predict([[ibd1,ibd2]])[0]
			return pred_deg in  degree_list[:degree_list.index(degree)+1]

	def init_list(degree,classif,mode):
		out_list = []
		for iid1 in fam_list:
			l = [0,iid1]
			for iid2 in fam_list:
				if iid1 == iid2:
					continue
				if unrelated(iid1,iid2,degree,classif,mode):
					l[0] += 1
			out_list.append(l)
		out_list.sort(reverse=True)
		return out_list

	def pick_new(cur_list,iid_list,degree,classif,mode):
		found = False
		for iid in iid_list:
			iid1 = iid[1]
			add = True
			for iid2 in cur_list:
				if iid1 == iid2:
					add = False
					continue
				if not unrelated(iid1,iid2,degree,classif,mode):
					add = False
					break
			if add:
				found = True
				break
		if found:
			iid_list.remove(iid)
			cur_list.append(iid1)
		return cur_list

	def make_list(degree,classif,mode,iid_list,unrelated_list):
		stop = False
		while not stop:
			cur_len = len(unrelated_list)
			unrelated_list = pick_new(unrelated_list,iid_list,degree,classif,mode)
			new_len = len(unrelated_list)
			if cur_len == new_len:
				stop = True
		return unrelated_list

	def iterate_lists(degree,classif,mode):
		unrelated_len = 0
		iid_list = init_list(degree,classif,mode)
		master_copy = iid_list[:]
		starting_iid = []
		while unrelated_len < master_copy[0][0]:
			unrelated_list = make_list(degree,classif,mode,iid_list[:],[master_copy[0][1]])
			unrelated_len = len(unrelated_list)
			starting_iid.append([unrelated_len,master_copy[0][1],unrelated_list])
			master_copy = master_copy[1:]
			if master_copy == []:
				break
			break
		starting_iid.sort(reverse=True)
		return starting_iid[0][2]

	if os.path.isfile(pairs_file):
		pairs_df = pd.read_csv(pairs_file,delim_whitespace=True)
		pairs_df = pairs_df.dropna(subset=["IBD1","IBD2","PIHAT"])
		if degree == "5th":
			mode = "fifth"
			classif = fifth_classif(pairs_df)
		else:
			mode = "lda"
			classif = lda_classif(degree,pairs_df)
	else:
		try:
			mode = "float"
			classif = float(pairs_file)
		except:
			mode = "king"
			classif = king_classif(degree)

	print("mode: %s" % mode)
	print("degree: %s" % degree)
	unrelated_list = iterate_lists(degree,classif,mode)
	print("Total unrelateds: %s" % len(unrelated_list))
	write_df = pd.DataFrame(unrelated_list,columns=["IID"])
	write_df.to_csv(path_or_buf="unrelateds.csv")
	max_ibd = []
	for iid1 in unrelated_list:
		unrelated_list = unrelated_list[1:]
		for iid2 in unrelated_list:
			ibd1,ibd2 = get_coeff(iid1,iid2)
			pihat = (0.5*ibd1) + ibd2
			max_ibd.append(pihat)
	print("Max pi hat of unrelateds: %s" % max(max_ibd))
	print('Writing to "unrelateds.csv" in the current directory. Here is a preview:')
	print(write_df.head())
find_unrelated(degree,king_file,fam_file,pairs_file)