Skip to content

Commit

Permalink
plot_pileup has automatic baseshift calculation. docs updated
Browse files Browse the repository at this point in the history
  • Loading branch information
hiruna72 committed Apr 28, 2024
1 parent 9db3814 commit de85c2f
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 17 deletions.
8 changes: 7 additions & 1 deletion docs/commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,10 @@ Plot read/reference - signal alignments.
(optional) The base width when plotting with fixed base width [default value: 10].
* `--base_shift INT`
(optional) The number of bases to shift to align fist signal move [default value: 0]. More information on this can be found at [here](pore_model.md)
* `--auto`
(optional) Calculate and automatically set the base shift. [default value: false]
* `--profile STR`:<br/>
(optional) This is used to determine base shift using preset values. The available profiles can be listed using `--list_profile`
(optional) This is used to determine base shift using preset values. The available profiles can be listed using `--list_profile`. The precedence is in the order of `--base_shift < --auto < --profile`.
* `--list_profile`:<br/>
(optional) Print the available profiles and exit.
* `--plot_limit INT`
Expand Down Expand Up @@ -191,7 +193,11 @@ Plot reference - signal alignment pileups.
(optional) The base width when plotting with fixed base width [default value: 10].
* `--base_shift INT`
(optional) The number of bases to shift to align fist signal move [default value: 0]. More information on this can be found at [here](pore_model.md)
* `--auto`
(optional) Calculate and automatically set the base shift. [default value: false]
* `--profile STR`:<br/>
(optional) This is used to determine base shift using preset values. The available profiles can be listed using `--list_profile`. The precedence is in the order of `--base_shift < --auto < --profile`.
* `--profile STR`:<br/>
(optional) This is used to determine base shift using preset values. The available profiles can be listed using `--list_profile`
* `--list_profile`:<br/>
(optional) Print the available profiles and exit.
Expand Down
4 changes: 2 additions & 2 deletions src/calculate_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,10 +245,10 @@ def calculate_base_shift(moves, raw_signal, sequence, kmer_length, record_is_rev
# output_pdf.close()

if record_is_reverse or rna:
print("Only {} ditinct {}-mers were found in the sequence. The calculated base shift value ({}) might not be correct. Reduce kmer length (-k) or use a preset base shift (--profile)".format(reverse_shift, num_kmers, kmer_length))
print("Only {} ditinct {}-mers were found in the sequence. The calculated base shift value ({}) might not be correct. Reduce kmer length (-k) or increase the region interval or use a preset base shift (--profile)".format(num_kmers, kmer_length, reverse_shift))
return reverse_shift
else:
print("Only {} ditinct {}-mers were found in the sequence. The calculated base shift value ({}) might not be correct. Reduce kmer length (-k) or use a preset base shift (--profile)".format(forward_shift, num_kmers, kmer_length))
print("Only {} ditinct {}-mers were found in the sequence. The calculated base shift value ({}) might not be correct. Reduce kmer length (-k) or increase the region interval or use a preset base shift (--profile)".format(num_kmers, kmer_length, forward_shift))
return forward_shift

def run(args):
Expand Down
34 changes: 26 additions & 8 deletions src/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,7 @@ def run(args):
if args.plot_reverse:
draw_data["sig_dir"] = "<-"

# priority order --base_shift < --auto < --profile
kmer_correction = 0
if args.profile == "":
draw_data["base_shift"] = args.base_shift
Expand All @@ -635,6 +636,10 @@ def run(args):
else:
draw_data["base_shift"] = plot_utils.search_for_profile_base_shift(args.profile)[0]
kmer_correction = -1*(plot_utils.search_for_profile_base_shift(args.profile)[0] + plot_utils.search_for_profile_base_shift(args.profile)[1])
if args.auto:
kmer_correction = 0
draw_data["base_shift"] = 0
print("Info: Base shift will be automatically calculated and set.")
if args.kmer_length < abs(draw_data["base_shift"]):
print("Info: increased the kmer length to {} match with the base shift".format(abs(draw_data["base_shift"])+1))
draw_data["kmer_length"] = abs(draw_data["base_shift"]) + 1
Expand Down Expand Up @@ -665,6 +670,9 @@ def run(args):
args_ref_end = None

for paf_record in parse_paf(handle):
if args.auto:
kmer_correction = 0
draw_data["base_shift"] = 0
if paf_record.query_name != paf_record.target_name:
raise Exception("Error: this paf file is a signal to reference mapping. Please provide the argument --sig_ref ")
read_id = paf_record.query_name
Expand Down Expand Up @@ -820,10 +828,11 @@ def run(args):
sig_algn_dic['data_is_rna'] = data_is_rna
sig_algn_dic['ss'] = moves

if args.auto:
draw_data["base_shift"] = calculate_offsets.calculate_base_shift(moves, y, fasta_seq, args.kmer_length, record_is_reverse, data_is_rna)

signal_tuple, region_tuple, sig_algn_dic, fasta_seq = plot_utils.adjust_before_plotting(ref_seq_len, signal_tuple, region_tuple, sig_algn_dic, fasta_seq, draw_data)
if args.auto:
caculated_base_shift = calculate_offsets.calculate_base_shift(sig_algn_dic['ss'], signal_tuple[2], fasta_seq, args.kmer_length, record_is_reverse, data_is_rna)
draw_data["base_shift"] = caculated_base_shift
signal_tuple, sig_algn_dic, fasta_seq = plot_utils.treat_for_base_shift(draw_data, signal_tuple, sig_algn_dic, fasta_seq)

if args.fixed_width:
sig_algn_dic['tag_name'] = args.tag_name + indt + "base_shift: " + str(draw_data["base_shift"]) + indt + "scale:" + scaling_str + indt + "fixed_width: " + str(args.base_width) + indt + strand_dir + indt + "region: "
Expand Down Expand Up @@ -882,6 +891,9 @@ def run(args):
samfile = pysam.AlignmentFile(args.alignment, mode='r')

for sam_record in samfile.fetch(contig=args_ref_name, start=args_ref_start, stop=args_ref_end):
if args.auto:
kmer_correction = 0
draw_data["base_shift"] = 0
if args_ref_name is not None and args_ref_name != sam_record.reference_name:
raise Exception("Error: sam record's reference name [" + sam_record.reference_name + "] and the name specified are different [" + args_ref_name + "]")
read_id = sam_record.query_name
Expand Down Expand Up @@ -1036,10 +1048,12 @@ def run(args):
sig_algn_dic['ss'] = moves
# print(len(moves))
# print(fasta_seq)
if args.auto:
draw_data["base_shift"] = calculate_offsets.calculate_base_shift(moves, y, fasta_seq, args.kmer_length, sam_record.is_reverse, data_is_rna)

signal_tuple, region_tuple, sig_algn_dic, fasta_seq = plot_utils.adjust_before_plotting(ref_seq_len, signal_tuple, region_tuple, sig_algn_dic, fasta_seq, draw_data)
if args.auto:
caculated_base_shift = calculate_offsets.calculate_base_shift(sig_algn_dic['ss'], signal_tuple[2], fasta_seq, args.kmer_length, sam_record.is_reverse, data_is_rna)
draw_data["base_shift"] = caculated_base_shift
signal_tuple, sig_algn_dic, fasta_seq = plot_utils.treat_for_base_shift(draw_data, signal_tuple, sig_algn_dic, fasta_seq)

if args.fixed_width:
sig_algn_dic['tag_name'] = args.tag_name + indt + "base_shift: " + str(draw_data["base_shift"]) + indt + "scale:" + scaling_str + indt + "fixed_width: " + str(args.base_width) + indt + strand_dir + indt + "region: " + ref_name + ":"
Expand Down Expand Up @@ -1092,6 +1106,9 @@ def run(args):
args_ref_end = None

for paf_record in tbxfile.fetch(args_ref_name, args_ref_start, args_ref_end, parser=pysam.asTuple()):
if args.auto:
kmer_correction = 0
draw_data["base_shift"] = 0
if paf_record[READ_ID] == paf_record[SEQUENCE_ID]:
raise Exception("Error: this paf file is a signal to read mapping.")
if args_ref_name is not None and args_ref_name != paf_record[SEQUENCE_ID]:
Expand Down Expand Up @@ -1247,10 +1264,11 @@ def run(args):
sig_algn_dic['data_is_rna'] = data_is_rna
sig_algn_dic['ss'] = moves

if args.auto:
draw_data["base_shift"] = calculate_offsets.calculate_base_shift(moves, y, fasta_seq, args.kmer_length, record_is_reverse, data_is_rna)

signal_tuple, region_tuple, sig_algn_dic, fasta_seq = plot_utils.adjust_before_plotting(ref_seq_len, signal_tuple, region_tuple, sig_algn_dic, fasta_seq, draw_data)
if args.auto:
caculated_base_shift = calculate_offsets.calculate_base_shift(sig_algn_dic['ss'], signal_tuple[2], fasta_seq, args.kmer_length, record_is_reverse, data_is_rna)
draw_data["base_shift"] = caculated_base_shift
signal_tuple, sig_algn_dic, fasta_seq = plot_utils.treat_for_base_shift(draw_data, signal_tuple, sig_algn_dic, fasta_seq)

if args.fixed_width:
sig_algn_dic['tag_name'] = args.tag_name + indt + "base_shift: " + str(draw_data["base_shift"]) + indt + "scale:" + scaling_str + indt + "fixed_width: " + str(args.base_width) + indt + strand_dir + indt + "region: " + ref_name + ":"
Expand Down
42 changes: 37 additions & 5 deletions src/plot_pileup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@
import math
from src import bed_annotation
from src import plot_utils
from src import calculate_offsets
import cProfile, pstats, io
# ref_start is always 1based closed
# ref_end is always 1based closed
# start_kmer is always 0based closed
# end_kmer is always 0based open

DEFAULT_KMER_SIZE = 9
FIXED_BASE_WIDTH = 10
FIXED_INSERTION_WIDTH = 10
BASE_LIMIT = 1000
Expand Down Expand Up @@ -303,8 +305,6 @@ def run(args):
pr = cProfile.Profile()
pr.enable()



use_fasta = 0
if args.file:
print(f'sequence file: {args.file}')
Expand Down Expand Up @@ -380,18 +380,27 @@ def run(args):
draw_data["overlap_only"] = args.overlap_only
draw_data["plot_num_samples"] = args.plot_num_samples
draw_data["bed_labels"] = args.print_bed_labels
draw_data["kmer_length"] = args.kmer_length

# priority order --base_shift < --auto < --profile
kmer_correction = 0
if args.profile == "":
draw_data["base_shift"] = args.base_shift
else:
args.auto = False
if args.plot_reverse:
draw_data["base_shift"] = plot_utils.search_for_profile_base_shift(args.profile)[1]
else:
draw_data["base_shift"] = plot_utils.search_for_profile_base_shift(args.profile)[0]

kmer_correction = 0
if args.profile != "":
kmer_correction = -1*(plot_utils.search_for_profile_base_shift(args.profile)[0] + plot_utils.search_for_profile_base_shift(args.profile)[1])
prev_base_shift = 0
if args.auto:
kmer_correction = 0
draw_data["base_shift"] = 0
print("Info: Base shift will be automatically calculated and set.")
if args.kmer_length < abs(draw_data["base_shift"]):
print("Info: increased the kmer length to {} match with the base shift".format(abs(draw_data["base_shift"])+1))
draw_data["kmer_length"] = abs(draw_data["base_shift"]) + 1

sig_algn_dic = {}

Expand Down Expand Up @@ -427,6 +436,9 @@ def run(args):
# p = bed_annotation.plot_bed_annotation(p=p, ref_id=args_ref_name, bed_dic=bed_dic, sig_algn_data=sig_algn_dic, draw_data=draw_data, base_limit=base_limit)

for sam_record in samfile.fetch(contig=args_ref_name, start=args_ref_start, stop=args_ref_end):
if args.auto:
kmer_correction = 0
draw_data["base_shift"] = 0
if args_ref_name != sam_record.reference_name:
raise Exception("Error: sam record's reference name [" + sam_record.reference_name + "] and the name specified are different [" + ref_name + "]")
read_id = sam_record.query_name
Expand Down Expand Up @@ -583,6 +595,13 @@ def run(args):
# print(len(moves))
# print(fasta_seq)
signal_tuple, region_tuple, sig_algn_dic, fasta_seq = plot_utils.adjust_before_plotting(ref_seq_len, signal_tuple, region_tuple, sig_algn_dic, fasta_seq, draw_data)
if args.auto:
caculated_base_shift = calculate_offsets.calculate_base_shift(sig_algn_dic['ss'], signal_tuple[2], fasta_seq, args.kmer_length, sam_record.is_reverse, data_is_rna)
if prev_base_shift != caculated_base_shift and num_plots > 0:
raise Exception("Error: different base shift values were calculated for different reads! Please use a preset base shift using --profile")
prev_base_shift = caculated_base_shift
draw_data["base_shift"] = caculated_base_shift
signal_tuple, sig_algn_dic, fasta_seq = plot_utils.treat_for_base_shift(draw_data, signal_tuple, sig_algn_dic, fasta_seq)
# print(len(sig_algn_dic['ss']))

sig_algn_dic['tag_name'] = args.tag_name + indt + "base_shift: " + str(draw_data["base_shift"]) + indt + "scale:" + scaling_str + indt + "fixed_width: " + str(args.base_width) + indt + strand_dir + indt + "region: " + ref_name + ":"
Expand Down Expand Up @@ -646,6 +665,9 @@ def run(args):
args_ref_start = int(args_region.split(":")[1].split("-")[0])
args_ref_end = int(args_region.split(":")[1].split("-")[1])
for paf_record in tbxfile.fetch(args_ref_name, args_ref_start, args_ref_end, parser=pysam.asTuple()):
if args.auto:
kmer_correction = 0
draw_data["base_shift"] = 0
# if paf_record[READ_ID] == paf_record[SEQUENCE_ID]:
# raise Exception("Error: this paf file is a signal to read mapping.")
if args_ref_name != paf_record[SEQUENCE_ID]:
Expand Down Expand Up @@ -800,6 +822,14 @@ def run(args):
# print(fasta_seq)
signal_tuple, region_tuple, sig_algn_dic, fasta_seq = plot_utils.adjust_before_plotting(ref_seq_len, signal_tuple, region_tuple, sig_algn_dic, fasta_seq, draw_data)

if args.auto:
caculated_base_shift = calculate_offsets.calculate_base_shift(sig_algn_dic['ss'], signal_tuple[2], fasta_seq, args.kmer_length, record_is_reverse, data_is_rna)
if prev_base_shift != caculated_base_shift and num_plots > 0:
raise Exception("Error: different base shift values were calculated for different reads! Please use a preset base shift using --profile")
prev_base_shift = caculated_base_shift
draw_data["base_shift"] = caculated_base_shift
signal_tuple, sig_algn_dic, fasta_seq = plot_utils.treat_for_base_shift(draw_data, signal_tuple, sig_algn_dic, fasta_seq)

sig_algn_dic['tag_name'] = args.tag_name + indt + "base_shift: " + str(draw_data["base_shift"]) + indt + "scale:" + scaling_str + indt + "fixed_width: " + str(args.base_width) + indt + strand_dir + indt + "region: " + ref_name + ":"

# print(len(sig_algn_dic['ss']))
Expand Down Expand Up @@ -925,6 +955,7 @@ def argparser():
parser.add_argument('-s', '--slow5', required=False, type=str, default="", help="slow5 file")
parser.add_argument('--region', required=False, type=str, default="", help="[start-end] 1-based closed interval region to plot. For SAM/BAM eg: chr1:6811428-6811467 or chr1:6,811,428-6,811,467. For PAF eg:100-200.")
parser.add_argument('--tag_name', required=False, type=str, default="", help="a tag name to easily identify the plot")
parser.add_argument('-k', '--kmer_length', required=False, type=int, default=DEFAULT_KMER_SIZE, help="kmer length")
parser.add_argument('-r', '--read_id', required=False, type=str, default="", help="plot the read with read_id")
parser.add_argument('-l', '--read_list', required=False, type=str, default="", help="a file with read_ids to plot")
parser.add_argument('--base_limit', required=False, type=int, default=BASE_LIMIT, help="maximum number of bases to plot")
Expand All @@ -940,6 +971,7 @@ def argparser():
parser.add_argument('--point_size', required=False, type=int, default=0.5, help="signal point radius [0.5]")
parser.add_argument('--base_width', required=False, type=int, default=FIXED_BASE_WIDTH, help="base width when plotting with fixed base width")
parser.add_argument('--base_shift', required=False, type=int, default=PLOT_BASE_SHIFT, help="the number of bases to shift to align fist signal move")
parser.add_argument('--auto', required=False, action='store_true', help="calculate base shift automatically")
parser.add_argument('--profile', required=False, default="", type=str, help="determine base_shift using preset values")
parser.add_argument('--list_profile', action='store_true', help="list the available profiles")
parser.add_argument('--plot_limit', required=False, type=int, default=1000, help="limit the number of plots generated")
Expand Down
4 changes: 3 additions & 1 deletion src/plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ def adjust_before_plotting(ref_seq_len, signal_tuple, region_tuple, sig_algn_dat
y = signal_tuple[2][eat_signal:]
signal_tuple = (x, x_real, y)
sig_algn_data['ss'] = moves
return signal_tuple, region_tuple, sig_algn_data, fasta_seq

def treat_for_base_shift(draw_data, signal_tuple, sig_algn_data, fasta_seq):
if draw_data["base_shift"] < 0:
abs_base_shift = abs(draw_data["base_shift"])
x = signal_tuple[0]
Expand All @@ -79,8 +81,8 @@ def adjust_before_plotting(ref_seq_len, signal_tuple, region_tuple, sig_algn_dat
base_shift_seq = 'N' * draw_data['base_shift']
if draw_data["base_shift"] > 0:
fasta_seq = base_shift_seq + fasta_seq[:-1*draw_data["base_shift"]]
return signal_tuple, sig_algn_data, fasta_seq

return signal_tuple, region_tuple, sig_algn_data, fasta_seq
def create_figure(args, plot_mode):
p_defualt = None
if plot_mode == 0:
Expand Down
13 changes: 13 additions & 0 deletions test/test_plot_pileup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,19 @@ testcase_20s() {
BASE_SHIFT="--base_shift 0"
squigualiser plot_pileup ${BASE_SHIFT} --region ${REGION} -f ${FASTA} -s ${SIGNAL} -a ${ALIGNMENT} -o ${OUTPUT} --tag_name "testcase-${TESTCASE}" --plot_limit ${PLOT_LIMIT} --sig_scale ${SCALING} || die "testcase:$TESTCASE failed"

TESTCASE=20.8.1
info "testcase:$TESTCASE - reference-signal plot"
FASTA=${GENOME}
SIGNAL="${RAW_DIR}/f5c_eventalign/reads.blow5"
ALIGNMENT="${RAW_DIR}/f5c_eventalign/sorted_eventalign.paf.gz"
OUTPUT="${OUTPUT_DIR}/testcase_${TESTCASE}"
PLOT_LIMIT=10
REGION="chr1:6,811,011-6,811,198"
SCALING="znorm"
BASE_SHIFT="--base_shift 0"
squigualiser plot_pileup --auto --region ${REGION} -f ${FASTA} -s ${SIGNAL} -a ${ALIGNMENT} -o ${OUTPUT} --tag_name "testcase-${TESTCASE}" --plot_limit ${PLOT_LIMIT} --sig_scale ${SCALING} || die "testcase:$TESTCASE failed"


TESTCASE=20.9
info "testcase:$TESTCASE - reference-signal plot"
FASTA=${GENOME}
Expand Down

0 comments on commit de85c2f

Please sign in to comment.