Skip to content

Commit

Permalink
core processes
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Kofman committed Feb 3, 2025
1 parent 7a5cac3 commit 8e182d8
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 16 deletions.
17 changes: 9 additions & 8 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def get_suffix_pairs_from_bam_filepath(bam_filepaths):
return suffix_pairs, suffix_pair_to_bam_filepath


-def prepare_combinations_for_split(df, bam_filepaths, output_folder, output_suffix, n_processes=4):
+def prepare_combinations_for_split(df, bam_filepaths, output_folder, output_suffix, processes=4):
"""
Prepares the chromosome-suffix combinations for multiprocessing.
For each position in a given barcode, we want to look at the coverage at that
Expand All @@ -105,7 +105,7 @@ def prepare_combinations_for_split(df, bam_filepaths, output_folder, output_suff
bam_filepaths (list): List of processed BAM files to incorporate
output_folder (str): Path to the output folder for split BED files.
output_suffix (str): Suffix for output files.
-n_processes (int): Number of processes for multiprocessing.
+processes (int): Number of processes for multiprocessing.
Returns:
list: List of tuples for processing.
Expand Down Expand Up @@ -138,8 +138,8 @@ def prepare_combinations_for_split(df, bam_filepaths, output_folder, output_suff
barcode_finding_tasks.append([chrom, prefix, suffix, bam_filepath, unique_positions, output_folder, output_suffix])

# Get unique barcodes contained in each bam, using a multiprocessing pool for maximal core usage efficiency
-print(f"Starting multiprocessing to figure out unique barcodes per bam, with {n_processes} processes...")
-with Pool(n_processes) as pool:
+print(f"Starting multiprocessing to figure out unique barcodes per bam, with {processes} processes...")
+with Pool(processes) as pool:
list_of_unique_barcodes_per_bam = pool.map(get_unique_barcodes_for_reads_in_bamfile, barcode_finding_tasks)

# Make new task list for next step
Expand Down Expand Up @@ -261,7 +261,8 @@ def generate_and_split_bed_files_for_all_positions(output_folder, bam_filepaths,
coverage_calc_sites_bed_df,
bam_filepaths,
split_bed_folder,
-output_suffix
+output_suffix,
+processes=processes
)

print("Pivoting edits dataframe into sparse h5ad files...")
Expand All @@ -275,7 +276,7 @@ def generate_and_split_bed_files_for_all_positions(output_folder, bam_filepaths,
raise # Properly re-raise the exception

# Run the processing with multiprocessing
-with Pool(processes=processes) as pool:
+with Pool(processes=cores) as pool:
pool.map(process_combination_for_split, combinations)

print(f"\nAll split BED files generated in {output_folder}/combined_{output_suffix}_split_by_suffix\n")
Expand Down Expand Up @@ -464,8 +465,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand
output_folder,
bam_filepaths,
tabulation_bed=tabulation_bed,
-processes=cores,
-output_suffix=output_suffix
+output_suffix=output_suffix,
+processes=cores
)

make_depth_command_script_single_cell(
Expand Down
18 changes: 10 additions & 8 deletions tests/integration_tests_auto_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,13 +366,6 @@
print("\n\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n")
failures += 1

-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# Check that the coverage and edit matrices are correct
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
-print("Checking that the position X barcode coverage and edit h5ad sparse matrices are correct and contain the same information as the flat tsv final sites.")
-
-test_folder = 'singlecell_tests/only_5_cells_all_cells_coverage_test/'

def get_all_edited_positions_and_barcodes(test_folder):
name_to_obs = {}
Expand Down Expand Up @@ -422,7 +415,16 @@ def get_all_edited_positions_and_barcodes(test_folder):

return edited_pos, covered_pos, edited_obs, covered_obs, \
final_filtered_site_info, name_to_pos, name_to_obs, name_to_adata



+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Check that the coverage and edit matrices are correct
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+print("Checking that the position X barcode coverage and edit h5ad sparse matrices are correct and contain the same information as the flat tsv final sites.")
+
+test_folder = 'singlecell_tests/only_5_cells_all_cells_coverage_test/'

edited_pos, covered_pos, edited_obs, covered_obs, final_filtered_site_info,\
name_to_pos, name_to_obs, name_to_adata = get_all_edited_positions_and_barcodes(test_folder)

Expand Down

0 comments on commit 8e182d8

Please sign in to comment.