tweaks to the alignstats reports and lumen theme (#203)

* rm unclear * deal with na and nan * rm comments * add lumen style back, with flair * rm the is.nan() call b/c is.na() apparently treats NaN as NA. shrug
pdimens · Feb 13, 2025 · ae8b913 · ae8b913
1 parent 94b395e
commit ae8b913
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 94 deletions.
diff --git a/harpy/reports/_quarto.yml b/harpy/reports/_quarto.yml
@@ -1,20 +1,26 @@
 brand:
-  logo: 
-    medium: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_report.png
+  logo:
+    medium: https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.svg
 execute:
     echo: false
     warning: false
 embed-resources: true
-format: 
+format:
   dashboard:
-    theme: materia
+    theme: 
+      - lumen
+      - harpy.scss
     scrolling: true
     expandable: false
-    logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_report.png
+    logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.svg
     nav-buttons:
       - icon: book-half
         href: https://pdimens.github.io/harpy
       - icon: bug
         href: https://github.com/pdimens/harpy/issues/new/choose
       - icon: code-slash
-        href: https://github.com/pdimens/harpy
+        href: https://github.com/pdimens/harpy
+    include-in-header:
+      text: |
+        <link rel="shortcut icon" href="https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.svg" />
+        <link rel="icon" type="image/x-icon" href="https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.svg">
diff --git a/harpy/reports/align_bxstats.qmd b/harpy/reports/align_bxstats.qmd
@@ -32,81 +32,58 @@ std_error <- function(x){
 process_input <- function(infile){
   samplename <- gsub(".bxstats.gz", "", basename(infile))
   tb <- read.table(infile, header = T, sep = "\t") %>% select(-4, -5)
-  if(nrow(tb) == 0){
-    return(
-      data.frame(
-        sample = samplename,
-        totalreads = 0,
-        totaluniquemol = 0,
-        singletons = 0,
-        nonsingletons = 0,
-        totaluniquebx = 0,
-        molecule2bxratio = 0,
-        totalvalidreads = 0,
-        totalinvalidreads = 0,
-        totalvalidpercent = 0,
-        averagereadspermol = 0,
-        sereadspermol = 0,
-        averagemolcov = 0,
-        semolcov = 0,
-        averagemolcovbp = 0,
-        semolcovbp = 0,
-        N50 = 0,
-        N75 = 0,
-        N90 = 0
-      )
-    )
-  } else {
-    tb$valid <- tb$molecule
-    tb[tb$valid != "invalidBX", "valid"] <- "validBX"
-    tb$valid <- gsub("BX", " BX", tb$valid)
-    # isolate non-singletons b/c molecules with 1 read pair aren't linked reads
-    multiplex_df <- filter(tb, valid == "valid BX", reads >= 2)
-    singletons <- sum(tb$reads < 2 & tb$valid == "valid BX")
-    tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+")
-    tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer()
-    tot_mol <- sum(tb$valid == "valid BX")
-    tot_valid_reads <- sum(tb[tb$valid == "valid BX", "reads"])
-    avg_reads_per_mol <- round(mean(multiplex_df$reads),1)
-    sem_reads_per_mol <- round(std_error(multiplex_df$reads), 2)
-    tot_invalid_reads <- sum(tb[tb$valid == "invalid BX", "reads"])
-    avg_mol_cov <- round(mean(multiplex_df$coverage_inserts), 2)
-    sem_mol_cov <- round(std_error(multiplex_df$coverage_inserts), 4)
-    avg_mol_cov_bp <- round(mean(multiplex_df$coverage_bp), 2)
-    sem_mol_cov_bp<- round(std_error(multiplex_df$coverage_bp), 4)
-    n50 <- NX(multiplex_df$length_inferred, 50)
-    n75 <- NX(multiplex_df$length_inferred, 75)
-    n90 <- NX(multiplex_df$length_inferred, 90)
-    return(
-      data.frame(
-        sample = samplename,
-        totalreads = tot_valid_reads + tot_invalid_reads,
-        totaluniquemol = tot_mol,
-        singletons = singletons,
-        nonsingletons = nrow(multiplex_df),
-        totaluniquebx = tot_uniq_bx,
-        molecule2bxratio = round(tot_mol / tot_uniq_bx,2),
-        totalvalidreads = tot_valid_reads,
-        totalinvalidreads = tot_invalid_reads,
-        totalvalidpercent = round(tot_valid_reads/(tot_valid_reads + tot_invalid_reads) * 100,2),
-        averagereadspermol = avg_reads_per_mol,
-        sereadspermol = sem_reads_per_mol,
-        averagemolcov = avg_mol_cov,
-        semolcov = sem_mol_cov,
-        averagemolcovbp = avg_mol_cov_bp,
-        semolcovbp = sem_mol_cov_bp,
-        N50 = n50,
-        N75 = n75,
-        N90 = n90
-      )
-    )
-  }
+  tb$valid <- tb$molecule
+  tb[tb$valid != "invalidBX", "valid"] <- "validBX"
+  tb$valid <- gsub("BX", " BX", tb$valid)
+  # isolate non-singletons b/c molecules with 1 read pair aren't linked reads
+  multiplex_df <- filter(tb, valid == "valid BX", reads >= 2)
+  singletons <- sum(tb$reads < 2 & tb$valid == "valid BX")
+  tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+")
+  tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer()
+  tot_mol <- sum(tb$valid == "valid BX")
+  tot_valid_reads <- sum(tb[tb$valid == "valid BX", "reads"])
+  avg_reads_per_mol <- round(mean(multiplex_df$reads),1)
+  sem_reads_per_mol <- round(std_error(multiplex_df$reads), 2)
+  tot_invalid_reads <- sum(tb[tb$valid == "invalid BX", "reads"])
+  avg_mol_cov <- round(mean(multiplex_df$coverage_inserts), 2)
+  sem_mol_cov <- round(std_error(multiplex_df$coverage_inserts), 4)
+  avg_mol_cov_bp <- round(mean(multiplex_df$coverage_bp), 2)
+  sem_mol_cov_bp<- round(std_error(multiplex_df$coverage_bp), 4)
+  n50 <- NX(multiplex_df$length_inferred, 50)
+  n75 <- NX(multiplex_df$length_inferred, 75)
+  n90 <- NX(multiplex_df$length_inferred, 90)
+  datarow <- data.frame(
+    sample = samplename,
+    totalreads = tot_valid_reads + tot_invalid_reads,
+    totaluniquemol = tot_mol,
+    singletons = singletons,
+    nonsingletons = nrow(multiplex_df),
+    totaluniquebx = tot_uniq_bx,
+    molecule2bxratio = round(tot_mol / tot_uniq_bx,2),
+    totalvalidreads = tot_valid_reads,
+    totalinvalidreads = tot_invalid_reads,
+    totalvalidpercent = round(tot_valid_reads/(tot_valid_reads + tot_invalid_reads) * 100,2),
+    averagereadspermol = avg_reads_per_mol,
+    sereadspermol = sem_reads_per_mol,
+    averagemolcov = avg_mol_cov,
+    semolcov = sem_mol_cov,
+    averagemolcovbp = avg_mol_cov_bp,
+    semolcovbp = sem_mol_cov_bp,
+    N50 = n50,
+    N75 = n75,
+    N90 = n90
+  )
+  #datarow[is.na(datarow)] <- 0
+  #datarow[is.nan(datarow)] <- 0
+  return(datarow)
 }
+
 ```
 
 ```{r setup_df}
 infiles <- list.files(params$indir, "bxstats.gz", full.names = TRUE)
 aggregate_df <- Reduce(rbind, Map(process_input, infiles))
+aggregate_df[is.na(aggregate_df)] <- 0
 
 if(nrow(aggregate_df) == 0){
   print("All input files were empty")

diff --git a/harpy/reports/align_stats.qmd b/harpy/reports/align_stats.qmd
@@ -26,9 +26,6 @@ using("dplyr","highcharter","DT","BioCircos")
 ```
 
 ```{r import_file, results = F}
-#infile <- "~/test.gz"
-#samplename <- "test sample"
-#sm_moldist <- 50000
 infile <- params$bxstats
 samplename <- params$sample
 sm_moldist <- params$mol_dist

diff --git a/harpy/reports/harpy.scss b/harpy/reports/harpy.scss
@@ -0,0 +1,17 @@
+/*-- scss:defaults --*/
+$navbar-bg:     #f4f4f4;
+$navbar-fg:     #3d3d3d;
+
+// Fonts
+
+$font-family-sans-serif:    Roboto, -apple-system, BlinkMacSystemFont, "Segoe UI", "Helvetica Neue", Arial, sans-serif !default;
+$font-size-root:            1rem !default;
+$font-weight-base:          400 !default;
+
+/*-- scss:rules --*/
+// Variables
+
+$web-font-path: "https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" !default;
+@if $web-font-path {
+  @import url($web-font-path);
+}
diff --git a/harpy/scripts/demultiplex_gen1.py b/harpy/scripts/demultiplex_gen1.py
@@ -109,7 +109,6 @@ def get_read_codes(index_read, left_segment, right_segment):
 R1_output = {sample: open(f"{outdir}/{sample}.{part}.R1.fq", 'w') for sample in samples}
 R2_output = {sample: open(f"{outdir}/{sample}.{part}.R2.fq", 'w') for sample in samples}
 segments = {'A':'', 'B':'', 'C':'', 'D':''}
-unclear_read_map={}
 clear_read_map={}
 with (
     pysam.FastxFile(r1) as R1,
@@ -136,22 +135,18 @@ def get_read_codes(index_read, left_segment, right_segment):
         R2_output[sample_name].write(f"{r2_rec}\n")
 
         if "unclear" in statuses:
-            if BX_code in unclear_read_map:
-                unclear_read_map[BX_code] += 1
+            continue
+        if "corrected" in statuses:
+            if  BX_code in clear_read_map:
+                clear_read_map[BX_code][1] += 1
             else:
-                unclear_read_map[BX_code] = 1
-        else:        
-            if "corrected" in statuses:
+                clear_read_map[BX_code] = [0,1] 
+        else:
+            if all(status == "found" for status in statuses):
                 if  BX_code in clear_read_map:
-                    clear_read_map[BX_code][1] += 1
+                    clear_read_map[BX_code][0] += 1
                 else:
-                    clear_read_map[BX_code] = [0,1] 
-            else:
-                if all(status == "found" for status in statuses):
-                    if  BX_code in clear_read_map:
-                        clear_read_map[BX_code][0] += 1
-                    else:
-                        clear_read_map[BX_code] = [1,0]          
+                    clear_read_map[BX_code] = [1,0]          
 
     for sample_name in samples:
         R1_output[sample_name].close()
@@ -160,6 +155,4 @@ def get_read_codes(index_read, left_segment, right_segment):
     BC_log.write("Barcode\tTotal_Reads\tCorrect_Reads\tCorrected_Reads\n")
     for code in clear_read_map:
         BC_log.write(f"{code}\t{sum(clear_read_map[code])}\t{clear_read_map[code][0]}\t{clear_read_map[code][1]}\n")
-    for code in unclear_read_map:
-        BC_log.write(f"{code}\t{unclear_read_map[code]}\t0\t0\n")
 f.close()