mod score adding done in modkit

trichelab · Jul 28, 2023 · e91edfe · e91edfe
1 parent f601f2e
commit e91edfe
Show file tree

Hide file tree

Showing 6 changed files with 44 additions and 55 deletions.
diff --git a/README.md b/README.md
@@ -91,6 +91,29 @@ Convert input files (bam or txt) to bed files that can be used as input to predi
 
 It is critical that the data is correctly aligned to a reference genome, otherwise the results will be wrong. We have used the Telomere-to-telomere reference genome (CHM13v2) for all of our work, and it is the default setting for this program. We encourage you to also use it, as we have not thoroughly tested other reference genomes. If you have aligned your data to `hg38`, we encourage you to re-align (or "lift it" with [CrossMap](https://crossmap.sourceforge.net/#)) it to the T2T reference genome. If you still prefer to use `hg38`, then you can pass `--reference-genome hg38` to use the correct probe coordinates. The `hg38` coordinates have been generated via liftover, this lead to the loss of 26 out of 427680 probes.
 
+### Extract modification calls (modkit)
+
+If you have bam files that contain modifications basecalled with Guppy or Dorado, this is the recommended option. You will have to install [modkit](https://github.com/nanoporetech/modkit), the official ONT tool to extract modifications. Once installed, given a bam file you should do the following:
+
+If, besides 5mC you also have 5hmC modification calls, you should add them up via:
+```
+modkit adjust-mods --convert h m INPUT.bam OUTPUT.bam
+```
+The reasoning behind this is that both 5mC and 5hmC poorly react to bisulfite treatment [Huang et al., 2010](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2811190/), therefore 5hmC sites would appear as 5mC in a methylation microarray experiment. We trained on those kind of experiments, we therefore think that adding the two scores up reflects the most the training data. 
+
+You can then extract the 5mC scores using:
+```
+modkit extract OUTPUT.bam OUTPUT.txt
+```
+
+This `OUTPUT.txt` can then be directly used in sturgeon using:
+```
+sturgeon inputtobed -i MODKIT_OUTPUT_DIR -o OUTPUT_DIR -s modkit
+```
+
+Please note that Sturgeon will only use 5mC scores, other modification scores will be filtered out.
+Please refer to the [modkit wiki](https://nanoporetech.github.io/modkit/quick_start.html) in case these commands change.
+
 ### Alignment bam files (Guppy)
 
 Convert a bam that contains methylation calls into the adequate format (.bed) so that it can be used for prediction.

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend="hatchling.build"
 
 [project]
 name = "sturgeon"
-version = "0.4.1"
+version = "0.4.2"
 authors = [
     { name="Marc Pagès-Gallego" },
     { name="Carlo Vermeulen" },

diff --git a/sturgeon/__init__.py b/sturgeon/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.4.1"
+__version__ = "0.4.2"
 
 import sturgeon.callmapping
diff --git a/sturgeon/callmapping.py b/sturgeon/callmapping.py
@@ -479,7 +479,6 @@ def modkit_file_to_bed(
     neg_threshold: Optional[float] = 0.3,
     pos_threshold: Optional[float] = 0.7,
     fivemc_code:str = 'm',
-    fivehmc_code: str = 'h',
 ) -> pd.DataFrame:
 
     mandatory_columns = [
@@ -499,62 +498,22 @@ def modkit_file_to_bed(
         usecols= mandatory_columns
     )
 
-    # if 5hmc is present, merge 5mc and 5hmc scores
-    present_mods = modkit_df['mod_code'].unique()
+    modkit_df = modkit_df[modkit_df['mod_code'] == fivemc_code]
+    modkit_df = modkit_df[modkit_df['canonical_base'] == 'C']
+    modkit_df = modkit_df[modkit_df['modified_primary_base'] == 'C']
 
-    try:
-        assert fivemc_code in present_mods
-    except AssertionError:
-        logging.error('5mC code: {}. Not found in modkit file'.format(fivemc_code))
-        return None
-
-    if len(modkit_df['mod_code'].unique()) > 1 and fivehmc_code in present_mods:
-
-        modkit_df_m = modkit_df[modkit_df['mod_code'] == fivemc_code].reset_index(inplace=False, drop=True)
-        modkit_df_h = modkit_df[modkit_df['mod_code'] == fivehmc_code].reset_index(inplace=False, drop=True)
-
-        try:
-            assert len(modkit_df_m) == len(modkit_df_h)
-        except AssertionError:
-            logging.error('Expected same number of 5mC ({}) and 5hmC ({})calls.'.format(len(modkit_df_m), len(modkit_df_h)))
-            return None
-
-        try:
-            assert np.all(modkit_df_m['read_id'].astype(str) == modkit_df_h['read_id'].astype(str))
-        except AssertionError:
-            logging.error('Read ids between 5mC and 5hmC calls do not match. Maybe incomplete modkit file?')
-            return None
-
-        try:
-            assert np.all(modkit_df_m['ref_position'].astype(str) == modkit_df_h['ref_position'].astype(str))
-        except AssertionError:
-            logging.error('Mapping positions between 5mC and 5hmC calls do not match. Maybe incomplete modkit file?')
-            return None
-
-        try:
-            assert np.all(modkit_df_m['chrom'].astype(str) == modkit_df_h['chrom'].astype(str))
-        except AssertionError:
-            logging.error('Chromosome mappings between 5mC and 5hmC calls do not match. Maybe incomplete modkit file?')
-            return None
-
-        modkit_df_m['mod_qual'] = modkit_df_m['mod_qual'] + modkit_df_h['mod_qual']
-        del modkit_df_h
-
-    else:
-        modkit_df_m = modkit_df
-
-    modkit_df_m = modkit_df_m.rename(columns={
+    modkit_df = modkit_df.rename(columns={
         'chrom': 'chr',
         'ref_position': 'reference_pos',
         'mod_qual': 'score'
     })
-    modkit_df_m = modkit_df_m.drop(columns=[
+    modkit_df = modkit_df.drop(columns=[
         'mod_code',
         'canonical_base',
         'modified_primary_base'
     ])
-    modkit_df_m = modkit_df_m[modkit_df_m['reference_pos'] != -1]
-    modkit_df_m = modkit_df_m[modkit_df_m['chr'] != '.']
+    modkit_df = modkit_df[modkit_df['reference_pos'] != -1]
+    modkit_df = modkit_df[modkit_df['chr'] != '.']
 
     probes_df = read_probes_file(probes_file)
 
@@ -572,7 +531,7 @@ def modkit_file_to_bed(
 
         calls_per_probe_chr =  map_methyl_calls_to_probes_chr(
             probes_df =  probes_methyl_df[probes_methyl_df['chr'] == chrom.item()],
-            methyl_calls_per_read = modkit_df_m[modkit_df_m['chr'] == 'chr'+str(chrom.item())],
+            methyl_calls_per_read = modkit_df[modkit_df['chr'] == 'chr'+str(chrom.item())],
             margin = margin, 
             neg_threshold = neg_threshold,
             pos_threshold = pos_threshold,
@@ -598,7 +557,6 @@ def modkit_path_to_bed(
     neg_threshold: Optional[float] = 0.3,
     pos_threshold: Optional[float] = 0.7,
     fivemc_code:str = 'c',
-    fivehmc_code: str = 'h',
 ):
 
     output_files = list()
@@ -633,7 +591,6 @@ def modkit_path_to_bed(
             neg_threshold = neg_threshold,
             pos_threshold = pos_threshold,
             fivemc_code = fivemc_code,
-            fivehmc_code = fivehmc_code,
         )
 
         if calls_per_probe is None:

diff --git a/sturgeon/cli/inputtobed.py b/sturgeon/cli/inputtobed.py
@@ -19,6 +19,7 @@ def filetobed(
     margin: Optional[int] = 25,
     neg_threshold: Optional[float] = 0.3,
     pos_threshold: Optional[float] = 0.7,
+    fivemc_code: str = 'm',
 ):
 
     logging.info("Sturgeon start up")
@@ -96,6 +97,7 @@ def filetobed(
             margin = margin,
             neg_threshold = neg_threshold,
             pos_threshold = pos_threshold,
+            fivemc_code = fivemc_code,
         )
 
 
@@ -221,7 +223,6 @@ def modkittobed(
     neg_threshold: Optional[float] = 0.3,
     pos_threshold: Optional[float] = 0.7,
     fivemc_code: str = 'm',
-    fivehmc_code: str = 'h',
 ):
 
     logging.info("Modkit to bed program")
@@ -265,7 +266,6 @@ def modkittobed(
         neg_threshold = neg_threshold,
         pos_threshold = pos_threshold,
         fivemc_code = fivemc_code,
-        fivehmc_code = fivehmc_code,
     )
 
 

diff --git a/sturgeon/parsers.py b/sturgeon/parsers.py
@@ -243,6 +243,14 @@ def register_inputtobed(parser):
         Positions with scores above this threshold will be considered methylated
         '''
     )
+    subparser.add_argument(
+        '--fivemc-code',
+        type=str,
+        default = 'm',
+        help='''
+        Onle letter code used to annotate 5mC in modkit files
+        '''
+    )
 
     subparser.set_defaults(func=run_inputtobed)
 
@@ -259,6 +267,7 @@ def run_inputtobed(args):
         margin = args.margin,
         neg_threshold = args.neg_threshold,
         pos_threshold = args.pos_threshold,
+        fivemc_code = args.fivemc_code,
     )
 
 def register_models(parser):