Add documentation

yufongpeng · Jun 30, 2022 · 32b8118 · 32b8118
1 parent 1879c40
commit 32b8118
Show file tree

Hide file tree

Showing 8 changed files with 125 additions and 53 deletions.
diff --git a/README.md b/README.md
@@ -3,10 +3,10 @@
 [![Build Status](https://github.com/yufongpeng/PeptideSeq.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/yufongpeng/PeptideSeq.jl/actions/workflows/CI.yml?query=branch%3Amain)
 [![Coverage](https://codecov.io/gh/yufongpeng/PeptideSeq.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/yufongpeng/PeptideSeq.jl)
 
-*PeptideSeq.jl* is a julia package for predicting digested peptide sequence, adding modification and generating expected fragments in mass spectrometry. 
+*PeptideSeq.jl* is a Julia package for predicting peptide sequences after digestion, adding modifications, and generating expected fragments in mass spectrometry. Digestion enzymes and modifications can be customized.
 
 ## Installation
-This package is not yet registered. Insall it through github:
+This package is not registered yet. Install it through GitHub:
 ```julia
 julia> using Pkg; Pkg.add(url="https://github.com/yufongpeng/PeptideSeq.jl")
 ```

diff --git a/src/config/ENZYME.tsv → config_example/ENZYME.tsv b/src/config/ENZYME.tsv → config_example/ENZYME.tsv
diff --git a/src/config/MODIFICATION.tsv → config_example/MODIFICATION.tsv b/src/config/MODIFICATION.tsv → config_example/MODIFICATION.tsv
@@ -1,2 +1,2 @@
 Modification	Accurate Mass	Avearge Mass	Sites		
-3NPH	135.043262	135.043262	D	E	$
+3NPH	135.043262	135.12472	D	E	$
diff --git a/src/PeptideSeq.jl b/src/PeptideSeq.jl
@@ -1,8 +1,26 @@
 module PeptideSeq
 using IterTools,  PrettyTables
 
-export Protein, Peptide, Fragments, digest!, modify!, ionize!, fragmentation
+export Protein, Peptide, Fragments, 
 
+    digest!, modify!, ionize!, fragmentation,
+
+    add_enzyme!, add_modification!,
+
+    MODIFICATION_SITE, ENZYME, CONFIG
+
+"""
+    Peptide
+
+Representation of a peptide digested from a `Protein`.
+# Fields
+* `origin`: Original protein sequence
+* `position`: Position of this peptide, counting from the N-terminus
+* `mass`: Monoisotopic or average mass depending on `CONFIG["ACCURATE"]`
+* `adduct`: Adduct of the ionized peptide. When the peptide is not ionized, it is "[M]".
+* `miss_cleavage`: Number of missed cleavages allowed
+* `modification`: Modification on the peptide
+"""
 mutable struct Peptide
     origin::AbstractString
     position::UnitRange{Int}
@@ -18,6 +36,17 @@ struct Fragments
     fragments::NamedTuple{(:type, :mass), Tuple{Vector{String}, Vector{Float64}}}
 end
 
+"""
+    Protein
+    Protein(sequence, enzyme = "", modification = Dict{String, Vector{Int}}())
+
+Representation of a protein.
+# Fields
+* `origin`: Protein sequence
+* `peptides`: A vector of `Peptide` digested with `enzyme`
+* `modification`: Modification on the protein
+* `enzyme`: An enzyme for digesting the protein
+"""
 mutable struct Protein
     origin::AbstractString
     peptides::Vector{Peptide}
@@ -34,5 +63,33 @@ include("io.jl")
 include("preparation.jl")
 include("ms_spec.jl")
 
+"""
+    add_enzyme!(source)
+
+Add custom enzymes. The `source` must be a TSV file. The first row is the header (can be empty), the first column is the name of the enzyme, and the remaining columns are regular expressions for the cleavage sites.
+If multiple regular expressions are given, the cleavage sites will be the union of the sites matched by each expression.
+See the example file "config_example/ENZYME.tsv".
+"""
+function add_enzyme!(source)
+    for e in CSV.Rows(source, delim = "\t")
+        push!(ENZYME,  e[1] => [eval(Meta.parse(r)) for r in getindex.(Ref(e), 2:length(e)) if !ismissing(r)])
+    end
+end
+
+"""
+    add_modification!(source)
+
+Add custom modifications. The `source` must be a TSV file. The first row is the header (can be empty), the first column is the name of the modification, the second and third columns are the additional monoisotopic mass and average mass, respectively, and the remaining columns are the modification sites.
+^ represents the N-terminus and \$ represents the C-terminus.
+See the example file "config_example/MODIFICATION.tsv".
+"""
+function add_modification!(source)
+    for m in CSV.Rows(source, delim = "\t")
+        push!(MODIFICATION_MS[1], m[1] => parse(Float64, m[2]))
+        push!(MODIFICATION_MS[2], m[1] => parse(Float64, m[3]))
+        push!(MODIFICATION_SITE, m[1] => [loc for loc in getindex.(Ref(m), 4:length(m)) if !ismissing(loc)])
+    end
+end
+
 
 end
diff --git a/src/config.jl b/src/config.jl
@@ -66,27 +66,13 @@ const ADD_MS = (
     )  
 )
 
-const MODIFICATION_SITE = Dict{String, Vector{String}}()
+const MODIFICATION_SITE = Dict{String, Vector{String}}("3NPH" => ["D", "E", "\$"])
 
 const MODIFICATION_MS = (
-    Dict{String, Float64}(),
-    Dict{String, Float64}()
+    Dict{String, Float64}("3NPH" => 135.043262),
+    Dict{String, Float64}("3NPH" => 135.12472)
 )
 
-
-for m in CSV.Rows("src/config/MODIFICATION.tsv", delim = "\t")
-    push!(MODIFICATION_MS[1], m[1] => parse(Float64, m[2]))
-    push!(MODIFICATION_MS[2], m[1] => parse(Float64, m[3]))
-    push!(MODIFICATION_SITE, m[1] => [loc for loc in getindex.(Ref(m), 4:length(m)) if !ismissing(loc)])
-end
-
-const ENZYME = Dict{String, Vector{Regex}}()
-
-for e in CSV.Rows("src/config/ENZYME.tsv", delim = "\t")
-    push!(ENZYME,  e[1] => [eval(Meta.parse(r)) for r in getindex.(Ref(e), 2:length(e)) if !ismissing(r)])
-end
-
-#=
 const ENZYME = Dict(
     "Trypsin" => [r"[K, R][^P]"],                           # R/KX, X≠P
     "Trypsin(low specificity)" =>  [r"[K, R]."],            # R/KX
@@ -110,7 +96,7 @@ const ENZYME = Dict(
     "Trypsin/Chymotrypsin" => [r"[F, Y, W, L, X, K, R][^P]"],
     "MAFA" => [r"D."]  # DX
 )
-=#
+
 const ADDUCT_FN = (
     Dict(
         "[M]" => identity,

diff --git a/src/ms_spec.jl b/src/ms_spec.jl
@@ -1,5 +1,11 @@
 # Function related mass spectrometry
 
+"""
+    ionize!(protein, adducts...)
+
+Ionize the peptides as the `adducts`.
+Available adducts are "[M]", "[M+H]+", "[M+2H]2+", "[M-H]-", "[M-2H]2-".
+"""
 function ionize!(protein::Protein, adducts::String...)
     if CONFIG["ACCURATE"]
         adduct_fn = first(ADDUCT_FN)
@@ -25,6 +31,13 @@ end
 # y: +H             => +H2O             / peptide - b
 # z: -NH2           => -NH3 + H2O       / peptide - c
 
+"""
+    fragmentation(peptide; ion_type = [:b, :y], charge_state = :auto)
+
+Fragmentation of a peptide.
+`ion_type` is the type of fragments. It can be a vector containing `:a`, `:b`, `:c`, `:x`, `:y`, `:z`. The default is `[:b, :y]`, which are the major fragments in CID or HCD.
+`charge_state` determines the number of charges on the fragments. It can be a vector containing integers or a symbol. The default is `:auto`, which means that doubly charged fragments will be included for fragments containing more than 5 amino acids.
+"""
 function fragmentation(peptide::Peptide; ion_type = [:b, :y], charge_state = :auto)
     if CONFIG["ACCURATE"]
         aa_ms = first(AA_MS)
@@ -80,10 +93,10 @@ function _charge_fragments(neutral_fragments::Dict{Symbol, Vector{Float64}}, cha
     id = 1
     for ion in ion_type 
         for charge in charge_state 
+            charge = charge == 1 ? "" : "$charge"
+            adduct = "[M$(ion_mode)$(charge)H]$(charge)$(ion_mode)"
+            charge = UPPER_INDEX[charge * ion_mode]
             for (i, v) in enumerate(neutral_fragments[ion])
-                charge = charge == 1 ? "" : "$charge"
-                adduct = "[M$(ion_mode)$(charge)H]$(charge)$(ion_mode)"
-                charge = UPPER_INDEX[charge * ion_mode]
                 type[id] = String(ion) * Char(8320 + i) * charge
                 mass[id] = adduct_fn[adduct](v)
                 id += 1

diff --git a/src/preparation.jl b/src/preparation.jl
@@ -1,5 +1,39 @@
 # Digestion and modification
 
+"""
+    modify!(protein, modification...)
+
+Modify an intact protein or digested protein.
+Modification must be a key of `MODIFICATION_SITE` or it will be ignored.
+See object `MODIFICATION_SITE` for available modifications.
+Currently, "3NPH" is supported.
+"""
+function modify!(protein::Protein, modification::String...)
+    # If digestion had been done, add mass to each peptides
+    if CONFIG["ACCURATE"]
+        modification_ms = first(MODIFICATION_MS)
+    else
+        modification_ms = last(MODIFICATION_MS)
+    end
+    isempty(protein.peptides) || return _modify_mass!(protein, modification...)
+
+    for k in modification
+        haskey(modification_ms, k) || continue
+        locs = Int[]
+        for loc in modification_ms[k]
+            if loc == "^"
+                push!(locs, 1)
+            elseif loc == "\$"
+                push!(locs, length(protein.origin))
+            else
+                append!(locs, findall(==(first(loc)), protein.origin))
+            end
+        end
+        protein.modification[k] = locs
+    end
+    protein
+end
+
 function modify!(protein::Protein, modification::Dict{String, Vector{Int}})
     for (k, v) in modification
         protein.modification[k] = sort!(union!(get(protein.modification, k, Int[]), v))
@@ -46,32 +80,6 @@ function _modify_mass!(protein::Protein, modification::String...)
     protein
 end
 
-function modify!(protein::Protein, modification::String...)
-    # If digestion had been done, add mass to each peptides
-    if CONFIG["ACCURATE"]
-        modification_ms = first(MODIFICATION_MS)
-    else
-        modification_ms = last(MODIFICATION_MS)
-    end
-    isempty(protein.peptides) || return _modify_mass!(protein, modification...)
-
-    for k in modification
-        haskey(modification_ms, k) || continue
-        locs = Int[]
-        for loc in modification_ms[k]
-            if loc == "^"
-                push!(locs, 1)
-            elseif loc == "\$"
-                push!(locs, length(protein.origin))
-            else
-                append!(locs, findall(==(first(loc)), protein.origin))
-            end
-        end
-        protein.modification[k] = locs
-    end
-    protein
-end
-
 function full_digestion(sequence::AbstractString, enzyme::String)
     regex = ENZYME[enzyme]
     cleavage_sites = mapreduce(union, regex) do rs
@@ -97,6 +105,14 @@ function merge_modification(modifications)
     new
 end
 
+"""
+    digest!(protein, n_miss, enzyme = "")
+
+Digest a protein with an enzyme.
+`n_miss` is the number of missed cleavages allowed.
+If `protein.enzyme` is an empty string, `enzyme` must be provided.
+See object `ENZYME` for available enzymes.
+"""
 function digest!(protein::Protein, n_miss::Int, enzyme::String = "")
     if CONFIG["ACCURATE"]
         aa_ms = first(AA_MS)

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -13,6 +13,6 @@ using Test
     ionize!(p2, "[M+H]+", "[M+2H]2+")
     fragmentation(p1.peptides[1])
     fragmentation(p2.peptides[1])
-    fragmentation(p1.peptides[2])
-    fragmentation(p2.peptides[2])
+    fragmentation(p1.peptides[2]; charge_state = [1])
+    fragmentation(p2.peptides[2]; charge_state = [1, 2])
 end