From 32b8118c4d3f7ba1816ddd25db83fb07b5e92ea3 Mon Sep 17 00:00:00 2001
From: yufongpeng <54415349+yufongpeng@users.noreply.github.com>
Date: Thu, 30 Jun 2022 22:54:18 +0800
Subject: [PATCH] Add documentation

---
 README.md                                     |  4 +-
 {src/config => config_example}/ENZYME.tsv     |  0
 .../MODIFICATION.tsv                          |  2 +-
 src/PeptideSeq.jl                             | 59 +++++++++++++++-
 src/config.jl                                 | 22 ++----
 src/ms_spec.jl                                | 19 +++++-
 src/preparation.jl                            | 68 ++++++++++++-------
 test/runtests.jl                              |  4 +-
 8 files changed, 125 insertions(+), 53 deletions(-)
 rename {src/config => config_example}/ENZYME.tsv (100%)
 rename {src/config => config_example}/MODIFICATION.tsv (59%)

diff --git a/README.md b/README.md
index 561b182..76c73e4 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,10 @@
 [![Build Status](https://github.com/yufongpeng/PeptideSeq.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/yufongpeng/PeptideSeq.jl/actions/workflows/CI.yml?query=branch%3Amain)
 [![Coverage](https://codecov.io/gh/yufongpeng/PeptideSeq.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/yufongpeng/PeptideSeq.jl)
 
-*PeptideSeq.jl* is a julia package for predicting digested peptide sequence, adding modification and generating expected fragments in mass spectrometry. 
+*PeptideSeq.jl* is a Julia package for predicting peptide sequences after digestion, adding modifications, and generating expected fragments in mass spectrometry. Digestion enzymes and modifications can be customized.
 
 ## Installation
-This package is not yet registered. Insall it through github:
+This package is not registered yet. Install it through GitHub:
 ```julia
 julia> using Pkg; Pkg.add("https://github.com/yufongpeng/PeptideSeq.jl")
 ```
diff --git a/src/config/ENZYME.tsv b/config_example/ENZYME.tsv
similarity index 100%
rename from src/config/ENZYME.tsv
rename to config_example/ENZYME.tsv
diff --git a/src/config/MODIFICATION.tsv b/config_example/MODIFICATION.tsv
similarity index 59%
rename from src/config/MODIFICATION.tsv
rename to config_example/MODIFICATION.tsv
index b17810b..cf144a5 100644
--- a/src/config/MODIFICATION.tsv
+++ b/config_example/MODIFICATION.tsv
@@ -1,2 +1,2 @@
 Modification	Accurate Mass	Avearge Mass	Sites		
-3NPH	135.043262	135.043262	D	E	$
+3NPH	135.043262	135.12472	D	E	$
diff --git a/src/PeptideSeq.jl b/src/PeptideSeq.jl
index 6767f89..83e1236 100644
--- a/src/PeptideSeq.jl
+++ b/src/PeptideSeq.jl
@@ -1,8 +1,26 @@
 module PeptideSeq
 using IterTools,  PrettyTables
 
-export Protein, Peptide, Fragments, digest!, modify!, ionize!, fragmentation
+export Protein, Peptide, Fragments, 
 
+    digest!, modify!, ionize!, fragmentation,
+
+    add_enzyme!, add_modification!,
+
+    MODIFICATION_SITE, ENZYME, CONFIG
+
+"""
+    Peptide
+
+Representation of a peptide digested from a `Protein`.
+# Fields
+* `origin`: Original protein sequence
+* `position`: Position of this peptide counting from the N-terminal
+* `mass`: Monoisotopic or average mass depending on `CONFIG["ACCURATE"]`
+* `adduct`: Adduct of the ionized peptide. When the peptide is not ionized, it is "[M]".
+* `miss_cleavage`: Number of missed cleavages allowed
+* `modification`: Modification on the peptide
+"""
 mutable struct Peptide
     origin::AbstractString
     position::UnitRange{Int}
@@ -18,6 +36,17 @@ struct Fragments
     fragments::NamedTuple{(:type, :mass), Tuple{Vector{String}, Vector{Float64}}}
 end
 
+"""
+    Protein
+    Protein(sequence, enzyme = "", modification = Dict{String, Vector{Int}}())
+
+Representation of a protein.
+# Fields
+* `origin`: Protein sequence
+* `peptides`: a vector of `Peptide` digested with `enzyme`
+* `modification`: modification on the protein
+* `enzyme`: an enzyme for digesting the protein
+"""
 mutable struct Protein
     origin::AbstractString
     peptides::Vector{Peptide}
@@ -34,5 +63,33 @@ include("io.jl")
 include("preparation.jl")
 include("ms_spec.jl")
 
+"""
+    add_enzyme!(source)
+
+Add custom enzymes. The `source` must be a TSV file. The first row is the header (can be empty), the first column is the name of the enzyme, and the remaining columns are regular expressions of the cleavage sites. 
+If multiple regular expressions are given, the digested sites will be the union of all possible sites. Each expression is parsed and evaluated as Julia code, so only load files from trusted sources.
+See the example file "config_example/ENZYME.tsv".
+"""
+function add_enzyme!(source)
+    for e in CSV.Rows(source, delim = "\t")  # one row per enzyme; NOTE(review): cells in columns 2+ are eval'ed as Julia code below, so load trusted files only
+        push!(ENZYME,  e[1] => [eval(Meta.parse(r)) for r in getindex.(Ref(e), 2:length(e)) if !ismissing(r)])  # column 1 is the key; missing cells are skipped
+    end
+end
+
+"""
+    add_modification!(source)
+
+Add custom modifications. The first row is the header (can be empty), the first column is the name of the modification, the second and third columns are the additional monoisotopic mass and average mass, respectively, and the remaining columns are the modification sites. 
+^ represents the N-terminal and \$ represents the C-terminal.
+See the example file "config_example/MODIFICATION.tsv".
+"""
+function add_modification!(source)
+    for m in CSV.Rows(source, delim = "\t")  # one row per modification, keyed by column 1
+        push!(MODIFICATION_MS[1], m[1] => parse(Float64, m[2]))  # column 2 -> first mass table (the one selected when CONFIG["ACCURATE"] is true)
+        push!(MODIFICATION_MS[2], m[1] => parse(Float64, m[3]))  # column 3 -> second mass table
+        push!(MODIFICATION_SITE, m[1] => [loc for loc in getindex.(Ref(m), 4:length(m)) if !ismissing(loc)])  # columns 4+ are sites; missing cells are skipped
+    end
+end
+
 
 end
diff --git a/src/config.jl b/src/config.jl
index 2e3b894..6cf924f 100644
--- a/src/config.jl
+++ b/src/config.jl
@@ -66,27 +66,13 @@ const ADD_MS = (
     )  
 )
 
-const MODIFICATION_SITE = Dict{String, Vector{String}}()
+const MODIFICATION_SITE = Dict{String, Vector{String}}("3NPH" => ["D", "E", "\$"])
 
 const MODIFICATION_MS = (
-    Dict{String, Float64}(),
-    Dict{String, Float64}()
+    Dict{String, Float64}("3NPH" => 135.043262),
+    Dict{String, Float64}("3NPH" => 135.12472)
 )
 
-
-for m in CSV.Rows("src/config/MODIFICATION.tsv", delim = "\t")
-    push!(MODIFICATION_MS[1], m[1] => parse(Float64, m[2]))
-    push!(MODIFICATION_MS[2], m[1] => parse(Float64, m[3]))
-    push!(MODIFICATION_SITE, m[1] => [loc for loc in getindex.(Ref(m), 4:length(m)) if !ismissing(loc)])
-end
-
-const ENZYME = Dict{String, Vector{Regex}}()
-
-for e in CSV.Rows("src/config/ENZYME.tsv", delim = "\t")
-    push!(ENZYME,  e[1] => [eval(Meta.parse(r)) for r in getindex.(Ref(e), 2:length(e)) if !ismissing(r)])
-end
-
-#=
 const ENZYME = Dict(
     "Trypsin" => [r"[K, R][^P]"],                           # R/KX, X≠P
     "Trypsin(low specificity)" =>  [r"[K, R]."],            # R/KX
@@ -110,7 +96,7 @@ const ENZYME = Dict(
     "Trypsin/Chymotrypsin" => [r"[F, Y, W, L, X, K, R][^P]"],
     "MAFA" => [r"D."]  # DX
 )
-=#
+
 const ADDUCT_FN = (
     Dict(
         "[M]" => identity,
diff --git a/src/ms_spec.jl b/src/ms_spec.jl
index dac70d1..ec3e1cb 100644
--- a/src/ms_spec.jl
+++ b/src/ms_spec.jl
@@ -1,5 +1,11 @@
 # Function related mass spectrometry
 
+"""
+    ionize!(protein, adducts...)
+
+Ionize the peptides as the `adducts`.
+Available adducts are "[M]", "[M+H]+", "[M+2H]2+", "[M-H]-", "[M-2H]2-".
+"""
 function ionize!(protein::Protein, adducts::String...)
     if CONFIG["ACCURATE"]
         adduct_fn = first(ADDUCT_FN)
@@ -25,6 +31,13 @@ end
 # y: +H             => +H2O             / peptide - b
 # z: -NH2           => -NH3 + H2O       / peptide - c
 
+"""
+    fragmentation(peptide; ion_type = [:b, :y], charge_state = :auto)
+
+Fragmentation of a peptide.
+`ion_type` is the type of fragments. It can be a vector containing :a, :b, :c, :x, :y, :z. The default is [:b, :y], which are the major fragments in CID or HCD.
+`charge_state` determines the number of charges on the fragments. It can be a vector containing integers or a symbol. The default is `:auto`, which means that doubly charged fragments will be included for fragments containing more than 5 amino acids.
+"""
 function fragmentation(peptide::Peptide; ion_type = [:b, :y], charge_state = :auto)
     if CONFIG["ACCURATE"]
         aa_ms = first(AA_MS)
@@ -80,10 +93,10 @@ function _charge_fragments(neutral_fragments::Dict{Symbol, Vector{Float64}}, cha
     id = 1
     for ion in ion_type 
         for charge in charge_state 
+            charge = charge == 1 ? "" : "$charge"
+            adduct = "[M$(ion_mode)$(charge)H]$(charge)$(ion_mode)"
+            charge = UPPER_INDEX[charge * ion_mode]
             for (i, v) in enumerate(neutral_fragments[ion])
-                charge = charge == 1 ? "" : "$charge"
-                adduct = "[M$(ion_mode)$(charge)H]$(charge)$(ion_mode)"
-                charge = UPPER_INDEX[charge * ion_mode]
                 type[id] = String(ion) * Char(8320 + i) * charge
                 mass[id] = adduct_fn[adduct](v)
                 id += 1
diff --git a/src/preparation.jl b/src/preparation.jl
index 7840fae..1f90f81 100644
--- a/src/preparation.jl
+++ b/src/preparation.jl
@@ -1,5 +1,39 @@
 # Digestion and modification
 
+"""
+    modify!(protein, modification...)
+
+Modify an intact protein or digested protein.
+Modification must be a key of `MODIFICATION_SITE` or it will be ignored.
+See object `MODIFICATION_SITE` for available modifications.
+Currently, "3NPH" is supported.
+"""
+function modify!(protein::Protein, modification::String...)
+    # If digestion has already been done, add the mass to each peptide instead.
+    isempty(protein.peptides) || return _modify_mass!(protein, modification...)
+
+    for k in modification
+        # Names not registered in `MODIFICATION_SITE` are ignored.
+        haskey(MODIFICATION_SITE, k) || continue
+        locs = Int[]
+        # Sites live in `MODIFICATION_SITE`; the `MODIFICATION_MS` tables only
+        # hold `Float64` masses, so iterating them never yields a site string.
+        for loc in MODIFICATION_SITE[k]
+            if loc == "^"
+                # N-terminal: first residue
+                push!(locs, 1)
+            elseif loc == "\$"
+                # C-terminal: last residue
+                push!(locs, length(protein.origin))
+            else
+                append!(locs, findall(==(first(loc)), protein.origin))
+            end
+        end
+        protein.modification[k] = locs
+    end
+    return protein
+end
+
 function modify!(protein::Protein, modification::Dict{String, Vector{Int}})
     for (k, v) in modification
         protein.modification[k] = sort!(union!(get(protein.modification, k, Int[]), v))
@@ -46,32 +80,6 @@ function _modify_mass!(protein::Protein, modification::String...)
     protein
 end
 
-function modify!(protein::Protein, modification::String...)
-    # If digestion had been done, add mass to each peptides
-    if CONFIG["ACCURATE"]
-        modification_ms = first(MODIFICATION_MS)
-    else
-        modification_ms = last(MODIFICATION_MS)
-    end
-    isempty(protein.peptides) || return _modify_mass!(protein, modification...)
-
-    for k in modification
-        haskey(modification_ms, k) || continue
-        locs = Int[]
-        for loc in modification_ms[k]
-            if loc == "^"
-                push!(locs, 1)
-            elseif loc == "\$"
-                push!(locs, length(protein.origin))
-            else
-                append!(locs, findall(==(first(loc)), protein.origin))
-            end
-        end
-        protein.modification[k] = locs
-    end
-    protein
-end
-
 function full_digestion(sequence::AbstractString, enzyme::String)
     regex = ENZYME[enzyme]
     cleavage_sites = mapreduce(union, regex) do rs
@@ -97,6 +105,14 @@ function merge_modification(modifications)
     new
 end
 
+"""
+    digest!(protein, n_miss, enzyme = "")
+
+Digest a protein with an enzyme.
+`n_miss` is the number of missed cleavages allowed.
+If `protein.enzyme` is an empty string, `enzyme` must be provided.
+See object `ENZYME` for available enzymes.
+"""
 function digest!(protein::Protein, n_miss::Int, enzyme::String = "")
     if CONFIG["ACCURATE"]
         aa_ms = first(AA_MS)
diff --git a/test/runtests.jl b/test/runtests.jl
index 0a355a3..18ff903 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -13,6 +13,6 @@ using Test
     ionize!(p2, "[M+H]+", "[M+2H]2+")
     fragmentation(p1.peptides[1])
     fragmentation(p2.peptides[1])
-    fragmentation(p1.peptides[2])
-    fragmentation(p2.peptides[2])
+    fragmentation(p1.peptides[2]; charge_state = [1])
+    fragmentation(p2.peptides[2]; charge_state = [1, 2])
 end