From 32b8118c4d3f7ba1816ddd25db83fb07b5e92ea3 Mon Sep 17 00:00:00 2001 From: yufongpeng <54415349+yufongpeng@users.noreply.github.com> Date: Thu, 30 Jun 2022 22:54:18 +0800 Subject: [PATCH] Add documentation --- README.md | 4 +- {src/config => config_example}/ENZYME.tsv | 0 .../MODIFICATION.tsv | 2 +- src/PeptideSeq.jl | 59 +++++++++++++++- src/config.jl | 22 ++---- src/ms_spec.jl | 19 +++++- src/preparation.jl | 68 ++++++++++++------- test/runtests.jl | 4 +- 8 files changed, 125 insertions(+), 53 deletions(-) rename {src/config => config_example}/ENZYME.tsv (100%) rename {src/config => config_example}/MODIFICATION.tsv (59%) diff --git a/README.md b/README.md index 561b182..76c73e4 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,10 @@ [![Build Status](https://github.com/yufongpeng/PeptideSeq.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/yufongpeng/PeptideSeq.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Coverage](https://codecov.io/gh/yufongpeng/PeptideSeq.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/yufongpeng/PeptideSeq.jl) -*PeptideSeq.jl* is a julia package for predicting digested peptide sequence, adding modification and generating expected fragments in mass spectrometry. +*PeptideSeq.jl* is a julia package for predicting peptide sequence after digestion, adding modification and generating expected fragments in mass spectrometry. Digestion enzyme and modification can be customized. ## Installation -This package is not yet registered. Insall it through github: +This package is not registered yet. 
Install it through github: ```julia julia> using Pkg; Pkg.add("https://github.com/yufongpeng/PeptideSeq.jl") ``` diff --git a/src/config/ENZYME.tsv b/config_example/ENZYME.tsv similarity index 100% rename from src/config/ENZYME.tsv rename to config_example/ENZYME.tsv diff --git a/src/config/MODIFICATION.tsv b/config_example/MODIFICATION.tsv similarity index 59% rename from src/config/MODIFICATION.tsv rename to config_example/MODIFICATION.tsv index b17810b..cf144a5 100644 --- a/src/config/MODIFICATION.tsv +++ b/config_example/MODIFICATION.tsv @@ -1,2 +1,2 @@ Modification Accurate Mass Avearge Mass Sites -3NPH 135.043262 135.043262 D E $ +3NPH 135.043262 135.12472 D E $ diff --git a/src/PeptideSeq.jl b/src/PeptideSeq.jl index 6767f89..83e1236 100644 --- a/src/PeptideSeq.jl +++ b/src/PeptideSeq.jl @@ -1,8 +1,26 @@ module PeptideSeq using IterTools, PrettyTables -export Protein, Peptide, Fragments, digest!, modify!, ionize!, fragmentation +export Protein, Peptide, Fragments, + digest!, modify!, ionize!, fragmentation, + + add_enzyme!, add_modification!, + + MODIFICATION_SITE, ENZYME, CONFIG + +""" + Peptide + +Representation of a peptide digested from a `Protein`. +# Field +* `origin`: Original protein sequence +* `position`: Position of this peptide counting from N-terminal +* `mass`: Monoisotopic or average mass depending on `CONFIG["ACCURATE"]` +* `adduct`: Adduct of the ionized peptide. When the peptide is not ionized, it is "[M]". 
+* `miss_cleavage`: Number of missed cleavages allowed +* `modification`: modification on the peptide +""" mutable struct Peptide origin::AbstractString position::UnitRange{Int} @@ -18,6 +36,17 @@ struct Fragments fragments::NamedTuple{(:type, :mass), Tuple{Vector{String}, Vector{Float64}}} end +""" + Protein + Protein(sequence, enzyme = "", modification = Dict{String, Vector{Int}}()) + +Representation of a protein +# Field +* `origin`: Protein sequence +* `peptides`: a vector of `Peptide` digested with `enzyme` +* `modification`: modification on the protein +* `enzyme`: an enzyme for digesting the protein +""" mutable struct Protein origin::AbstractString peptides::Vector{Peptide} @@ -34,5 +63,33 @@ include("io.jl") include("preparation.jl") include("ms_spec.jl") +""" + add_enzyme!(source) + +Add custom enzyme. The `source` must be a tsv file. The first row is the header (can be empty), the first column is the name of enzyme and the other columns are regular expressions of the cleavage sites. +If multiple regular expressions are given, the digested sites will be the union of all possible sites. +See the example file "config_example/ENZYME.tsv". +""" +function add_enzyme!(source) + for e in CSV.Rows(source, delim = "\t") + push!(ENZYME, e[1] => [eval(Meta.parse(r)) for r in getindex.(Ref(e), 2:length(e)) if !ismissing(r)]) + end +end + +""" + add_modification!(source) + +Add custom modification. The first row is the header (can be empty), the first column is the name of modification, the second and third columns are additional monoisotopic mass and average mass, respectively, and the other columns are the modification sites. +^ represents the N-terminal and \$ represents the C-terminal. +See the example file "config_example/MODIFICATION.tsv". 
+""" +function add_modification!(source) + for m in CSV.Rows(source, delim = "\t") + push!(MODIFICATION_MS[1], m[1] => parse(Float64, m[2])) + push!(MODIFICATION_MS[2], m[1] => parse(Float64, m[3])) + push!(MODIFICATION_SITE, m[1] => [loc for loc in getindex.(Ref(m), 4:length(m)) if !ismissing(loc)]) + end +end + end diff --git a/src/config.jl b/src/config.jl index 2e3b894..6cf924f 100644 --- a/src/config.jl +++ b/src/config.jl @@ -66,27 +66,13 @@ const ADD_MS = ( ) ) -const MODIFICATION_SITE = Dict{String, Vector{String}}() +const MODIFICATION_SITE = Dict{String, Vector{String}}("3NPH" => ["D", "E", "\$"]) const MODIFICATION_MS = ( - Dict{String, Float64}(), - Dict{String, Float64}() + Dict{String, Float64}("3NPH" => 135.043262), + Dict{String, Float64}("3NPH" => 135.12472) ) - -for m in CSV.Rows("src/config/MODIFICATION.tsv", delim = "\t") - push!(MODIFICATION_MS[1], m[1] => parse(Float64, m[2])) - push!(MODIFICATION_MS[2], m[1] => parse(Float64, m[3])) - push!(MODIFICATION_SITE, m[1] => [loc for loc in getindex.(Ref(m), 4:length(m)) if !ismissing(loc)]) -end - -const ENZYME = Dict{String, Vector{Regex}}() - -for e in CSV.Rows("src/config/ENZYME.tsv", delim = "\t") - push!(ENZYME, e[1] => [eval(Meta.parse(r)) for r in getindex.(Ref(e), 2:length(e)) if !ismissing(r)]) -end - -#= const ENZYME = Dict( "Trypsin" => [r"[K, R][^P]"], # R/KX, X≠P "Trypsin(low specificity)" => [r"[K, R]."], # R/KX @@ -110,7 +96,7 @@ const ENZYME = Dict( "Trypsin/Chymotrypsin" => [r"[F, Y, W, L, X, K, R][^P]"], "MAFA" => [r"D."] # DX ) -=# + const ADDUCT_FN = ( Dict( "[M]" => identity, diff --git a/src/ms_spec.jl b/src/ms_spec.jl index dac70d1..ec3e1cb 100644 --- a/src/ms_spec.jl +++ b/src/ms_spec.jl @@ -1,5 +1,11 @@ # Function related mass spectrometry +""" + ionize!(protein, adducts...) + +Ionize the peptides as the `adducts`. +Available adducts are "[M]", "[M+H]+", "[M+2H]2+", "[M-H]-", "[M-2H]2-". +""" function ionize!(protein::Protein, adducts::String...) 
if CONFIG["ACCURATE"] adduct_fn = first(ADDUCT_FN) @@ -25,6 +31,13 @@ end # y: +H => +H2O / peptide - b # z: -NH2 => -NH3 + H2O / peptide - c +""" + fragmentation(peptide; ion_type = [:b, :y], charge_state = :auto) + +Fragmentation of a peptide. +`ion_type` is the type of fragments. It can be a vector containing :a, :b, :c, :x, :y, :z. The default is [:b, :y] which are major fragments in CID or HCD. +`charge_state` determines the number of charges on the fragments. It can be a vector containing integers or a symbol. The default is `:auto` which means that doubly charged fragments will be included for fragments containing more than 5 amino acids. +""" function fragmentation(peptide::Peptide; ion_type = [:b, :y], charge_state = :auto) if CONFIG["ACCURATE"] aa_ms = first(AA_MS) @@ -80,10 +93,10 @@ function _charge_fragments(neutral_fragments::Dict{Symbol, Vector{Float64}}, cha id = 1 for ion in ion_type for charge in charge_state + charge = charge == 1 ? "" : "$charge" + adduct = "[M$(ion_mode)$(charge)H]$(charge)$(ion_mode)" + charge = UPPER_INDEX[charge * ion_mode] for (i, v) in enumerate(neutral_fragments[ion]) - charge = charge == 1 ? "" : "$charge" - adduct = "[M$(ion_mode)$(charge)H]$(charge)$(ion_mode)" - charge = UPPER_INDEX[charge * ion_mode] type[id] = String(ion) * Char(8320 + i) * charge mass[id] = adduct_fn[adduct](v) id += 1 diff --git a/src/preparation.jl b/src/preparation.jl index 7840fae..1f90f81 100644 --- a/src/preparation.jl +++ b/src/preparation.jl @@ -1,5 +1,39 @@ # Digestion and modification +""" + modify!(protein, modification...) + +Modify an intact protein or digested protein. +Modification must be a key of `MODIFICATION_SITE` or it will be ignored. +See object `MODIFICATION_SITE` for available modifications. +Currently, "3NPH" is supported. +""" +function modify!(protein::Protein, modification::String...) 
+ # If digestion had been done, add mass to each peptides + if CONFIG["ACCURATE"] + modification_ms = first(MODIFICATION_MS) + else + modification_ms = last(MODIFICATION_MS) + end + isempty(protein.peptides) || return _modify_mass!(protein, modification...) + + for k in modification + haskey(modification_ms, k) || continue + locs = Int[] + for loc in modification_ms[k] + if loc == "^" + push!(locs, 1) + elseif loc == "\$" + push!(locs, length(protein.origin)) + else + append!(locs, findall(==(first(loc)), protein.origin)) + end + end + protein.modification[k] = locs + end + protein +end + function modify!(protein::Protein, modification::Dict{String, Vector{Int}}) for (k, v) in modification protein.modification[k] = sort!(union!(get(protein.modification, k, Int[]), v)) @@ -46,32 +80,6 @@ function _modify_mass!(protein::Protein, modification::String...) protein end -function modify!(protein::Protein, modification::String...) - # If digestion had been done, add mass to each peptides - if CONFIG["ACCURATE"] - modification_ms = first(MODIFICATION_MS) - else - modification_ms = last(MODIFICATION_MS) - end - isempty(protein.peptides) || return _modify_mass!(protein, modification...) - - for k in modification - haskey(modification_ms, k) || continue - locs = Int[] - for loc in modification_ms[k] - if loc == "^" - push!(locs, 1) - elseif loc == "\$" - push!(locs, length(protein.origin)) - else - append!(locs, findall(==(first(loc)), protein.origin)) - end - end - protein.modification[k] = locs - end - protein -end - function full_digestion(sequence::AbstractString, enzyme::String) regex = ENZYME[enzyme] cleavage_sites = mapreduce(union, regex) do rs @@ -97,6 +105,14 @@ function merge_modification(modifications) new end +""" + digest!(protein, n_miss, enzyme = "") + +Digest a protein with an enzyme. +`n_miss` is the allowed number of miss cleavage. +If `protein.enzyme` is an empty string, `enzyme` must be provided. +See object `ENZYME` for available enzymes. 
+""" function digest!(protein::Protein, n_miss::Int, enzyme::String = "") if CONFIG["ACCURATE"] aa_ms = first(AA_MS) diff --git a/test/runtests.jl b/test/runtests.jl index 0a355a3..18ff903 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,6 +13,6 @@ using Test ionize!(p2, "[M+H]+", "[M+2H]2+") fragmentation(p1.peptides[1]) fragmentation(p2.peptides[1]) - fragmentation(p1.peptides[2]) - fragmentation(p2.peptides[2]) + fragmentation(p1.peptides[2]; charge_state = [1]) + fragmentation(p2.peptides[2]; charge_state = [1, 2]) end