diff --git a/README.md b/README.md index 76c73e4..b549f4b 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,8 @@ julia> using Pkg; Pkg.add("https://github.com/yufongpeng/PeptideSeq.jl") ## Example ```julia +julia> using PeptideSeq + julia> p = Protein("DPCHKPKRRKP") Protein: DPCHKPKRRKP Modification: diff --git a/config_example/MODIFICATION.tsv b/config_example/MODIFICATION.tsv index cf144a5..a3712c8 100644 --- a/config_example/MODIFICATION.tsv +++ b/config_example/MODIFICATION.tsv @@ -1,2 +1,9 @@ -Modification Accurate Mass Avearge Mass Sites -3NPH 135.043262 135.12472 D E $ +Modification Accurate Mass Average Mass Sites +3NPH 135.043262 135.12472 D E $ +N-acetylation 42.010565 42.03672 K ^ +methylation 14.01565 14.02705 K R +O-Hex 162.052824 162.14072 S T +O-HexNAc 203.079374 203.19472 S T +hydroxylation 15.994915 15.999 K P +phosphorylation 79.966333 79.97872 S T Y H +N-GlcNAc 203.079374 203.19472 "r""K[^P][S, T]""" diff --git a/src/PeptideSeq.jl b/src/PeptideSeq.jl index 83e1236..89fd084 100644 --- a/src/PeptideSeq.jl +++ b/src/PeptideSeq.jl @@ -80,14 +80,14 @@ end add_modification!(source) Add custom modification. The first row is the header (can be empty), the first column is the name of modification, the second and third columns are addtional monoisotopic mass and average mass, respectively, and the other columns are the modification site. -^ repressents the N-terminal and \$ repressents the C-terminal. +Modification sites can be plain strings without quotation marks or regular expressions like r"...". ^ represents the N-terminal and \$ represents the C-terminal. See the example file "config_example/MODIFICATION.tsv". """ function add_modification!(source) for m in CSV.Rows(source, delim = "\t") push!(MODIFICATION_MS[1], m[1] => parse(Float64, m[2])) push!(MODIFICATION_MS[2], m[1] => parse(Float64, m[3])) - push!(MODIFICATION_SITE, m[1] => [loc for loc in getindex.(Ref(m), 4:length(m)) if !ismissing(loc)]) + push!(MODIFICATION_SITE, m[1] => [startswith(loc, "r\"") ? 
#= NOTE(review): eval executes text read from the file; only load trusted TSV files =# eval(Meta.parse(loc)) : loc for loc in getindex.(Ref(m), 4:length(m)) if !ismissing(loc)]) end end diff --git a/src/config.jl b/src/config.jl index 6cf924f..923c698 100644 --- a/src/config.jl +++ b/src/config.jl @@ -66,11 +66,38 @@ const ADD_MS = ( ) ) -const MODIFICATION_SITE = Dict{String, Vector{String}}("3NPH" => ["D", "E", "\$"]) +const MODIFICATION_SITE = Dict{String, Vector{Any}}( + "3NPH" => ["D", "E", "\$"], + "N-acetylation" => ["K", "^"], + "methylation" => ["K", "R"], + "O-Hex" => ["S", "T"], + "O-HexNAc" => ["S", "T"], + "hydroxylation" => ["K", "P"], + "phosphorylation" => ["S", "T", "Y", "H"], + "N-GlcNAc" => [r"K[^P][S, T]"] + ) const MODIFICATION_MS = ( - Dict{String, Float64}("3NPH" => 135.043262), - Dict{String, Float64}("3NPH" => 135.12472) + Dict{String, Float64}( + "3NPH" => 135.043262, + "N-acetylation" => 42.010565, + "methylation" => 14.015650, + "O-Hex" => 162.052824, + "O-HexNAc" => 203.079374, + "hydroxylation" => 15.994915, + "phosphorylation" => 79.966333, + "N-GlcNAc" => 203.079374 + ), + Dict{String, Float64}( + "3NPH" => 135.12472, + "N-acetylation" => 42.03672, + "methylation" => 14.02705, + "O-Hex" => 162.14072, + "O-HexNAc" => 203.19472, + "hydroxylation" => 15.999, + "phosphorylation" => 79.97872, + "N-GlcNAc" => 203.19472 + ) ) const ENZYME = Dict( diff --git a/src/preparation.jl b/src/preparation.jl index 1f90f81..69627d3 100644 --- a/src/preparation.jl +++ b/src/preparation.jl @@ -9,19 +9,13 @@ See object `MODIFICATION_SITE` for available modifications. Currently, "3NPH" is supported. """ function modify!(protein::Protein, modification::String...) - # If digestion had been done, add mass to each peptides - if CONFIG["ACCURATE"] - modification_ms = first(MODIFICATION_MS) - else - modification_ms = last(MODIFICATION_MS) - end - isempty(protein.peptides) || return _modify_mass!(protein, modification...) 
- for k in modification - haskey(modification_ms, k) || continue + haskey(MODIFICATION_SITE, k) || continue locs = Int[] - for loc in modification_ms[k] - if loc == "^" + for loc in MODIFICATION_SITE[k] + if isa(loc, Regex) + append!(locs, locc.offset for locc in eachmatch(loc, protein.origin)) + elseif loc == "^" push!(locs, 1) elseif loc == "\$" push!(locs, length(protein.origin)) @@ -31,7 +25,8 @@ function modify!(protein::Protein, modification::String...) end protein.modification[k] = locs end - protein + # If digestion has already been done, add the mass to each peptide + return isempty(protein.peptides) ? protein : _modify_mass!(protein, modification...) end function modify!(protein::Protein, modification::Dict{String, Vector{Int}}) @@ -52,20 +47,6 @@ function _modify_mass!(protein::Protein, modification::String...) modification_ms = last(MODIFICATION_MS) end - for k in modification - haskey(MODIFICATION_SITE, k) || continue - locs = Int[] - for loc in MODIFICATION_SITE[k] - append!(locs, findall(==(first(loc)), protein.origin)) - if loc == "^" - push!(locs, 0) - elseif loc == "\$" - push!(locs, -1) - end - end - protein.modification[k] = locs - end - for (k, v) in protein.modification for (i, pep) in enumerate(protein.peptides) id = filter(in(v), pep.position) @@ -125,7 +106,7 @@ function digest!(protein::Protein, n_miss::Int, enzyme::String = "") end if enzyme == "" - protein.enzyme != "" && throw(ArgumentError("Please provide enzyme!")) + protein.enzyme == "" && throw(ArgumentError("Please provide enzyme!")) else protein.enzyme = enzyme end