From 7d942f72733df1dc16256e4ac430e4e6b1670c2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B0=D1=81=D1=82=D0=B0=D1=81=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=A8=D1=82=D0=BE=D0=BC=D0=BF=D0=B5=D0=BB=D1=8C?= Date: Sun, 1 Oct 2023 00:25:26 +0300 Subject: [PATCH 01/11] create operation functions --- HW4_Shtompel/protein_tools.py | 64 +++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 HW4_Shtompel/protein_tools.py diff --git a/HW4_Shtompel/protein_tools.py b/HW4_Shtompel/protein_tools.py new file mode 100644 index 0000000..b23a998 --- /dev/null +++ b/HW4_Shtompel/protein_tools.py @@ -0,0 +1,64 @@ +def count_protein_mass(seq: str, kda_scale=False) -> float: + """ + Calculates mass of all aminoacids of input peptide in g/mol or kDa scale. + Arguments: + - seq (str): one-letter code peptide sequence, case is not important; + - kda_scale (bool): if True converts peptide mass into kDa scale (1KDa = 1000g/mol). + Output: + Returns mass of peptide (float). + """ + aa_mass = 0 + for aminoacid in seq.upper(): + if aminoacid in AMINO_ACIDS_MASSES: + aa_mass += AMINO_ACIDS_MASSES[aminoacid] + if kda_scale is True: + kda = round(aa_mass/1000, 1) + return kda + return aa_mass + + +def count_aliphatic_index(seq: str) -> float: + """ + Calculates aliphatic index - relative proportion of aliphatic aminoacids in input peptide. + The higher aliphatic index the higher thermostability of peptide. + Argument: + - seq (str): one-letter code peptide sequence, letter case is not important. + Output: + Returns alipatic index (float). + """ + ala_count = seq.count('A')/len(seq) + val_count = seq.count('V')/len(seq) + lei_count = seq.count('L')/len(seq) + izlei_count = seq.count('I')/len(seq) + aliph_index = ala_count + 2.9 * val_count + 3.9 * lei_count + 3.9 * izlei_count + return aliph_index + + +def not_trypsin_cleaved(seq: str) -> int: + """ + Counts non-cleavable sites of trypsin: Arginine/Proline (RP) and Lysine/Proline (KP) pairs. 
+ Argument: + - seq (str): one-letter code peptide sequence, case is not important. + Output: + Returns number of exception sites that cannot be cleaved by trypsin (int). + """ + not_cleavage_count = 0 + not_cleavage_count += seq.upper().count('RP') + not_cleavage_count += seq.upper().count('KP') + return not_cleavage_count + + +def count_trypsin_sites(seq: str) -> int: + """ + Counts number of valid trypsin cleavable sites: + Arginine/any aminoacid and Lysine/any aminoacid (except Proline). + Argument: + - seq (str): one-letter code peptide sequence, case is not important. + Output: + Returns number of valid trypsin cleavable sites (int). + If peptide has not any trypsin cleavable sites, it will return zero. + """ + arginine_value = seq.upper().count('R') + lysine_value = seq.upper().count('K') + count_cleavage = arginine_value + lysine_value - not_trypsin_cleaved(seq) + return count_cleavage From 2eb2ff2494d6b704b56cc86ddacf91a05fee43a3 Mon Sep 17 00:00:00 2001 From: Elizaveta Chevokina Date: Sun, 1 Oct 2023 02:10:07 +0300 Subject: [PATCH 02/11] Add README.md file and a part of protein_tools.py file. --- README.md | 115 ++++++++++++++++++++------------------------- hw4_python.py | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 65 deletions(-) create mode 100644 hw4_python.py diff --git a/README.md b/README.md index f918170..157a958 100644 --- a/README.md +++ b/README.md @@ -1,65 +1,50 @@ -# HW 4. Functions 2 -> *This is the repo for the fourth homework of the BI Python 2023 course* - -### Homework description - -На прошлой неделе вы делали утилиту для работы с последовательностями нуклеиновых кислот (с весьма строгим ТЗ). Пришло время для чего-то более самостоятельного. - -#### Основное задание - - -Напишите утилиту для работы с последовательностями белков. Там должно быть минимум 5 различных операций, должна быть какая-то точка входа через которую пользователь будет всё это дело использовать. На этом, по сути, всё. 
Всё целиком зависит от вашей фантазии и креативности. Можете опираться на ДЗ №2 и №3. - -Самая главная часть задания - это файл `README.md`. Сделайте краткое введение, напишите описание тула, приведите документацию по использованию со списком аргументов. Добавьте примеры использования. Возможно, вы захотите сделать секцию Troubleshooting. ***Почему это нужно?*** В этот раз проверяющий не будет знать того, как должен работать ваш тул. Это ваш авторский код. Даже самая прекрасная функциональность, не будучи отраженной в README, скорее всего останется незамеченной. README - это ваш способ познакомить пользователя с тулом, показать всё лучше и обосновать, почему именно ваша команда должна получить наивысший балл. - -Есть люди которые, любят писать документации, а есть те - кто не любит. Найдите в вашей команде того, кто любит. И в будущем в своих рабочих проектах всегда держите рядом такого человек (или будьте им). - -Примеры некоторых README, которыми можно вдохновляться: - -- [MetaFX](https://github.com/ctlab/metafx), тул Артёма Иванова. Там еще и [wiki](https://github.com/ctlab/metafx/wiki) крутое. -- [samovar](https://github.com/nvaulin/samovar) -- [MetaGEM](https://github.com/franciscozorrilla/metaGEM) -- [Pharokka](https://github.com/gbouras13/pharokka) - -Типовые секции, на которые стоит обратить внимание: Title, Overview, Usage, Options, Examples, Troubleshooting, Contacts. - -**Tехническое требование к заданию.** - -Это задание будет выполняться в командах по 3 человека. Каждый из членов команды должен внести ***как минимум*** 2 функции. Каждое внесение функции должно сопровождаться коммитом с осмысленным описанием коммита. Ниже приведена последовательность действий для успешного выполнения задания (аналогично ДЗ №2): - -1. Посмотрите состав своей команды здесь ([**ССЫЛКА**](https://docs.google.com/spreadsheets/d/1KMBBBu8LqauRpDJb0v1ldPwpvzNn8-KakcHexAcqLsE/edit?usp=sharing)). -2. Тимлид делает форк данного репозитория. 
**В форке создает ветку `HW4_`, в ветке создает папку `HW4_`, в этой папке вы всё делаете.** -3. Члены команды могут либо делать свои форки, либо работать в репозитории тимлида в качестве колабораторов ("contributors"). В любом случае делаете клоны => пишите код локально => пушите. -4. В конце тимлид делайет pull-request из `HW4_` своего репозитория в `main` этого. - - -А также: -- Сопроводите программу лучшим `README.md` файлом в вашей жизни (на английском языке). -- В этом ДЗ проблемы с качеством кода (нейминги, пустые строки, анноатции типов, док.стринги, пробелы) могут привести к снижению балла. Воспользуйтесь линтерами чтобы себя обезопасить. IDE по типу PyCharm или VSCode имеют фунцонал по авто-исправлению многих проблем такого рода. - -Автотестов на GitHub в этом ДЗ нет, но вы можете прогнать линтеры на качество кода локально (как в ДЗ №3, подробнее читайте [тут](https://plausible-cannon-091.notion.site/Code-auto-checks-02b2ea69c1d545fca07b50ce5933ed5f?pvs=4)). - -- Программа должна сохранять регистр символов. -- Программа должна работать только с последовательностями белков. -- Запрещается использование сторонних модулей. - - -### Форма сдачи - -Прикрепите ссылку на pull-request тимлида в Google Class (можете сделать от лица каждого члена команды, но это не обязательно). - - -### Pазбалловка - -- За каждую из 5 операций - максимум **1.5 балла** -- За README - максимум **2.5 балла** -- Если вы не внесли как минимум 2 функции от себя, вы получаете 0 баллов (на баллы остальных членов команды это не влияет). -- За фото созвона в README можно получить 0.2 доп. балла (но не более 10 баллов суммарно) - - - -### **Предполагаемый учебный результат** - -Это задание позволит вам проявить креативность и учиться быть не только кодером, но и автором. Также это задание поможет окончательно закрепить материал по функциям который мы прошли. - -Удачи! ✨✨ +# Protein_tools +### Overview +**Protein_tools** is a tool for basic analysis of protein and polypeptide sequenses. 
Using this tool you can estimate sequence length, charge, aminoacid composition and mass of the protein, find out the aliphatic index and see if the protein could be cleaved by trypsin. + +### Usage +If you want to use the **Protein_tools**, clone this repo with `git clone`. To run this tool, you can use this command: +`run_protein_tools('<sequence>', '<option>')`, where `<sequence>` is the protein sequence (or several sequences) that should be analysed, and `<option>` is the name of the option that you want to be applied to the sequence(-s). Please write the name of the option and the sequences in quotes separated by commas, use only one option at a time and make sure that your sequences contain the one-letter names of aminoacids (the case is not important). + +### Options +1. `count_seq_length`: counts the length of the protein sequence and outputs the number of aminoacids. +2. `classify_aminoacids`: classifies all aminoacids from the input sequence in accordance with the 'AA_ALPHABET' classification. If an aminoacid is not included in this list, it is classified as 'Unusual'. + + AA_ALPHABET classification: + | Class | Aminoacids | + |----------|-----------| + | Nonpolar | G, A, V, I, L, P| + | Polar uncharged | S, T, C, M, N, Q | + | Aromatic | F, W, Y | + | Polar with negative charge | D, E | + | Polar with positive charge | K, R, H | + +3. `check_unusual_aminoacids`: checks the composition of aminoacids and returns the list of unusual aminoacids if they are present in the sequence. We call an aminoacid unusual when it does not belong to the list of proteinogenic aminoacids (see AA_ALPHABET classification). +4. `count_charge`: counts the charge of the protein by the subtraction between the number of positively and negatively charged aminoacids. +5. `count_protein_mass`: calculates mass of all aminoacids of input sequence in g/mol or kDa scale. +6. `count_aliphatic_index`: calculates aliphatic index - relative proportion of aliphatic aminoacids in input peptide.
The higher aliphatic index the higher thermostability of peptide. +7. `count_trypsin_sites`: counts number of valid trypsin cleavable sites: Arginine/any aminoacid and Lysine/any. aminoacid (except Proline). If peptide has not any trypsin cleavable sites, it will return zero. + +### Examples +An illustration of the capabilities of **Protein_tools** using a random protein sequence is presented below: +*sequence:* CVWGWAMGEACPNPIKINISAYAKTWYQNGPIGRCCCWVGYTAIRFPHQEMQQNTRFNKP + +| Option | Output | +|--------|---------| +| count_seq_length | 60 | +| classify_aminoacids | 'Nonpolar': 22, 'Polar uncharged': 20, 'Aromatic': 9, 'Polar with negative charge': 2, 'Polar with positive charge': 7, 'Unusual': 0 | +| check_unusual_aminoacids | This sequence contains only proteinogenic aminoacids. | +| count_charge | 5 | +| count_protein_mass | 6918.99 | +| count_aliphatic_index | 0.5049999999999999 | +| count_trypsin_sites | 5 | + +### Limitations and troubleshooting +**Protein_tools** has several limitations that can raise the errors in the work of the program. Here are some of them: +1. **Protein_Tools** works only with protein sequences that contains letters of Latin alphabet (the case is not important); also every aminoacid should be coded by one letter. If there are other symbols in the sequence, the tool raise `ValueError` "One of these sequences is not protein sequence or does not match the rools of input. Please select another sequence.". In this case you should check if there are punctuation marks, spaces or some other symbols in your sequence. +2. Be careful to work only with the sequences that contain aminoacids that coded with one letter. If your sequense is "SerMetAlaGly", **Protein_tools** reads it as "SERMETALAGLY". +3. The list of available functions is available in section "Options". If you see ValueError "This procedure is not available. Please choose another procedure.", probably your spelling of the name of function is incorrect. 
Please check the name of chosen prosedure and make sure that it is available in the **Protein_Tools**. + +### Contribution and contacts +- Shtompel Anastasia (Telegram: @Aenye) — teamlead, developer (options 'count_protein_mass', 'count_aliphatic_index', 'count_trypsin_sites') +- Chevokina Elizaveta (Telegram: @lzchv) — developer (options 'count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge'), author of README file \ No newline at end of file diff --git a/hw4_python.py b/hw4_python.py new file mode 100644 index 0000000..d8cca10 --- /dev/null +++ b/hw4_python.py @@ -0,0 +1,128 @@ +""" +Global variables: +- AA_ALPHABET — a dictionary variable that contains a list of proteinogenic aminoacids classes. +- ALL_AMINOACIDS — a set variable that contains a list of all proteinogenic aminoacids. +- FEATURE_FUNCTIONS — a list of available functions of the feature. +- AMINO_ACIDS_MASSES — a dictionary variable that contains masses of all proteinogenic aminoacids. +""" + +AA_ALPHABET = {'Nonpolar': ['G', 'A', 'V', 'I', 'L', 'P'], + 'Polar uncharged': ['S', 'T', 'C', 'M', 'N', 'Q'], + 'Aromatic': ['F', 'W', 'Y'], + 'Polar with negative charge': ['D', 'E'], + 'Polar with positive charge': ['K', 'R', 'H'] + } + +ALL_AMINOACIDS = set(('G', 'A', 'V', 'I', 'L', 'P', 'S', 'T', 'C', 'M', 'N', 'Q', 'F', 'W', 'Y', 'D', 'E', 'K', 'R', 'H')) + +FEATURE_FUNCTIONS = ['count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge', + 'count_protein_mass', 'count_aliphatic_index', 'count_trypsin_sites'] + + +AMINO_ACIDS_MASSES = { + 'G': 57.05, 'A': 71.08, 'S': 87.08, 'P': 97.12, 'V': 99.13, + 'T': 101.1, 'C': 103.1, 'L': 113.2, 'I': 113.2, 'N': 114.1, + 'D': 115.1, 'Q': 128.1, 'K': 128.2, 'E': 129.1, 'M': 131.2, + 'H': 137.1, 'F': 147.2, 'R': 156.2, 'Y': 163.2, 'W': 186.2 +} + + +def is_protein(seq: str) -> bool: + """ + Input: a protein sequence (a str type). + Output: boolean value. 
+ 'is_protein' function check if the sequence contains only letters in the upper case. + """ + if seq.isalpha() and seq.isupper(): + return True + + +def count_seq_length(seq: str) -> int: + """ + Input: a protein sequence (a str type). + Output: length of protein sequence (an int type). + 'count_seq_length' function counts the length of protein sequence. + """ + return len(seq) + + +def classify_aminoacids(seq: str) -> dict: + """ + Input: a protein sequence (a str type). + Output: a classification of all aminoacids from the sequence (a dict type — 'all_aminoacids_classes' variable). + 'classify_aminoacids' function classify all aminoacids from the input sequence in accordance with the 'AA_ALPHABET' classification. If aminoacid is not included in this list, + it should be classified as 'Unusual'. + """ + all_aminoacids_classes = dict.fromkeys(['Nonpolar', 'Polar uncharged', 'Aromatic', 'Polar with negative charge', 'Polar with positive charge', 'Unusual'], 0) + for aminoacid in seq: + aminoacid = aminoacid.upper() + if aminoacid not in ALL_AMINOACIDS: + all_aminoacids_classes['Unusual'] += 1 + for aa_key, aa_value in AA_ALPHABET.items(): + if aminoacid in aa_value: + all_aminoacids_classes[aa_key] += 1 + return all_aminoacids_classes + + +def check_unusual_aminoacids(seq: str) -> str: + """ + Input: a protein sequence (a str type). + Output: an answer whether the sequense contains unusual aminoacids (a str type). + 'check_unusual_aminoacids' function checks the composition of aminoacids and return the list of unusual aminoacids if they present in the sequence. We call the aminoacid + unusual when it does not belong to the list of proteinogenic aminoacids (see 'ALL_AMINOACIDS' global variable). + """ + seq_aminoacids = set() + for aminoacid in seq: + aminoacid = aminoacid.upper() + seq_aminoacids.add(aminoacid) + if seq_aminoacids <= ALL_AMINOACIDS: + return 'This sequence contains only proteinogenic aminoacids.' 
+ else: + unusual_aminoacids = seq_aminoacids - ALL_AMINOACIDS + unusual_aminoacids_str = '' + for elem in unusual_aminoacids: + unusual_aminoacids_str += elem + unusual_aminoacids_str += ', ' + return f'This protein contains unusual aminoacids: {unusual_aminoacids_str[:-2]}.' + + +def count_charge(seq: str) -> int: + """ + Input: a protein sequence (a str type). + Output: a charge of the sequence (an int type). + 'count_charge' function counts the charge of the protein by the subtraction between the number of positively and negatively charged aminoacids. + """ + seq_classes = classify_aminoacids(seq) + positive_charge = seq_classes['Polar with positive charge'] + negative_charge = seq_classes['Polar with negative charge'] + sum_charge = positive_charge - negative_charge + return sum_charge + + +OPERATIONS = {'count_protein_mass':count_protein_mass, + 'count_aliphatic_index': count_aliphatic_index, + 'count_trypsin_sites': count_trypsin_sites, + 'count_seq_length': count_seq_length, + 'classify_aminoacids': classify_aminoacids, + 'check_unusual_aminoacids': check_unusual_aminoacids, + 'count_charge': count_charge} + +def protein_tools(*args): + """ + Input: a list of protein sequences and one procedure that should be done with these sequences (str type, several values). + Output: a list of outputs from the chosen procedure (list type). + 'run_protein_tools' function take the protein sequences and the name of the procedure that the user gives and applies this procedure by one of the available functions + to all the given sequences. Also this function check the availabilaty of the procedure and raise the ValueError when the procedure is not in the list of available + functions (see 'FEATURE_FUNCTIONS' global variable). + """ + operation = args[-1] + parsed_seq_list = [] + for seq in args[0:-1]: + if not is_protein(seq): + raise ValueError("One of these sequences is not protein sequence or does not match the rools of input. 
Please select another sequence.") + else: + if operation in FEATURE_FUNCTIONS: + parsed_seq_list.append(OPERATIONS[operation](seq)) + else: + raise ValueError("This procedure is not available. Please choose another procedure.") + return parsed_seq_list From 9125a08f9842e1b389dcb000ff5f0eb2409439c7 Mon Sep 17 00:00:00 2001 From: Anastasia Shtompel <143889922+anshtompel@users.noreply.github.com> Date: Sun, 1 Oct 2023 02:41:27 +0300 Subject: [PATCH 03/11] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 157a958..982f771 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ If you want to use the **Protein_tools**, use `git clone` to this repo. To run t 4. `count_charge`: counts the charge of the protein by the subtraction between the number of positively and negatively charged aminoacids. 5. `count_protein_mass`: calculates mass of all aminoacids of input sequence in g/mol or kDa scale. 6. `count_aliphatic_index`: calculates aliphatic index - relative proportion of aliphatic aminoacids in input peptide. The higher aliphatic index the higher thermostability of peptide. -7. `count_trypsin_sites`: counts number of valid trypsin cleavable sites: Arginine/any aminoacid and Lysine/any. aminoacid (except Proline). If peptide has not any trypsin cleavable sites, it will return zero. +7. `count_trypsin_sites`: counts number of valid trypsin cleavable sites: Arginine/any aminoacid and Lysine/any aminoacid (except Proline). If peptide has not any trypsin cleavable sites, it will return zero. ### Examples An illustration of the capabilities of **Protein_tools** using a random protein sequence is presented below: @@ -41,10 +41,10 @@ An illustration of the capabilities of **Protein_tools** using a random protein ### Limitations and troubleshooting **Protein_tools** has several limitations that can raise the errors in the work of the program. Here are some of them: -1. 
**Protein_Tools** works only with protein sequences that contains letters of Latin alphabet (the case is not important); also every aminoacid should be coded by one letter. If there are other symbols in the sequence, the tool raise `ValueError` "One of these sequences is not protein sequence or does not match the rools of input. Please select another sequence.". In this case you should check if there are punctuation marks, spaces or some other symbols in your sequence. +1. **Protein_Tools** works only with protein sequences that contains letters of Latin alphabet (the case is not important); also every aminoacid should be coded by one letter. If there are other symbols in the sequence, the tool raise `ValueError` *"One of these sequences is not protein sequence or does not match the rools of input. Please select another sequence."*. In this case you should check if there are punctuation marks, spaces or some other symbols in your sequence. 2. Be careful to work only with the sequences that contain aminoacids that coded with one letter. If your sequense is "SerMetAlaGly", **Protein_tools** reads it as "SERMETALAGLY". -3. The list of available functions is available in section "Options". If you see ValueError "This procedure is not available. Please choose another procedure.", probably your spelling of the name of function is incorrect. Please check the name of chosen prosedure and make sure that it is available in the **Protein_Tools**. +3. The list of available functions is available in section "Options". If you see `ValueError` *"This procedure is not available. Please choose another procedure."*, probably your spelling of the name of function is incorrect. Please check the name of chosen prosedure and make sure that it is available in the **Protein_Tools**. 
### Contribution and contacts - Shtompel Anastasia (Telegram: @Aenye) — teamlead, developer (options 'count_protein_mass', 'count_aliphatic_index', 'count_trypsin_sites') -- Chevokina Elizaveta (Telegram: @lzchv) — developer (options 'count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge'), author of README file \ No newline at end of file +- Chevokina Elizaveta (Telegram: @lzchv) — developer (options 'count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge'), author of README file From 380cee3ff7afb5c0d25f558635efd3821d199542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B0=D1=81=D1=82=D0=B0=D1=81=D0=B8=D1=8F=20?= =?UTF-8?q?=D0=A8=D1=82=D0=BE=D0=BC=D0=BF=D0=B5=D0=BB=D1=8C?= Date: Sun, 1 Oct 2023 11:09:41 +0300 Subject: [PATCH 04/11] modify HW4 directory --- HW4_Shtompel/README.md | 50 +++++++++ HW4_Shtompel/pre-commit | 36 +++++++ HW4_Shtompel/protein_tools.py | 192 ++++++++++++++++++++++------------ 3 files changed, 214 insertions(+), 64 deletions(-) create mode 100644 HW4_Shtompel/README.md create mode 100644 HW4_Shtompel/pre-commit diff --git a/HW4_Shtompel/README.md b/HW4_Shtompel/README.md new file mode 100644 index 0000000..982f771 --- /dev/null +++ b/HW4_Shtompel/README.md @@ -0,0 +1,50 @@ +# Protein_tools +### Overview +**Protein_tools** is a tool for basic analysis of protein and polypeptide sequenses. Using this tool you can estimate sequence length, charge, aminoacid compound and mass of the protein, find out the aliphatic index and see if the protein could be cleaved by trypsin. + +### Usage +If you want to use the **Protein_tools**, use `git clone` to this repo. To run this tool, you can use this command: +`run_protein_tools('', '')`, where `` is the protein sequence (or several sequences) that should be analysed, and `` is the name of option that you want to be done with the sequence(-s). 
Please write the name of the option and the sequences in quotes separated by commas, use only one option at a time and make sure that your sequences contain the one-letter names of aminoacids (the case is not important). + +### Options +1. `count_seq_length`: counts the length of the protein sequence and outputs the number of aminoacids. +2. `classify_aminoacids`: classifies all aminoacids from the input sequence in accordance with the 'AA_ALPHABET' classification. If an aminoacid is not included in this list, it is classified as 'Unusual'. + + AA_ALPHABET classification: + | Class | Aminoacids | + |----------|-----------| + | Nonpolar | G, A, V, I, L, P| + | Polar uncharged | S, T, C, M, N, Q | + | Aromatic | F, W, Y | + | Polar with negative charge | D, E | + | Polar with positive charge | K, R, H | + +3. `check_unusual_aminoacids`: checks the composition of aminoacids and returns the list of unusual aminoacids if they are present in the sequence. We call an aminoacid unusual when it does not belong to the list of proteinogenic aminoacids (see AA_ALPHABET classification). +4. `count_charge`: counts the charge of the protein as the difference between the numbers of positively and negatively charged aminoacids. +5. `count_protein_mass`: calculates the mass of all aminoacids of the input sequence in g/mol or kDa scale. +6. `count_aliphatic_index`: calculates the aliphatic index - the relative proportion of aliphatic aminoacids in the input peptide. The higher the aliphatic index, the higher the thermostability of the peptide. +7. `count_trypsin_sites`: counts the number of valid trypsin cleavable sites: Arginine/any aminoacid and Lysine/any aminoacid (except Proline). If the peptide has no trypsin cleavable sites, it returns zero.
+ +### Examples +An illustration of the capabilities of **Protein_tools** using a random protein sequence is presented below: +*sequence:* CVWGWAMGEACPNPIKINISAYAKTWYQNGPIGRCCCWVGYTAIRFPHQEMQQNTRFNKP + +| Option | Output | +|--------|---------| +| count_seq_length | 60 | +| classify_aminoacids | 'Nonpolar': 22, 'Polar uncharged': 20, 'Aromatic': 9, 'Polar with negative charge': 2, 'Polar with positive charge': 7, 'Unusual': 0 | +| check_unusual_aminoacids | This sequence contains only proteinogenic aminoacids. | +| count_charge | 5 | +| count_protein_mass | 6918.99 | +| count_aliphatic_index | 0.5049999999999999 | +| count_trypsin_sites | 5 | + +### Limitations and troubleshooting +**Protein_tools** has several limitations that can cause errors while the program is running. Here are some of them: +1. **Protein_Tools** works only with protein sequences that contain letters of the Latin alphabet (the case is not important); also every aminoacid should be coded by one letter. If there are other symbols in the sequence, the tool raises `ValueError` *"One of these sequences is not protein sequence or does not match the rools of input. Please select another sequence."*. In this case you should check if there are punctuation marks, spaces or some other symbols in your sequence. +2. Be careful to work only with sequences that contain aminoacids coded with one letter. If your sequence is "SerMetAlaGly", **Protein_tools** reads it as "SERMETALAGLY". +3. The list of available functions is given in the section "Options". If you see `ValueError` *"This procedure is not available. Please choose another procedure."*, your spelling of the function name is probably incorrect. Please check the name of the chosen procedure and make sure that it is available in **Protein_Tools**.
+ +### Contribution and contacts +- Shtompel Anastasia (Telegram: @Aenye) — teamlead, developer (options 'count_protein_mass', 'count_aliphatic_index', 'count_trypsin_sites') +- Chevokina Elizaveta (Telegram: @lzchv) — developer (options 'count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge'), author of README file diff --git a/HW4_Shtompel/pre-commit b/HW4_Shtompel/pre-commit new file mode 100644 index 0000000..9c2219b --- /dev/null +++ b/HW4_Shtompel/pre-commit @@ -0,0 +1,36 @@ +#!/bin/bash + +echo "Hi! I'm your pre-commit code checker." + +FILE="dna_rna_tools.py" +TESTS="${FILE%.py}_test.py" + +if [ -f $FILE ]; then + + if [ ! -f hooks_env/bin/activate ]; then + echo "For the first time I need to prepare an environment, give me a minute..." + python3 -m venv hooks_env + source hooks_env/bin/activate + python3 -m pip install --upgrade pip --quiet + pip install pytest flake8 flake8-bugbear pep8-naming flake8-builtins flake8-functions-names flake8-variables-names pep8-naming pylint mypy --quiet + echo "hooks_env" >> .gitignore + echo ".gitignore" >> .gitignore + else + source hooks_env/bin/activate + fi + + echo "$(tput setab 7 setaf 1)>>>> Code quality checks <<<<$(tput sgr 0)" + echo ">>>> flake8 check" + flake8 $FILE + echo ">>>> pylint check" + pylint $FILE + echo ">>>> mypy check" + mypy $FILE + + deactivate + +else + + echo "Seems no python code to be checked. You can configure me in .git/hook/pre-commit" + +fi diff --git a/HW4_Shtompel/protein_tools.py b/HW4_Shtompel/protein_tools.py index b23a998..d8cca10 100644 --- a/HW4_Shtompel/protein_tools.py +++ b/HW4_Shtompel/protein_tools.py @@ -1,64 +1,128 @@ -def count_protein_mass(seq: str, kda_scale=False) -> float: - """ - Calculates mass of all aminoacids of input peptide in g/mol or kDa scale. - Arguments: - - seq (str): one-letter code peptide sequence, case is not important; - - kda_scale (bool): if True converts peptide mass into kDa scale (1KDa = 1000g/mol). 
- Output: - Returns mass of peptide (float). - """ - aa_mass = 0 - for aminoacid in seq.upper(): - if aminoacid in AMINO_ACIDS_MASSES: - aa_mass += AMINO_ACIDS_MASSES[aminoacid] - if kda_scale is True: - kda = round(aa_mass/1000, 1) - return kda - return aa_mass - - -def count_aliphatic_index(seq: str) -> float: - """ - Calculates aliphatic index - relative proportion of aliphatic aminoacids in input peptide. - The higher aliphatic index the higher thermostability of peptide. - Argument: - - seq (str): one-letter code peptide sequence, letter case is not important. - Output: - Returns alipatic index (float). - """ - ala_count = seq.count('A')/len(seq) - val_count = seq.count('V')/len(seq) - lei_count = seq.count('L')/len(seq) - izlei_count = seq.count('I')/len(seq) - aliph_index = ala_count + 2.9 * val_count + 3.9 * lei_count + 3.9 * izlei_count - return aliph_index - - -def not_trypsin_cleaved(seq: str) -> int: - """ - Counts non-cleavable sites of trypsin: Arginine/Proline (RP) and Lysine/Proline (KP) pairs. - Argument: - - seq (str): one-letter code peptide sequence, case is not important. - Output: - Returns number of exception sites that cannot be cleaved by trypsin (int). - """ - not_cleavage_count = 0 - not_cleavage_count += seq.upper().count('RP') - not_cleavage_count += seq.upper().count('KP') - return not_cleavage_count - - -def count_trypsin_sites(seq: str) -> int: - """ - Counts number of valid trypsin cleavable sites: - Arginine/any aminoacid and Lysine/any aminoacid (except Proline). - Argument: - - seq (str): one-letter code peptide sequence, case is not important. - Output: - Returns number of valid trypsin cleavable sites (int). - If peptide has not any trypsin cleavable sites, it will return zero. 
- """ - arginine_value = seq.upper().count('R') - lysine_value = seq.upper().count('K') - count_cleavage = arginine_value + lysine_value - not_trypsin_cleaved(seq) - return count_cleavage +""" +Global variables: +- AA_ALPHABET — a dictionary variable that contains a list of proteinogenic aminoacids classes. +- ALL_AMINOACIDS — a set variable that contains a list of all proteinogenic aminoacids. +- FEATURE_FUNCTIONS — a list of available functions of the feature. +- AMINO_ACIDS_MASSES — a dictionary variable that contains masses of all proteinogenic aminoacids. +""" + +AA_ALPHABET = {'Nonpolar': ['G', 'A', 'V', 'I', 'L', 'P'], + 'Polar uncharged': ['S', 'T', 'C', 'M', 'N', 'Q'], + 'Aromatic': ['F', 'W', 'Y'], + 'Polar with negative charge': ['D', 'E'], + 'Polar with positive charge': ['K', 'R', 'H'] + } + +ALL_AMINOACIDS = set(('G', 'A', 'V', 'I', 'L', 'P', 'S', 'T', 'C', 'M', 'N', 'Q', 'F', 'W', 'Y', 'D', 'E', 'K', 'R', 'H')) + +FEATURE_FUNCTIONS = ['count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge', + 'count_protein_mass', 'count_aliphatic_index', 'count_trypsin_sites'] + + +AMINO_ACIDS_MASSES = { + 'G': 57.05, 'A': 71.08, 'S': 87.08, 'P': 97.12, 'V': 99.13, + 'T': 101.1, 'C': 103.1, 'L': 113.2, 'I': 113.2, 'N': 114.1, + 'D': 115.1, 'Q': 128.1, 'K': 128.2, 'E': 129.1, 'M': 131.2, + 'H': 137.1, 'F': 147.2, 'R': 156.2, 'Y': 163.2, 'W': 186.2 +} + + +def is_protein(seq: str) -> bool: + """ + Input: a protein sequence (a str type). + Output: boolean value. + 'is_protein' function check if the sequence contains only letters in the upper case. + """ + if seq.isalpha() and seq.isupper(): + return True + + +def count_seq_length(seq: str) -> int: + """ + Input: a protein sequence (a str type). + Output: length of protein sequence (an int type). + 'count_seq_length' function counts the length of protein sequence. + """ + return len(seq) + + +def classify_aminoacids(seq: str) -> dict: + """ + Input: a protein sequence (a str type). 
+ Output: a classification of all aminoacids from the sequence (a dict type — 'all_aminoacids_classes' variable). + 'classify_aminoacids' function classify all aminoacids from the input sequence in accordance with the 'AA_ALPHABET' classification. If aminoacid is not included in this list, + it should be classified as 'Unusual'. + """ + all_aminoacids_classes = dict.fromkeys(['Nonpolar', 'Polar uncharged', 'Aromatic', 'Polar with negative charge', 'Polar with positive charge', 'Unusual'], 0) + for aminoacid in seq: + aminoacid = aminoacid.upper() + if aminoacid not in ALL_AMINOACIDS: + all_aminoacids_classes['Unusual'] += 1 + for aa_key, aa_value in AA_ALPHABET.items(): + if aminoacid in aa_value: + all_aminoacids_classes[aa_key] += 1 + return all_aminoacids_classes + + +def check_unusual_aminoacids(seq: str) -> str: + """ + Input: a protein sequence (a str type). + Output: an answer whether the sequense contains unusual aminoacids (a str type). + 'check_unusual_aminoacids' function checks the composition of aminoacids and return the list of unusual aminoacids if they present in the sequence. We call the aminoacid + unusual when it does not belong to the list of proteinogenic aminoacids (see 'ALL_AMINOACIDS' global variable). + """ + seq_aminoacids = set() + for aminoacid in seq: + aminoacid = aminoacid.upper() + seq_aminoacids.add(aminoacid) + if seq_aminoacids <= ALL_AMINOACIDS: + return 'This sequence contains only proteinogenic aminoacids.' + else: + unusual_aminoacids = seq_aminoacids - ALL_AMINOACIDS + unusual_aminoacids_str = '' + for elem in unusual_aminoacids: + unusual_aminoacids_str += elem + unusual_aminoacids_str += ', ' + return f'This protein contains unusual aminoacids: {unusual_aminoacids_str[:-2]}.' + + +def count_charge(seq: str) -> int: + """ + Input: a protein sequence (a str type). + Output: a charge of the sequence (an int type). 
+ 'count_charge' function counts the charge of the protein by the subtraction between the number of positively and negatively charged aminoacids. + """ + seq_classes = classify_aminoacids(seq) + positive_charge = seq_classes['Polar with positive charge'] + negative_charge = seq_classes['Polar with negative charge'] + sum_charge = positive_charge - negative_charge + return sum_charge + + +OPERATIONS = {'count_protein_mass':count_protein_mass, + 'count_aliphatic_index': count_aliphatic_index, + 'count_trypsin_sites': count_trypsin_sites, + 'count_seq_length': count_seq_length, + 'classify_aminoacids': classify_aminoacids, + 'check_unusual_aminoacids': check_unusual_aminoacids, + 'count_charge': count_charge} + +def protein_tools(*args): + """ + Input: a list of protein sequences and one procedure that should be done with these sequences (str type, several values). + Output: a list of outputs from the chosen procedure (list type). + 'run_protein_tools' function take the protein sequences and the name of the procedure that the user gives and applies this procedure by one of the available functions + to all the given sequences. Also this function check the availabilaty of the procedure and raise the ValueError when the procedure is not in the list of available + functions (see 'FEATURE_FUNCTIONS' global variable). + """ + operation = args[-1] + parsed_seq_list = [] + for seq in args[0:-1]: + if not is_protein(seq): + raise ValueError("One of these sequences is not protein sequence or does not match the rools of input. Please select another sequence.") + else: + if operation in FEATURE_FUNCTIONS: + parsed_seq_list.append(OPERATIONS[operation](seq)) + else: + raise ValueError("This procedure is not available. 
Please choose another procedure.") + return parsed_seq_list From 81b386fd18e3a74e37509fc33fe4163019f3e601 Mon Sep 17 00:00:00 2001 From: Anastasia Shtompel <143889922+anshtompel@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:14:34 +0300 Subject: [PATCH 05/11] Delete hw4_python.py --- hw4_python.py | 128 -------------------------------------------------- 1 file changed, 128 deletions(-) delete mode 100644 hw4_python.py diff --git a/hw4_python.py b/hw4_python.py deleted file mode 100644 index d8cca10..0000000 --- a/hw4_python.py +++ /dev/null @@ -1,128 +0,0 @@ -""" -Global variables: -- AA_ALPHABET — a dictionary variable that contains a list of proteinogenic aminoacids classes. -- ALL_AMINOACIDS — a set variable that contains a list of all proteinogenic aminoacids. -- FEATURE_FUNCTIONS — a list of available functions of the feature. -- AMINO_ACIDS_MASSES — a dictionary variable that contains masses of all proteinogenic aminoacids. -""" - -AA_ALPHABET = {'Nonpolar': ['G', 'A', 'V', 'I', 'L', 'P'], - 'Polar uncharged': ['S', 'T', 'C', 'M', 'N', 'Q'], - 'Aromatic': ['F', 'W', 'Y'], - 'Polar with negative charge': ['D', 'E'], - 'Polar with positive charge': ['K', 'R', 'H'] - } - -ALL_AMINOACIDS = set(('G', 'A', 'V', 'I', 'L', 'P', 'S', 'T', 'C', 'M', 'N', 'Q', 'F', 'W', 'Y', 'D', 'E', 'K', 'R', 'H')) - -FEATURE_FUNCTIONS = ['count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge', - 'count_protein_mass', 'count_aliphatic_index', 'count_trypsin_sites'] - - -AMINO_ACIDS_MASSES = { - 'G': 57.05, 'A': 71.08, 'S': 87.08, 'P': 97.12, 'V': 99.13, - 'T': 101.1, 'C': 103.1, 'L': 113.2, 'I': 113.2, 'N': 114.1, - 'D': 115.1, 'Q': 128.1, 'K': 128.2, 'E': 129.1, 'M': 131.2, - 'H': 137.1, 'F': 147.2, 'R': 156.2, 'Y': 163.2, 'W': 186.2 -} - - -def is_protein(seq: str) -> bool: - """ - Input: a protein sequence (a str type). - Output: boolean value. - 'is_protein' function check if the sequence contains only letters in the upper case. 
- """ - if seq.isalpha() and seq.isupper(): - return True - - -def count_seq_length(seq: str) -> int: - """ - Input: a protein sequence (a str type). - Output: length of protein sequence (an int type). - 'count_seq_length' function counts the length of protein sequence. - """ - return len(seq) - - -def classify_aminoacids(seq: str) -> dict: - """ - Input: a protein sequence (a str type). - Output: a classification of all aminoacids from the sequence (a dict type — 'all_aminoacids_classes' variable). - 'classify_aminoacids' function classify all aminoacids from the input sequence in accordance with the 'AA_ALPHABET' classification. If aminoacid is not included in this list, - it should be classified as 'Unusual'. - """ - all_aminoacids_classes = dict.fromkeys(['Nonpolar', 'Polar uncharged', 'Aromatic', 'Polar with negative charge', 'Polar with positive charge', 'Unusual'], 0) - for aminoacid in seq: - aminoacid = aminoacid.upper() - if aminoacid not in ALL_AMINOACIDS: - all_aminoacids_classes['Unusual'] += 1 - for aa_key, aa_value in AA_ALPHABET.items(): - if aminoacid in aa_value: - all_aminoacids_classes[aa_key] += 1 - return all_aminoacids_classes - - -def check_unusual_aminoacids(seq: str) -> str: - """ - Input: a protein sequence (a str type). - Output: an answer whether the sequense contains unusual aminoacids (a str type). - 'check_unusual_aminoacids' function checks the composition of aminoacids and return the list of unusual aminoacids if they present in the sequence. We call the aminoacid - unusual when it does not belong to the list of proteinogenic aminoacids (see 'ALL_AMINOACIDS' global variable). - """ - seq_aminoacids = set() - for aminoacid in seq: - aminoacid = aminoacid.upper() - seq_aminoacids.add(aminoacid) - if seq_aminoacids <= ALL_AMINOACIDS: - return 'This sequence contains only proteinogenic aminoacids.' 
- else: - unusual_aminoacids = seq_aminoacids - ALL_AMINOACIDS - unusual_aminoacids_str = '' - for elem in unusual_aminoacids: - unusual_aminoacids_str += elem - unusual_aminoacids_str += ', ' - return f'This protein contains unusual aminoacids: {unusual_aminoacids_str[:-2]}.' - - -def count_charge(seq: str) -> int: - """ - Input: a protein sequence (a str type). - Output: a charge of the sequence (an int type). - 'count_charge' function counts the charge of the protein by the subtraction between the number of positively and negatively charged aminoacids. - """ - seq_classes = classify_aminoacids(seq) - positive_charge = seq_classes['Polar with positive charge'] - negative_charge = seq_classes['Polar with negative charge'] - sum_charge = positive_charge - negative_charge - return sum_charge - - -OPERATIONS = {'count_protein_mass':count_protein_mass, - 'count_aliphatic_index': count_aliphatic_index, - 'count_trypsin_sites': count_trypsin_sites, - 'count_seq_length': count_seq_length, - 'classify_aminoacids': classify_aminoacids, - 'check_unusual_aminoacids': check_unusual_aminoacids, - 'count_charge': count_charge} - -def protein_tools(*args): - """ - Input: a list of protein sequences and one procedure that should be done with these sequences (str type, several values). - Output: a list of outputs from the chosen procedure (list type). - 'run_protein_tools' function take the protein sequences and the name of the procedure that the user gives and applies this procedure by one of the available functions - to all the given sequences. Also this function check the availabilaty of the procedure and raise the ValueError when the procedure is not in the list of available - functions (see 'FEATURE_FUNCTIONS' global variable). - """ - operation = args[-1] - parsed_seq_list = [] - for seq in args[0:-1]: - if not is_protein(seq): - raise ValueError("One of these sequences is not protein sequence or does not match the rools of input. 
Please select another sequence.") - else: - if operation in FEATURE_FUNCTIONS: - parsed_seq_list.append(OPERATIONS[operation](seq)) - else: - raise ValueError("This procedure is not available. Please choose another procedure.") - return parsed_seq_list From 4fc57719a41c738073a018d4eb09daf1eb301e13 Mon Sep 17 00:00:00 2001 From: Anastasia Shtompel <143889922+anshtompel@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:15:09 +0300 Subject: [PATCH 06/11] Delete external README.md --- README.md | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 982f771..0000000 --- a/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# Protein_tools -### Overview -**Protein_tools** is a tool for basic analysis of protein and polypeptide sequenses. Using this tool you can estimate sequence length, charge, aminoacid compound and mass of the protein, find out the aliphatic index and see if the protein could be cleaved by trypsin. - -### Usage -If you want to use the **Protein_tools**, use `git clone` to this repo. To run this tool, you can use this command: -`run_protein_tools('', '')`, where `` is the protein sequence (or several sequences) that should be analysed, and `` is the name of option that you want to be done with the sequence(-s). Please write the name of option and sequences in quotes separated by commas, use only one option per time and make sure that your sequences contain the one-letter names of aminoacids (the case is not important). - -### Options -1. `count_seq_length`: counts the length of protein sequence and output the number of aminoacids. -2. `classify_aminoacids`: classify all aminoacids from the input sequence in accordance with the 'AA_ALPHABET' classification. If aminoacid is not included in this list, it should be classified as 'Unusual'. 
- - AA_ALPHABET classification: - | Class | Aminoacids | - |----------|-----------| - | Nonpolar | G, A, V, I, L, P| - | Polar uncharged | S, T, C, M, N, Q | - | Aromatic | F, W, Y | - | Polar with negative charge | D, E | - | Polar with positive charge | K, R, H | - -3. `check_unusual_aminoacids`: checks the composition of aminoacids and return the list of unusual aminoacids if they present in the sequence. We call the aminoacid unusual when it does not belong to the list of proteinogenic aminoacids (see AA_ALPHABET classification). -4. `count_charge`: counts the charge of the protein by the subtraction between the number of positively and negatively charged aminoacids. -5. `count_protein_mass`: calculates mass of all aminoacids of input sequence in g/mol or kDa scale. -6. `count_aliphatic_index`: calculates aliphatic index - relative proportion of aliphatic aminoacids in input peptide. The higher aliphatic index the higher thermostability of peptide. -7. `count_trypsin_sites`: counts number of valid trypsin cleavable sites: Arginine/any aminoacid and Lysine/any aminoacid (except Proline). If peptide has not any trypsin cleavable sites, it will return zero. - -### Examples -An illustration of the capabilities of **Protein_tools** using a random protein sequence is presented below: -*sequence:* CVWGWAMGEACPNPIKINISAYAKTWYQNGPIGRCCCWVGYTAIRFPHQEMQQNTRFNKP - -| Option | Output | -|--------|---------| -| count_seq_length | 60 | -| classify_aminoacids | 'Nonpolar': 22, 'Polar uncharged': 20, 'Aromatic': 9, 'Polar with negative charge': 2, 'Polar with positive charge': 7, 'Unusual': 0 | -| check_unusual_aminoacids | This sequence contains only proteinogenic aminoacids. | -| count_charge | 5 | -| count_protein_mass | 6918.99 | -| count_aliphatic_index | 0.5049999999999999 | -| count_trypsin_sites | 5 | - -### Limitations and troubleshooting -**Protein_tools** has several limitations that can raise the errors in the work of the program. Here are some of them: -1. 
**Protein_Tools** works only with protein sequences that contains letters of Latin alphabet (the case is not important); also every aminoacid should be coded by one letter. If there are other symbols in the sequence, the tool raise `ValueError` *"One of these sequences is not protein sequence or does not match the rools of input. Please select another sequence."*. In this case you should check if there are punctuation marks, spaces or some other symbols in your sequence. -2. Be careful to work only with the sequences that contain aminoacids that coded with one letter. If your sequense is "SerMetAlaGly", **Protein_tools** reads it as "SERMETALAGLY". -3. The list of available functions is available in section "Options". If you see `ValueError` *"This procedure is not available. Please choose another procedure."*, probably your spelling of the name of function is incorrect. Please check the name of chosen prosedure and make sure that it is available in the **Protein_Tools**. - -### Contribution and contacts -- Shtompel Anastasia (Telegram: @Aenye) — teamlead, developer (options 'count_protein_mass', 'count_aliphatic_index', 'count_trypsin_sites') -- Chevokina Elizaveta (Telegram: @lzchv) — developer (options 'count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge'), author of README file From 0aae778f1ec4013447255508c27fd124e245312b Mon Sep 17 00:00:00 2001 From: Anastasia Shtompel <143889922+anshtompel@users.noreply.github.com> Date: Sun, 1 Oct 2023 11:31:55 +0300 Subject: [PATCH 07/11] add new functions, check for mistakes --- HW4_Shtompel/protein_tools.py | 72 ++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/HW4_Shtompel/protein_tools.py b/HW4_Shtompel/protein_tools.py index d8cca10..5f2aa11 100644 --- a/HW4_Shtompel/protein_tools.py +++ b/HW4_Shtompel/protein_tools.py @@ -15,10 +15,6 @@ ALL_AMINOACIDS = set(('G', 'A', 'V', 'I', 'L', 'P', 'S', 'T', 'C', 'M', 'N', 'Q', 'F', 'W', 'Y', 'D', 
'E', 'K', 'R', 'H')) -FEATURE_FUNCTIONS = ['count_seq_length', 'classify_aminoacids', 'check_unusual_aminoacids', 'count_charge', - 'count_protein_mass', 'count_aliphatic_index', 'count_trypsin_sites'] - - AMINO_ACIDS_MASSES = { 'G': 57.05, 'A': 71.08, 'S': 87.08, 'P': 97.12, 'V': 99.13, 'T': 101.1, 'C': 103.1, 'L': 113.2, 'I': 113.2, 'N': 114.1, @@ -99,6 +95,72 @@ def count_charge(seq: str) -> int: return sum_charge +def count_protein_mass(seq: str, kda_scale = False) -> float: + """ + Calculates mass of all aminoacids of input peptide in g/mol or KDa scale. + Arguments: + - seq (str): one-letter code peptide sequence, case is not important; + - kda_scale (bool): if True converts peptide mass into kDa scale (1KDa = 1000g/mol). + Output: + Returns mass of peptide (float). + """ + aa_mass = 0 + for aminoacid in seq.upper(): + if aminoacid in AMINO_ACIDS_MASSES: + aa_mass += AMINO_ACIDS_MASSES[aminoacid] + if kda_scale is True: + kda = round(aa_mass / 1000, 1) + return kda + return aa_mass + + +def count_aliphatic_index(seq: str) -> float: + """ + Calculates aliphatic index - relative proportion of aliphatic aminoacids in input peptide. + The higher aliphatic index the higher thermostability of peptide. + Argument: + - seq (str): one-letter code peptide sequence, letter case is not important. + Output: + Returns alipatic index (float). + """ + ala_count = seq.count('A') / len(seq) + val_count = seq.count('V') / len(seq) + lei_count = seq.count('L') / len(seq) + izlei_count = seq.count('I') / len(seq) + aliph_index = ala_count + 2.9 * val_count + 3.9 * lei_count + 3.9 * izlei_count + return aliph_index + + +def not_trypsin_cleaved(seq: str) -> int: + """ + Counts non-cleavable sites of trypsin: Arginine/Proline (RP) and Lysine/Proline (KP) pairs. + Argument: + - seq (str): one-letter code peptide sequence, case is not important. + Output: + Returns number of exception sites that cannot be cleaved by trypsin (int). 
+ """ + not_cleavage_count = 0 + not_cleavage_count += seq.upper().count('RP') + not_cleavage_count += seq.upper().count('KP') + return not_cleavage_count + + +def count_trypsin_sites(seq: str) -> int: + """ + Counts number of valid trypsin cleavable sites: + Arginine/any aminoacid and Lysine/any aminoacid (except Proline). + Argument: + - seq (str): one-letter code peptide sequence, case is not important. + Output: + Returns number of valid trypsin cleavable sites (int). + If peptide has not any trypsin cleavable sites, it will return zero. + """ + arginine_value = seq.upper().count('R') + lysine_value = seq.upper().count('K') + count_cleavage = arginine_value + lysine_value - not_trypsin_cleaved(seq) + return count_cleavage + + OPERATIONS = {'count_protein_mass':count_protein_mass, 'count_aliphatic_index': count_aliphatic_index, 'count_trypsin_sites': count_trypsin_sites, @@ -121,7 +183,7 @@ def protein_tools(*args): if not is_protein(seq): raise ValueError("One of these sequences is not protein sequence or does not match the rools of input. Please select another sequence.") else: - if operation in FEATURE_FUNCTIONS: + if operation in OPERATIONS: parsed_seq_list.append(OPERATIONS[operation](seq)) else: raise ValueError("This procedure is not available. 
Please choose another procedure.") From 9d4123fbe77c5842a75ab06e02b4c7278a20469a Mon Sep 17 00:00:00 2001 From: Anastasia Shtompel <143889922+anshtompel@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:17:15 +0300 Subject: [PATCH 08/11] update docstring protein_tools --- HW4_Shtompel/protein_tools.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/HW4_Shtompel/protein_tools.py b/HW4_Shtompel/protein_tools.py index 5f2aa11..4564cda 100644 --- a/HW4_Shtompel/protein_tools.py +++ b/HW4_Shtompel/protein_tools.py @@ -95,12 +95,11 @@ def count_charge(seq: str) -> int: return sum_charge -def count_protein_mass(seq: str, kda_scale = False) -> float: +def count_protein_mass(seq: str) -> float: """ - Calculates mass of all aminoacids of input peptide in g/mol or KDa scale. + Calculates mass of all aminoacids of input peptide in g/mol scale. Arguments: - seq (str): one-letter code peptide sequence, case is not important; - - kda_scale (bool): if True converts peptide mass into kDa scale (1KDa = 1000g/mol). Output: Returns mass of peptide (float). """ @@ -108,9 +107,6 @@ def count_protein_mass(seq: str, kda_scale = False) -> float: for aminoacid in seq.upper(): if aminoacid in AMINO_ACIDS_MASSES: aa_mass += AMINO_ACIDS_MASSES[aminoacid] - if kda_scale is True: - kda = round(aa_mass / 1000, 1) - return kda return aa_mass @@ -169,13 +165,27 @@ def count_trypsin_sites(seq: str) -> int: 'check_unusual_aminoacids': check_unusual_aminoacids, 'count_charge': count_charge} -def protein_tools(*args): +def protein_tools(*args: str) -> list: """ + Calculates protein phisical properties: mass, charge, length, aliphatic index; + as well as defines biological features: aminoacid composition, trypsin cleavable sites. + Input: a list of protein sequences and one procedure that should be done with these sequences (str type, several values). 
+ + Valid operations: + Protein_tools include several operations: + - count_seq_length: returns length of protein (int); + - classify_aminoacids: returns collection of classified aminoacids, included in the protein (dict); + - check_unusual_aminoacids: informs about whether the unusual aminoacis include into the protein (str); + - count_charge: returns charge value of protein (int); + - count_protein_mass: calculates mass of all aminoacids of input peptide in g/mol scale (float); + - count_aliphatic_index: calculates relative proportion of aliphatic aminoacids in input peptide (float); + - count_trypsin_sites: counts number of valid trypsin cleavable sites. + Output: a list of outputs from the chosen procedure (list type). 'run_protein_tools' function take the protein sequences and the name of the procedure that the user gives and applies this procedure by one of the available functions to all the given sequences. Also this function check the availabilaty of the procedure and raise the ValueError when the procedure is not in the list of available - functions (see 'FEATURE_FUNCTIONS' global variable). + functions (see 'OPERATIONS' global variable). """ operation = args[-1] parsed_seq_list = [] From dc34a874238ddf9f818e038c1fc89f4777f68d35 Mon Sep 17 00:00:00 2001 From: Anastasia Shtompel <143889922+anshtompel@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:17:50 +0300 Subject: [PATCH 09/11] Update README.md --- HW4_Shtompel/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Shtompel/README.md b/HW4_Shtompel/README.md index 982f771..d9e9465 100644 --- a/HW4_Shtompel/README.md +++ b/HW4_Shtompel/README.md @@ -21,7 +21,7 @@ If you want to use the **Protein_tools**, use `git clone` to this repo. To run t 3. `check_unusual_aminoacids`: checks the composition of aminoacids and return the list of unusual aminoacids if they present in the sequence. 
We call the aminoacid unusual when it does not belong to the list of proteinogenic aminoacids (see AA_ALPHABET classification). 4. `count_charge`: counts the charge of the protein by the subtraction between the number of positively and negatively charged aminoacids. -5. `count_protein_mass`: calculates mass of all aminoacids of input sequence in g/mol or kDa scale. +5. `count_protein_mass`: calculates mass of all aminoacids of input sequence in g/mol scale. 6. `count_aliphatic_index`: calculates aliphatic index - relative proportion of aliphatic aminoacids in input peptide. The higher aliphatic index the higher thermostability of peptide. 7. `count_trypsin_sites`: counts number of valid trypsin cleavable sites: Arginine/any aminoacid and Lysine/any aminoacid (except Proline). If peptide has not any trypsin cleavable sites, it will return zero. From 7c346f2c10d959652ca95c930b40d2368d42fa4d Mon Sep 17 00:00:00 2001 From: Anastasia Shtompel <143889922+anshtompel@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:18:49 +0300 Subject: [PATCH 10/11] Update protein_tools.py --- HW4_Shtompel/protein_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/HW4_Shtompel/protein_tools.py b/HW4_Shtompel/protein_tools.py index 4564cda..f1762ac 100644 --- a/HW4_Shtompel/protein_tools.py +++ b/HW4_Shtompel/protein_tools.py @@ -165,6 +165,7 @@ def count_trypsin_sites(seq: str) -> int: 'check_unusual_aminoacids': check_unusual_aminoacids, 'count_charge': count_charge} + def protein_tools(*args: str) -> list: """ Calculates protein phisical properties: mass, charge, length, aliphatic index; From ec74fd954c60e410357c3078b74db0510258d7e9 Mon Sep 17 00:00:00 2001 From: Elizaveta Chevokina <144058117+e-chevokina@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:34:30 +0300 Subject: [PATCH 11/11] Update protein_tools.py Delete a unnecessary string in the list of global variables. 
--- HW4_Shtompel/protein_tools.py | 1 - 1 file changed, 1 deletion(-) diff --git a/HW4_Shtompel/protein_tools.py b/HW4_Shtompel/protein_tools.py index f1762ac..fbe9a13 100644 --- a/HW4_Shtompel/protein_tools.py +++ b/HW4_Shtompel/protein_tools.py @@ -2,7 +2,6 @@ Global variables: - AA_ALPHABET — a dictionary variable that contains a list of proteinogenic aminoacids classes. - ALL_AMINOACIDS — a set variable that contains a list of all proteinogenic aminoacids. -- FEATURE_FUNCTIONS — a list of available functions of the feature. - AMINO_ACIDS_MASSES — a dictionary variable that contains masses of all proteinogenic aminoacids. """