Python-BI-2023 · OtterLawyer · Oct 1, 2023 · Oct 1, 2023 · Oct 1, 2023 · Oct 1, 2023
diff --git a/HW4_Sivtsev/protein_tools.py b/HW4_Sivtsev/protein_tools.py
@@ -0,0 +1,160 @@
+# Lookup table: 1-letter amino acid code -> 3-letter code
+# (the 20 standard amino acids).
+aminoacid_alphabet_1to3 = {
+    'A': 'Ala', 'R': 'Arg', 'N': 'Asn', 'D': 'Asp', 'C': 'Cys',
+    'Q': 'Gln', 'E': 'Glu', 'G': 'Gly', 'H': 'His', 'I': 'Ile',
+    'L': 'Leu', 'K': 'Lys', 'M': 'Met', 'F': 'Phe', 'P': 'Pro',
+    'S': 'Ser', 'T': 'Thr', 'W': 'Trp', 'Y': 'Tyr', 'V': 'Val',
+}
+
+# Lookup table: 1-letter amino acid code -> mass of the free amino acid
+# (presumably in Da / g/mol - the unit is not stated in this file).
+# calculate_mm subtracts one water molecule per peptide bond from the sum.
+molecular_mass = {
+    'A': 89.094, 'R': 174.203, 'N': 132.119, 'D': 133.104, 'C': 121.154,
+    'E': 147.131, 'Q': 146.146, 'G': 75.067, 'H': 155.156, 'I': 131.175,
+    'L': 131.175, 'K': 146.189, 'M': 149.208, 'F': 165.192, 'P': 115.132,
+    'S': 105.093, 'T': 119.119, 'W': 204.228, 'Y': 181.191, 'V': 117.148,
+}
+
+
+def is_prot(prot: str) -> bool:
+    """
+    Check that a sequence contains only the 20 standard amino acids.
+
+    The check is case-insensitive. An empty string contains no invalid
+    symbols and is therefore accepted.
+
+    Arguments:
+        prot (str) - amino acid sequence of a protein in 1-letter format
+    Return:
+        bool - True if every symbol is a valid 1-letter amino acid code
+    Raises:
+        ValueError('Please  check proteins sequences') if the sequence
+        contains any other symbol
+    """
+    aas = set('ACDEFGHIKLMNPQRSTVWY')
+    # Compare the set of distinct symbols against the allowed alphabet.
+    if set(prot.upper()).issubset(aas):
+        return True
+    raise ValueError('Please  check proteins sequences')
+
+
+def convert_1to3(prot: str) -> str:
+    """
+    Translate a 1-letter amino acid sequence into 3-letter codes.
+
+    Arguments:
+        -prot (str) - amino acid sequence in uppercase 1-letter format
+    Return:
+        -str - the 3-letter codes joined without separators
+         (an empty input gives an empty string)
+    Raises:
+        -ValueError - if a symbol is not an uppercase 1-letter amino acid code
+    """
+    codes = []
+    for residue in prot:
+        if residue not in aminoacid_alphabet_1to3:
+            raise ValueError('Input format: aminoacids in uppercase 1-letter symbols')
+        codes.append(aminoacid_alphabet_1to3[residue])
+    return ''.join(codes)
+
+
+def calculate_mm(prot: str) -> float:
+    """
+    Calculate the molecular mass of a protein.
+
+    Sums the masses of the free amino acids and subtracts the mass of one
+    water molecule (18.0153) for each of the len(prot) - 1 peptide bonds.
+
+    Arguments:
+        -prot (str) - amino acid sequence in 1-letter format (any case)
+    Return:
+        -float - molecular mass rounded to 3 digits after the dot;
+         0.0 for an empty sequence
+    Raises:
+        -KeyError - if a symbol is not a 1-letter amino acid code
+    """
+    if not prot:
+        # No residues means no peptide bonds; without this guard the water
+        # correction below would run with a bond count of -1 and add mass.
+        return 0.0
+    water_mass = 18.0153
+    # Uppercase first so that lowercase input accepted by is_prot is handled.
+    residues = prot.upper()
+    total = sum(molecular_mass[residue] for residue in residues)
+    total -= water_mass * (len(residues) - 1)
+    return round(total, 3)
+
+
+def count_aa_length(prot: str) -> int:
+    """
+    Return the number of residues (symbols) in the sequence.
+
+     Arguments:
+      -prot (str) - the sequence whose length should be counted
+     Return:
+      -int - the length of the sequence
+    """
+    residue_count = len(prot)
+    return residue_count
+
+
+def count_nucl_length(prot: str) -> int:
+    """
+    Count the length of the nucleotide sequence coding the given protein.
+
+    Each amino acid is counted as three nucleotides; nothing is added
+    for a stop codon.
+
+     Arguments:
+      -prot (str) - the amino acid sequence whose coding nucleotide
+       sequence length should be counted
+     Return:
+      -int - the length of the coding nucleotide sequence
+    """
+    return len(prot) * 3
+
+
+def count_aa_content(prot: str) -> dict:
+    """
+    Count how many times each of the 20 amino acids occurs in a protein.
+
+    The count is case-insensitive. Amino acids that are absent from the
+    sequence are reported with a quantity of 0.
+
+    Arguments: prot (str) - protein sequence in 1-letter format
+    Return: aa_content (dict) - 1-letter amino acid codes (in the order
+            ACDEFGHIKLMNPQRSTVWY) mapped to their quantity in the protein
+    Raises: ValueError('Please  check proteins sequences') if the sequence
+            contains a symbol that is not a 1-letter amino acid code
+    """
+    aa_content = dict.fromkeys('ACDEFGHIKLMNPQRSTVWY', 0)
+    for residue in prot.upper():
+        if residue not in aa_content:
+            # Same exception type and message as is_prot.
+            raise ValueError('Please  check proteins sequences')
+        aa_content[residue] += 1
+    return aa_content
+
+
+def count_extinction_280nm(prot: str) -> int:
+    """
+    Estimate the extinction coefficient at 280 nm from W, Y and C counts.
+
+    The residues are counted with count_aa_content(prot). Every pair of
+    cysteines is counted as one cystine (C // 2), which is a rough estimate.
+    Formula: e = 5500 * W + 1490 * Y + 125 * (C // 2)
+
+    Arguments: prot (str) - one of the input protein sequences
+    Return: e (int) - extinction coefficient at 280 nm
+
+    """
+    counts = count_aa_content(prot)
+    cystines = counts['C'] // 2
+    return 5500 * counts['W'] + 1490 * counts['Y'] + 125 * cystines
+
+
+def protein_tools(function: str, *prots: str) -> 'int | float | str | dict | list':
+    """
+    Run one of the supported operations on one or more protein sequences.
+
+    Available operations (values of `function`):
+      -'count_length' - count the length of the sequence
+      -'count_nucleotide_length' - count the length of the coding
+       nucleotide sequence
+      -'count_molecular_mass' - count the molecular mass of the sequence
+      -'show_content' - show the amino acid content of the sequence
+      -'convert_1_to_3' - convert 1-letter format into 3-letter format
+      -'count_extinction_280nm' - count the extinction coefficient at 280 nm
+
+    Every sequence is checked with is_prot before it is processed.
+
+     Arguments:
+      -function (str) - the name of the operation to perform
+      -prots (str) - the sequence(s) that should be processed
+     Return:
+      -the result of the operation if exactly one sequence was given
+      -list - the results in input order otherwise (empty if no
+       sequences were given)
+     Raises:
+      -KeyError - if `function` is not one of the operations listed above
+      -ValueError - if a sequence contains non-amino-acid symbols
+
+    """
+    functions = {
+        'count_length': count_aa_length,
+        'count_nucleotide_length': count_nucl_length,
+        'count_molecular_mass': calculate_mm,
+        'show_content': count_aa_content,
+        'convert_1_to_3': convert_1to3,
+        'count_extinction_280nm': count_extinction_280nm,
+    }
+    if function not in functions:
+        # Still a KeyError, but one that tells the user what is accepted.
+        raise KeyError('Unknown function {!r}. Available functions: {}'.format(
+            function, ', '.join(functions)))
+    action = functions[function]
+    results = []
+    for prot in prots:
+        is_prot(prot)
+        results.append(action(prot))
+    if len(results) == 1:
+        return results[0]
+    return results
diff --git a/README.md b/README.md
@@ -1,65 +1,79 @@
-# HW 4. Functions 2
-> *This is the repo for the fourth homework of the BI Python 2023 course*
-
-### Homework description
-
-На прошлой неделе вы делали утилиту для работы с последовательностями нуклеиновых кислот (с весьма строгим ТЗ). Пришло время для чего-то более самостоятельного. 
-
-#### Основное задание
-
-
-Напишите утилиту для работы с последовательностями белков. Там должно быть минимум 5 различных операций, должна быть какая-то точка входа через которую пользователь будет всё это дело использовать. На этом, по сути, всё. Всё целиком зависит от вашей фантазии и креативности. Можете опираться на ДЗ №2 и №3. 
-
-Самая главная часть задания - это файл `README.md`. Сделайте краткое введение, напишите описание тула, приведите документацию по использованию со списком аргументов. Добавьте примеры использования. Возможно, вы захотите сделать секцию Troubleshooting. ***Почему это нужно?*** В этот раз проверяющий не будет знать того, как должен работать ваш тул. Это ваш авторский код. Даже самая прекрасная функциональность, не будучи отраженной в README, скорее всего останется незамеченной. README - это ваш способ познакомить пользователя с тулом, показать всё лучше и обосновать, почему именно ваша команда должна получить наивысший балл. 
-
-Есть люди которые, любят писать документации, а есть те - кто не любит. Найдите в вашей команде того, кто любит. И в будущем в своих рабочих проектах всегда держите рядом такого человек (или будьте им). 
-
-Примеры некоторых README, которыми можно вдохновляться:
-
-- [MetaFX](https://github.com/ctlab/metafx), тул Артёма Иванова. Там еще и [wiki](https://github.com/ctlab/metafx/wiki) крутое.
-- [samovar](https://github.com/nvaulin/samovar)
-- [MetaGEM](https://github.com/franciscozorrilla/metaGEM)
-- [Pharokka](https://github.com/gbouras13/pharokka)
-
-Типовые секции, на которые стоит обратить внимание: Title, Overview, Usage, Options, Examples, Troubleshooting, Contacts.
-
-**Tехническое требование к заданию.**
-
-Это задание будет выполняться в командах по 3 человека. Каждый из членов команды должен внести <ins>***как минимум***</ins> 2 функции. Каждое внесение функции должно сопровождаться коммитом с осмысленным описанием коммита. Ниже приведена последовательность действий для успешного выполнения задания (аналогично ДЗ №2):
-
-1. Посмотрите состав своей команды здесь ([**ССЫЛКА**](https://docs.google.com/spreadsheets/d/1KMBBBu8LqauRpDJb0v1ldPwpvzNn8-KakcHexAcqLsE/edit?usp=sharing)). 
-2. Тимлид делает форк данного репозитория. **В форке создает ветку `HW4_<surname>`, в ветке создает папку `HW4_<surname>`, в этой папке вы всё делаете.**
-3. Члены команды могут либо делать свои форки, либо работать в репозитории тимлида в качестве колабораторов ("contributors"). В любом случае делаете клоны => пишите код локально => пушите.
-4. В конце тимлид делайет pull-request из `HW4_<surname>` своего репозитория в `main` этого.
-
-
-А также:
-- Сопроводите программу лучшим `README.md` файлом в вашей жизни (на английском языке).
-- В этом ДЗ проблемы с качеством кода (нейминги, пустые строки, анноатции типов, док.стринги, пробелы) могут привести к снижению балла. Воспользуйтесь линтерами чтобы себя обезопасить. IDE по типу PyCharm или VSCode имеют фунцонал по авто-исправлению многих проблем такого рода. 
-
-Автотестов на GitHub в этом ДЗ нет, но вы можете прогнать линтеры на качество кода локально (как в ДЗ №3, подробнее читайте [тут](https://plausible-cannon-091.notion.site/Code-auto-checks-02b2ea69c1d545fca07b50ce5933ed5f?pvs=4)). 
-
-- Программа должна сохранять регистр символов.
-- Программа должна работать только с последовательностями белков.
-- Запрещается использование сторонних модулей.
-
-
-### Форма сдачи
-
-Прикрепите ссылку на pull-request тимлида в Google Class (можете сделать от лица каждого члена команды, но это не обязательно).
-
-
-### Pазбалловка
-
-- За каждую из 5 операций - максимум **1.5 балла**
-- За README - максимум **2.5 балла**
-- Если вы не внесли как минимум 2 функции от себя, вы получаете 0 баллов (на баллы остальных членов команды это не влияет).
-- За фото созвона в README можно получить 0.2 доп. балла (но не более 10 баллов суммарно)
-
-
-
-### **Предполагаемый учебный результат**
-
-Это задание позволит вам проявить креативность и учиться быть не только кодером, но и автором. Также это задание поможет окончательно закрепить материал по функциям который мы прошли.
-
-Удачи! ✨✨
+# protein_tools.py
+This is a tool written in Python for working with protein sequences. It contains several functions, described below in the "Usage" section.
+
+## Installation
+Download protein_tools.py, adapt it to your code and relax.
+
+## Usage
+Provide the tool with the sequence(s) of the protein(s) in 1-letter format (for example, DYKDDDDK) and the name of the function needed. If you
+accidentally pass a non-peptide sequence, the program will raise an error (ValueError).  
+
+Here is the catalogue of actions the user can choose: 
+
+- *count_length*: gives the length(s) of the protein sequence(s)  
+- *count_nucleotide_length*: counts the length(s) of the coding nucleotide sequence(s) of the protein sequence(s)  
+- *count_molecular_mass*: calculates molecular mass of the input (the algorithm takes into consideration water mass and subtracts it)    
+- *show_content*: shows the aminoacid content of the protein(s)  
+- *convert_1_to_3*: converts 1-letter format into 3-letter one  
+- *count_extinction_280nm*: counts the molar extinction coefficient at 280 nm (the cystine contribution is estimated by assuming that every two cysteines form one S-S bond) 
+
+## Examples:  
+Examples for some of the protein_tools.py functions:  
+```
+function = 'count_length'
+prot1 = 'DYKDDDDK'
+prot2 = 'DYKDDdDk'
+```
+The result would be:
+```
+[8, 8]
+```
+With 'count_nucleotide_length' the same input gives three times these values: [24, 24].
+
+Count molecular mass:
+```
+function = 'count_molecular_mass'
+prot1 = 'DYKDDDDK'
+```
+The result of the program's work:
+```
+1012.982
+```
+Converting into 3-letter format:
+```
+function = 'convert_1_to_3'
+prot1 = 'DYKDDDDK'
+```
+The result:
+```
+'AspTyrLysAspAspAspAspLys'
+```
+Showing the content:
+```
+function = 'show_content'
+prot1 = 'DYKDDDDK'
+```
+The user gets this:
+```
+{'A': 0, 'C': 0, 'D': 5, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'K': 2, 'L': 0, 'M': 0, 'N': 0, 'P': 0, 'Q': 0, 'R': 0, 'S': 0, 'T': 0, 'V': 0, 'W': 0, 'Y': 1}
+```
+Count extinction coefficient 280nm:
+```
+function = 'count_extinction_280nm'
+prot1 = 'DYKDDDDK'
+prot2 = 'AADDRR'
+```
+The result:
+```
+[1490, 0]
+```
+## Troubleshooting
+If you see a ValueError, you have probably entered a non-protein sequence. The program works only with protein sequences in 1-letter format. Please check the sequence.
+## Authors' contribution:
+- Alexei Sivtsev: *calculate_mm*, *convert_1to3* (team leader)   
+- Albina Khairetdinova: *count_aa_content*, *count_extinction_280nm*, *is_prot* (it is the inner function, that appears only when the sequence is non-protein and returns ValueError)  
+- Elizaveta Zolotenkova: main function *protein_tools*, *count_aa_length*, *count_nucl_length* and the README   
+
+## Additional information (a photo of the authors)
+![authors](https://github.com/Zoea1/HW4_Functions2/assets/143959084/114d6852-8fb8-4bcc-baf7-873eb3d85a5e)