From fa64ed0a36cdcf9fff400b1664e9326617c243ca Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 8 Oct 2023 05:35:36 +0300 Subject: [PATCH 01/29] Create README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..99dc414 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# Bioinformatics_toolkit_for_beginner +This repository contains bioinformatics utilities from a study project From 278379acc0a5fbb1c8c42ec31d8f74dcb182099d Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 8 Oct 2023 05:41:58 +0300 Subject: [PATCH 02/29] Create protein_toolkit.py --- protein_toolkit.py | 289 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 protein_toolkit.py diff --git a/protein_toolkit.py b/protein_toolkit.py new file mode 100644 index 0000000..4f260fd --- /dev/null +++ b/protein_toolkit.py @@ -0,0 +1,289 @@ +from typing import Iterable + +PROT_SET_1 = frozenset('ARNDCEQGHILKMFPSTWYV') +PROT_SET_3 = frozenset({'Ala','Arg', 'Asn', 'Asp', 'Cys', + 'Gln', 'Glu', 'Gly', 'His', 'Ile', + 'Leu', 'Lys', 'Met', 'Phe', 'Pro', + 'Ser', 'Thr', 'Trp', 'Tyr', 'Val'}) +AA_TR_DICT = {'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', + 'Cys': 'C', 'Gln': 'E', 'Glu': 'Q', 'Gly': 'G', + 'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K', + 'Met': 'M', 'Phe': 'F', 'Pro': 'P', 'Ser': 'S', + 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'} +AA_UNIPROT_CONTENT = { +"A": 9.03, "R": 5.84, "N": 3.79, "D": 5.47, "C": 1.29, +"Q": 3.80, "E": 6.24, "G": 7.27, "H": 2.22, "I": 5.53, +"L": 9.85, "K": 4.93, "M": 2.33, "F": 3.88, "P": 4.99, +"S": 6.82, "T": 5.55, "W": 1.30, "Y": 2.88, "V": 6.86 +} +AA_CHARGES = { +"A": 0, "R": 1, "N": 0, "D": -1, "C": 0, +"Q": 0, "E": -1, "G": 0, "H": 1, "I": 0, +"L": 0, "K": 1, "M": 0, "F": 0, "P": 0, +"S": 0, "T": 0, "W": 0, "Y": 0, "V": 
0 +} + + +def aa_content_check(seq: str) -> dict: + seq_content = dict.fromkeys(AA_UNIPROT_CONTENT.keys(), 0) + for AAcd in seq.upper(): + seq_content[AAcd] = seq_content[AAcd] + 1 + + seq_length = len(seq) + for AAcd, occurence in seq_content.items(): + seq_content[AAcd] = 100 * occurence / seq_length + + return seq_content + + +def Mann_Whitney_U(seq1: Iterable[int], seq2: Iterable[int]) -> bool: + """ + Mann-Whitney U-test. Used to compare aminoacids composition in sequence with average composition provided by Uniprot. + Used as a second step in `check_seq` function if sequence is 1-letter abbreviation. + """ + len_seq1, len_seq2 = len(seq1), len(seq2) + r1, r2 = dict.fromkeys(map(str, seq1), 0), dict.fromkeys(map(str, seq2), 0) + + r = sorted(list(seq1) + list(seq2)) + r_dict = dict.fromkeys(map(str, r), ()) + + for index, value in enumerate(r): + value = str(value) + r_dict[value] = r_dict[value] + (index + 1,) + for elem in r_dict: + r_dict[elem] = sum(r_dict[elem]) / len(r_dict[elem]) + + for value in seq1: + value = str(value) + r1[value] = r1[value] + r_dict[value] + + for value in seq2: + value = str(value) + r2[value] = r2[value] + r_dict[value] + + u1 = (len_seq1 * len_seq2) + len_seq1 * (len_seq1 + 1) / 2 - sum(r1.values()) + u2 = (len_seq1 * len_seq2) + len_seq2 * (len_seq2 + 1) / 2 - sum(r2.values()) + + u_stat = min(u1, u2) + + if u_stat <= 127: + return False + return True + + +def decomposition(seq): + len_seq, dec_seq = len(seq), [] + for i in range(0, len_seq, 3): + dec_seq.append(seq[i:i+3].lower().capitalize()) + return dec_seq + + +def check_seq(seq: str, abbreviation: int = 1) -> bool: + """ + Checks whether the string is protein. 
+ """ + if abbreviation == 3: + exit_code = set(seq).issubset(PROT_SET_3) + elif abbreviation == 1: + seq_set = set(seq.upper()) + exit_code = seq_set.issubset(PROT_SET_1) + if exit_code: + seq_content, uniprot_content = aa_content_check(seq).values(), AA_UNIPROT_CONTENT.values() + seq_Mann_Whitney_U = Mann_Whitney_U(seq_content, uniprot_content) if len(seq_set) == 20 else True + exit_code = seq_Mann_Whitney_U + + return exit_code + + +def seq_transform(seq: list): + seq_tr = '' + for aa in seq: + seq_tr += AA_TR_DICT[aa] + + return seq_tr + + +def seq_length(seq: str) -> int: + return len(seq) + + +def protein_mass(seq: str): + "Counts molecular weight of the protein" + count = 0 + for amino in seq: + if amino == "A": + count += 89 + elif amino == "R": + count += 174 + elif amino == "N": + count += 132 + elif amino == "V": + count += 117 + elif amino == "H": + count += 155 + elif amino == "G": + count += 75 + elif amino == "Q": + count += 146 + elif amino == "E": + count += 147 + elif amino == "I": + count += 131 + elif amino == "L": + count += 131 + elif amino == "K": + count += 146 + elif amino == "M": + count += 149 + elif amino == "P": + count += 115 + elif amino == "S": + count += 105 + elif amino == "Y": + count += 181 + elif amino == "T": + count += 119 + elif amino == "W": + count += 204 + elif amino == "F": + count += 165 + else: + count += 133 + return count + + +def protein_formula(seq: str): + "Returns molecular formula of the protein" + fС = 0 + fH = 0 + fN = 0 + fO = 0 + fS = 0 + for amino in seq: + if amino == "A": + fС += 3 + fH += 7 + fN += 1 + fO += 2 + fS += 0 + elif amino == "R": + fС += 6 + fH += 14 + fN += 4 + fO += 2 + fS += 0 + elif amino == "N": + fС += 4 + fH += 8 + fN += 2 + fO += 3 + fS += 0 + elif amino == "V": + fС += 5 + fH += 11 + fN += 1 + fO += 2 + fS += 0 + elif amino == "H": + fС += 6 + fH += 9 + fN += 3 + fO += 2 + fS += 0 + elif amino == "G": + fС += 2 + fH += 5 + fN += 1 + fO += 2 + fS += 0 + elif amino == "Q": + fС += 5 + fH 
+= 10 + fN += 2 + fO += 3 + fS += 0 + elif amino == "E": + fС += 5 + fH += 9 + fN += 1 + fO += 4 + fS += 0 + elif amino == "I": + fС += 6 + fH += 13 + fN += 1 + fO += 2 + fS += 0 + elif amino == "L": + fС += 6 + fH += 13 + fN += 1 + fO += 2 + fS += 0 + elif amino == "K": + fС += 6 + fH += 14 + fN += 2 + fO += 2 + fS += 0 + elif amino == "M": + fС += 5 + fH += 11 + fN += 1 + fO += 2 + fS += 1 + elif amino == "P": + fС += 5 + fH += 9 + fN += 1 + fO += 2 + fS += 0 + elif amino == "S": + fС += 3 + fH += 7 + fN += 1 + fO += 3 + fS += 0 + elif amino == "Y": + fС += 9 + fH += 11 + fN += 1 + fO += 3 + fS += 0 + elif amino == "T": + fС += 4 + fH += 9 + fN += 1 + fO += 3 + fS += 0 + elif amino == "W": + fС += 11 + fH += 12 + fN += 2 + fO += 2 + fS += 0 + elif amino == "F": + fС += 9 + fH += 11 + fN += 1 + fO += 2 + fS += 0 + else: + fС += 4 + fH += 7 + fN += 1 + fO += 4 + fS += 0 + if fS == 0: + aa_formula = f'С: {fС}, H: {fH}, N: {fN}, O:{fO}' + else: + aa_formula = f'С: {fС}, H: {fH}, N: {fN}, O:{fO}, S: {fS}' + return aa_formula + + +def aa_chain_charge(seq: str, aa_charges: dict = AA_CHARGES) -> int: + aa_charge = 0 + for AAcd in seq.upper(): + aa_charge += aa_charges[AAcd] + + return aa_charge \ No newline at end of file From e05d8ebf55f86de652b1952a8c23ad803c10d621 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 8 Oct 2023 05:43:40 +0300 Subject: [PATCH 03/29] Create fastq_toolkit.py --- fastq_toolkit.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 fastq_toolkit.py diff --git a/fastq_toolkit.py b/fastq_toolkit.py new file mode 100644 index 0000000..cc63746 --- /dev/null +++ b/fastq_toolkit.py @@ -0,0 +1,43 @@ +def GC_content (seq:str) -> float: + "Defines GC-content of sequence" + + gc = 0 + for nucl in seq: + if nucl == "G" or nucl == "C": + gc += 1 + gc_count = gc / len(seq) * 100 + + return gc_count + +def length_seq (seq: str) -> float: + "Defines length 
of sequence" + + seq_length = len(seq) + + return seq_length + +ascii_dict = { + "!": 0, '"': 1, "#": 2, "$":3, "%":4, + "&": 5, "'": 6, "(": 7, ")":8, "*": 9, + "+": 10, ",": 11, "-": 12, ".": 13, "/": 14, + "0": 15, "1": 16, "2": 17, "3": 18, "4": 19, + "5": 20, "6": 21, "7": 22, "8": 23, "9": 24, + ":": 25, ";": 26, "<": 27, "=": 28, ">": 29, + "?": 30, "@": 31, "A": 32, "B": 33, "C": 34, + "D": 35, "E": 36, "F": 37, "G": 38, "H": 39, + "I": 40 +} + +def quality_seq (seqs: str) -> int: + """ + Converts symbolic quality metrics to numeric ones + and counts average quality score of FastQ sequence + """ + quality_score = 0 + + for char in seqs: + quality_score += ascii_dict[char] + + average_quality_score = quality_score / len(seqs) + + return average_quality_score \ No newline at end of file From fada259166704d57939e49ec06b9959366a4a4aa Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 8 Oct 2023 05:49:52 +0300 Subject: [PATCH 04/29] Relocate protein_toolkit.py to /modules --- protein_toolkit.py => modules/protein_toolkit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename protein_toolkit.py => modules/protein_toolkit.py (96%) diff --git a/protein_toolkit.py b/modules/protein_toolkit.py similarity index 96% rename from protein_toolkit.py rename to modules/protein_toolkit.py index 4f260fd..3f3eb19 100644 --- a/protein_toolkit.py +++ b/modules/protein_toolkit.py @@ -286,4 +286,4 @@ def aa_chain_charge(seq: str, aa_charges: dict = AA_CHARGES) -> int: for AAcd in seq.upper(): aa_charge += aa_charges[AAcd] - return aa_charge \ No newline at end of file + return aa_charge From 5aa6e1f4e702f89260590129c46442f616cb732d Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 8 Oct 2023 05:50:29 +0300 Subject: [PATCH 05/29] Relocate fastq_toolkit.py to /modules --- fastq_toolkit.py => modules/fastq_toolkit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
rename fastq_toolkit.py => modules/fastq_toolkit.py (93%) diff --git a/fastq_toolkit.py b/modules/fastq_toolkit.py similarity index 93% rename from fastq_toolkit.py rename to modules/fastq_toolkit.py index cc63746..29e31c5 100644 --- a/fastq_toolkit.py +++ b/modules/fastq_toolkit.py @@ -40,4 +40,4 @@ def quality_seq (seqs: str) -> int: average_quality_score = quality_score / len(seqs) - return average_quality_score \ No newline at end of file + return average_quality_score From 28a43bf5ccc3262a49692ad25e0ac6e171636c25 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 8 Oct 2023 06:02:15 +0300 Subject: [PATCH 06/29] Create main script.py --- main script.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 main script.py diff --git a/main script.py b/main script.py new file mode 100644 index 0000000..ea84561 --- /dev/null +++ b/main script.py @@ -0,0 +1,69 @@ +from modules.fastq_toolkit import +from modules.protein_toolkit import + +def print_result(result: list, corrupt_seqs: list): + len_seq, len_corr_seq = len(result), len(corrupt_seqs) + len_seqs = len_seq + len_corr_seq + success = ["+" for _ in range(len_seqs)] + if not len_corr_seq: + print(f"All {len_seqs} sequence(s) processed successfully") + elif len_corr_seq: + for i in corrupt_seqs: + success[i[0]] = "-" + print(f'Processing result: [{"".join(success)}]\n') + print(f"{len_seq} sequence(s) out of {len_seq + len_corr_seq} given have been processed successfully.") + print(f"{len_corr_seq} has been recognized as corrupted, i.e. 
non-protein") + +OPERATIONS = {"content_check": aa_content_check, "seq_length": seq_length, "protein_formula": protein_formula, "protein_mass": protein_mass, "charge": aa_chain_charge} + +def protein_processing(*args, abbreviation: int = 1): + """ + This function makes it possible to process protein sequences to identify them, determine their length, molecular weight, amino acid composition and charge. + """ + *seqs, operation = args + if operation not in OPERATIONS: + raise ValueError(f'Unknown operation `{operation}`. Please, select from: "content_check", "seq_length", "protein_formula", "protein_mass", "charge"') + + result, corrupt_seqs = [], [] + for seq_index, seq in enumerate(seqs): + if abbreviation == 3: + seq = decomposition(seq) + is_seq_valid = check_seq(seq, abbreviation) + if is_seq_valid: + if abbreviation == 3: + seq = seq_transform(seq) + result.append(OPERATIONS[operation](seq)) + elif not is_seq_valid: + corrupt_seqs.append((seq_index, seq)) + + print_result(result, corrupt_seqs) + + res_len, cor_seq_len = len(result), len(corrupt_seqs) + result = result[0] if res_len >= 1 else result + corrupt_seqs = corrupt_seqs[0] if cor_seq_len >= 1 else corrupt_seqs + return result, corrupt_seqs + + +def filter_fastq(seqs:dict, gc_bounds=(0,100), length_bounds=(0, 2**23), quality_threshold=0): + """ + The function filtering FastQ sequences using parameters: + seqs: dictionary with file`s contents + gc_bounds: specified GC content + length_bounds: specified sequence length limits + """ + for name, seq in seqs.items(): + gc_count = GC_content(seq) + seq_length = length_seq(seq) + average_quality_score = quality_seq(name) + + filtered_seqs = {} + + lowergc, uppergc = gc_bounds + lowerlength, upperlength = length_bounds + + for name, seq in seqs.items(): + if lowergc <= gc_count <= uppergc and lowerlength <= seq_length <= upperlength: + if quality_threshold <= average_quality_score: + filtered_seqs[name] = seq[name] + + return filtered_seqs \ No newline at end 
of file From 59937fca8ed82dac3d5c0c84b12aff3578e76a73 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 8 Oct 2023 07:26:56 +0300 Subject: [PATCH 07/29] Update README.md --- README.md | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 99dc414..3e949d4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,60 @@ -# Bioinformatics_toolkit_for_beginner -This repository contains bioinformatics utilities from a study project +# Bioinformatics toolkit for beginner +The utility is designed for processing protein sequences, as well as working with DNA sequences in the fastQ format (new functions for RNA and DNA processing will be added in future versions). + +## Table of content ++ [Overview](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#overview) ++ [Installation](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#installation) ++ [Usage](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#usage) ++ [Credits](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#credits) + +## Overview +The toolkit contains a number of functions that allow you to filter data in the format of fastQ аnd also to analyze protein sequences according to a number of important physical and chemical properties. The programm is suitable for a wide range of users, including biologists with minimal knowledge of Python. + +## Installation +To install the program, download files main_script.py and contents of the folder "modules". 
+OR + +You can simple clone this repository using +``` +git clone git@github.com:LinaWhite15/Bioinformatics_toolkit_for_beginner.git +``` +(for Linux and WSL users) + +**Python3 is required.** + +## Usage +Before running the script, you must import additional modules +``` +from modules.fastq_toolkit import * +from modules.protein_toolkit import * +``` +### Input +For running **protein_toolkit** you must enter the protein sequence in one-letter or three-letter format and select one of the operations. +### List of operation in **protein_toolkit** +* ```content_check``` - Analyzes the amino acid composition of the protein. The output gives the percentage content of each molecule in the peptide. + +* ```seq_length``` - Measures the length of the peptide and gives the number of amino acids. + +* ```protein_formula``` - Gives the atomic composition of the polymer. + +* ```protein_mass``` - Сalculates the molecular mass of a protein in g/mol + +* ```charge``` - Determines the charge of a protein when pH = 7. +### fastq_toolkit +**fastq_toolkit** takes 4 arguments as input: ```seqs```, ```gc_bounds```, ```length_bounds```, ```quality_threshold```: +* ```seqs``` - a dictionary consisting of fastq sequences. Key: string, containing name of sequence. The value is a tuple of two strings: sequence and quality. +* ```gc_bounds``` - composition GC interval (in percent) for filtering. The default is (0 :100). If you pass one number as an argument, other will be considered as the upper limit. +* ```length_bounds``` - length interval for filtering, by default it is equal to (0, 2**32). +* ```quality_threshold``` - threshold value of average read quality for filtering, default is 0 (phred33 scale). + +### Output +* **protein_toolkit** - a string with result of performed operation. +* **fastq_toolkit** - a dictionary containing sequences corresponding to user-specified conditions. 
+ +## Credits +**Team:** +Belikova Angelina - kiit@gmail.com Implemented: ```protein_formula```, ```protein_mass```, ```seq_length```, **fastq_toolkit**. + +Aryuna Ayusheeva - aryuna.ayusheeva.1998@mail.ru Implemented: ```aa_content_check```, ```aa_chain_charge```. + +Bredov Denis - d2707bredov@gmail.com Implemented: ```Mann_Whitney_U```, ```decomposition```, ```seq_transform```, ```check_and_procees_seq```, ```print_result```, ```run_protein_analyzer_tool```. From a0a734543d61db226d25448f0b78efcb8c429673 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 8 Oct 2023 07:32:26 +0300 Subject: [PATCH 08/29] Update main script.py --- main script.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main script.py b/main script.py index ea84561..6a1863b 100644 --- a/main script.py +++ b/main script.py @@ -1,5 +1,5 @@ -from modules.fastq_toolkit import -from modules.protein_toolkit import +from modules.fastq_toolkit import * +from modules.protein_toolkit import * def print_result(result: list, corrupt_seqs: list): len_seq, len_corr_seq = len(result), len(corrupt_seqs) @@ -66,4 +66,4 @@ def filter_fastq(seqs:dict, gc_bounds=(0,100), length_bounds=(0, 2**23), quality if quality_threshold <= average_quality_score: filtered_seqs[name] = seq[name] - return filtered_seqs \ No newline at end of file + return filtered_seqs From bae08026ea6a6046e02ea79378b924926c13acec Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 18 Oct 2023 23:42:54 +0300 Subject: [PATCH 09/29] Add functions read_fastq and write_fastq --- main script.py | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/main script.py b/main script.py index 6a1863b..6b75f1d 100644 --- a/main script.py +++ b/main script.py @@ -1,5 +1,37 @@ from modules.fastq_toolkit import * from modules.protein_toolkit import * +import os + +def 
read_fastq(input_path: str) -> Dict[str, Tuple[str, str,str]]: + with open(input_path, 'r') as file: + titles = [] + contigs = [] + comms = [] + quals = [] + for line in file: + if line.startswith('@SRX'): + name = line.strip('\n') + titles.append(name) + contig = file.readline().strip('\n') + contigs.append(contig) + comm = file.readline().strip('\n') + comms.append(comm) + qual = file.readline().strip('\n') + quals.append(qual) + val = list(zip(contigs, comms, quals)) + fastq = dict(zip(titles, val)) + return fastq + +def write_fastq(filtered_fasq: dict, output_filename: str) -> None: + if not os.path.exists('fastq_filtrator_resuls'): + os.mkdir('fastq_filtrator_resuls') + with open(f'fastq_filtrator_resuls.fastq', 'w') as output_file: + for title, val in filtered_seqs.items(): + output_file.write(f'{title}\n') + output_file.write(f'{val[0]}\n') + output_file.write(f'{val[1]}\n') + output_file.write(f'{val[2]}\n') + def print_result(result: list, corrupt_seqs: list): len_seq, len_corr_seq = len(result), len(corrupt_seqs) @@ -44,18 +76,16 @@ def protein_processing(*args, abbreviation: int = 1): return result, corrupt_seqs -def filter_fastq(seqs:dict, gc_bounds=(0,100), length_bounds=(0, 2**23), quality_threshold=0): - """ - The function filtering FastQ sequences using parameters: - seqs: dictionary with file`s contents - gc_bounds: specified GC content - length_bounds: specified sequence length limits - """ +def filter_fastq(input_path: str, gc_bounds=(0,100), length_bounds=(0, 2**23), quality_threshold=0, output_filename = ''): for name, seq in seqs.items(): gc_count = GC_content(seq) seq_length = length_seq(seq) average_quality_score = quality_seq(name) + if output_filename is None: + output_filename = os.path.split(input_path)[-1] + + seqs = read_fastq(input_path) filtered_seqs = {} lowergc, uppergc = gc_bounds @@ -66,4 +96,4 @@ def filter_fastq(seqs:dict, gc_bounds=(0,100), length_bounds=(0, 2**23), quality if quality_threshold <= average_quality_score: 
filtered_seqs[name] = seq[name] - return filtered_seqs + write_fastq(filtered_seqs, output_filename) \ No newline at end of file From c44c77d30c19ae5556a9f50a87d5252219de6635 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 18 Oct 2023 23:49:00 +0300 Subject: [PATCH 10/29] Add functions read_fastq and write_fastq --- main script.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/main script.py b/main script.py index 6b75f1d..35079d8 100644 --- a/main script.py +++ b/main script.py @@ -22,6 +22,7 @@ def read_fastq(input_path: str) -> Dict[str, Tuple[str, str,str]]: fastq = dict(zip(titles, val)) return fastq + def write_fastq(filtered_fasq: dict, output_filename: str) -> None: if not os.path.exists('fastq_filtrator_resuls'): os.mkdir('fastq_filtrator_resuls') @@ -46,8 +47,10 @@ def print_result(result: list, corrupt_seqs: list): print(f"{len_seq} sequence(s) out of {len_seq + len_corr_seq} given have been processed successfully.") print(f"{len_corr_seq} has been recognized as corrupted, i.e. non-protein") + OPERATIONS = {"content_check": aa_content_check, "seq_length": seq_length, "protein_formula": protein_formula, "protein_mass": protein_mass, "charge": aa_chain_charge} + def protein_processing(*args, abbreviation: int = 1): """ This function makes it possible to process protein sequences to identify them, determine their length, molecular weight, amino acid composition and charge. 
From 12ad015a251e913ea8d1dff1671385f0673fad09 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 25 Feb 2024 05:18:07 +0300 Subject: [PATCH 11/29] Delete modules directory --- modules/fastq_toolkit.py | 43 ------ modules/protein_toolkit.py | 289 ------------------------------------- 2 files changed, 332 deletions(-) delete mode 100644 modules/fastq_toolkit.py delete mode 100644 modules/protein_toolkit.py diff --git a/modules/fastq_toolkit.py b/modules/fastq_toolkit.py deleted file mode 100644 index 29e31c5..0000000 --- a/modules/fastq_toolkit.py +++ /dev/null @@ -1,43 +0,0 @@ -def GC_content (seq:str) -> float: - "Defines GC-content of sequence" - - gc = 0 - for nucl in seq: - if nucl == "G" or nucl == "C": - gc += 1 - gc_count = gc / len(seq) * 100 - - return gc_count - -def length_seq (seq: str) -> float: - "Defines length of sequence" - - seq_length = len(seq) - - return seq_length - -ascii_dict = { - "!": 0, '"': 1, "#": 2, "$":3, "%":4, - "&": 5, "'": 6, "(": 7, ")":8, "*": 9, - "+": 10, ",": 11, "-": 12, ".": 13, "/": 14, - "0": 15, "1": 16, "2": 17, "3": 18, "4": 19, - "5": 20, "6": 21, "7": 22, "8": 23, "9": 24, - ":": 25, ";": 26, "<": 27, "=": 28, ">": 29, - "?": 30, "@": 31, "A": 32, "B": 33, "C": 34, - "D": 35, "E": 36, "F": 37, "G": 38, "H": 39, - "I": 40 -} - -def quality_seq (seqs: str) -> int: - """ - Converts symbolic quality metrics to numeric ones - and counts average quality score of FastQ sequence - """ - quality_score = 0 - - for char in seqs: - quality_score += ascii_dict[char] - - average_quality_score = quality_score / len(seqs) - - return average_quality_score diff --git a/modules/protein_toolkit.py b/modules/protein_toolkit.py deleted file mode 100644 index 3f3eb19..0000000 --- a/modules/protein_toolkit.py +++ /dev/null @@ -1,289 +0,0 @@ -from typing import Iterable - -PROT_SET_1 = frozenset('ARNDCEQGHILKMFPSTWYV') -PROT_SET_3 = frozenset({'Ala','Arg', 'Asn', 'Asp', 'Cys', - 'Gln', 
'Glu', 'Gly', 'His', 'Ile', - 'Leu', 'Lys', 'Met', 'Phe', 'Pro', - 'Ser', 'Thr', 'Trp', 'Tyr', 'Val'}) -AA_TR_DICT = {'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', - 'Cys': 'C', 'Gln': 'E', 'Glu': 'Q', 'Gly': 'G', - 'His': 'H', 'Ile': 'I', 'Leu': 'L', 'Lys': 'K', - 'Met': 'M', 'Phe': 'F', 'Pro': 'P', 'Ser': 'S', - 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V'} -AA_UNIPROT_CONTENT = { -"A": 9.03, "R": 5.84, "N": 3.79, "D": 5.47, "C": 1.29, -"Q": 3.80, "E": 6.24, "G": 7.27, "H": 2.22, "I": 5.53, -"L": 9.85, "K": 4.93, "M": 2.33, "F": 3.88, "P": 4.99, -"S": 6.82, "T": 5.55, "W": 1.30, "Y": 2.88, "V": 6.86 -} -AA_CHARGES = { -"A": 0, "R": 1, "N": 0, "D": -1, "C": 0, -"Q": 0, "E": -1, "G": 0, "H": 1, "I": 0, -"L": 0, "K": 1, "M": 0, "F": 0, "P": 0, -"S": 0, "T": 0, "W": 0, "Y": 0, "V": 0 -} - - -def aa_content_check(seq: str) -> dict: - seq_content = dict.fromkeys(AA_UNIPROT_CONTENT.keys(), 0) - for AAcd in seq.upper(): - seq_content[AAcd] = seq_content[AAcd] + 1 - - seq_length = len(seq) - for AAcd, occurence in seq_content.items(): - seq_content[AAcd] = 100 * occurence / seq_length - - return seq_content - - -def Mann_Whitney_U(seq1: Iterable[int], seq2: Iterable[int]) -> bool: - """ - Mann-Whitney U-test. Used to compare aminoacids composition in sequence with average composition provided by Uniprot. - Used as a second step in `check_seq` function if sequence is 1-letter abbreviation. 
- """ - len_seq1, len_seq2 = len(seq1), len(seq2) - r1, r2 = dict.fromkeys(map(str, seq1), 0), dict.fromkeys(map(str, seq2), 0) - - r = sorted(list(seq1) + list(seq2)) - r_dict = dict.fromkeys(map(str, r), ()) - - for index, value in enumerate(r): - value = str(value) - r_dict[value] = r_dict[value] + (index + 1,) - for elem in r_dict: - r_dict[elem] = sum(r_dict[elem]) / len(r_dict[elem]) - - for value in seq1: - value = str(value) - r1[value] = r1[value] + r_dict[value] - - for value in seq2: - value = str(value) - r2[value] = r2[value] + r_dict[value] - - u1 = (len_seq1 * len_seq2) + len_seq1 * (len_seq1 + 1) / 2 - sum(r1.values()) - u2 = (len_seq1 * len_seq2) + len_seq2 * (len_seq2 + 1) / 2 - sum(r2.values()) - - u_stat = min(u1, u2) - - if u_stat <= 127: - return False - return True - - -def decomposition(seq): - len_seq, dec_seq = len(seq), [] - for i in range(0, len_seq, 3): - dec_seq.append(seq[i:i+3].lower().capitalize()) - return dec_seq - - -def check_seq(seq: str, abbreviation: int = 1) -> bool: - """ - Checks whether the string is protein. 
- """ - if abbreviation == 3: - exit_code = set(seq).issubset(PROT_SET_3) - elif abbreviation == 1: - seq_set = set(seq.upper()) - exit_code = seq_set.issubset(PROT_SET_1) - if exit_code: - seq_content, uniprot_content = aa_content_check(seq).values(), AA_UNIPROT_CONTENT.values() - seq_Mann_Whitney_U = Mann_Whitney_U(seq_content, uniprot_content) if len(seq_set) == 20 else True - exit_code = seq_Mann_Whitney_U - - return exit_code - - -def seq_transform(seq: list): - seq_tr = '' - for aa in seq: - seq_tr += AA_TR_DICT[aa] - - return seq_tr - - -def seq_length(seq: str) -> int: - return len(seq) - - -def protein_mass(seq: str): - "Counts molecular weight of the protein" - count = 0 - for amino in seq: - if amino == "A": - count += 89 - elif amino == "R": - count += 174 - elif amino == "N": - count += 132 - elif amino == "V": - count += 117 - elif amino == "H": - count += 155 - elif amino == "G": - count += 75 - elif amino == "Q": - count += 146 - elif amino == "E": - count += 147 - elif amino == "I": - count += 131 - elif amino == "L": - count += 131 - elif amino == "K": - count += 146 - elif amino == "M": - count += 149 - elif amino == "P": - count += 115 - elif amino == "S": - count += 105 - elif amino == "Y": - count += 181 - elif amino == "T": - count += 119 - elif amino == "W": - count += 204 - elif amino == "F": - count += 165 - else: - count += 133 - return count - - -def protein_formula(seq: str): - "Returns molecular formula of the protein" - fС = 0 - fH = 0 - fN = 0 - fO = 0 - fS = 0 - for amino in seq: - if amino == "A": - fС += 3 - fH += 7 - fN += 1 - fO += 2 - fS += 0 - elif amino == "R": - fС += 6 - fH += 14 - fN += 4 - fO += 2 - fS += 0 - elif amino == "N": - fС += 4 - fH += 8 - fN += 2 - fO += 3 - fS += 0 - elif amino == "V": - fС += 5 - fH += 11 - fN += 1 - fO += 2 - fS += 0 - elif amino == "H": - fС += 6 - fH += 9 - fN += 3 - fO += 2 - fS += 0 - elif amino == "G": - fС += 2 - fH += 5 - fN += 1 - fO += 2 - fS += 0 - elif amino == "Q": - fС += 5 - fH 
+= 10 - fN += 2 - fO += 3 - fS += 0 - elif amino == "E": - fС += 5 - fH += 9 - fN += 1 - fO += 4 - fS += 0 - elif amino == "I": - fС += 6 - fH += 13 - fN += 1 - fO += 2 - fS += 0 - elif amino == "L": - fС += 6 - fH += 13 - fN += 1 - fO += 2 - fS += 0 - elif amino == "K": - fС += 6 - fH += 14 - fN += 2 - fO += 2 - fS += 0 - elif amino == "M": - fС += 5 - fH += 11 - fN += 1 - fO += 2 - fS += 1 - elif amino == "P": - fС += 5 - fH += 9 - fN += 1 - fO += 2 - fS += 0 - elif amino == "S": - fС += 3 - fH += 7 - fN += 1 - fO += 3 - fS += 0 - elif amino == "Y": - fС += 9 - fH += 11 - fN += 1 - fO += 3 - fS += 0 - elif amino == "T": - fС += 4 - fH += 9 - fN += 1 - fO += 3 - fS += 0 - elif amino == "W": - fС += 11 - fH += 12 - fN += 2 - fO += 2 - fS += 0 - elif amino == "F": - fС += 9 - fH += 11 - fN += 1 - fO += 2 - fS += 0 - else: - fС += 4 - fH += 7 - fN += 1 - fO += 4 - fS += 0 - if fS == 0: - aa_formula = f'С: {fС}, H: {fH}, N: {fN}, O:{fO}' - else: - aa_formula = f'С: {fС}, H: {fH}, N: {fN}, O:{fO}, S: {fS}' - return aa_formula - - -def aa_chain_charge(seq: str, aa_charges: dict = AA_CHARGES) -> int: - aa_charge = 0 - for AAcd in seq.upper(): - aa_charge += aa_charges[AAcd] - - return aa_charge From 466c6cdef700bf9d7254708b1e7509ee2d05e6b1 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 25 Feb 2024 05:21:53 +0300 Subject: [PATCH 12/29] Delete main script.py --- main script.py | 102 ------------------------------------------------- 1 file changed, 102 deletions(-) delete mode 100644 main script.py diff --git a/main script.py b/main script.py deleted file mode 100644 index 35079d8..0000000 --- a/main script.py +++ /dev/null @@ -1,102 +0,0 @@ -from modules.fastq_toolkit import * -from modules.protein_toolkit import * -import os - -def read_fastq(input_path: str) -> Dict[str, Tuple[str, str,str]]: - with open(input_path, 'r') as file: - titles = [] - contigs = [] - comms = [] - quals = [] - for line in file: - if 
line.startswith('@SRX'): - name = line.strip('\n') - titles.append(name) - contig = file.readline().strip('\n') - contigs.append(contig) - comm = file.readline().strip('\n') - comms.append(comm) - qual = file.readline().strip('\n') - quals.append(qual) - val = list(zip(contigs, comms, quals)) - fastq = dict(zip(titles, val)) - return fastq - - -def write_fastq(filtered_fasq: dict, output_filename: str) -> None: - if not os.path.exists('fastq_filtrator_resuls'): - os.mkdir('fastq_filtrator_resuls') - with open(f'fastq_filtrator_resuls.fastq', 'w') as output_file: - for title, val in filtered_seqs.items(): - output_file.write(f'{title}\n') - output_file.write(f'{val[0]}\n') - output_file.write(f'{val[1]}\n') - output_file.write(f'{val[2]}\n') - - -def print_result(result: list, corrupt_seqs: list): - len_seq, len_corr_seq = len(result), len(corrupt_seqs) - len_seqs = len_seq + len_corr_seq - success = ["+" for _ in range(len_seqs)] - if not len_corr_seq: - print(f"All {len_seqs} sequence(s) processed successfully") - elif len_corr_seq: - for i in corrupt_seqs: - success[i[0]] = "-" - print(f'Processing result: [{"".join(success)}]\n') - print(f"{len_seq} sequence(s) out of {len_seq + len_corr_seq} given have been processed successfully.") - print(f"{len_corr_seq} has been recognized as corrupted, i.e. non-protein") - - -OPERATIONS = {"content_check": aa_content_check, "seq_length": seq_length, "protein_formula": protein_formula, "protein_mass": protein_mass, "charge": aa_chain_charge} - - -def protein_processing(*args, abbreviation: int = 1): - """ - This function makes it possible to process protein sequences to identify them, determine their length, molecular weight, amino acid composition and charge. - """ - *seqs, operation = args - if operation not in OPERATIONS: - raise ValueError(f'Unknown operation `{operation}`. 
Please, select from: "content_check", "seq_length", "protein_formula", "protein_mass", "charge"') - - result, corrupt_seqs = [], [] - for seq_index, seq in enumerate(seqs): - if abbreviation == 3: - seq = decomposition(seq) - is_seq_valid = check_seq(seq, abbreviation) - if is_seq_valid: - if abbreviation == 3: - seq = seq_transform(seq) - result.append(OPERATIONS[operation](seq)) - elif not is_seq_valid: - corrupt_seqs.append((seq_index, seq)) - - print_result(result, corrupt_seqs) - - res_len, cor_seq_len = len(result), len(corrupt_seqs) - result = result[0] if res_len >= 1 else result - corrupt_seqs = corrupt_seqs[0] if cor_seq_len >= 1 else corrupt_seqs - return result, corrupt_seqs - - -def filter_fastq(input_path: str, gc_bounds=(0,100), length_bounds=(0, 2**23), quality_threshold=0, output_filename = ''): - for name, seq in seqs.items(): - gc_count = GC_content(seq) - seq_length = length_seq(seq) - average_quality_score = quality_seq(name) - - if output_filename is None: - output_filename = os.path.split(input_path)[-1] - - seqs = read_fastq(input_path) - filtered_seqs = {} - - lowergc, uppergc = gc_bounds - lowerlength, upperlength = length_bounds - - for name, seq in seqs.items(): - if lowergc <= gc_count <= uppergc and lowerlength <= seq_length <= upperlength: - if quality_threshold <= average_quality_score: - filtered_seqs[name] = seq[name] - - write_fastq(filtered_seqs, output_filename) \ No newline at end of file From a13186ef58bb058433744acb0edbff56aae2b222 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 25 Feb 2024 05:23:23 +0300 Subject: [PATCH 13/29] Update main_script.py --- main_script.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 main_script.py diff --git a/main_script.py b/main_script.py new file mode 100644 index 0000000..590d5e4 --- /dev/null +++ b/main_script.py @@ -0,0 +1,115 @@ +from abc import ABC, abstractmethod + + 
+class BiologicalSequence(ABC): + @abstractmethod + def seq_len(): + pass + + def seq_slices(): + pass + + def seq_indexes(): + pass + + def seq_to_string(): + pass + + def check_alphabet(): + pass + + +class NucleicAcidSequence(BiologicalSequence): + COMPLEMENTARITY_DICT = {"A": "T", "T": "A", "C": "G", "G": "C"} + alphabet = ["A", "T", "U", "G", "C"] + + def __init__(self, seq): + self.seq = seq + + def seq_len(self): + return len(self.seq) + + def seq_slices(self, start, stop): + slice = self.seq[start:stop] + return slice + + def seq_indexes(self, index): + return self.seq[index] + + def seq_to_string(self): + return str(self.seq) + + def check_alphabet(self): + if set(self.seq) <= set(self.alphabet): + return 'That`s a nucleic acid sequence' + else: + return 'That`s not a nucleic acid sequence' + + def complement(self): + complement = "" + for nucl in self.seq: + complement += self.COMPLEMENTARITY_DICT.get(nucl) + return complement + + def gc_content(self): + return (self.seq.count('C') + self.seq.count('G'))/len(self.seq) + + +class DNASequence(NucleicAcidSequence): + COMPLEMENTARITY_DICT = {"A": "T", "T": "A", "C": "G", "G": "C"} + TRANSCRIBE_DICT = {"A": "U", "T": "A", "C": "G", "G": "C"} + alphabet = ["A", "T", "G", "C"] + + def __init__(self, seq): + self.seq = seq + + def transcribe(self): + transcribed = "" + for nucl in self.seq: + transcribed += self.TRANSCRIBE_DICT.get(nucl) + return transcribed + + +class RNASequence(NucleicAcidSequence): + COMPLEMENTARITY_DICT = {"A": "U", "U": "A", "C": "G", "G": "C"} + alphabet = ["A", "U", "G", "C"] + + def __init__(self, seq): + self.seq = seq + + +class AminoAcidSequence(BiologicalSequence): + alphabet = frozenset('ARNDCEQGHILKMFPSTWYV') + + AA_CHARGES = {"A": 0, "R": 1, "N": 0, "D": -1, "C": 0, + "Q": 0, "E": -1, "G": 0, "H": 1, "I": 0, + "L": 0, "K": 1, "M": 0, "F": 0, "P": 0, + "S": 0, "T": 0, "W": 0, "Y": 0, "V": 0} + + def __init__(self, seq): + self.seq = seq + + def seq_len(self): + return len(self.seq) 
+ + def seq_slices(self, start, stop): + slice = self.seq[start:stop] + return slice + + def seq_indexes(self, index): + return self.seq[index] + + def seq_to_string(self): + return str(self.seq) + + def check_alphabet(self): + if set(self.seq) <= self.alphabet: + return 'That`s a protein sequence' + else: + return 'That`s not a protein sequence' + + def aa_chain_charge(self): + aa_charge = 0 + for amino in self.seq: + aa_charge += self.AA_CHARGES.get(amino) + return aa_charge From af1bef97b66b7b7462683facb1265a27661ce157 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 25 Feb 2024 05:34:23 +0300 Subject: [PATCH 14/29] Create requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..11fa53d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +numpy==1.26.2 + From ba11f4928de2614d04d57a5aed5b588384aaa484 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Sun, 25 Feb 2024 05:47:53 +0300 Subject: [PATCH 15/29] Update requirements.txt From 59fc34c948e16b773c5e90df29f7ae09f95ab282 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 12:53:59 +0300 Subject: [PATCH 16/29] Delete main_script.py --- main_script.py | 115 ------------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 main_script.py diff --git a/main_script.py b/main_script.py deleted file mode 100644 index 590d5e4..0000000 --- a/main_script.py +++ /dev/null @@ -1,115 +0,0 @@ -from abc import ABC, abstractmethod - - -class BiologicalSequence(ABC): - @abstractmethod - def seq_len(): - pass - - def seq_slices(): - pass - - def seq_indexes(): - pass - - def seq_to_string(): - pass - - def check_alphabet(): - pass - - -class NucleicAcidSequence(BiologicalSequence): - 
COMPLEMENTARITY_DICT = {"A": "T", "T": "A", "C": "G", "G": "C"} - alphabet = ["A", "T", "U", "G", "C"] - - def __init__(self, seq): - self.seq = seq - - def seq_len(self): - return len(self.seq) - - def seq_slices(self, start, stop): - slice = self.seq[start:stop] - return slice - - def seq_indexes(self, index): - return self.seq[index] - - def seq_to_string(self): - return str(self.seq) - - def check_alphabet(self): - if set(self.seq) <= set(self.alphabet): - return 'That`s a nucleic acid sequence' - else: - return 'That`s not a nucleic acid sequence' - - def complement(self): - complement = "" - for nucl in self.seq: - complement += self.COMPLEMENTARITY_DICT.get(nucl) - return complement - - def gc_content(self): - return (self.seq.count('C') + self.seq.count('G'))/len(self.seq) - - -class DNASequence(NucleicAcidSequence): - COMPLEMENTARITY_DICT = {"A": "T", "T": "A", "C": "G", "G": "C"} - TRANSCRIBE_DICT = {"A": "U", "T": "A", "C": "G", "G": "C"} - alphabet = ["A", "T", "G", "C"] - - def __init__(self, seq): - self.seq = seq - - def transcribe(self): - transcribed = "" - for nucl in self.seq: - transcribed += self.TRANSCRIBE_DICT.get(nucl) - return transcribed - - -class RNASequence(NucleicAcidSequence): - COMPLEMENTARITY_DICT = {"A": "U", "U": "A", "C": "G", "G": "C"} - alphabet = ["A", "U", "G", "C"] - - def __init__(self, seq): - self.seq = seq - - -class AminoAcidSequence(BiologicalSequence): - alphabet = frozenset('ARNDCEQGHILKMFPSTWYV') - - AA_CHARGES = {"A": 0, "R": 1, "N": 0, "D": -1, "C": 0, - "Q": 0, "E": -1, "G": 0, "H": 1, "I": 0, - "L": 0, "K": 1, "M": 0, "F": 0, "P": 0, - "S": 0, "T": 0, "W": 0, "Y": 0, "V": 0} - - def __init__(self, seq): - self.seq = seq - - def seq_len(self): - return len(self.seq) - - def seq_slices(self, start, stop): - slice = self.seq[start:stop] - return slice - - def seq_indexes(self, index): - return self.seq[index] - - def seq_to_string(self): - return str(self.seq) - - def check_alphabet(self): - if set(self.seq) <= 
self.alphabet: - return 'That`s a protein sequence' - else: - return 'That`s not a protein sequence' - - def aa_chain_charge(self): - aa_charge = 0 - for amino in self.seq: - aa_charge += self.AA_CHARGES.get(amino) - return aa_charge From 5a6e924e6aab0ed16570008dce936d92a95fe1ab Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:00:11 +0300 Subject: [PATCH 17/29] Add bio_files_processor.py --- bio_files_processor.py | 116 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 bio_files_processor.py diff --git a/bio_files_processor.py b/bio_files_processor.py new file mode 100644 index 0000000..45705bd --- /dev/null +++ b/bio_files_processor.py @@ -0,0 +1,116 @@ +from dataclasses import dataclass + + +@dataclass +class FastaRecord: + f_id: str = '' + description: str = '' + seq: str = '' + + def __repr__(self): + return f'{self.f_id} {self.description} \n {self.seq}' + + +class OpenFasta(): + def __init__(self, file_path): + self.file_path = file_path + self.fasta_handler = None + self.line = None + + def __enter__(self): + self.fasta_handler = open(self.file_path) + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.fasta_handler.close() + + def __iter__(self): + return self + + def read_records(self): + return next(self) + + def read_record(self): + records = [] + for rec in self: + records.append(rec) + return records + + def __next__(self): + if self.line is None: + self.line = self.fasta_handler.readline().rstrip() + elif self.line == '': + raise StopIteration + + f_id, descr = self.line.split(' ', 1) + seq_id = seq_id.strip('>') + seq = '' + + self.line = self.file.readline().strip() + while (not (self.line.startswith('>'))) and (self.line != ''): + seq = seq + self.line + self.line = self.file.readline().rstrip() + return FastaRecord(f_id=f_id, description=descr, seq=seq) + + +def convert_multiline_fasta_to_oneline(input_fasta, 
output_fasta): + if output_fasta is None: + raise ValueError('Please, enter name of output file') + + with open(input_fasta) as fasta: + record = "" + + for line in fasta.readlines(): + if line.startswith('>'): + pass + else: + line = line.replace('\n', '') + record += line + + with open(f'{output_fasta}', 'w') as output: + output.write(record) + print(f'The record was saved to {output_fasta} file') + + +def select_genes_from_gbk_to_fasta(input_gbk, genes, n_before=1, n_after=1, output_fasta='output.fasta'): + records = [] + choosen_genes = [] + choosen_indexes = [] + crutch = True + + with open(input_gbk) as gbk: + for line in gbk.readlines(): + if 'gene=' in line: + if crutch == True: + records.append(line) + crutch = False + if 'translation=' in line: + records.append(line) + crutch = True + + for line in records: + for gene in genes: + if gene in line: + indexes_up = range(n_before*2) + indexes_down = range(n_after*2) + + indexes_up = [(records.index(line)-1) - i for i in indexes_up] + indexes_up = indexes_up[::-1] + indexes_down = [(records.index(line)+2) + + j for j in indexes_down] + + choosen_indexes.append(indexes_up) + choosen_indexes.append(indexes_down) + + for sub in choosen_indexes: + for idx in sub: + choosen_genes.append(records[idx]) + + with open(f'{output_fasta}', 'w') as output: + for line in choosen_genes: + if 'gene=' in line: + record = '>' + line.split('"')[1] + "\n" + elif 'translation=' in line: + record = line.split('"')[1] + "\n" + output.write(record) + print(f'The result was saved to {output_fasta} file') From fd2fa02ba37e31fd2c974197e9108b0dcbac9fb1 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:01:03 +0300 Subject: [PATCH 18/29] Add bioinfUtils.py --- bioinfUtils.py | 231 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 bioinfUtils.py diff --git a/bioinfUtils.py b/bioinfUtils.py new file mode 100644 index 
0000000..bc30eba --- /dev/null +++ b/bioinfUtils.py @@ -0,0 +1,231 @@ +import os +import sys +import io +import datetime +import requests +import numpy as np +import re +from abc import ABC, abstractmethod +from os.path import join, dirname +from bs4 import BeautifulSoup +from dotenv import load_dotenv +from dataclasses import dataclass +from datetime import time + + +def telegram_logger(chat_id=1507200540, parse_mode='HTML'): + load_dotenv() + TG_API_TOKEN = os.getenv('TG_API_TOKEN') + happy = 'https://media.tenor.com/lCKwsD2OW1kAAAAj/happy-cat-happy-happy-cat.gif' + not_happy = 'https://memepedia.ru/wp-content/uploads/2024/02/grustnyj-homjak-768x512.jpg' + + def decorator(fun): + def wrapper(*args, **kwargs): + start_time = datetime.datetime.now() + try: + result = fun(*args, **kwargs) + except Exception as exc: + # result = exc + raise + + time = str(datetime.datetime.now() - start_time) + + if isinstance(result, Exception): + message = f'Функция {fun.__name__} завершилась с исключением { + type(result).__name__}:{str(result)}' + request = requests.post( + f'https://api.telegram.org/bot{TG_API_TOKEN}/sendMessage?chat_id={chat_id}&text={message}&parse_mode={parse_mode}') + request2 = requests.post( + f'https://api.telegram.org/bot{TG_API_TOKEN}/sendPhoto?chat_id={chat_id}&photo={not_happy}') + + else: + message = f'Функция { + fun.__name__} выполнена за {time}' + request = requests.post(f'https://api.telegram.org/bot{TG_API_TOKEN}/sendMessage?chat_id={ + chat_id}&text={message}&parse_mode={parse_mode}') + request2 = requests.post( + f'https://api.telegram.org/bot{TG_API_TOKEN}/sendDocument?chat_id={chat_id}&document={happy}') + return result + return wrapper + return decorator + + +@dataclass +class GenscanOutput: + status: int + cds_list: list + intron_list: list = "Empty" + exon_list: list = "Empty" + + +def genscan(sequence=None, sequence_file=None, organism="Vertebrate ", exon_cutoff=1.00, sequence_name=""): + ''' + Python API for Genscan service 
(http://hollywood.mit.edu/GENSCAN.html) + Args: + sequence: nucleotide sequence in str format + sequence_file: path to the FASTA file with the nucleotide sequence + organism: the type of organism whose nucleotide sequence was uploaded. Сould be either Vertebrate, Arabidopsis or Maize. + exon_cutoff: confidence threshold for intron excision in float format. Can take on the following values: 1.00, 0.50, 0.25, 0.10, 0.05, 0.02, 0.01. + sequence_name: optionally, sequence name in str format + + Returns: + GenscanOutput class object, with the following attributes: + status: server request status + cds_list: list of predicted protein sequences + intron_list: list of predicted introns + exon_list: list of predicted exons + ''' + url = 'http://hollywood.mit.edu/cgi-bin/genscanw_py.cgi' + rec_param = {"-o": "organism", "-e": "exon_cutoff", + "-p": "Predicted peptides only", "-s": "sequence", "-u": ""} + + if sequence_file is not None: + sequence_file = open(sequence_file, 'rb').read() + + rec_param["-o"] = organism + rec_param["-e"] = exon_cutoff + rec_param["-p"] = "Predicted peptides only" + rec_param["-s"] = sequence + rec_param["-u"] = sequence_file + + response = requests.post(url, data=rec_param) + status = response.status_code + soup = BeautifulSoup(response.content, "lxml") + str_soup = str(soup) + + peptide_pattern = r"GENSCAN_predicted_peptide_\d\|\d+_aa\n\n[A-Z\\n]+" + peptides = re.findall(peptide_pattern, str_soup) + + intron_pattern = r"\d.\d+ Intr [+-] +\d+ +\d+" + introns = re.findall(intron_pattern, str_soup) + + str_intr = str(introns) + sub_pattern = r"'\d.\d\d " + replacement = "" + str_intr_clear = re.sub(sub_pattern, replacement, str_intr) + exon_pattern = r"\d+" + exons = re.findall(exon_pattern, str_intr_clear) + exons = list(map(int, exons)) + + exons_borders = [] + counter = 0 + + for i in exons[1:]: + if counter == len(exons)-2: + break + counter += 1 + if counter % 2 == 0: + border = i - 1 + else: + border = i + 1 + exons_borders.append(border) + 
+ exons_borders_array = np.asarray(exons_borders) + exons_borders_array = exons_borders_array.reshape( + int(len(exons_borders_array)/2), 2) + + counter_2 = 0 + for obj in exons_borders: + counter_2 += 1 + new_str = str(counter_2) + " " + str(obj) + exons_borders[counter_2 - 1] = new_str + + genscan_output = GenscanOutput( + status=status, cds_list=peptides, intron_list=introns, exon_list=exons_borders) + + return genscan_output + + +class BiologicalSequence(ABC): + def __init__(self, seq): + self.seq = seq + + def __len__(self): + return len(self.seq) + + def seq_slices(self, start, stop): + slice = self.seq[start:stop] + return slice + + def seq_indexes(self, index): + return self.seq[index] + + def __str__(self): + return str(self.seq) + + def check_alphabet(): + pass + + +class NucleicAcidSequence(BiologicalSequence): + COMPLEMENTARITY_DICT = {"A": "T", "T": "A", "C": "G", "G": "C"} + alphabet = ["A", "T", "U", "G", "C"] + + def __init__(self, seq): + super().__init__(seq) + self.seq = seq + + def check_alphabet(self): + if set(self.seq) <= set(self.alphabet): + return True + else: + return False + + def complement(self): + complement = "" + for nucl in self.seq: + complement += self.COMPLEMENTARITY_DICT.get(nucl) + return complement + + def gc_content(self): + return (self.seq.count('C') + self.seq.count('G'))/len(self.seq) + + +class DNASequence(NucleicAcidSequence): + COMPLEMENTARITY_DICT = {"A": "T", "T": "A", "C": "G", "G": "C"} + TRANSCRIBE_DICT = {"A": "U", "T": "A", "C": "G", "G": "C"} + alphabet = ["A", "T", "G", "C"] + + def __init__(self, seq): + super().__init__(seq) + self.seq = seq + + def transcribe(self): + transcribed = "" + for nucl in self.seq: + transcribed += self.TRANSCRIBE_DICT.get(nucl) + return transcribed + + +class RNASequence(NucleicAcidSequence): + COMPLEMENTARITY_DICT = {"A": "U", "U": "A", "C": "G", "G": "C"} + alphabet = ["A", "U", "G", "C"] + + def __init__(self, seq): + super().__init__(seq) + self.seq = seq + + +class 
AminoAcidSequence(BiologicalSequence): + alphabet = frozenset('ARNDCEQGHILKMFPSTWYV') + + AA_CHARGES = {"A": 0, "R": 1, "N": 0, "D": -1, "C": 0, + "Q": 0, "E": -1, "G": 0, "H": 1, "I": 0, + "L": 0, "K": 1, "M": 0, "F": 0, "P": 0, + "S": 0, "T": 0, "W": 0, "Y": 0, "V": 0} + + def __init__(self, seq): + super().__init__(seq) + self.seq = seq + + def check_alphabet(self): + if set(self.seq) <= self.alphabet: + return True + else: + return False + + def aa_chain_charge(self): + aa_charge = 0 + for amino in self.seq: + aa_charge += self.AA_CHARGES.get(amino) + return aa_charge From 3a2353c17ff50948937ea0fa4381da6811eb71aa Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:01:37 +0300 Subject: [PATCH 19/29] Add test_bioinf_utils.py --- test_bioinf_utils.py | 73 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 test_bioinf_utils.py diff --git a/test_bioinf_utils.py b/test_bioinf_utils.py new file mode 100644 index 0000000..89c4f55 --- /dev/null +++ b/test_bioinf_utils.py @@ -0,0 +1,73 @@ +import pytest +import os + +from bioinfUtils import genscan, GenscanOutput +from bio_files_processor import convert_multiline_fasta_to_oneline, select_genes_from_gbk_to_fasta + + +def test_genscan_status(): + inp = 'sequence_file="./data/SOWAHA.fasta"' + result = genscan(inp) + result = result.status + assert result == 200 + + +def test_genscan_content(inp): + result = genscan(inp) + result = len(result.cds_list) + assert result != 0 + + +def test_write_fasta_to_oneline(input_data, output_data): + convert_multiline_fasta_to_oneline(input_data, output_data) + assert os.path.exists(output_data) + + +def test_empty_fasta_to_oneline(input_data, output_data): + convert_multiline_fasta_to_oneline(input_data, output_data) + target = os.stat(output_data).st_size == 0 + assert target is False + + +def test_fasta_to_oneline_is_oneline(input_data, output_data): + 
convert_multiline_fasta_to_oneline(input_data, output_data) + result = 0 + target = 0 + with open(output_data) as file: + for line in file: + target += 1 + if line.startswith('>'): + result += 1 + result = result*2 + assert target == result + + +def test_fasta_to_oneline_no_output(input_data, output_data=None): + with pytest.raises(ValueError): + convert_multiline_fasta_to_oneline(input_data, output_data) + + +def test_gbk_fasta_len(input_gbk, genes, n_before, n_after, output_fasta): + select_genes_from_gbk_to_fasta( + input_gbk, genes, n_before, n_after, output_fasta) + target = len(genes)*(n_before+n_after) + + result = 0 + with open(output_fasta) as file: + for line in file: + if line.startswith('>'): + result += 1 + assert target == result + + +def test_gbk_empty(input_gbk, genes, n_before, n_after, output_fasta): + select_genes_from_gbk_to_fasta( + input_gbk, genes, n_before, n_after, output_fasta) + target = len(genes)*(n_before+n_after) + + result = 0 + with open(output_fasta) as file: + for line in file: + if line.startswith('>'): + result += 1 + assert target == result From 1ff2082bd0a1c24f6ae9925999f4a41872ae1696 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:02:40 +0300 Subject: [PATCH 20/29] Add Showcases.ipynb --- Showcases.ipynb | 1146 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1146 insertions(+) create mode 100644 Showcases.ipynb diff --git a/Showcases.ipynb b/Showcases.ipynb new file mode 100644 index 0000000..ce2c2db --- /dev/null +++ b/Showcases.ipynb @@ -0,0 +1,1146 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from bioinfUtils import genscan, GenscanOutput,BiologicalSequence, DNASequence, AminoAcidSequence\n", + "from custom_random_forest import RandomForestClassifierCustom\n", + "from bio_files_processor import OpenFasta\n", + "\n", + "from sklearn.datasets import 
make_classification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RandomForestClassifierCastom" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = make_classification(n_samples=100000)\n", + "random_forest = RandomForestClassifierCustom(max_depth=30, n_estimators=10, \n", + " max_features=2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 2.38 s\n", + "Wall time: 5.63 s\n" + ] + }, + { + "data": { + "text/html": [ + "
RandomForestClassifierCustom(max_depth=30, max_features=2, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifierCustom(max_depth=30, max_features=2, random_state=42)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time \n", + "random_forest.fit(X,y)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 31.2 ms\n", + "Wall time: 1.02 s\n" + ] + } + ], + "source": [ + "%%time \n", + "pred_1 = random_forest.predict(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 2.25 s\n", + "Wall time: 3.5 s\n" + ] + }, + { + "data": { + "text/html": [ + "
RandomForestClassifierCustom(max_depth=30, max_features=2, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "RandomForestClassifierCustom(max_depth=30, max_features=2, random_state=42)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time \n", + "random_forest.fit(X,y,n_jobs = 2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 15.6 ms\n", + "Wall time: 1.32 s\n" + ] + } + ], + "source": [ + "%%time\n", + "pred_2 = random_forest.predict(X,n_jobs = 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_1.sort() == pred_2.sort()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Genscan" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "BRCA2 = genscan(sequence_file = 'data/BRCA2.fasta', exon_cutoff=0.50, organism=\"Vertebrate\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BRCA2.status" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['1.01 Intr + 739 822',\n", + " '1.02 Intr + 3613 3861',\n", + " '1.03 Intr + 10778 10818',\n", + " '1.04 Intr + 11035 11149',\n", + " '1.05 Intr + 13979 14028',\n", + " '1.06 Intr + 15455 15566',\n", + " '1.07 Intr + 16808 17923',\n", + " '1.08 Intr + 19404 19493',\n", + " '1.09 Intr + 20801 25732',\n", + " '1.10 Intr + 31363 31432']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"BRCA2.intron_list[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['1 823',\n", + " '2 3612',\n", + " '3 3862',\n", + " '4 10777',\n", + " '5 10819',\n", + " '6 11034',\n", + " '7 11150',\n", + " '8 13978',\n", + " '9 14029',\n", + " '10 15454']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BRCA2.exon_list[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['GENSCAN_predicted_peptide_1|3160_aa\\n\\nXALATFRVLNVASGTGLDSTAVKCSHPHNLGPISLNWFEELSSEAPPYNSEPAEESEHKN',\n", + " 'GENSCAN_predicted_peptide_2|331_aa\\n\\nMNEALEYYECYNLLAIKFWIDLNEDIIKPHMLIAASNLQWRPESKSGLLTLFAGDFSVFS']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BRCA2.cds_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### DNASequence and AminoAcidSequence" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let`s transcride our sequence: UCUCCGCCUCGGCGACACCGUGACGACGCGGAGACGACGCGGAGCCCACAGAAAACGCCGCCACCCAGCGGCGGCCCUCUUCGCACUCCCCUGUCUAAACACUGGCCGCGCCAAAAACAGUCGAAUGAGGCCGGUUUUUUCUUGACGUGGAGACCUCGCCCAAUCACCACCACCAUCACCCAACCCUGCUCGCGCAGAAGGCGUCAGGGUCAGGUCGCACCGCCCCCUCGCGGAGUGCGGGGCCCAGCGACGGCGCCGAAGAACGGGAAAACAGAGACGGUUGGGGGUGGGUACGGACUCUCUUUCCAGGAACGGGCUUCCGUCUAAAAGCGGUUCGUUUAAGCUCGGGGCGGGGAAGGGACCCAGAGGUAAAGGGCGGAGGCCGGGCCGGAAACCCGAGGCGGAAGUCGAGUUCUGAAUUGAAGGGAGGGUCGACAGGGUCUACUGCGGUAGACUUUAAAGAACCUUUGUGCUAGUGAAAUU\n" + ] + } + ], + "source": [ + "seq = 
'AGAGGCGGAGCCGCTGTGGCACTGCTGCGCCTCTGCTGCGCCTCGGGTGTCTTTTGCGGCGGTGGGTCGCCGCCGGGAGAAGCGTGAGGGGACAGATTTGTGACCGGCGCGGTTTTTGTCAGCTTACTCCGGCCAAAAAAGAACTGCACCTCTGGAGCGGGTTAGTGGTGGTGGTAGTGGGTTGGGACGAGCGCGTCTTCCGCAGTCCCAGTCCAGCGTGGCGGGGGAGCGCCTCACGCCCCGGGTCGCTGCCGCGGCTTCTTGCCCTTTTGTCTCTGCCAACCCCCACCCATGCCTGAGAGAAAGGTCCTTGCCCGAAGGCAGATTTTCGCCAAGCAAATTCGAGCCCCGCCCCTTCCCTGGGTCTCCATTTCCCGCCTCCGGCCCGGCCTTTGGGCTCCGCCTTCAGCTCAAGACTTAACTTCCCTCCCAGCTGTCCCAGATGACGCCATCTGAAATTTCTTGGAAACACGATCACTTTAA'\n", + "dna = DNASequence(seq)\n", + "print(f'Let`s transcride our sequence: {dna.transcribe()}')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let`s measure the total charge of our peptide: -2\n" + ] + } + ], + "source": [ + "aa_seq = 'EEERQKDKRSNEEESSRRD'\n", + "peptide = AminoAcidSequence(aa_seq)\n", + "print(f'Let`s measure the total charge of our peptide: {peptide.aa_chain_charge()}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 02720781c4d28b231e7e4ee8eadf90f552cae8a7 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:04:07 +0300 Subject: [PATCH 21/29] Add custom_random_forest.py --- custom_random_forest.py | 69 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 custom_random_forest.py diff --git a/custom_random_forest.py b/custom_random_forest.py new file mode 100644 index 0000000..512f22f --- /dev/null +++ b/custom_random_forest.py @@ -0,0 +1,69 @@ +import random +import threading 
+import numpy as np +from sklearn.base import BaseEstimator +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from sklearn.tree import (DecisionTreeRegressor, + DecisionTreeClassifier) + +SEED = 111 +random.seed(SEED) +np.random.seed(SEED) + + +class RandomForestClassifierCustom(BaseEstimator): + def __init__( + self, n_estimators=10, max_depth=None, max_features=None, random_state=SEED + ): + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_features = max_features + self.random_state = random_state + + self.trees = [] + self.feat_ids_by_tree = [] + + def fit(self, X, y, n_jobs=1): + + self.classes_ = sorted(np.unique(y)) + + with ThreadPoolExecutor(n_jobs) as pool: + temp_pool = [] + for i in range(0, self.n_estimators): + + np.random.seed(self.random_state + i) + + idx = np.random.choice( + X.shape[1], size=self.max_features, replace=False) + self.feat_ids_by_tree.append(idx) + pseudo_idx = np.random.choice( + X.shape[0], size=X.shape[0], replace=True) + X_sample = X[pseudo_idx][:, idx] + y_sample = y[pseudo_idx] + + tree = DecisionTreeClassifier( + max_depth=self.max_depth, max_features=self.max_features, random_state=self.random_state+i) + + temp_pool.append(pool.submit(tree.fit, X_sample, y_sample)) + self.trees.append(tree) + + return self + + def predict_proba(self, X, n_jobs=1): + y_pred = np.zeros((X.shape[0], len(self.classes_))) + + with ProcessPoolExecutor(n_jobs) as pool: + + temp_pool = [pool.submit(tree.predict_proba, X[:, self.feat_ids_by_tree[i]]) + for i, tree in enumerate(self.trees)] + + for p in temp_pool: + y_pred += p.result() + + return y_pred / len(self.trees) + + def predict(self, X, n_jobs=1): + probas = self.predict_proba(X, n_jobs=n_jobs) + predictions = np.argmax(probas, axis=1) + + return predictions From 3a0366e84c74fc027b864bd6ea09c7d927a3d61a Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:07:08 +0300 Subject: [PATCH 
22/29] Create data folder --- data/. gitkeep | 1 + 1 file changed, 1 insertion(+) create mode 100644 data/. gitkeep diff --git a/data/. gitkeep b/data/. gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/data/. gitkeep @@ -0,0 +1 @@ + From 3825362b73c36af6a88fb4829f4c21b54246c231 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:07:34 +0300 Subject: [PATCH 23/29] Add BRCA2.fasta --- data/BRCA2.fasta | 1213 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1213 insertions(+) create mode 100644 data/BRCA2.fasta diff --git a/data/BRCA2.fasta b/data/BRCA2.fasta new file mode 100644 index 0000000..d708fd6 --- /dev/null +++ b/data/BRCA2.fasta @@ -0,0 +1,1213 @@ +>NC_000013.11:32315508-32400268 Homo sapiens chromosome 13, GRCh38.p14 Primary Assembly +AGAGGCGGAGCCGCTGTGGCACTGCTGCGCCTCTGCTGCGCCTCGGGTGTCTTTTGCGGCGGTGGGTCGC +CGCCGGGAGAAGCGTGAGGGGACAGATTTGTGACCGGCGCGGTTTTTGTCAGCTTACTCCGGCCAAAAAA +GAACTGCACCTCTGGAGCGGGTTAGTGGTGGTGGTAGTGGGTTGGGACGAGCGCGTCTTCCGCAGTCCCA +GTCCAGCGTGGCGGGGGAGCGCCTCACGCCCCGGGTCGCTGCCGCGGCTTCTTGCCCTTTTGTCTCTGCC +AACCCCCACCCATGCCTGAGAGAAAGGTCCTTGCCCGAAGGCAGATTTTCGCCAAGCAAATTCGAGCCCC +GCCCCTTCCCTGGGTCTCCATTTCCCGCCTCCGGCCCGGCCTTTGGGCTCCGCCTTCAGCTCAAGACTTA +ACTTCCCTCCCAGCTGTCCCAGATGACGCCATCTGAAATTTCTTGGAAACACGATCACTTTAACGGAATA +TTGCTGTTTTGGGGAAGTGTTTTACAGCTGCTGGGCACGCTGTATTTGCCTTACTTAAGCCCCTGGTAAT +TGCTGTATTCCGAAGACATGCTGATGGGAATTACCAGGCGGCGTTGGTCTCTAACTGGAGCCCTCTGTCC +CCACTAGCCACGCGTCACTGGTTAGCGTGATTGAAACTAAATCGTATGAAAATCCTCTTCTCTAGTCGCA +CTAGCCACGTTTCGAGTGCTTAATGTGGCTAGTGGCACCGGTTTGGACAGCACAGCTGTAAAATGTTCCC +ATCCTCACAGTAAGCTGTTACCGTTCCAGGAGATGGGACTGAATTAGAATTCAAACAAATTTTCCAGCGC +TTCTGAGTTTTACCTCAGTCACATAATAAGGAATGCATCCCTGTGTAAGTGCATTTTGGTCTTCTGTTTT +GCAGACTTATTTACCAAGCATTGGAGGAATATCGTAGGTAAAAATGCCTATTGGATCCAAAGAGAGGCCA +ACATTTTTTGAAATTTTTAAGACACGCTGCAACAAAGCAGGTATTGACAAATTTTATATAACTTTATAAA 
+TTACACCGAGAAAGTGTTTTCTAAAAAATGCTTGCTAAAAACCCAGTACGTCACAGTGTTGCTTAGAACC +ATAAACTGTTCCTTATGTGTGTATAAATCCAGTTAACAACATAATCATCGTTTGCAGGTTAACCACATGA +TAAATATAGAACGTCTAGTGGATAAAGAGGAAACTGGCCCCTTGACTAGCAGTAGGAACAATTACTAACA +AATCAGAAGCATTAATGTTACTTTATGGCAGAAGTTGTCCAACTTTTTGGTTTCAGTACTCCTTATACTC +TTAAAAATGATCTAGGACCCCCGGAGTGCTTTTGTTTATGTAGCTTACCATATTAGAAATTTAAAACTAA +GAATTTAAGGCTGGGCGTGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGTGGGCGGATC +ACTTGAGGCCAGAAGTTTGAGACCAGCCTGGCCAACATGGTGAAACCCTATCTCTACTAAAAATACAAAA +AATGTGCTGCGTGTGGTGGTGCGTGCCTGTAATCCCAGCTACACGGGAGGTGGAGGCAGGAGAATCGCTT +GAACCCTGGAGGCAGAGGTTGCAGTGAGCCAAGATCATGCCACTGCACTCTAGCCTGGGCCACATAGCAT +GACTCTGTCTCAAAACAAACAAACAAACAAAAAACTAAGAATTTAAAGTTAATTTACTTAAAAATAATGA +AAGCTAACCCATTGCATATTATCACAACATTCTTAGGAAAAATAACTTTTTGAAAACAAGTGAGTGGAAT +AGTTTTTACATTTTTGCAGTTCTCTTTAATGTCTGGCTAAATAGAGATAGCTGGATTCACTTATCTGTGT +CTAATCTGTTATTTTGGTAGAAGTATGTGAAAAAAAATTAACCTCACGTTGAAAAAAGGAATATTTTAAT +AGTTTTCAGTTACTTTTTGGTATTTTTCCTTGTACTTTGCATAGATTTTTCAAAGATCTAATAGATATAC +CATAGGTCTTTCCCATGTCGCAACATCATGCAGTGATTATTTGGAAGATAGTGGTGTTCTGAATTATACA +AAGTTTCCAAATATTGATAAATTGCATTAAACTATTTTAAAAATCTCATTCATTAATACCACCATGGATG +TCAGAAAAGTCTTTTAAGATTGGGTAGAAATGAGCCACTGGAAATTCTAATTTTCATTTGAAAGTTCACA +TTTTGTCATTGACAACAAACTGTTTTCCTTGCAGCAACAAGATCACTTCATTGATTTGTGAGAAAATGTC +TACCAAATTATTTAAGTTGAAATAACTTTGTCAGCTGTTCTTTCAAGTAAAAATGACTTTTCATTGAAAA +AATTGCTTGTTCAGATCACAGCTCAACATGAGTGCTTTTCTAGGCAGTATTGTACTTCAGTATGCAGAAG +TGCTTTATGTATGCTTCCTATTTTGTCAGAGATTATTAAAAGAAGTGCTAAAGCATTGAGCTTCGAAATT +AATTTTTACTGCTTCATTAGGACATTCTTACATTAAACTGGCATTATTATTACTATTATTTTTAACAAGG +ACACTCAGTGGTAAGGAATATAATGGCTACTAGTATTAGTTTGGTGCCACTGCCATAACTCATGCAAATG +TGCCAGCAGTTTTACCCAGCATCATCTTTGCACTGTTGATACAAATGTCAACATCATGAAAAAGGGAAAT +GATTCCATAGCGTTATTATGAAAGTAGTTTTGAACTGTAATGGTAGAGGATGAATAGCTCACAATACAAA +TTTGTCATTTCCCTTTAAGAGAGAATTCCCATTTTATGTGAGAGTCCACATGTTCCTCATACCCATAGTT +TGCCACATCTTGAGTACTCTTCAGAATTATTTGAATTTTTTGAATTTTATCTGTGGAATGTATTTTTTTT 
+TTTTTCTTTTTTGAGACACAGTCTTGCTCTGTTGCCCAGGCTGGAATGCAGTGGCGTGATCTCGGCTCAC +TGCAACCACCGCCTCCTGGGTTCAAGTGATTCTCCTGTGGCAGCCTCCGGAGTAGCTGGGACTACAGGCG +TGTGCCACCATGCTTGGCTAATTTTTTGTGTTTTTAGTAAAGATGGGGTTTCAACGTGTTAGCAAGGTTG +GTCTCGATCTGACCTCGTGATCTGCTCGCCTCAGCCTCCCAAAGTGTTGGGATTACAGGCGTGAGCCCCC +GCACCTGGCCGAATTTTATCGTGGAATGTATTCTTAATGTGAATAGTTTTTGATTCCGAACCATGAATAA +TAAGAAAATAAATAAAATTTAAATGAAAATAAAAGCTAATATATACAGCTTTTAATAATATAGTTAAATG +CCATCTTGTAACTTTTGTGAACTCTTGTTACACCTTTCTATAGATTCGCAAGAGAATGGATTAATGATCT +TGTTTAATTAATATGCCTTAACAAAAGTAATCCATAGTCAAGATCTTAAGCATTTTTTTCCTTATGATCT +TTAACTGTTCTGGGTCACAAATTTGTCTGTCACTGGTTAAAACTAAGGTGGGATTTTTTTTTTAAATAGA +TTTAGGACCAATAAGTCTTAATTGGTTTGAAGAACTTTCTTCAGAAGCTCCACCCTATAATTCTGAACCT +GCAGAAGAATCTGAACATAAAAACAACAATTACGAACCAAACCTATTTAAAACTCCACAAAGGAAACCAT +CTTATAATCAGCTGGCTTCAACTCCAATAATATTCAAAGAGCAAGGGCTGACTCTGCCGCTGTACCAATC +TCCTGTAAAAGAATTAGATAAATTCAAATTAGACTTAGGTAAGTAATGCAATATGGTAGACTGGGGAGAA +CTACAAACTAGGAATTTAGGCAAACCTGTGTTAAAATCTTAGCTCATTCATTAATTGTGTCATGCTGGGC +AAATCAGTCTCTCTGGCCTCTTTTTCCTCACTCGAAAAATGGAGACGATGAAAATAATGTCTCATAGGTT +TGGATTAAATTAAATAATGTAGGTACTTAGTAAATGTTCTCTTTCATCCCTCCTTTGATAAATTTGCCAA +CTGAGATTTGCTGAATTACGTCTTTCTTATGCCAAAAAAACCTAGGACTTGTTTTGATGTTAATTAAACT +AAACTATATTTCTGCAAGCTATCACAGAGGACAGAGATTATTTTACCGATATACTATAAGTATCATGATT +TGGAAGGAGTTTCCCTGGCGTAGGTGCCGCATGTTTCTAAGCAATTATGTAATAAGATTATATATTCAGT +CATTCAAATAATTATTACCTACTTGACATAAGTAATGAACTTTCCCTTTTCTTCAGAGTGTTAATCTCTA +GTAAGGGGAATAAAGAGTACACAGATAAAGTATAGTGTAAGGTTGAATGTAGTATGTGCTAAGAGAAAAA +TATAAAAAAGTATAATGAGAGTTGAGAAGAAAGAGCAAATAGTATTGGGCAAAGTTAGGCAATTATTCCT +TTGAGCTAAACCTTGAAGGATAGGTGAGAGATTAAGAAATTTGAAGATGTGGTAGAGTGATAATGTTCTA +GGCAGAGGGAACAACATGAGGAAGAATATGTAGTGTGTTCAGGAAATAGCAAGTAATTCAGGTTGGCTTT +GGTTGTTTTGTGTCTGAAAGGGACCAATAGACAAGGCAAAAAGGCAGACTAAAGGCAGGCATTGAATGCC +AAGCTAAAGAAATTGAATTTGTTTGGTTGGTTGGTGAGCAGAGAAATCACATGCAAATTTCATCATGCTA +CTTATTGTGTCAAACCTTAGATCACCTCCCTTTGTCCTTATAGCAAAATCTAAACTTGATATGGCTTTCA 
+AGTTCCTTTGTGATCAGGCCCCTGATTTACACTCTTGGCTCAGCTTGCCATATTCATCCTCTCACCTATC +TTCATTTGCCATTCATTCCTACTGAATTTCTTTTCGTTACCAAAACCACAATGCTCTCTGGCTCTTTATT +AAACATATTGTTACCTCTACCCACAACCTACTTTTTCCCTACTTTTTGTCTAGCTAATTTGCGTGCTCGT +CTTTCAGATCTTGGCTTATTTCTGCTTCTGAGAAATACTTCCTGTCTGCCCTCGTTGAGCTTCTAGTGAA +GGAGACATACATAAGCAATTATAGTGTGATACATGCTTTGAAAGAAATTCATGGCTATAGGGAGTGCATA +TACAAAGGGAATATAGGTAATGGGCAAATATTTACATGTATGTTATTGGATACCAAATGGTATACATAGG +ATTCAGTAAATATTTGTAGAGTGAGTATTAGTATTATTTGCTTTAGAAAGCCTAATGATCAAACAGCAGT +CTTTGGAGATAACGTTTTTCAAAATGTCATGTCTGTGCCATTAGAATCTTCTAGACTGCTCATTGAAAGG +ACAGATTCCAGGCCCCACTCTGAATCTCTTAATTTATAATTTTTGGAAATGATGCCCATGAGTCTACATT +TTAAACTACCTGAATGATCCCTATAGAAAGAGAAAACTGGAGGTAGGAAGATCAGTTAGGGGATGTGTAA +TGGTCTAGGTGATAGAGACAAGTGCCTGAATTACAGTAATAACAGTGAAAGTAAATATGGAACATAAAAC +TATAGGACCTTGCAGTAGTCTAGATATGGAGGATTCAAAAAAAGGAACAAATGACAGGGCAAAGCATATG +CAGAACACAGTAGTAACAGTCATAGAAATGGATAAGGGAGTCATCCATTCTGCAAATACTTAGTGCTTAC +TTGTGTCTGGCAACCTGCTCGGCATTAAGGATACAAATATGAATAAGATGTCCTTTGACCTCTAAGTACT +CAGTCTCGTAAGCACGTCTTGTAAGCACATCTTGGTTGCTTCCATAAAAATAAATACACTAGTGTGATAT +GTTATAAGAGCATGTACCAAGTGCATGAAAAGTGAGCAGCCATCTCTGGTTGGTCAGAAAAAGCTCCATA +AAGCAGTTTTTGCTGAATCTTGAAAGATATACCTAAGGTCAAATGGTTAATTCTTTAATCATAACCTGCT +AGAATTGATCTATAACCAAGGAAGGATAGTAAGGAATTAATAAGGCCACTCTCAACTCACTGCAAAGGAG +TTAACTTTTTGAAGGCTGTAATACATAAATCTGCTGACTAGTCTCTTGAGACCTTTTGCTTTTACGTTTA +CTTTAGATTCAGTATTGAAAAGTAAGAGTAATGGACTTAAGCTGTGTTTTTCAACCTGTTTTGTTCAGTT +CTAACATGTAATATTTTTTAAAAAATTATTCCTAAAGTTCTATGAGGAATTGTGCTGTTTCTGCCTCTCA +GCAGTCCTTCCTTTTGCATTAAATCATAGGCATTTCTGTTACCATTCTTCAGCTTATTAATGAGATCCTC +AGGTTATTTGGGAAATGTTTATTTGGTAATTAACTCTTTTTCACCTAGTTCATTTTTTTAACTTTTTTTT +TTAAATAGCCGAGTTTCTTTTCATTGCTGAACTAAAATGGATGTGTTATTATTAGCTGAACTCCTTAGTT +TACTTTAGAGTTCACCCTTTGTATGGTTCTATGGATTTTGACAAATTGTATAATGTCGTATATCTGCCAT +TATGGCATTATACAGAATAATTTTGCTGCCCTAAAAATCTCCCGAGTTCCACCTGCTCACCCATCCCTCC +TCCTGAGCCCCTGGCAGCCACTGATCTTTTTACTGTCTGTATAGTTTTGCCTTTTCCAGAATGTCATGTA 
+GTTGGAATCATACAGAATATAGCATTTTCAGACTGGCTTCTTTCACTTAGCAATATGCCGAGACCAGCTC +GATTGTAGAGACCCTAACCCAGCGGCACTAGAGGAATTAAAGGCACACAGAAATATAGCGGTGTGGAGTG +GGAAATCAGGGGTCTCACAGCCTTTTGACAGCAAGCCAGTGATAAGCATTGTTTCTATAGATTATAGATT +AACTGAAAGTATTCCTTAGGGGAAATAAAGGGCTGGGCCGAAGTAAAGGGATGGGTCTGGCTAGTTATCT +GCAGCAGGAGAATGTCCTTAAGGCACAGGTCGCTCATGATAGTTTGTGGTTTAAGAACGCCTTTAAGCGG +TTTTCTGCCCCGGGTGGGCCAGGTGTTCCTTGCCCTCATTCCGGTAAACCCACAAGCTTCCAGCGTGGGT +GTCATGGCCATCACGAACATGTCACAGTGCTGCAGAGATTTTGTTTATGGCCAGTTTTGGGGCCAGTTCC +CAACAGCAATATGTGTTTAAGGTTCTTCCATGTCTTTTAATGATTTCATGCTGAATAATATTCCATCGTA +TTGATGTACCACAGCTTGTTTATCCATTCATCTATTGAAGGACATCTTGATTGCTTCCAAATTTTGGCAA +TTATGAATAAAGCTGGTATAAATATTCACATACAGGTTTGTGTGTGAATATATTTTCAACTCATTTTGGT +TCACACCAAAGAGCACGATTGTGGGATCATATAGTAAGAGTATGTTTAGTTTTATGAGAAACTACAAGCT +TTCTTCCAAAGTAGCTGTTGCATTTTGTATTCCCACCAGCAGTGAATGAGAGTTCTTGTTGCTCACATCC +TCACCAGCATTTGGTGTGTCAGTGTTTTGAATTCTAGCCATTCTAACAAGTGTGTAGTGGTACCTCATTG +TTTGTTTTATTTAATTTTTTTTTTTTTTTTTTGGAGATGAAATCTCGCTTTGTCGCCCAGGCTGGAGTGC +AGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCAGGTTCACGCCATTCTCCTGCCTTAGCCTCCT +GAGTAGCTGGGACTACAGGCACCCGCCACCACACCTGGCTGATTTTTTTGTATTTTTAGTAGAGACGGGG +TTTCACTGTGTTAGCCAGGATGGTCTTGATCTCCTGACCTCGTGATCCGCTCGCCTCGGCCTCCCAAAGT +GCTGGGATTACAGGCGTGAGCCATCATGCCCGGCCTGTTTTATTTTTTAAAGTCAATTTTCTTTCAAGAA +TTAGCTACTTTTTAGTATCTTTAATTAAAAATCTCATTAGAGAAGGAGGTTGGATATTTTGTTGAAGTGG +GGTTTTTAAGTTACACATCCATTTGCTTTATTAGTGATTATGTCTAGTCCATGTTAACTTGAAAAATGAG +ACTATAATGAGACATTTTATTTAGGCTGCTACAAACAGTTTTAAATTTGGTCTTCACTTTATTTTAGTAA +CATTGATAGAGCTTATTTTTCCCAAAAGCTAAGTTAGAGATTATAGGACCAACCGAAGCAACTATTTTCT +AAGAGTAATAATAAGTGACTCAGGTGCCAAATTTGTAGTTACCATCAACTATTGGAACCATATGAGTACT +TAATGCCCTGGAGAGTCAAATATAATCTACTCTAATACAGAAAATAGAAATATTGAAAAACTGTAAATTG +GATTTCATATTGTTAAAGCCACCTATAGCTTTAGAAACTCTGAACATTATTTTCTTAGAAAATGGATGTG +TTCAATAAGAATAGAAATTATGTATTACTGTCTGCAACTCACTTTGTCTAATTATATCCAATTTATTCAT +CCAGTCAATATTTCAGGAGTGACTAATATACCAGACATTTTTGTAGTTGCTAGGGATACAGTGACAAATA 
+AGACAAAATCTCTACCTCAGATTGCTCACAGCCTAGTAGGGGGAAAAAGAACAGTGTATGATCAAACTCT +TCAGGGAACACATAGGGGGGCAAACACTTAATCTTACCTTAGGGATCACTACAGTTTTCTGGAGGAGGTA +GTTTCTAAATGGAAGCCTGAAAGAGTTGTTCCAGGTCAAGAAAAGCAAAGAAGGGGAAACAGCTTGTACA +AAGTCCTAGAGGTTAAAGAAAACATTCTTTCAGGATATGCAAATGGTTGGGTATGGGTAAAAAGTAGACT +GTAAAAGAATGGCATCATAAAAATTAAGTAAATTGTCACATAAATATATATATTTCTTATGTACCCACAA +AAATTAAAAATGAAGAAATTAAGTAAATTGTGAAAGGCCTTCATACTATGGAGTTTGACTTGATCTTGAA +AAGTAAGATCTTGAAAGGTTTTTAGCACAAGTGATATTGTCAGATCTGGTACATTGGTAGGTTTTCAGTA +AATGTCTTCCCTTACTCCTTTTTTCTCTTTCCTTCTGCTTTTGTTTAAAGCGACAAGATGTTGCTCTTTT +CCCAGGCTGGAATACAGTGGCATGATCATAGCTCAAGCTCCTGGGCTCAAGTGATCCTCCCGCCTCAGCC +TCTCAAGTAGCTAGGACTACAGGCATATCACCACACCAGCGTTTTCTTTGTAGAGGCAGAGTCTCACTCT +GTTGCTCAGGCAGGTGTTGAACTCCTGCCTCAAGCAATCCTCCCACCTCAGCCTCCCAGAGCCCTCAAAT +TATAAGCCACTGTGCTCGGGGCATCCTTTTTGGGGGGTAATCAGCAAACTGAAAAACCTCTTCTTACAAC +TCCCTATACATTCTCATTCCCAGTATAGAGGAGACTTTTTGTTTTTAAACACTTCCAAAGAATGCAAATT +TATAATCCAGAGTATATACATTCTCACTGAATTATTGTACTGTTTCAGGAAGGAATGTTCCCAATAGTAG +ACATAAAAGTCTTCGCACAGTGAAAACTAAAATGGATCAAGCAGATGATGTTTCCTGTCCACTTCTAAAT +TCTTGTCTTAGTGAAAGGTATGATGAAGCTATTATATTAAAATATTTAAATGAAACATTTTCCTACATAT +ATTTGTTCTATAAAGATGAATCTGATTTTTATGCTAATATTTTGGCTAAGAGCCTGGTAGAAGATCTTAC +ATTTTTAAATAATCTTTTAGGTTGAGTCCTTTAATAGAATAGTTTTTACATTAGAAACATGTAAGTTGTT +GTTCTTGTGATGTTGAATTGGCTGGTTTTCTGTATATTCTGTGATTTTTTAAGTAACAAAAATAACAGTG +GTGAAAAGCAGTAAGTCAGTCCTTGAATTATCAATTTAAAATAAATTGTGTACTTTTCATCTTTGGAGAG +AATATGATTTACTTTACAAATTTTTTTTTTGTTTTTTTTTTTTTTGAGATGGAGTCTCTGTCACCCAGGC +TGTAGTGCAGTGGTGCGATCTCAGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTC +AGCCTCCCAAGTAGCTGGGACTACAGGCGCCCGCCACCATGCCCGGCTAATTTTTTGTATTTTTAGTAGA +GACGGGGTTTCACCGTGTTAGCTAGGATGGTCTCGATTTCCTGACCTCGTGATCCGCCCGCCTCAGCCTC +CCAGACTGCTGGGATTACAGGCGTGAACCACTGTGCCCGGCCTACTTTACAAAATTTTTGAGTTTAAAAT +ACACGGTTTCCAGCAGCTGAAATTTGTGAGTACATATGTGTTGGCATTTTAAACATCACTTGATGATTAT +TTAATGCTTCATGAGAGATTTACTTTTTAAAATGTAATATAAAATATCTAAAAGTAGTATTCCAACAATT 
+TATATGAATGAGAATCTTCTTTTAAAAATAAGATAAACTAGTTTTTGCCAGTTTTTTAAAATAACCTAAG +GGATTTGCTTTGTTTTATTTTAGTCCTGTTGTTCTACAATGTACACATGTAACACCACAAAGAGATAAGT +CAGGTATGATTAAAAACAATGCTTTTTATTCTTAGAATACTAGAAATGTTAATAAAAATAAAACTTAACA +ATTTTCCCCTTTTTTTACCCCCAGTGGTATGTGGGAGTTTGTTTCATACACCAAAGTTTGTGAAGGTAAA +TATTCTACCTGGTTTATTTTTATGACTTAGTAATTGAGAATTTGACAATAGCGTTATACCTTTGCCCTGA +GATTTACAAATCTGTACCTAGCATTCTGCCTCATACAGGCAATTCAGTAAACGTTAAGTGAAATAAAGAG +TGAATGAAAAAATAATATCCTTAATGATCAGGGCATTTCTATAAAAAATAAACTATTTTCTTTCCTCCCA +GGGTCGTCAGACACCAAAACATATTTCTGAAAGTCTAGGAGCTGAGGTGGATCCTGATATGTCTTGGTCA +AGTTCTTTAGCTACACCACCCACCCTTAGTTCTACTGTGCTCATAGGTAATAATAGCAAATGTGTATTTA +CAAGAAAGAGCAGATGAGGTTGATAATTGTCATCTCTAATACTTCTGTTAAAAGGAAATATGAAAAGAAA +ATATTAGATAATGTCTTTGATAAGTGTGTTAGTAACTGACAATAATTTTATTCTATTAAGTGTAGATTGG +AATAAATACAAATACATTTAGTGGTAGTCCAGTGGTGTCAAGCATTATGTTTTAGTACGATGTGATTAAC +GTAGAATAGCTTACAAATATTCCTTTACTGGCCTATATAAGCGTTTAAGAGGCAGTATTTGGTGTGACTG +AATTCTTTTTACAAATGATTGTGGTAATTGGGGCATTAAAGCAGCATTAAATAAGCTTTTGTTTTCTCTA +CTTAAATGTGTTCTAAGGTCTGTATTGCCAGTAGTACTGAATTGAGGTCTTAAATTCCACAAGTGTAATT +ACACAACTATGTGATAAACTGCAATATTTATCCATTCATTAAACTGTAAACTCTTTGCAGTCTCACCACA +GTTTCTCTTACTAGGATCTAGAAATATTTCCTATTGTAGGCTGGTTGCAGTGGCTCACGCCTGTAATCCC +AACACTTTGGGAGGCTGAGAAGGGTGGATCACGTGAGGCCAGGAGTTTGAGAGCAGCCTGTACAACGTGG +TGAAACCCTGTCTCTACTAAAAATAAAAAAATTGGCCAGGTGTGGTAACACACACCTGTAATCCCAGCTA +CCTGGGGGCTGAGGCATGGGAATTGCTTGAACCTGGGAGGCAGAGGGTGCAGTGAGCCGAGATTGTGCCA +CTGCACTCCAGCCTGGGTGACAGGGAGGCTGAGGTGGGAGGATCACGAGGTCAGGAGATCGAGACCATCC +TGGCTAACGTGGTGAAACCCTGTCTCTATTAAAATAGAAAAAATTAGCTGGGCGTGGTGGCAGACACCTG +TAGTCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATGGCATGAACCCGGGAGGCGGATCTTGTAGTGATC +TGAGATCATCACGCCACTGCACTCCAGCCTGGGCAACAGAGCAAGACTCTGTCTCAAAAAAAAAAAAAAA +TCCTGTTATAAAACTACTTAAAAATCTCTGAGTAGCTGAGATTTGGCTAATCATGACTTAGTATTTGAAA +AGTTGTGACTATTTTTTTTTTTTTTAATTGAGACAAGGTTCTTCTCTGTTGCCCAGGCTGGAGTGCAGTG +GCACCGTCGCAGTTCACTGCAGCCTCAACCTCCCAGGCTCAATTAATCTTTCTTCCTCTTAGCCTTCCAA 
+GTATCTGGGACTACAGGTACCATGCCACCAGTATACTACCAGTCCTGGCTAATTTTTTTTGTATTTTTTG +TAGAGATGGGTCCCGCCATGTTGCCCACACTTGTCTCAAATTCCTGAGCTCAAGCAGCCACCACACCCAC +CTGTGACCATTCTTTTTTATTTTTATGAGATAATAAACATACAAGTTTAAAGAAATGTCTGTACATAAAT +GTGATTATAGTACAAACAAGTATTTGGAAGTTCATCTAAACAAATGCATCACAGTTTATAGGCAAAACAT +GAAAGATTGGATAATAATGGGAAAAAAAGTAAATATTCACCAACATTCTTTCTCTTTTTTCTTTTTCGTT +TTTTTTTTTTTTGAGGCGGAGTCTTGCCCTTTTGCCCAGGCTGGAGTACAGTGGCACCATCTCGGCTCAC +AGCAACCTCTGCCTTCTGGGTTCAGGCGATTCTCCTGCCTTAGACTCCCGAGTAGCTGGGATTACAGGCA +CCCACCACCACGCCTGACTAATTTTTGTATTTTTAGTGGAGAGGAGGTTTCACTGGGTTGGCCAGGGTGG +TCTTGAACTCCTGACCTCAAGTGATTCGTTTGTCTCAGCCACATTTTTTTTGTCTAAGAAGATACTGGGC +CAGATCATTGTTTCTCAAATTGCAGATTATGACCTGTTCATAGTTGTGAAACTTATTTTGTGAGTCGTAT +ATGCTCTTTTTAAATGAAATGAAAATTCTGAGTACATCACATGTAGTTAGGGTTTAGAAAATAAAAAATA +CAATATATCTAGTTAAATTTGGACTTCAGGTAAACAGCGAATAATTTTGAGATATACTTAACACTAAAAA +ATTATTCATTGTTTATCTGAAATTCAAATTTAATGAGGTGTCCTGTATTTTATCCAGAAGTCCTACACAC +AGTAAAGTTTGTTTTGTAAAACTTTTTTACTTAACCTTTGTGTGCCCATGTGTGTGTGCAGTCATAAAGT +GTGTGTGTGTGTGTGTATTTAAAAAACTAGGTTGTACTCAAAGCCTGAGCTTAATTTATTCCCAAACCAG +TATTACATTTTGTTTATTCTAGCAAAATAGCATTCTGTTTTGATTCCTCTTTAGCTGGGAGTAAGTTAAC +CCTATTCTGTTGCTTAGATGAAATAATATGGATAAAATCATTTTGAAAATATGTATTTAATATATAGTAT +GCCTTTAGGCTGTAGTGTTGTCTAAATGAATGCTAAAGTCTCCAAGCTTTAGCTTTTAAGTCATAACCTC +ACAGCATCATCTGACTTTCCAACTCATTGTGGACAGTATTACCATAAAGTAATGATCACCAAGCCATATC +TTACCACCTTGTGAGTAGTACTAAGGAAGTAAGTATAGTTTATTCACTGTGTTGATTGACCTTTCTAATT +ACTATACTTAAGTACTTGAATCAATTCATTTTGTTTCAAATGTGTCATGTAATCAAATAGTAGATGTGCT +TTTTGATGTCTGACAAAAAATAAGTTTTTGCATTCTAGTGATAATATACAATACACATAAATTTTTATCT +TACAGTCAGAAATGAAGAAGCATCTGAAACTGTATTTCCTCATGATACTACTGCTGTAAGTAAATATGAC +ATTGATTAGACTGTTGAAATTGCTAACAATTTTGGAATGCCTTGTTAAATTATTTATCTTACATTTTTAA +TTTCCTAATCTGTAATTTATCTAAGCCTTTGAGAAAGTCTCTAAACCTGGTCCTATATGTGATTTTAACT +TCCTGTGAAACTCTGCTGTCTCTCTGTTAAAGTTGCATATATACAATATATACCGTAGTCCCCTATTCAT +GGGGTATACATTCCAATATCCCCCAGTGAATGCTTGAAACCTTAGATAGTACCGAACCCTATATATATAT 
+ATTAAAAATGTGTAGTATTTATATATATATACCTATAATCTTTTTTTCTATAAGCACATACCCTGTGATA +AAGTTTAATTCATAAATTAGGCACAGTAAGAGATTAACAAGAACTAATAATAAAATAGGACAATTATAAC +AAAATACCGTAATAAAAGTTATGTGAATGTTGTCTCTCTGTCTCAAAATATCTTATTGTTCTGTACTCAT +GTGGCAGCAGCTTCATCAGCAGATGTGGCCTCTCCAGTAATTTTTATATTTTTCAGTCCAAACCTATTCT +TGAATCTGTGTAACCAACCATCCCTTACTTGCAGTAAATGGCTTGGTGTCATTCATTTCAGGGGATCCCT +TACTGAAGTTTTCGTTTAGGCTCTATGCTTTCTGGCGTAATATGTAGCTGTCAATCAAAACAACCTGTTC +ATGTTTTCTACCCACAAATGTAATACCTTTTCTACTTCTATGGTGCACTGTGTGGCCACAACATTTGCAG +TTTGAGGTGTGACAGCAAAACCAGCTCATATGTCTTTCTCCTTCACAATCTCACAGATAGATTTGTTCTT +ACCATAGATGTCGCAGTACAATTTTTTTCCTTTCCTTAAGTCGAGAACTTTCACTGTTTCAATTAAAGGA +AGCACTTTATGGCTTCTTTTTGGCATATTTGAATTGCCAGCATCATTATACTTGTGCTTTGGGGCCATTG +TTAAGTAAAATAAGGGTGACTTGAACACAAGCACTGTGGTACCACAATAGCCGATCTGATAACCAAGACA +ACTACTAAGTGACTAATAGGTGGGTACCATATACAGCCTGGATACGCTGGACAAAGGGATGATTCATGTC +CCAAGTGGGATGGAGCAAGATGGTGCAAGTTTTTTTTTCTCCATTTCCATTTTCCTTTCCTAAGATTTCC +ACATCCTAGTGGTGCAAGATTTCATCACACTACTCAGGATGACACACAATTTAAAACTTACTAATTGCTT +ACTTCTGGAATTTTCCATTAAAAATTTTTGGACCTAGGTTGATTGCAGATAACTGAAATCACCAAAAGTG +AAACCATGGATAAGGGGGGACTACTACTATATGTGCATTGAGAGTTTTTATACTAGTGATTTTAAACTAT +AATTTTTGCAGAATGTGAAAAGCTATTTTTCCAATCATGATGAAAGTCTGAAGAAAAATGATAGATTTAT +CGCTTCTGTGACAGACAGTGAAAACACAAATCAAAGAGAAGCTGCAAGTCATGGTAAGTCCTCTGTTTAG +TTGAACTACAGGTTTTTTTGTTGTTGTTGTTTTGATTTTTTTTTTTTGAGGTGGAGTCTTGCTCTGTCAC +CCGTGATCTCGGTTTACCGCAACCTCTGCCTCCCGTGCTCAAGCGATCCTGCCTCAGCTTGCCAAGTAGC +TGAGATTACAAGCATGCACCACCATGCCCAACTATTGTATTTTTAGTAGAGATGGCATTTCACCATGTTG +GCCAGGCTGGTCTCAAATGGTCGTGAGCCACCATGCCCAGCCTGAACTACTCTTTTTAATTGGCACCATT +GAAGGATTGCTCCTCTTTTCTTAAAGAGAAAATATATTACCTTTCCTTTCTTGACTACTGAAGTAGTATT +TTATCTCAAAGTATTGAGAGTAGAAACTAACTTGGTGTGCCTGTGATCCCAGCTACTCAGGAGGCTGAGG +TGGGAGGATCGCTTAAGCCCAGGCGGTCAAGGTTGCAGTGAGCTGTGTGTGTGCCACTGCACTCCCACCT +GGGCAACAGAGTGAGACCGTGTCTCAATGGAAAAAAAGAGAAACTAATTTGATTTCGATGACAGTATTTA +AATACTGTGTAAGACAGTACTATTTAATATGTGGTTGTGACACAAAAACAAAGCCTATTGAAAATTTTCA 
+GAGACAATAAGATATATAATTAACAAAATCTGAGCTTTTTTTTTTTCTAATTAGAAAGTAAATGTGGTTT +AGATATACCATAGTTTACCTAATCAGGTCATGGAATATTGCATTTTTCTTAGTATGTGTGTATGTCTGTA +TAACTGTGTAGGATTTGATATCTGTTTTTGTCTGTGTGGTATCATGTACGTATGTATATGCATATGTAAA +ATCAGATTTACCCTTGTTATAGGGCCACAGAATTGATTTGGAACATCTGTTTTGATAGGTCTTAGAATAT +TTAATTGTATATATAGTAAGATTAGGTGAGTTTTAATTGTGTAGAACTGCTAAAGAAAGGTTTTTAGGGA +TTGTTGTATGAATAAAAGGCTTTAGGTTCATTGGAATCAGGGGAATCAGGCTTTACTAGAAGAACAGGAG +AAGGGGTGACTGACCGAAAAATAAAATGCCAAGTACTCAGAATAACCCTTTAAATACTGATATGTAATAT +TTAGCACATTCTACATAAACTGTTTCTATGAGAAAGGTTGTGAGAATAATATAAATTATATGGCTTATAA +AATATTAATGTGCTTCTGTTTTATACTTTAACAGGATTTGGAAAAACATCAGGGAATTCATTTAAAGTAA +ATAGCTGCAAAGACCACATTGGAAAGTCAATGCCAAATGTCCTAGAAGATGAAGTATATGAAACAGTTGT +AGATACCTCTGAAGAAGATAGTTTTTCATTATGTTTTTCTAAATGTAGAACAAAAAATCTACAAAAAGTA +AGAACTAGCAAGACTAGGAAAAAAATTTTCCATGAAGCAAACGCTGATGAATGTGAAAAATCTAAAAACC +AAGTGAAAGAAAAATACTCATTTGTATCTGAAGTGGAACCAAATGATACTGATCCATTAGATTCAAATGT +AGCAAATCAGAAGCCCTTTGAGAGTGGAAGTGACAAAATCTCCAAGGAAGTTGTACCGTCTTTGGCCTGT +GAATGGTCTCAACTAACCCTTTCAGGTCTAAATGGAGCCCAGATGGAGAAAATACCCCTATTGCATATTT +CTTCATGTGACCAAAATATTTCAGAAAAAGACCTATTAGACACAGAGAACAAAAGAAAGAAAGATTTTCT +TACTTCAGAGAATTCTTTGCCACGTATTTCTAGCCTACCAAAATCAGAGAAGCCATTAAATGAGGAAACA +GTGGTAAATAAGAGAGATGAAGAGCAGCATCTTGAATCTCATACAGACTGCATTCTTGCAGTAAAGCAGG +CAATATCTGGAACTTCTCCAGTGGCTTCTTCATTTCAGGGTATCAAAAAGTCTATATTCAGAATAAGAGA +ATCACCTAAAGAGACTTTCAATGCAAGTTTTTCAGGTCATATGACTGATCCAAACTTTAAAAAAGAAACT +GAAGCCTCTGAAAGTGGACTGGAAATACATACTGTTTGCTCACAGAAGGAGGACTCCTTATGTCCAAATT +TAATTGATAATGGAAGCTGGCCAGCCACCACCACACAGAATTCTGTAGCTTTGAAGAATGCAGGTTTAAT +ATCCACTTTGAAAAAGAAAACAAATAAGTTTATTTATGCTATACATGATGAAACATCTTATAAAGGAAAA +AAAATACCGAAAGACCAAAAATCAGAACTAATTAACTGTTCAGCCCAGTTTGAAGCAAATGCTTTTGAAG +CACCACTTACATTTGCAAATGCTGATTCAGGTACCTCTGTCTTTTTTTTTTTGTAAATAGTACATATAGT +TTTATAGATGACGATTCCTTCTGTGTTTTTTTCTGCTTTTTAAAATCTTCATATCTTATATTTAATCTTA +GGCATCATCTGTATACATGATTGTTTAGGTCTTTAATTACCAGTGTTTAGAATCAGGTCACTCAAACATG 
+GTAGATAAGTTTGCATAGTTTGTGTATATCCATCACTCTTGAGACAGTTTTATTTTAAGTTCCGGGGTAC +ATGTGCAGGATGTGCAGGTTTGTTACATAAGTAAACGTATGCCATGTTGGTTTGCTGCACCTGTCAACCC +TTCACCTGAGTATTAAGCCCAGCATGCATTAGCTATTTTTCCTGGTGCTCTCCTTCCCCCCACACACCCC +CACCTCCTGACAGACCCTAGTGTGTGTTGTTCCCCTCCCTGTGTCCGTGTGTTCTCATTGTTCAGCTCCC +ACTTATGAGTGAGAACATGTGATGTTTAGTTTTCTGTTCCTGCATTAGTTTGCTTAGGATAATGGCTTCC +AGCTCCATCTGTGTCCCTGCAAAGGACGTGATCTTGTTCCTTTTTATGGCTACATGGTATTCCATGGTGT +ATAGTTCCACATTTTATTTATCCAGTCTATCATTGATGGGCATTTGGGTTGATTCCATGTCTGTGCTATT +GTGAATAGTGCTGCAGTGAATGTACAGGTGGATGTATCTTTATAATACAATGATTTATCTTCCTTTGGGT +ATATACCCCGTAATGGGATTGCTGAGTCAGATGGTATTTTTGGTTCTAGGTCTTTGAGGAATTGCCACAC +TGTCTTCCACAACGGTTGAACTAATTTACATTCCAGCCAACAACTTGAGACAGTTTTTGACTCATAAACA +TTCAGAGCTTGGCTAGCTAATTCCTGCTTTAATTTAAAAAGTGTTTATTATATGCAAATTGGACAACTCA +TATAAATATGTGGTGCTACTTACTATGTATTTTCTCTAAAGCATGTTAAAAAAATAGGCTAGATATAGTG +GCTCATGCCTGTAATCTTAGCACTTTGGGAGGCTAAGGCAGGAGGATCACTTATGGTCAGGAGTTTAAGA +ACACCCTGGGCAACATAGCGAGACCCCATCTCTACAAAAAATTTAAAATACCCAGGCATGGTGGCATGCT +TCTGATGTTGTAGCTACTCAGGATGCTCAGACAGGAGGATCACTTGAGCCCAAGTGACTGAGGCTGCAGT +GAACCAAAATTGTACCAGTGCACTCCAGCCTGGGCCACAAAATGAGACCTTGTCCCTGAAAAAAAAAAAA +GAAAAAAAAAATTTAAATAGAGGAAATACTAGCTAAGTTTAATGTAGGCCAGTTCTAAAATAATGATTTA +TTGCTGCTGTTGTTACATAATTTTCTTAAATATTTTAAAGATTGCATACTGTTACTGCTCTATTTCTGCA +TCTCCGTGGTGTAACTCTGTCCTCTTTGTTGTTGCAACAGTTCACTTAGCAACTAAACTGTATGTTTACA +AAGTGATTTTATCTCCCTATGAGAAGACTTTAGTGAATAGCTCAGTGAATAGTAGAGTTGGTGAGACCAC +AGTACAGAACTGTTTGAAGTTTGGGTTAAATTTTTAGAGGAAAATGTTTGATACTATGCATATCATAGTT +AAAGCCAATGAAAAAGCTAATATAGGCCAGGCGCAGTGGCTCACGCCTATAATCCCAGCACTTTGGGAGG +CCAAGGCAGGCAGATCACTTAAGGTCAAGAGTTCAAGACCAGCCTGGCCAACATGGTAAAACCCCATCTC +TATGAAAAAAAACAAAAATTATCCAGATGTGGTGGCATGTGCCTGTAATCCCAGCTACTCGGGACGCTAA +GGCAGGAGAATCACTTGAACCTGGGAGATGGAGGTTGCAATGAGCTGAGATCACGCCACTGCACTCCAGC +CTGGGTGACAGAACGAGACTCCATCTCAAAAAAAAAAAAAAAAAAAGCTAATACATGTGATCACTGATGA +AATGCAATTAAGAACTGGTTAGTAGAAAATTCAGAGGGTCAAGAAATTTAACAGAGCAGTTGAACTCATT 
+TGCCTTTATCGTTGAGATTAGATCATCTTTCAGGCTGTTAGTATATGGACCCTGTTTTTAAAAATTGTGG +TTTTGTTTTTTTCAATGTGAAAGAATTAAGAAAATTGTTACTTTTCTAATTCCTTTTCTGTGCCTTGCTT +TTCTGTTCACACCAGTATTAACAGCAATGAAATTTTTTCAATTTTATTTTCCAATAAAAATTACTTTGAG +TTTTTTTTATGGTAGCTAGCTACTTCCTTGACCTAGATACTAATTTTGATTGAGTTGGTAACTATTATTA +AAAAAACAACTTAGGTCTAATTTATCTTGAGCTAAAAAATGTAATAACTGAAAAATAGAGCATATTTAGG +ATTCTTTCTGCTTTAAATTTGACATTCAGTTATTTTCATGTAATTTGTGTTTTGAGCACTACCTTTTAAT +TAATTTATTTATTTTTATTTTTTAGAGACTGTCTCATTCTGTTACCTAGTCTGGAGTGCACTAGTGTGAT +CTCAGCTCACCGTAGCCTCACCCTCCTGGGCTCAAGCAGTCCTTGCACCTCACCCTCCTGAGTAACTGGC +ACCACAGGCATACACCACCACACCCAGCTAATTTTTATTTTTCATAGAGTCATGGTCTCACTATGTTGCC +CAGGCTAGTCTCGAACTCCTGGGCTCAAGCAGTCTTCCTGCCTCAGCCTCCCAAAAGTGCTGAGATTACA +GGCATGAGCCACTGTGCCCAAACACTACCTTTTTAACTTAGTGAAAAATATTTAGTGAATGTGATTGATG +GTACTTTAATTTTGTCACTTTGTGTTTTTATGTTTAGGTTTATTGCATTCTTCTGTGAAAAGAAGCTGTT +CACAGAATGATTCTGAAGAACCAACTTTGTCCTTAACTAGCTCTTTTGGGACAATTCTGAGGAAATGTTC +TAGAAATGAAACATGTTCTAATAATACAGTAATCTCTCAGGATCTTGATTATAAAGAAGCAAAATGTAAT +AAGGAAAAACTACAGTTATTTATTACCCCAGAAGCTGATTCTCTGTCATGCCTGCAGGAAGGACAGTGTG +AAAATGATCCAAAAAGCAAAAAAGTTTCAGATATAAAAGAAGAGGTCTTGGCTGCAGCATGTCACCCAGT +ACAACATTCAAAAGTGGAATACAGTGATACTGACTTTCAATCCCAGAAAAGTCTTTTATATGATCATGAA +AATGCCAGCACTCTTATTTTAACTCCTACTTCCAAGGATGTTCTGTCAAACCTAGTCATGATTTCTAGAG +GCAAAGAATCATACAAAATGTCAGACAAGCTCAAAGGTAACAATTATGAATCTGATGTTGAATTAACCAA +AAATATTCCCATGGAAAAGAATCAAGATGTATGTGCTTTAAATGAAAATTATAAAAACGTTGAGCTGTTG +CCACCTGAAAAATACATGAGAGTAGCATCACCTTCAAGAAAGGTACAATTCAACCAAAACACAAATCTAA +GAGTAATCCAAAAAAATCAAGAAGAAACTACTTCAATTTCAAAAATAACTGTCAATCCAGACTCTGAAGA +ACTTTTCTCAGACAATGAGAATAATTTTGTCTTCCAAGTAGCTAATGAAAGGAATAATCTTGCTTTAGGA +AATACTAAGGAACTTCATGAAACAGACTTGACTTGTGTAAACGAACCCATTTTCAAGAACTCTACCATGG +TTTTATATGGAGACACAGGTGATAAACAAGCAACCCAAGTGTCAATTAAAAAAGATTTGGTTTATGTTCT +TGCAGAGGAGAACAAAAATAGTGTAAAGCAGCATATAAAAATGACTCTAGGTCAAGATTTAAAATCGGAC +ATCTCCTTGAATATAGATAAAATACCAGAAAAAAATAATGATTACATGAACAAATGGGCAGGACTCTTAG 
+GTCCAATTTCAAATCACAGTTTTGGAGGTAGCTTCAGAACAGCTTCAAATAAGGAAATCAAGCTCTCTGA +ACATAACATTAAGAAGAGCAAAATGTTCTTCAAAGATATTGAAGAACAATATCCTACTAGTTTAGCTTGT +GTTGAAATTGTAAATACCTTGGCATTAGATAATCAAAAGAAACTGAGCAAGCCTCAGTCAATTAATACTG +TATCTGCACATTTACAGAGTAGTGTAGTTGTTTCTGATTGTAAAAATAGTCATATAACCCCTCAGATGTT +ATTTTCCAAGCAGGATTTTAATTCAAACCATAATTTAACACCTAGCCAAAAGGCAGAAATTACAGAACTT +TCTACTATATTAGAAGAATCAGGAAGTCAGTTTGAATTTACTCAGTTTAGAAAACCAAGCTACATATTGC +AGAAGAGTACATTTGAAGTGCCTGAAAACCAGATGACTATCTTAAAGACCACTTCTGAGGAATGCAGAGA +TGCTGATCTTCATGTCATAATGAATGCCCCATCGATTGGTCAGGTAGACAGCAGCAAGCAATTTGAAGGT +ACAGTTGAAATTAAACGGAAGTTTGCTGGCCTGTTGAAAAATGACTGTAACAAAAGTGCTTCTGGTTATT +TAACAGATGAAAATGAAGTGGGGTTTAGGGGCTTTTATTCTGCTCATGGCACAAAACTGAATGTTTCTAC +TGAAGCTCTGCAAAAAGCTGTGAAACTGTTTAGTGATATTGAGAATATTAGTGAGGAAACTTCTGCAGAG +GTACATCCAATAAGTTTATCTTCAAGTAAATGTCATGATTCTGTTGTTTCAATGTTTAAGATAGAAAATC +ATAATGATAAAACTGTAAGTGAAAAAAATAATAAATGCCAACTGATATTACAAAATAATATTGAAATGAC +TACTGGCACTTTTGTTGAAGAAATTACTGAAAATTACAAGAGAAATACTGAAAATGAAGATAACAAATAT +ACTGCTGCCAGTAGAAATTCTCATAACTTAGAATTTGATGGCAGTGATTCAAGTAAAAATGATACTGTTT +GTATTCATAAAGATGAAACGGACTTGCTATTTACTGATCAGCACAACATATGTCTTAAATTATCTGGCCA +GTTTATGAAGGAGGGAAACACTCAGATTAAAGAAGATTTGTCAGATTTAACTTTTTTGGAAGTTGCGAAA +GCTCAAGAAGCATGTCATGGTAATACTTCAAATAAAGAACAGTTAACTGCTACTAAAACGGAGCAAAATA +TAAAAGATTTTGAGACTTCTGATACATTTTTTCAGACTGCAAGTGGGAAAAATATTAGTGTCGCCAAAGA +GTCATTTAATAAAATTGTAAATTTCTTTGATCAGAAACCAGAAGAATTGCATAACTTTTCCTTAAATTCT +GAATTACATTCTGACATAAGAAAGAACAAAATGGACATTCTAAGTTATGAGGAAACAGACATAGTTAAAC +ACAAAATACTGAAAGAAAGTGTCCCAGTTGGTACTGGAAATCAACTAGTGACCTTCCAGGGACAACCCGA +ACGTGATGAAAAGATCAAAGAACCTACTCTATTGGGTTTTCATACAGCTAGCGGGAAAAAAGTTAAAATT +GCAAAGGAATCTTTGGACAAAGTGAAAAACCTTTTTGATGAAAAAGAGCAAGGTACTAGTGAAATCACCA +GTTTTAGCCATCAATGGGCAAAGACCCTAAAGTACAGAGAGGCCTGTAAAGACCTTGAATTAGCATGTGA +GACCATTGAGATCACAGCTGCCCCAAAGTGTAAAGAAATGCAGAATTCTCTCAATAATGATAAAAACCTT +GTTTCTATTGAGACTGTGGTGCCACCTAAGCTCTTAAGTGATAATTTATGTAGACAAACTGAAAATCTCA 
+AAACATCAAAAAGTATCTTTTTGAAAGTTAAAGTACATGAAAATGTAGAAAAAGAAACAGCAAAAAGTCC +TGCAACTTGTTACACAAATCAGTCCCCTTATTCAGTCATTGAAAATTCAGCCTTAGCTTTTTACACAAGT +TGTAGTAGAAAAACTTCTGTGAGTCAGACTTCATTACTTGAAGCAAAAAAATGGCTTAGAGAAGGAATAT +TTGATGGTCAACCAGAAAGAATAAATACTGCAGATTATGTAGGAAATTATTTGTATGAAAATAATTCAAA +CAGTACTATAGCTGAAAATGACAAAAATCATCTCTCCGAAAAACAAGATACTTATTTAAGTAACAGTAGC +ATGTCTAACAGCTATTCCTACCATTCTGATGAGGTATATAATGATTCAGGATATCTCTCAAAAAATAAAC +TTGATTCTGGTATTGAGCCAGTATTGAAGAATGTTGAAGATCAAAAAAACACTAGTTTTTCCAAAGTAAT +ATCCAATGTAAAAGATGCAAATGCATACCCACAAACTGTAAATGAAGATATTTGCGTTGAGGAACTTGTG +ACTAGCTCTTCACCCTGCAAAAATAAAAATGCAGCCATTAAATTGTCCATATCTAATAGTAATAATTTTG +AGGTAGGGCCACCTGCATTTAGGATAGCCAGTGGTAAAATCGTTTGTGTTTCACATGAAACAATTAAAAA +AGTGAAAGACATATTTACAGACAGTTTCAGTAAAGTAATTAAGGAAAACAACGAGAATAAATCAAAAATT +TGCCAAACGAAAATTATGGCAGGTTGTTACGAGGCATTGGATGATTCAGAGGATATTCTTCATAACTCTC +TAGATAATGATGAATGTAGCACGCATTCACATAAGGTTTTTGCTGACATTCAGAGTGAAGAAATTTTACA +ACATAACCAAAATATGTCTGGATTGGAGAAAGTTTCTAAAATATCACCTTGTGATGTTAGTTTGGAAACT +TCAGATATATGTAAATGTAGTATAGGGAAGCTTCATAAGTCAGTCTCATCTGCAAATACTTGTGGGATTT +TTAGCACAGCAAGTGGAAAATCTGTCCAGGTATCAGATGCTTCATTACAAAACGCAAGACAAGTGTTTTC +TGAAATAGAAGATAGTACCAAGCAAGTCTTTTCCAAAGTATTGTTTAAAAGTAACGAACATTCAGACCAG +CTCACAAGAGAAGAAAATACTGCTATACGTACTCCAGAACATTTAATATCCCAAAAAGGCTTTTCATATA +ATGTGGTAAATTCATCTGCTTTCTCTGGATTTAGTACAGCAAGTGGAAAGCAAGTTTCCATTTTAGAAAG +TTCCTTACACAAAGTTAAGGGAGTGTTAGAGGAATTTGATTTAATCAGAACTGAGCATAGTCTTCACTAT +TCACCTACGTCTAGACAAAATGTATCAAAAATACTTCCTCGTGTTGATAAGAGAAACCCAGAGCACTGTG +TAAACTCAGAAATGGAAAAAACCTGCAGTAAAGAATTTAAATTATCAAATAACTTAAATGTTGAAGGTGG +TTCTTCAGAAAATAATCACTCTATTAAAGTTTCTCCATATCTCTCTCAATTTCAACAAGACAAACAACAG +TTGGTATTAGGAACCAAAGTGTCACTTGTTGAGAACATTCATGTTTTGGGAAAAGAACAGGCTTCACCTA +AAAACGTAAAAATGGAAATTGGTAAAACTGAAACTTTTTCTGATGTTCCTGTGAAAACAAATATAGAAGT +TTGTTCTACTTACTCCAAAGATTCAGAAAACTACTTTGAAACAGAAGCAGTAGAAATTGCTAAAGCTTTT +ATGGAAGATGATGAACTGACAGATTCTAAACTGCCAAGTCATGCCACACATTCTCTTTTTACATGTCCCG 
+AAAATGAGGAAATGGTTTTGTCAAATTCAAGAATTGGAAAAAGAAGAGGAGAGCCCCTTATCTTAGTGGG +TAAGTGTTCATTTTTACCTTTCGTGTTGCCAATCACTATTTTTAAAGTGTTTATTCAGTAGACTTGGTAT +GCTAACAATTAAGAGTGTTATAAACTATGTCTTTTCAGCCATTTTTGTGTAGTCAGTTTGGGGGAGTATG +GTTTGATATACAGATACACAGATTCAGTATTCGTATACAGATTTGATATCTTGGTATACAGATTCGATAT +CTCTGAATCTGTATACCAAGAAATCATGTTTTAAGGGTCTCAATATATTTTCAAAAAGATTATTAGTATA +ATAATTGAGAAATTACTGTTAAAAAGTTTTGAGTTTCTCTAGAAAATTTGAAACTCTTAACAAAACCTGC +ATAATACTAACTTAACTGTTTTCATATACATAGCAAGTTCAGACTCTGACTTATATGAACTTTAAAAGTT +GGTTTCCGGGAGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCCGGCTAAAACGGTGA +AACCCCGTCTCTACTAAAAAAATACAAAAAATTAGCCGGGCGTAGTGGCGGGCGCCTGTAGTCCCAGCTA +CTTGGGAGGCTGAGGCAGGAGAATGGCGTGAACCTGGGAGGCGGAGCTTGCAGTGAGCCGAGATCCCGCC +ACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAAAGTTGGTTTCCGATTA +TACCATTTACTGGGTAATATATACTACTTAGTTACACTACTTACATAGCTTCAGTTTCCTTATCTATAAA +ATGCAAATAACACCTCCCATGAGGGCTGGGCGTGGCGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCC +GAGGTGGGTGGATCACCTGAGGTCAGGAGTTTGAGACCAGCCTGACCAACATGGTGAAACCCCATCTTTA +CTAAAAATACAAAAAATTAGCCAAGCGTGGTGGCGCGCACCTATAATCCCAACTACTCCAGAAGCTGAGG +CAGGAGAATCACCTGAACCTGGGAGGTGGAGGGTGCAGTGAGCTGACATCACACCACTGCTCTCCAGCCT +GGGCAACAGAGCGAGACTGTCTCAAAAAAAAAAAAAAAAAAGTGTATTTAAAGCACTTAGCAGTGAACTT +GACATATAGTAGGCAGAGAGCATTCAGTAAGTGTTGGCTTGCTCCCTTTTTTTCATTTAGGAAGTGATCT +AAAAACAGTATTGTTAGTAAATGGTATCTTGATCTTAATGTTATGTGGACTATTTTAACTTCCCTTTTAA +ATGTATATATATCTAACAACTTAGTTCAACTACAGTCATGTGTCATTTGACAGGGATATATGTTCTGAGA +AATAGATTGTTAGATTTCATCATTGTGGGAACATCATAGAGTATACTTACACAAACCTAGGTGGTATAGC +CTACTATATACCTAGGCTGTATGGTATAGCTTATTGCTCCTAGGCTGCAAACCTATACAGCATGTTACTG +TCCTGAATACTCTAGGCAGTTTTAACACAGTGGCAAGCATTTGTGTATGTGAACATAGAAAAGGTACAGT +AAAAATACGGTATTAAAATCTTATGGGGCTGGGCTCAGTGGCTCATGCCTGTAATCCCAGCACTTTGGGA +GGCTGAGGCAGGCGGATCACCTGAGGTCAGGAGTTTGAGACCAGCCTGGCCAACATGGTAAAACCTTGTC +TCTACTAAAAATATAAAAATTAGCTGGGCATGGTGGTGGCACACGCCTGTAATCCCAGCTACTAGGGAAG +TTGAAGCAGGAGAATCACTTGAACCCTGGAGGCAGAGATTTCAGTGAGCCAAGATCGCACCACTGCACTC 
+CTGCCTGGGCGACAGAGCAAGACTCCATCTGAAAAAAAAAAAAAATCTTATGGGACCACTATTAAAGTCT +TATAGGATGACCATTGCATATGTGGTCTATTGTTGACCAAAATGTCATTATGTGGCAAATGACTGCATTA +GGTTAACCTTATACATACCTATATTAGGTATGTATTTGGTTTTGTTTTTTTGTGTGTGTTTTTTTCTATT +AGTGTATCTGACTGGTAATAATCTTAAATAATTGAATCTGTTTGTTAGTTGCAATTAAAGCAAATGCCAA +AACTCCAACATTTCAGTGGATAATCTTAAATAACTAGTTCCTTTTTAAAAAACCTATAAACTCATAAAAA +TATTTTAGTTATTAGAACTCTTCCTGTCTAGACCCCATGTATTACAGAGAGACACCGAAGTTAGTCTCCT +CATTCAAAAAGTGCCTTTTGCCCCTAAGTCATTCTGGTGGATACAGATTTACTTAATCAAGTGTTGTCCA +GGTCACATTCAATATAGGATTTACTTTATGGACAAAGTAGTACGTTTATAGTACTTAAACTATTTGCTGT +CCTTTAGTGTGAAATTCTGAGGTATATATGCTTAAAGATATTTGTAATTCTTTTGTGGAAAATAATGGCT +TTATTTATAGCAACCCATTCTGTTCTTGTGCATACTGAAGTATATTGACTTTCCACCTAGGGAAAAAAAA +AACAATAACTCAGACTTGTAAATGCTTTCAACGGTGTTACTACTTAATTTCCCTCATTTCTGTAACATAT +AAGTGTATAACTTAGTCAGCTTCTGGTTACTGGAACAGTACAGGTCACTGTTAAACAATTAAACCACTTT +TATAATAATCTAACACCTCCTAAAGCCTTGCATGGACATTTTTACTTATTAAATTATACAAATTTATTCC +CTGTAATAAAGCATCAAAAAGCAAAGTACCTGTTATATATTATCTCAGCATGACATGGAAATGCCTACCT +TGAATTATGGTTTAATCTTACCCTCTTAGCCTCTGTAGAATTTTTAAATAAGAATTGTTTCTATTACTAG +TACTTTAATGTAATTTGATAATTGTAAAAAGCCTCTTAACTCTAATTCAAGGACCTACATAATAAATTAC +TCCTTCAGTTAATGGCTGCCCCCGTGCTGAAAAAAAAAAAAAAAAAGAGAGAAAAAGTTTATTTGAAGAA +ATTTTGTTAGGCCTTATTGCCAGTAAACCTAGAGTTATATTTAGTGTCAGTTTTTCAAAAAGTAGCTTAT +CTGTGGTATCTGGTAGCATCTGTTTATCCTATTTAGGATTTATCCTGTTTAGACCCTGTTAAATAGTGGT +GTTTTAAAGTGGTCAAAACAGAACAAAAATGTAATTGACATTGAAGACTGACTTTACTCTTTCAAACATT +AGGTCACTATTTGTTGTAAGTATTTTTGTTTAACATTTAAAGAGTCAATACTTTAGCTTTAAAAAAATGG +TCTATAGACTTTTGAGAAATAAAACTGATATTATTTGCCTTAAAAACATATATGAAATATTTCTTTTTAG +GAGAACCCTCAATCAAAAGAAACTTATTAAATGAATTTGACAGGATAATAGAAAATCAAGAAAAATCCTT +AAAGGCTTCAAAAAGCACTCCAGATGGTAAAATTAGCTTTTTATTTATATCTGTTCTCCCTCTATAGGTA +TGGTATATAATATTCTGACCTCAGGTGATCCACCTGCCTCTCAAAGTGCTGGGATTACAGACATGAGCCA +CTGTGCCTAATCAAGGACCTCTTTATACTCTTAAAAATTACTGAGGACCTAAAAGAGCATTTGTTTATGT +GGAATATATCTATTGATATTTACCATATTAGAAATGTAAATTGATTAATGTTAAAATTAGTAATATTATG 
+CGTTGGTCATTTGGAAGATATGAGTTCACTGAGTTATGCGGATCTTCCGAAAGTTGACAGTTTTATTATG +CAGTATTAAACAATCACTTTCATTGATGCCATTACCGATCAGAAAAGTTTAAGTAGTAGAAAGCTGTCAA +GCTTACAGAGCCAGATACAAGCTTCCCAAAAATTCTGATTTTCATCTAAAAGCTTGAATTTTTCCCCGGC +AATAAGTATTGTCACTTATTTTTCTTGTAGGTGACAAGCTTATTTTCATTCATTTTTGAAAAGATGTCTG +CCGAATACCCAAGTCTGAATAACTATAGTTTGTTGGTTATTCTTTCAAGTAAAAGGTATTTCATGAAAAA +ATAGCTAGTATAGCTCACAACTCAATCATTTAAGTGTGTTTTCTTGAGAAACGCACTGAAGTATGCAAGC +ATAATATACCAACAGTACAAATATCAACAGTGAAAAGGACATACATAACATTTTACTAATAAGACAGTTT +TGACAGCTTGGATTCCCTAAAATGGTTGTAGATACCTAACAGGATTCCACTGATCATTTCTTGAGAATCA +TTGTCCTATAATATATACATAATAATCTAAATTTACAATATCAGTATTAACTACTGACAATAAAACTACT +AAGGAAAATGTAAGAATTGTTTGCAGTTTTTGTCCTTAGAGTATATAGGTTGAGTATCCCTATCTGAAAT +GCTTGGGACCAGGACTATTTCAGATTTCAGATTTTTTCAGATTTTGAAATGCTTGCATATACAATACATA +ATGAGATATCTGGGGATAGGACTCAAGTCTAAACACGAAATTTATTTAAGTTTCATAAACACCTTATACA +TATAACTTAAATGTAATTTTATACAATATTTTAAATAATTTTTGCATAAGACAATTTAAATTGTGATCCA +TCACATGAGGTCAGATGTGGAATTTTCTACTGGCCTCATGTTGGCACTCAAAAAGTTTCAGGTTTGTGAC +CATTTTGGATTTTCAGATTAGGGATACTCAACCCATATATTATTAAGAATGTTTAGTCAAAATACTGTGT +TCAAATGTCACTCAAAATAATTCTTCCGGATGTGGTTACCAATTTGATAATTAGGTTACATTCCTTTTTT +TCCATTTGTTTTCAATTTTAGGATTTGTCTTTTCTTATTTAATTTTACATTTGAATAAATAAAACATTAC +ATAGTTCATTCATCAGAACTACAAAAAGGTATACTTAGAGTTTTTATTCACCCACCTCTTGCTTACTATA +GGTAATCTTTTTTAGTGTTTTTTTTTCAGGATTCTGTTTAATAAAAATAAGCAAATACATGTATATACTC +ATTACCCTTTCTTACTCAAAAGATACAGTATATACACCATTTTGCACCTTGTTTATTGGTTGTTGTTTAC +TTAAGAATTATTTGGAGATGACTCCTTAATGAGTATATAGAGATCGTCCTCATTCTTTTTTGTGGTTACA +TAGTAGTTGATCATCTGGCTGTGTCAGTGTTTCCTAGTTTATTTAACCAATTTCCAACTAGTGGACTTAT +TGAAGATTTAATTAGGTTCCAGTTACATACTGAGAATGAACAATATCTAAAGCTTAGCTTTTAAACCTTC +ATAAGACTAAATTTTAAATTTGGTATTTGCATCAGAAATTAGCTAACACCTTTGAGTTATGATGGTTAAC +ATCAACTGACTAAATTTATGCTGATTTCTGTTGTATGCTTGTACTGTGAGTTATTTGGTGCATAGTCATT +ATCAATTTGTGAATCAATTTATTTTCATAGTTAACATTTATTGAGCATCTGTTACATTCACTGAAAATTG +TAAAGCCTATAATTGTCTCAAATTTTTTGTGTATTTACAGTAACATGGATATTCTCTTAGATTTTAACTA 
+ATATGTAATATAAAATAATTGTTTCCTAGGCACAATAAAAGATCGAAGATTGTTTATGCATCATGTTTCT +TTAGAGCCGATTACCTGTGTACCCTTTCGGTAAGACATGTTTAAATTTTTCTAAATTCTAATACAGTATG +AGAAAAGTCTCGTTTTTATAAATGAACATTTCTAAAAATAATGACACTAACGTTAAGAAGTTAACACTTC +CCGTTTTATAAAATTTATAAAATACTTTGGTAGTATTTTATAGTGCTGTTCATATCATTATTTTATTTTT +TAATTTTATGACAGCTTTGTAAAGTAGACAGATTTTATTCTAATTTTATGGATGAAGTACTAAGGTTGAG +AGGAATTAAGGAAATTGCTCCGAATCAGTTAACAAAAAGATTGCAGATATTAAAAATATCCTTTTATCTC +TCCTCTCTAAACCTTTAAAAAAGTACTAAGATAGTTTTTTTAATGTATAATTCCCAAGGACAATGATGAG +AAGAAACAACAAAAGTTTGGAAGCCAAAAACATAAAGGATTTAGTAAGCATGAGAAAGCTAAAACCTGAC +ACTAGAGCAAACAGAGATGCTTTCCCCTAAAAAACCTGAAAAAGATTCAAATTGGCAGCAACAGGTACTT +CTGAAGGTGAAGTAGAAAATAGGAAGATTAGTTGAAATTCTTTTTAAGAAACATCTATATTTCCTCCCCC +ACTGCAAATAGGCGGTTATCCTTCTTCTGCCAGGAAATCAGAAGGTTGTTCTTGAAAAAGATGAATTGAG +AGGATTCTGAATTGAAGGTGGGCTGGAGGGAGGGGACACCAGGCACAATTGAGGGAAAGATACTAAAATG +AAAGATCAGATACAAATCTGTATGTCAAGCAGTGAGACCTAGCTCCTTCCCACACTTGGTTCCCAAATGC +AGGCCCTCTAGGCATGAGACTGGAAGATTTTTTTTTCCTAGGGAATATGCCTGACCCAATAGAAAAGACC +AAAAAATACTGACAGTTGAGGATACTCAGATGAAACAGTATAGCCAGTCACCAGACCAGGAAGTTAACTG +TTGACATGCACAGAGCTTCCAGGAAGCTACTTAGTGCTTCACTTTTAAATAAGAAAAGATAGTCAAAGAT +AACTAGTCATTGGAAGAAAGCTACTATGAAACATAGTCACCAAAGTACAAAATCCATAGCAGAAAGGAAC +CTAGAGGAAATCGACTATGAAAACTTCATAAAAACCTACTAATATTCTCAGGTAAGAAAAGAAAAAATGG +CCGTAAAATAAGAACAAGTTGCTATAAAAAGCTCTTAGAAATTAAAAATATGATAGCACAAATAAATTAA +CTCAGTAGAAATAATGGAAGAATCATGAAAGTTCCCAGAATACAGAATAAAATGAAAAAAGGTATGAAAA +GTCAATTCTGTGGATCTATCATCTGAAAATACAGAGTTTGAGAAGGAAGGCACAGAAGAGAAATGAAGAA +AGAAATTTTAAAATAAATACATAATTTTAAAAGTTCTACTAGTACTGAAGGACATGAGTTTCCTTAATTA +AAAGGGCCCACTGAGTGAGCACACAAGTAAAAATGACCCACAGTAAGGCACATCCTTGTGAATTTTTAGA +ATAATAGAGGCAGACAGGAACCTTAAATTCATTAGAGGACCAAGAAGTTAGGTTTCAAATTGTTTCAAGC +CATAATAGTATGAATTCTCTTATTATCAACAATGGAATCTAGAAGACTGTAGATCTTATATAATACAGAG +AAGTGCCTTCAAAATACTGAGAGAAAATGATTTCCAACCTAGAATCTGAATTAAGTGTGAGGGTAGACAT +TTTTCAGATGTGAAGTACTAAAAGATCTCTTGTGCGCTTTTCTCAGGAAACTAACCAAAACAAATGCATA 
+CACCAAGAAGGAGGAAGGTATAGGACTTAAGAAATAAGAATTCAACATAGAAGAGAGGCAAAGGGAGCTT +TCAGGATGATATTGAAGGGAGATCCCAGAGTAGCTGTGTTGCTAAGTCTAGAAAGGCAGCTAGACTACTT +TGGAACTGAAGAAGATAAGAGACTTTGGAAGAGTTTGCCTTCAAGATAAAAATAAAGCAGTACCTGCATG +TTTTAATGTATTAGGAAACTTCTTAGTAAAGATGGTGAATTGAGGCCAGGCACAGTGGCTTACACCTGTA +ATCCAGCACATTGGGAGGCTGAGGTGGGTAGATCACTTGAGGCCAGGAGTTCGAGACTAGCCTGGCCAAC +ATGGTAAAATCCCATCTCTACTGAAAATACAAAAATTAGCCAGGCGTGGTGGCACACGCCTGTAATCCCA +GCTACTCCAGAGGCTGAGGCACAAGAACCGCTTGAACCTTTGAGGTGGAGGTTGTGGTAAAATTGCACCA +CTGCACTTCAGCCTGGGTGACAGAGTGAGACTCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAGATGGTGA +ATTGAACATACTCATATCCTTTCTTTGCCTTCCAAACTTTTACCAAAACATCATTGAAGAAACTTACACA +CACACAAAAAAAAAACAAGGAAAATAGGAAATAACAAAGTAACTAAATTTCTCAAAGCATGCAGAAGGAA +ACTGAATGAAAGCTGGTGGTGGGGACAGCAGAGAACCAACGATTTTACACTCAGGTCTCAAAAGACTAGG +AATTGGTGGCTTCATTTCTTATCTTTAGAATTGGGTGGTGCAGAAGGAGGGAGCCAAAATGGAATAAGTT +GAAATTATGTTTAAGAAGCAATACTCGCCGGGTACGGTGGCTCACATGGAGGCTGAGGCGGGTGAATCAC +CTGAGGTCAGGAATTCGAGACCAGCCTGGCTAACATGGTGAAACCCCATCTCTGTTAAAAATGCAAAAAT +TAGCTCGGCATGGTAGCATGCCCCTGTAATCCAGCTACTCAGGAGGCTGAGGTGGGAGAACTGCTTGAAC +CCAGGAGGTGGAGGCTGCAGTGAGCCAAGATTGCGCCACTGCACTCCAGCCTGGACGACAGAGCAAGACC +CCACATCAAAAAAAAAAAAAAAAAAGCAGCAGCAGCAATACTCATGAAGCTGGGCAACTGTCTCCTGCCC +GCTCTATGAAAAGAACCAGAGGCTTATTCTCCAGAGAGGATACAGTAGAAGGTGAACACACTAGGCACAG +TTGAAGGCAGAAGCAACTACTTGAAAGCAAGAAGAAGTTAATATATGCATATTGAATGTTGGGATCTCCC +CTCACCAAGCCCTTTTCCACCACTCAGCTTCCAGAACATAGACAGCTAAGTTTTCACTAGTGGAAGTTTC +CATTTAATCAAGCTACTGTGTAGCTTGCAGTCAACAAGTTCTATCTTTGTACCAAGTGCTTCAAAACAGC +CTTTTGGTCCCTCACTCTTAACTATAAACAGACATCCAAAGATTATGAGACATCAGAAAAAGCAAAAATA +AAATAACCAAAAAACACATTAATGAAAACAACTTAGAAGAAACATTATTCAAGGAGAAGAAAAAATGTTT +TTTTAAAAACTATAATTTGTGAACAGAATGAAAAGAGGTTTATATATATAGCTAAGAGTTTAGATGTGAA +TAAACAGTAAGTACATAGAAAATAAGCAGATTTTAAAAATTAACTCAAGAGAAAGCAAAAGTTGTAAAGG +AAGTACACTATTTATATACTACCCATTAATGGCCGGGTGTGGTGGTTCACGCCTGTAATCCCAGCACTTT +GGGAGGCCGAGGCGGGTGGATCACAAGGTCAGGAGATCGAGACCATCCTGGCTAACATGGTGAAACCCCA 
+TCTCTACTAAAAATAACAAAACAAAATTAGCCAGACGTAGTGGTGGGCGCCTGTAGTCCCAGCTACTTGG +GAGGCTGAGGCAGGAGAATGGCATCAACCCAGGAGGCGGAGCTTTCAGTGAGCCGAGATTGCACCACTGC +ACTCCAGCCTGGGCGAGAGAGCGAGACTCCGTCTCAAAAAAAAAACAACAAAATAAAAAAATAAAATAAA +ATATACTGCCTATTAATACTACATATACTTTATACTGACTTAGCCGTAATGTAAATGTTGAACATTGATA +GTGAGAGGTGAAGCTGGCTGGGCTTCTGGGTCGTGTGGGGACTTGGAGAACTTTTCTGTCCGGCTAAAGG +ATTGTAAACACACCAATCAGCGCTCTGTGTCTAGCTAAAGGTTTGTAAACGCACCGGTCAGCACTCTGTG +TCTAGCTAAAGGTTTGTAAATGCACCAATCAGCACTCTGTAAAATAGACCAATCAGCAGGACGTGGGCGG +GGCCAAATAAGGGAATAAAAGCTGGCCACCTGAGCCAGCCCCAGCAGCCGCTCGGCTCCACTTCCATGCC +ATGGAATCTTTGTTTTTTCACTCTTTGCAATGAATCTTGCTGCTGCTCACTCTTTAGTGAGCACTACCTT +TATGAGCTGTAACACTCACCACGAAGGTCTGCGGCTTCACTCCTGAAGTCAGCAAGACCACGAACCCACC +AGGAAGAAGAAACAACCCTGTACGTGCCATCTTTGAGAGCTGTAACACTCACTGGGAAGGTCTGCGGCTT +CACTCCTGAAGTCAGCAAGACCACAAACCCACCAGAAAGAAGAAACTCTGGACACATCTGAACATCAGGA +AGAACAAACTCGGGACACACTATCTTTAAGAACTGTAACACCATGAGGGTCCACAGCTTCATTCTTGAAG +TCAGCAAGACCAAGAACCCACCAGAAGGAACCAATTCCGGACACAGTAGAATTAAATACGTAATTTAGGA +AGATGAAAGGCAAGAGTGTGTGTGTAGTAAGGTAGAAGCTGTGTTGACAGAGCTGAATTTTCATTTTCTG +TAGGGGTACTTCAAGAGAAAAAGTCAAGAAGAAACATGTCACTTAGACATATAAATATGATAAAATCATC +TAAAACTGTTTAAAGTAGTTGCAAAATCTTTTCTAGCTGATAAATTTTTAAGCCTAAAAATATCATTGAA +ATTATTTTAATGTTACATTTTATTTTATTTTATTTATTTATTTATTTATTTTGATACAGAGTCTCACTCT +GTCGCCCAGGCTGGAGTACAGTGGCACGATCTTGGCTCACTGCAACCTCTGCCTCCTAGGTTCAAGCGAC +TCTCCTGCTTCAGCCTCCCAAGTAGCCGGGATTACAGGCGCGTGCCACCATGCCCGGCTAATTTTTTGTA +TTTTTACTAGAGAAGGGGTTTCACCGTGTTAGCCAGGATGGTCTGGATCTCCTGACCTCGTGATCCGCCC +ACCTTGGCCTCCCAAGGTGCTGGGATTACAGACGTGAGCCACTGTACCAGGCCTAATGTTACCTTTTCAA +AAACACCTGATTGTGGAATTGTTGAAGTCACTGAGTTGTATTTCTGGAATGTGTTTTTTAGCAGGCTGCA +CATACACATATGTAGAAAGCCAGGTGATTTTTTTTTCATTTCTTTTTTTTTTATCAAAAACAGTTGTATT +AAATAAGAAAGGAAATACGTATTTACCCGTGTATTACCTTAATTTATGTGTAAAATGGGAGAATAGTTTA +ATGTATTTAACAAACAAACATTTGTTAAAGTACCTGCTCAAACTACCTAATATATACTATAGTGAAAGAT +ATAAGGATAAATAAGTCTAACTCAGATTGCTAGCCTGGGAACCAGACATGAAAACAAGAATTATAATGTA 
+ATATAAATTCTAGAATAGATGTAAAAAGTGATCTAAGAACATAGAAAAATTATCAGCTAATCACATGACT +GCTCAATGGGAAAAGTACTTCAGACAGAATGTAAAGAATGCTTGGTTAAAGATGGCATTCCAAATCTTGG +AATTTGGTTGGGGGACAGAGGGAAACAAAAAGAAATGGGGAGGTTAGGACCAAATAGGAAGCTTCCTGTA +TGTCATTTCTGATAAGTTGAAACCTAGGTAGGTGATAGGCTGTCTTTGGAAGTTTCTAACAAGAGGAACA +AAATAAGATTGGTGTTTTAGAAGTATACCAAAGCAAAACTGTTGCAAGGAGATTAGTAAATACAGGTCTT +AACCTAGCAGAGGAGGTAGAGGGTAGAGAATGATTGAGATAGAAATTCAGTAGATTTGGCCAGATAGTGA +TAAGTTGAGACTGGCAAATTATTTCCACTTAGATTTAAATAGATATCTTGAGCATAACCTACAAGGCAAA +CTCCTTATACTAAAAATATTCTGAATATTTAAAAAGAAAGGATTAAAAGATCAATCAATAGAAGTTTGGG +GACAGAAGGTTTATTCATTCTTGTGCATTAGATCTCATCTAGATCACCTGTTTGAAGAAATCATTCCAGC +AATTATCTTGTCTCTCTCCTGCATGGATTTTTTTCCTAATAGATTGTTCTCATCACCCTAAGCAGTTGTT +GTACATCTCTCATCTTAAAAAGAACAGCCTTTCTTAAGTAATCTCAACAGTCCATTTTCTTCTCTTAAAG +CCCAACTCATTAGAATTGTCCCTCCTCTTTTCACTTATCTCTTTGAGTACTCTCCTGAACCCAGTCTAGT +CAGTCCTTTCAGTAGAACTGGTCCCCCTGCTTACCTCCCTACTCCTCAATACACAGTGAATTCTCAACAA +AGAAGCCGGGGGATCCTTTTAAACATAAGACAGATTATGTCATTTCTTTACTCAGAACTATTCCGTGGTG +TGCCATCTCAGAGTAGAGACTAAAAGCCCTTGTCATGGTGTACAGATTCTTCATGATCTGGCTGCTTTGC +TATTTTTCCAGTCTGACCTTCTAATGTTCCCCTTGCTCTCCTTGCTCCAGGCACACTTGTGTCTAGGCCA +ATCGACATATTTGTTTGTCTGTTTCCTTCCACTGAAATATACATGCAAAAACAAAATTTTGTTACCGTGT +TCCCCAGCAAAACAATGTCTGGCACCTGGTAGGAATTCATTAAATAGTTGATGGATGGGCGAACGGATAA +CTAAAGGAACAACTTCAAGTTCCAGGTATCCAGGGTTTGGTAAAAGGAAATCTGGGGTTTTCAACAAGAT +ATCAAGTATTAGGAAGACCACGTATGCTGAGAAAGATGATCACTTTTGGACATGTTGAGTTTGAAATGAG +TGTGAAACATCAAGGTACAGATGTCTGATGCTATATGTAGTGTAAAATGTAGGAACAACCCTAGGAGAAA +AATCGGGCATGAGGATAAAGGATATTTTCATTGTTAGGTGATAATTTAAGCAATGGAAATGACTCACATT +AGCAAGGGAAAGTGTCTAAGGAAGACATCCAGTTTTGGAGACTTTTTTTGAGGAATCAGGAAGAGGTAAA +ACCAGTAAAAGATGAAAGAGGTACAGTGATGGTGAGAATTTTAAAAGAAGGAAAATGTAAACTGTCATAG +CTATTAGGAAAGTTGAGTAGAATGAGTTTGCGTGCATCCCACATGCATCTGGGAGGTCATTAACAACTTT +ATTGAGAACAGTTTCTGTAGAGTAGTGGGAGAAATGAGAGTTTATTGAGTAGAGATTGAGGAAGTGAAAA +TAGCTACATTACCTATTGAAGAAGGTTGACTGTGGAGTGTAACAGTGAGTATTAGCTTGAGGCAGAGATA 
+AAGGTGAGTGAGAAAATAAGAGTTTCAAAGGTAGGCAAGATTTTTGGGCTAAATAAAAAGGGCACTTTAA +AAAAGGTATAAATAGGTAGAAGAGAGAAAAGGGAGCGAGGTGGGATAATTGAAAGAGGGGATCTCCTGTG +GAGACTGAGGTATTAGGCGGAGTAGAGAGTTCAGGTGAAGATGTGAAGGTGAGAGAAGAGGATGGGTAGA +CATTTCCCTGGTGAAGGAGGTAAGGAGTACTATGATGGAATTAGAGGGGACACACTGAGAGGGTCCACAC +TTGACAGACTCTCTTCTATTATGTGTTATGTGAGGTAGATTGTAAAGTCAAAGGCTAGCCTTGAAAAATG +TGATATTGTTTTGGAATGGCAACCATGGTGAATACAAAACAGTTACCAGAATAGTATCACCATGTAGCAA +ATGAGGGTCTGCAACAAAGGCATATTCCTAAATATTTATATGTGTACTAGTCAATAAACTTATATATTTT +CTCCCCATTGCAGCACAACTAAGGAACGTCAAGAGATACAGAATCCAAATTTTACCGCACCTGGTCAAGA +ATTTCTGTCTAAATCTCATTTGTATGAACATCTGACTTTGGAAAAATCTTCAAGCAATTTAGCAGTTTCA +GGACATCCATTTTATCAAGTTTCTGCTACAAGAAATGAAAAAATGAGACACTTGATTACTACAGGCAGAC +CAACCAAAGTCTTTGTTCCACCTTTTAAAACTAAATCACATTTTCACAGAGTTGAACAGTGTGTTAGGAA +TATTAACTTGGAGGAAAACAGACAAAAGCAAAACATTGATGGACATGGCTCTGATGATAGTAAAAATAAG +ATTAATGACAATGAGATTCATCAGTTTAACAAAAACAACTCCAATCAAGCAGTAGCTGTAACTTTCACAA +AGTGTGAAGAAGAACCTTTAGGTATTGTATGACAATTTGTGTGATGAATTTTTGCCTTTCAGTTAGATAT +TTCCGTTGTTAAATAATGTCCTGATGGTTTTCCCCCTTTGGTGGTGGTAATTTTAAAGCCCTTTTTAATG +TTTTAGATTTTCTAAATCCAAAGATTAGGTTTAAATTATTCTAATGTTTCTTTCAAAGATAACTTCTTGT +GGACTTGTTAAAAAAAATTAGACACACAATCTAGGACTGCTGTTACTGGAATATATTTTCTATCATGCTA +CTAATTTTCTTTTTAAAATGTGATAAAAATAGGGCCGGGCGTGGTGGCTCATGCCTGTAATCCCAGAACT +TTGGGAGACTAAGGCGGGCGGATCACCTGAGGTCAGGAGTTCAAGACCAGCCTGGCCAACATAGTGAAAC +CCTGTCTCTACTAAAAATACAAAATAAATAAATAAATAAATAAATAGCTGAGCGTGGTGGCAGGCACCTG +TAATCCCAGCTGCTTGGGAGGCTGAGGCAGGAGAATCGTTTGAACCCGGGAGGCAGAGGTTGCAGTGAGC +CGAGATCGCGCCATTGCACTCCAGCCTGGGCAACAAGAGTGAAAAACTCTGTCTCAAAAAGAGATAAAAA +TAGTAAAGATATTCATATTTATACAGCTTTACAAGTTGAAACATCCTTTCATTTATGAAGAATTAAAAGG +GGTACCCTTTTTAGAGAAAAGGAGAGCATGTAAACTTCGAGGAAATTGATATGTATAATTTTATAAAACA +GGGCTTGCGCTTTTTTTTTTTTGAGACAGAGTTTCGCTCTTGTTGCCCAGGCTGGAGTGCAATGGTGCAA +CCTCGGCTCACCGCAACCTCCTCCTCCCGAGTTCAAGTGATTCTCCTGCCTCAGCCTGCTGAATAGCTGG +GATTACAGGCATGTGCCACCACACCTGGCTACTTTTGTGTTTTTTTTACTTTTATATATTTTTTTTTTGT 
+TTAGTAGAGACAGGGTTTCTCCATTTTGGTCAGGCTGGTCTTGAACTCCCGACCTCAGATGATCTGCCCG +CCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACTGTGCCTGGCCAGGGGTTGTGCTTTTTAA +ATTTCAATTTTATTTTTGCTAAGTATTTATTCTTTGATAGATTTAATTACAAGTCTTCAGAATGCCAGAG +ATATACAGGATATGCGAATTAAGAAGAAACAAAGGCAACGCGTCTTTCCACAGCCAGGCAGTCTGTATCT +TGCAAAAACATCCACTCTGCCTCGAATCTCTCTGAAAGCAGCAGTAGGAGGCCAAGTTCCCTCTGCGTGT +TCTCATAAACAGGTATGTGTTTGTCTACAATACTGATGGCTTTTATGACAGAGTGTAATTTTATTTCATT +AACTAGTATCTACAAATGGCTTTGTTTAAAGAATGAACACATTAGTGCAGGAATGGATGAATGAAATCAT +CATATTTTCTAATTAGCCTGCAGTGGCAGCCTCTGGCCCCTTGCTAGGCCTGCCTCATCCTGCTAAAGTG +ATCTGTGCTTCCAAATTACTACTTCTTTTCCCCCTTCAAATCTTTCTTATTTTGTCATTGTAAATGCTCT +CAGCTAGGTGTTAAAGTAGTCTTACTGATATTCAAATGTGAATAACTGATAGCCCTGAACCTTCTATGAG +CTATTTATATTTTCCAAAGAGGATTCTCCTTAAGCCAATATTATCTAGGTAGAATTTTAGGCAATGGAGA +GGTGAAAATAATATTGATGACATTAATAGCTAACTTTGAGCATTTTCTAGGTGTAAGATGCTCTTCTAAG +CACTTCACATGCATTAGGTATATCTTGCTTAATCCTCACAGTCACCTTGAAAGAAAGGCACTGTTACTTT +GTTTCCATTTTGCAAATGAGAGAACTGAAGCATAGAGAGGGTTAAGTAACTGCCCCAAAGTCACTTAACT +AGTAAGTGGAAGTGCTATGATTCCAAAGCAAAGAGTCTGACTCCAGAGTCAAACTCTGAACAAACAAAAA +GACACTTTGGGTTAGATATCCTGGGGTGAAAGCAAGCACTTTGAAAGTAAGCCAAGCCTGTGTACAGATC +TGACCACCTGAGGTCACATTCCCTAAAATACTTAAACTTCTCCCTTTTGTTTCCCATCTAAGTTTTTGAA +CTTAAGAGATTTTGTAAAACATCACATTTTTTTATCCTCACAGTACCTTCCTATGGCAGATTTAGCAGGA +GGCGTATAAACGGGGTGGAAAAGGTACAGCAGACTGTGGAATGTATGGATCATTTATATTACATTAAAAT +TTTTAGTTTCTAGTAAATAACTTAAATGTTTTTGTAGTGAAGATTCTAGTAGTTAATGAAAATTTTTGGT +AAATTCAGTTTTGGTTTGTTATAATTGTTTTTATTGTGTGATACATGTTTACTTTAAATTGTTTTTCTTT +TTTGTGTGTGTTTATTTTGTGTAGCTGTATACGTATGGCGTTTCTAAACATTGCATAAAAATTAACAGCA +AAAATGCAGAGTCTTTTCAGTTTCACACTGAAGATTATTTTGGTAAGGAAAGTTTATGGACTGGAAAAGG +AATACAGTTGGCTGATGGTGGATGGCTCATACCCTCCAATGATGGAAAGGCTGGAAAAGAAGAATTTTAT +AGGTACTCTATGCAAAAAGATTGTGTGTTAACTTTTATGTATTCCCTCATCCCTCTTTCTTCTCTTAACT +GTCTCTCGAACTAAAAAGTTGGCTAGAAATCAAATTTTTATGCATTTAATTGTTTTAAGTGCATTATGGT +TAAGCATTCTGTAGAAGTCTTTTGAAAAGTGCTGTTTGTCCTGGGGTTTAATGAACTGGATTTTCTTGAT 
+TTGGGACATTTTTCTTAGGCATTTATAAATATAGCCCAATTTATAAAGTTAAATTTGGCCGGGTACAGTG +GCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTAGATCACCTGAGGTCAGGAGTTCGAGA +CCAGCCTGGCCAACGTGGCGAAACCCCATCTCTACTAAAAGTACAAGAACTATCTGGGCGTGGTGGCAGG +CACCTGTAATCCCGGCTACTCTGGAGGCTGAGGCAGGAGAATCGCTTGAACCTGGGAGGCAGAGGTTGCA +GTGAGCCAAGATTGAGCCACTGCACTCCAGGCTGGGCGATAAGAGTGAGACTCCATCTCAAAAAAAAAAA +AAAGAAAAAAGTTAAATTTGAGGGCCAGACATGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCT +GAGGTGGGCAGATCTCTTGAGCACAGGAGTTTGAGACCAGCCTGGGCAACATGGTGAAAACCCATCTCTA +CAAACAAATTAAAAAATTAGCCCAGCCAGGCGCGGTGGCTCACGCCTGTAGTCCCAACACTTCGGAAGGC +CAAGATGGGCCAATTACCTGAGGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCAGCTC +TACTAAAAATACAAAAATTAGCCAGGTATGGTGATGCATGCCCGTAATCCCAGCTACTCGGGAGGCTGAG +GCAGGAGAATCATTTGAGCCCAGTAGGTGGAGGTTGCAGTGAGCCAAGATCACGCCACTGCACTCCAGCC +TGGGCAACAGAGCAAGACCCTATTTCAAAAAAGGCCAGGTGCGGTGGCTCACACCTATAATCCCAGCACT +TTGGGAGGTTGAGGTGGGCAGATCACCTGAGGTCAGGAATTTGAAACCAGCCTGGCCAACATGGCAAAAC +CCCATCTCTACTAAAAATACAAAAATTAGCTGGACGTGGTGGCACGCGCCTGCAATCCCACTTACTTGGG +AGGCCGAGGCAGGAAAATCGTTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCAAAATTGCACCACTGCA +CTGCAGCCTGGGCAACAGTGAGACTCCATCTCAAGAAAAAAAAAAAAAAAAAAAAAGAAAGTTAAATTTG +AAATGGCCTTATGGTAGATTCCTCCCCCGACACACACTTACTTCATGTTTTCTTTCATTATATATTTTAA +TGGATACAAAATATAAATAAACACTAAAAGTTAAACAGAAATATTTGAATATCAATAATGCCAAATAACT +AGAAAATCTCAGAGCTCTAAAACAGCAACAATTTAGAAACTATATAACCTCTTTTTATTGTAGTTTTTAC +AGAAACATAATTTAAAGCTTTTTGTTATCAGAGATATATTACATTATGCCAGTGGCAAAAGATGGGATTT +ATTTCCTCAGCATCCTTATCTTTAAATTTCTGTACATCTTTCCAAAATTTATAGCTTTGGAAAAGTGATA +AAACTTTTTTTCCTGAATTTTGTTTTAACTTTTAAAAACAGAAATATTGTTTACATCTTGCGTATCTTAT +ATAACAAACATCTGCTTATAGATTCCAGTAAGAAAAGTTGGTTAAACGGTTGTATTATTTTCTCGTACTA +AATAGACTGCATAAGGTAGAAGTTAAGAATGATTGCCCTGTAGTCTAAGTGGAAATGTGGAGGCTTTCGT +TAGTTTTTTCTGATAATTCAGCAAATCTCTATTGAGCACTTGCTATGTGCCAGGTACTATTCTGGGTACT +AGGGATAATAAAGGAAAACAAAAAAGTCCCTGCCCTGATGAGTCATACATTCTATGTGGAAGGCATAGAA +AATATTGAAATATAAGTGAATTGTGTAGTATGTTAGAAGAAGATACATACTATAAAGATAGATAAAGTTG 
+GAAAGGTGGCAGAGAAAGTTGGGCAAGGAGATGCGATTTTTAATCTAATAAGTAGTTAGGAAGGCTTCAC +TGAGTCAGCTACATTTGATAAATGACCTAAAGTAAAAGGAGGGAGCATAGGACTATCCTAGCAAAAGACC +CCCAGCCTCTAAGAGGGGAGCATGCTTGAAGTATTTGAGGAACAGGAAGTTAGTGCAACTGGAGTAGAGT +GGGCAGGAGAAGAGTAGTAGTAGATGAGATACAAAAGGAAGACCTCATAGACCTTCGTAAGACCCTTACC +TTTTACTCTGCACGATTTTTACTGAATAAACCACTGGAAGGCTTAAAGGGTAACATGATCTGACTTTTTT +TTGAGACAGTCTCACTCTATTGCTCAGGCTGGAGTGCAGTGGTGCAATCTCGGCTCACTGCAGCCTCCAC +CTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCGTGTAGCTGGGACTACAGGCGCGCACCACCGTG +CCCGGCTGATTTTTGTGTTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGTTGGTCTCAAACTCCT +GACCTCAGGATCCACCCTCCTCGGCCTCCCAAAGTGCTGGAATTACAGGTGTGAGCCACTGTGTCCAGCC +TGATCTGACTTATTTTTGAAAAAATAATTCTGGCTGTTTGTTGAGGAGAGGGGCAAAGATGGACACACAG +AGACCACTTAAGCTATTGCAGAAATACATGTGAGAGGTGGTTGGTTGGACCAGGGAAGTGGCAGTGGAAT +TGGTGGAAAGCAGTTGGACTCTGGGGTATTTTGAAAGTGGCACTATTAGGAGTTGCTCAAGGATTAGATA +TAAAACGTGAGAGAGGAGGAGAATAAGAATGGCTGTGAAGATTTTGGCCTTAGCAGCTGGAAGGATAGAG +TTGTATGTAACTACTAGAATTGAGAAGACCAAAGATGGAGGGAAATGGAGAGTTTGGTTTTGGACATTTG +AAGCTTGAGATGTAATAGTAGACAGCCAAGTGGAGATGTTAGGTAGGCAGTTGGATATGGAAGTCTATAC +ACAGGTAAAGTATAGGCCAAATAAATCAATTCACAAGTCATCGGCATATAGATGGACTTGAAGGCCATGA +AAAAGAGACTGAGAAAGAGCAGCCAGAAAGTTAGGAATAAATGCAGAATGGGGTGTTGCATTCCAAATGA +AGATGGAATTTTAGGGAATAGAAAATGACCAGCTGTGGAAGCTGCTTCTAATAGGTAAAGTAAGATGAGG +ACTGAGATTGGCCGCTGGATTTAGCACTGCAGAAGACATTACTAATGTTATTAAAAATAGCTCAATAGAT +TGGTGGAGTGATATCCTGATTAGAATGCAATTAAAGAGCAGTTGAAGAGGAGGAATTGGAGACACAGAAT +ACAGTCGATTCTTTTGGGAGTTGCCAAAAAGAAGCAGAGAGAGGGACATTGCTTGGAGAGGAAGTAGGAT +CAAAGAGTCTTAGTTTTGGCTTGTTTTAAGGTAGAAAAAAACCGTTTCTTATGCTGATTCACATTGTTCA +GTAGAGAGGGAAAAAATTGATGATGCAGGAGAGAGAGGAGGCATTTCCTGATCGTTGGCCTTGTAGGCAG +CAAGGGGTAGGAGCTAGTGCACAAATGGTAGAAGAGGGCAAAGTGTAAGGATGCAGATGCTTGGAAGAGG +GCAAAGTGTAGGGATACAGATGCTGTGAGTGGATAGATATGAGGGTGGGAGCTTATGGAAGTTCTCTTTT +GATTACTTCTGTTGTCTTAGTGCTAAGAGTATGAATGAGAAAGGAGGAGTTAGAGATTTGAGAACAGAGG +GGACAGGATCAAAGAGAGCACCAAGACTAAGAAAGGCAGTTCTCAGGCGTGATTTCTAAAAAAATCTCTT 
+TCATAAGAAAAATAATCTAAAATATAATTATTTAAAATCAAGGATCTCATTTTTCAGGAACAAATATGAG +TTGAAATCATTCTGTTGACTGTTAAGTGGAATTTTTTGTTTTGTTTTTATATTTTGAGATAGGGTCTCAC +TCTTGTCCAGGCTGGAGTGCAGTGGCACTATCATGGCTCACTGCAGCCTCAACCTCCTGGGCTCAAGCAA +TCCTCCCACCACAGCCTCCTAAGTAGCTGGGACCACAGATGTGAGCTACCACTCTTGGCTGATTTTTTTT +ATTATTTTTTGTAGAGATGTGGGGGTCTCACTATGTTGCCTAGGCTGGTCTCAAACTTCTGGCCTCAAGC +AATCCTCCTGCCTCAGCTTCCCAAAATGCTGGGAGTATAGGCATGAGCCACCATGCTCAGCAATGAAGTT +TTTATCAGTATGATACTTTGATACATGTCAAATAATTTTCTGAAATTATATTGTAGATCATATGAACTCA +TAAAAACTTAATGATCTTGAACAATGTAGTTTTTGTACAGAGAATAGTTGTAGTTGTTGAATTCAGTATC +ATCCTATGTGGTTTTTATGATAATATTCTACTTTTATTTGTTCAGGGCTCTGTGTGACACTCCAGGTGTG +GATCCAAAGCTTATTTCTAGAATTTGGGTTTATAATCACTATAGATGGATCATATGGAAACTGGCAGCTA +TGGAATGTGCCTTTCCTAAGGAATTTGCTAATAGATGCCTAAGCCCAGAAAGGGTGCTTCTTCAACTAAA +ATACAGGCAAGTTTAAAGCATTACATTACGTAATCATATACGGCAGTATGGTTAAGGTTTCTGTGTAGTC +TGTGACTTCCATGTCAAAATGTTGCACAAGCCAGTTGTCAGTGACAGTTGCCATCCCACACTGCTGTTCT +CCTGTCATCCCTAGCCCCCATTTAAGAGAGATCACACATTCATGCATTGCTTGCTTCCCTCTTTCCCCAC +CCCCTCCTTAACCTCTTGATGTATGAGAAGAATATGAGTTACTAATTTGATCCACTATTTGGGGATTGCT +AATAAAGCATTTTTGCATTTTATTTTTTGCTTTTTAAAAATAATTGATATTTTAACAATATGAAACAATA +TATTCCTAGCTACAAAATTTTTAATTCTCAGTATTTCTTAGATAAATTCAGTTTTTATTCTCAGTTATTC +AGTGACTTGTTTAAACAGTGGAATTCTAGAGTCACACTTCCTAAAATATGCATTTTTGTTTTCACTTTTA +GATATGATACGGAAATTGATAGAAGCAGAAGATCGGCTATAAAAAAGATAATGGAAAGGGATGACACAGC +TGCAAAAACACTTGTTCTCTGTGTTTCTGACATAATTTCATTGAGCGCAAATATATCTGAAACTTCTAGC +AATAAAACTAGTAGTGCAGATACCCAAAAAGTGGCCATTATTGAACTTACAGATGGGTGGTATGCTGTTA +AGGCCCAGTTAGATCCTCCCCTCTTAGCTGTCTTAAAGAATGGCAGACTGACAGTTGGTCAGAAGATTAT +TCTTCATGGAGCAGAACTGGTGGGCTCTCCTGATGCCTGTACACCTCTTGAAGCCCCAGAATCTCTTATG +TTAAAGGTAAATTAATTTGCACTCTTGGTAAAAATCAGTCATTGATTCAGTTAAATTCTAGAAGTTTTAC +ATTTAAATTTTAAATGCTTACTAAGGATGCTCAATTTCTTAGATGTACTGATAATTTTAGTATAAAAAGC +ATATTCTTCAGACAGTTAAAGTTTTTGTGCAGTTTTTGGGAGGTCCAGAGATCTTTCTTGAGCTTAAATA +ATGCATTTCCAATTAAAAAGCAAAATAAATTTGCACCATTTGATTTTGGTATCTGTAGCTTGCTGCCCTC 
+TTGTTCTCATAGCTTTGCTTTGATCAGATCCCTATTCCACTCTGGATTAGAGAATTACATTTTAGTACTT +TTCAAATATGTAATAGATACACTTTTTATCTCTATGTAGATTTTAAACTACATAACAGGACTCTTTGTCA +TATTGAATGGTCTGCAGTATTGCTATCTGAAATTACCGATAATATTGTACATTCAGATTCACTTAAGAGG +TAACCTTGCAGAGAATTTACTTCTGTGGTATTCTGGATCACTCTAAAGAGAATGTTTTATAAATTAAACA +TTTTTAAGGTAAAGATATATTTTGTTTGGCATTAGTTCCATGTTGGATTGATTGCTTTTTACTGAAAGCA +TTCCATCAAGCTGAAACAGTCTTTTGTTTTATGTTGCTTAGAACATCAAGCTTGCAGTGGCTTTCATTTT +CTTGTTTTTGTTTTTTTTTAAATCAAATCAATGCATGTGCATAATTTGGAAACTCAAGTAATATAAGACA +TATAACCAAAAAAAAGCAGTTGCTAGCACAACCCTCCCCATCCTCATTCATGGTCCCAAGAGGCAATCGT +TTCCAGTCTGTTTGACTATTTTTTAGCTTGTACTGCTGTCTTCTTTCTCTCTTCAGTTTAGACAGCAACC +GTTAACTTCCTGCTATGGAAAATGAGGATTTCATTGTCTTACTCCAACACTGCTACATATTTCTCTACCC +TCTTCCCCATCTTCCCTCTACTAATACAGCATGATTTTTAGTGAAATCTGTATTCAGTGTATACATTATA +ATGACTGAAAAATATTTTTTGCAATTTAATACCTAATCGACTGTGATCACATTTCCTTTCTTTACAGCTT +ACTGCTTTCCATGAAGTTATTAACTGTCCTTTTCATTTCTCTTAATTTTCTGTGTCCTTACACAGGAGCT +ATAAAAATCCTGCCCAATATGTTAAAATTTCTCCATTGGTTCCATTTCTGTTTCTTGGAGAACCCCTGGA +CCTCTTCATATGCTTACTCCATTCTGAAGTGCTCATTTTCTATGTGTGCCATACAGCTCTCATTCTGTAA +ATTATTCTTACCTCTTTTCTGTGTTAGAAACCCCATTTCCTGGTTTCCATGTCATCTTTCTAGAGTTTTT +CCCCCAATTTTAAGGGCCCACAACTCTCAGAGGCTTGTTGGGAAAAGGAGCATGCAAGCTTTTTGATATC +TTAGGGTAAGGTTGGCTGGATATAGAATTCTAGGTTGAAATAATTTCTTCAGAATTTTTAAGACATTTCA +TTTATCTTCTTTAGCATAGCTCCTGCTAATTGCAGTGCATTCTTTTTTTTTTTTTTTTTTTTTTTTTTTT +GAGACAACGTCTCACTGTGTTACATAGGCTGGAGTGTTTCATTTTGTACAGATGAGGTCTCCCTGTGTTG +CCTAGGCTGGTCTTGAACTTCTGGGCTCAAACGGTCCTCACGCTTTAGCCTCCGAGAGTGCTGGGATTAC +AGACATGAACCACTGTACCTGGCCTGCGGTGCTGTTTTTATCTTCAGTCATTCATTATGTGCTCTGGTTT +TAAGAAGTTAAGATTGTTTTGTCTTTATTTTTATTTATTTATTTATTTTTGAGACAAAGTCTCGCTCTTG +TTGCCCAGGCTGGAGTGCAATGGCGTGACCTCTGCTCACTGCAACCTCCGCCTCTCGGGTTCAAGTGATT +CTCCTGTCTCAGCCTCCCGAGTAGCTGGGATTACAGGTACCTGCCACCACACCCAGCCAAATTTTGTATT +TTTAGTAGAGACAGGGTTTCACCATATTGGCCAGGCTGGTCTCATCCTGACCTCAGGTGATCCACCTGCC +TCAGCCTCCCAAAGTGTTGGGATTACGGGCATGAGCCACCACGCCCAGCCTGTTTTGTTGTTATTTAAAT 
+TTCACAGTAATGTACTTTGGTGTCTTTTTTTTTTTTTTTTTTTTTAACTTTTTTTTTTCCTTTAATTCTT +GGCCCAGATGCTTACTGGCACTTACTCTAGAAACACATAGTCCTTCATTGAGTTCTGGAAAATTTTCTTT +AAGTGGTTCTTTTATAATTTCCCTCCATTTTTTTCTCTAGTTTTTTCTGGAACTCCTGATGTTTGGACAC +TAGGCCTCCTATATTGTTCCTATAATTTTCTTGTCTTTTCTCCTACTTTCTATTTGATTGTCTTTTTTTA +TTCTAACTTCTGGGATGGTTTTTAATTTTTTTCTTCTAATTCTTTATTAATATGCCATGTTTTATAGATT +TGTCTTTAAATATTTTTACATAATTTTGTAACAAAATACAAAAAAAGCAATTCCTAAAAATTTAGAAAGT +CAAATGAAAGCAAAGAACTTAAGTGTGTTTTCAATTAGAGCATAATCATACCAAGAAAGTATTTCAAGTA +ACTTAAAAAATGTTTTATGTCCCTAGTGGTATATACCCCAAGAACAACAATAGCAACAACAACTATAAAA +TGAAACAAAATCTTAAGCTATTGTTAGTAATCATATTGCTGGTGGTAGTGTTGGTATTCCTATTCTGAAG +TTATAGTGGATGTGAAGTATGTTGTATGTGTATACTCTTTTACGTATATTTGTTGTATGTGAGTAATTAT +ATGATTATAGAGAACAGGGATCTTTTTATCAGAGAAAGGTGCAGATGTGAGATTGAAGTAAAAGAAAACT +TGTGGTTCTGCATTTGTATTGGAAATATCATTATGAACTCGAGATCTATATTATCTTTAAAAAATACATG +CTGGCTGGGCACAGTGGCTCACACCTATAATCCCAGCACTTTGAGAGGCCAAGGTGAATGGATCACTTGA +GGTCAGGAGTTCAAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATATAAAACTTAGC +CAGGCATGATGGCATGCTCCTGTAGTCCCAGCTACTGTGGAGGCTGAGGTGGGAGAATCACTTGAACCTG +GGACGTAGAGGTTGCAGTGAGCCAAGGTCGCACCACTGCACTCCAGCCTGGGCGACAAAGTGAGACCCTG +TCTCAAAGAAAAAAAAAAATATGCTTATTGGTTTCATCTATCAAAAAGAACAAAAAAAAAAAAAAAAGGA +ATAATCCAGTAGTAGTGAGTAACCCTAGCTCTCAGACTGTTTTCTAAGTACTGTTTTCTCTTAAAAGGAT +GCAAGGTGTCTTGAAGAAATGGCTGATTCCAGATTCAGAGGAGGAAATATATGAATCTGGAAAGTCTTGA +CATACCAGAAAAAAACTAAGCATCAAACACTACTAGTGTCATGTCAAAAGGACTTAGGAGTGTAATAGAA +TAGATTCTTACTGATCACAGATAAAATAATTTGAGCATCAGAAAGGATAATAACAGGCTGGGCACACTGG +CCCACACCTGTAATCCCAAGATTTTGGGAGGCCGAGGCAGGCAGATCACTTGGGGTCAGGAGTTTGAGAC +CAGCCTGGCCAACATGGTGAAACCCCTTCTCTACTAAAAAATACAAAAATTAGGTAGGCCTGGGGGCGGG +TGTCTGTAATCCCAGCTATTTGGGAGGCTGAGGCAGGGAGAATTGCTTGAACCCAGGAAGCGGAGGTTGC +AGTGAGCCGAGATTGTGCCACTGCATTCTAGCCTGGGTGACAGAGCGAGACTCCATCTCAAAGAAAAAAA +AAAGGATAATAACAGCAAAAAATTGAAATTCATAACAAATGATAACTTCTATTCTCATTGTTTTAAAAAC +TAAAGCCCAGGCACAGTGTCTCACGCGTGTAATTGCAGTACTTTGGGAGGCTGAGGTGGGCATACTCAGG 
+AGTATGAGACCAGCCTGGGCAACATGACAAAACCCCATCTCTACAAAAAATACAAAAATTAACCAGGTGT +GGTGGCATGTGCCTATAGTCCCAGCTACTTGAGAGGCCGTGGTGGGAGGATGACCTGAGCCCAGGAGGCA +GAGGTTGCAGCGAGTTGAGATCGTGCTACTGCACTTCAGCCTGGGTGACAGAGCCAGAGCCAGACCCAGC +CTCAGAAAAACAACAAAAACTAGGTAAAAGGAGAAAAAAATCAAGCATTTATCTTTCTTCTCTTATATGA +TCTATATTTTGGGACCCTCAAGTAGATGAGGGGAAGTTTCTGTTTATAAAAATGTTTCAACAAATAAGGA +ATAATAGAATTAAAATATAACCATTTCGCAACCCTCAAATTAGGGTTGTCTTTTCTTCTAATTCTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGATGGAGTCTCGCACTGTCGCCCAGGCTGGAGTGCAGT +GGCACGATCTTGGCTCACTGCAACCTCTGCCTCCTGGATTCAAGCGATTCTCCTGCCTCAGCCTCCTAAG +TAGCTGGGATTATAGGCACCCACCACCACACCTGGCTAATTTTTTTGTATTTTTAGTAGAGACAGGGCTT +CACTATGTTGGCTACGCTGGTGTTGAACTCCTGACCTCGTGATCTGCCCGCCTTGGCCTCCCAAAGTGCT +GGGATTACAGGCTTGAGCCTGTAAATCCAGAAAAGGATTACAGCCCTTTTCTTCTAGTTCTTAAAGTGAA +TTTGTGTCTACCAACATAATTTTCAAGAGACCTTTATTATTCCCTAAATGTTTTTCTTTTTTTTGTGACA +TTCAGTTCTTGTTTCATAGATACAGTATCTTCTCATCTTTCTAAAGCTAATGATTAATGTTTTATTTTCA +TTTTCGTTTTTTCTGCTCCCTGCATTTTTTTTTTTTAATGAAAACCTTTGTCTATTTGAGTCTCTCTGGT +TAGAGACTTTCCTCAATGGTGATCATTCACTGATCCAGAAGTTGTATGTGAGGTGAGGCTTGTCAGCTCA +TAGGGTAGTAATGTAGTGATTTAGTTTTTAACTAGGAGACCCTCAAATATCAGTGACTGTTCTGAGAGCT +GAGCAGAGTAAGGAAATTAATAGGGAGACTCATTGTCAGTGTAAGAATTTTATTTCAGTTTGTTGTTTGT +TGTTTGTTTTTTGTTTTTTTGTTTTTTTTTTGGTTTTTTTTGTTTTTTTTTTAGATGGAGTCTTGCTCTG +TTGCCCAGGCTAGAGTGCAGTGGTGCGATCTCGGCTCACTGCAACCTCCACCTCCCAGGTTCGAGGAATT +CTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCTGCCACCACCCCTGGCTAATTTTTTATTT +TTAGTAGAGACGGGGTTTCACCATCTTGGCCAGGCTGGTCTCCAACTCCTGACCTTGTGATCCACCCGCC +TCGGCCTCCCAAAGTGTTGGGATTACAGGCGTGAGCCACAGCGCCCGGCCCAGTTTTGTTTTCAATAATA +GCATCTCACCCCCACCCTGGCTGCGTCTGCTCAGTATTCCAGAGTCCACGATATGTATGGTTCAGCCCTC +CAGAAAATAAAGTCTCCTGCATTTTTTTTTTTAATGCTGCAGCAAGAATAGGGTCCTGGGTTCTTTTTTA +GTATGAGGGAGAGACAGCCACCTGGCTACTTGGGATAGGAAAGAAAATCTGGTGTTTCAACTACGTTTGT +ACAAAATGTCAACCATTTCTTCCTATTTTCAGCCCCACCATATGCTCCTGCCTTCACAGGTACCTGTTGC +CTCCAATTTCTGAGTGTTTTCCTTTAATTATTTTGTTTCATGTTAACTCCTTACAACAAGTTTGGTGGCT 
+GAATAACCTTGGGCAAGTTGTGTAGTTTCTCATATACTTTAGTTTTATCGTTGTCTGTAAAATGGAGATG +AGTCTTCAGATTATTGTGAAGATAATTTGTTTGTTTGTTTTTTGGAGACGGAGGCACGCTCTGTCCCCCA +GGCTGGAGTGCAGTGGCACAATCTCGGCTCACTGCACCCTCTGCCTCCCAGGTTCAAGCAGTTCTCCTGC +CTCAGCCTCCCGAGTAGCTGGGATTACAGGCGCCCACCACCTCACCTGGCTACTTTTTTGGTTTTTAGTA +GAGACGGGGTTTCACCATGTTGGCCCGGGTGGTCTCGAACTCCTGACCTCAGGTGATACGCCTGCCTCGG +CCTCCCAAAGTGTTGGGATTACAGGCATGAGCCACCTTGCCTGGCCAAAGATTAATTGTTAATATACATA +AAGCGCTTAACACCATGCCAGGTACCTTAGTAAGTGTTCGATGAATATTTGCTTTTTGTATTAGCCATAA +TCATTCTCAGGCTGCTTTGTCATTTACTTGTTCCACAAATTCTTAGCTTCCAAAATTTTGGTGATACCTC +ATTTCCTATTCTCTCTAGTTGCCTTTGTCCATGTAGATTTTTTGAGGAAGCTTGGGTAAATAAGTGTATT +TTAAACTATTATGTTTAAATCGAAGTTCCTTTTATCTGTTTTCTAATAGAAACATTTAAATAGCATTAAG +AACTTGTAGCAGTATAAACAATATGTTTGAGAAGTACTATATTGTGAAAATATTTTCACTTTTATACAGT +TTTTTACTTATTTACTGTCTTACTAATCTTCCTAAGACTTTTTAAAGTGAATATTTTTAAGGCAGTTCTA +GAAGAATGAAAACTCTTATGATATCTGTAATAGAATTGAATACATATTTAACTACTAAATCAATATATTT +ATTAATTTGTCCAGATTTCTGCTAACAGTACTCGGCCTGCTCGCTGGTATACCAAACTTGGATTCTTTCC +TGACCCTAGACCTTTTCCTCTGCCCTTATCATCGCTTTTCAGTGATGGAGGAAATGTTGGTTGTGTTGAT +GTAATTATTCAAAGAGCATACCCTATACAGGTATGATGTATTCTTGAAACTTACCATATATTTCTTTCTT +TTGATACAATTAATTTGTTTGTTTGTTTGAGATGGAGTTTCGGTCTCTTGCCCAGGCTGGAGTGCAATGG +CGTGATCTTGGTTCACTGCAGCCTCCACCTCCCGGGTTCAAGTGATTCTCCTGCCTCAGCCTCTCAAGTA +GCTGAGCCACCACACCTGGCTAATTTTGTATTTTTGGTAGAGAAGGGGTTTCATCATGTTGGTCAGGCTG +ATCTCGAACTCCTGACCTCAGGTGATCCACTAATCTCAGCCTCCCAAAGTTCTGGGATTACAGATGTGAG +CCACTGTGCCTGGCCTGATACAATTAACTTGAATGTTATATATGTGACTTTTTTGGTGTGTGTAACACAT +TATTACAGTGGATGGAGAAGACATCATCTGGATTATACATATTTCGCAATGAAAGAGAGGAAGAAAAGGA +AGCAGCAAAATATGTGGAGGCCCAACAAAAGAGACTAGAAGCCTTATTCACTAAAATTCAGGAGGAATTT +GAAGAACATGAAGGTAAAATTAGTTATATGGTACACATTGTTATTTCTAATATGAGAACAAAGTCTTAGA +GACTTTGAATTTAACATTTTTAATGAGTAAATTGTTTTTATTTTGAGTAGTAAATTGACTTTATTTTTTA +GTATCTAGGGTATTCTTTTTTGGTGTTAGACAAAGAATAGCAACAAGGGACAGAAATATCAGGTCTAAGC +CATTTGTAATATTTTTCCTGAATTCTTACCTATATGATGTGGCTTTTGCATTTTTGTCATGGTAGTTATT 
+AGCTTTCATGTGTTATTATGCCTGGAACTAGGACCTATTGTGGTGTCAATTTTAATATTAAAAATCATGG +TGTTTTGATGTTTATATGACATAAATTTTATTTTTTCGTATCTCCCTTTTGTTGTTGCTGAAGATTTTAT +GTTTTTCTGCATTTCCTCATGATTTATATAGATGTAACATGTTCTATAGGACATGTAATTTACATGTCCT +ATAGAACTATAAGTTACATGTCCTATAGAACTTACAGTTCTATAGTTATCTGCAGAAATATTGCTCCTTA +TGCTTTATTTGCTTAAAATTATCACTAGATCATACTATTTTCATAAATAAATGAATATGAAATCATTCAC +AGGCATACCTCAGAGATACTGTGGATTTGATTCTAGACCACCGCAATAAAGCAAATATTACAGTAGAGCA +AATCACACGAATATTTTGGTTTCCCAGAGCATACAAAAGTAATGTTTACACTATAGCATAATCTCTTAAA +TGTGTAGTAGCATTGTATCTAAAAAAAACAATGCACATACCTTAATACACTTTATTGCTAAAAAATGCCA +ATGATCATCTGAGCCTTCAGTGAGTTGTAATATTTTTGCTGGTGGAGGATCTTTCCTCAATGTTGATGCC +TGCTGAGTGATCAGAGTGGTAGTTGGTGAAGGTTGGGGCAGTTGTGGCAATTTCTTAAAATAAGACAATG +GCATTTGCAACATTGATTGGCTTTTCCTTTCATGAAAGATTTCTCTGTAGCATGCAATGCTGTTTGATAG +CATTTTATCCATGGTAGAACTGCTTTCATAATTGGAGTCAATTCTATCAAACTCTGCTTTATCAGAATAT +TATGTAATATTCTAAATCCTTTGTTGTCATTTCAACAATATTCACAGCACCTTCGCCAGGACTAGATTCC +CTCTCAAGAAACTACTTTCTTTGCTTATCCATAAGAAGCAGCTCTGTATTAATCTGTTCCCACACTGCTA +TAAAGAATACCTGAGACTGGGTAATTTCTAAAGGAAAGAAGCTTAATTGACTTACAGTTCCACATGGCTG +AGGAGGCCTCAGGAAACTTACAATCATGGCGGAAGGCAAAGGCAAAGCAAGTACCCTTTTCATAAGGTGG +CAGAAGAGAGAGTGCAGGGGAAACTGCCACTTGTAAGCCATCAGATCTCATAAGAACTCCCTCACTAGCA +CAAGAATAGCATGGGGGAAACCACCCCCATGATCCAATCACCTCCCACCAGGTCTCTCCCTCAACACATG +GGGATTACAATTTGAGATGAGATTTGGGTAGGGACACAGAGCCAAACCATATCATTCTGCCCTGGACCCT +CCCAAATCTCGTGTCCTTTTCACATTTCAAAACCAATCATGCCTTCCTAACAGTCTCCGAAAGTCTTAAC +TAATTCCAGCATTAACTCAAAAGTTCAAGTCCAAAGTCTTCATCTGAGACAAGACAAATCTCTTCCACCT +ATGAGCCTGTAAAATCAAAAGCAAATTCTTTACTTCCAAGATACAATGGGGGTACAGGCATTGGGTAAAT +GTTCCCATCTCAAGTGGGAGAAATTGGCCAAAACAAAGGGGCCACAGTCCCCATGCAAGTCCGAAACCTA +GCCAGGCAATCAATGAATCTTTTTTTTTTTTTTGAGACAGGGTCTTGCTCTGTTGTCCAGGCTAGAGTGC +AGTGGAGAGATACTGGCTCACTGCAACCTCCGCCTCCTGTTTCAAGCAATTCTCATGCCTCAGCCTCCCA +AGTAGCTGGGATTACAGGTGTGCACCACCATGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTT +CACCATGTTGGCCAGGCTAGTCTTAAAACTCCTAGCCTCGCTGGGTGGGGTGTCTCATTCCTGTAATCCC 
+AGCACTTTGGGAGGCTGAAGTGGGCAGATCACAAGGTCAGGAGTTGAAGACCAGCCTGGCCAACATGGTG +AAACCCTGTTTCTATGAAAAATTCAAAAATTAGCTGGGCGTGGTGGCACGCGTCTGTAATCCCAGCTATT +CCAGAGGCTGAGGCAGGAGAATTGCTTGAATCCAGGAGGCAGAGATTGCAGTAAGCCAAAATCACACCAC +TGCACTCTAGCCTGGGCAACAAAGCAAGACTCTGTCTCAAAAAAATAAATAAAAAAAAATAATAAAAATA +ACTCCTAGCCTCAAGTGAGCCACTGCACCCAGGCCAGTCATTAAAGCTCTAAAATCTCCTTTGACTCCAT +GTCTCACATCCAGGGCATGCTGATATAAGGGGTGGGCTCCCACGGCCTTGGGCAGCTCTGCCCTTTGGCT +CTTCAGTCTACAGCCCCTGCAGCTGCTTTCATGGGCTGCCATTGAGTGACAGCTGCACAGTGCAGGCTGT +CAGTGGGGGATGATGGCCCTCTTCTCACAGCTCCACTAGGCAGTTCCCCACTGAGGACTGTGGGAGGTGG +CTCCAACCCCGTATTTCCCTCACAGTTTCCCTCCCGGTTTCCCTCCTGTACTGCACTAACAGAGGTTCTC +TATGAGGGCTCTGCCCCCACAGCAGACTTGTGTGTGGACATCCTGGCACTTCCATACATCCTCTGAAATC +TAGGCAGAGGCTCCCACAGCTGAACTCTTGTCTTCTACATACCCACAGGCCCAGCATCACATGGAGGCCA +CCAAGGCTTAGGGCTTGCACCCTCTCAAGCAATGGCCTGATCTGTACCTTGGCCCTTTTTATCAATGGCT +GGAGCTGGAGCAAGTGGGACACAGGTTGCCATGTCCCATGGCTGCACAGAGCAGTGGGGCCTTGGGCCCA +GCCCACAAAACCATTTTTCCCTCCTAGGCCTCAAGGCCTGTGATAGAAGGGTCTACTGTGGAGATCTCTG +ACATGCCTTGGAGACACTGTCTCCATTGCCTTGGCTATTAACGTTTGTTTCCTTGTTACTTATGCAAACT +TCTGCAGCTGGCTTGAATTCTTTCCCGGGAAATGGATTTTTCTTTTCTACTTCATGGTTAGGCTGCAGAT +TTTCCAAACTTGCATGCTCTGCTTCCCTTTTAAATATAGGTTCCAATTTCAAACCATCTCTTTGTGAACA +TGTATGACTGTGTTTCTAGAAAAAGCCACATCACACCTTGAACACTCTGCTGCTTAGAAATTTCTTCCAC +AGGATACCCTAAATCATCTCTCTGAAGTTCAACATTCCACGGATCTCTAGCGCAGGGGCAAAATGCCACT +AGTCTCTTTGCTCTAAAGCATAGCAAGAGTGACCTTTGCTCCAGTTCCCAATAAGATCCTCATCTCCATC +TGAGACCACCTCAGGCTGGACTTCACTGGCCACATCACTGTCAGAATTTTGGTCAAAACCATTCAACAAG +TCTCTAGGGAGTTCCAGACTTTCCCACATCTTCCTGTCTTCTTCTGAGCCCTCCAAACTCTTCCAATCTC +TGCCTGTTGTCTAGTTCCAAAGTCACTTCCACATTTTCAGGTTATCGGTATAGCAGTCCCCCACTCCTGG +TAACAATTATCTGTATTAGTTCATCCTCATACTGCTATAAAAAATACCTGAGACTGGGTAATTTGTAAAG +GAAAAAGGTTTAATTGATTCACAGTTCCACATGGCTGGGGAGGCCTCAGGAAACTTACAATCATAGTGGA +AGATGAAGGAGAAGCAAGTACCTTCTTCACAAGGTGGCAGGAAAGAGAGTACAGGGAGAACTCTCACTTT +TAAGCCATCAGATCTTGTGAGAACTCCCTCACTATCATGGAACAGCATGGGAGAAACTGCCCCCGTGATC 
+CAGTTACCTCCCACCAGGTCCGTCCCTCGACACGTGAGGATTACCGTTCAAGATGAGATTTGGGTGAGGA +CACAGAGCCAAATCGTATGAAGCTCCTTATCATTTAAGTTTTATTGTGACATTGCAGTAATTCAGTCACA +TCTTCAGGCTTTACTTCTAATTCTAGTTCTCTTGCTATTTTCACCATATATGCAGTTACTTCCTCCACTG +AAGTCTTGAACCCCCCAAAGTCATCCATGAGGGTTGGAATCAACTTCTGGTAAACTTGTTAATGTTGATA +TTTTGACTTCCCATGAAACACTAGTGTTCTTAATGGTATTTAGAATGGTGAATCCTTTCTGGAAGGTTTT +CAATTCACTTTACCCATATCTATCAGAGAAATCACTATGGCAGCTATTCTTTATAAGACATGTTTCTTTT +TTTTTTTTTTGAGATGGAGTTTCGCACTTGTTGCCCAGGCTGGAGTGCAATGGCGTGATCTTGGCTCACA +GCAACCTCTGCCTCCCAGGTTCAAGCAATTCTCCTGCCTCAGCCTCCGGAGTAGCTAGGATTACAGGCAT +GTGCCACCACGCCCGGCTAATTTTGTATTTTTAGTAGAGACGGGGTTTCACCATGTTGGTCAGGCTGGTC +ACGAACTCCGGACCTCAGGTGATCCACCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGTGTAAGCCA +CCGTACCCGGCAAGACATGTTTCTTAAATAATAAGACTTGAAAGTTGAAATGACTCCATGATCCATGGGC +CACAAAATGGATATTGTGTTAGGAGTCATGAAAATAACATTAATCACTCTATACATCTCTGTCAGAGCTC +TGTGACAGAGACATGAAGTGAGCACATACTGTTGCGAAAATGGCGCCAGTAGGCTTGCTCAACATAGTTT +CCACAAACCTTCAATTTGTGTTTTTAAAAAAATGCAGTATCTATGAAACTCAGTGAAGTGAAATACATTA +AAACAAATATACCTATGTTAACTCACATATTACTGTAATTAAACTCTGTATGACTTTTTTTTTTTTTAAA +CATGAGTACACTGGTTTCAAAATTTCCTGGAAAACTTATAGCAGGCCAGGTGTCATGGGTCACATCTGTA +ATCCCAACAGTTTGGGAGTCCAAGGTGGTGGATCACTTGAGGTCAGGAGTTTGAGACCAGCCTGGCCAAT +ATGGTGAAACTCCGTCTCTACCAAAAATGCAAAAATTAACCGGGCATGTTGGATGTGCCTCTAATCCCAG +CTACTCGGGAGGCTGAGGCAGGAGAACCACTTGAACCCAGGAGACAGAGGTTGCAATGAGCCGAGATCAC +ACCACTGCACTCCCAGATTGGGTGACAGAGTGAGACCCTGTCTCAAAAAAAAAAAAAAGAAAAAACTTTT +AGCAGTTATATAGTTTCTTATCTTTAAATCTCCCTTCTTTGGGTGTTTTATGCTTGGTTCTTTAGTTTTA +GTTGCTTTTGAATTTACAGTTTAGTGAATTAATAATCCTTTTGTTTTCTTAGAAAACACAACAAAACCAT +ATTTACCATCACGTGCACTAACAAGACAGCAAGTTCGTGCTTTGCAAGATGGTGCAGAGCTTTATGAAGC +AGTGAAGAATGCAGCAGACCCAGCTTACCTTGAGGTGAGAGAGTAAGAGGACATATAATGAGGCTTGATG +ATTATTCAAGGTGAGAAGCTGTTTTAGACTCTCTGGCCATCACAGGAAGGAGTATGTTGAAATGCTGCAT +TTCTCAAAAGGGATGTGTACATTTCTGGGATTTTCAGTGATGTGCCAGACGAGTGTGGTGGTATGTTTTC +AACTATATACCGAGTAGAGGATGGGAGGGTTCTAGAATTTTATATATTAATTAAATTTGGTTTAAAATGC 
+AGGCAAAACTTGTTTTATTTTTGTCCCTCCTGTACTCTGAAGCAAAAAAACTTTTTTATTTTTAAGATAA +AACAAATATCTTCAAAGTAATGGCTTAGTTTCCATGTTCTTAGCTGTTTCTCAAGTCCTTCCTGGAGTGT +ACTTGATAATCCTCTACCCTAAGGGTACTTGGGTAGAAATGTTTCCGAAGCACTAAACTGTTAGAAGTAG +CATAGGCTTTAGAATCGTGGCACTCTCATTTTATTAGCAAAGTAAATGACAATAAAATAGCTGGCCAGGC +GCGGTGGCTCACGCCTATAATCCCAGCACTTTGGGAGACCGAGGCAGAAGGATCACCTGAGGTCGGGAGT +TCGTGACCAGCCTGGCCAACGTGGTGAAACCCCGTCCCTATTAAAAATACAAAATTAGCCAGGCGTGGTG +CACATGCCTGTAATCCCAGCTGTTCGGGAGGCTCAGGCAGGAGAATCGCTTGAATCCAGGAGGCAGAGGT +TGCAGTGAGCCAAGGTCATGCCATTGCACTCCAGCCTAGGCAACAAGAGCAAAACTCCGTCTCAAAAAAA +AAAAAAAAAAGCTATTAAATGGGCGTAAAATGTTGTTTTAGGATCAAATAAATAATCTATATAAAAGTTC +CATATAAATGTTAGTTACTATTATTAGAACATAATTTTATATATTAAACTACCTCCTAAATTTTTAGACA +GGTAGATAGCTAAAAAAAAATTCAAATTCTAAGATTAGTTTGTTAGGGAGGAAGGAGCAAATATTTTACC +AAAACTACTTGTTTTTAATTGATTAATTTCATTCACTTGATGACTTAGTAAATCTTGTGAATATAGCCTT +AAATTTCTTAAATAGTGGGACTACAAAATAAACAATATTTCATCAGTAATGTAAGCAGTGCTATACTGAG +TAGAATTCCCTCCTGTTCCGAAATGTTACAATTTGGGTTCTCCCTGTGAGAAGTGAGTCCGGTTTTAAAA +CCTGTGAGTATACTTGCTGCAGGTCTGAAAATGAAGGCTTTATGATTCTTTCTTGAAAAATTATTTGCCT +CTATCTTTTATAATATTATTTGTTGAAGCTTGTGCATTCTATGAATCATCATGAAGATAGCTTTAATTTC +ATCCACAAAATTTAACAATATTTTTTTGTCTGGACATAAGGGGGCAGAATAAGAGTTGGAGTAGGGCCTT +GCCCAGCCACTCTGTAACTGGACAAGTGATGTATTTATTTCTTAGGACCTCATTTCCACCTTCTATCAAG +GGAAAACCTAAGAGTAGGTTATCTTTAGGGTTCTAAGTGCCTATGAGTCTATGAGATTTGACTTTATTAA +AGTTATCTTTGTAATTCTTTGAGGAGAACGTAGGCATCCATTTTTAAAACAGTCCTGTTAGAATTTGTTT +TCAGTAACAATGTTGAATGATGGCCTTTTGAAATCAGGTTTTACAACAAAATTGTTTAAACACTGCCTGC +ATATTTAGAATCTCTATACCTATATTAAGATACAGAGATTGGATAGTCTCCCTTTTCAGTATAGATAATC +TCCCTTCCCGGTAGAGATTAGGATATTAAGATGTATAATATCCTAAAGTGTAGCAGCAGTCTGGTATGTT +ACATGTCTAAATTCCATTTCCTATTTTATTTGTTTATTGATTTATCTGTTTATTTTTGTTACACTGGGTA +AGATTCCCAAGAGGTACAAGTAGAAATTTGCTAAAGTGAGTAGGACAGAAGTGTAGAGGCAAACATAAAA +GTATGTTTAGTACATATCTGTTTTAAATTGTATCTACTATTTCAAAGTTAATGGAATTATACTCCTGGGG +CTAAGAATGAGGGTTCTAGGGCCAACCTCTACTACCTATGTGGCTTGTGCAAATTAGTTGTCCCCTTTGT 
+GCCTCAGTTTTACCTACAACACAGAAACAATGATATTACCTACCCCATGGACTGTTGTGAAGATTAAATG +AATTAGTACATTTACTACACATAGATCTATTTCTCAAAATAATGAGCATTCAGATATTAGCCATCTGTAA +TGTAGTTGGTGATGATTATGATTATTAGAGTACATTTATAATTGGAGGATCATTTTTGCCGTAGGGAAAT +AGAATTATTAATAGTTTGAGGCACCTGAGAATATTATGTGAGAAACTGATTACATTAACCACACCCTTAA +GATGAGCTCTAATTTTGTTGTATTTGTCCTGTTTAAAGCCATCTAGTTACAATAGATGGAACTTTTTTGT +TCTGATTGCTTTTTATTCCAATATCTTAAATGGTCACAGGGTTATTTCAGTGAAGAGCAGTTAAGAGCCT +TGAATAATCACAGGCAAATGTTGAATGATAAGAAACAAGCTCAGATCCAGTTGGAAATTAGGAAGGCCAT +GGAATCTGCTGAACAAAAGGAACAAGGTTTATCAAGGGATGTCACAACCGTGTGGAAGTTGCGTATTGTA +AGCTATTCAAAAAAAGAAAAAGATTCAGGTAAGTATGTAAATGCTTTGTTTTTATCAGTTTTATTAACTT +AAAAAATGACCTTACTAACAAAATGATTATAAATCCAGATAAAGTATAAAGTTAGTTTATATCAGAGAAG +CAAAATCCACTACTAATGCCCACAAAGAGATAATATAAAAGAGGATCTGTATTTATTTTGAAACAAACAT +TTAAATGATAATCACTTCTTCCATTGCATCTTTCTCATCTTTCTCCAAACAGTTATACTGAGTATTTGGC +GTCCATCATCAGATTTATATTCTCTGTTAACAGAAGGAAAGAGATACAGAATTTATCATCTTGCAACTTC +AAAATCTAAAAGTAAATCTGAAAGAGCTAACATACAGTTAGCAGCGACAAAAAAAACTCAGTATCAACAA +CTACCGGTACAAACCTTTCATTGTAATTTTTCAGTTTTGATAAGTGCTTGTTAGTTTATGGAATCTCCAT +ATGTTGAATTTTTGTTTTGTTTTCTGTAGGTTTCAGATGAAATTTTATTTCAGATTTACCAGCCACGGGA +GCCCCTTCACTTCAGCAAATTTTTAGATCCAGACTTTCAGCCATCTTGTTCTGAGGTGGACCTAATAGGA +TTTGTCGTTTCTGTTGTGAAAAAAACAGGTAATGCACAATATAGTTAATTTTTTTTATTGATTCTTTTAA +AAAACATTGTCTTTTAAAATCTCTTATGATTAGTTGGAGCTACCAGTTGGCAAATTTGCTAGCTAACTAG +TGATCTGAAAGTAAGCCTCTTTGAACCTCTGATTTTTCATGAAAAGCAATTCTCTCAATTCTATATTATT +TCAAGGGTAACAAGTTACATCCTAGTCTGTGTACTTAATTTTATAGAAATTGTCCTTAATTTTATTTTCT +GCAATTTATGTTTTCTTACTATTTCTGGTGTATGTGTTTATCCCATTGTGATGTTATATTGGTGTCCTCA +ATTTATTTCCTTAGCCATACACTCTACTTTTCATTGTACAGGGCTATTTATTATCTCAGAGTCAAGCTTT +TTTTTTTTTTTTTTTTTCCCCGAGATGGAGTCTCACTCTGTTGCCCAGGCTGGAGTGCAGTGGCGCTATC +TCAGCCCACTGCAAGTTCTGCCTCCCAGGTTCACACCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGA +CTACATATACCCGCCACCGAGCCTGGCCAATTTTTTGTATTTTTAGTAGAGTCGGGGTTTCACCGTGTTA +ACCAGGATAGTCTAAATCTCCTGACCTCGTGATCTACCAGCCTCGGCCTCCCAAAGTGCTGGGATTACAG 
+GCGTGAGCCACCGTGCCTGGCCAGAGTCAAGCTTTTATTTTATTGAATATATGGTCTTACTAAGTTCAAT +AGCATGAATCTGTTGTGAAGAATTCAAGAATTTTCTTCTATTTGTTGAGTTTTGTTTTCTTAGGAGTTTT +GCTCTTTCTCTTTTGCTGTGTTTTCTCCTTATTTTTTAAATGTGTTTGTGTTTGGTGAGTTATGTTTTAG +TGCTTCGTAGGTTTTTCTTTGACTATATTATATTAGTAAGCCACATTGTTCCCATGCCATTTTATTTCAT +CTTGGTCATATTTGGATGACTCTTTTCACACATTTTATTGTTATTATAGAAGGTGGATAACTTTTGTTCA +TTTAATTCATCAATATTTATTTAATGACTGTTATGTGCTAGACAGTGTTTTAAGTGCTGGGTACATAGCG +ATTAACAAAACAGATAAGAATCCCTACCCTCATAGAGCTTACATTATGAGGTTGGGGGAGGGAGATTACA +AACAAAGAAATAAGTAATATACATGTGTATAGTTTTTTTAGTGCTCAGAAAAAAAATTAAGTGGGTAAGG +GGGTAATGTCAGAGAAGAGAGAGGGATGTAATTTTAGATTGAGAGGTGAGGAGAGAGACCTCCCTGGAAA +GCTGACATTTGAGTGAAGCTTGAAGGAATTGAGGGAGTGAGGTGAGGCATGTGGCCATCTGGGGAAAGCT +TTCCAGGCAATTACAAAGGCCGCAGTACAGCAGGATCATGCCTAGTGTGCCGTGAAGCATTGGCAGAGAC +CAGAGAGTGAGAAGTAACATCCAGGGACAGAGGCAGTGAAGAGCCAGGTCGTGTGGGGGTCCTTGTGTGG +ACTGTAACTTCCTGTGATGACAGGAAGTCACAGGAAAATTCCAGGTAGAGGGACACTGTCTGACAGGTTT +TCACAGAATCATTCAGGCCACTGTGTTGAGAATAGGCTGTAGGGGGCACAAGAGTACAAACAAGCCATTT +GGAGGCTCTTTCAAGCACTTAGGCAAAAGATGATGAACCAAACAAAAGCAATGGAAGTGGTGAGAAGCAG +TCAGATTCTTGTTGTATTTTGAAGGTAGGGGGACGGTGCAGGATGGTCTGAACATTGGGAAAAATGGAAT +TGCCACTTAGAAGGAAAGACTGCAAGAAAAGCAAGTATGTGGGGAAGTTCAGGAGCTCAGTTTTAGACAG +TTAAGTTTTAGATGCTTATTAGGCATCTAAGTAGAAATGTCTACTTGATGGTTACATAGGAATCTGTTCA +GAGGAATGGCTGGATATGAATTTGGGAGTCTTTACTACAAATTTTTTTGTATTTTTAGTAGAGACGGGGT +TTCACCGTGTTAGCCAGAATGGTCTCGATCTCCTGACCTCGTGATCCACCCACCTTGGCCTCCCAAAGTG +CTGGGATTACAGGCATGAGCCACTGCACCCGGCCAGTCATACAGGGGACATTTAAAGCCGTGAGACTGGA +TGAGGTCACTGTGGGCATGGGAGTAGATAGAGACGGGAAGAGATCCAAGACCTGATTGAAGCCTTTTATA +CTTAGAAGCAGGGAAATATAAATGTAAATATAGGAATCAGTAAAAGAAACAGAGGAATGGCCAGAGAGGT +TGGAGGAATACTGGAGTGAGGTATGCTGAAAGCCAAGAGAAAAAAAGAATTGTCGAGTAGTGAGAGTGAT +TAAGTCTGCCAAATGCTATTTCATAGAATTGATAATGAAGTGAGGACCAAGAATTGATCATTGGCTTTAA +CACCGTGGAGGAGCACTTTCAGTGGACTGAAGTGGGGCAAAGGAAATGGAGGGAAAGGAGGAATGATAGT +GAATATAGGCATTTCAAGGATTTTTGCTTTAAGAGAAGAAGAGAAATGAATCAGTAGCCAGAAGGGGAAT 
+CAGGATCAAGAGAACATTTGCTTTTTCAGTTGAAAGTGCTAATAGCATACTGATGAGATACTGTATGCTG +ATGAGAAAGATCCAATAAAGAAGGTAAAATGCAAGATGGAAGCAAAACAGGAACAGCTGTGGGGCACTGT +TCTCAGATACTGTGTGGTATGGTATCTAGAGGCTCTGTTGAAATTGGCCTTAGCTAGCAGGAGAGACTGT +TCATCTGTAATCACAGGAAAAAAGTAAAGTACGTAGGTATAGATACCAATGGAAGAGTTGATATACAAGA +GGAAACTTGTGGCAGACCTCTTTTGATTGCTCTATTTCCTCGCTGAAACAGGGCACAAAATCATCAGCTG +AGAGTCAGAATGAAGAAAAGGGGGCCAGGCGCGGTGGCTAACGCCTGTAATCCAGCACTTTGGGAGGCCA +AGGTGGGTGGATCACGAGGTCAGGAGATGGAGACAATCCTGGATAACACGGTGAAACCCCGTCTCTACTA +AAAATACAAAAAATTAGCCGGGTGTGGTGGCGGGCGCCTGCAGTCTCAGCTACTCGGGAGGCAGAGGCAG +GAGAATGGCGTGAACCTGGGAGGTGGAGCTTGCAGTGAGCTGAGATCACACCACTGCACTCCAGACTGGG +AGATTCCGTCTCAAAAAAGAAAAAGAGAAAGAAAAGGGTGTTGAAGGTTTGAGAGAAGAGGAAAGGCATG +AAATCATTATCTAAGAAAGTGGTAGAGTAAATGGACTAAGTAAACACATCATGACTGCCAGGGCCCACTG +GAGGTTTAATGTTCATGAATATATTGTTGTTGTGTGATATTTTTTCAACCGTGTTCAGCTCTGATGGTGT +GGGCATGAAGTAGTTGGAAAGTAGAATTTAACCAGGTCTGTAGTTTAGCTGGGTAAGTAATGCAAAGCAA +GAAGGGCAAAGAATTTGGGGGTATATGCAAAAGGAGGATTTAAATAATTGACCTTGGACACAATGCAGAG +CAAAGAAGAGACATTAGAAGACGTGGATCAATGAACAGGAGATAAGAAAAGCTGATTGTAGGTCACGGTG +GGTTTGAGTTAGGGTTTTAGAGGGAGTGAACTGGGCAGATCAAAGGTAGGTGGTTGAAGAAGGAGGTACT +TCAAATTGAGATTCTGGGGGAAATGGAGTTATTGGAAATAAAAGTCTTGGGTATGTCCATTGCAGTGAGT +TACCAGTGGAAAATAGAGGACATGATCATTTAGGAAGAAAACAAGGAACTTGGGAGCAACAGAGTATTGG +AAGGATTGCCTGTGAGGATACTGAAATTTCCAGGAAACATGACCATCGTGATGACAGAATGACAGTGAGT +TATGAGTTAAAATCTTCAAGGAATGAAAGGCAATGAGTGAGCCAGGGTCAATGGATGCCTGTAGCAAGGA +ACAGTAATGAATGACAGTCTGATAACACGAGGTTCAAAACTGAGTGTTTTTAGAGTGGGAGGAGCAGCAA +TGAAGCATGAGGAAGACATCTGCCTCATCTCAGCCTCCAGTAGCACAAGGTCTGCAGGGGCAACTGTACA +GGCAGACAAGAACCAGGTTTGTTGCAACAAGATGGCAATGAGAGCACCTGCAGGAAAGGGTGACGGTAGT +GGAGATCTTACTGAGTTCCAGAGGCCCCATTGAAAGGATTCGAGGAGATGAAGAGGTAGGAGGAGATGGT +GCCCAGAAAGGCCACATCCAAAGCCTGGAAGCGGAATCCAGGGAATTTGGCATGACTGAGAGCCTGTGCT +GCCTTTTTTAAATGTTTTAATTTTTGTGTGTTTATAGCAGGTGTATATATTTATGGGGGAGCCTGTGCTT +GTTATGGGGACTGACACAGATCAGCTCTTGGCCCCAAGGCAAGGTGTGTGGGAGAAGAAAAAGTGAGGAG 
+GCCTAGATGTCAGAGGAGTCCGGCTAAACCACTGCAGAACTGCTGCCTAATTCACAGCAACCATGAGTAA +AAATGCTGATGATCATCAGGTCAAGGATAGTCTGGAGCAGTTAAGATGTTACTTTACATGGGAGGTATCA +ATTAAAGATGATGAAATGCCTGATTTGGAAAACAGAGTCTTGGACCAGATTGGGTTTCTAGACTAAATAC +AGTGTGGGAATACACAATACACAACCTACTAGCCTATGTGAAACACCCGAAAGGCCAGAATGAGGAAGTG +CTGGAGAACTTGAAAGAAGCTGAAGACTTAATCCAGAAAGAAGATGCCAATCAGATTTGAGAAGCCTGGT +AACCTGGGGCAACTTTGCCTGGGTGTATTACCACATGGGCAGACTGGCAGAAACCCAGACTTACCTGGAC +AAGGTAGAGAACATTTGCAAGAAGTTTTCAAGTCCTTTCTGTCACAGAATGGAATGTCCAGAGATGGACT +GTGAGGAAGAACGGGCCTTGCTGGAATGTGGAGGGAAGAATTATGAACAGGCCAAGGCCTGCTTTGAAAA +GGATCTGGCAGTGGCTGCTGAAAACCCTGAACTCAACACTGGGTATGAAATCACCGCCTGTCGCCTGGAT +GGCTTTAAATTAGCAACGGGGGATCACAAGTCATTTTCTTTGCCTACCCTAAGGCAGGCTGTCAGGCTAA +ATGTAGATGATAGATATAGTAAGGTTCTTCTTGCCCTGAAGCTTTGGGATGAAGGACAGGAAGCTGAAGG +AGAAAAGTACTTTGAAGGAGCTCTGGCCAATACGTCCATGCAGACCTTTGTCTTTGGATATGTCTTTGGA +TATGTCTTCTCTTCTTACTGAAGAGAAGACTTTGTGGATGAAGCTCTTGAGCTCTTAAAAACCTCCTTGC +AGGCAACTCCCACTTCTGCCTTCCTGCATCACCATATAGGGCTTTGTGACAGGACACAAAAGATCCAAAT +GAAGGAAGCTACCAACAAGCAGCCTAGAGGGCAGAACAGAGAAAAGGTAGACAAAATGATAAGATCACCT +GTATTTCATCTTCAATCTGCTGTGGAACAAAAGCCCACATTTGAGGTTGCCTATATAGAACTGGCAGGAA +AGTATATAGAAGCAGGCAATCACAGAAAAGCTGAAGACACTTTTCAAAAAGTGTTGTGCATGAAACCAGT +GGTAGAAGAAATAATGCAAGACATATATTTGCACTATGGTAGATTTCAGGAATTTCAAAAGAAATCTGAT +GTCAGTGCAATTATCCGTTATTTAAAAGCAATAAAAATAGAAAAAGGCATAATTTTCAAGGGATAAAAGT +GTCAATTCTTTGGAGAAACTGGTTTTAAGGAAACTTCAGAGAAATGCGTTAGACCTGGAAAGCTTAAGCC +TCCTTAGGTTTCTGTACAAATTGAAAGGAAACATGAATGAAGCTCTAGAGTACTGTGAGTGGGCCCTGAG +ACTGGCTGCTGAGTTTTAGAACTCTGTGGGACAGGGTCCTTAGGCACTCAAATATTGACCACTTTCATAT +TTCATTTGATTTTCTGTTAACGTGCTAATCAAGGCTGAGGTGAGTGGATCACCTGAGGTCAGGAGTTCAA +AACCAGTCTGGCCAAAGTTGCGAAACCGCTTCTCTACTAAAAATACAAAAATTAGCCAGGGGTGGTGGCA +CACACCTGTAATCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATTGCTTGAACCTGGGAAGCGGAGGTTG +CAGTGAAATGAGATCGCACCACTGCACTCCAGCCTGGGCCACAGAACGAGACTGCATCTCAAAATAAAGA +AAAAATGATAATCATCTTTTCTGCTTGCTGTTTTCAGAAACATATTATGTAATTCACTGTAATGATGTAA 
+TTATTGAACAGTTACTCAAATCTGATAAAATATTAGTTGCATTCAATAATTAATGAATCAATGTGTGTGT +GCATGTAAGAAAAACCATTTGTATGAAGCAGAAAAAAAATGTTAGGTAGATTTGTAGGAAACAAAATACT +GGACTTACACTAAAATAAAGTGAGAAGGTCGGTGCCATGACCCAGTGTAATTCTTGACTCTCTATTTATC +CAAGATTAATTTCTCCCCCCCTTAGTTACAGTGTGAGGGCTGGATATTTGAGTTTCTATTCATGATACAG +TATGTATCCCTGGAAGATGTCTCTGGTTCTAGATCATTTTTCCCAGTTCAGTGAGTAACGCCGTGTATTT +ATTTGCTATAACAAAGAACCATAGACTGGGGGCTTAAACAACAGAAATTTGTTTTCTCACAGTTCTGGAG +GCTGGAAATCCAAGATCAAAGTGTTGGCAGGGTTAATTCCTTCTGAGGCTGTGAGAATCTGTTCATTGCC +TCTCTCTTTGCTTCTGGTGATTTGCTGGCATTTTTTGGCATTCTTTGGCTTGTAGATGTCTCACCTCCAC +CTTTGCTTCATTGTAAGGTGGTGGTCTCCCAATGTACATGTCTGTGTCTAAATTTCCCCATATATTAGGA +CACCAGTCATTATATTAGAGCCTGCCCATATGTGTGGGTGGAGGATTACCCAGGTGCCGAGGCAAGAGAC +TGAAGGCACAAACTATTTCAGTATAATAAAGAAAATAGTTAGATTAAGGATAGTCATAATACAAATTAGA +TATAGAGATGATCATGGACAATTAGCAATCATTATAAACCTTAATCATTAGCTTTTAGTATTATTCTTTG +CTGCATTACTAATATAACCTAGGAATAACTGGCGGGTATAGGGTGAGGTGCTGAAGGGACATTGTGAGAA +GTGACCTAGAAGGCAAGAGGTGAGCCTTCTGTCACGCCCGCATAAGGGCCACTTGAGGGCTCCTTGGTCA +AGCGGTAACGGCAGTGTCTGGGAAGACACCCGTTACTTAGCAGACCGCAAAAGGGAGTCTCATTTCCTTG +GAGGAGTCAGGGAACACTCTGCTCCACCAGCTTCTTGTGGAAGCCTGGATATTACGCAGGCCTGCCCGCA +GTCATCCGGAGGCCTAAATCCCCTCCCTGTGGTGCTGTGCTTCAGTGGTCACACTCCTTGTCCACTTTTA +TGCTTCTCCCGTACTCTTGGTTCCTCTTTGAAGTTCGTAGTAGATAGCGGTAGAAGGAATAGTGAAAGTC +TTGAAGTCTTTGATCTTTCTTATAAGTGCAGAGAAGAAAACGCTGACGTATGCTGCCTTCCCTCTCTGTT +TCGGCTACCTAAAAGGAAAGGGCCCCCTATCCTGTAATCACGTGAATTGCTTCACCTTTTCAATCACTTA +GAAGATTCACCCTCCTTACCATGCCCCCTTGTCTTGTATGCAATAAATATCAGCAAGCCCAGCCGTTCGG +GCCGCTACCGGTCTCCCGCGTCTTGATGGTAGTGGTCCCCTGGGTCCAGCTGTTTTCTCTTTATCTCTTT +GTCTTGTGTCTTTATTTCTTACAATCTCTTGTCTCCGCACACGAGAACACCCGCTAAGCCGCATAGGGCT +GGACCCTACACATATGACCTCATCTTTTTTTTTTTTTTTGAGTTAGGGTCTCACTCTGTTGCCCAGGCTG +GAGTGCAGTGGTGTGATCTCAGCTCACTGCAGCCTCTGCCTCCCCAGCTCAAGCAATTATTGTGCCTTAG +CCTCCCGAGTAGCTGGGACGACAGGCACGTGCCACCACACACGGCTAATTTTTTGTATTTTAATAGAGCT +GGGGTTTCACCATGTTGCCCAGGGTGGTCTTGAACTCCTGAGCTCAGGCATTCTGCTTGCCTCAGCATCA 
+CAAAGTGCTGGGATTATAGGCTTGAGCCACTATGCCCAGCGTGTCTTAGGATTTTTCAACTTTATAATGG +TCTGAAAACAAAACATGAGTACCTTACAACTATTCTTTTTCACTTCCAGTACAGTATGCAATAAATTACA +TGAGATATTCAACACTTTTTTTTTTAAAAACAGGCCTTGCGTTAGATGATTTTGCTCAACTGTAGGCTAA +TATAAGTGTTCTGAACACATTTAAGATAGGCTAGGCTAACCTCTATGCTCAGTAGGTTAAATGTATTAAA +TGCATTTTCTTTTTTTTTTTTTGAGATTGAAAGTCACATTTATTGCCACGTGAAGGAAAGCCAATTAAGT +CAAATTGATTAATGTTTGCAGAGTATATCTGTTTCGATCCTTTTGTTTTTAATCTACCTTTGTCGAATTT +AAAGTGAGTTTCCTATAAGGAACAAATAGTCGGGTTATAAATTTTTATCCACTCTTTCAATCTCTGTCCA +TTGATTGGATATTTACATCACTTACAATGAAGGTAATTATTGATATGCCAGCATTCAACTTTGTCATTTT +ATTATTTGTTTTCTATTTGTTTCTCCTGCTTTTCATTCCTCCATTTCTCTTTTCTTGCCTTCCTGAAGGT +TACTTAAATATATTTTAGGATTCCATCTTGATTTATTTTTAGTGTTTTCAGGCATTTTTGTATTATTTTT +GTAGTTGTTTCTCTAGGTGTTACAACATACATGTATGACTTACAGCAGTCTACTAGTATCAATATTCTAC +CACTCTAAGTGAAGTGTGAGAATTTTGCTTTCATTTTGATTTTGGAGGGAAAGGGATTGGATAGTGACCA +TATACAGACCTTTTTCTTCCCTACTTTTAAATACTATTTTCTTGGATATCATAGTATTATTATTTTTGTT +TTAGTCATCAAACATAATTTATAAACTGCATGAGGAAATGATAGCTTATCGTATATGTCCATTTCTCAAA +CTCTTTCCATTGTTTCTTCCTCCATCCTATTGTCCTCAGTCCTGATGCTCTGAGAGTCTTCCTTTAATCT +TCTCTTTTCTGTTTAAACAATTTCTTTTAGGCAATTTTTGAGTTAGAGAAAATTCTTTTAGTTTTCTCTC +ATCTCAGAATATTTTATTTCTCCTTTATTCCTTGAGAATTGTTTCAGGATATAGAATTTGTAGTTGATAA +TTCTTTCTTTCAGTACTTGAAAAATATTGTGCCACTTCCCTCTGGCCTCTGGTTTTAGATGAGAAATCTG +TCATCATTCCAATTGATGTGTCCCTATTTAAATGCATTTTCAACTAACAACGGGTTTATTGTGACATAAC +CCCATCATAAGTTGTCCTCGACTGTGTTTCCAGATAAGGCTGCATTCACAGGTACTGGGAATTAGTCACA +ATGTATCCTGTAACATCCTGGAACCTTTCCTTATGTCAGTCAGTCCTGGTCACCTGTTTGGTCCCTGGCA +GTTTTTTCTCATTTCTGTTCAGAAGAAGAAAAAGGTAAAGATTCTTCAGATTTAGGAATAACAGAATTCT +CAAAAATGTCATTTTACCAGTTTAGTTGGGGAAGGAGATTAGACTCTGAGCCTTCCTGTGCTTAGTTCTA +TCCATAATCTTCTGTTTCTTAATCACCAGATTTTAAATTTTATATCATGATATTCCACGTTTGTAGATTT +TGGGTGAATGTTTCCTTTTTAAAATTTAATTTGTTTTTAAGTGTTAAAGTTCTATTAGAAAGTTTCAAGG +AGCCACCTTAAACCAGAAGTCTTTAACACCCAGATCTAGTCATGATGCCTCCCAGCTTAAAATCTTCAGT +GGTCCCCTGTCCCTTAGAAAATGAAACCCAGACTCTTTTGTATACCCCCAAAACCATTCAACTTCCTGCC 
+TTAAACCACCACCATACACACGTCAGCTAAACTCTAATTTTAGGCTGGGCGCAGTAGCTAATGCTTGTAA +TCCCAGGACTTTGGGAGGCTGAGGCGGCTCGCAAGGCTAGGAGTTTGGGACAAGTCTGGGCAACAGTGAG +ATTCTGTCTCTATAAACATAAAAACTAAATTTAAAAAAAAAACTACTTTAAAACATAATAATAAAACAAA +CTGTACAAGTTCATATTCCTGATGCATGATAACATTTAATGCTTCCTACCTTTAAAAATCTTCTTTGTTT +TGCATAGATGCTCTTTTGTTCTCCCCCTCTTGATGAATTTGTCCTATTCATTATTCAGCTCAAGTATTAC +CTCCCTAGTCAGATGCTTACAGTCTGGGTTAGATCCTCCTTCCTCCAGCTTTTGAAGCATTTTGTTGGTT +TGGCTGTTTCAGCACTTGTCACATGGTCCATGATCCTACCTGTACACTTCTGAATAAGCCACCTACTAGC +CTCAGTAATACAATGATCTCTTCAAGGACAGGGACTCTGTTTGGATTTCTACTTCCACTTCTGACATAAT +GGGTTGAAGCAGGGGTCAGTTGTTGAAAACTCCAGGTATGCAGCATCTCATACGGTCCTCATAGTCACAC +TGTTCATTTGCTATTATCATCCCACTTTACAGATGGGGAAACTGGAGCTCAGAGAGGTTAAGTAGCCTGC +TCAGGGTCACAATGCTATAAATTGATTTGAACTTATACTTTCAGGCCCTAAAGCTCTTAGGTTCTTTCTT +ATACTTCATGCTGACAAAACAAAAGCAAACTCAACATTTGAGAGTTGGGCTTAAAACAGGGACATCTGTA +TTTTTAATCTAATGCTTTGTTACTGTATTACAGAAACACTGTGATATATAATGAGTTAATTAAACGAGAA +CCTTTCTTAGGTTGGGAAAGATTTGTTTTGGGGAAAGCCTGTTTCCCTGGGAAACAGACTTACAGATTTC +ATGTAGGAGATTAGAAAATGCCCTTAGGAACAGCACTTGTGAGGAAGCAAATGCAGCAGGATTGGGCAGA +AGAAGAAGAAAGGTTCAGTTAGTCCCACAGGGACTTCTGGAGCTGGGTGGCCCTTCAGAGTCTTGCTAGC +CTGAGGCAAGAGCCAGTCCTTGCATACTGGCTCCTCCAGGGAGGGGCATAGCCTTAAGCAAGACAATGCC +TTTCGACAGAGGGCAAGTCCAGGAGGGAACTCAGAGGTGAGTTGTCAATAGCTAGCTCTCCCCAGAAGCT +GGAGGATCACATGCCTTGGTCCTGAAGGGGATCTGCACCCGCACCATGGCATCCACTCCAGGTGGAATCG +GTGACAGTGGTTTTATAAATTGCTAAGTCCTTTATTGTTTCAGTGACATTTTCTAAAAGGAATTAAAAGC +TTTGAAGAATTCTATGCAAGTTCAGAGTAGGCCAGACCACAAAGCTGCAAAATAAGTTTCTTCTTTTCCT +TTTGTCTTGTGAAGATTTACCTAGTGTAGGATTCTAAAAGTGTTACACTAAAAATTCTTCCAGTTGTTAT +CAGTTGTATGATAATTAAAGAAATAAGGCTTCCATTTCTTTTGATGGAATTTGATCCAGACATGGAAACC +TCCACTTACGTAAAAGTGCAATGAGGAAACTACACTGTTTACTTTTTATTTTTCTGTATTCTAAATGTTT +TTCAATAAACATGTATTTTATATTTGGGGAAAACAAGATATATTTAAAAAGGACATTTGGGAGAAGGACA +GTACTATCATCAAGTTTTGAATATAATTAGAGGCAAGTTCCAAAATTAGAAAATGGAACATGCTTGAGAA +ACTTATAATTTCTTTAAAAGCAGTGTTTAAAATTAAGATCCATTAAATAAATTTGAAATGAAAAGAAAGA 
+AAAGATGAACCATCATTTGCAATAGAATAAACAAATCCTTATCTGAGAATGACATTTCCATTCCCTGTTA +TTTACAACGGATACATAATTCTCAGAGTAACAAAAACAATAAAGTATAAAGAGTAGAATTAAATGCTTAG +TGCCTCTTCCAACTCCAGCTTTCCTAGCCTCAGTTCTATCCTGGTTTATCCCAGGATAGAATGGATAGCA +CCTACCCATCTTTCACTTTAGAATTTTATAACTCTTGGCTAGGCACAGTGGCTCATGCCTGTAATCCCAG +CACTTTGGGAGGCCAAAGCAGGCAGATCACTTGAGGCCAGGAGTTCAAGACCAGCCTGGCCAACATGGCT +TGAACCCGGGAGGCGGGGGCTGCGGTGACCCAAGATTGCACCACTGCACCCTAGCCTAGGCCACAGAGTG +AGACTGCGTCTCAAAAAAAAAAAAAAAAAATTATAACTCTTTATTTTGAACTAATTTCAGACCTTTAAAC +AAGTTGCAAAAATCGTGGAATTTCCAAATATTCCTCACCTAGCTTCCCCTAATGTTAACATCTTGCATAA +CCGTAGTACAATTAGAATCAGAAAGTTAATAGTGGTATAATTATTACCCATACTGTAGATGTATTTGAAA +AATTGTTTGAGTTTAAGGTATTTTACCCTGTTTCCCCTTTTTTGTTCTGGGATCCCAAATTGCATTTAGT +CATTTTTCCCCTGTATTTTCTACCAGTCTTTAATACTTCCTGTCTTCTTTTTCATGATCATTATGCTTTT +GAATAGACTGATAATGATCATTATGTTTTTGAATAGACTGATCAATTATTTTGTAGCATTCCCCTCAATT +TGAGTTCGTCTGATGTTTTCTCATGACTAGGATGAAGTTATGCATTTCTGGCAAGACTACCACTGAAGTG +ATGATGTGTCTTTCTCAGTACATCATATCAAGGGGTTAATGATACTGATCTTAATCACTTGATTAAGGTG +ATAATCTGCTGGGTTTCTCCTCTGTACAATAACTTCCTTTTTCTTTGTAGTTAATAAATATCTTGAGGGA +GATTCTTTGAGACTGAATCCTATTTCACATCAAACTAGCATTCATCAGTTGATTTTGTTTGCAACAATGA +TTACTGTGGTATTTGCCTAATTATGATTTTTCTCCCTTTCTTTCCTTCTACATTAATTGGAATTCTATAA +GGAAAAGCTGTGCCCTTTCCACCAATGTATTTATTTGGTTATTTATATCAGTATGGACTCAAGAACATTT +ATTTTATTCTGCAGGTTAAAATTTGGTACCTTCATTATTTTATTGTTTAACTTTTTTTTTTTAGCTTTGA +CCATTAGGAGCTTTCTCATATGGACTCGTGTGTTCTTTCAACCAGCTTTCTTCACTTTTTGAACACTTCC +TTATTTTTTGACATCACAAGATGTTCCAAGTTCATCTTATATGTTCCCTGCCCTAGTCTTGGAATCAGCC +ATTTCTCTTGGCTTCTTTTTTTTAGTAGAAAATGGTGTTTAGATCCAAGATCTGGTTGCTAGATAGGCTC +ATTACTTTACTGTGGAGGCATCAGTACTGCCAGGCCCTCTCAGCAGACAGAGGTGGGAAATGTTTGTCTT +CCATTTTGGATTCCCTCATGTCTAGTTGGATTTGTTTGTTGGTTGGTTTATTGGGTATGTGAAGTATCAC +TGTGGTTCTAGGGAGTCAGACCTGTACAAAAAGATATACTTCATGTCACTTCCTCCTTATCCTTGCAGTC +CCAATCCTAATCCTCCTTTTTCCCCCACTCACACCCTGTTCCTTTAGTTTCTGAGTTTATCCTTCCTGAT +TTCTTTTGCTCAAATGAACAGATACATGTGTATTTTCTTATATTCCCTTCTTCCTTATAAGAAGGGGAAC 
+ATACTCTAGCCTTTCTTTTATGCTTTTCTTTTTTAACTCAACACTGTCCTGGAGATCAGTTCATAGAAAT +CGTCCTCACTCTTTTTTTACAGCTACGTGGTACACCATTGTTTGGATGTACCACAGTTTATCCAACTCTA +TCCTGTATATGAGCTAATAAAAGTTGCTTCCAATATTTTATAATTATAATGTTTCAGTGAGTAACCTTGT +TCATAGGTGTTTTCATGAACTTTATGTTCATGTGTATTTTACTATTATTAGAGGTCTATCTTCAGAGAGG +AGTACAAGAAATGGGATTACTGGGTGCAAAGGTAAATGGATATGTGTCTTTGCTAGGTATTGCCAAATTT +ATCTCCAGAAATCTTGCACAAATCTGTACTCCTGTTAGCAATGTGTGCGTATACCTGCTTCCACATGACC +TCAGTAAAAGAATGTGTTGTCATATTGGTATTGAAATTTTAGCACTGTAAGCAACAGGTCATTTTGGAAA +ACCTGAGCTTTCGCCAAATTCAGCTATTTTGATTTGCTTTTATTATTAGCATATACCAAAATAAATAGGC +ATATTAGAGTTTCCTTTCTTGCATCTTAAAATTCATCTAACACATCTATAATAACATTCTTTTCTTTTTT +TTCCATTCTAGGACTTGCCCCTTTCGTCTATTTGTCAGACGAATGTTACAATTTACTGGCAATAAAGTTT +TGGATAGACCTTAATGAGGACATTATTAAGCCTCATATGTTAATTGCTGCAAGCAACCTCCAGTGGCGAC +CAGAATCCAAATCAGGCCTTCTTACTTTATTTGCTGGAGATTTTTCTGTGTTTTCTGCTAGTCCAAAAGA +GGGCCACTTTCAAGAGACATTCAACAAAATGAAAAATACTGTTGAGGTAAGGTTACTTTTCAGCATCACC +ACACATTTTGGTATTTTTCTATTTTGACAGTCCAGTATCAAGGAAATAGCTTTTATACAAATTGGATAGT +TGAGGTAGTATGTGAGGTAAAGTTTAATCATATATTAATTGCCCATGAACCTCAGGAGATGGGGGAATGG +GGAAATGACAGCAACTAGAAAGAGAAGAATGACTTGAAGGGAAATGAGTTAGGAGAAATTGTGAGAAGGA +TGTTCAGAAATGCAGACTTTGTAAGCAAACTGGAAATTGGTTACAAGAATAATATGAGTTATCTGTGGTT +TGCAGCAGTCAGCAGTGTGATTAGTTAATAATATAGAGACTACAGGTTTACATTTAAACTCCATATCTAG +TGTTTTATACAGATTATATTTCTTTGACTTGATTTAATCCCAGATAAGAGACACTGATATTATTTTCCCT +AGATCATGTATGCATTTTCTGCTTAAATCTATATATACATTATATAATATTAGCTGGTGTTTATTGAGTG +TTTACTATGTGTCAGACCTTGTTCTAAGCTTCTCATTTAATTCTCCCACAACCTTATGAGGTAGGGAACT +GTTTTTCTATTTTATCAGTGAGAAACAGGTTAAATGGCTTGCCTTAGGTCAAATGCCAAGTTAGTAAAAC +TAGGATTCATACTTAGGCCATCGAATGCAGAACCCAGACTAGGAACTGCTATGCAATGCTGCTTCCCAGT +AAAATTTGAGATTTCATGAGTTGGTAACTAGTGAAGAATACACAAAAAATAAGCCTCTTAATTCTGTAGT +TTAATATTTGAAATGTGTGTTATTCAGAATTTATATAAAAATATATTTTAAAAGCATTAGAGTAGCTGTA +TAAAGAAAGCTGTGTTGACATTTTACCTAGAGACTCTATGCATAATGAATAACACTCTGCTATATCTAGT +TTCTAAATTAGGGGTGGGAGTTGTATTCATTATTTAGTTCCCATACAGCATATCTACTGTTTACACCCCA 
+CATTTTCTTTTTTTCTTTCTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTAGAGACAGAGTCTTGCTTT +GTCACCCCCAGGCTGGAGTACAATAGCACAATCTCGGATCACTGCAGTGTCTGCCTCCTGGGCTCAAGCG +ATTCTCGTGCCTTAGCCTCCCAAGTAGCTGGGACTACAGGTGCGTGCCACCACGCCCAGCTAATTTTTGT +ATTTTTAGTAGAGACATGGTTTCACCATGTTGGGCAGTCTGGTCTCGAACTCTTGGCCTCAAGTGATCTG +CCCACCTTGGCCTCCCCAAGTACTGGGTTTACAGGCATGAGCCACCGAGCCCGCATTTTCTCTGAGACGT +CTTCAAAGGCAGTTTACTAATCCTGCTGAAGAGACAACTGTCATTTACACAGCATTTTAAAGTTTTACAA +AATACTGTCATGAATTAGGTTAAACCATATGAAATTGCTGATATTTGACCAGTTGTGATTTCTCAAGCAA +CAGTTTCATGTAGTTTAACCTATAAATCATTTCAATTAATTCTTGGAACAGACGTGAGGTAGGTGAGGCA +ATTCTTTCTTTTCTCTAACCAAAGAAGTACCTTTATAGATGTGAGATGATTCCCAGCTATTAAGTAGTAA +ATAGAGCTAGGACTTGAGCCCCAATCTTCCAGCTTCAATCCAGATCATATGACAGCTTGCTGATTAAACT +AGATGACAGAGAAGATCTCTTTCCTTCAGATACACATACTTTTTCTCTGTTCCCCTCTCCCTATCAGCTA +GATTCCCCTAAATCACTGATACTGGTTTTGTAATTTTGCATCGGCATGTTTGACAATTGGTATCACATTT +AGGGTTTTTCATTCTTTTTTGGTCCAAACTTTTCATTTCTGCTTTTAAAGGAAATACTTTTGGAAACATA +AATATGTGGGTTTGCAATTTATAAAGCAGCTTTTCCACTTATTTTCTTAGAATATTGACATACTTTGCAA +TGAAGCAGAAAACAAGCTTATGCATATACTGCATGCAAATGATCCCAAGTGGTCCACCCCAACTAAAGAC +TGTACTTCAGGGCCGTACACTGCTCAAATCATTCCTGGTACAGGAAACAAGCTTCTGGTAAGTTAATGTA +AACTCAAGGAATATTATAAGAAGTATATATGGAGGCCATCGTATATTCTGTTGTATACCTAGTAAACATG +GTAAAATGTAATTAAACTTAATTAGAAAATGTGGTTGTTATGTGGCTCCTGTAAGTATAGTTATTTAGAA +ATTTTATTTATTGAAGCAAGATATGAAACTCTGGGTGCACACTTTCCAAACAGGTGCTTTCATTTACATG +TGATTGAAAAGTGTTTTTTGTCATTTATTTCACTGTTCCATACAATTAGGGTTGTTTCTAAGCTGTTTGT +AAGCTGTTTCTAAGCTATTTAAGTGGTTAAATCACAGTAGATGCAAATCAAGCTAAAGTCTTTAACATTG +GCTAATGGCTGATTCTTAAATAGCTAATACTTGCTAAGGGTATCTATATTAACTCATTTAATCCTCATAA +CAACCCTATGAGATAAAACCTAAGTCCTCACTTAACATTGTCAATAGGTTTTTGGAAACTGATTTTAAGG +GAAGTGATGTATAACAAAACCATTTTTTTTTCTCATCACTGTTCTAACAAAATGATGTTGAAGATTTAAA +TGACATTGCTCAAAGACCTGCTATACATTGTTTGACTTAAAGTCACAGTTTCCGAGAACCTATCAATTAT +GTTAAGTGAGGACTTGACTCTATTATCCTGATTTTGTAGATGAGGAGACTGTGGCATAGAGAGAGGTTAA +GCAATTGCCTAATAAGGTCACAAAGCTAGAAAAGTAGGTATTAGAACCCAGATAGTGTGTGTTCTCAAGA 
+TGGCTTTAAAATATTTATCTTTGTTTAATCTGTTAATAATAAAAAACAAAAGATTAAAGCATAAGTGACG +TCCCCTACCTCCTTTTTTATCTTTTACTGTGATTATTCTTCATCTTCCTTCCTTTTCATGTCATTTTATA +TGTTCTTATGTAAAATTACTTTCATCTAGAATAGGAATAATGTGAACTGAAATCACCTAACCTATTAGGA +GTTAGGGGAGGGAGACTGTGTGTAATATTTGCGTGCTTAAATATTTTCAATGAAAAGTTACTTTGATTTA +GTTTTTTATGTTACTACATAATTATGATAGGCTACGTTTTCATTTTTTTATCAGATGTCTTCTCCTAATT +GTGAGATATATTATCAAAGTCCTTTATCACTTTGTATGGCCAAAAGGAAGTCTGTTTCCACACCTGTCTC +AGCCCAGATGACTTCAAAGTCTTGTAAAGGGGAGAAAGAGATTGATGACCAAAAGAACTGCAAAAAGAGA +AGAGCCTTGGATTTCTTGAGTAGACTGCCTTTACCTCCACCTGTTAGTCCCATTTGTACATTTGTTTCTC +CGGCTGCACAGAAGGCATTTCAGCCACCAAGGAGTTGTGGCACCAAATACGAAACACCCATAAAGAAAAA +AGAACTGAATTCTCCTCAGATGACTCCATTTAAAAAATTCAATGAAATTTCTCTTTTGGAAAGTAATTCA +ATAGCTGACGAAGAACTTGCATTGATAAATACCCAAGCTCTTTTGTCTGGTTCAACAGGAGAAAAACAAT +TTATATCTGTCAGTGAATCCACTAGGACTGCTCCCACCAGTTCAGAAGATTATCTCAGACTGAAACGACG +TTGTACTACATCTCTGATCAAAGAACAGGAGAGTTCCCAGGCCAGTACGGAAGAATGTGAGAAAAATAAG +CAGGACACAATTACAACTAAAAAATATATCTAAGCATTTGCAAAGGCGACAATAAATTATTGACGCTTAA +CCTTTCCAGTTTATAAGACTGGAATATAATTTCAAACCACACATTAGTACTTATGTTGCACAATGAGAAA +AGAAATTAGTTTCAAATTTACCTCAGCGTTTGTGTATCGGGCAAAAATCGTTTTGCCCGATTCCGTATTG +GTATACTTTTGCTTCAGTTGCATATCTTAAAACTAAATGTAATTTATTAACTAATCAAGAAAAACATCTT +TGGCTGAGCTCGGTGGCTCATGCCTGTAATCCCAACACTTTGAGAAGCTGAGGTGGGAGGAGTGCTTGAG +GCCAGGAGTTCAAGACCAGCCTGGGCAACATAGGGAGACCCCCATCTTTACAAAGAAAAAAAAAAGGGGA +AAAGAAAATCTTTTAAATCTTTGGATTTGATCACTACAAGTATTATTTTACAAGTGAAATAAACATACCA +TTTTCTTTTAGATTGTGTCATTAAATGGAATGAGGTCTCTTAGTACAGTTATTTTGATGCAGATAATTCC +TTTTAGTTTAGCTACTATTTTAGGGGATTTTTTTTAGAGGTAACTCACTATGAAATAGTTCTCCTTAATG +CAAATATGTTGGTTCTGCTATAGTTCCATCCTGTTCAAAAGTCAGGATGAATATGAAGAGTGGTGTTTCC +TTTTGAGCAATTCTTCATCCTTAAGTCAGCATGATTATAAGAAAAATAGAACCCTCAGTGTAACTCTAAT +TCCTTTTTACTATTCCAGTGTGATCTCTGAAATTAAATTACTTCAACTAAAAATTCAAATACTTTAAATC +AGAAGATTTCATAGTTAATTTATTTTTTTTTTCAACAAAATGGTCATCCAAACTCAAACTTGAGAAAATA +TCTTGCTTTCAAATTGGCACTGATTCTGCCTGCTTTATTTTTAGCGCTATCACAGGACCCAGAGCCTATG 
+CCCTTTTAAACTTACCACAAAAGCAGAAGATTAATTCAATTTAAGATGATACTCTCATTTGTTACGTCCT +TTTTTTTTTTTTTTGGAGATGGAGTCTTGCTTTGTCGCCCATGCTGGAGTGCAGTGGCATGATCCTGGCT +CACTGCAGCCTCCACTTCCCGGGTTCACGTAATTCTCCCACCTCAAGCCTCCCTAGTAGCTGGGATTACA +GGGACGCACCACCATGCCCAGCTAATTTTTGCATTTTTAGTAGAGACTGGGTTTTACCATGTTGGCCAAG +CTGGTCTCAAACTCCTGATGTCAGGTGATCCATCTGCCTCAGCCTCCCAAAGTGCTGGGATTATAGGCGT +GAGCCACTGTGCCCGGCCAATATTTGTTACTTTCTTAGGTTTAATAGAGAAAAGGGATAAAACATTTCTA +ACTGGGAGTTAATTGCATGGAGAAGGTCTTAAATCAGATGTTTTAATGCCTTAAATGTCTGTATAATATC +ATGTTTTCAAATCTAATTATAAATACGTTTAAAGCCAAGAATAAATCTTTTAAAAAATTGA + From a2ef22f08da9c07e4efa2f8a4e317fa508bf9c45 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:07:59 +0300 Subject: [PATCH 24/29] Add SOWAHA.fasta --- data/SOWAHA.fasta | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 data/SOWAHA.fasta diff --git a/data/SOWAHA.fasta b/data/SOWAHA.fasta new file mode 100644 index 0000000..983e31d --- /dev/null +++ b/data/SOWAHA.fasta @@ -0,0 +1,54 @@ +>NC_000077.7:c53371022-53367405 Mus musculus strain C57BL/6J chromosome 11, GRCm39 +CTTTCCCATTCCCTCCGCACCTCCAGTGGCAGCTGTCACCTGTAAGCCGAGCGCCGCACAGGGGCATGGA +GCCTGGCGCCCATTGGACAGGACCTCTCAGCAGCAACCTACCCCCCAGTCTGGGGGCCAGGCCTGGCGCA +GGCGGAGGATAGGAGTCCCAGGTGCCCTTCGCCCGGAGACTTCTCAGCACGGCGGCGAGGGCGTGACTGC +AGTCCAGTTTCGGAAATCAGAGTCCCAGGTGAGCCCGGCTGGGCGGCTGGGGCGACCCCGGAGTCGCAGA +CACCGAGAATGGCGCTGGCAGCCGCCGCCGCTGCCGCCGCTGCTGCCGCAGGGGTGAGCCAGGCGGCGGT +GTTGGGCTTCCTGCGGGAGCACGGCGGGCAGGTGCGCAACTCCGAGCTGCTGAGCCGCTTCAAGCCGCTG +CTGGACGCTGGGGACCCACGGGGCCGCGCGGCCCGTAGGGACCGCTTCAAGCAGTTCGTCAACAACGTGG +CCGTGGTGAAGGAGCTGGATGGGGTCAAGTTCGTGGTGCTGAGAAAGAAGCCCCGGCCGCCGGAAGGACC +CGAGGCCCCACTCCCCTCCAGCCCTGGAGTGCCAGCCGCGCTGGCCCAGTGCGCTGCTGTCCCAGCGGAG +GACAACTGTGCCCCCGGGGCTCCCCACTCCCCACAGCGATCAGGAGAACCGCCGGAGGACTCGTCCGCAC +CATCCGAGCTACAGCACACCCCTGAGACCCTACCCTCAGAGGTCACCCAGGTTGAGGCGCCATCAGGCTC 
+AGCGCCACAGCCCGGGGGGCCGGAGGATCCGGCGCTACCTCGGTCCTCGGAGCTAGCCCGACCCGCCAGC +GTGCCGTCGGGGCTGGCCCTGACGAGCACAGAGTCGCCAGGCCCGGAGCCTGCACCTCCCACGGCGCAAG +TGCCACCGCAGAAGCCTTGCATGCTGCCGGTGCGCTGCGTCGTTCCTGGCCCCGCGGCGCTGCGGATCCG +CGCGGAGGAGCAGGGCCTGCGCCGGCAGCGCTCGGAGGAGCCGAGCCCACGGGGCTCCCCGATGCTCCTG +CGGAGGCTTTCGGTAGAGGAGTCGGGTCTGGGACTCCACCTGGGACCCGGCCGTTCCCCGCATCTCAGGC +GCCTGTCCCGCGCCGGTCCGCGCCTGCTGAGTCCGGACACGGAGGAGATGCCCGTCGCGCCGCTACCGTC +ACCCGCGGTGCCCCTGGAGCCCACGGAGCACGAGTGGCTGGTGCGCACGGCCAGCGGTCGTTGGAGTCAC +CAGCTGCACGGGTTGCTCTTGCGGGACCGCGGCCTGGCTGCCAAGCGCGACTTCATGTCCGGCTTCACCG +CGCTACACTGGGCGGCCAAGAACGGCGACCGGGAGATGGCTTTACAGCTGGTGGAGGTGGCGCGGCGTGG +AGGCGCGCCCGTGGACGTGAACGCGCGCTCTCACGGTGGCTACACGCCGCTGCACCTGGCGGCTCTGCAC +GGCCACGAGGATGCTGCTGTGCTGCTAGTGGTGCGCTTGGGTGCCCAAGTGCACGTCCGCGACTACAGCG +GCCGGCGTGCCTACCAGTACCTACGGCCTGGCTCCTCCTACGCGCTGAGGCGTTTACTGGGTGACCCTGG +CCTGCGATCTATGATGGAGCCTGATGCGGCCAGTGGTGGCAGTGGGAGCCTTGTGTCTCGGCACCCCGTG +CAGGTAGCCGCCACCATCCTCAGCTCCACCACTAGTGCGTTTCTGGGCGTCCTGGCCGATGACTTGATGC +TCCAGGACCTGGCTCGTGGCTTGAAGAAGTCAAGTTCCTTCAGCAAGTTCTTGGGTGCCTCGCCCATGGC +TCCCCGAAAAAAGACCAAGATTCGCGGTGGCCTGCCGTCCTTCACCGAAATCTCTCATCGATCCACTCCA +GGACCTTTAGCTGGTTTAGTGCCCAGTCTGCCCCCTCCAACCTGAATGGCATTGGAGCTACCACCTACCA +GGGCCTGTGGCACGCTGTCTAAGCCTGTCCAAGACAGGCCCCAAACCGGCAGTCTGTCTCATTTCCGATT +TTATCTATTGCAGCCTGGTGTAACCAGATGGGCCTCCACCTTCTGTTTGAAATTTGACGTGCCTACAGAG +ATGGGATTCTTTTTCCCCCGTTAGAGAAACCAAGCAGTCATGCCCGATCTTGGGAGGAGACTTGAAGTTT +AGGGTCAAAGGTTAAGGGGCAGCAGCTGGTCTGGCCCCTGGAGGTGTTAACCAGAGGCCTCTGGGTGGAC +ACATTCCCAAATCAGAAGTAGAGGGTGGCTTTCCAGATGTTCAAAGATGGCGACTTAAGATCTGCAGTGG +CTTTGTAATTTGATGCTTTTATTCTGTTTTTAAATTAGAATGGGGTACAGCAACTTTGATAATACCTTTG +AAATTCTATTTTTTTCCATTTTTAGGCAAAGTAGTAAGACACTTTCAGTTTTTTGCTAAATATTTTGGAA +ATTTATCTTTTAAAGAGATGGTTGAGTCTAAAAGAAATTAGTCCTAAGCCAACAACACTAAAGCTTATCT +ACCTCATTGTAGGCAGTACAGATTCTGGAATCTTGAACATGGAGTAACTATTGTGAGCCTAATGAATGTA +ACTAATCCCAATTATACTGTGAATGTCGAATATTGAGAAACAAGGGTATGGTTGGGCTTGTGAGCCATGT 
+TTTTAATCCTAGCACTTGGGAGGCAGGGACAGGCAGATCTCTGTGAGTTTAAGACCAACCTGGTCTACAG +AGCAAGTTCTAGGCCAGCCAGGGCTACACAGAGAGACTCTGACTCAAAACAAAACAAACAACCTCCGTGC +ACACACACACACGGACAAAGACAGACAGACAGACAGAAACACACACACACACACAAATCCCACACCAGAA +GAAAAAAAAACCTCAAAGTGGACAGAAAATGAACAAACAAACAAAAGTGTATCTTGAGAGTGAAAGTGAA +GTGATTTAACGCAGAAAGAACGAGACGTTTTGGTAAATACTTTTAAACTACTCCGGAACATGTGTTCATC +ACTAACCGGTGGCTGCTTTGATTTGGCATGCCATTTTAATTTTTTTTTAATTATGATTTAATTGTGAATT +CTGAATCCATAGTTTTCAGTGCAATGGGTGACTGACTAGTCAAACAAAAAAAGAAGTTATTATACCAGAA +GGAAAAAAAACTGATAAATAGAAACTGAAGTGATTGCTACTTTTTTATATGAAAGTTCTGAACAAATAAG +TATGTGATATTCATTTTTCAAAAACCATTTGAAATAGCAAATGTAGAGCAGAGACCTCGATGAGTGGTTT +AACTTCTGCCCCTTCGTTCCTTCCTATGAATGGGGCAGGCTCATGGGCCCCGAGGTCTGGGTGGGCACAG +GAGAACTGTAAAAGTAATCCCCCCCCTCTGAGGTATGACATCTCTTTGCTCAGTCAACTTTTTGTATGAA +ATCATGATATATTACTGTTTTTGAAAATTATGTTTGCATGTCATCAGTCATACTGAATGGTTCTTGTGGC +ACATGGAGTATGGCTTCCTTTCCTATATTTGTCAGTGAGACCTTTGAAATCATTAAAGGCATCCTAATCG +TCAATTATGGAACAATGTAAGCAATGAAATAAATAGTTTCTGGTTTTA + From 334ffa465d04f6b85f6e656097acd2586af59317 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 13:17:53 +0300 Subject: [PATCH 25/29] Update requirements.txt --- requirements.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 11fa53d..f73348e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,10 @@ +beautifulsoup4==4.12.3 +biopython==1.81 +json5==0.9.14 +logging==0.4.9.6 numpy==1.26.2 - +pandas==2.1.3 +requests==2.31.0 +scanpy==1.10.1 +scikit-learn==1.4.0 +scipy==1.12.0 From 64c19876209152f2c24092327da886992d8cf23c Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 14:26:24 +0300 Subject: [PATCH 26/29] Update README.md --- README.md | 78 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 
3e949d4..7ae8dd9 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@ -# Bioinformatics toolkit for beginner -The utility is designed for processing protein sequences, as well as working with DNA sequences in the fastQ format (new functions for RNA and DNA processing will be added in future versions). +# Bioinformatics toolkit for beginner

Final homework of Python course at the BI

+ +The repository contains homework of the Python course during retraining program at the [Bioinformatics Institute(2023/2024)](https://bioinf.me/). + +The repository content is scripts, scripts designed to work with multiple biological data storage formats (such as FASTA and GBK), for processing biological sequences, parsing and for interacting with a telegram bot. ## Table of content -+ [Overview](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#overview) + [Installation](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#installation) + [Usage](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#usage) ++ [Content](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#content) + [Credits](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#credits) -## Overview -The toolkit contains a number of functions that allow you to filter data in the format of fastQ аnd also to analyze protein sequences according to a number of important physical and chemical properties. The programm is suitable for a wide range of users, including biologists with minimal knowledge of Python. - ## Installation -To install the program, download files main_script.py and contents of the folder "modules". +To install the program, download files in main directory and, additionally, contents of the folder "data". OR You can simple clone this repository using @@ -23,38 +23,52 @@ git clone git@github.com:LinaWhite15/Bioinformatics_toolkit_for_beginner.git **Python3 is required.** ## Usage -Before running the script, you must import additional modules +Before running the script, you must import required modules in your script. 
For example: +``` +from bioinfUtils import genscan, GenscanOutput + ``` -from modules.fastq_toolkit import * -from modules.protein_toolkit import * +or ``` -### Input -For running **protein_toolkit** you must enter the protein sequence in one-letter or three-letter format and select one of the operations. -### List of operation in **protein_toolkit** -* ```content_check``` - Analyzes the amino acid composition of the protein. The output gives the percentage content of each molecule in the peptide. +import custom_random_forest + +``` +## Content + +### `bioinfUtils.py` + +`genscan` - [GENSCAN](http://hollywood.mit.edu/GENSCAN.html) API + +`telegram_logger` - decorator function for launching a Telegram bot, logging function execution + +`DNASequence`, `RNASequence` and `AminoAcidSequence` - classes for processing of biological sequences. + +### `bio_files_processor.py` + + +`OpenFasta`, `FastaRecord` - context manager and data class for handling with .fasta files. Provides less resource-intensive storage and iteration. + +`convert_multiline_fasta_to_oneline` - conwerts a multi-line FASTA to wide format + +`select_genes_from_gbk_to_fasta` - function for processing of GenBank files. The function selects genes flanking the given ones and saves its translated sequence into a FASTA file. + +### `custom_random_forest.py` + +`RandomForestClassifier` - custom implementation of random forest algorithm with parallelization -* ```seq_length``` - Measures the length of the peptide and gives the number of amino acids. +### `test_bioinf_utils.py` -* ```protein_formula``` - Gives the atomic composition of the polymer. +Contains tests for functions from modules `bio_files_processor` and `bio_files_processor.py`. -* ```protein_mass``` - Сalculates the molecular mass of a protein in g/mol +### `Showcases.ipynb` -* ```charge``` - Determines the charge of a protein when pH = 7. 
-### fastq_toolkit -**fastq_toolkit** takes 4 arguments as input: ```seqs```, ```gc_bounds```, ```length_bounds```, ```quality_threshold```: -* ```seqs``` - a dictionary consisting of fastq sequences. Key: string, containing name of sequence. The value is a tuple of two strings: sequence and quality. -* ```gc_bounds``` - composition GC interval (in percent) for filtering. The default is (0 :100). If you pass one number as an argument, other will be considered as the upper limit. -* ```length_bounds``` - length interval for filtering, by default it is equal to (0, 2**32). -* ```quality_threshold``` - threshold value of average read quality for filtering, default is 0 (phred33 scale). +Hotebook with demonstration of `RandomForestClassifier`, `genscan`, `DNASequence`, and `AminoAcidSequence` functionality. -### Output -* **protein_toolkit** - a string with result of performed operation. -* **fastq_toolkit** - a dictionary containing sequences corresponding to user-specified conditions. +## Special gratitudes +Many thanks to the team of the Bioinformatics Institute, especially to the teachers and assistants of the Python program, for preparing and supporting such a wonderful course! It was a wonderful journey, or hopefully just the beginning of one. -## Credits -**Team:** -Belikova Angelina - kiit@gmail.com Implemented: ```protein_formula```, ```protein_mass```, ```seq_length```, **fastq_toolkit**. +# Contacts +If you have any comments or suggestions regarding this software, you can adress me using these contacts: -Aryuna Ayusheeva - aryuna.ayusheeva.1998@mail.ru Implemented: ```aa_content_check```, ```aa_chain_charge```. +Belikova Angelina - kiit8321@gmail.com -Bredov Denis - d2707bredov@gmail.com Implemented: ```Mann_Whitney_U```, ```decomposition```, ```seq_transform```, ```check_and_procees_seq```, ```print_result```, ```run_protein_analyzer_tool```. 
From 45474507715deb9690074952730b6dc10b80d673 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 14:29:43 +0300 Subject: [PATCH 27/29] Fix README.md --- README.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/README.md b/README.md index 7ae8dd9..75af2d1 100644 --- a/README.md +++ b/README.md @@ -8,30 +8,27 @@ The repository content is scripts, scripts designed to work with multiple biolog + [Installation](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#installation) + [Usage](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#usage) + [Content](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#content) ++ [Special gratitudes](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#Special gratitudes) + [Credits](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#credits) ## Installation To install the program, download files in main directory and, additionally, contents of the folder "data". OR - You can simple clone this repository using ``` git clone git@github.com:LinaWhite15/Bioinformatics_toolkit_for_beginner.git ``` (for Linux and WSL users) - **Python3 is required.** ## Usage Before running the script, you must import required modules in your script. For example: ``` from bioinfUtils import genscan, GenscanOutput - ``` or ``` import custom_random_forest - ``` ## Content @@ -44,8 +41,6 @@ import custom_random_forest `DNASequence`, `RNASequence` and `AminoAcidSequence` - classes for processing of biological sequences. ### `bio_files_processor.py` - - `OpenFasta`, `FastaRecord` - context manager and data class for handling with .fasta files. Provides less resource-intensive storage and iteration. 
`convert_multiline_fasta_to_oneline` - converts a multi-line FASTA to wide format @@ -53,15 +48,12 @@ import custom_random_forest `select_genes_from_gbk_to_fasta` - function for processing of GenBank files. The function selects genes flanking the given ones and saves its translated sequence into a FASTA file. ### `custom_random_forest.py` - `RandomForestClassifier` - custom implementation of random forest algorithm with parallelization ### `test_bioinf_utils.py` - Contains tests for functions from modules `bio_files_processor` and `bio_files_processor.py`. ### `Showcases.ipynb` - Notebook with demonstration of `RandomForestClassifier`, `genscan`, `DNASequence`, and `AminoAcidSequence` functionality. ## Special gratitudes From 7d5643ce04ee96b6b725bb522b9cc951660f585a Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 14:30:29 +0300 Subject: [PATCH 28/29] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 75af2d1..5a8c2a9 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The repository content is scripts, scripts designed to work with multiple biolog + [Installation](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#installation) + [Usage](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#usage) + [Content](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#content) -+ [Special gratitudes](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#Special gratitudes) ++ [Special_gratitudes](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#Special_gratitudes) + [Credits](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#credits) ## Installation From 
1fd02af1a78bae32dda86e07e1c2ed2cdfe5fab5 Mon Sep 17 00:00:00 2001 From: LinaWhite15 <129277151+LinaWhite15@users.noreply.github.com> Date: Wed, 1 May 2024 14:30:51 +0300 Subject: [PATCH 29/29] Fix link README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a8c2a9..69894ba 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The repository content is scripts, scripts designed to work with multiple biolog + [Installation](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#installation) + [Usage](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#usage) + [Content](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#content) -+ [Special_gratitudes](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#Special_gratitudes) ++ [Special gratitudes](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#Special_gratitudes) + [Credits](https://github.com/LinaWhite15/Bioinformatics_toolkit_for_beginner/edit/Development/README.md#credits) ## Installation