diff --git a/README.md b/README.md
index 58261f5..a6d192b 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,11 @@ network models.
 
 ## How to use DeepNovo?
 
-DeepNovo is implemented and tested with Python 2.7, TensorFlow 1.2 and Cython.
+DeepNovo was originally developed and implemented with Python 2.7, TensorFlow 1.2 and Cython.
+It has also been tested with Python 3.7, TensorFlow 2.0 and Cython.
+To install the requirements, run:
+
+    pip install -r requirements.txt
 
 **Step 0**: Build deepnovo_cython_setup to accelerate Python with C.
 
diff --git a/deepnovo_config.py b/deepnovo_config.py
index 609fff0..9ee533c 100644
--- a/deepnovo_config.py
+++ b/deepnovo_config.py
@@ -182,7 +182,7 @@
     'V': 99.06841, # 19
 }
 
-mass_ID = [mass_AA[vocab_reverse[x]] for x in xrange(vocab_size)]
+mass_ID = [mass_AA[vocab_reverse[x]] for x in range(vocab_size)]
 mass_ID_np = np.array(mass_ID, dtype=np.float32)
 
 mass_AA_min = mass_AA["G"] # 57.02146
@@ -413,6 +413,7 @@
 input_file_valid = "data.training/dia.xchen.nov27/fraction_1.mgf.split.valid.dup"
 input_file_test = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
 decode_test_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
+decode_output_file = decode_test_file + ".deepnovo_decode"
 # denovo files
 denovo_input_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
 denovo_output_file = denovo_input_file + ".deepnovo_denovo"
diff --git a/deepnovo_main_modules.py b/deepnovo_main_modules.py
index a448f87..241a418 100644
--- a/deepnovo_main_modules.py
+++ b/deepnovo_main_modules.py
@@ -37,16 +37,17 @@
 import sys
 import time
 import re
-import resource
+# import resource
 
 import numpy as np
 #~ cimport numpy as np
 #~ ctypedef np.float32_t C_float32
 #~ ctypedef np.int32_t C_int32
-from six.moves import xrange # pylint: disable=redefined-builtin
+# from six.moves import xrange # pylint: disable=redefined-builtin
 import tensorflow as tf
 
 import deepnovo_config
+from deepnovo_config import knapsack_file
 import deepnovo_model
 from deepnovo_cython_modules import process_spectrum, get_candidate_intensity
 
@@ -266,7 +267,7 @@ def read_spectra(file_handle, data_format, spectra_locations):
 
       candidate_intensity_list_forward = []
       prefix_mass = 0.0
-      for index in xrange(decoder_size):
+      for index in range(decoder_size):
        prefix_mass += deepnovo_config.mass_ID[peptide_ids_forward[index]]
 
         candidate_intensity = get_candidate_intensity(
@@ -281,7 +282,7 @@ def read_spectra(file_handle, data_format, spectra_locations):
 
       candidate_intensity_list_backward = []
       suffix_mass = 0.0
-      for index in xrange(decoder_size):
+      for index in range(decoder_size):
         suffix_mass += deepnovo_config.mass_ID[peptide_ids_backward[index]]
 
         candidate_intensity = get_candidate_intensity(
@@ -385,19 +386,19 @@ def get_batch_01(index_list, data_set, bucket_id):
   batch_decoder_inputs = []
   batch_weights = []
   decoder_size = deepnovo_config._buckets[bucket_id]
-  for length_idx in xrange(decoder_size):
+  for length_idx in range(decoder_size):
 
     # batch_intensity_inputs and batch_decoder_inputs are just re-indexed.
     batch_intensity_inputs.append(
         np.array([candidate_intensity_lists[batch_idx][length_idx]
-                  for batch_idx in xrange(batch_size)], dtype=np.float32))
+                  for batch_idx in range(batch_size)], dtype=np.float32))
     batch_decoder_inputs.append(
         np.array([decoder_inputs[batch_idx][length_idx]
-                  for batch_idx in xrange(batch_size)], dtype=np.int32))
+                  for batch_idx in range(batch_size)], dtype=np.int32))
 
     # Create target_weights to be 0 for targets that are padding.
     batch_weight = np.ones(batch_size, dtype=np.float32)
-    for batch_idx in xrange(batch_size):
+    for batch_idx in range(batch_size):
       # The corresponding target is decoder_input shifted by 1 forward.
       if length_idx < decoder_size - 1:
         target = decoder_inputs[batch_idx][length_idx + 1]
@@ -448,25 +449,25 @@ def get_batch_2(index_list, data_set, bucket_id):
   batch_decoder_inputs_backward = []
   batch_weights = []
   decoder_size = deepnovo_config._buckets[bucket_id]
-  for length_idx in xrange(decoder_size):
+  for length_idx in range(decoder_size):
 
     # batch_intensity_inputs and batch_decoder_inputs are re-indexed.
     batch_intensity_inputs_forward.append(
         np.array([candidate_intensity_lists_forward[batch_idx][length_idx]
-                  for batch_idx in xrange(batch_size)], dtype=np.float32))
+                  for batch_idx in range(batch_size)], dtype=np.float32))
     batch_intensity_inputs_backward.append(
         np.array([candidate_intensity_lists_backward[batch_idx][length_idx]
-                  for batch_idx in xrange(batch_size)], dtype=np.float32))
+                  for batch_idx in range(batch_size)], dtype=np.float32))
     batch_decoder_inputs_forward.append(
         np.array([decoder_inputs_forward[batch_idx][length_idx]
-                  for batch_idx in xrange(batch_size)], dtype=np.int32))
+                  for batch_idx in range(batch_size)], dtype=np.int32))
     batch_decoder_inputs_backward.append(
         np.array([decoder_inputs_backward[batch_idx][length_idx]
-                  for batch_idx in xrange(batch_size)], dtype=np.int32))
+                  for batch_idx in range(batch_size)], dtype=np.int32))
 
     # Create target_weights to be 0 for targets that are padding.
     batch_weight = np.ones(batch_size, dtype=np.float32)
-    for batch_idx in xrange(batch_size):
+    for batch_idx in range(batch_size):
       # The corresponding target is decoder_input shifted by 1 forward.
       if length_idx < decoder_size - 1:
         target = decoder_inputs_forward[batch_idx][length_idx + 1]
@@ -664,9 +665,9 @@ def test_AA_decode_batch(scans,
 
   #~ batch_size = len(decoder_inputs)
 
-  for index in xrange(len(scans)):
+  for index in range(len(scans)):
   #~ # for testing
-  #~ for index in xrange(15,20):
+  #~ for index in range(15,20):
 
     scan = scans[index]
     decoder_input = decoder_inputs[index]
@@ -757,7 +758,7 @@ def test_logit_batch_01(decoder_inputs, output_logits):
   num_exact_match = 0.0
   num_len_match = 0.0
   batch_size = len(decoder_inputs[0])
-  for batch in xrange(batch_size):
+  for batch in range(batch_size):
 
     decoder_input = [x[batch] for x in decoder_inputs]
     output_logit = [x[batch] for x in output_logits]
@@ -800,7 +801,7 @@ def test_logit_batch_2(decoder_inputs_forward,
   num_exact_match = 0.0
   num_len_match = 0.0
   batch_size = len(decoder_inputs_forward[0])
-  for batch in xrange(batch_size):
+  for batch in range(batch_size):
 
     decoder_input_forward = [x[batch] for x in decoder_inputs_forward]
     decoder_input_backward = [x[batch] for x in decoder_inputs_backward]
@@ -841,10 +842,10 @@ def test_random_accuracy(sess, model, data_set, bucket_id):
   data_set_len = len(data_set[bucket_id])
 
   num_step = deepnovo_config.random_test_batches
-  for _ in xrange(num_step):
+  for _ in range(num_step):
 
     start_time = time.time()
 
-    random_index_list = random.sample(xrange(data_set_len),
+    random_index_list = random.sample(range(data_set_len),
                                       deepnovo_config.batch_size)
 
     # get_batch_01/2
@@ -1044,8 +1045,8 @@ def knapsack_example():
   print("mass_aa = ", mass_aa)
 
   knapsack_matrix = np.zeros(shape=(4, 11), dtype=bool)
-  for aa_id in xrange(4):
-    for col in xrange(peptide_mass):
+  for aa_id in range(4):
+    for col in range(peptide_mass):
 
       current_mass = col + 1
 
@@ -1089,13 +1090,13 @@ def knapsack_build():
                                       peptide_mass_upperbound),
                               dtype=bool)
 
-  for aa_id in xrange(3, deepnovo_config.vocab_size): # excluding PAD, GO, EOS
+  for aa_id in range(3, deepnovo_config.vocab_size): # excluding PAD, GO, EOS
 
     mass_aa_round = int(round(deepnovo_config.mass_ID[aa_id]
                               * deepnovo_config.KNAPSACK_AA_RESOLUTION))
     print(deepnovo_config.vocab_reverse[aa_id], mass_aa_round)
 
-    for col in xrange(peptide_mass_upperbound):
+    for col in range(peptide_mass_upperbound):
 
       # col 0 ~ mass 1
      # col + 1 = mass
@@ -1118,7 +1119,7 @@ def knapsack_build():
       else:
         knapsack_matrix[aa_id, col] = False
 
-  np.save("knapsack.npy", knapsack_matrix)
+  np.save(knapsack_file, knapsack_matrix)
 
 
 def knapsack_search(knapsack_matrix, peptide_mass, mass_precision_tolerance):
@@ -1280,14 +1281,14 @@ def decode_true_feeding_01(sess, model, direction, data_set):
     #~ block_state0 = np.vstack(block_state0)
 
   # MAIN decoding LOOP in STACKS
-  output_log_probs = [[] for x in xrange(len(data_set[0][2]))]
+  output_log_probs = [[] for x in range(len(data_set[0][2]))]
 
   for stack_index, stack in enumerate(data_set_index_stack_list):
 
     stack_c_state = block_c_state0[stack_index]
     stack_h_state = block_h_state0[stack_index]
 
-    for index in xrange(len(data_set[0][2])):
+    for index in range(len(data_set[0][2])):
 
       block_candidate_intensity = np.array([data_set[x][1][index]
                                             for x in stack])
@@ -1401,7 +1402,7 @@ def decode_beam_select_01(output_top_paths, direction):
     LAST_LABEL = deepnovo_config.GO_ID
 
   outputs = []
-  for entry in xrange(len(output_top_paths)):
+  for entry in range(len(output_top_paths)):
 
     top_paths = output_top_paths[entry]
 
@@ -1498,7 +1499,7 @@ def decode_beam_search_01(sess,
   # peptide_mass # 3
 
   # our TARGET
-  output_top_paths = [[] for x in xrange(data_set_len)]
+  output_top_paths = [[] for x in range(data_set_len)]
 
   # how many spectra to process at 1 block-run
   decode_block_size = deepnovo_config.batch_size
@@ -1542,7 +1543,7 @@ def decode_beam_search_01(sess,
   active_search = []
 
   # fill in the first entries of active_search
-  for spectrum_id in xrange(decode_block_size):
+  for spectrum_id in range(decode_block_size):
 
     active_search.append([])
     active_search[-1].append(spectrum_id)
@@ -1685,7 +1686,7 @@ def decode_beam_search_01(sess,
 
       new_paths = []
 
-      for index in xrange(block_index,
+      for index in range(block_index,
                           block_index + entry_block_size[entry_index]):
 
         for aa_id in block_mass_filter_candidate[index]:
@@ -1713,7 +1714,7 @@ def decode_beam_search_01(sess,
         top_k_indices = np.argpartition(-new_path_scores, deepnovo_config.FLAGS.beam_size)[:deepnovo_config.FLAGS.beam_size] # pylint: disable=line-too-long
         #~ top_k_indices = np.argpartition(-new_path_scores/new_path_lengths,deepnovo_config.FLAGS.beam_size)[:deepnovo_config.FLAGS.beam_size] # pylint: disable=line-too-long
         entry[1] = [new_paths[top_k_indices[x]]
-                    for x in xrange(deepnovo_config.FLAGS.beam_size)]
+                    for x in range(deepnovo_config.FLAGS.beam_size)]
       else:
         entry[1] = new_paths[:]
@@ -1732,7 +1733,7 @@ def decode_beam_search_01(sess,
                                - active_search_len,
                                data_set_len)
 
-      for spectrum_id in xrange(spectrum_count, new_spectrum_count):
+      for spectrum_id in range(spectrum_count, new_spectrum_count):
        active_search.append([])
         active_search[-1].append(spectrum_id)
         active_search[-1].append([[[FIRST_LABEL], # current_paths
@@ -1803,7 +1804,7 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
   argmax_mass_complement_list = []
 
   # by choosing the location of max intensity from (0, peptide_mass_C_location)
-  for spectrum_id in xrange(data_set_len):
+  for spectrum_id in range(data_set_len):
 
     peptide_mass = peptide_mass_list[spectrum_id]
     peptide_mass_C = peptide_mass - mass_EOS
@@ -1829,7 +1830,7 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
     argmax_mass_complement_list.append(argmax_mass_complement)
 
   # Add the mass and its complement to candidate_mass_list
-  for position in xrange(num_position):
+  for position in range(num_position):
 
     prefix_mass_list = [x[position] for x in argmax_mass_list]
     suffix_mass_list = [x[position] for x in argmax_mass_complement_list]
@@ -1846,7 +1847,7 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
                                 1000]) # knapsack_precision
 
   # Start decoding for each candidate_mass
-  output_top_paths = [[] for x in xrange(data_set_len)]
+  output_top_paths = [[] for x in range(data_set_len)]
   for candidate_mass in candidate_mass_list:
 
     top_paths_forward = decode_beam_search_01(
@@ -1869,7 +1870,7 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
         candidate_mass[3], # knapsack_precision
         data_set_backward)
 
-    for spectrum_id in xrange(data_set_len):
+    for spectrum_id in range(data_set_len):
 
       if ((not top_paths_forward[spectrum_id])
           or (not top_paths_backward[spectrum_id])): # any list is empty
@@ -1889,8 +1890,8 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
   #~ return output_top_paths
 
   # Refinement using peptide_mass_list, especially for middle mass
-  output_top_paths_refined = [[] for x in xrange(data_set_len)]
-  for spectrum_id in xrange(data_set_len):
+  output_top_paths_refined = [[] for x in range(data_set_len)]
+  for spectrum_id in range(data_set_len):
     top_paths = output_top_paths[spectrum_id]
     for path in top_paths:
       seq = path[0]
@@ -2005,8 +2006,8 @@ def decode(input_file=deepnovo_config.decode_test_file):
 
   # DECODE with BEAM SEARCH
   if deepnovo_config.FLAGS.beam_search:
-    print("Load knapsack_matrix from default: knapsack.npy")
-    knapsack_matrix = np.load("knapsack.npy")
+    print("Load knapsack_matrix from default:", knapsack_file)
+    knapsack_matrix = np.load(knapsack_file)
 
     # READ & DECODE in stacks
     print("READ & DECODE in stacks")
@@ -2024,7 +2025,8 @@ def decode(input_file=deepnovo_config.decode_test_file):
     total_peptide_decode = 0.0
 
     # print to output file
-    decode_output_file = deepnovo_config.FLAGS.train_dir + "/decode_output.tab"
+    # decode_output_file = deepnovo_config.FLAGS.train_dir + "/decode_output.tab"
+    decode_output_file = deepnovo_config.decode_output_file
     with open(decode_output_file, 'w') as output_file_handle:
       print("scan\ttarget_seq\toutput_seq\toutput_score\taccuracy_AA\tlen_AA"
             "\texact_match\n",
@@ -2088,7 +2090,7 @@ def decode(input_file=deepnovo_config.decode_test_file):
     # decode_true_feeding each bucket separately, like in training/validation
     print("DECODE with TRUE FEEDING")
 
-    for bucket_id in xrange(len(deepnovo_config._buckets)):
+    for bucket_id in range(len(deepnovo_config._buckets)):
 
       if not data_set[bucket_id]: # empty bucket
         continue
@@ -2151,17 +2153,17 @@ def train_cycle(model,
   # to select a bucket, length of [scale[i], scale[i+1]] is proportional to
   # the size if i-th training bucket, as used later.
   train_bucket_sizes = [len(train_set[b])
-                        for b in xrange(len(deepnovo_config._buckets))]
+                        for b in range(len(deepnovo_config._buckets))]
   train_total_size = float(sum(train_bucket_sizes))
   print("train_bucket_sizes ", train_bucket_sizes)
   print("train_total_size ", train_total_size)
   train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
-                         for i in xrange(len(train_bucket_sizes))]
+                         for i in range(len(train_bucket_sizes))]
   print("train_buckets_scale ", train_buckets_scale)
 
   # to monitor the number of spectra in the current stack
   # that have been used for training
-  train_current_spectra = [0 for b in xrange(len(deepnovo_config._buckets))]
+  train_current_spectra = [0 for b in range(len(deepnovo_config._buckets))]
 
   # Get a batch and train
   while True:
@@ -2172,7 +2174,7 @@ def train_cycle(model,
     # Choose a bucket according to data distribution. We pick a random number
     # in [0, 1] and use the corresponding interval in train_buckets_scale.
     random_number_01 = np.random.random_sample()
-    bucket_id = min([i for i in xrange(len(train_buckets_scale))
+    bucket_id = min([i for i in range(len(train_buckets_scale))
                      if train_buckets_scale[i] > random_number_01])
 
     # not enough spectra left in this bucket of the current stack
@@ -2186,7 +2188,7 @@ def train_cycle(model,
       break
 
     # Get a RANDOM batch from the current stack and make a step.
-    random_index_list = random.sample(xrange(train_bucket_sizes[bucket_id]),
+    random_index_list = random.sample(range(train_bucket_sizes[bucket_id]),
                                       deepnovo_config.batch_size)
 
     # for testing
@@ -2481,8 +2483,8 @@ def train():
         #~ gc .collect()
 
-        print("RESOURCE-train_cycle: ",
-              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000)
+        # print("RESOURCE-train_cycle: ",
+        #       resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000)
 
         # stop training if >= 50 epochs
         epoch = (model.global_step.eval()
@@ -2527,7 +2529,7 @@ def test_true_feeding():
     print("Create model for testing")
     model = create_model(sess, training_mode=False)
 
-    for bucket_id in xrange(len(deepnovo_config._buckets)):
+    for bucket_id in range(len(deepnovo_config._buckets)):
 
       #~ if valid_set[bucket_id]: # bucket not empty
         #~ print("valid_set - bucket {0}".format(bucket_id))
diff --git a/deepnovo_model.py b/deepnovo_model.py
index 8ed76e2..16761f3 100644
--- a/deepnovo_model.py
+++ b/deepnovo_model.py
@@ -33,15 +33,16 @@
 import sys
 
 import numpy as np
-from six.moves import xrange # pylint: disable=redefined-builtin
+# from six.moves import xrange # pylint: disable=redefined-builtin
 import tensorflow as tf
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import rnn_cell
-from tensorflow.python.ops import rnn_cell_impl
+# from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell
 
 import deepnovo_config
 import deepnovo_model_training
@@ -71,7 +72,7 @@ def __init__(self, session, training_mode): # TODO(nh2tran): session-unused
     # candidate intensity
     self.intensity_inputs_forward = []
     self.intensity_inputs_backward = []
-    for x in xrange(deepnovo_config._buckets[-1]): # TODO(nh2tran): _buckets
+    for x in range(deepnovo_config._buckets[-1]): # TODO(nh2tran): _buckets
       self.intensity_inputs_forward.append(tf.placeholder(
           dtype=tf.float32,
          shape=[None, deepnovo_config.vocab_size, deepnovo_config.num_ion, deepnovo_config.WINDOW_SIZE], # TODO(nh2tran): line-too-long, config
@@ -85,7 +86,7 @@ def __init__(self, session, training_mode): # TODO(nh2tran): session-unused
     self.decoder_inputs_forward = []
     self.decoder_inputs_backward = []
     self.target_weights = []
-    for x in xrange(deepnovo_config._buckets[-1] + 1): # TODO(nh2tran): _buckets
+    for x in range(deepnovo_config._buckets[-1] + 1): # TODO(nh2tran): _buckets
       self.decoder_inputs_forward.append(tf.placeholder(
           dtype=tf.int32,
           shape=[None],
@@ -101,9 +102,9 @@ def __init__(self, session, training_mode): # TODO(nh2tran): session-unused
 
     # Our targets are decoder inputs shifted by one.
     self.targets_forward = [self.decoder_inputs_forward[x + 1]
-                            for x in xrange(len(self.decoder_inputs_forward) - 1)] # TODO(nh2tran): line-too-long
+                            for x in range(len(self.decoder_inputs_forward) - 1)] # TODO(nh2tran): line-too-long
     self.targets_backward = [self.decoder_inputs_backward[x + 1]
-                             for x in xrange(len(self.decoder_inputs_backward) - 1)] # TODO(nh2tran): line-too-long
+                             for x in range(len(self.decoder_inputs_backward) - 1)] # TODO(nh2tran): line-too-long
 
     # OUTPUTS and LOSSES
     (self.outputs_forward,
@@ -125,7 +126,7 @@ def __init__(self, session, training_mode): # TODO(nh2tran): session-unused
       self.gradient_norms = []
       self.updates = []
       opt = tf.train.AdamOptimizer()
-      for b in xrange(len(deepnovo_config._buckets)): # TODO(nh2tran): _buckets
+      for b in range(len(deepnovo_config._buckets)): # TODO(nh2tran): _buckets
         gradients = tf.gradients(self.losses[b], params)
         clipped_gradients, norm = tf.clip_by_global_norm(
             gradients,
@@ -138,7 +139,7 @@ def __init__(self, session, training_mode): # TODO(nh2tran): session-unused
     # for TensorBoard
     #~ self.train_writer = tf.train.SummaryWriter(deepnovo_config.FLAGS.train_dir, session.graph)
     #~ self.loss_summaries = [tf.scalar_summary("losses_" + str(b), self.losses[b])
-    #~ for b in xrange(len(deepnovo_config._buckets))]
+    #~ for b in range(len(deepnovo_config._buckets))]
     #~ dense1_W_penalty = tf.get_default_graph().get_tensor_by_name(
     #~ "model_with_buckets/embedding_rnn_seq2seq/embedding_rnn_decoder/rnn_decoder_forward/dense1_W_penalty:0")
     #~ self.dense1_W_penalty_summary = tf.scalar_summary("dense1_W_penalty_summary", dense1_W_penalty)
@@ -167,7 +168,7 @@ def step(self,
 
     # Input feed forward
     if deepnovo_config.FLAGS.direction == 0 or deepnovo_config.FLAGS.direction == 2:
-      for x in xrange(decoder_size):
+      for x in range(decoder_size):
         input_feed[self.intensity_inputs_forward[x].name] = intensity_inputs_forward[x] # TODO(nh2tran): line-too-long
         input_feed[self.decoder_inputs_forward[x].name] = decoder_inputs_forward[x] # TODO(nh2tran): line-too-long
       # Since our targets are decoder inputs shifted by one, we need one more.
@@ -177,7 +178,7 @@ def step(self,
 
     # Input feed backward
     if deepnovo_config.FLAGS.direction == 1 or deepnovo_config.FLAGS.direction == 2:
-      for x in xrange(decoder_size):
+      for x in range(decoder_size):
         input_feed[self.intensity_inputs_backward[x].name] = intensity_inputs_backward[x] # TODO(nh2tran): line-too-long
         input_feed[self.decoder_inputs_backward[x].name] = decoder_inputs_backward[x] # TODO(nh2tran): line-too-long
       # Since our targets are decoder inputs shifted by one, we need one more.
@@ -186,7 +187,7 @@ def step(self,
                                 dtype=np.int32)
 
     # Input feed target weights
-    for x in xrange(decoder_size):
+    for x in range(decoder_size):
       input_feed[self.target_weights[x].name] = target_weights[x]
 
     # keeping probability for dropout layers
@@ -207,12 +208,12 @@ def step(self,
 
     # Output forward logits
     if deepnovo_config.FLAGS.direction == 0 or deepnovo_config.FLAGS.direction == 2:
-      for x in xrange(decoder_size):
+      for x in range(decoder_size):
         output_feed.append(self.outputs_forward[bucket_id][x])
 
     # Output backward logits
     if deepnovo_config.FLAGS.direction == 1 or deepnovo_config.FLAGS.direction == 2:
-      for x in xrange(decoder_size):
+      for x in range(decoder_size):
         output_feed.append(self.outputs_backward[bucket_id][x])
 
     # RUN
@@ -334,7 +335,7 @@ def build_network(self, input_dict, dropout_keep):
     # linear transform to logit [128, 26]
     # TODO(nh2tran): replace _linear and remove scope
     with tf.variable_scope("output_logit"):
-      feature_logit = rnn_cell_impl._linear(args=feature,
+      feature_logit = core_rnn_cell._linear(args=feature,
                                             output_size=self.vocab_size,
                                             bias=True,
                                             bias_initializer=None,#0.1,
@@ -437,7 +438,7 @@ def _build_cnn_ion(self, input_intensity, direction):
     # linear transform to logit [128, 26], in case only cnn_ion model is used
     # TODO(nh2tran): replace _linear and remove scope
     with tf.variable_scope("intensity_output_projected"):
-      cnn_ion_logit = rnn_cell_impl._linear(args=cnn_ion_feature,
+      cnn_ion_logit = core_rnn_cell._linear(args=cnn_ion_feature,
                                             output_size=self.vocab_size,
                                             bias=True,
                                             bias_initializer=None,#0.1,
@@ -616,7 +617,7 @@ def _build_lstm(self, cnn_spectrum, input_lstm_state, embedding_AAid, direction):
     # linear transform to logit [128, 26], in case only lstm model is used
     # TODO(nh2tran): replace _linear and remove scope
     with tf.variable_scope("lstm_output_projected"):
-      lstm_logit = rnn_cell_impl._linear(
+      lstm_logit = core_rnn_cell._linear(
           args=lstm_feature,
           output_size=self.vocab_size,
           bias=True,
diff --git a/deepnovo_worker_denovo.py b/deepnovo_worker_denovo.py
index 47d5c62..40568e2 100644
--- a/deepnovo_worker_denovo.py
+++ b/deepnovo_worker_denovo.py
@@ -84,6 +84,7 @@ def search_denovo(self, model, worker_io):
     predicted_denovo_list = []
 
     # load/build knapsack matrix
+    print("knapsack file:", self.knapsack_file)
     if os.path.isfile(self.knapsack_file):
       print("WorkerDenovo: search_denovo() - load knapsack matrix")
       self.knapsack_matrix = np.load(self.knapsack_file)
@@ -145,11 +146,13 @@ def _build_knapsack(self):
                                         dtype=bool)
 
     # fill up the knapsack_matrix by rows and columns, using dynamic programming
-    for AAid in xrange(3, self.vocab_size): # excluding PAD, GO, EOS
+    for AAid in range(3, self.vocab_size): # excluding PAD, GO, EOS
 
       mass_AA = int(round(self.mass_ID[AAid] * self.KNAPSACK_AA_RESOLUTION))
 
-      for col in xrange(max_mass_upperbound):
+      print(AAid, mass_AA)
+
+      for col in range(max_mass_upperbound):
 
         # col 0 ~ mass 1
         # col + 1 = mass
@@ -218,7 +221,7 @@ def _extend_peak(self,
     #   path["score_list"]
     #   path["score_sum"]
     spectrum_batch_size = len(spectrum_batch)
-    top_path_batch = [[] for x in xrange(spectrum_batch_size)]
+    top_path_batch = [[] for x in range(spectrum_batch_size)]
 
     # forward/backward direction setting
     #   the direction determines the model, the spectrum and the peak mass
@@ -270,7 +273,7 @@ def _extend_peak(self,
     #   path["c_state"]
     #   path["h_state"]
     active_search_list = []
-    for spectrum_id in xrange(spectrum_batch_size):
+    for spectrum_id in range(spectrum_batch_size):
       search_entry = {}
       search_entry["spectrum_id"] = spectrum_id
       path = {}
@@ -416,7 +419,7 @@ def _extend_peak(self,
 
       # find all possible new paths within knapsack filter
       new_path_list = []
-      for index in xrange(block_index, block_index + search_entry_size[entry_index]):
+      for index in range(block_index, block_index + search_entry_size[entry_index]):
         for AAid in block_knapsack_candidate[index]:
           new_path = {}
           new_path["AAid_list"] = block_AAid_list[index] + [AAid]
@@ -440,7 +443,7 @@ def _extend_peak(self,
         new_path_score = np.array([x["score_sum"] for x in new_path_list])
         top_k_index = np.argpartition(-new_path_score, self.beam_size)[:self.beam_size] # pylint: disable=line-too-long
         search_entry["current_path_list"] = [new_path_list[top_k_index[x]]
-                                             for x in xrange(self.beam_size)]
+                                             for x in range(self.beam_size)]
       else:
         search_entry["current_path_list"] = new_path_list
@@ -494,7 +497,7 @@ def _search_denovo_batch(self, spectrum_batch, model, session):
     #   candidate["sequence"]
     #   candidate["position_score"]
     #   candidate["score"]
-    top_candidate_batch = [[] for x in xrange(spectrum_batch_size)]
+    top_candidate_batch = [[] for x in range(spectrum_batch_size)]
 
     for peak_batch in peak_list:
       forward_path_batch = self._extend_peak("forward",
@@ -509,7 +512,7 @@ def _search_denovo_batch(self, spectrum_batch, model, session):
                                               peak_batch)
 
       # concatenate forward and backward paths
-      for spectrum_id in xrange(spectrum_batch_size):
+      for spectrum_id in range(spectrum_batch_size):
         if ((not forward_path_batch[spectrum_id])
             or (not backward_path_batch[spectrum_id])): # any list is empty
           continue
@@ -637,7 +640,7 @@ def _select_peak(self, spectrum_batch):
     mass_tolerance = 1./self.SPECTRUM_RESOLUTION
 
     # add middle peaks and their complements to peak_list
-    for index in xrange(self.num_position):
+    for index in range(self.num_position):
 
       # treat the peak as a b-ion, so it corresponds to a prefix, and its
      # complement y-ion corresponds to a suffix
@@ -684,8 +687,8 @@ def _select_sequence(self, spectrum_batch, top_candidate_batch):
 
     # refine/filter predicted sequences by precursor mass,
     # especially for middle peak extension
-    refine_batch = [[] for x in xrange(spectrum_batch_size)]
-    for spectrum_id in xrange(spectrum_batch_size):
+    refine_batch = [[] for x in range(spectrum_batch_size)]
+    for spectrum_id in range(spectrum_batch_size):
       precursor_mass = spectrum_batch[spectrum_id]["precursor_mass"]
       candidate_list = top_candidate_batch[spectrum_id]
       for candidate in candidate_list:
@@ -696,8 +699,8 @@ def _select_sequence(self, spectrum_batch, top_candidate_batch):
           refine_batch[spectrum_id].append(candidate)
 
     # select the best len-normalized scoring candidate
-    predicted_batch = [[] for x in xrange(spectrum_batch_size)]
-    for spectrum_id in xrange(spectrum_batch_size):
+    predicted_batch = [[] for x in range(spectrum_batch_size)]
+    for spectrum_id in range(spectrum_batch_size):
       predicted_batch[spectrum_id] = {}
       predicted_batch[spectrum_id]["scan"] = spectrum_batch[spectrum_id]["scan"]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5de339a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+cython
+tensorflow
+lxml
+numpy
+matplotlib
+pyteomics
+biopython
\ No newline at end of file