diff --git a/readme.md b/readme.md index 95bf6938..d481a54b 100644 --- a/readme.md +++ b/readme.md @@ -28,7 +28,7 @@ You can watch a video tutorial on how to use UniDec here: [https://www.youtube.c UniDec is distributed under a completely open source license. Our hope is that this allows UniDec to be more widely used. If you are interested in including UniDec in another academic or commercial software distribution, -you are welcome to email mtmarty@email.arizona.edu for more information. +you are welcome to email mtmarty@arizona.edu for more information. UniDec source code and compiled binaries are released under a modified BSD License as described below. Note, we ask that you cite us in any publications. Quantitative citation metrics will help grant applications to support future development. @@ -68,6 +68,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Waters DLLs are distributed under the Waters MassLynxSDK EULA license. The Thermo RawFileReader DLLs are distributed under the RawFileReader.doc license. By downloading and using these DLLs, you are accepting those licenses and these terms. Thermo and Waters are indemnified, defended, and held harmless from any claims, including attorneys’ fees, related to the distribution or use of this software. Redistribution of Waters and Thermo libraries is restricted as described in those licenses. Info on other licenses are provided below. +Intel libraries from the oneAPI are distributed under licenses found in the bin folder. + +RDKit libraries are gratefully used for LipiDec and are distributed under the CC-SA license. Please [cite them when using these features](https://www.rdkit.org/docs/Overview.html#citing-the-rdkit). + ## UniDec Compatible File Types UniDec is built to open .txt files using [numpy.loadtxt](http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.loadtxt.html). 
@@ -191,9 +195,17 @@ Of course, using the pre-compiled version means you don't need to know Python at ## Change Log +v.6.0.2 + +**Added a new Drug-to-Antibody Ratio (DAR) calculation mode for UPP.** See help file for the new keywords. + +Added color to the columns on UPP. + +Fixed bug with HTML float sorting as text. Fixed major bugs with file imports. + v.6.0.1 -*Added Noise Removal Tool* to UniDec and UniDecCD. This allows you to select specific noise regions (common in UHMR data) and set them to 0. Try Tools > Select Noise Peaks and the checking "Remove Noise Peaks" in the Advanced Data Processing Parameters. +**Added Noise Removal Tool** to UniDec and UniDecCD. This allows you to select specific noise regions (common in UHMR data) and set them to 0. Try Tools > Select Noise Peaks and the checking "Remove Noise Peaks" in the Advanced Data Processing Parameters. UPP Changes: * Added "Config m/z Peak FWHM", "Config m/z Peak Shape", and "Config Sample Mass Every" parameters. If "Config m/z Peak FWHM" is specified as a float or integer, it will use that and skip the automatic peak width determination. 
diff --git a/unidec/LipiDec/AddFeaturestoDF.py b/unidec/LipiDec/AddFeaturestoDF.py index a2d49575..263442d0 100644 --- a/unidec/LipiDec/AddFeaturestoDF.py +++ b/unidec/LipiDec/AddFeaturestoDF.py @@ -6,10 +6,10 @@ def get_overall_tails(name): if "|" in name: try: - tname = name.split("|")[0] # Remove before vertical line - tname = tname.split()[1] # Remove after space to get tails - tl = tname.split(":")[0] # Get length - tu = tname.split(":")[1] # Get unsaturation + tname = name.split("|")[0] # Remove before vertical line + tname = tname.split()[1] # Remove after space to get tails + tl = tname.split(":")[0] # Get length + tu = tname.split(":")[1] # Get unsaturation except: return name, -1, -1 try: @@ -23,15 +23,16 @@ def get_overall_tails(name): else: tname = name.split()[1] fas = tname.split("_") - tls=0 - tus=0 + tls = 0 + tus = 0 for f in fas: tl = f.split(":")[0] # Get length try: tu = f.split(":")[1] # Get unsaturation except: - print(f, name) - exit() + print("Unable to parse part: ", f, name) + continue + # exit() try: tl = int(tl) except: @@ -67,6 +68,52 @@ def set_tails(df, name="Metabolite name"): return df +def set_class_name(df, name="Metabolite name"): + df = df.copy(deep=True) + names = df[name].to_numpy() + classes = [] + for n in names: + if "|" in n: + n = n.split("|")[1] + n = n.split()[0] + classes.append(n) + df["Class Name"] = classes + return df + + +def calc_tail_diff(df, class_col="Class Name", tail_col="Tail Lengths", unsat_col="Tail Unsaturation"): + # Calculate average tail lengths and unsaturations for each class + unique_classes = np.unique(df[class_col]) + avg_class_lengths = [] + avg_class_unsat = [] + for c in unique_classes: + avg_class_lengths.append(np.mean(df[df[class_col] == c][tail_col])) + avg_class_unsat.append(np.mean(df[df[class_col] == c][unsat_col])) + + # Subtract each value from the average for that class + tail_diffs = [] + unsat_diffs = [] + for i, row in df.iterrows(): + c = row[class_col] + t = row[tail_col] + u = 
row[unsat_col] + index = np.where(unique_classes == c)[0][0] + tdiff = t - avg_class_lengths[index] + udiff = u - avg_class_unsat[index] + tail_diffs.append(tdiff) + unsat_diffs.append(udiff) + + df["Tail Length Diff"] = tail_diffs + df["Tail Unsat Diff"] = unsat_diffs + + print('Unique Classes: ', unique_classes) + print('Average Tail Lengths: ', avg_class_lengths) + print('Average Tail Unsaturation: ', avg_class_unsat) + + + return df + + adduct_translator = np.array([["[M+H]+", "[M+H]", "+1", 1, "[M+H]1+"], ["[M+NH4]+", "[M+NH4]", "+1", 1, "[M+NH4]1+"], ["[M+2H]2+", "[M+2H]", "+2", 2, "[M+2H]2+"], @@ -82,7 +129,7 @@ def parse_names(df, namecolumn="Metabolite name", parse_adduct=True): full_names_adduct = [] charges = [] for i, row in df.iterrows(): - #Parse Name + # Parse Name mol_name = row[namecolumn] if "|" in mol_name: names = mol_name.split("|") @@ -93,7 +140,7 @@ def parse_names(df, namecolumn="Metabolite name", parse_adduct=True): simp_names.append(simp_name) full_names.append(full_name) - #Parse Adduct + # Parse Adduct if parse_adduct: prec_adduct = row["Adduct type"] if prec_adduct in adduct_translator[:, 0]: @@ -152,7 +199,7 @@ def isotope_ratios(df): s = np.array(re.split(" |:", sraw)) try: ints = s[1::2].astype(float) - ratio = ints[1]/ints[0] + ratio = ints[1] / ints[0] except: ratio = -1 ratios.append(ratio) diff --git a/unidec/UPP.py b/unidec/UPP.py index b29948ef..3ff9ba5c 100644 --- a/unidec/UPP.py +++ b/unidec/UPP.py @@ -13,6 +13,7 @@ def __init__(self, num=1, *args, **kw): super().__init__(*args, **kw) pathtofile = os.path.dirname(os.path.abspath(__file__)) self.imagepath = os.path.join(pathtofile, "images") + self.htmlstr = "" # print(pathtofile) # print(self.imagepath) if num == 1: @@ -23,28 +24,39 @@ def __init__(self, num=1, *args, **kw): def help_frame(self): wx.Frame.__init__(self, None, wx.ID_ANY, title="Help", size=(600, 600)) + + # Add a menu button to open in the web browser + menu = wx.Menu() + menu.Append(wx.ID_ANY, "Open in 
Browser") + menu.Bind(wx.EVT_MENU, self.open_in_browser) + menu_bar = wx.MenuBar() + menu_bar.Append(menu, "Help") + self.SetMenuBar(menu_bar) + html = wx.html.HtmlWindow(self) - html_str = "
" \ - "" \ - "Welcome to the UniDec Processing Pipeline (UPP)! " \ - "This module is designed to help you process, deconvolve, " \ - "and extract specific information from your data. " \ - "Expanding on the batch processing features present in UniDec from the beginning, " \ - "it is designed to interface with Excel/CSV files so that you can connect it with your workflows. " \ - "
" \ - "" \ - "Although you can type everything directly into UPP, " \ - "we recommend you start by importing your information from " \ - "an Excel/CSV file. " \ - "After you have your file ready, you can open it by clicking the" \ - " \"File > Open File\" button and selecting your file. " \ - "You can also open a file by dragging and dropping it onto the window. " \ - "After opening the file, you should see it populate the main window like a spreadsheet.
" \ - "All you need is to specify the \"Sample name\" column with the file path. " \ - "However, there are other optional parameters that you can specifiy. Note, capitalization is" \ - " important, so make sure to specify caps carefully.
" + html_str = gen_style_str() + + html_str += "" \ + "" \ + "Welcome to the UniDec Processing Pipeline (UPP)! " \ + "This module is designed to help you process, deconvolve, " \ + "and extract specific information from your data. " \ + "Expanding on the batch processing features present in UniDec from the beginning, " \ + "it is designed to interface with Excel/CSV files so that you can connect it with your workflows. " \ + "
" \ + "" \ + "Although you can type everything directly into UPP, " \ + "we recommend you start by importing your information from " \ + "an Excel/CSV file. " \ + "After you have your file ready, you can open it by clicking the" \ + " \"File > Open File\" button and selecting your file. " \ + "You can also open a file by dragging and dropping it onto the window. " \ + "After opening the file, you should see it populate the main window like a spreadsheet.
" \ + "All you need is to specify the \"Sample name\" column with the file path. " \ + "However, there are other optional parameters that you can specifiy. Note, capitalization is" \ + " important, so make sure to specify caps carefully.
" html_str += array_to_html(basic_parameters, cols=["Parameter", "Required", "Description"], rows=None, colors=None, index=False, sortable=False) @@ -123,10 +135,40 @@ def help_frame(self): html_str += array_to_html(recipe_w, cols=["Parameter", "Required", "Description"], rows=None, colors=None, index=False, sortable=False) + html_str += "" \ + "This recipe will calculate the drug-to-antibody ratio for an ADC. " \ + "The column keyword of \"Protein Mass\" defines the mass of the antibody, either with a " \ + "mass value, an amino acid sequence, or a Seq code combination. " \ + "The column keyword of \"Drug Mass\" defines the mass of the drug. " \ + "The column keyword of \"Max Drug\" defines the maximum number of drug molecules to consider. " \ + "UPP will then loop through all possible combinations of drug and antibody " \ + "and see if they match any of the detected peaks. " \ + "If a match is found, it will color the peak green. " \ + "If no match is found, it will color the peak yellow. " \ + "After extracting the height for each peak, UPP will calculate the drug-to-antibody ratio. " \ + "Additional details on keywords are provided below. 
" + + html_str += array_to_html(recipe_d, cols=["Parameter", "Required", "Description"], rows=None, + colors=None, index=False, sortable=False) + html_str += "" + self.htmlstr = html_str + ''' + # For Writing to File + # Copy html_str to clipboard + if wx.TheClipboard.Open(): + wx.TheClipboard.SetData(wx.TextDataObject(html_str)) + wx.TheClipboard.Close()''' + html.SetPage(html_str) + def open_in_browser(self, event): + # Open in Browser + with open("help.html", "w") as f: + f.write(self.htmlstr) + webbrowser.open("help.html") + class MyFileDropTarget(wx.FileDropTarget): """""" @@ -154,7 +196,18 @@ def __init__(self, nrows=2, ncolumns=2, title="UniDec Processing Pipeline"): self.use_decon = True self.use_converted = True self.use_interactive = False - self.bpeng = BPEngine() + self.bpeng = BPEngine(parent=self) + + try: + if os.path.isfile(self.bpeng.eng.config.iconfile): + favicon = wx.Icon(self.bpeng.eng.config.iconfile, wx.BITMAP_TYPE_ANY) + wx.Frame.SetIcon(self, favicon) + self.icon_path = os.path.abspath(self.bpeng.eng.config.iconfile) + else: + self.icon_path = None + except Exception as e: + print(e) + self.icon_path = None menu = wx.Menu() # Open File Menu @@ -202,36 +255,7 @@ def __init__(self, nrows=2, ncolumns=2, title="UniDec Processing Pipeline"): hsizer.Add(self.runbtn2, 0) # Insert Spacer Text - hsizer.Add(wx.StaticText(panel, label=" "), 0) - - # Insert a button for Open All HTML Reports and bind to function - btn = wx.Button(panel, label="Open All HTML Reports") - btn.Bind(wx.EVT_BUTTON, self.on_open_all_html) - hsizer.Add(btn, 0) - - # Insert a button for Run in UniDec and bind to function - btn = wx.Button(panel, label="Open in UniDec") - btn.Bind(wx.EVT_BUTTON, self.on_open_unidec) - hsizer.Add(btn, 0) - - # Insert a static text of directory - # hsizer.Add(wx.StaticText(panel, label=" Data Directory:", style=wx.ALIGN_CENTER_VERTICAL)) - # Insert a text box to read out the directory - # self.dirtxtbox = wx.TextCtrl(panel, size=(400, -1)) - # 
hsizer.Add(self.dirtxtbox, 0, wx.EXPAND) - # Add a button to set the directory - # btn = wx.Button(panel, label="...") - - # Insert a static text of tolerance - # hsizer.Add(wx.StaticText(panel, label=" Tolerance:", style=wx.ALIGN_CENTER_VERTICAL)) - # Insert a text box to read out the directory - # self.tolbox = wx.TextCtrl(panel, size=(50, -1)) - # self.tolbox.SetValue("50") - # hsizer.Add(self.tolbox, 0, wx.EXPAND) - # hsizer.Add(wx.StaticText(panel, label="Da ", style=wx.ALIGN_CENTER_VERTICAL)) - - # Insert Spacer Text - hsizer.Add(wx.StaticText(panel, label=" "), 0) + hsizer.Add(wx.StaticText(panel, label=" "), 0) # Insert a checkbox to select whether to use already converted data self.useconvbox = wx.CheckBox(panel, label="Use Converted Data ") @@ -251,31 +275,68 @@ def __init__(self, nrows=2, ncolumns=2, title="UniDec Processing Pipeline"): # Insert Spacer Text hsizer.Add(wx.StaticText(panel, label=" "), 0) + # Insert a button for Open Global HTML Reports and bind to function + btn = wx.Button(panel, label="Open Combined Report") + btn.Bind(wx.EVT_BUTTON, self.on_open_global_html) + btn.SetBackgroundColour("#5B2C6F") + btn.SetOwnForegroundColour("#FFFFFF") + hsizer.Add(btn, 0) + + # Insert a button for Open All HTML Reports and bind to function + btn = wx.Button(panel, label="Open All Reports") + btn.Bind(wx.EVT_BUTTON, self.on_open_all_html) + btn.SetBackgroundColour("#F15628") + btn.SetOwnForegroundColour("#0B31A5") + hsizer.Add(btn, 0) + + # Insert a button for Open the Results File and bind to function + btn = wx.Button(panel, label="Open Results") + btn.Bind(wx.EVT_BUTTON, self.open_results_file) + btn.SetBackgroundColour("#196F3D") + btn.SetOwnForegroundColour("#FFFFFF") + hsizer.Add(btn, 0) + + # Insert a button for Run in UniDec and bind to function + btn = wx.Button(panel, label="Open in UniDec") + btn.Bind(wx.EVT_BUTTON, self.on_open_unidec) + btn.SetBackgroundColour("#1F618D") + btn.SetOwnForegroundColour("#FFFF00") + hsizer.Add(btn, 0) + + # 
Insert Spacer Text + # hsizer.Add(wx.StaticText(panel, label=" "), 0) + hsizer2 = wx.BoxSizer(wx.HORIZONTAL) # Insert a button to hide columns self.hidebtn = wx.Button(panel, label="Hide Columns") self.hidebtn.Bind(wx.EVT_BUTTON, self.on_hide_columns) - hsizer.Add(self.hidebtn, 0) + hsizer2.Add(self.hidebtn, 0) self.hide_col_flag = False + hsizer2.Add(wx.StaticText(panel, label=" "), 0) + # Insert a button to hide columns with height in the title self.hideheightbtn = wx.Button(panel, label="Hide Height Columns") self.hideheightbtn.Bind(wx.EVT_BUTTON, self.on_hide_height_columns) - hsizer.Add(self.hideheightbtn, 0) + self.hideheightbtn.SetBackgroundColour("#5D6D7E") + self.hideheightbtn.SetOwnForegroundColour("#FFFFFF") + hsizer2.Add(self.hideheightbtn, 0) self.hide_height_flag = False # Insert a button to hide columns with % in the title self.hidepercentbtn = wx.Button(panel, label="Hide % Columns") self.hidepercentbtn.Bind(wx.EVT_BUTTON, self.on_hide_percent_columns) - hsizer.Add(self.hidepercentbtn, 0) + self.hidepercentbtn.SetBackgroundColour("#D5D8DC") + # self.hidepercentbtn.SetOwnForegroundColour("#FFFFFF") + hsizer2.Add(self.hidepercentbtn, 0) self.hide_percentcol_flag = False # Insert Spacer Text - hsizer.Add(wx.StaticText(panel, label=" "), 0) + hsizer2.Add(wx.StaticText(panel, label=" "), 0) # Insert a button to hide columns that are empty self.hideemptybtn = wx.Button(panel, label="Hide Empty Columns") self.hideemptybtn.Bind(wx.EVT_BUTTON, self.on_hide_empty_columns) - hsizer.Add(self.hideemptybtn, 0) + hsizer2.Add(self.hideemptybtn, 0) sizer.Add(hsizer, 0, wx.ALL | wx.EXPAND) @@ -283,23 +344,36 @@ def __init__(self, nrows=2, ncolumns=2, title="UniDec Processing Pipeline"): self.ss.set_col_headers(["Sample name", "Data Directory"]) sizer.Add(self.ss, 1, wx.EXPAND) + sizer.Add(hsizer2, 0, wx.ALL | wx.EXPAND) + file_drop_target = MyFileDropTarget(self) self.ss.SetDropTarget(file_drop_target) + # self.ss.Bind(wx.grid.EVT_GRID_CELL_LEFT_CLICK, 
self.on_cell_clicked) panel.SetSizer(sizer) + + # Create Status bar + self.CreateStatusBar(3) + self.Show() def on_run(self, event=None): - print("Run button pressed") + self.SetStatusText("Running...", 2) self.runbtn.SetBackgroundColour("red") self.get_from_gui() + wx.Yield() self.bpeng.run_df(decon=self.use_decon, use_converted=self.use_converted, interactive=self.use_interactive) - self.ss.set_df(self.bpeng.rundf) + self.load_to_gui() self.runbtn.SetBackgroundColour("green") if not self.hide_col_flag: self.on_hide_columns() + self.SetStatusText("Outputs: " + self.bpeng.outfile, 1) + self.SetStatusText("Completed: " + str(self.bpeng.runtime) + " seconds", 2) + def on_run_selected(self, event=None, rows=None): + # Get from GUI + self.SetStatusText("Running...", 2) self.runbtn2.SetBackgroundColour("red") if rows is None: # Get Selected Rows @@ -309,34 +383,49 @@ def on_run_selected(self, event=None, rows=None): print("Running Selected Rows:", selected_rows) # Get Sub Dataframe with Selected Rows self.get_from_gui() + wx.Yield() + + # RUN THE SELECTED ROWS topdf = deepcopy(self.bpeng.rundf) subdf = self.bpeng.rundf.iloc[selected_rows] # Run SubDF subdf2 = self.bpeng.run_df(df=subdf, decon=self.use_decon, use_converted=self.use_converted, - interactive=self.use_interactive) + interactive=self.use_interactive, write_xlsx=False, write_html=False) + # Update the main dataframe # topdf.iloc[selected_rows] = subdf2 topdf = set_row_merge(topdf, subdf, selected_rows) self.bpeng.rundf = topdf - self.ss.set_df(self.bpeng.rundf) + + # Write the results + self.bpeng.write_xlsx() + + # Load to GUI + self.load_to_gui() # Finish by coloring the button green self.runbtn2.SetBackgroundColour("green") if not self.hide_col_flag: self.on_hide_columns() + self.SetStatusText("Outputs: " + self.bpeng.outfile, 1) + self.SetStatusText("Completed: " + str(self.bpeng.runtime) + " seconds", 2) + def clear_all(self, event=None): self.ss.delete_all() self.ss.set_col_headers(["Sample name", "Data 
Directory"]) def load_file(self, filename): print("Loading File:", filename) + # Set status + self.SetStatusText("Inputs: " + filename, 0) try: self.ss.delete_all() except Exception: pass self.bpeng.top_dir = os.path.dirname(filename) - df = file_to_df(filename) - self.ss.set_df(df) + self.bpeng.filename = filename + self.bpeng.rundf = file_to_df(filename) + self.load_to_gui() # dirname = os.path.dirname(filename) # self.set_dir_tet_box(dirname) self.reset_hidden_columns() @@ -381,24 +470,28 @@ def get_from_gui(self): self.use_decon = self.usedeconbox.GetValue() self.use_interactive = self.interactivebox.GetValue() - # dirname = self.dirtxtbox.GetValue() - # tol = self.tolbox.GetValue() - # self.bpeng.data_dir = dirname - # try: - # self.bpeng.tolerance = float(tol) - # except Exception as e: - # print("Error with Tolerance Value. Using default value of 50 Da", e) - # self.bpeng.tolerance = 10 - # self.tolbox.SetValue("10") - self.ss.remove_empty() ssdf = self.ss.get_df() self.bpeng.rundf = ssdf + def load_to_gui(self): + self.ss.set_df(self.bpeng.rundf) + self.color_columns() + + def open_results_file(self, event): + print("Open Results button pressed") + # Open Results File in Explorer + if os.path.isfile(self.bpeng.outfile): + os.startfile(self.bpeng.outfile) + def on_open_all_html(self, event): print("Open All HTML Reports button pressed") self.bpeng.open_all_html() + def on_open_global_html(self, event): + print("Open Global HTML Report button pressed") + self.bpeng.open_global_html() + def on_open_unidec(self, event): ssdf = self.ss.get_df() self.bpeng.rundf = ssdf @@ -419,6 +512,9 @@ def open_unidec(self, row): if self.bpeng.correct_pair_mode: self.bpeng.run_correct_pair(row, app.eng.pks) app.after_pick_peaks() + elif self.bpeng.dar_mode: + self.bpeng.run_dar(row, app.eng.pks) + app.after_pick_peaks() app.start() def on_add_files(self, event=None): @@ -458,11 +554,12 @@ def add_files(self, paths): newdf = pd.DataFrame({"Sample name": sample_names, "Data 
Directory": data_dir}) self.bpeng.rundf = pd.concat([self.bpeng.rundf, newdf], ignore_index=True) - self.ss.set_df(self.bpeng.rundf) + self.load_to_gui() self.reset_hidden_columns() def on_hide_columns(self, event=None, reset=False): - columns_to_hide = ["Tolerance", "File", "Time", "Config", "Sequence", "Directory", "Matches"] + columns_to_hide = ["Tolerance", "File", "Time", "Config", "Sequence", "Directory", "Matches", + "Global Fixed Mod", "Favored Match", "Apply Fixed Mods", "Disulfides Oxidized"] if not self.hide_col_flag and not reset: for keyword in columns_to_hide: self.ss.hide_columns_by_keyword(keyword) @@ -493,14 +590,58 @@ def on_hide_height_columns(self, event=None, reset=False): def on_hide_percent_columns(self, event=None, reset=False): if not self.hide_percentcol_flag and not reset: - self.ss.hide_columns_by_keyword("%") + self.ss.hide_columns_by_keyword(" %") self.hide_percentcol_flag = True self.hidepercentbtn.SetLabel("Show % Columns") else: self.hidepercentbtn.SetLabel("Hide % Columns") - self.ss.show_columns_by_keyword("%") + self.ss.show_columns_by_keyword(" %") self.hide_percentcol_flag = False + def color_columns(self): + for row in basic_parameters: + if row[1]: + self.ss.color_columns_by_keyword(row[0], "#D1F2EB") + else: + self.ss.color_columns_by_keyword(row[0], "#D6EAF8") + + for row in config_parameters: + if row[1]: + self.ss.color_columns_by_keyword(row[0], "#FEF9E7") + else: + self.ss.color_columns_by_keyword(row[0], "#FEF9E7") + + for row in recipe_w: + if row[1]: + self.ss.color_columns_by_keyword(row[0], "#BB8FCE") + else: + self.ss.color_columns_by_keyword(row[0], "#E8DAEF") + + for row in recipe_d: + if row[1]: + self.ss.color_columns_by_keyword(row[0], "#D98880") + else: + self.ss.color_columns_by_keyword(row[0], "#F5B7B1") + + self.ss.color_columns_by_keyword("Height", "#5D6D7E") + self.ss.color_columns_by_keyword("%", "#D5D8DC") + self.ss.color_columns_by_keyword("BsAb Pairing Calculated", "#7DCEA0") + 
self.ss.color_columns_by_keyword("DAR", "#7DCEA0") + self.ss.color_columns_by_keyword("Light Chain Scrambled", "#F7DC6F") + + self.ss.color_columns_by_keyword("Sequence", "#FDEBD0") + self.ss.color_columns_by_keyword("Matches", "#D5F5E3") + + def update_progress(self, i, total_n): + status_text = "Processing file {} of {}".format(i+1, total_n) + self.SetStatusText(status_text, 2) + # update gui + wx.Yield() + + """ + def on_cell_clicked(self, event=None): + print("Cell clicked")""" + def on_help_page(self, event=None): print("Help button pressed") dlg = HelpDlg() @@ -513,17 +654,26 @@ def on_exit(self, event=None): if __name__ == "__main__": app = wx.App() frame = UPPApp() - frame.usedeconbox.SetValue(True) + frame.usedeconbox.SetValue(False) path = "C:\\Data\\Wilson_Genentech\\sequences_short.xlsx" path = "C:\\Data\\Wilson_Genentech\\BsAb\\BsAb test short.xlsx" - + path = "C:\\Data\\UPPDemo\\BsAb\\BsAb test - Copy.xlsx" + # path = "C:\\Data\\UPPDemo\\DAR\\Biotin UPP template WP_MTM_DoubleDec.xlsx" + # path = "C:\\Data\\Wilson_Genentech\\BsAb\\BsAb test.xlsx" + # path = "C:\\Data\\Wilson_Genentech\\DAR\\Biotin UPP template test.xlsx" + # path = "C:\\Data\\UPPDemo\\BsAb\\outliers.xlsx" + # path = "C:\\Data\\UPPDemo\\BsAb\\BsAb test short.xlsx" + path = "C:\\Data\\UPPDemo\\DAR\\Biotin UPP template WP_MTM.xlsx" + path = "C:\\Data\\UPPDemo\\FileOpening\\filetest.xlsx" # frame.on_help_page() # exit() if False: + frame.useconvbox.SetValue(False) + frame.usedeconbox.SetValue(True) frame.load_file(path) # frame.set_dir_tet_box("C:\\Data\\Wilson_Genentech\\Data") # print(df) - # frame.on_run() + frame.on_run() # frame.on_run_selected(rows=[1]) # frame.on_run_selected(rows=[0]) # frame.on_add_files() diff --git a/unidec/UniChrom.py b/unidec/UniChrom.py index e643eebc..2562eaff 100644 --- a/unidec/UniChrom.py +++ b/unidec/UniChrom.py @@ -329,7 +329,7 @@ def plot_chrom_shading(self, e=None): tstart = s.attrs["timestart"] tend = s.attrs["timeend"] #print(tstart, tend) - 
self.view.plotc.add_rect(tstart, min, tend - tstart, max - min, facecolor=s.color) + self.view.plotc.add_rect(tstart, min, tend - tstart, max - min, facecolor=s.color, nopaint=True) self.view.plotc.repaint() diff --git a/unidec/batch.py b/unidec/batch.py index a915252f..86e692ea 100644 --- a/unidec/batch.py +++ b/unidec/batch.py @@ -1,11 +1,12 @@ from unidec.engine import UniDec from unidec.modules.matchtools import * -from unidec.tools import known_extensions, strip_char_from_string +from unidec.tools import known_extensions, strip_char_from_string, find_kernel_file import os import numpy as np import time import webbrowser import sys +import re basic_parameters = [["Sample name", True, "The File Name or Path. File extensions are optional."], ["Data Directory", False, "The directory of the data files. If you do not specify this, " @@ -24,6 +25,12 @@ ["Config Low Mass", False, "Deconvolution Setting: The Low Mass Limit in Da"], ["Config High m/z", False, "Deconvolution Setting: The High m/z Limit"], ["Config Low m/z", False, "Deconvolution Setting: The Low m/z Limit"], + ["Config High z", False, "Deconvolution Setting: The High Charge Limit"], + ["Config Low z", False, "Deconvolution Setting: The Low Charge Limit"], + ["Config Integral Lower", False, "Deconvolution Setting: The Lower Integral Limit"], + ["Config Integral Upper", False, "Deconvolution Setting: The Upper Integral Limit"], + ["Quant Mode", False, "Quantification Setting: How to extract intensity from the peaks. " + "Unless this is set to \"Integral\", it will use peak height"], ["Config Sample Mass Every", False, "Deconvolution Setting: The Mass Bin Size in Da"], ["Config m/z Peak FWHM", False, "Deconvolution Setting: The Peak Width in m/z used for Deconvolution"], @@ -31,13 +38,28 @@ "Deconvolution. 0=gaussian, 1=lorentzian, 2=split G/L"], ["Config File", False, "Path to Config File. Will load this file and use the settings. 
Note," " any settings specified in the batch file will override the config file."], + ["DoubleDec Kernel File", False, "Path to DoubleDec Kernel File. If specified, it will load " + "this file and use DoubleDec. WARNING: Do not use Kernel files " + "that are also in the list. " + "If you want to use a kernel file from your list, " + "first deconvolve it, " + "then copy the _mass.txt file out to a separate folder. " + "Then, add that _mass.txt file as the path. Otherwise, " + "you will be overwriting " + "the kernel each time, which can cause unstable results."], ] recipe_w = [["Tolerance (Da)", False, "The Tolerance in Da. Default is 50 Da if not specified."], ["Variable Mod File", False, "The File Name or Path of the Variable Mod File. " "Can be either Excel or CSV. The file should have \"Mass\" and \"Name\" " "columns with these exact headers. If not specified," - " no modifications will be used."], + " no modifications will be used. " + "Note, if a Variable Mod File is provided, it will apply at least one mod " + "to each complex. In other words, if you only have one line in the file, " + "it will act as a global fixed mod (see below) to the whole complex. " + "If you want an option for unmodified, you need to include a line with " + "a mass of 0.0 and a name of \"Unmodified\" or \" \". " + ], ["Fixed Mod File", False, "The File Name or Path of the Fxied Mod File. " "Can be either Excel or CSV. The file should have \"Mass\" and \"Name\" " "columns with these exact headers. Can also have \"Number\" if a " @@ -71,8 +93,8 @@ "closesr. Other keywords like \"Ignore\" and \"Correct\" can be used to select " "specific types of matches over others."], ["Sequence {n}", False, - "The amino acid sequence or mass for the {n}th protein. Can be multiple sequences, " - "each as their own columns. If it can convert to float, it will. " + "The amino acid sequence or mass for the {n} protein or molecule. 
There can be multiple sequences, " + "each as their own columns and with unique names. If it can convert to float, it will. " "If not, it will assume it is an amino acid sequence with 1 letter codes."], ["Correct", True, "The Correct Pairing. This may be a list of the correct pairs, " "listed as Seq1+Seq2, for example. Can also be a single mass value. "], @@ -88,6 +110,54 @@ "Form more details, see https://dx.doi.org/10.1080/19420862.2016.1232217. "] ] +recipe_d = [ + ["Protein Mass", True, "The Protein Mass in either (1) a float in Da, (2) as an amino acid sequence, or " + "(3) as a string of Seq{n}+Seq{m}, where Seq{n} and Seq{m} are the names of " + "the columns containing the amino acid sequences or masses of individual species. " + "See below for more details on the Sequence {n} column. " + ], + ["Drug Mass", True, "The Drug Mass in Da."], + ["Min Drug", False, + "The minimum number of drug molecules to include in the ADC. Default is 0 if not specified."], + ["Max Drug", True, "The maximum number of drug molecules to include in the ADC."], + ["Tolerance (Da)", False, "The Tolerance in Da. Default is 50 Da if not specified."], + ["Fixed Mod File", False, "The File Name or Path of the Fxied Mod File. " + "Can be either Excel or CSV. The file should have \"Mass\" and \"Name\" " + "columns with these exact headers. Can also have \"Number\" if a " + "multiple is used If not specified, no modifications will be used. " + "Note, it will apply one set of all fixed mods to each sequence. " + "If you specify, \"Seq1+Seq2\", it will apply the fixed mods to both sequences." + "If you specify a single amino acid sequence or float mass in the " + "Protein Mass column, it will apply the fixed mods only once. "], + ["Apply Fixed Mods", False, + "A column specifying which sequences should get the fixed mods. " + "Should have the format of \"Seq1 Seq2 Seq3\" where Seq1 is the Sequence 1 " + "column name, etc. Delimiters do not matter. 
" + "You can specify \"All\" to apply mods to all sequences " + "or \"None\" to apply to none. " + "It will assume yes to all if this column is not present. "], + ["Global Fixed Mod", False, "A column specifying a global fixed mass shift to apply to all complexes. " + "Unlike Fixed Mod File, which applies a fixed modification to each sequence, " + "this is applied only once to each complex. " + "Also, it is a single float value rather than a file. " + "This will be have identically to the Fixed Mod File " + "if you specify a single amino acid sequence or mass float in the " + "Protein Mass column. "], + ["Sequence {n}", False, + "The amino acid sequence or mass for the {n} protein or molecule. There can be multiple sequences, " + "each as their own columns and with unique names. If it can convert to float, it will. " + "If not, it will assume it is an amino acid sequence with 1 letter codes."], + ["Disulfides Oxidized", False, + "A column specifying the sequences that should be fully disulfide oxidized. " + "Should have the format of \"Seq1 Seq2 Seq3\" where Seq1 is the Sequence 1 " + "column name, etc. It will not work if only the amino acid sequence is given. Delimiters do not matter. " + "It will assume no to all if this column is not present. " + "You can specify \"All\" to oxidize all sequences " + "or \"None\" to oxidize none. " + "Will only work if sequences are amino acid codes with C. " + "It will subtract one H mass for each C."], +] + def find_file(fname, folder, use_converted=True): # If use_converted is true, it will look for the converted file first, then the original. 
@@ -114,23 +184,23 @@ def find_file(fname, folder, use_converted=True): else: # Return the original file name and hope for the best return fname - else: # If a file extension is not regonized, it will try to find the file with the extensions it knows for ext in extensions: - if os.path.exists(os.path.join(folder, fname + ext)): - return os.path.join(folder, fname + ext) + testpath = os.path.join(folder, fname + ext) + if os.path.exists(testpath): + return testpath return fname -def check_for_correct_in_keys(df): +def check_for_word_in_keys(df, word="Correct"): for k in df.keys(): - if "Correct" in k: + if word in k: return True return False -def set_param_from_row(eng, row): +def set_param_from_row(eng, row, dirname=""): for k in row.keys(): val = row[k] if isinstance(val, (float, int)): @@ -150,6 +220,7 @@ def set_param_from_row(eng, row): eng.config.peakthresh = float(val) except Exception as e: print("Error setting peak threshold", k, val, e) + if "Config Low Mass" in k: try: eng.config.masslb = float(val) @@ -172,6 +243,17 @@ def set_param_from_row(eng, row): except Exception as e: print("Error setting high m/z", k, val, e) + if "Config Low z" in k: + try: + eng.config.startz = int(float(val)) + except Exception as e: + print("Error setting low z", k, val, e) + if "Config High z" in k: + try: + eng.config.endz = int(float(val)) + except Exception as e: + print("Error setting high z", k, val, e) + if "Config Sample Mass Every" in k: try: eng.config.massbins = float(val) @@ -186,10 +268,39 @@ def set_param_from_row(eng, row): if "Config m/z Peak Shape" in k or "Config mz Peak Shape" in k: try: - eng.config.psfun = int(val) + eng.config.psfun = int(float(val)) except Exception as e: print("Error setting massbins", k, val, e) + if "Config Integral Lower" in k: + try: + eng.config.integratelb = float(val) + except Exception as e: + print("Error setting integral lower", k, val, e) + eng.config.integratelb = "" + + if "Config Integral Upper" in k: + try: + 
eng.config.integrateub = float(val) + except Exception as e: + print("Error setting integral upper", k, val, e) + eng.config.integrateub = "" + + if "DoubleDec Kernel File" in k: + print(val) + kernel_file = find_kernel_file(val) + if kernel_file is None: + kernel_file = find_kernel_file(os.path.join(dirname, val)) + if kernel_file is None: + kernel_file = find_kernel_file(os.path.join(os.getcwd(), val)) + + if kernel_file is not None: + eng.config.kernel = kernel_file + eng.config.doubledec = True + print("Using DoubleDec kernel", kernel_file) + else: + eng.config.doubledec = False + # print(eng.config.maxmz, eng.config.minmz, k) return eng @@ -255,7 +366,7 @@ def remove_columns(df, key): class UniDecBatchProcessor(object): - def __init__(self): + def __init__(self, parent=None): self.eng = UniDec() self.tolerance = 50 self.data_dir = "" @@ -267,16 +378,26 @@ def __init__(self): self.fmoddf = None self.autopw = True self.correct_pair_mode = False + self.dar_mode = False self.time_range = None + self.integrate = False + self.global_html_str = "" + self.filename = "" + self.outbase = "" + self.outfile = "" + self.global_html_file = "results.html" + self.parent = parent + self.runtime = -1 def run_file(self, file=None, decon=True, use_converted=True, interactive=False): + self.filename = file self.top_dir = os.path.dirname(file) self.rundf = file_to_df(file) self.run_df(decon=decon, use_converted=use_converted, interactive=interactive) - def run_df(self, df=None, decon=True, use_converted=True, interactive=False): - + def run_df(self, df=None, decon=True, use_converted=True, interactive=False, write_html=True, write_xlsx=True): + self.global_html_str = "" # Print the data directory and start the clock clockstart = time.perf_counter() # Set the Pandas DataFrame @@ -291,12 +412,18 @@ def run_df(self, df=None, decon=True, use_converted=True, interactive=False): htmlfiles = [] # Check if the "Correct" column is in the DataFrame to start correct pair mode - 
self.correct_pair_mode = check_for_correct_in_keys(self.rundf) + self.correct_pair_mode = check_for_word_in_keys(self.rundf, "Correct") + # Check for the DAR mode + self.dar_mode = check_for_word_in_keys(self.rundf, "Max Drug") # if self.correct_pair_mode: # self.rundf = remove_columns(self.rundf, "Height") + total_n = len(self.rundf) # Loop through the DataFrame for i, row in self.rundf.iterrows(): + if self.parent is not None: + self.parent.update_progress(i, total_n) + self.autopw = True self.eng.reset_config() path = self.get_file_path(row, use_converted=use_converted) @@ -305,7 +432,7 @@ self.time_range = get_time_range(row) # If the file exists, open it - if os.path.isfile(path): + if os.path.exists(path): print("Opening:", path) if not use_converted: print("Refreshing") @@ -322,7 +449,14 @@ print("Error loading config file", row["Config File"], e) # Set the deconvolution parameters from the DataFrame - self.eng = set_param_from_row(self.eng, row) + self.eng = set_param_from_row(self.eng, row, self.data_dir) + + # Check whether to integrate or use peak height + self.integrate = False + if "Quant Mode" in row: + if row["Quant Mode"] == "Integral": + self.integrate = True + print("Using Integral Mode") # Run the deconvolution or import the prior deconvolution results if decon: @@ -332,8 +466,24 @@ print("Auto Peak Width", self.autopw) self.eng.autorun(auto_peak_width=self.autopw, silent=True) else: - self.eng.unidec_imports(efficiency=False) - self.eng.pick_peaks() + try: + self.eng.unidec_imports(efficiency=False) + self.eng.pick_peaks() + except FileNotFoundError: + # If the Config m/z Peak FWHM is specified, do not use the auto peak width + if "Config m/z Peak FWHM" in row: + self.autopw = not check_for_floatable(row, "Config 
m/z Peak FWHM") + print("Auto Peak Width", self.autopw) + self.eng.autorun(auto_peak_width=self.autopw, silent=True) + + if self.integrate: + try: + self.eng.autointegrate() + except Exception as err: + print("Error in integrating", err) + self.integrate = False + + results_string = None # The First Recipe, correct pair mode if self.correct_pair_mode: @@ -343,9 +493,28 @@ def run_df(self, df=None, decon=True, use_converted=True, interactive=False): # Merge the row back in the df self.rundf = set_row_merge(self.rundf, newrow, [i]) + # Add the results string + if "BsAb Pairing Calculated (%)" in newrow.keys(): + results_string = "The BsAb Pairing Calculated is: " + str(newrow["BsAb Pairing Calculated (%)"]) + + if self.dar_mode: + # Run DAR mode + newrow = self.run_dar(row) + + # Merge the row back in the df + self.rundf = set_row_merge(self.rundf, newrow, [i]) + try: + results_string = "The Drug-to-Antibody Ratio (DAR) is: " + str(newrow["DAR"]) + except Exception: + results_string = None + # Generate the HTML report - outfile = self.eng.gen_html_report(open_in_browser=False, interactive=interactive) + outfile = self.eng.gen_html_report(open_in_browser=False, interactive=interactive, + results_string=results_string) htmlfiles.append(outfile) + + # Add the HTML report to the global HTML string + self.global_html_str += self.eng.html_str else: # When files are not found, print the error and add empty results print("File not found:", path) @@ -358,20 +527,54 @@ def run_df(self, df=None, decon=True, use_converted=True, interactive=False): if self.top_dir == "" and self.data_dir != "": self.top_dir = os.path.dirname(self.data_dir) - # Write the results to an Excel file to the top directory - outfile = os.path.join(self.top_dir, "results.xlsx") - self.rundf.to_excel(outfile) - print("Write to: ", outfile) - # print(self.rundf) + # Get Output File Base Name + try: + filebase = os.path.splitext(os.path.basename(self.filename))[0] + self.outbase = os.path.join(self.top_dir, 
filebase) + except Exception: + self.outbase = os.path.join(self.top_dir, "results") + + if write_xlsx: + self.write_xlsx() + + if write_html: + # Write the global HTML string to a file + self.global_html_file = self.outbase + "_report.html" + with open(self.global_html_file, "w", encoding="utf-8") as f: + title_string = "UPP Results " + str(self.filename) + f.write("") + f.write("
', ' | ', 1) + for i in range(0, len(df.columns) + 1): + html_str = html_str.replace(' | ', ' | ', 1) # Javascript for sorting html_str += """ |
---|