Merge pull request #82 from michaelmarty/v3

V3
michaelmarty · Apr 5, 2023 · a32af14 · a32af14
2 parents ecef9b3 + 26aa849
commit a32af14
Show file tree

Hide file tree

Showing 25 changed files with 1,298 additions and 520 deletions.
diff --git a/readme.md b/readme.md
@@ -28,7 +28,7 @@ You can watch a video tutorial on how to use UniDec here: [https://www.youtube.c
 
 UniDec is distributed under a completely open source license. Our hope is that this allows UniDec to be
 more widely used. If you are interested in including UniDec in another academic or commercial software distribution, 
-you are welcome to email mtmarty@email.arizona.edu for more information. 
+you are welcome to email [email protected] for more information. 
 
 UniDec source code and compiled binaries are released under a modified BSD License as described below. Note, we ask
 that you cite us in any publications. Quantitative citation metrics will help grant applications to support future development.
@@ -68,6 +68,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 Waters DLLs are distributed under the Waters MassLynxSDK EULA license. The Thermo RawFileReader DLLs are distributed under the RawFileReader.doc license. By downloading and using these DLLs, you are accepting those licenses and these terms. Thermo and Waters are indemnified, defended, and held harmless from any claims, including attorneys’ fees, related to the distribution or use of this software. Redistribution of Waters and Thermo libraries is restricted as described in those licenses. Info on other licenses are provided below.
 
+Intel libraries from the oneAPI are distributed under licenses found in the bin folder. 
+
+RDKit libraries are gratefully used for LipiDec and are distributed under the CC-SA license. Please [cite them when using these features](https://www.rdkit.org/docs/Overview.html#citing-the-rdkit). 
+
 ## UniDec Compatible File Types
 
 UniDec is built to open .txt files using [numpy.loadtxt](http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.loadtxt.html). 
@@ -191,9 +195,17 @@ Of course, using the pre-compiled version means you don't need to know Python at
 
 ## Change Log
 
+v.6.0.2
+
+**Added a new Drug-to-Antibody Ratio (DAR) calculation mode for UPP.** See help file for the new keywords. 
+
+Added color to the columns on UPP.
+
+Fixed bug with HTML float sorting as text. Fixed major bugs with file imports.
+
 v.6.0.1
 
-*Added Noise Removal Tool* to UniDec and UniDecCD. This allows you to select specific noise regions (common in UHMR data) and set them to 0. Try Tools > Select Noise Peaks and the checking "Remove Noise Peaks" in the Advanced Data Processing Parameters.
+**Added Noise Removal Tool** to UniDec and UniDecCD. This allows you to select specific noise regions (common in UHMR data) and set them to 0. Try Tools > Select Noise Peaks and then checking "Remove Noise Peaks" in the Advanced Data Processing Parameters.
 
 UPP Changes: 
 * Added "Config m/z Peak FWHM", "Config m/z Peak Shape", and "Config Sample Mass Every" parameters. If "Config m/z Peak FWHM" is specified as a float or integer, it will use that and skip the automatic peak width determination.

diff --git a/unidec/LipiDec/AddFeaturestoDF.py b/unidec/LipiDec/AddFeaturestoDF.py
@@ -6,10 +6,10 @@
 def get_overall_tails(name):
     if "|" in name:
         try:
-            tname = name.split("|")[0] # Remove before vertical line
-            tname = tname.split()[1] # Remove after space to get tails
-            tl = tname.split(":")[0] # Get length
-            tu = tname.split(":")[1] # Get unsaturation
+            tname = name.split("|")[0]  # Remove before vertical line
+            tname = tname.split()[1]  # Remove after space to get tails
+            tl = tname.split(":")[0]  # Get length
+            tu = tname.split(":")[1]  # Get unsaturation
         except:
             return name, -1, -1
         try:
@@ -23,15 +23,16 @@ def get_overall_tails(name):
     else:
         tname = name.split()[1]
         fas = tname.split("_")
-        tls=0
-        tus=0
+        tls = 0
+        tus = 0
         for f in fas:
             tl = f.split(":")[0]  # Get length
             try:
                 tu = f.split(":")[1]  # Get unsaturation
             except:
-                print(f, name)
-                exit()
+                print("Unable to parse part: ", f, name)
+                continue
+                # exit()
             try:
                 tl = int(tl)
             except:
@@ -67,6 +68,52 @@ def set_tails(df, name="Metabolite name"):
     return df
 
 
+def set_class_name(df, name="Metabolite name"):  # Return a copy of df with an added "Class Name" column parsed from df[name]
+    df = df.copy(deep=True)  # deep copy, so the caller's DataFrame is left untouched
+    names = df[name].to_numpy()
+    classes = []
+    for n in names:
+        if "|" in n:
+            n = n.split("|")[1]  # when a "|" is present, keep the second "|"-separated field
+        n = n.split()[0]  # the class is the first whitespace-separated token
+        classes.append(n)
+    df["Class Name"] = classes  # one entry per row, in row order
+    return df
+
+
+def calc_tail_diff(df, class_col="Class Name", tail_col="Tail Lengths", unsat_col="Tail Unsaturation"):  # Add each row's offset from its class mean; writes into df itself (no copy) and returns it
+    # Calculate average tail lengths and unsaturations for each class
+    unique_classes = np.unique(df[class_col])
+    avg_class_lengths = []  # kept parallel to unique_classes
+    avg_class_unsat = []  # kept parallel to unique_classes
+    for c in unique_classes:
+        avg_class_lengths.append(np.mean(df[df[class_col] == c][tail_col]))  # mean of tail_col over the rows of class c
+        avg_class_unsat.append(np.mean(df[df[class_col] == c][unsat_col]))  # mean of unsat_col over the rows of class c
+
+    # Subtract the class average from each row's value (value - class mean)
+    tail_diffs = []
+    unsat_diffs = []
+    for i, row in df.iterrows():
+        c = row[class_col]
+        t = row[tail_col]
+        u = row[unsat_col]
+        index = np.where(unique_classes == c)[0][0]  # position of this row's class in unique_classes
+        tdiff = t - avg_class_lengths[index]
+        udiff = u - avg_class_unsat[index]
+        tail_diffs.append(tdiff)
+        unsat_diffs.append(udiff)
+
+    df["Tail Length Diff"] = tail_diffs  # new column on the passed-in frame
+    df["Tail Unsat Diff"] = unsat_diffs  # new column on the passed-in frame
+
+    print('Unique Classes: ', unique_classes)  # summary output to stdout
+    print('Average Tail Lengths: ', avg_class_lengths)
+    print('Average Tail Unsaturation: ', avg_class_unsat)
+
+
+    return df
+
+
 adduct_translator = np.array([["[M+H]+", "[M+H]", "+1", 1, "[M+H]1+"],
                               ["[M+NH4]+", "[M+NH4]", "+1", 1, "[M+NH4]1+"],
                               ["[M+2H]2+", "[M+2H]", "+2", 2, "[M+2H]2+"],
@@ -82,7 +129,7 @@ def parse_names(df, namecolumn="Metabolite name", parse_adduct=True):
     full_names_adduct = []
     charges = []
     for i, row in df.iterrows():
-        #Parse Name
+        # Parse Name
         mol_name = row[namecolumn]
         if "|" in mol_name:
             names = mol_name.split("|")
@@ -93,7 +140,7 @@ def parse_names(df, namecolumn="Metabolite name", parse_adduct=True):
         simp_names.append(simp_name)
         full_names.append(full_name)
 
-        #Parse Adduct
+        # Parse Adduct
         if parse_adduct:
             prec_adduct = row["Adduct type"]
             if prec_adduct in adduct_translator[:, 0]:
@@ -152,7 +199,7 @@ def isotope_ratios(df):
         s = np.array(re.split(" |:", sraw))
         try:
             ints = s[1::2].astype(float)
-            ratio = ints[1]/ints[0]
+            ratio = ints[1] / ints[0]
         except:
             ratio = -1
         ratios.append(ratio)