From 3f057620100d16e580ed5c1e55c75c645cf68f30 Mon Sep 17 00:00:00 2001
From: deeenes <turei.denes@gmail.com>
Date: Fri, 24 Jan 2025 10:59:52 +0100
Subject: [PATCH] `service.legacy`: methods to access SQLAlchemy tables and
 column names

---
 omnipath_server/service/_legacy.py | 225 ++++++++++++++++++-----------
 1 file changed, 142 insertions(+), 83 deletions(-)

diff --git a/omnipath_server/service/_legacy.py b/omnipath_server/service/_legacy.py
index 95e15e2..92b9f46 100644
--- a/omnipath_server/service/_legacy.py
+++ b/omnipath_server/service/_legacy.py
@@ -13,26 +13,33 @@
 # https://www.gnu.org/licenses/gpl-3.0.txt
 #
 
+from collections.abc import Generator
 import os
 import re
 import copy
-import collections
-import itertools
 import json
 import warnings
+import itertools
 import contextlib
-from collections.abc import Generator
+import collections
+
+from pypath import resources as resources_mod
+from pypath_common import _misc, _settings
+from pypath_common import _constants as _const
+from sqlalchemy.sql.base import ReadOnlyColumnCollection
 
 import numpy as np
 import pandas as pd
-from pypath_common import _misc
-from pypath_common import _constants as _const
-from pypath_common import _settings
-from pypath import resources as resources_mod
 
 from .. import _log, _connection
 from ..schema import _legacy as _schema
 
+__all__ = [
+    'LICENSE_IGNORE',
+    'LegacyService',
+    'ignore_pandas_copywarn',
+]
+
 
 LICENSE_IGNORE = 'ignore'
 
@@ -89,7 +96,7 @@ class LegacyService:
                 'tab',
                 'text',
                 'tsv',
-                'table'
+                'table',
             },
             'license': {
                 'ignore',
@@ -244,7 +251,7 @@ class LegacyService:
                 'OR',
                 'and',
                 'or',
-            }
+            },
         },
         'annotations': {
             'header': None,
@@ -359,7 +366,7 @@ class LegacyService:
                 'trans',
                 'receiver',
                 'rec',
-                'both'
+                'both',
             },
             'topology': {
                 'secreted',
@@ -619,7 +626,7 @@ class LegacyService:
             'secreted': 'bool',
             'plasma_membrane_transmembrane': 'bool',
             'plasma_membrane_peripheral': 'bool',
-        }
+        },
     )
 
     # the annotation attributes served for the cytoscape app
@@ -635,7 +642,7 @@ class LegacyService:
                 'family',
                 'subfamily',
                 #'has_protein_substrates',
-            )
+            ),
         ),
         ('CancerSEA', 'state'),
         ('GO_Intercell', 'mainclass'),
@@ -646,7 +653,7 @@ class LegacyService:
             (
                 'mainclass',
                 #'secreted',
-            )
+            ),
         ),
         (
             'OPM',
@@ -654,7 +661,7 @@ class LegacyService:
                 'membrane',
                 'family',
                 #'transmembrane',
-            )
+            ),
         ),
         ('KEGG', 'pathway'),
         #(
@@ -674,7 +681,7 @@ class LegacyService:
         #('MSigDB', 'geneset'),
         #('Integrins', 'in Integrins'),
         ('HGNC', 'mainclass'),
-        ('CPAD', ('pathway', 'effect_on_cancer', 'cancer', )),
+        ('CPAD', ('pathway', 'effect_on_cancer', 'cancer')),
         ('Signor', 'pathway'),
         ('Ramilowski2015', 'mainclass'),
         ('HPA_subcellular', 'location'),
@@ -704,7 +711,7 @@ def __init__(
             input_files = None,
             only_tables = None,
             exclude_tables = None,
-        ):
+    ):
         """
         Server based on ``pandas`` data frames.
 
@@ -750,12 +757,12 @@ def _read_tables(self):
             fname_gz = f'{fname}.gz'
             fname = fname_gz if os.path.exists(fname_gz) else fname
 
-            _log('Loading dataset `%s` from file `%s`.' % (name, fname))
+            _log(f'Loading dataset `{name}` from file `{fname}`.')
 
             if not os.path.exists(fname):
 
                 _log(
-                    'Missing table: `%s`.' % fname
+                    'Missing table: `%s`.' % fname,
                 )
                 continue
 
@@ -769,7 +776,7 @@ def _read_tables(self):
             )
 
             _log(
-                'Table `%s` loaded from file `%s`.' % (name, fname)
+                f'Table `{name}` loaded from file `{fname}`.',
             )
 
 
@@ -782,8 +789,11 @@ def _network(self, req):
         if b'format' in req.args and req.args['format'] == b'json':
             return json.dumps(val)
         else:
-            return '%s\n%s' % ('\t'.join(hdr), '\t'.join(
-                [str(val[h]) for h in hdr]))
+            return '{}\n{}'.format(
+                '\t'.join(hdr), '\t'.join(
+                [str(val[h]) for h in hdr],
+                ),
+            )
 
 
     def _preprocess_interactions(self):
@@ -795,15 +805,15 @@ def _preprocess_interactions(self):
         _log('Preprocessing interactions.')
         tbl = self.data['interactions']
         tbl['set_sources'] = pd.Series(
-            [set(s.split(';')) for s in tbl.sources]
+            [set(s.split(';')) for s in tbl.sources],
         )
         tbl['set_dorothea_level'] = pd.Series(
             [
                 set(s.split(';'))
                 if not pd.isnull(s) else
-                set([])
+                set()
                 for s in tbl.dorothea_level
-            ]
+            ],
         )
 
 
@@ -816,7 +826,7 @@ def _preprocess_enzsub(self):
         _log('Preprocessing enzyme-substrate relationships.')
         tbl = self.data['enzsub']
         tbl['set_sources'] = pd.Series(
-            [set(s.split(';')) for s in tbl.sources]
+            [set(s.split(';')) for s in tbl.sources],
         )
 
 
@@ -832,7 +842,7 @@ def _preprocess_annotations_old(self):
         def _agg_values(vals):
 
             result = (
-                '#'.join(sorted(set(str(ii) for ii in vals)))
+                '#'.join(sorted({str(ii) for ii in vals}))
                 if not all(
                     isinstance(i, (int, float)) or (
                         isinstance(i, str) and
@@ -941,7 +951,7 @@ def _update_resources(self):
                 ('database', 'databases'),
                 ('sources', 'databases'),
                 ('source', 'databases'),
-                ('category', 'categories')
+                ('category', 'categories'),
             ):
 
                 if colname in tbl.columns:
@@ -949,11 +959,15 @@ def _update_resources(self):
                     break
 
             # collecting all resource names
-            values = sorted(set(
-                itertools.chain(*(
-                    val.split(';') for val in getattr(tbl, colname)
-                ))
-            ))
+            values = sorted(
+                set(
+                    itertools.chain(
+                        *(
+                            val.split(';') for val in getattr(tbl, colname)
+                        ),
+                    ),
+                ),
+            )
 
             for db in values:
 
@@ -1011,7 +1025,7 @@ def _update_resources(self):
 
                         self._resources_dict[db]['queries'][query_type] = {
                             'generic_categories': sorted(
-                                set(tbl_db.category)
+                                set(tbl_db.category),
                             ),
                         }
 
@@ -1053,10 +1067,10 @@ def _check_args(self, args: dict, query_type: str):
                 if unknowns:
 
                     result.append(
-                        ' ==> Unknown values for argument `%s`: `%s`' % (
+                        ' ==> Unknown values for argument `{}`: `{}`'.format(
                             arg,
-                            ', '.join(str(u) for u in unknowns)
-                        )
+                            ', '.join(str(u) for u in unknowns),
+                        ),
                     )
 
             else:
@@ -1097,13 +1111,11 @@ def queries(self, req):
 
         if query_type in self.args_reference:
 
-            result = dict(
-                (
-                    k,
+            result = {
+                    k:
                     sorted(v) if isinstance(v, _const.LIST_LIKE) else v
-                )
                 for k, v in self.args_reference[query_type].items()
-            )
+            }
 
             if query_param is not None and query_param in result:
 
@@ -1126,11 +1138,11 @@ def queries(self, req):
         else:
 
             return 'argument\tvalues\n%s' % '\n'.join(
-                '%s\t%s' % (
+                '{}\t{}'.format(
                     k,
                     ';'.join(v)
                         if isinstance(v, (list, set, tuple)) else
-                    str(v)
+                    str(v),
                 )
                 for k, v in result.items()
             )
@@ -1139,9 +1151,8 @@ def queries(self, req):
     @classmethod
     def _dict_set_to_list(cls, dct):
 
-        return dict(
-            (
-                key,
+        return {
+                key:
                 (
                     sorted(val)
                         if isinstance(val, _const.LIST_LIKE) else
@@ -1149,9 +1160,8 @@ def _dict_set_to_list(cls, dct):
                         if isinstance(val, dict) else
                     val
                 )
-            )
             for key, val in dct.items()
-        )
+        }
 
 
     def databases(self, req):
@@ -1191,8 +1201,10 @@ def databases(self, req):
 
             for dataset in datasets:
 
-                result[dataset] = sorted(set.union(
-                    *tbl[tbl.type == dataset].set_sources)
+                result[dataset] = sorted(
+                    set.union(
+                        *tbl[tbl.type == dataset].set_sources,
+                    ),
                 )
 
         else:
@@ -1207,7 +1219,7 @@ def databases(self, req):
         else:
 
             return 'dataset\tresources\n%s' % '\n'.join(
-                '%s\t%s' % (k, ';'.join(v)) for k, v in result.items()
+                '{}\t{}'.format(k, ';'.join(v)) for k, v in result.items()
             )
 
 
@@ -1240,6 +1252,17 @@ def datasets(self, req):
 
             return ';'.join(result)
 
+
+    def _schema(self, query_type: str) -> ReadOnlyColumnCollection:
+
+        return getattr(_schems, query_type.capitalize())
+
+
+    def _columns(self, query_type: str) -> list[str]:
+
+        return self._schema(query_type).__table__.columns
+
+
     def _query(self, args: dict, query_type: str) -> str:
         """
         Generates and executes the SQL query based on the request
@@ -1260,7 +1283,39 @@ def _query(self, args: dict, query_type: str) -> str:
 
             args['resources'] = args['databases']
 
-        # HELLO
+        hdr = self._columns(query_type)
+
+        # Filtering for resources
+        if b'resources' in req.args:
+
+            resources = self._args_set(req, 'resources')
+
+            tbl = tbl.loc[
+                [
+                    bool(sources & resources)
+                    for sources in tbl.set_sources
+                ]
+            ]
+
+        # Filtering for proteins
+        if b'proteins' in req.args:
+
+            proteins = self._args_set(req, 'proteins')
+
+            tbl = tbl.loc[
+                [
+                    bool(this_proteins & proteins)
+                    for this_proteins in tbl.set_proteins
+                ]
+            ]
+
+        license = self._get_license(req)
+
+        tbl = self._filter_by_license_complexes(tbl, license)
+
+        tbl = tbl.loc[:,hdr]
+
+        return self._serve_dataframe(tbl, req)
 
 
     def interactions(
@@ -1271,7 +1326,7 @@ def interactions(
             dorothea_levels = {'A', 'B'},
             organisms = {9606},
             source_target = 'OR',
-        ):
+    ):
 
         bad_req = self._check_args(req)
 
@@ -1328,9 +1383,9 @@ def interactions(
         # keep only valid dataset names
         args['datasets'] = args['datasets'] & self.datasets_
 
-        args['organisms'] = set(
+        args['organisms'] = {
             int(t) for t in args['organisms'] if t.isdigit()
-        )
+        }
         args['organisms'] = args['organisms'] or organisms
 
         # do not allow impossible values
@@ -1481,10 +1536,12 @@ def interactions(
             self._parse_bool_arg(req.args['signed'])
         ):
 
-            tbl = tbl.loc[np.logical_or(
-                tbl.is_stimulation == 1,
-                tbl.is_inhibition == 1
-            )]
+            tbl = tbl.loc[
+                np.logical_or(
+                    tbl.is_stimulation == 1,
+                    tbl.is_inhibition == 1,
+                )
+            ]
 
         # loops: remove by default
         if (
@@ -1533,7 +1590,7 @@ def interactions(
                 hdr.extend(
                     set(tbl.columns) &
                     self.args_reference['interactions']['datasets'] &
-                    args['datasets']
+                    args['datasets'],
                 )
 
             else:
@@ -1606,8 +1663,8 @@ def enzsub(
             self,
             req,
             organisms = {9606},
-            enzyme_substrate = 'OR'
-        ):
+            enzyme_substrate = 'OR',
+    ):
 
         bad_req = self._check_args(req)
 
@@ -1617,7 +1674,7 @@ def enzsub(
 
         hdr = [
             'enzyme', 'substrate', 'residue_type',
-            'residue_offset', 'modification'
+            'residue_offset', 'modification',
         ]
 
         if b'enzyme_substrate' in req.args:
@@ -1635,14 +1692,14 @@ def enzsub(
         for arg in (
             'enzymes', 'substrates', 'partners',
             'resources', 'organisms', 'types',
-            'residues'
+            'residues',
         ):
 
             args[arg] = self._args_set(req, arg)
 
-        args['organisms'] = set(
+        args['organisms'] = {
             int(t) for t in args['organisms'] if t.isdigit()
-        )
+        }
         args['organisms'] = args['organisms'] or organisms
 
         # provide genesymbols: yes or no
@@ -1975,12 +2032,12 @@ def intercell(self, req):
                     (topology & {'secreted', 'sec'}, 'secreted'),
                     (
                         topology & {'plasma_membrane_peripheral', 'pmp'},
-                        'plasma_membrane_peripheral'
+                        'plasma_membrane_peripheral',
                     ),
                     (
                         topology & {'plasma_membrane_transmembrane', 'pmtm'},
-                        'plasma_membrane_transmembrane'
-                    )
+                        'plasma_membrane_transmembrane',
+                    ),
                 )
                 if enabled
             )
@@ -2076,7 +2133,7 @@ def complexes(
             proteins: list[str] | None = None,
             fields: list[str] | None = None,
             limit: int | None = None,
-        ) -> Generator[tuple]:
+    ) -> Generator[tuple]:
 
         req = locals()
         bad_req = self._check_args(req)
@@ -2146,8 +2203,8 @@ def resources(self, req):
         license = self._get_license(req)
 
         return json.dumps(
-            dict(
-                (k, v)
+            {
+                k: v
                 for k, v in self._resources_dict.items()
                 if (
                     res_ctrl.license(k).enables(license) and
@@ -2156,7 +2213,7 @@ def resources(self, req):
                         datasets & set(v['datasets'].keys())
                     )
                 )
-            )
+            },
         )
 
 
@@ -2220,7 +2277,7 @@ def _filter_by_license(
             res_col,
             simple = False,
             prefix_col = None,
-        ):
+    ):
 
         def filter_resources(res):
 
@@ -2284,14 +2341,16 @@ def filter_resources(res):
 
                 _new_prefix_col = [
 
-                    ';'.join(sorted(
-                        pref_res
-                        for pref_res in pref_ress.split(';')
-                        if (
-                            pref_res.split(':', maxsplit = 1)[0] in
-                            _res_to_keep[i]
-                        )
-                    ))
+                    ';'.join(
+                        sorted(
+                            pref_res
+                            for pref_res in pref_ress.split(';')
+                            if (
+                                pref_res.split(':', maxsplit = 1)[0] in
+                                _res_to_keep[i]
+                            )
+                        ),
+                    )
 
                         if isinstance(pref_ress, str) else
 
@@ -2375,7 +2434,7 @@ def _args_set(req, arg):
             if arg in req.args
             else set()
         )
-    
+
     def _parse_bool_arg(self, arg):
 
         if isinstance(arg, list) and arg: