Skip to content

Commit

Permalink
freezing development
Browse files Browse the repository at this point in the history
  • Loading branch information
costero-e committed Feb 16, 2023
1 parent 81b37ad commit 60e05cb
Show file tree
Hide file tree
Showing 6 changed files with 310 additions and 32 deletions.
4 changes: 2 additions & 2 deletions beacon/db/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def apply_alphanumeric_filter(query: dict, filter: AlphanumericFilter, collectio
dict_text['$or'].append(dict_filter)
query['$and'].append(dict_text)
query['$and'].append(dict_text_2)
if collection == 'runs':
elif collection == 'runs':
query['$and']=[]
dict_text={}
dict_text['$or']=[]
Expand Down Expand Up @@ -410,7 +410,7 @@ def apply_alphanumeric_filter(query: dict, filter: AlphanumericFilter, collectio
query['$and'].append(dict_text_2)
else:
query['measurementValue.quantity.value'] = { formatted_operator: float(formatted_value) }
query['assayCode.label']=filter.id
query['assayCode.id']=filter.id
LOG.debug(query)
dict_elemmatch={}
dict_elemmatch['$elemMatch']=query
Expand Down
25 changes: 0 additions & 25 deletions beacon/obo.py

This file was deleted.

61 changes: 61 additions & 0 deletions beacon/owl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Scratch/exploration script (development freeze): loads a local .owl ontology
# with owlready2 and prints the ancestor list of every class. The commented-out
# sections sketch the intended end goal — writing a tab-separated dictionary
# file (label, new LABEL, ID, synonyms) for classes under chosen ancestors.
#This version simply retrieves all classes for a given ancestor from an .owl ontology provided locally, creating a .txt dictionary

import owlready2

#input to provide

#wanted_classes_and_new_labels = {'Abnormal Cell' : 'Cell', 'Tissue Culture' : 'ResearchTechnique' , 'Cellular Process': 'AssociatedBiologicalProcess'}
#new_dictionary = 'name.txt'


# load ontology
from owlready2 import *


#lists
# NOTE(review): hard-coded personal path — only runs on the author's machine;
# parameterise before reuse.
ontology = '/Users/oriol/Desktop/ncit.owl'
onto = get_ontology(ontology).load()
# namespace / label_list / ancestor_list are unused by the active code below;
# presumably kept for the commented-out experiments.
namespace = onto.get_namespace(ontology)
class_list = list(onto.classes())
label_list = []
ancestor_list = []
relevant_terms = []

#get all classes (if needed)
'''
for c in class_list:
label_list.append(c.label)
'''

#get all classes from wanted ancestors
# Active part: dumps the full ancestor list of every class (very verbose).
for c in class_list:
    ancestors = list(c.ancestors())
    print(ancestors)
# Disabled: filter classes whose ancestor label is a key of
# wanted_classes_and_new_labels and append them to the dictionary file.
'''
for a in ancestors:
x = str(a.label)
for term, label in wanted_classes_and_new_labels.items():
if term == x[2:-2] and c not in relevant_terms:
relevant_terms.append(c)
with open(new_dictionary, 'a') as n:
n.write(c.label[0] + '\t' + 'LABEL=' + label + '\t' + 'ID=' + str(c) + '\n')
'''

# Disabled: also emit each relevant term's exact synonyms, pointing back to
# the preferred label.
'''
#add synonyms and their preferred label
for c in relevant_terms:
ancestors = list(c.ancestors())
for a in ancestors:
x = str(a.label)
for term, label in new_labels.items():
if term == x[2:-2]:
try:
s = list(onto.search_one(label=c.label).hasExactSynonym)
print(s)
for term in s:
with open(new_dictionary, 'a') as n:
n.write(term + '\t' + 'LABEL=' + label + '\t' + 'ID=' + str(c) + '\t' + 'PrefSynonym=' + c.label[0] + '\n')
except:
print('no synonym')
continue
'''
12 changes: 7 additions & 5 deletions deploy/extract_filtering_terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from owlready2 import OwlReadyOntologyParsingError
from tqdm import tqdm

ONTOLOGY_REGEX = re.compile(r"([_A-Za-z]+):(\w+)")
ONTOLOGY_REGEX = re.compile(r"([_A-Za-z]+):([_A-Za-z0-9^\-]+)")

client = MongoClient(
"mongodb://127.0.0.1:27017/"
Expand Down Expand Up @@ -199,9 +199,11 @@ def insert_all_alphanumeric_terms_used():
for c_name in collections:
terms = find_alphanumeric_terms_used(c_name)
print(terms)
if len(terms) > 0:
client.beacon.filtering_terms.insert_many(terms)
#if len(terms) > 0:
#client.beacon.filtering_terms.insert_many(terms)


insert_all_ontology_terms_used()
insert_all_alphanumeric_terms_used()
#insert_all_ontology_terms_used()
#insert_all_alphanumeric_terms_used()
terms=find_ontology_terms_used("individuals")
print(terms)
200 changes: 200 additions & 0 deletions deploy/extract_filtering_terms_oriol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import os.path
import urllib.request
from typing import List, Dict, Optional
import re
from urllib.error import HTTPError

import requests
import owlready2
from pymongo.mongo_client import MongoClient
import progressbar
from bson.objectid import ObjectId
from owlready2 import OwlReadyOntologyParsingError
from tqdm import tqdm
import obonet

# Matches ontology CURIEs such as "NCIT:C4872": an alphabetic/underscore
# prefix, a colon, then the local term id.
# NOTE(review): inside the second character class the '^' is a *literal*
# caret (it is not the first character, so it does not negate) — confirm
# term ids are really expected to contain '^'; this was widened from r"(\w+)".
ONTOLOGY_REGEX = re.compile(r"([_A-Za-z]+):([_A-Za-z0-9^\-]+)")

# Shared MongoDB client for the whole script; assumes a beacon instance on
# the default local port.
client = MongoClient(
    "mongodb://127.0.0.1:27017/"
)

class MyProgressBar:
    """Progress hook for ``urllib.request.urlretrieve`` downloads.

    Lazily creates a ``progressbar.ProgressBar`` sized to the download on
    the first callback, advances it per received chunk, and finishes it
    once the full size has been reported.
    """

    def __init__(self):
        # Built on the first __call__, when total_size becomes known.
        self.pbar = None

    def __call__(self, block_num: int, block_size: int, total_size: int):
        if self.pbar is None:
            bar = progressbar.ProgressBar(maxval=total_size)
            bar.start()
            self.pbar = bar

        received = block_num * block_size
        if received >= total_size:
            self.pbar.finish()
        else:
            self.pbar.update(received)


def insert_all_ontology_terms_used():
    """Scan every data collection for ontology CURIEs and insert the
    resulting filtering-term records into ``beacon.filtering_terms``
    (which is itself excluded from the scan)."""
    names = [c for c in client.beacon.list_collection_names()
             if c != 'filtering_terms']
    print("Collections:", names)
    for name in names:
        found = find_ontology_terms_used(name)
        if found:
            client.beacon.filtering_terms.insert_many(found)


def get_ontology_name(ontology: str) -> Optional[str]:
    """Read the human-readable ontology name from its cached .obo file.

    Opens ``ontologies/<ontology>.obo`` and returns the first ``remark``
    header entry, trimmed at the first '.' when one is present. Returns
    ``None`` when the file is missing or unreadable.

    NOTE(review): when the remark contains no '.', the whole remark *list*
    is returned instead of a string — confirm downstream consumers of the
    'type' field expect that.
    """
    path = "ontologies/{}.obo".format(ontology)
    try:
        graph = obonet.read_obo(path)
        name = graph.graph['remark']
        if '.' in name[0]:
            list_name = name[0].split('.')
            name = list_name[0]
        return name
    except Exception:
        # Missing/malformed file: treat as "name unknown" instead of
        # aborting the whole extraction (was a bare `except:`, which also
        # hid programming errors).
        return None

def load_ontology(ontology_id: str) -> Optional[str]:
    """Ensure the .obo file for *ontology_id* is cached locally.

    Downloads ``http://purl.obolibrary.org/obo/<id>.obo`` into
    ``ontologies/`` the first time the prefix is seen. Returns the
    ontology id on success — the cache key later passed to
    get_ontology_name()/get_ontology_term_label() — or ``None`` when the
    id is not purely alphabetic (regex artefact) or the download fails.

    (Fixed: the annotation previously claimed an owlready2.Ontology was
    returned, and the error paths printed the HTTPError *class* instead
    of the caught exception.)
    """
    if not ontology_id.isalpha():
        return None
    url = "http://purl.obolibrary.org/obo/{}.obo".format(ontology_id.lower())
    path = "ontologies/{}.obo".format(ontology_id)
    try:
        if not os.path.exists(path):
            urllib.request.urlretrieve(url, path, MyProgressBar())
        return ontology_id
    except HTTPError as err:
        # TODO: Handle error properly (record the missing ontology?).
        print("ERROR", err)
        return None
    except ValueError as err:
        # e.g. malformed URL built from an odd prefix.
        print("ERROR", err)
        return None


def get_ontology_term_label(ontology: str, term: str) -> Optional[str]:
    """Resolve the human-readable name of the CURIE ``<ontology>:<term>``.

    Reads the cached ``ontologies/<ontology>.obo`` graph and looks the
    CURIE up in its node table. Returns ``None`` when the file is
    missing, cannot be parsed, or does not contain the term.
    """
    # Must match the path load_ontology() caches to. The previous
    # ``ontology.lower()`` looked for e.g. ontologies/ncit.obo while the
    # file is saved as ontologies/NCIT.obo — that only worked on
    # case-insensitive filesystems (macOS), not on a Linux deployment.
    path = "ontologies/{}.obo".format(ontology)
    try:
        graph = obonet.read_obo(path)
        id_to_name = {id_: data.get('name') for id_, data in graph.nodes(data=True)}
        return id_to_name['{}:{}'.format(ontology, term)]
    except Exception:
        # Missing file or unknown term: no label (was a bare `except:`).
        return None


def get_ontology_term_count(collection_name: str, term: str) -> int:
    """Count documents of *collection_name* whose text index matches
    *term* as an exact quoted phrase."""
    phrase_query = {'$text': {'$search': '"{}"'.format(term)}}
    collection = client.beacon.get_collection(collection_name)
    return collection.count_documents(phrase_query)


def find_ontology_terms_used(collection_name: str) -> List[Dict]:
    """Build one filtering-term record per distinct ontology CURIE found
    anywhere in the documents of *collection_name*.

    Each ontology prefix is downloaded/cached at most once via
    load_ontology(); CURIEs whose ontology cannot be loaded are skipped.
    """
    records = []
    seen_curies = set()
    loaded = {}  # ontology prefix -> cached-ontology id string, or None
    coll = client.beacon.get_collection(collection_name)
    total = coll.estimated_document_count()
    for document in tqdm(coll.find(), total=total):
        # Regex over the stringified document finds CURIEs at any depth.
        for prefix, local_id in ONTOLOGY_REGEX.findall(str(document)):
            curie = ':'.join([prefix, local_id])
            print(curie, prefix)
            if curie in seen_curies:
                continue
            seen_curies.add(curie)
            if prefix not in loaded:
                loaded[prefix] = load_ontology(prefix)
            if loaded[prefix] is not None:
                records.append({
                    'type': get_ontology_name(loaded[prefix]),
                    'id': curie,
                    'label': get_ontology_term_label(prefix, local_id),
                    # TODO: Use conf.py -> beaconGranularity to not disclose
                    # counts in the filtering terms
                    'count': get_ontology_term_count(collection_name, curie),
                    'collection': collection_name,
                })
    return records


def get_alphanumeric_term_count(collection_name: str, key: str) -> int:
    """Number of distinct values stored under *key* across the documents
    of *collection_name*."""
    distinct_values = client.beacon.get_collection(collection_name).distinct(key)
    return len(distinct_values)


def get_properties_of_document(document, prefix="") -> List[str]:
    """Recursively collect the dotted key paths of every scalar field in
    a (possibly nested) Mongo document.

    List elements share their parent's prefix (lists are flattened);
    ObjectId values are skipped. Returns paths such as
    ``['id', 'measures.assayCode.label']``.
    """
    properties = []
    if document is None or isinstance(document, (str, int, float)):
        # Scalar reached directly (e.g. a list element): nothing to name here.
        return []
    elif isinstance(document, list):
        for elem in document:
            properties += get_properties_of_document(elem, prefix)
    elif isinstance(document, dict):
        for key, value in document.items():
            path = prefix + '.' + key if prefix else key
            if isinstance(value, ObjectId):
                # Internal Mongo ids are not filtering terms.
                continue
            elif value is None or isinstance(value, (int, float, str)):
                # BUGFIX: float values previously fell through to the
                # unknown-type branch below and aborted the whole scan.
                properties.append(path)
            elif isinstance(value, (list, dict)):
                properties += get_properties_of_document(value, path)
            else:
                print('Unknown type:', value, ' (', type(value), ')')
                # NOTE(review): exits with *success* status and kills the
                # whole process — consider raising TypeError instead.
                exit(0)
    else:
        print('Unknown type2:', document, ' (', type(document), ')')
        exit(0)
    return properties


def find_alphanumeric_terms_used(collection_name: str) -> List[Dict]:
    """Build one 'alphanumeric' filtering-term record per distinct dotted
    field path found across the documents of *collection_name*."""
    records = []
    seen_paths = set()
    coll = client.beacon.get_collection(collection_name)
    total = coll.estimated_document_count()
    for document in tqdm(coll.find(), total=total):
        for path in get_properties_of_document(document):
            if path in seen_paths:
                continue
            seen_paths.add(path)
            records.append({
                'type': 'alphanumeric',
                'id': path,
                'count': get_alphanumeric_term_count(collection_name, path),
                'collection': collection_name,
            })
    return records


def insert_all_alphanumeric_terms_used():
    """Report the alphanumeric filtering terms of every data collection.

    Insertion into 'filtering_terms' is currently disabled (development
    freeze); the discovered terms are only printed for inspection.
    """
    names = [c for c in client.beacon.list_collection_names()
             if c != 'filtering_terms']
    print("Collections:", names)
    for name in names:
        found = find_alphanumeric_terms_used(name)
        print(found)
        # Disabled during the development freeze:
        # if found:
        #     client.beacon.filtering_terms.insert_many(found)


# Frozen-development entry point: the full extraction passes are commented
# out; only the 'analyses' alphanumeric scan runs, for inspection.
#insert_all_ontology_terms_used()
#insert_all_alphanumeric_terms_used()
#terms=find_ontology_terms_used("individuals")
#print(terms)
#hola = get_ontology_term_label('NCIT','C173381')
#print(hola)
if __name__ == "__main__":
    # Guard added so importing this module no longer triggers a full
    # collection scan as a side effect.
    find_alphanumeric_terms_used('analyses')

40 changes: 40 additions & 0 deletions deploy/obo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Scratch/exploration script (development freeze): parse the NCIT .obo file
# with obonet and resolve a term id to its human-readable name. The
# commented-out snippets record earlier experiments with the graph metadata.
# NOTE(review): hard-coded personal path — only runs on the author's machine.
import networkx
import obonet

url = '/Users/oriol/Desktop/beacon_canada/deploy/ontologies/ncit.obo'
graph = obonet.read_obo(url)

# Number of nodes
#len(graph)

#print(graph.graph)
# Disabled: extract data-version and remark (ontology name) from the header.
'''
data_version = graph.graph['data-version']
if '/' in data_version:
list_data_version = data_version.split('/')
data_version = list_data_version[1]
print(data_version)
name = graph.graph['remark']
if '.' in name[0]:
list_name = name[0].split('.')
name = list_name[0]
print(name)
'''
# Number of edges
#graph.number_of_edges()

#networkx.is_directed_acyclic_graph(graph)

# Mapping from term ID to name

# Active part: build the CURIE -> name table and spot-check one NCIT term.
id_to_name = {id_: data.get('name') for id_, data in graph.nodes(data=True)}
print(id_to_name['NCIT:C173381'])
#for id_, data in graph.nodes(data=True):
#print (id_)
#print(data.get('name'))
#print(id_to_name['NCIT:C143048'] )

#descendants = networkx.ancestors(graph, 'NCIT:C173381')

#print(descendants)

0 comments on commit 60e05cb

Please sign in to comment.