Skip to content

Commit

Permalink
freezing development
Browse files Browse the repository at this point in the history
  • Loading branch information
costero-e committed Feb 16, 2023
1 parent 81b37ad commit 60e05cb
Show file tree
Hide file tree
Showing 6 changed files with 310 additions and 32 deletions.
4 changes: 2 additions & 2 deletions beacon/db/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def apply_alphanumeric_filter(query: dict, filter: AlphanumericFilter, collectio
dict_text['$or'].append(dict_filter)
query['$and'].append(dict_text)
query['$and'].append(dict_text_2)
if collection == 'runs':
elif collection == 'runs':
query['$and']=[]
dict_text={}
dict_text['$or']=[]
Expand Down Expand Up @@ -410,7 +410,7 @@ def apply_alphanumeric_filter(query: dict, filter: AlphanumericFilter, collectio
query['$and'].append(dict_text_2)
else:
query['measurementValue.quantity.value'] = { formatted_operator: float(formatted_value) }
query['assayCode.label']=filter.id
query['assayCode.id']=filter.id
LOG.debug(query)
dict_elemmatch={}
dict_elemmatch['$elemMatch']=query
Expand Down
25 changes: 0 additions & 25 deletions beacon/obo.py

This file was deleted.

61 changes: 61 additions & 0 deletions beacon/owl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Scratch/exploration script (development freeze): loads a local .owl ontology
# with owlready2 and prints the ancestor list of every class. The commented-out
# sections sketch the intended end goal — writing a tab-separated dictionary
# file (label, new LABEL, ID, synonyms) for classes under chosen ancestors.
#This version simply retrieves all classes for a given ancestor from an .owl ontology provided locally, creating a .txt dictionary

import owlready2

#input to provide

#wanted_classes_and_new_labels = {'Abnormal Cell' : 'Cell', 'Tissue Culture' : 'ResearchTechnique' , 'Cellular Process': 'AssociatedBiologicalProcess'}
#new_dictionary = 'name.txt'


# load ontology
from owlready2 import *


#lists
# NOTE(review): hard-coded personal path — only runs on the author's machine;
# parameterise before reuse.
ontology = '/Users/oriol/Desktop/ncit.owl'
onto = get_ontology(ontology).load()
# namespace / label_list / ancestor_list are unused by the active code below;
# presumably kept for the commented-out experiments.
namespace = onto.get_namespace(ontology)
class_list = list(onto.classes())
label_list = []
ancestor_list = []
relevant_terms = []

#get all classes (if needed)
'''
for c in class_list:
label_list.append(c.label)
'''

#get all classes from wanted ancestors
# Active part: dumps the full ancestor list of every class (very verbose).
for c in class_list:
    ancestors = list(c.ancestors())
    print(ancestors)
# Disabled: filter classes whose ancestor label is a key of
# wanted_classes_and_new_labels and append them to the dictionary file.
'''
for a in ancestors:
x = str(a.label)
for term, label in wanted_classes_and_new_labels.items():
if term == x[2:-2] and c not in relevant_terms:
relevant_terms.append(c)
with open(new_dictionary, 'a') as n:
n.write(c.label[0] + '\t' + 'LABEL=' + label + '\t' + 'ID=' + str(c) + '\n')
'''

# Disabled: also emit each relevant term's exact synonyms, pointing back to
# the preferred label.
'''
#add synonyms and their preferred label
for c in relevant_terms:
ancestors = list(c.ancestors())
for a in ancestors:
x = str(a.label)
for term, label in new_labels.items():
if term == x[2:-2]:
try:
s = list(onto.search_one(label=c.label).hasExactSynonym)
print(s)
for term in s:
with open(new_dictionary, 'a') as n:
n.write(term + '\t' + 'LABEL=' + label + '\t' + 'ID=' + str(c) + '\t' + 'PrefSynonym=' + c.label[0] + '\n')
except:
print('no synonym')
continue
'''
12 changes: 7 additions & 5 deletions deploy/extract_filtering_terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from owlready2 import OwlReadyOntologyParsingError
from tqdm import tqdm

ONTOLOGY_REGEX = re.compile(r"([_A-Za-z]+):(\w+)")
ONTOLOGY_REGEX = re.compile(r"([_A-Za-z]+):([_A-Za-z0-9^\-]+)")

client = MongoClient(
"mongodb://127.0.0.1:27017/"
Expand Down Expand Up @@ -199,9 +199,11 @@ def insert_all_alphanumeric_terms_used():
for c_name in collections:
terms = find_alphanumeric_terms_used(c_name)
print(terms)
if len(terms) > 0:
client.beacon.filtering_terms.insert_many(terms)
#if len(terms) > 0:
#client.beacon.filtering_terms.insert_many(terms)


insert_all_ontology_terms_used()
insert_all_alphanumeric_terms_used()
#insert_all_ontology_terms_used()
#insert_all_alphanumeric_terms_used()
terms=find_ontology_terms_used("individuals")
print(terms)
200 changes: 200 additions & 0 deletions deploy/extract_filtering_terms_oriol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import os.path
import urllib.request
from typing import List, Dict, Optional
import re
from urllib.error import HTTPError

import requests
import owlready2
from pymongo.mongo_client import MongoClient
import progressbar
from bson.objectid import ObjectId
from owlready2 import OwlReadyOntologyParsingError
from tqdm import tqdm
import obonet

# Matches ontology CURIEs such as "NCIT:C4872": an alphabetic/underscore
# prefix, a colon, then the local term id.
# NOTE(review): inside the second character class the '^' is a *literal*
# caret (it is not the first character, so it does not negate) — confirm
# term ids are really expected to contain '^'; this was widened from r"(\w+)".
ONTOLOGY_REGEX = re.compile(r"([_A-Za-z]+):([_A-Za-z0-9^\-]+)")

# Shared MongoDB client for the whole script; assumes a beacon instance on
# the default local port.
client = MongoClient(
    "mongodb://127.0.0.1:27017/"
)

class MyProgressBar:
    """Progress hook for ``urllib.request.urlretrieve`` downloads.

    Lazily creates a ``progressbar.ProgressBar`` sized to the download on
    the first callback, advances it per received chunk, and finishes it
    once the full size has been reported.
    """

    def __init__(self):
        # Built on the first __call__, when total_size becomes known.
        self.pbar = None

    def __call__(self, block_num: int, block_size: int, total_size: int):
        if self.pbar is None:
            bar = progressbar.ProgressBar(maxval=total_size)
            bar.start()
            self.pbar = bar

        received = block_num * block_size
        if received >= total_size:
            self.pbar.finish()
        else:
            self.pbar.update(received)


def insert_all_ontology_terms_used():
    """Scan every data collection for ontology CURIEs and insert the
    resulting filtering-term records into ``beacon.filtering_terms``
    (which is itself excluded from the scan)."""
    names = [c for c in client.beacon.list_collection_names()
             if c != 'filtering_terms']
    print("Collections:", names)
    for name in names:
        found = find_ontology_terms_used(name)
        if found:
            client.beacon.filtering_terms.insert_many(found)


def get_ontology_name(ontology: str) -> Optional[str]:
    """Read the human-readable ontology name from its cached .obo file.

    Opens ``ontologies/<ontology>.obo`` and returns the first ``remark``
    header entry, trimmed at the first '.' when one is present. Returns
    ``None`` when the file is missing or unreadable.

    NOTE(review): when the remark contains no '.', the whole remark *list*
    is returned instead of a string — confirm downstream consumers of the
    'type' field expect that.
    """
    path = "ontologies/{}.obo".format(ontology)
    try:
        graph = obonet.read_obo(path)
        name = graph.graph['remark']
        if '.' in name[0]:
            list_name = name[0].split('.')
            name = list_name[0]
        return name
    except Exception:
        # Missing/malformed file: treat as "name unknown" instead of
        # aborting the whole extraction (was a bare `except:`, which also
        # hid programming errors).
        return None

def load_ontology(ontology_id: str) -> Optional[str]:
    """Ensure the .obo file for *ontology_id* is cached locally.

    Downloads ``http://purl.obolibrary.org/obo/<id>.obo`` into
    ``ontologies/`` the first time the prefix is seen. Returns the
    ontology id on success — the cache key later passed to
    get_ontology_name()/get_ontology_term_label() — or ``None`` when the
    id is not purely alphabetic (regex artefact) or the download fails.

    (Fixed: the annotation previously claimed an owlready2.Ontology was
    returned, and the error paths printed the HTTPError *class* instead
    of the caught exception.)
    """
    if not ontology_id.isalpha():
        return None
    url = "http://purl.obolibrary.org/obo/{}.obo".format(ontology_id.lower())
    path = "ontologies/{}.obo".format(ontology_id)
    try:
        if not os.path.exists(path):
            urllib.request.urlretrieve(url, path, MyProgressBar())
        return ontology_id
    except HTTPError as err:
        # TODO: Handle error properly (record the missing ontology?).
        print("ERROR", err)
        return None
    except ValueError as err:
        # e.g. malformed URL built from an odd prefix.
        print("ERROR", err)
        return None


def get_ontology_term_label(ontology: str, term: str) -> Optional[str]:
    """Resolve the human-readable name of the CURIE ``<ontology>:<term>``.

    Reads the cached ``ontologies/<ontology>.obo`` graph and looks the
    CURIE up in its node table. Returns ``None`` when the file is
    missing, cannot be parsed, or does not contain the term.
    """
    # Must match the path load_ontology() caches to. The previous
    # ``ontology.lower()`` looked for e.g. ontologies/ncit.obo while the
    # file is saved as ontologies/NCIT.obo — that only worked on
    # case-insensitive filesystems (macOS), not on a Linux deployment.
    path = "ontologies/{}.obo".format(ontology)
    try:
        graph = obonet.read_obo(path)
        id_to_name = {id_: data.get('name') for id_, data in graph.nodes(data=True)}
        return id_to_name['{}:{}'.format(ontology, term)]
    except Exception:
        # Missing file or unknown term: no label (was a bare `except:`).
        return None


def get_ontology_term_count(collection_name: str, term: str) -> int:
    """Count documents of *collection_name* whose text index matches
    *term* as an exact quoted phrase."""
    phrase_query = {'$text': {'$search': '"{}"'.format(term)}}
    collection = client.beacon.get_collection(collection_name)
    return collection.count_documents(phrase_query)


def find_ontology_terms_used(collection_name: str) -> List[Dict]:
    """Build one filtering-term record per distinct ontology CURIE found
    anywhere in the documents of *collection_name*.

    Each ontology prefix is downloaded/cached at most once via
    load_ontology(); CURIEs whose ontology cannot be loaded are skipped.
    """
    records = []
    seen_curies = set()
    loaded = {}  # ontology prefix -> cached-ontology id string, or None
    coll = client.beacon.get_collection(collection_name)
    total = coll.estimated_document_count()
    for document in tqdm(coll.find(), total=total):
        # Regex over the stringified document finds CURIEs at any depth.
        for prefix, local_id in ONTOLOGY_REGEX.findall(str(document)):
            curie = ':'.join([prefix, local_id])
            print(curie, prefix)
            if curie in seen_curies:
                continue
            seen_curies.add(curie)
            if prefix not in loaded:
                loaded[prefix] = load_ontology(prefix)
            if loaded[prefix] is not None:
                records.append({
                    'type': get_ontology_name(loaded[prefix]),
                    'id': curie,
                    'label': get_ontology_term_label(prefix, local_id),
                    # TODO: Use conf.py -> beaconGranularity to not disclose
                    # counts in the filtering terms
                    'count': get_ontology_term_count(collection_name, curie),
                    'collection': collection_name,
                })
    return records


def get_alphanumeric_term_count(collection_name: str, key: str) -> int:
    """Number of distinct values stored under *key* across the documents
    of *collection_name*."""
    distinct_values = client.beacon.get_collection(collection_name).distinct(key)
    return len(distinct_values)


def get_properties_of_document(document, prefix="") -> List[str]:
    """Recursively collect the dotted key paths of every scalar field in
    a (possibly nested) Mongo document.

    List elements share their parent's prefix (lists are flattened);
    ObjectId values are skipped. Returns paths such as
    ``['id', 'measures.assayCode.label']``.
    """
    properties = []
    if document is None or isinstance(document, (str, int, float)):
        # Scalar reached directly (e.g. a list element): nothing to name here.
        return []
    elif isinstance(document, list):
        for elem in document:
            properties += get_properties_of_document(elem, prefix)
    elif isinstance(document, dict):
        for key, value in document.items():
            path = prefix + '.' + key if prefix else key
            if isinstance(value, ObjectId):
                # Internal Mongo ids are not filtering terms.
                continue
            elif value is None or isinstance(value, (int, float, str)):
                # BUGFIX: float values previously fell through to the
                # unknown-type branch below and aborted the whole scan.
                properties.append(path)
            elif isinstance(value, (list, dict)):
                properties += get_properties_of_document(value, path)
            else:
                print('Unknown type:', value, ' (', type(value), ')')
                # NOTE(review): exits with *success* status and kills the
                # whole process — consider raising TypeError instead.
                exit(0)
    else:
        print('Unknown type2:', document, ' (', type(document), ')')
        exit(0)
    return properties


def find_alphanumeric_terms_used(collection_name: str) -> List[Dict]:
    """Build one 'alphanumeric' filtering-term record per distinct dotted
    field path found across the documents of *collection_name*."""
    records = []
    seen_paths = set()
    coll = client.beacon.get_collection(collection_name)
    total = coll.estimated_document_count()
    for document in tqdm(coll.find(), total=total):
        for path in get_properties_of_document(document):
            if path in seen_paths:
                continue
            seen_paths.add(path)
            records.append({
                'type': 'alphanumeric',
                'id': path,
                'count': get_alphanumeric_term_count(collection_name, path),
                'collection': collection_name,
            })
    return records


def insert_all_alphanumeric_terms_used():
    """Report the alphanumeric filtering terms of every data collection.

    Insertion into 'filtering_terms' is currently disabled (development
    freeze); the discovered terms are only printed for inspection.
    """
    names = [c for c in client.beacon.list_collection_names()
             if c != 'filtering_terms']
    print("Collections:", names)
    for name in names:
        found = find_alphanumeric_terms_used(name)
        print(found)
        # Disabled during the development freeze:
        # if found:
        #     client.beacon.filtering_terms.insert_many(found)


# Frozen-development entry point: the full extraction passes are commented
# out; only the 'analyses' alphanumeric scan runs, for inspection.
#insert_all_ontology_terms_used()
#insert_all_alphanumeric_terms_used()
#terms=find_ontology_terms_used("individuals")
#print(terms)
#hola = get_ontology_term_label('NCIT','C173381')
#print(hola)
if __name__ == "__main__":
    # Guard added so importing this module no longer triggers a full
    # collection scan as a side effect.
    find_alphanumeric_terms_used('analyses')

40 changes: 40 additions & 0 deletions deploy/obo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Scratch/exploration script (development freeze): parse the NCIT .obo file
# with obonet and resolve a term id to its human-readable name. The
# commented-out snippets record earlier experiments with the graph metadata.
# NOTE(review): hard-coded personal path — only runs on the author's machine.
import networkx
import obonet

url = '/Users/oriol/Desktop/beacon_canada/deploy/ontologies/ncit.obo'
graph = obonet.read_obo(url)

# Number of nodes
#len(graph)

#print(graph.graph)
# Disabled: extract data-version and remark (ontology name) from the header.
'''
data_version = graph.graph['data-version']
if '/' in data_version:
list_data_version = data_version.split('/')
data_version = list_data_version[1]
print(data_version)
name = graph.graph['remark']
if '.' in name[0]:
list_name = name[0].split('.')
name = list_name[0]
print(name)
'''
# Number of edges
#graph.number_of_edges()

#networkx.is_directed_acyclic_graph(graph)

# Mapping from term ID to name

# Active part: build the CURIE -> name table and spot-check one NCIT term.
id_to_name = {id_: data.get('name') for id_, data in graph.nodes(data=True)}
print(id_to_name['NCIT:C173381'])
#for id_, data in graph.nodes(data=True):
#print (id_)
#print(data.get('name'))
#print(id_to_name['NCIT:C143048'] )

#descendants = networkx.ancestors(graph, 'NCIT:C173381')

#print(descendants)

0 comments on commit 60e05cb

Please sign in to comment.