-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 8 changed files with 557 additions and 37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import re | ||
import polars as pl | ||
|
||
def templateQ(qlist, subject, type):
    """Return the ``q2`` query text with ``SUBJECTIRI`` replaced by *subject*.

    Args:
        qlist: Mapping of query keys to query text; only ``qlist['q2']`` is
            read here.
        subject: String substituted for every ``SUBJECTIRI`` occurrence.
        type: Currently unused; the template is meant to be selected from it
            later.

    Returns:
        The filled-in query string.
    """
    # TODO: choose the template from ``type`` instead of always using 'q2'.
    return qlist['q2'].replace('SUBJECTIRI', subject)
|
||
def qrSelects(qlist, t):
    """Return the variable names projected by the ``q2`` SELECT clause.

    The text between the first ``DISTINCT`` and the following ``WHERE`` in
    ``qlist['q2']`` is scanned for ``?name`` tokens. Names are returned
    without the leading ``?``, in first-seen order, without duplicates, and
    with the variable ``s`` removed.

    Args:
        qlist: Mapping of query keys to query text; only ``qlist['q2']`` is
            read here.
        t: Currently unused; the template is meant to be selected from it
            later.

    Returns:
        A list of unique variable names (strings).
    """
    template = qlist['q2']  # TODO: pick the template based on ``t``

    start_word = 'DISTINCT'
    end_word = 'WHERE'

    # str.find returns -1 for a missing keyword; using that value directly as
    # a slice bound silently yields the wrong span, so fall back to the start
    # or end of the template instead.
    start_pos = template.find(start_word)
    start_index = start_pos + len(start_word) if start_pos != -1 else 0
    end_index = template.find(end_word, start_index)
    if end_index == -1:
        end_index = len(template)

    # The capture group drops the leading '?' from every match.
    names = re.findall(r'\?(\w+)', template[start_index:end_index])

    # dict.fromkeys de-duplicates while keeping first-seen order. Filtering
    # (rather than list.remove) drops every occurrence of 's', not just the
    # first one.
    return [name for name in dict.fromkeys(names) if name != 's']
|
||
# Notes: | ||
# df comes in at 1893 | ||
def dataset_list(df, store, qlist):
    """Query the store once per row of *df* and collect the bound values.

    For every row, the ``id`` and ``type`` columns are read, the query built
    by ``templateQ`` is run against *store*, and each selected variable that
    is bound to a non-empty value is recorded. When a query yields several
    solutions, later solutions overwrite earlier ones for the same variable.
    Rows that produce no values are skipped.

    Args:
        df: Frame with ``id`` and ``type`` columns.
        store: Object exposing ``query(sparql)``, iterated for solutions that
            are indexed by variable name.
        qlist: Mapping of query keys to query text, passed through to
            ``qrSelects`` and ``templateQ``.

    Returns:
        A polars DataFrame built from the collected records, with the
        selected variable names as its schema.
    """
    sl = qrSelects(qlist, "foo")  # get vars for query of type "t"
    print(sl)

    dl = []
    # Walk the two columns in lockstep instead of slicing one-row frames.
    for s, t in zip(df['id'], df['type']):
        sl = qrSelects(qlist, t)  # get vars for query of type "t"
        # The solutions are only iterated once, so no intermediate list.
        qr = store.query(templateQ(qlist, s, t))

        d = {}
        for r in qr:
            for term in sl:
                bound = r[term]
                # Skip unbound variables and empty values.
                if bound is not None and bound.value != '':
                    d[term] = bound.value

        print(d)
        if d:
            dl.append(d)

    print("Length of dataset query results: {}".format(len(dl)))

    result = pl.from_records(dl, schema=sl)

    print(len(result))
    print(result)

    # The caller assigns this function's result, so hand the frame back
    # instead of implicitly returning None.
    return result
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
import argparse | ||
import gc | ||
import re | ||
import sys | ||
import io | ||
import warnings | ||
import datetime | ||
from functools import reduce | ||
|
||
import kglab | ||
import numpy as np | ||
import pandas as pd | ||
from dateutil import parser | ||
from rdflib import ConjunctiveGraph # needed for quads | ||
import polars as pl | ||
from tqdm import tqdm | ||
from pyoxigraph import * | ||
|
||
from defs import graphshapers | ||
from defs import load_queries | ||
from defs import readSource | ||
from defs import polar_calls | ||
from defs import regionFor | ||
from defs import spatial | ||
from defs import saveobject | ||
|
||
# Silence FutureWarning noise (raised by pandas) for the whole run.
warnings.simplefilter("ignore", category=FutureWarning)
|
||
|
||
def main():
    """Parse the command line, read the source RDF and run the processor.

    ``--source`` and ``--output`` are both mandatory; if either is missing an
    error is printed and the process exits with status 1. The output path is
    validated but not used yet because the save step is still disabled.
    """
    cli = argparse.ArgumentParser(description="Process some arguments.")
    cli.add_argument("--source", type=str, help="Source URL")
    cli.add_argument("--output", type=str, help="Output file")
    args = cli.parse_args()

    source_url, output_path = args.source, args.output

    if source_url is None:
        print("Error: the --source argument is required")
        sys.exit(1)

    if output_path is None:
        print("Error: the --output argument is required")
        sys.exit(1)

    # Load graph
    print("RDF download started", datetime.datetime.now())
    dg = readSource.read_data(source_url)
    print("RDF downloaded, starting load stage", datetime.datetime.now())

    mf = graphProcessor(dg)

    # Disabled for now: reporting and saving of the resulting frame.
    # # Reporting
    # print("Reporting Stage: The following is the current dataframe shape to exported")
    # print(mf.info())
    #
    # # Save
    # saveobject.write_data(output_path, mf)
|
||
|
||
def graphProcessor(dg):
    """Load the graph into a pyoxigraph store and run the subject/type query.

    Steps: align the graph context, load the resulting text into an in-memory
    ``Store`` as N-Quads, read the SPARQL files, run ``q1`` to get every
    subject ``id`` with its ``type``, put those pairs into a polars frame and
    pass it to ``polar_calls.dataset_list``.

    Args:
        dg: Graph data as returned by ``readSource.read_data``; it is passed
            unchanged to ``graphshapers.contextAlignment``.

    Returns:
        Always ``0``.
    """
    # NOTE(review): contextAlignment is assumed to return N-Quads text, since
    # its result is wrapped in StringIO and loaded with that MIME type.
    nquads = graphshapers.contextAlignment(dg)

    # This used to print "RDF loaded, starting query stage" before the store
    # had been loaded, duplicating the message printed after the load below.
    print("RDF context aligned, starting store load", datetime.datetime.now())

    store = Store()
    mime_type = "application/n-quads"
    store.load(io.StringIO(nquads), mime_type, base_iri=None, to_graph=None)
    print("RDF loaded, starting query stage", datetime.datetime.now())

    # Load Queries
    sfl = [
        "./queries/subjectsTypes.rq",  # q1
        "./queries/template_dataset.rq",  # q2
        "./queries/baseQuery.rq",
        "./queries/course.rq",
        "./queries/dataset.rq",
        "./queries/person.rq",
        "./queries/sup_geo.rq",
        "./queries/sup_temporal.rq",
    ]

    qlist = load_queries.read_files(sfl)

    # conduct initial query for types and associated subject IRIs
    qr = list(store.query(qlist['q1']))

    print("Length of SPARQL query results: {}".format(len(qr)))

    qrl = [[sol['id'].value, sol['type'].value] for sol in qr]

    # Each inner list is one row. State that explicitly: inferring the
    # orientation from dimensions is ambiguous when there are exactly as many
    # rows as columns (two).
    df = pl.from_records(qrl, schema=["id", "type"], orient="row")
    print("Length of Polars data frame: {}".format(len(df)))

    # The result of dataset_list is not consumed yet.
    polar_calls.dataset_list(df, store, qlist)

    return 0
|
||
|
||
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Oops, something went wrong.