-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQuery.R
49 lines (45 loc) · 1.98 KB
/
Query.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#===========
# README
#===========
# Reusable Wikidata Molecules Query For Assignment 3 MSB1015
#============================================================
# Caroline Collins 6192527
# 16 October 2019
#
# Run this code to obtain the `query_result.tsv` file of all the molecules
# in the Wikidata database with their canonical SMILES and isoSMILES.
#
# Note :
# As a prerequisite, this code needs a certain Wikidata query R package
# which you can find here: https://github.com/bearloga/WikidataQueryServiceR
#===============================================
# STEP 0 Define your main directory/directories:
#===============================================
# Both directory settings default to the directory the script is run from.
working_directory <- getwd()
# Change RESULTS.DIR to suit your working method.
RESULTS.DIR <- working_directory
DATA.DIR <- working_directory
#===================================================
# LOAD required library (it must already be installed)
#====================================================
library(WikidataQueryServiceR)
#=================
# QUERY WikiData
#=================
# Note: to alter the query, edit the SPARQL text between the single quotes ('') below.
# The following query indiscriminately extracts
# any and every compound with a (iso)SMILES entry from the database.
# (P233 corresponds to canonical SMILES, P2017 to isomeric SMILES)
# Information on identifiers and query testing
# can be found at https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/Wikidata_Query_Help/SPARQL_Editor
# SPARQL text sent to the Wikidata query service: selects every item that has
# a P233 or P2017 statement, returning each value where present.
my_query <- 'SELECT DISTINCT ?compound ?smiles ?isoSmiles WHERE {
?compound wdt:P233 | wdt:P2017 [] .
OPTIONAL { ?compound wdt:P233 ?smiles }
OPTIONAL { ?compound wdt:P2017 ?isoSmiles }
}'
# query_wikidata() is provided by the WikidataQueryServiceR package.
query_result <- WikidataQueryServiceR::query_wikidata(my_query)
#===========================
# STORE RESULT IN A TSV FILE
#===========================
# Write the query result to `query_result.tsv` as tab-separated text, without
# row names and without quoting fields.
# The file is placed in RESULTS.DIR so that changing that setting actually
# redirects the output; with the default RESULTS.DIR (the working directory)
# this is the same file as before.
write.table(
  query_result,
  file = file.path(RESULTS.DIR, "query_result.tsv"),
  sep = "\t",
  row.names = FALSE,
  quote = FALSE # spelled out: `F` is an ordinary variable and can be reassigned
)
#===============================END