Skip to content

Commit

Permalink
✨ Add synonyms support (thesaurus feature fully implemented)
Browse files Browse the repository at this point in the history
- 🔨 Move to Python 3
- ✨ Add synonyms support for WHERE's columns (thesaurus
feature fully implemented)
- 🐛 Fix value assignment in WHERE parsing
  • Loading branch information
FerreroJeremy committed Nov 25, 2017
1 parent bec3735 commit 45adf21
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 41 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
clean:
rm -rf *.json *.pyc
test:
python -m unittest test_unit
python3 -m unittest test_unit
rm -rf *.json *.pyc
106 changes: 68 additions & 38 deletions Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def join(self):

class WhereParser(Thread):

def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_keywords, sum_keywords, average_keywords, max_keywords, min_keywords, greater_keywords, less_keywords, between_keywords, negation_keywords, junction_keywords, disjunction_keywords, like_keywords, distinct_keywords, database_dico):
def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_keywords, sum_keywords, average_keywords, max_keywords, min_keywords, greater_keywords, less_keywords, between_keywords, negation_keywords, junction_keywords, disjunction_keywords, like_keywords, distinct_keywords, database_dico, database_object):
Thread.__init__(self)
self.where_objects = []
self.phrases = phrases
Expand All @@ -279,6 +279,7 @@ def __init__(self, phrases, tables_of_from, columns_of_values_of_where, count_ke
self.like_keywords = like_keywords
self.distinct_keywords = distinct_keywords
self.database_dico = database_dico
self.database_object = database_object

def get_tables_of_column(self, column):
tmp_table = []
Expand Down Expand Up @@ -359,6 +360,13 @@ def predict_junction(self, previous_column_offset, current_column_offset):
else:
return 'OR'

def uniquify(self, list):
    """Return the distinct elements of ``list`` in first-occurrence order.

    Args:
        list: iterable of elements to de-duplicate. The name shadows the
            builtin ``list``; it is kept unchanged so existing keyword
            callers keep working.

    Returns:
        A new list holding each distinct element once, in the order it
        first appears in the input. The input itself is not modified.
    """
    # Snapshot the input so it can be walked a second time if the fast path
    # has to be abandoned (also makes one-shot iterators safe).
    elements = tuple(list)
    try:
        # Fast path: a set gives O(1) membership tests, so the whole pass
        # is O(n) instead of the O(n^2) list scan.
        seen = set()
        already = []
        for element in elements:
            if element not in seen:
                seen.add(element)
                already.append(element)
        return already
    except TypeError:
        # At least one element is unhashable (e.g. a nested list). Restart
        # with the plain equality scan so the result matches what a purely
        # equality-based de-duplication would give.
        already = []
        for element in elements:
            if element not in already:
                already.append(element)
        return already

def run(self):
number_of_where_columns = 0
columns_of_where = []
Expand All @@ -379,13 +387,18 @@ def run(self):

for phrase in self.phrases:
for i in range(0, len(phrase)):
for table in self.database_dico:
if phrase[i] in self.database_dico[table]:
number_of_where_columns += 1
columns_of_where.append(phrase[i])
offset_of[phrase[i]] = i
column_offset.append(i)
break
for table_name in self.database_dico:
columns = self.database_object.get_table_by_name(table_name).get_columns()
for column in columns:
if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
number_of_where_columns += 1
columns_of_where.append(column.get_name())
offset_of[phrase[i]] = i
column_offset.append(i)
break
else:
continue
break

phrase_keyword = str(phrase[i]).lower() # for robust keyword matching

Expand Down Expand Up @@ -416,6 +429,8 @@ def run(self):
if phrase_keyword in self.like_keywords: # after the column
self.like_keyword_offset.append(i)

print(self.columns_of_values_of_where)
print(columns_of_where)

for table_of_from in self.tables_of_from:
where_object = Where()
Expand All @@ -437,7 +452,7 @@ def run(self):
operation_type = self.predict_operation_type(previous, current)

if len(self.columns_of_values_of_where) > i:
value = self.columns_of_values_of_where[i]
value = self.columns_of_values_of_where[len(self.columns_of_values_of_where) - len(columns_of_where) + i]
else:
value = 'OOV' # Out Of Vocabulary: default value

Expand All @@ -452,12 +467,13 @@ def join(self):

class GroupByParser(Thread):

def __init__(self, phrases, tables_of_from, database_dico):
def __init__(self, phrases, tables_of_from, database_dico, database_object):
Thread.__init__(self)
self.group_by_objects = []
self.phrases = phrases
self.tables_of_from = tables_of_from
self.database_dico = database_dico
self.database_object = database_object

def get_tables_of_column(self, column):
tmp_table = []
Expand All @@ -479,11 +495,12 @@ def run(self):
group_by_object = GroupBy()
for phrase in self.phrases:
for i in range(0, len(phrase)):
for table in self.database_dico:
if phrase[i] in self.database_dico[table]:
column = self.get_column_name_with_alias_table(
phrase[i], table_of_from)
group_by_object.set_column(column)
for table_name in self.database_dico:
columns = self.database_object.get_table_by_name(table_name).get_columns()
for column in columns:
if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
column_with_alias = self.get_column_name_with_alias_table(column.get_name(), table_of_from)
group_by_object.set_column(column_with_alias)
self.group_by_objects.append(group_by_object)

def join(self):
Expand All @@ -493,14 +510,15 @@ def join(self):

class OrderByParser(Thread):

def __init__(self, phrases, tables_of_from, asc_keywords, desc_keywords, database_dico):
def __init__(self, phrases, tables_of_from, asc_keywords, desc_keywords, database_dico, database_object):
Thread.__init__(self)
self.order_by_objects = []
self.phrases = phrases
self.tables_of_from = tables_of_from
self.asc_keywords = asc_keywords
self.desc_keywords = desc_keywords
self.database_dico = database_dico
self.database_object = database_object

def get_tables_of_column(self, column):
tmp_table = []
Expand Down Expand Up @@ -531,10 +549,12 @@ def run(self):
order_by_object = OrderBy()
for phrase in self.phrases:
for i in range(0, len(phrase)):
for table in self.database_dico:
if phrase[i] in self.database_dico[table]:
column = self.get_column_name_with_alias_table(phrase[i], table_of_from)
order_by_object.add_column(column, self.predict_order(phrase))
for table_name in self.database_dico:
columns = self.database_object.get_table_by_name(table_name).get_columns()
for column in columns:
if (phrase[i] == column.get_name()) or (phrase[i] in column.get_equivalences()):
column_with_alias = self.get_column_name_with_alias_table(column.get_name(), table_of_from)
order_by_object.add_column(column_with_alias, self.predict_order(phrase))
self.order_by_objects.append(order_by_object)

def join(self):
Expand Down Expand Up @@ -624,27 +644,35 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
med_phrase = ''
end_phrase = ''

''' @todo merge this part of the algorithm (detection of values of where) in the rest of the parsing algorithm (about line 725) '''

for i in range(0, len(input_word_list)):
if input_word_list[i] in self.database_dico:
if number_of_table_temp == 0:
start_phrase = input_word_list[:i]
number_of_table_temp += 1
last_table_position_temp = i
for table in self.database_dico:
if input_word_list[i] in self.database_dico[table]:
if number_of_where_column_temp == 0:
med_phrase = input_word_list[
len(start_phrase):last_table_position_temp + 1]
number_of_where_column_temp += 1
break
for table_name in self.database_dico:
if (input_word_list[i] == table_name) or (input_word_list[i] in self.database_object.get_table_by_name(table_name).get_equivalences()):
if number_of_table_temp == 0:
start_phrase = input_word_list[:i]
number_of_table_temp += 1
last_table_position_temp = i

columns = self.database_object.get_table_by_name(table_name).get_columns()
for column in columns:
if (input_word_list[i] == column.get_name()) or (input_word_list[i] in column.get_equivalences()):
if number_of_where_column_temp == 0:
med_phrase = input_word_list[len(start_phrase):last_table_position_temp + 1]
number_of_where_column_temp += 1
break
else:
if (number_of_table_temp != 0) and (number_of_where_column_temp == 0) and (i == (len(input_word_list) - 1)):
med_phrase = input_word_list[len(start_phrase):]
else:
if (number_of_table_temp != 0) and (number_of_where_column_temp == 0) and (i == (len(input_word_list) - 1)):
med_phrase = input_word_list[len(start_phrase):]
continue
break

end_phrase = input_word_list[len(start_phrase) + len(med_phrase):]

irext = ' '.join(end_phrase)

''' @todo set this part of the algorithm (detection of values of where) in the part of the phrases where parsing '''
''' @todo set this part of the algorithm (detection of values of where) in the WhereParser thread '''

if irext:
irext = self.remove_accents(irext.lower())
Expand Down Expand Up @@ -698,6 +726,8 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
# replace back <_> to spaces from the values assigned
columns_of_values_of_where.append(str("'" + str(irext_list[index]).replace('<_>', ' ') + "'"))

''' ----------------------------------------------------------------------------------------------------------- '''

tables_of_from = []
select_phrase = ''
from_phrase = ''
Expand Down Expand Up @@ -819,9 +849,9 @@ def parse_sentence(self, sentence, stopwordsFilter=None):
try:
select_parser = SelectParser(columns_of_select, tables_of_from, select_phrase, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.distinct_keywords, self.database_dico, self.database_object)
from_parser = FromParser(tables_of_from, columns_of_select, columns_of_where, self.database_object)
where_parser = WhereParser(new_where_phrase, tables_of_from, columns_of_values_of_where, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.greater_keywords, self.less_keywords, self.between_keywords, self.negation_keywords, self.junction_keywords, self.disjunction_keywords, self.like_keywords, self.distinct_keywords, self.database_dico)
group_by_parser = GroupByParser(group_by_phrase, tables_of_from, self.database_dico)
order_by_parser = OrderByParser(order_by_phrase, tables_of_from, self.asc_keywords, self.desc_keywords, self.database_dico)
where_parser = WhereParser(new_where_phrase, tables_of_from, columns_of_values_of_where, self.count_keywords, self.sum_keywords, self.average_keywords, self.max_keywords, self.min_keywords, self.greater_keywords, self.less_keywords, self.between_keywords, self.negation_keywords, self.junction_keywords, self.disjunction_keywords, self.like_keywords, self.distinct_keywords, self.database_dico, self.database_object)
group_by_parser = GroupByParser(group_by_phrase, tables_of_from, self.database_dico, self.database_object)
order_by_parser = OrderByParser(order_by_phrase, tables_of_from, self.asc_keywords, self.desc_keywords, self.database_dico, self.database_object)

select_parser.start()
from_parser.start()
Expand Down
2 changes: 1 addition & 1 deletion lang/english.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ ASC: ascending, increasing
DESC: descending, decreasing, inverse, reverse, opposite
GROUP: group, grouped
NEGATION: not, no
EQUAL: is, equal, equals, equal to, equals to
EQUAL: is, equal, equals, equal to, equals to, are
LIKE: like, likes
DISTINCT: distinct, different, distinctive, distinctly
2 changes: 1 addition & 1 deletion lang/french.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ ASC: ascendant, ascendante, croissant
DESC: descendant, descendante, décroissant, inverse, inversé, inversée
GROUP: groupe, groupé, rangé
NEGATION: ne, pas, aucun
EQUAL: est, égal, égal à
EQUAL: est, égal, égal à, sont
LIKE: comme
DISTINCT: distinct, distincte, distincts, distinctes, distinctive, distinctement, distinctivement
30 changes: 30 additions & 0 deletions test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ def test_main(self):
'database': './database/city.sql',
'language': './lang/english.csv',
'output': "SELECT DISTINCT emp.name FROM city INNER JOIN emp ON city.id = emp.cityId WHERE emp.score = '9';"
},
{
'input': "Compte les nom des élève dont les nom sont BELLE",
'database': './database/ecole.sql',
'language': './lang/french.csv',
'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle';"
}
]

Expand Down Expand Up @@ -252,6 +258,30 @@ def test_main(self):
'language': './lang/french.csv',
'thesaurus': 'thesaurus/th_french.dat',
'output': "SELECT classe.salle FROM classe;"
},
{
'input': "Compte les dénomination des étudiant dont les dénomination sont BELLE",
'database': './database/ecole.sql',
'language': './lang/french.csv',
'thesaurus': 'thesaurus/th_french.dat',
'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle';"
},
{
'input': "Compte les dénomination des étudiant dont les dénomination sont BELLE et l'ancienneté est 25",
'database': './database/ecole.sql',
'language': './lang/french.csv',
'thesaurus': 'thesaurus/th_french.dat',
'output': "SELECT COUNT(eleve.nom) FROM eleve WHERE eleve.nom = 'belle' AND eleve.age = '25';"
}
]

thesaurusTest2 = [
{
'input': "Quel est le cours où la pièce est B45",
'database': './database/ecole.sql',
'language': './lang/french.csv',
'thesaurus': 'thesaurus/th_french.dat',
'output': "SELECT * FROM classe WHERE classe.salle = 'b45';"
}
]

Expand Down

0 comments on commit 45adf21

Please sign in to comment.