-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathData.py
164 lines (119 loc) · 4.56 KB
/
Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
######################################################################################################
#
# Organization: Asociacion De Investigacion En Inteligencia Artificial Para La Leucemia Peter Moss
# Repository: GeniSysAI
# Project: Natural Language Understanding Engine
#
# Author: Adam Milton-Barker (AdamMiltonBarker.com)
#
# Title: Data Class
# Description: Data helper functions.
# License: MIT License
# Last Modified: 2020-10-01
#
######################################################################################################
import json, random, nltk, numpy as np
from nltk.stem.lancaster import LancasterStemmer
from Classes.Helpers import Helpers
class Data():
    """ Data Class

    Data helper functions for the NLU engine: loading the training and
    trained-model JSON, tokenizing/stemming text and building the
    bag-of-words feature vectors used for training and inference.
    """

    def __init__(self):
        """ Initializes the Data class. """

        # Punctuation tokens excluded from every extracted word list.
        self.ignore = [',', '.', '!', '?']
        self.Helpers = Helpers("Data")
        self.LancasterStemmer = LancasterStemmer()
        self.Helpers.logger.info("Data class initialized.")

    def loadTrainingData(self):
        """ Loads the NLU and NER training data from Model/Data/training.json """

        with open("Model/Data/training.json") as jsonData:
            trainingData = json.load(jsonData)
        self.Helpers.logger.info("Training Data Ready")
        return trainingData

    def loadTrainedData(self):
        """ Loads the saved training configuration from Model/data.json """

        with open("Model/data.json") as jsonData:
            modelData = json.load(jsonData)
        self.Helpers.logger.info("Model Data Ready")
        return modelData

    def sortList(self, listToSort):
        """ Returns a sorted copy of listToSort with duplicates removed. """

        return sorted(set(listToSort))

    def extract(self, data=None, splitIt=False):
        """ Extracts and stems words from a sentence.

        Args:
            data: a string (when splitIt is True) or an iterable of tokens.
            splitIt: when True, data is split on whitespace first.

        Returns:
            List of Lancaster-stemmed tokens, with punctuation tokens
            (self.ignore) removed.
        """

        tokens = data.split() if splitIt else data
        return [self.LancasterStemmer.stem(word) for word in tokens
                if word not in self.ignore]

    def makeBagOfWords(self, sInput, words):
        """ Makes a bag of words used by the inference and training features.

        If called during training, sInput is a list of (already stemmed)
        tokens and a binary membership list over `words` is returned.
        Otherwise sInput is a raw string; it is tokenized/stemmed and a
        numpy count vector over `words` is returned.
        """

        if isinstance(sInput, list):
            # Training path: binary presence vector.
            return [1 if word in sInput else 0 for word in words]

        # Inference path: count occurrences of each vocabulary word.
        # `words` is produced by sortList elsewhere, so entries are unique
        # and a one-time index map replaces the original O(n^2) scan.
        bagOfWords = np.zeros(len(words))
        wordIndex = {word: i for i, word in enumerate(words)}
        for cword in self.extract(sInput, True):
            if cword in wordIndex:
                bagOfWords[wordIndex[cword]] += 1
        return bagOfWords

    def prepareClasses(self, intent, classes):
        """ Prepares classes

        Appends intent to classes if it is not already present and returns
        the (mutated) classes list.
        """

        if intent not in classes:
            classes.append(intent)
        return classes

    def prepareData(self, trainingData=None, wordsHldr=None, dataCorpusHldr=None, classesHldr=None):
        """ Prepares the NLU and NER training data.

        Loops through the intents from the dataset, replacing entity text
        with <entity> placeholders, and accumulates the vocabulary, the
        corpus of (tokens, intent) pairs and the class list.

        Args:
            trainingData: dict with an 'intents' list (see training.json).
            wordsHldr / dataCorpusHldr / classesHldr: optional accumulators;
                fresh lists are created when omitted.

        Returns:
            (sorted stemmed vocabulary, sorted classes, corpus,
             intent-name -> intent-index map)
        """

        # BUGFIX: the previous mutable default arguments ([]) were shared
        # across calls, so repeated calls leaked accumulated state.
        wordsHldr = [] if wordsHldr is None else wordsHldr
        dataCorpusHldr = [] if dataCorpusHldr is None else dataCorpusHldr
        classesHldr = [] if classesHldr is None else classesHldr

        counter = 0
        intentMap = {}
        for intent in trainingData['intents']:
            theIntent = intent['intent']
            for text in intent['text']:
                if 'entities' in intent and len(intent['entities']):
                    i = 0
                    for entity in intent['entities']:
                        # NOTE(review): this replaces the i-th *text sample*
                        # with the entity placeholder — it looks like it
                        # should use the entity's value instead; behavior
                        # preserved as-is, confirm against the training.json
                        # schema.
                        tokens = text.replace(
                            trainingData['intents'][counter]["text"][i],
                            "<" + entity["entity"] + ">").lower().split()
                        wordsHldr.extend(tokens)
                        dataCorpusHldr.append((tokens, theIntent))
                        i = i + 1
                else:
                    tokens = text.lower().split()
                    wordsHldr.extend(tokens)
                    dataCorpusHldr.append((tokens, theIntent))
            intentMap[theIntent] = counter
            classesHldr = self.prepareClasses(theIntent, classesHldr)
            counter = counter + 1

        return (self.sortList(self.extract(wordsHldr, False)),
                self.sortList(classesHldr), dataCorpusHldr, intentMap)

    def finaliseData(self, classes, dataCorpus, words):
        """ Finalises the NLU training data.

        Converts the corpus into shuffled (bag-of-words, one-hot class)
        training pairs.

        Returns:
            (list of bag-of-words vectors, list of one-hot output vectors)
        """

        trainData = []
        out = np.zeros(len(classes))
        for document in dataCorpus:
            output = list(out)
            output[classes.index(document[1])] = 1
            trainData.append([self.makeBagOfWords(
                self.extract(document[0], False), words), output])
        random.shuffle(trainData)
        # BUGFIX: dtype=object is required — the rows hold ragged Python
        # lists, and NumPy >= 1.24 raises ValueError when building an array
        # from them without it.
        trainData = np.array(trainData, dtype=object)
        self.Helpers.logger.info("Finalised Training Data Ready")
        return list(trainData[:, 0]), list(trainData[:, 1])