-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathsentenceEmbedder.py
89 lines (67 loc) · 2.55 KB
/
sentenceEmbedder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 22 18:27:57 2017
@author: simon
"""
import sys
import traceback
import nltk
import torch
from torch.autograd import Variable
import os
# Ensure the "model" directory (which contains the InferSent package) is
# importable regardless of whether the script is launched from inside it.
directory = os.getcwd()
if not directory.endswith('model'):  # endswith is the idiomatic form of directory[-5:] == 'model'
    directory = directory + '/model'
sys.path.insert(0, directory)
print("new path added to sys.path : ", directory)
class Sentence2Vec(object):
    """Thin wrapper around a pre-trained InferSent model that encodes a
    sentence (or a list of sentences) into a torch.Tensor of embeddings."""

    def __init__(self,
                 glove_path=directory+"/InferSent/dataset/GloVe/glove.840B.300d.txt",
                 useCuda=False,
                 Nwords=10000,
                 pathToInferSentModel=directory+'/InferSent/infersent.allnli.pickle',
                 modelDirectory=directory+"/InferSent"):
        """Load the pickled InferSent model and build its GloVe vocabulary.

        Parameters
        ----------
        glove_path : str
            Path to the GloVe word-vector text file.
        useCuda : bool
            If True, load the model as pickled (GPU tensors); otherwise
            remap all storages to CPU.
        Nwords : int
            Number of most-common GloVe words loaded into the vocabulary.
        pathToInferSentModel : str
            Path to the pickled InferSent model.
        modelDirectory : str
            Directory of the InferSent package; must be on sys.path so the
            pickle can resolve the model's class during torch.load.
        """
        print("Loading Glove Model")
        # The pickle references classes defined inside the InferSent package,
        # so that directory has to be importable before torch.load runs.
        if modelDirectory not in sys.path:
            print("adding local directory to load the model")
            sys.path.append(modelDirectory)
        else:
            print("directory already in the sys.path")
        # Tokenizer data needed by infersent.encode(..., tokenize=True).
        nltk.download('punkt')
        if useCuda:
            print("you are on GPU (encoding ~1000 sentences/s, default)")
            self.infersent = torch.load(pathToInferSentModel)
        else:
            print("you are on CPU (~40 sentences/s)")
            # map_location remaps any GPU-saved storages onto the CPU.
            self.infersent = torch.load(pathToInferSentModel,
                                        map_location=lambda storage, loc: storage)
        self.infersent.set_glove_path(glove_path)
        print("loading the {} most common words".format(Nwords))
        try:
            self.infersent.build_vocab_k_words(K=Nwords)
            print("vocab trained")
        except Exception:
            # Best-effort: report the failure (with the full stack trace,
            # instead of just str(e)) and keep the instance usable; the
            # suggested workaround targets a known GloVe encoding issue.
            # NOTE(review): the hint probably means encoding='utf8' — the
            # string is kept verbatim to avoid changing runtime output.
            print("ERROR")
            traceback.print_exc()
            print("\nPOSSIBLE SOLUTION")
            print("if you have an encoding error, specify encoder='utf8' in the models.py file line 111 ")
        print("done")

    def encodeSent(self, sentence):
        """Encode a single sentence or a sequence of sentences.

        Returns a torch.Tensor — one embedding row per input sentence
        (a lone string is wrapped into a one-element batch).
        """
        # isinstance (not type(...) == str) also accepts str subclasses.
        if isinstance(sentence, str):
            return torch.from_numpy(self.infersent.encode([sentence], tokenize=True))
        return torch.from_numpy(self.infersent.encode(sentence, tokenize=True))
#test code
#model=Sentence2Vec()
#sentence='Hello I am Simon'
#sentences=[sentence,'How are you ?']
#x=model.encodeSent(sentence)
#print(x.size())
#x=model.encodeSent(sentences)
#print(x.size())
#model.infersent.visualize(sentence)
#
#