-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathArticle.py
39 lines (33 loc) · 1.49 KB
/
Article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Holds the text and the ID of an article.
from __future__ import unicode_literals
from Truecaser import *
import nltk
class Article:
    """An article's ID and body text, plus the models used to re-case it.

    The first whitespace-delimited token of the raw input is taken as the
    article ID; everything following that token is kept as the article text.
    The remaining constructor arguments are stored untouched and handed to
    ``getTrueCase`` by ``fix_casing``.  Casing is not applied at construction
    time; callers invoke ``fix_casing`` explicitly.
    """

    def __init__(self, text, uniDist=None, backwardBiDist=None,
                 forwardBiDist=None, trigramDist=None, wordCasingLookup=None):
        """Split *text* into an ID and a body, and store the casing models.

        Args:
            text: Raw article string whose first token is the article ID.
            uniDist: Stored as-is; forwarded to ``getTrueCase``.
            backwardBiDist: Stored as-is; forwarded to ``getTrueCase``.
            forwardBiDist: Stored as-is; forwarded to ``getTrueCase``.
            trigramDist: Stored as-is; forwarded to ``getTrueCase``.
            wordCasingLookup: Stored as-is; forwarded to ``getTrueCase``.

        Raises:
            IndexError: If *text* is empty or contains only whitespace.
        """
        # Strip leading whitespace before slicing: the ID is removed by
        # length from the start of the string, so any leading blanks would
        # otherwise make the slice cut through the ID itself.
        stripped = text.lstrip()
        self.id = stripped.split()[0]
        # The separator after the ID is intentionally kept, so the text
        # normally starts with whitespace.
        self.text = stripped[len(self.id):]
        self.uniDist = uniDist
        self.backwardBiDist = backwardBiDist
        self.forwardBiDist = forwardBiDist
        self.trigramDist = trigramDist
        self.wordCasingLookup = wordCasingLookup

    def fix_casing(self, text):
        """Return *text* re-cased sentence by sentence.

        Each sentence from ``nltk.sent_tokenize`` is word-tokenized,
        lower-cased and passed to ``getTrueCase`` together with the models
        stored on this instance.  Punctuation tokens are then attached to the
        preceding token and the tokens are joined with single spaces.

        Args:
            text: The text to re-case.

        Returns:
            The re-cased sentences concatenated, each one prefixed with a
            single space (so a non-empty result starts with a space).  An
            empty string if no sentences are found.
        """
        cased_sentences = []
        for sentence in nltk.sent_tokenize(text):
            tokens = [token.lower() for token in nltk.word_tokenize(sentence)]
            # NOTE(review): 'title' is presumably the out-of-vocabulary
            # casing option of getTrueCase -- confirm against Truecaser.
            tokens_true_case = getTrueCase(
                tokens, 'title', self.wordCasingLookup, self.uniDist,
                self.backwardBiDist, self.forwardBiDist, self.trigramDist)
            cased_sentences.append(
                ' '.join(self._attach_punctuation(tokens_true_case)))
        return ''.join(' ' + sentence for sentence in cased_sentences)

    @staticmethod
    def _attach_punctuation(tokens):
        """Append each punctuation token to the token that precedes it.

        A punctuation token at the very start has nothing to attach to and is
        kept as its own entry.

        Args:
            tokens: Iterable of token strings.

        Returns:
            A new list of tokens with punctuation merged into the previous
            entry.
        """
        # Imported locally because this module does not import ``string``
        # itself and should not depend on a wildcard import providing it.
        import string

        grouped = []
        for token in tokens:
            # Index the last stored entry directly instead of tracking a
            # separate counter that can drift from the list length.
            if grouped and token in string.punctuation:
                grouped[-1] = grouped[-1] + token
            else:
                grouped.append(token)
        return grouped