-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdt.py
executable file
·119 lines (81 loc) · 3.08 KB
/
dt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/local/bin/pypy
import json
import os
import sys
from conll import ConLLiter
from subs import subssentiter
__author__ = 'husnusensoy'
def ComplexHandler(Obj):
    """Fallback serializer intended for ``json.dump(..., default=ComplexHandler)``.

    Args:
        Obj: An object the JSON encoder could not serialize on its own.

    Returns:
        The result of ``Obj.jsonable()`` when ``Obj`` exposes that attribute.

    Raises:
        TypeError: If ``Obj`` has no ``jsonable`` attribute.
    """
    if hasattr(Obj, 'jsonable'):
        return Obj.jsonable()
    # Call-style raise is valid on both Python 2 and Python 3, unlike the
    # statement form ``raise TypeError, msg``.
    raise TypeError(
        'Object of type %s with value of %s is not JSON serializable'
        % (type(Obj), repr(Obj))
    )
class Substitution():
    """Pairs a substitute value with its log-probability."""

    def __init__(self, subs, logprob):
        # The attribute names double as the keys exposed by jsonable().
        self.substitute, self.logprobability = subs, logprob

    def jsonable(self):
        """Return the instance attribute dict for JSON serialization."""
        return vars(self)
class Form():
    """A word form together with its list of substitutes."""

    def __init__(self, form="Husnu", subs=None):
        """Store the form and its substitutes.

        Args:
            form: The form value to store.
            subs: List of substitutes for ``form``. When omitted (``None``),
                a fresh three-element placeholder list is built for this
                instance.
        """
        # A list literal in the signature would be created once and shared by
        # every instance constructed with the default; build it per call.
        if subs is None:
            subs = [
                Substitution("Nuri", -15.0),
                Substitution("Fahri", -16.0),
                Substitution("Nurdan", -17.0),
            ]
        self.form = form
        self.subs = subs

    def jsonable(self):
        """Return the instance attribute dict for JSON serialization."""
        return self.__dict__
class ConLLSentence:
    """Container for one sentence's columns, serializable via jsonable().

    The constructor only installs placeholder values; callers are expected to
    overwrite the attributes after construction.
    """

    def __init__(self):
        # Attribute creation order is kept stable because jsonable() exposes
        # the attribute dict directly.
        self.id = 1
        self.section = 23
        self.tokenid = []
        self.form = Form()
        for column in ("lemma", "cpostag", "postag"):
            setattr(self, column, [])
        self.extendedtag = {}
        for column in ("feats", "head", "deprel", "phead", "pdeprel"):
            setattr(self, column, [])

    def jsonable(self):
        """Return the instance attribute dict for JSON serialization."""
        return vars(self)
if __name__ == "__main__":
    # Script entry point: walk the CoNLL treebank in lockstep with the
    # substitute sentences and the per-token tag files, collect one
    # ConLLSentence per sentence, and dump the result as gzipped JSON.
    import gzip

    corpus = []

    tagdir = "/Users/husnusensoy/uparse/data/upos"
    tagfiles = ["parsing-45","parsing-50",'parsing-100','parsing-250','parsing-500']
    conlldir = "/Users/husnusensoy/uparse/data/nlp/treebank/treebank-2.0/combined/conll"

    tagger = {}

    try:
        # Opened inside the try so that handles already opened are still
        # closed if a later open() (or anything in the loop) raises.
        for tf in tagfiles:
            tagger[tf] = open(os.path.join(tagdir, tf), "r")

        subsiter = subssentiter()

        for idx, (sentence, section) in enumerate(ConLLiter(conlldir, '.dp'), start=1):
            conllsent = ConLLSentence()

            conllsent.id = idx
            conllsent.section = int(section)
            conllsent.tokenid = [w._id for w in sentence]

            subssentence = next(subsiter)

            # NOTE(review): this only fires when *no* token form matches its
            # substitute counterpart; `all` may have been intended to catch
            # partial misalignment. Behavior kept as-is -- confirm against
            # the substitute data before tightening.
            if not any(c._form == s for c, s in zip(sentence, subssentence.orginal())):
                sys.stderr.write("Unmatched substitute. Potential File error\n")
                break

            conllsent.form = [Form(form=w._form,subs=s.topN()) for w,s in zip(sentence,subssentence)]
            conllsent.lemma = [w._lemma for w in sentence]
            conllsent.cpostag = [w._cpostag for w in sentence]
            conllsent.postag = [w._postag for w in sentence]

            # Each tag file contributes one integer line per token, consumed
            # in sentence order. Builtin next() works on Python 2 and 3,
            # unlike the file.next() method.
            for t in tagger:
                conllsent.extendedtag[t] = [int(next(tagger[t])) for _ in sentence]

            conllsent.feats = [w._feats for w in sentence]
            conllsent.head = [w._head for w in sentence]
            conllsent.deprel = [w._deprel for w in sentence]
            conllsent.phead = [w._phead for w in sentence]
            conllsent.pdeprel = [w._pdeprel for w in sentence]

            corpus.append(conllsent)

            if idx % 1000 == 0:
                sys.stderr.write("%d sentences converted\n"%idx)
    finally:
        # Always release the tag files, including on error.
        for handle in tagger.values():
            handle.close()

    # Reached after a normal finish or after the mismatch `break` above, in
    # which case the sentences converted so far are still written out.
    with gzip.open("corpus.json.gz","w") as fp:
        json.dump(corpus, fp, default=ComplexHandler, indent=2)