-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdep_disc_features.py
62 lines (45 loc) · 2.28 KB
/
dep_disc_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Count discrete part-of-speech context features for dependency parsing.

The script reads tab-separated token files (one token per line, a blank line
after each sentence), keeps the value of the fourth column as the token's
POS tag, and for every ordered (parent, child) position pair of a sentence
counts:

* ``parent(k)=TAG`` / ``child(k)=TAG`` for offsets k in -1, 0, +1, where
  ``TAG`` is ``*`` when the shifted position lies outside the sentence;
* ``between=TAG`` for every token strictly between the two positions, or
  ``between=-`` when the positions are adjacent (only if ``featbetween``).

The counts are written to a text file as ``rank<TAB>feature<TAB>count``,
ordered by decreasing count.
"""
__author__ = 'husnusensoy'

from collections import defaultdict
from math import fabs
import logging

logging.basicConfig(level=logging.INFO)

# Input file pattern; both placeholders are filled with the section number.
DATA_PATH_TEMPLATE = '/Users/husnusensoy/Documents/data/conllWSJToken_wikipedia2MUNK-25-fixed/%02d/wsj_00%02d.dp'
# Sections processed when the script is run directly.
SECTIONS = range(2, 22)
# File the ranked feature list is written to.
OUTPUT_PATH = "features.txt"

# When true, also count the tags of the tokens lying between parent and child.
featbetween = True
# Feature string -> number of occurrences; filled by main() by default.
disc_feat = defaultdict(int)


def _context_tag(tags, position):
    """Return the tag at 1-based ``position``, or ``'*'`` if it is outside ``tags``."""
    if position <= 0 or position > len(tags):
        return '*'
    return tags[position - 1]


def count_sentence_features(tags, counts, with_between=True):
    """Add the features of one sentence to ``counts``.

    :param tags: POS tags of the sentence, in token order.
    :param counts: mapping ``feature -> occurrences`` updated in place; it
        must supply 0 for missing keys (e.g. ``defaultdict(int)``).
    :param with_between: also count the ``between=...`` features.
    """
    size = len(tags)
    # Parent positions start at 0 while child positions start at 1; position 0
    # has no token of its own (presumably the artificial ROOT -- confirm).
    for head in range(0, size + 1):
        for child in range(1, size + 1):
            if head == child:
                continue
            for offset in (-1, 0, 1):
                counts['parent(%d)=%s' % (offset, _context_tag(tags, head + offset))] += 1
                counts['child(%d)=%s' % (offset, _context_tag(tags, child + offset))] += 1
            # NOTE(review): the original indentation was lost; the between
            # features are assumed to be counted once per (parent, child)
            # pair, not once per offset -- confirm against the intended model.
            if not with_between:
                continue
            if fabs(head - child) > 1:
                for middle in range(min(head, child) + 1, max(head, child)):
                    counts['between=%s' % tags[middle - 1]] += 1
            else:
                # Adjacent positions: nothing lies between them.
                counts['between=-'] += 1


def count_file_features(lines, counts, with_between=True):
    """Add the features of every sentence found in ``lines`` to ``counts``.

    :param lines: iterable of text lines; non-blank lines are tab-separated
        token rows, blank (whitespace-only) lines end the current sentence.
    :param counts: mapping updated in place, see ``count_sentence_features``.
    :param with_between: forwarded to ``count_sentence_features``.
    :raises ValueError: if a token row has fewer than four columns.
    """
    tags = []
    for line in lines:
        row = line.strip()
        if row:
            fields = row.split('\t')
            if len(fields) < 4:
                raise ValueError(
                    "expected at least 4 tab-separated columns, got %d: %r" % (len(fields), row))
            tags.append(fields[3])
        else:
            count_sentence_features(tags, counts, with_between)
            tags = []
    # Flush a final sentence that is not followed by a blank line; otherwise
    # its tokens would be merged into the first sentence of the next file
    # (or silently dropped for the last file).
    if tags:
        count_sentence_features(tags, counts, with_between)


def write_features(counts, path):
    """Write ``counts`` to ``path`` as ``rank<TAB>feature<TAB>count`` lines.

    Lines are ordered by decreasing count; ranks start at 0.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    with open(path, "w") as fp:
        for rank, (feature, occurrence) in enumerate(ranked):
            fp.write("%d\t%s\t%d\n" % (rank, feature, occurrence))


def main(sections=SECTIONS, path_template=DATA_PATH_TEMPLATE,
         output_path=OUTPUT_PATH, counts=None):
    """Count features over all ``sections`` and write the ranked list.

    :param sections: iterable of section numbers to read.
    :param path_template: ``%``-format string taking the section number twice.
    :param output_path: destination of the ranked feature file.
    :param counts: mapping to accumulate into; defaults to the module-level
        ``disc_feat``.
    :returns: the mapping holding the accumulated counts.
    """
    if counts is None:
        counts = disc_feat
    for sect in sections:
        logging.info("Processing section %d...", sect)
        with open(path_template % (sect, sect)) as fp:
            count_file_features(fp, counts, featbetween)
    write_features(counts, output_path)
    logging.info("Total number of features are %d", len(counts))
    return counts


if __name__ == "__main__":
    main()