-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgephi-wordpairs-5
76 lines (64 loc) · 2.5 KB
/
gephi-wordpairs-5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#This script prepares a single .txt file for use in Gephi.
#This Python script spits out each stemmed non-stopword as a node, and counts word-pairs as edges.
#That is, an edge occurs whenever one word occurs within 4 words of another word.
#The edge weight increases with the frequency of the word-pair.
#You will probably have to hand-prune the resulting file and make sure the last few tags were added.
import os
import os.path
import re
import string
from collections import Counter
from xml.sax.saxutils import quoteattr

import nltk
from nltk.stem.porter import PorterStemmer
# Placeholder paths: edit these two constants before running the script.
INPUT_PATH = '----filename----.txt'
OUTPUT_PATH = '----filename----.graphml'

# Each word is linked to this many of the words that follow it.
WINDOW = 4
# Stems of this length or more are dropped.
MAX_STEM_LEN = 15

GRAPHML_HEADER = (
    '<?xml version="1.0" encoding="UTF-8"?> \n'
    ' <graphml xmlns="http://graphml.graphdrawing.org/xmlns"> \n'
    ' <key id="d0" for="edge" attr.name="weight" attr.type="double"/> \n'
    ' <graph id="G" edgedefault="undirected">'
)
GRAPHML_FOOTER = ' </graph></graphml>'


def stem_tokens(text, stem, stopwords):
    """Return the stemmed, filtered words of *text* in reading order.

    text      -- the raw document; it is split on whitespace.
    stem      -- callable mapping a lower-cased word to its stem.
    stopwords -- container of words to discard.

    Each token has surrounding punctuation stripped and is lower-cased
    before stemming.  A token is dropped when it is empty after cleaning,
    when its stem has MAX_STEM_LEN or more characters, or when either the
    cleaned token or its stem is in *stopwords*.
    """
    kept = []
    for token in text.split():
        cleaned = token.strip(string.punctuation).lower()
        stemmed = stem(cleaned)
        # A token made only of punctuation would otherwise become a node
        # with an empty id.
        if not stemmed or len(stemmed) >= MAX_STEM_LEN:
            continue
        # Test the word before stemming as well: the stemmer may alter a
        # stop word so that it no longer matches the stop list.
        if cleaned in stopwords or stemmed in stopwords:
            continue
        kept.append(stemmed)
    return kept


def node_lines(stems):
    """Return one <node> line per distinct stem, in first-seen order."""
    seen = set()
    lines = []
    for stem in stems:
        if stem not in seen:
            seen.add(stem)
            # quoteattr supplies the surrounding quotes and escapes
            # characters such as & and < that would break the XML.
            lines.append('<node id=' + quoteattr(stem) + '/>' + '\n')
    return lines


def window_pairs(stems, window=WINDOW):
    """Return (source, target) tuples for every word and its followers.

    Each position in *stems* is paired with up to *window* following
    positions.  Pairs are taken by position, so a word that appears more
    than once is paired with the neighbours of each appearance, and the
    last few words simply get fewer pairs instead of indexing past the
    end of the list.
    """
    pairs = []
    for position, source in enumerate(stems):
        for target in stems[position + 1:position + 1 + window]:
            pairs.append((source, target))
    return pairs


def format_edge(edge_id, pair, weight):
    """Return the <edge> line for *pair* with the given id and weight."""
    source, target = pair
    return '<edge id="%d" source=%s target=%s><data key="d0">%d.0</data></edge>\n' % (
        edge_id, quoteattr(source), quoteattr(target), weight)


def edge_lines(pairs):
    """Return one <edge> line per distinct pair, weighted by frequency.

    Pairs that occur once come first, in order of appearance, and use
    their position in *pairs* as the edge id with weight 1.0.  Pairs that
    occur several times follow, in order of first appearance, with ids
    continuing from len(pairs) and the occurrence count as the weight.
    """
    counts = Counter(pairs)  # one pass instead of list.count() per pair
    lines = []
    repeated = []
    queued = set()
    for edge_id, pair in enumerate(pairs):
        if counts[pair] == 1:
            lines.append(format_edge(edge_id, pair, 1))
        elif pair not in queued:
            queued.add(pair)
            repeated.append(pair)
    for offset, pair in enumerate(repeated):
        lines.append(format_edge(len(pairs) + offset, pair, counts[pair]))
    return lines


def main(in_path=INPUT_PATH, out_path=OUTPUT_PATH):
    """Read *in_path* and write its word-pair graph to *out_path*.

    The output is a GraphML document: the header, one node per distinct
    stem, one weighted edge per distinct word pair, then the closing tags.
    """
    with open(in_path, 'r') as source:
        text = source.read()

    # Load the stop list once; a set gives constant-time membership tests.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stems = stem_tokens(text, PorterStemmer().stem, stopwords)
    pairs = window_pairs(stems)
    print('stems done')

    # The with-block closes the file even if writing fails part-way.
    with open(out_path, 'w') as out:
        out.write(GRAPHML_HEADER)
        out.writelines(node_lines(stems))
        out.writelines(edge_lines(pairs))
        out.write(GRAPHML_FOOTER)
    print('donesies')


if __name__ == '__main__':
    main()