-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcreate_parallel.py
164 lines (107 loc) · 3.42 KB
/
create_parallel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
############################################################
# Script for creating parallel aligned data from the existing XML parsed data.
# Pre-requisites: -
# Usage : python create_parallel.py [../inputs/processed-etcsl-sumerian/] [../inputs/processed-etcsl-eng/]
############################################################
import sys, os
import numbers, codecs
SUM_DIR = sys.argv[1]
ENG_DIR = sys.argv[2]
eng_list = os.listdir(ENG_DIR)
sum_list = os.listdir(SUM_DIR)
bad_files = [] #create a file where proper alignment is not maintained
for eng_file in eng_list:
# Find equivalent Sumerian file for the English translations
sum_file = ''.join(eng_file)
sum_file = 'c' + sum_file[1:]
line_ranges = []
contents_eng = [] # Store eng lines for a particular file
contents_sum = []
sum_dict = {}
print (sum_file)
print (eng_file)
with codecs.open(ENG_DIR+eng_file, 'r', encoding='utf-8') as e, codecs.open(SUM_DIR+sum_file, 'r', encoding='utf-8') as s:
next(e) # Skip the first line
next(s)
for line in s:
line = line.split(',') # Split by comma for Sumerian
if len(line) < 4: #For broken lines in the Sumerian script
continue
sum_dict[line[3]] = line[4].strip() #Create a dictionary mapping of lines-strings in Sumerian
for line in e:
line = line.split('\t')
#Boundary conditions
if len(line) < 4:
bad_files.append(eng_file)
continue
try:
b = int(line[3].split('-')[0])
c = int(line[3].split('-')[1])
except:
continue
'''
if len(line[3].split('-'))==2:
line_ranges.append(int(line[3].split('-')[0]))
line_ranges.append(int(line[3].split('-')[1]))
else:
line_ranges.append(int(line[3]))
'''
line_ranges.append(line[3])
if (line[4]==''):
print ("EMPTY : ", eng_file)
continue
contents_eng.append(line[4])
print (line_ranges)
i=0
temp_string = ''
for item in line_ranges:
# Assuming coherence now, no alpha chars. in b/w etc.
if len(item.split('-')) == 2:
start = int(item.split('-')[0])
end = int(item.split('-')[1])
elif len(item.split('-') == 1):
start = int(item)
end = int(item)
for x in range(start, end+1):
if str(x) in sum_dict:
temp_string = temp_string + sum_dict[str(x)].strip() + ' '
contents_sum.append(temp_string)
temp_string = ''
'''
for line in s:
line = line.split(',')
#Boundary condition if line_ranges ends
if i >= len(line_ranges):
temp_string = temp_string + line[4].strip()
contents_sum.append(temp_string)
temp_string = ''
continue
# Error handling
try:
a = int(line[3])
except:
print ("bad file")
bad_files.append(eng_file)
continue
if (line_ranges[i][0].isalpha()):
break #this is for sections now, modify later.
if (len(line_ranges.split('-') == 2):
if int(line[3]) in list(range(line_ranges[i], line_ranges[i+1]+1)):
temp_string = temp_string + line[4].strip() + ' '
#print (temp_string)
continue
temp_string = temp_string.strip()
contents_sum.append(temp_string)
temp_string = ''
i+=1
'''
#print (contents_eng)
#print (contents_sum)
if eng_file not in bad_files:
with codecs.open('../inputs/parallel-etcsl-eng/'+eng_file, 'w+') as pe, codecs.open('../inputs/parallel-etcsl-sum/'+sum_file, 'w+') as ps:
for item in contents_eng:
pe.write(item)
#pe.write('\n')
for item in contents_sum:
ps.write(item)
ps.write('\n')