This repository has been archived by the owner on Jul 31, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgen_fragment.py
59 lines (45 loc) · 1.52 KB
/
gen_fragment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
# Prefix that marks a token as an identifier. to_tseq() emits this prefix as
# its own element and passes the remainder of the token to the split function.
IDENTIFIER_PREFIX = '$I:'
# Presumably the placeholder for out-of-vocabulary words; it is not referenced
# in the visible part of this file -- TODO confirm against callers.
OOV_TOKEN = '$OOV'
# Tokens treated as sentence separators (presumably '$P:' marks punctuation
# tokens -- TODO confirm). The wider alternative below is kept for reference.
# SENTENCE_SEPS = ('$P:;', '$P:}', '$P:{')
SENTENCE_SEPS = ('$P:;', )
def identifier_split_java(token):
    """Split a Java identifier-like token into its word parts.

    Trailing whitespace is ignored. Words are runs of uppercase letters
    followed by optional lowercase letters, or runs of lowercase letters;
    a ':' is kept as its own part ('default:' -> 'default', ':'). Any other
    character (digits, '_', '.', ...) is dropped. Letter case is preserved.

    Java-specific handling:
      * a leading '@' (such as '@Override') is emitted as a separate '@' part;
      * a trailing '.*' (such as 'java.util.*') adds a final '*' part.

    Returns a list of strings.
    """
    stripped = token.rstrip()
    is_annotation = stripped.startswith('@')

    # For annotations the '@' becomes the first part and is excluded
    # from the text that gets split into words.
    parts = ['@'] if is_annotation else []
    text = stripped[1:] if is_annotation else stripped
    parts += re.findall(r'[A-Z]+[a-z]*|[a-z]+|[:]', text)

    # Wildcard import: the regex drops '.*', so re-add the '*' explicitly.
    if stripped.endswith('.*'):
        parts.append('*')
    return parts
def identifier_split_python(token):
    """Split a Python identifier token into lowercase word parts.

    Trailing whitespace is ignored. A token starting with '__' (a special
    method name such as '__init__') is returned whole, unchanged in case,
    as a single-element list. Otherwise the token is split into runs of
    uppercase letters followed by optional lowercase letters, or runs of
    lowercase letters; every other character is dropped and each part is
    lowercased.

    Returns a list of strings.
    """
    stripped = token.rstrip()
    if stripped.startswith('__'):
        # Special method name: keep it as one unit rather than splitting it.
        return [stripped]
    words = re.findall(r'[A-Z]+[a-z]*|[a-z]+', stripped)
    return list(map(str.lower, words))
def identifier_split_default(token):
    """Split an identifier token into lowercase word parts.

    Trailing whitespace is ignored. The token is split into runs of
    uppercase letters followed by optional lowercase letters, or runs of
    lowercase letters; every other character (digits, '_', ...) is dropped.
    Each part is lowercased.

    Returns a list of strings.
    """
    words = re.findall(r'[A-Z]+[a-z]*|[a-z]+', token.rstrip())
    return list(map(str.lower, words))
def to_tseq(tokens, identifier_split_func):
    """Expand identifier tokens in a token sequence into their word parts.

    tokens: iterable of token strings.
    identifier_split_func: callable taking the identifier text (the token
        with IDENTIFIER_PREFIX removed) and returning an iterable of parts.

    Each token starting with IDENTIFIER_PREFIX is replaced by the bare
    prefix followed by the parts produced by identifier_split_func. All
    other tokens are copied through unchanged.

    Returns a new list of strings.
    """
    prefix_len = len(IDENTIFIER_PREFIX)
    result = []
    for tok in tokens:
        if not tok.startswith(IDENTIFIER_PREFIX):
            result.append(tok)
            continue
        # Identifier: emit the prefix marker, then the split-up name.
        result.append(IDENTIFIER_PREFIX)
        result += identifier_split_func(tok[prefix_len:])
    return result
def split_to_tseqwon_and_liseq(tseq):
    """Separate '$$' markers from a token sequence, tracking a running index.

    Every '$$' element is removed from the output and increments a counter
    that starts at 0 (presumably '$$' marks a line break, making the counter
    a line index -- TODO confirm). Every other token is kept, paired with
    the counter value at the point it was seen.

    Returns a tuple (tseqwon, liseq) of two equal-length lists: the tokens
    without '$$' markers, and the counter value for each kept token.
    """
    kept_tokens = []
    indices = []
    marker_count = 0
    for tok in tseq:
        if tok != '$$':
            kept_tokens.append(tok)
            indices.append(marker_count)
        else:
            marker_count += 1
    return kept_tokens, indices