-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparse_10K.py
79 lines (68 loc) · 2.69 KB
/
parse_10K.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Adapted from https://github.com/rsljr/edgarParser/blob/master/parse_10K.py
import re
import sys
import unicodedata

import requests
from bs4 import BeautifulSoup as bs
# Placeholder returned in place of a section whose boundaries were not found.
_EXTRACTION_FAILED = "Something went wrong!"

# (start, end) heading patterns keyed by section number.  Compiled once at
# import time instead of on every call.
#   1 -> "Item 1" up to "Item 1A ... Risk" / "Item 2 ... Prop"
#   2 -> "Item 1A ... Risk" up to "Item 2 ... Prop" / the next "Item 1"
#   3 -> "Item 7 ... M" up to "Item 7A Quanti" / "Item 8"
_SECTION_PATTERNS = {
    1: (
        re.compile(r"item\s*[1][\.\;\:\-\_]*\s*\b", re.IGNORECASE),
        re.compile(
            r"item\s*1a[\.\;\:\-\_]\s*Risk|item\s*2[\.\,\;\:\-\_]\s*Prop",
            re.IGNORECASE,
        ),
    ),
    2: (
        re.compile(r"(?<!,\s)item\s*1a[\.\;\:\-\_]\s*Risk", re.IGNORECASE),
        re.compile(
            r"item\s*2[\.\;\:\-\_]\s*Prop|item\s*[1][\.\;\:\-\_]*\s*\b",
            re.IGNORECASE,
        ),
    ),
    3: (
        re.compile(r"item\s*[7][\.\;\:\-\_]*\s*\bM", re.IGNORECASE),
        re.compile(
            r"item\s*7a[\.\;\:\-\_]\sQuanti|item\s*8[\.\,\;\:\-\_]\s*",
            re.IGNORECASE,
        ),
    ),
}


def _get_text(content):
    """Return the visible text of HTML *content* as one ASCII-only line."""
    text = bs(content, "html.parser").get_text()
    # NFKD decomposition followed by an ASCII encode with errors ignored
    # drops every character that has no ASCII form.
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    return text.replace("\n", " ")


def _extract_text(text, item_start, item_end):
    """Return the longest slice of *text* from a start match to the next end match.

    Each match of *item_start* is paired with the first match of *item_end*
    located after it; the widest pair wins (the earliest one on ties).
    Returns ``_EXTRACTION_FAILED`` when no such pair exists.
    """
    starts = [m.start() for m in item_start.finditer(text)]
    ends = [m.start() for m in item_end.finditer(text)]

    spans = []
    for start in starts:
        # finditer yields matches left to right, so the first end offset
        # greater than ``start`` is the nearest following end heading.
        end = next((e for e in ends if e > start), None)
        if end is not None:
            spans.append((start, end))

    if not spans:
        return _EXTRACTION_FAILED

    begin, finish = max(spans, key=lambda span: span[1] - span[0])
    return text[begin:finish]


def parse_10k_filing(content, section):
    """Extract one or all of three item sections from a 10-K filing.

    Args:
        content: HTML markup of the filing; it is parsed with
            BeautifulSoup's ``html.parser``.
        section: Which section to return:
            ``1`` - text starting at the "Item 1" heading,
            ``2`` - text starting at the "Item 1A ... Risk" heading,
            ``3`` - text starting at the "Item 7 ... M" heading,
            ``0`` - all three, in that order.

    Returns:
        A list of strings: three entries for ``section == 0``, otherwise one.
        An entry is ``"Something went wrong!"`` when the section's start/end
        headings could not be paired in the filing text.

    Raises:
        SystemExit: if *section* is not 0, 1, 2 or 3 (after printing
            "Not a valid section").
    """
    if section not in (0, 1, 2, 3):
        print("Not a valid section")
        sys.exit()

    text = _get_text(content)
    wanted = (1, 2, 3) if section == 0 else (section,)
    return [_extract_text(text, *_SECTION_PATTERNS[key]) for key in wanted]