-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathextract.py
54 lines (40 loc) · 1.15 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from bs4 import BeautifulSoup
import requests
import os
import csv
file_headers_1 = ['key-ratio', 'balance-sheet', 'profit-loss', 'cashflow', 'quarterly', 'half']
file_headers_2 = ['consolidated', 'standalone']
# Output CSV names: every pairing of an entry from file_headers_1 with an
# entry from file_headers_2, formatted as "<first>(<second>).csv". The first
# list varies slowest, so both variants of one header are adjacent.
filenames = [
    "{}({}).csv".format(primary, variant)
    for primary in file_headers_1
    for variant in file_headers_2
]
FILE = "company_urls.txt"  # Path of the text file containing the extracted URLs
# Load the URL list, one URL per line.
with open(FILE, 'r') as f:
    data = f.read()
# Strip surrounding whitespace and drop blank lines: a trailing newline in the
# file would otherwise yield an empty "URL" that has no path segments to index.
urls = [line.strip() for line in data.splitlines() if line.strip()]
# For each company URL: fetch its "ratio" page and write the page's tables
# as CSV files inside a directory named after the company.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever
for url in urls:
    url = url.strip()
    if not url:
        # Blank entries (e.g. from a trailing newline in the URL file).
        continue
    parts = url.split("/")
    if len(parts) < 5:
        # The company name is taken from the fifth "/"-separated piece.
        print("Skipping malformed URL: " + url)
        continue
    company = parts[4]
    # The page that is scraped is the same URL with an extra "ratio" segment
    # inserted right after the company piece.
    parts.insert(5, "ratio")
    page_url = "/".join(parts)

    os.makedirs(company, exist_ok=True)

    try:
        resp = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        # An HTTP error page is not data; do not parse it into CSV files.
        resp.raise_for_status()
    except requests.RequestException as exc:
        print("Failed to fetch {}: {}".format(page_url, exc))
        continue

    soup = BeautifulSoup(resp.content, 'html5lib')
    tables = soup.find_all("table")
    # Only the even-indexed tables (0, 2, 4, ...) are exported, paired in order
    # with `filenames`; zip stops once either sequence runs out.
    # NOTE(review): assumes the page lists its tables in the same order as
    # `filenames` -- confirm against the live page layout.
    for filename, table in zip(filenames, tables[::2]):
        headers = [th.text for th in table.select("tr th")]
        # "tr + tr" matches every row that directly follows another row,
        # i.e. all rows except the first one of each group.
        rows = [
            [td.text for td in row.find_all("td")]
            for row in table.select("tr + tr")
        ]
        # Write via a joined path instead of chdir so a failure mid-loop cannot
        # leave the process in the wrong directory. newline="" is required by
        # the csv module to avoid blank rows on Windows.
        with open(os.path.join(company, filename), "w", newline="") as f:
            wr = csv.writer(f)
            wr.writerow(headers)
            wr.writerows(rows)
    print("Done")