-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbrowser.py
153 lines (129 loc) · 6.29 KB
/
browser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
from bs4 import BeautifulSoup
import time
from enum import Enum
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.remote.webdriver import WebDriver
from college import CollegeSearchResult
from college import Filters
import pandas as pd
import requests
import re
import openpyxl
from selenium import webdriver
from enum import Enum
'''
The relevant element we are looking for is the card element that houses the information we wish to scrape.
However the element is composed of other elements as well and the information is spread out amongst those elements. Below is a simplified subtree for the container element.
cs-college-card-outer-container -> (the base)
...
cs-college-card-college-name-link cb-roboto-medium cb-black1-color-> (where the href is stored)
...
...
cs-college-card-college-name cb-roboto-medium -> (where the name is stored)
...
cs-college-card-college-address cb-roboto-light cb-paragraph1-st cb-padding-top-8 -> (where the location is stored)
...
'''
def getSchools():
    """Scrape the College Board BigFuture search page and return a list of CollegeSearchResult.

    Launches a headless Edge browser, opens the search-result page with the
    SAT-ascending filter applied, clicks "load more" once to expand the list,
    then parses every college card for name, location, characteristics,
    graduation rate, average price, SAT range, and the profile href.

    Returns:
        list[CollegeSearchResult]: one entry per fully-populated college card.
    """
    driver_path = r"C:\Users\12035\edgedriver_win32\msedgedriver.exe"
    service = EdgeService(driver_path)
    options = EdgeOptions()
    options.add_argument("headless")
    driver = webdriver.Edge(service=service, options=options)
    # Programmatically set the filter and enter the College Board website.
    # (Renamed from `filter` so the builtin is not shadowed.)
    search_filter = Filters.SATascending.value
    # entryURL is the scraper's entry point into the domain: a search-result
    # page that displays colleges indexed by College Board.
    entryURL = "https://bigfuture.collegeboard.org/college-search/filters?"
    modifiedURL = f"{entryURL}{search_filter}"
    driver.get(modifiedURL)
    # Wait for at least one college card to be visible; time out after 2 s.
    wait = WebDriverWait(driver, 2)  # Adjust the timeout as needed
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.cs-college-card-outer-container')))
    # Load additional colleges before scraping the page source.
    loadMore(driver=driver)
    searchResults = []
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Parsing the search result webpage for relevant information.
    containers = soup.find_all(class_='cs-college-card-outer-container')
    for container in containers:
        # The relevant information needed to build a CollegeSearchResult is
        # located in multiple subelements that must be searched accordingly.
        hrefElement = container.find(class_='cs-college-card-college-name-link cb-roboto-medium cb-black1-color')
        nameElement = container.find(class_='cs-college-card-college-name-link-text')
        adddressElement = container.find(class_='cs-college-card-college-address')
        characteristicsElement = container.find(class_='cs-college-card-details-profile-inline-list cb-text-list cs-college-card-details-profile-info-text')
        # Details:
        # The detail/profile elements share one class name
        # (cb-no-padding cs-college-card-details-profile-info-text), so we
        # search by their 'data-testid' attribute — a pseudo-name likely added
        # by the College Board devs to distinguish the elements when accessing
        # or debugging the html.
        graduationElement = container.find(attrs={"data-testid": "cs-college-card-details-profile-school-graduation-rate"})
        apyElement = container.find(attrs={"data-testid": "cs-college-card-details-profile-school-average-cost"})
        satElement = container.find(attrs={"data-testid": "cs-college-card-details-profile-school-sat-range"})
        # Skip malformed cards rather than crashing on a missing subelement.
        elements = (hrefElement, nameElement, adddressElement,
                    characteristicsElement, graduationElement, apyElement, satElement)
        if any(e is None for e in elements):
            continue
        searchResult = CollegeSearchResult(
            name = nameElement.text,
            location = adddressElement.text,
            characteristics = characteristicsElement.text,
            graduation_rate = graduationElement.text,
            apy = apyElement.text,
            sat = satElement.text,
            href = hrefElement.get('href')
        )
        searchResults.append(searchResult)
    return searchResults
# Loads additional colleges by pressing the load more button
def loadMore(driver):
    """Click the "Load More" button once so additional colleges render.

    Scrolls the button into view, nudges the viewport up so it is not
    covered by an overlay, then waits for the button to become clickable
    and clicks it.

    Args:
        driver: a selenium WebDriver already on the search-result page.
    """
    LOAD_MORE_SELECTOR = 'button.cb-btn.cb-btn-black.cb-btn-block'
    element = driver.find_element(By.CSS_SELECTOR, LOAD_MORE_SELECTOR)  # Identify the load more button
    driver.execute_script("arguments[0].scrollIntoView();", element)  # Scrolls the element into view
    time.sleep(1)
    # Scroll back up slightly so a fixed header does not overlap the button.
    driver.execute_script("window.scrollBy(0, -275);")
    time.sleep(1)
    # Wait until the button is actually clickable — presence alone does not
    # guarantee the click will succeed.
    clickable_element = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, LOAD_MORE_SELECTOR))
    )
    clickable_element.click()
    time.sleep(1)
# Configure the Selenium Browser
# def configureBrowser() -> WebDriver:
# driver_path = r"C:\Users\12035\edgedriver_win32\msedgedriver.exe"
# service = EdgeService(driver_path)
# options = EdgeOptions()
# options.add_argument("headless")
# driver = webdriver.Edge(service=service, options=options)
# return driver
# Crawls the profile page for the average net price
def tuitionCrawler(url):
    """Fetch a college's tuition-and-costs page and extract the net price text.

    Args:
        url: BigFuture college profile URL, e.g.
             https://bigfuture.collegeboard.org/colleges/<slug>

    Returns:
        (college_name, tuition_text) on success, or None when the request
        fails or the tuition element cannot be found on the page.
    """
    website = url + "/tuition-and-costs"
    # Derive the college slug from the last path segment instead of slicing
    # at a hard-coded character offset (the original used url[44:]).
    college_name = url.rstrip('/').rsplit('/', 1)[-1]
    response = requests.get(url=website)
    if not response.ok:
        return None
    soup = BeautifulSoup(response.content, 'html.parser')
    content_of_page = soup.find('main')
    if content_of_page is None:
        return None
    # NOTE(review): this generated class name is brittle and may change on
    # a site redeploy — verify the selector still matches.
    tuition_tag = content_of_page.find(class_='sc-f0cac891-3 bhdQaF cb-margin-bottom-16')
    if tuition_tag is None:
        return None
    return college_name, tuition_tag.text
# Generates an xlsx file
def toExcel(dataFrame):
    """Write *dataFrame* to 'college_tuition.xlsx' on sheet 'college_sheet'.

    Args:
        dataFrame: a pandas DataFrame of scraped college tuition data.

    Returns:
        None. Side effect: creates/overwrites the xlsx file in the
        current working directory (requires the openpyxl engine).
    """
    output_path = 'college_tuition.xlsx'
    dataFrame.to_excel(output_path, sheet_name='college_sheet')
    return