Fixed Course Plan and Course Scraping Issues
- Changed workflow step names.
- Fixed an issue with course plan scraping caused by downloading the same Chrome driver from multiple threads.
- Added a "retry from where it left off" system to the course plan scraper.
AtaTrkgl committed Jan 4, 2025
1 parent d4e4622 commit 52df9b4
Showing 6 changed files with 30 additions and 16 deletions.
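
A note on the threading fix before the per-file diffs: driver-manager helpers in the webdriver_manager style download and cache the ChromeDriver binary on first use, so several threads triggering that first-time download at once can collide on the cache. The sketch below is illustrative only: it assumes selenium and webdriver_manager rather than the repo's actual DriverManager, and it shows a lock-based guard as one generic mitigation. The commit itself takes a different route (one eager driver in src/run.py, see the last diff).

# Illustrative sketch only (assumes selenium + webdriver_manager; not the
# repo's DriverManager). Serializing the install step avoids concurrent
# first-time downloads of the same ChromeDriver binary.
import threading
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

_install_lock = threading.Lock()

def create_driver() -> webdriver.Chrome:
    with _install_lock:  # only the download/cache step needs serializing
        driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path))

def fetch_title(url: str) -> str:
    driver = create_driver()  # one driver per thread, one binary download total
    try:
        driver.get(url)
        return driver.title
    finally:
        driver.quit()

if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=4) as pool:
        print(list(pool.map(fetch_title, ["https://example.com"] * 4)))
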
2 changes: 1 addition & 1 deletion .github/workflows/refresh_course_plans.yml
@@ -45,7 +45,7 @@ jobs:
           python src/run.py -scrap_target course_plan
       # # Commits the changes back to the data repo
-      # - name: Push course_plans.txt to Data
+      # - name: Push course_plans.txt to itu-helper/data
       #   uses: dmnemec/copy_file_to_another_repo_action@main
       #   env:
       #     API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
2 changes: 1 addition & 1 deletion .github/workflows/refresh_courses.yml
@@ -45,7 +45,7 @@ jobs:
           python src/run.py -scrap_target course
       # Commits the changes back to the data repo
-      - name: Push course_rows.txt to Data
+      - name: Push courses.psv to itu-helper/data
         uses: dmnemec/copy_file_to_another_repo_action@main
         env:
           API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
2 changes: 1 addition & 1 deletion .github/workflows/refresh_lessons.yml
@@ -45,7 +45,7 @@ jobs:
           python src/run.py -scrap_target lesson
       # Commits the changes back to the data repo
-      - name: Push lesson_rows.txt to Data
+      - name: Push lessons.psv to itu-helper/data
         uses: dmnemec/copy_file_to_another_repo_action@main
         env:
           API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
4 changes: 2 additions & 2 deletions .github/workflows/refresh_misc.yml
@@ -45,7 +45,7 @@ jobs:
           python src/run.py -scrap_target misc
       # Commits the changes back to the data repo
-      - name: Push building_codes.txt to Data
+      - name: Push building_codes.psv to itu-helper/data
         uses: dmnemec/copy_file_to_another_repo_action@main
         env:
           API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
@@ -56,7 +56,7 @@ jobs:
           user_name: 'ITU Helper'
 
       # Commits the changes back to the data repo
-      - name: Push programme_codes.txt to Data
+      - name: Push programme_codes.psv to itu-helper/data
         uses: dmnemec/copy_file_to_another_repo_action@main
         env:
           API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
26 changes: 20 additions & 6 deletions src/course_plan_scraper.py
@@ -153,7 +153,7 @@ def get_dropdown_options(self, name: str, driver=None, remove_first: bool=True,
             return options[1:] if len(options) > 1 else None
         return options if len(options) > 0 else None
 
-    def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str="") -> None:
+    def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str="", prev_loop_values: list | None=None) -> None:
         # Open the course plans page.
         Logger.log_info(f"{log_prefix} Starting fetching the faculty: [blue]\"{faculty_name}\"[/blue]")
         driver.get(COURSE_PLANS_URL)
@@ -177,7 +177,9 @@ def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str=
         self.faculty_course_plans[faculty_name] = dict()
 
         Logger.log(f"{log_prefix} Found the following program types for the faculty: [blue]{faculty_name}[/blue]: {', '.join([p.get_attribute('innerHTML') for p in program_types])}")
-        for program_type, program_type_name in self.get_attribute_element_pairs(program_types, "innerHTML"):
+        for i, (program_type, program_type_name) in enumerate(self.get_attribute_element_pairs(program_types, "innerHTML")):
+            if prev_loop_values is not None and i < prev_loop_values[0]: continue
+
             # Make sure the program type is allowed.
             if program_type_name not in ALLOWED_PROGRAM_TYPES:
                 # Logger.log_info(f"{log_prefix} Skipping the program type: [blue]{faculty_name}[/blue] [cyan]\"{program_type_name}\"[/cyan]. Not allowed")
@@ -189,14 +191,18 @@ def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str=
Logger.log_info(f"{log_prefix} Skipping the program type: [blue]{faculty_name}[/blue]/[cyan]{program_type_name}[/cyan]. Programs empty")
continue

for program, program_name in self.get_attribute_element_pairs(programs, "innerHTML"):
for j, (program, program_name) in enumerate(self.get_attribute_element_pairs(programs, "innerHTML")):
if prev_loop_values is not None and (j < prev_loop_values[1] and i == prev_loop_values[0]): continue

# Read the plan types, if it's empty, skip the program.
plan_types = self.create_dropdown_and_get_elements(self.get_plan_type_dropdown_options, program, driver=driver)
if plan_types is None:
Logger.log_info(f"{log_prefix} Skipping the program: [blue]{faculty_name}[/blue]/[cyan]{program_type_name}[/cyan]/[magenta]{program_name}\"[/magenta]. Plan Types empty")
continue

for plan_type, plan_type_value in self.get_attribute_element_pairs(plan_types, "value"):
for k, (plan_type, plan_type_value) in enumerate(self.get_attribute_element_pairs(plan_types, "value")):
if prev_loop_values is not None and (k <= prev_loop_values[2] and j == prev_loop_values[1] and i == prev_loop_values[0]): continue

# Make sure the plan type is allowed.
if plan_type_value not in ALLOWED_PLAN_TYPE_VALS: continue

@@ -211,8 +217,16 @@ def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str=
                     else:
                         self.faculty_course_plans[faculty_name][program_type_name].update(program_data)
 
-                    driver.execute_script("window.history.back();")
-                    # driver.back()
+                    driver.back()
+
+                    # After going back, the dropdown values are sometimes cleared. Locally they stay intact consistently,
+                    # but on GitHub Actions they sometimes reset, so we need to re-read them.
+                    # If the dropdowns are cleared, rerun this method, skipping the values that were already fetched.
+                    # That's why we use enumerate: k is skipped inclusively (that plan type already finished), while i and j are skipped strictly.
+                    place_holders = self.find_elements_by_css_selector("span.select2-selection__placeholder", driver)
+                    if len(place_holders) > 0:
+                        Logger.log_info(f"{log_prefix} Dropdown values are cleared, rerunning the method.")
+                        self.scrap_faculty_course_plans(faculty_name, driver, log_prefix, [i, j, k])
 
         Logger.log_info(f"{log_prefix} Finished Scraping The Faculty: [blue]\"{faculty_name}\"[/blue]")
 
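To see the retry scheme above in isolation, here is a self-contained sketch with two loop levels instead of three. All names (scrape_all, fetch, page_state_lost) are illustrative stand-ins, not the scraper's real API.

# Self-contained sketch of the "retry from where left off" pattern
# (illustrative names, not the real scraper API). prev holds the enumerate()
# indices of the last item that finished; on a rerun, the inner index is
# skipped inclusively (<=) because that item completed, while the outer
# index is skipped strictly (<).
_resets = iter([False, True, False, False])  # simulate one page-state loss

def page_state_lost() -> bool:
    # Stands in for checking "span.select2-selection__placeholder" after driver.back().
    return next(_resets, False)

def fetch(group: str, entry: str) -> None:
    print(f"fetched {group}/{entry}")  # the real code scrapes one course plan here

def scrape_all(data: dict[str, list[str]], prev: list | None = None) -> None:
    for i, (group, entries) in enumerate(data.items()):
        if prev is not None and i < prev[0]:
            continue  # whole group finished on an earlier pass
        for j, entry in enumerate(entries):
            if prev is not None and i == prev[0] and j <= prev[1]:
                continue  # already fetched, including the item that completed last
            fetch(group, entry)
            if page_state_lost():
                # The current item succeeded before the state was lost,
                # so resume right after it.
                scrape_all(data, prev=[i, j])
                return

scrape_all({"UUBF": ["plan-1", "plan-2"], "BBF": ["plan-1"]})

The inclusive skip on the inner index mirrors the k <= prev_loop_values[2] check in the diff above: that plan type completed before the page state was lost, so only everything after it still needs a pass.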
10 changes: 5 additions & 5 deletions src/run.py
@@ -132,21 +132,21 @@ def save_misc_data(data):
 if __name__ == "__main__":
     args = parser.parse_args()
     t0 = perf_counter()
 
+    # Even though some scrapers don't use the given driver, not creating it sometimes causes issues, especially on GitHub Actions.
+    # Creating a driver installs the Chrome WebDriver, and when multiple drivers are created on different threads, the downloads
+    # may conflict and cause issues. So, we create the driver here: even if it goes unused, the WebDriver gets installed once.
+    driver = DriverManager.create_driver()
 
     # Scrap Courses
     if args.scrap_target == "course":
         course_rows = CourseScraper(None).scrap_courses()
         save_course_rows(course_rows)
     # Scrap Course Plans
     elif args.scrap_target == "course_plan":
         faculty_course_plans = CoursePlanScraper(driver).scrap_course_plans()
         save_course_plans(faculty_course_plans)
-    # Scrap Building Codes and Programme Codes
-    elif args.scrap_target == "misc":
+    elif args.scrap_target == "misc":  # Scrap Building Codes and Programme Codes
         data = MiscScraper().scrap_data()
         save_misc_data(data)
     # Scrap Lessons
     elif args.scrap_target == "lesson":
         lesson_rows = LessonScraper(driver).scrap_tables()
         save_lesson_rows(lesson_rows)
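Read as a standalone pattern, the run.py change is: create one driver eagerly on the main thread so the WebDriver binary is cached before any scraper spawns threads, and pass None to scrapers that create their own drivers per thread. A minimal sketch follows, assuming a webdriver_manager-backed DriverManager; its real body is not part of this diff.

# Sketch of the pattern adopted in run.py above. The DriverManager body is an
# assumption (it is not shown in this diff); webdriver_manager caches the
# binary, so one eager create_driver() call warms the cache for later threads.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

class DriverManager:
    @staticmethod
    def create_driver() -> webdriver.Chrome:
        # First call downloads and caches ChromeDriver; later calls reuse it.
        return webdriver.Chrome(service=Service(ChromeDriverManager().install()))

if __name__ == "__main__":
    # Create one driver eagerly, even for targets that never touch it, so the
    # WebDriver install happens exactly once, on the main thread.
    driver = DriverManager.create_driver()
    try:
        # Dispatch to a scraper here. Scrapers that create per-thread drivers
        # internally receive None instead (e.g. CourseScraper(None) above),
        # so worker threads never share this instance.
        pass
    finally:
        driver.quit()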
