Fixed Course Plan and Course Scraping Issues
- Changed workflow step names.
- Fixed an issue with course plan scraping caused by downloading the same Chrome driver from multiple threads.
- Added a "retry from where it left off" system to the course plan scraper.
AtaTrkgl committed Jan 4, 2025
1 parent d4e4622 commit 52df9b4
Showing 6 changed files with 30 additions and 16 deletions.
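
A note on the threading fix before the per-file diffs: driver-manager helpers in the webdriver_manager style download and cache the ChromeDriver binary on first use, so several threads triggering that first-time download at once can collide on the cache. The sketch below is illustrative only: it assumes selenium and webdriver_manager rather than the repo's actual DriverManager, and it shows a lock-based guard as one generic mitigation. The commit itself takes a different route (one eager driver in src/run.py, see the last diff).

# Illustrative sketch only (assumes selenium + webdriver_manager; not the
# repo's DriverManager). Serializing the install step avoids concurrent
# first-time downloads of the same ChromeDriver binary.
import threading
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

_install_lock = threading.Lock()

def create_driver() -> webdriver.Chrome:
    with _install_lock:  # only the download/cache step needs serializing
        driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path))

def fetch_title(url: str) -> str:
    driver = create_driver()  # one driver per thread, one binary download total
    try:
        driver.get(url)
        return driver.title
    finally:
        driver.quit()

if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=4) as pool:
        print(list(pool.map(fetch_title, ["https://example.com"] * 4)))
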
2 changes: 1 addition & 1 deletion .github/workflows/refresh_course_plans.yml
@@ -45,7 +45,7 @@ jobs:
           python src/run.py -scrap_target course_plan
       # # Commits the changes back to the data repo
-      # - name: Push course_plans.txt to Data
+      # - name: Push course_plans.txt to itu-helper/data
       #   uses: dmnemec/copy_file_to_another_repo_action@main
       #   env:
       #     API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
2 changes: 1 addition & 1 deletion .github/workflows/refresh_courses.yml
@@ -45,7 +45,7 @@ jobs:
           python src/run.py -scrap_target course
       # Commits the changes back to the data repo
-      - name: Push course_rows.txt to Data
+      - name: Push courses.psv to itu-helper/data
         uses: dmnemec/copy_file_to_another_repo_action@main
         env:
           API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
2 changes: 1 addition & 1 deletion .github/workflows/refresh_lessons.yml
@@ -45,7 +45,7 @@ jobs:
           python src/run.py -scrap_target lesson
       # Commits the changes back to the data repo
-      - name: Push lesson_rows.txt to Data
+      - name: Push lessons.psv to itu-helper/data
         uses: dmnemec/copy_file_to_another_repo_action@main
         env:
           API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
4 changes: 2 additions & 2 deletions .github/workflows/refresh_misc.yml
@@ -45,7 +45,7 @@ jobs:
           python src/run.py -scrap_target misc
       # Commits the changes back to the data repo
-      - name: Push building_codes.txt to Data
+      - name: Push building_codes.psv to itu-helper/data
         uses: dmnemec/copy_file_to_another_repo_action@main
         env:
           API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
@@ -56,7 +56,7 @@ jobs:
           user_name: 'ITU Helper'
 
       # Commits the changes back to the data repo
-      - name: Push programme_codes.txt to Data
+      - name: Push programme_codes.psv to itu-helper/data
         uses: dmnemec/copy_file_to_another_repo_action@main
         env:
           API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
26 changes: 20 additions & 6 deletions src/course_plan_scraper.py
@@ -153,7 +153,7 @@ def get_dropdown_options(self, name: str, driver=None, remove_first: bool=True,
             return options[1:] if len(options) > 1 else None
         return options if len(options) > 0 else None
 
-    def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str="") -> None:
+    def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str="", prev_loop_values: list | None=None) -> None:
         # Open the course plans page.
         Logger.log_info(f"{log_prefix} Starting fetching the faculty: [blue]\"{faculty_name}\"[/blue]")
         driver.get(COURSE_PLANS_URL)
@@ -177,7 +177,9 @@ def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str=
         self.faculty_course_plans[faculty_name] = dict()
 
         Logger.log(f"{log_prefix} Found the following program types for the faculty: [blue]{faculty_name}[/blue]: {', '.join([p.get_attribute('innerHTML') for p in program_types])}")
-        for program_type, program_type_name in self.get_attribute_element_pairs(program_types, "innerHTML"):
+        for i, (program_type, program_type_name) in enumerate(self.get_attribute_element_pairs(program_types, "innerHTML")):
+            if prev_loop_values is not None and i < prev_loop_values[0]: continue
+
             # Make sure the program type is allowed.
             if program_type_name not in ALLOWED_PROGRAM_TYPES:
                 # Logger.log_info(f"{log_prefix} Skipping the program type: [blue]{faculty_name}[/blue] [cyan]\"{program_type_name}\"[/cyan]. Not allowed")
@@ -189,14 +191,18 @@ def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str=
Logger.log_info(f"{log_prefix} Skipping the program type: [blue]{faculty_name}[/blue]/[cyan]{program_type_name}[/cyan]. Programs empty")
continue

for program, program_name in self.get_attribute_element_pairs(programs, "innerHTML"):
for j, (program, program_name) in enumerate(self.get_attribute_element_pairs(programs, "innerHTML")):
if prev_loop_values is not None and (j < prev_loop_values[1] and i == prev_loop_values[0]): continue

# Read the plan types, if it's empty, skip the program.
plan_types = self.create_dropdown_and_get_elements(self.get_plan_type_dropdown_options, program, driver=driver)
if plan_types is None:
Logger.log_info(f"{log_prefix} Skipping the program: [blue]{faculty_name}[/blue]/[cyan]{program_type_name}[/cyan]/[magenta]{program_name}\"[/magenta]. Plan Types empty")
continue

for plan_type, plan_type_value in self.get_attribute_element_pairs(plan_types, "value"):
for k, (plan_type, plan_type_value) in enumerate(self.get_attribute_element_pairs(plan_types, "value")):
if prev_loop_values is not None and (k <= prev_loop_values[2] and j == prev_loop_values[1] and i == prev_loop_values[0]): continue

# Make sure the plan type is allowed.
if plan_type_value not in ALLOWED_PLAN_TYPE_VALS: continue

@@ -211,8 +217,16 @@ def scrap_faculty_course_plans(self, faculty_name: str, driver, log_prefix: str=
                     else:
                         self.faculty_course_plans[faculty_name][program_type_name].update(program_data)
 
-                    driver.execute_script("window.history.back();")
-                    # driver.back()
+                    driver.back()
+
+                    # After going back, the dropdown values are sometimes cleared. Locally they stay intact consistently,
+                    # but on GitHub Actions they sometimes reset, so we need to re-read them.
+                    # If the dropdowns are cleared, rerun this method, skipping the values that were already fetched.
+                    # That's why we use enumerate: k is skipped inclusively (that plan type already finished), while i and j are skipped strictly.
+                    place_holders = self.find_elements_by_css_selector("span.select2-selection__placeholder", driver)
+                    if len(place_holders) > 0:
+                        Logger.log_info(f"{log_prefix} Dropdown values are cleared, rerunning the method.")
+                        self.scrap_faculty_course_plans(faculty_name, driver, log_prefix, [i, j, k])
 
         Logger.log_info(f"{log_prefix} Finished Scraping The Faculty: [blue]\"{faculty_name}\"[/blue]")
 
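To see the retry scheme above in isolation, here is a self-contained sketch with two loop levels instead of three. All names (scrape_all, fetch, page_state_lost) are illustrative stand-ins, not the scraper's real API.

# Self-contained sketch of the "retry from where left off" pattern
# (illustrative names, not the real scraper API). prev holds the enumerate()
# indices of the last item that finished; on a rerun, the inner index is
# skipped inclusively (<=) because that item completed, while the outer
# index is skipped strictly (<).
_resets = iter([False, True, False, False])  # simulate one page-state loss

def page_state_lost() -> bool:
    # Stands in for checking "span.select2-selection__placeholder" after driver.back().
    return next(_resets, False)

def fetch(group: str, entry: str) -> None:
    print(f"fetched {group}/{entry}")  # the real code scrapes one course plan here

def scrape_all(data: dict[str, list[str]], prev: list | None = None) -> None:
    for i, (group, entries) in enumerate(data.items()):
        if prev is not None and i < prev[0]:
            continue  # whole group finished on an earlier pass
        for j, entry in enumerate(entries):
            if prev is not None and i == prev[0] and j <= prev[1]:
                continue  # already fetched, including the item that completed last
            fetch(group, entry)
            if page_state_lost():
                # The current item succeeded before the state was lost,
                # so resume right after it.
                scrape_all(data, prev=[i, j])
                return

scrape_all({"UUBF": ["plan-1", "plan-2"], "BBF": ["plan-1"]})

The inclusive skip on the inner index mirrors the k <= prev_loop_values[2] check in the diff above: that plan type completed before the page state was lost, so only everything after it still needs a pass.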
10 changes: 5 additions & 5 deletions src/run.py
@@ -132,21 +132,21 @@ def save_misc_data(data):
 if __name__ == "__main__":
     args = parser.parse_args()
     t0 = perf_counter()
 
+    # Even though some scrapers don't use the given driver, not creating it sometimes causes issues, especially on GitHub Actions.
+    # Creating a driver installs the Chrome WebDriver, and when multiple drivers are created on different threads, the downloads
+    # may conflict and cause issues. So, we create the driver here: even if it goes unused, the WebDriver gets installed once.
+    driver = DriverManager.create_driver()
 
     # Scrap Courses
     if args.scrap_target == "course":
         course_rows = CourseScraper(None).scrap_courses()
         save_course_rows(course_rows)
     # Scrap Course Plans
     elif args.scrap_target == "course_plan":
         faculty_course_plans = CoursePlanScraper(driver).scrap_course_plans()
         save_course_plans(faculty_course_plans)
-    # Scrap Building Codes and Programme Codes
-    elif args.scrap_target == "misc":
+    elif args.scrap_target == "misc":  # Scrap Building Codes and Programme Codes
         data = MiscScraper().scrap_data()
         save_misc_data(data)
     # Scrap Lessons
     elif args.scrap_target == "lesson":
         lesson_rows = LessonScraper(driver).scrap_tables()
         save_lesson_rows(lesson_rows)
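Read as a standalone pattern, the run.py change is: create one driver eagerly on the main thread so the WebDriver binary is cached before any scraper spawns threads, and pass None to scrapers that create their own drivers per thread. A minimal sketch follows, assuming a webdriver_manager-backed DriverManager; its real body is not part of this diff.

# Sketch of the pattern adopted in run.py above. The DriverManager body is an
# assumption (it is not shown in this diff); webdriver_manager caches the
# binary, so one eager create_driver() call warms the cache for later threads.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

class DriverManager:
    @staticmethod
    def create_driver() -> webdriver.Chrome:
        # First call downloads and caches ChromeDriver; later calls reuse it.
        return webdriver.Chrome(service=Service(ChromeDriverManager().install()))

if __name__ == "__main__":
    # Create one driver eagerly, even for targets that never touch it, so the
    # WebDriver install happens exactly once, on the main thread.
    driver = DriverManager.create_driver()
    try:
        # Dispatch to a scraper here. Scrapers that create per-thread drivers
        # internally receive None instead (e.g. CourseScraper(None) above),
        # so worker threads never share this instance.
        pass
    finally:
        driver.quit()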
