Skip to content

Commit

Permalink
Create selenium_dag.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ldsalomone authored May 6, 2024
1 parent db1ccba commit a239382
Showing 1 changed file with 98 additions and 0 deletions.
98 changes: 98 additions & 0 deletions dags/selenium_dag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.bash import BashOperator
from datetime import datetime
import asyncio
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromiumService
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.os_manager import ChromeType


def get_table_data(driver):
# Find the table element by its ID
table = driver.find_element(By.XPATH, '//*[@id="ContentPlaceHolder2_LData"]')
# Find all rows of the table
rows = table.find_elements(By.TAG_NAME, "tr")

table_data = []
print(len(rows))
for i, row in enumerate(rows[4:]): # Skip the first four rows
if i % 100 == 0:
print(f"Processing row {i}")
# Find all cells of the row
cells = row.find_elements(By.TAG_NAME, "td")
# Extract text from each cell and append to table_data
table_data.append([cell.text for cell in cells])

return table_data


def run_selenium():
# import chromedriver_autoinstaller

# chromedriver_autoinstaller.install()
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
# driver = webdriver.Chrome(
# service=ChromiumService(
# ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install(),
# options=options,
# )
# )

# service = webdriver.ChromeService(executable_path=chromedriver_bin)

# driver = webdriver.Chrome(service=service, options=options)
driver = webdriver.Chrome(options=options)

driver.get("https://oop.ky.gov/")

# Find and interact with elements using Selenium
pc_checkbox = driver.find_element(
By.XPATH, '//*[@id="ContentPlaceHolder2_chkBoards_9"]'
)
pc_checkbox.click()

all_data = []

for letter in "AB":
print(f"Scraping for {letter}...")
last_name_box = driver.find_element(
By.XPATH, '//*[@id="ContentPlaceHolder2_TLname"]'
)
last_name_box.clear()
last_name_box.send_keys(letter)

search_button = driver.find_element(
By.XPATH, '//*[@id="ContentPlaceHolder2_BSrch"]'
)
search_button.click()
time.sleep(5)
print("---getting table data...")
t_data = get_table_data(driver)
print(f"---{len(t_data)} rows retrieved...")
all_data.extend(t_data)

return all_data


# Define a DAG that runs the playwright script every day
dag = DAG(
dag_id="selenium_dag",
start_date=datetime(2021, 1, 1),
schedule_interval="@daily",
catchup=False,
)

# Define a task that runs the playwright script using the PythonOperator
selenium_task = PythonOperator(
task_id="selenium_task",
python_callable=run_selenium,
dag=dag,
)

0 comments on commit a239382

Please sign in to comment.