Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: selenium website loader #28

Merged
merged 7 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dyana/loaders/website/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
dyana.py
17 changes: 17 additions & 0 deletions dyana/loaders/website/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
FROM python:3.10-alpine

# install chromedriver
RUN apk update
RUN apk add chromium chromium-chromedriver

ENV PYTHONUNBUFFERED 1
RUN pip install --upgrade pip
RUN pip install selenium

WORKDIR /app
COPY dyana.py .
COPY main.py .

ENV DISPLAY=:99

ENTRYPOINT ["python3", "-W", "ignore", "main.py"]
2 changes: 2 additions & 0 deletions dyana/loaders/website/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
all:
docker build -t dyana-website-loader .
168 changes: 168 additions & 0 deletions dyana/loaders/website/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import argparse
import json
import shutil
import time
import typing
from typing import Any

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from dyana import Profiler # type: ignore[attr-defined]


def collect_performance_metrics(driver: webdriver.Chrome) -> dict[str, Any]:
"""Collect detailed performance metrics using Chrome DevTools Protocol."""
metrics = {}

# Navigation Timing API metrics
navigation_timing = driver.execute_script("""
const performance = window.performance;
const timing = performance.timing;
return {
'navigationStart': timing.navigationStart,
'responseEnd': timing.responseEnd,
'domComplete': timing.domComplete,
'loadEventEnd': timing.loadEventEnd,
'pageLoadTime': timing.loadEventEnd - timing.navigationStart,
'dnsLookupTime': timing.domainLookupEnd - timing.domainLookupStart,
'tcpConnectTime': timing.connectEnd - timing.connectStart,
'serverResponseTime': timing.responseEnd - timing.requestStart,
'domProcessingTime': timing.domComplete - timing.domLoading
};
""")
metrics["timing"] = navigation_timing

# Memory info
memory_info = driver.execute_script("""
return {
'jsHeapSizeLimit': window.performance.memory.jsHeapSizeLimit,
'totalJSHeapSize': window.performance.memory.totalJSHeapSize,
'usedJSHeapSize': window.performance.memory.usedJSHeapSize
};
""")
metrics["memory"] = memory_info

# Resource timing data
resource_timing = driver.execute_script("""
return performance.getEntriesByType('resource').map(entry => ({
name: entry.name,
entryType: entry.entryType,
startTime: entry.startTime,
duration: entry.duration,
initiatorType: entry.initiatorType
}));
""")
metrics["resources"] = resource_timing

return metrics


def analyze_page_content(driver: webdriver.Chrome) -> dict[str, Any]:
"""Analyze page content and structure."""
return typing.cast(
dict[str, Any],
driver.execute_script("""
return {
'elements': document.getElementsByTagName('*').length,
'images': document.getElementsByTagName('img').length,
'links': document.getElementsByTagName('a').length,
'scripts': document.getElementsByTagName('script').length,
'styles': document.getElementsByTagName('link').length,
'iframes': document.getElementsByTagName('iframe').length,
'documentSize': document.documentElement.innerHTML.length,
};
"""),
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Profile website performance")
parser.add_argument("--url", help="URL to open", required=True)
parser.add_argument("--wait-for", help="CSS selector to wait for", default=None)
parser.add_argument("--timeout", help="Timeout in seconds", type=int, default=30)
args = parser.parse_args()

# Normalize URL by adding https:// if protocol is missing
if "://" not in args.url:
args.url = f"https://{args.url}"

profiler: Profiler = Profiler()

try:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
# Enable performance logging
chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"})

driver = webdriver.Chrome(options=chrome_options, service=Service(shutil.which("chromedriver")))
driver.implicitly_wait(10)

profiler.track_memory("before_load")

start_time = time.time()
driver.get(args.url)

# Wait for specific element if requested
if args.wait_for:
try:
WebDriverWait(driver, args.timeout).until(
EC.presence_of_element_located((By.CSS_SELECTOR, args.wait_for))
)
except TimeoutException:
profiler.track_error("wait", f"Timeout waiting for element: {args.wait_for}")

load_time = time.time() - start_time
profiler.track_memory("after_load")

# Collect performance metrics
try:
metrics = collect_performance_metrics(driver)
content_analysis = analyze_page_content(driver)

# Get console logs
console_logs = driver.get_log("browser")

# Get network logs
network_logs = driver.get_log("performance")

# Add all metrics to profiler
profiler.extra = {
"load_time": load_time,
"performance_metrics": metrics,
"content_analysis": content_analysis,
"console_logs": console_logs,
"network_logs": network_logs,
"title": driver.title,
"url": driver.current_url,
"status_code": driver.execute_script("return window.performance.getEntries()[0].responseStatus"),
}
except Exception as e:
profiler.track_error("metrics", str(e))

# Take screenshot
try:
screenshot = driver.get_screenshot_as_base64()
profiler.extra["screenshot"] = screenshot
except Exception as e:
profiler.track_error("screenshot", str(e))

profiler.track_memory("after_profiling")

except Exception as e:
profiler.track_error("chrome", str(e))
finally:
try:
driver.quit()
profiler.track_memory("after_quit")
except Exception:
pass

print(json.dumps(profiler.as_dict()))
15 changes: 15 additions & 0 deletions dyana/loaders/website/settings.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
description: Opens a website in a headless browser and profiles its performance.
network: true

args:
- name: url
description: URL to open.
required: true
- name: wait-for
description: CSS selector to wait for before profiling.
required: false
- name: timeout
description: Timeout in seconds for page load and element wait.
required: false
type: int
default: "60"
Loading