From ec03887e0f740d060ac169f4236bf39a4ad320b3 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 21 Jan 2025 08:24:08 -0500 Subject: [PATCH 1/7] feat: selenium website loader --- dyana/loaders/website/.gitignore | 1 + dyana/loaders/website/Dockerfile | 17 +++ dyana/loaders/website/Makefile | 2 + dyana/loaders/website/main.py | 159 +++++++++++++++++++++++++++++ dyana/loaders/website/settings.yml | 13 +++ 5 files changed, 192 insertions(+) create mode 100644 dyana/loaders/website/.gitignore create mode 100644 dyana/loaders/website/Dockerfile create mode 100644 dyana/loaders/website/Makefile create mode 100644 dyana/loaders/website/main.py create mode 100644 dyana/loaders/website/settings.yml diff --git a/dyana/loaders/website/.gitignore b/dyana/loaders/website/.gitignore new file mode 100644 index 0000000..e6ec67b --- /dev/null +++ b/dyana/loaders/website/.gitignore @@ -0,0 +1 @@ +dyana.py \ No newline at end of file diff --git a/dyana/loaders/website/Dockerfile b/dyana/loaders/website/Dockerfile new file mode 100644 index 0000000..6367621 --- /dev/null +++ b/dyana/loaders/website/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.10-alpine + +# install chromedriver +RUN apk update +RUN apk add chromium chromium-chromedriver + +ENV PYTHONUNBUFFERED 1 +RUN pip install --upgrade pip +RUN pip install selenium + +WORKDIR /app +COPY dyana.py . +COPY main.py . + +ENV DISPLAY=:99 + +ENTRYPOINT ["python3", "-W", "ignore", "main.py"] \ No newline at end of file diff --git a/dyana/loaders/website/Makefile b/dyana/loaders/website/Makefile new file mode 100644 index 0000000..6accab6 --- /dev/null +++ b/dyana/loaders/website/Makefile @@ -0,0 +1,2 @@ +all: + docker build -t dyana-website-loader . 
diff --git a/dyana/loaders/website/main.py b/dyana/loaders/website/main.py new file mode 100644 index 0000000..2abe94e --- /dev/null +++ b/dyana/loaders/website/main.py @@ -0,0 +1,159 @@ +import argparse +import json +import shutil +import time +from typing import Dict, Any + +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from selenium.common.exceptions import TimeoutException + +from dyana import Profiler # type: ignore[attr-defined] + +def collect_performance_metrics(driver: webdriver.Chrome) -> Dict[str, Any]: + """Collect detailed performance metrics using Chrome DevTools Protocol.""" + metrics = {} + + # Navigation Timing API metrics + navigation_timing = driver.execute_script(""" + const performance = window.performance; + const timing = performance.timing; + return { + 'navigationStart': timing.navigationStart, + 'responseEnd': timing.responseEnd, + 'domComplete': timing.domComplete, + 'loadEventEnd': timing.loadEventEnd, + 'pageLoadTime': timing.loadEventEnd - timing.navigationStart, + 'dnsLookupTime': timing.domainLookupEnd - timing.domainLookupStart, + 'tcpConnectTime': timing.connectEnd - timing.connectStart, + 'serverResponseTime': timing.responseEnd - timing.requestStart, + 'domProcessingTime': timing.domComplete - timing.domLoading + }; + """) + metrics['timing'] = navigation_timing + + # Memory info + memory_info = driver.execute_script(""" + return { + 'jsHeapSizeLimit': window.performance.memory.jsHeapSizeLimit, + 'totalJSHeapSize': window.performance.memory.totalJSHeapSize, + 'usedJSHeapSize': window.performance.memory.usedJSHeapSize + }; + """) + metrics['memory'] = memory_info + + # Resource timing data + resource_timing = driver.execute_script(""" + return performance.getEntriesByType('resource').map(entry => ({ + name: entry.name, + 
entryType: entry.entryType, + startTime: entry.startTime, + duration: entry.duration, + initiatorType: entry.initiatorType + })); + """) + metrics['resources'] = resource_timing + + return metrics + +def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: + """Analyze page content and structure.""" + return driver.execute_script(""" + return { + 'elements': document.getElementsByTagName('*').length, + 'images': document.getElementsByTagName('img').length, + 'links': document.getElementsByTagName('a').length, + 'scripts': document.getElementsByTagName('script').length, + 'styles': document.getElementsByTagName('link').length, + 'iframes': document.getElementsByTagName('iframe').length, + 'documentSize': document.documentElement.innerHTML.length, + }; + """) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Profile website performance") + parser.add_argument("--url", help="URL to open", required=True) + parser.add_argument("--wait-for", help="CSS selector to wait for", default=None) + parser.add_argument("--timeout", help="Timeout in seconds", type=int, default=30) + args = parser.parse_args() + profiler: Profiler = Profiler() + + try: + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--headless=new") + chrome_options.add_argument("--disable-dev-shm-usage") + chrome_options.add_argument("--window-size=1920,1080") + chrome_options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess") + # Enable performance logging + chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}) + + driver = webdriver.Chrome(options=chrome_options, service=Service(shutil.which("chromedriver"))) + driver.implicitly_wait(10) + + profiler.track_memory("before_load") + + start_time = time.time() + driver.get(args.url) + + # Wait for specific element if requested + if args.wait_for: + try: + WebDriverWait(driver, 
args.timeout).until( + EC.presence_of_element_located((By.CSS_SELECTOR, args.wait_for)) + ) + except TimeoutException: + profiler.track_error("wait", f"Timeout waiting for element: {args.wait_for}") + + load_time = time.time() - start_time + profiler.track_memory("after_load") + + # Collect performance metrics + try: + metrics = collect_performance_metrics(driver) + content_analysis = analyze_page_content(driver) + + # Get console logs + console_logs = driver.get_log('browser') + + # Get network logs + network_logs = driver.get_log('performance') + + # Add all metrics to profiler + profiler.extra = { + "load_time": load_time, + "performance_metrics": metrics, + "content_analysis": content_analysis, + "console_logs": console_logs, + "network_logs": network_logs, + "title": driver.title, + "url": driver.current_url, + "status_code": driver.execute_script( + "return window.performance.getEntries()[0].responseStatus" + ), + } + except Exception as e: + profiler.track_error("metrics", str(e)) + + # Take screenshot + try: + screenshot = driver.get_screenshot_as_base64() + profiler.extra["screenshot"] = screenshot + except Exception as e: + profiler.track_error("screenshot", str(e)) + + profiler.track_memory("after_profiling") + + except Exception as e: + profiler.track_error("chrome", str(e)) + finally: + try: + driver.quit() + profiler.track_memory("after_quit") + except: + pass + + print(json.dumps(profiler.as_dict())) \ No newline at end of file diff --git a/dyana/loaders/website/settings.yml b/dyana/loaders/website/settings.yml new file mode 100644 index 0000000..f60d3e8 --- /dev/null +++ b/dyana/loaders/website/settings.yml @@ -0,0 +1,13 @@ +description: Opens a website in a headless browser and profiles its performance. + +args: + - name: url + description: URL to open. + required: true + - name: wait-for + description: CSS selector to wait for before profiling. + required: false + - name: timeout + description: Timeout in seconds for page load and element wait. 
+ required: false + default: "30" From ca5527e31b3814776bb4f2233d695a2f1c011cbb Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 21 Jan 2025 08:24:29 -0500 Subject: [PATCH 2/7] fix: add normalization to hostname --- dyana/loaders/website/main.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/dyana/loaders/website/main.py b/dyana/loaders/website/main.py index 2abe94e..1dc4b94 100644 --- a/dyana/loaders/website/main.py +++ b/dyana/loaders/website/main.py @@ -13,6 +13,7 @@ from dyana import Profiler # type: ignore[attr-defined] + def collect_performance_metrics(driver: webdriver.Chrome) -> Dict[str, Any]: """Collect detailed performance metrics using Chrome DevTools Protocol.""" metrics = {} @@ -33,7 +34,7 @@ def collect_performance_metrics(driver: webdriver.Chrome) -> Dict[str, Any]: 'domProcessingTime': timing.domComplete - timing.domLoading }; """) - metrics['timing'] = navigation_timing + metrics["timing"] = navigation_timing # Memory info memory_info = driver.execute_script(""" @@ -43,7 +44,7 @@ def collect_performance_metrics(driver: webdriver.Chrome) -> Dict[str, Any]: 'usedJSHeapSize': window.performance.memory.usedJSHeapSize }; """) - metrics['memory'] = memory_info + metrics["memory"] = memory_info # Resource timing data resource_timing = driver.execute_script(""" @@ -55,10 +56,11 @@ def collect_performance_metrics(driver: webdriver.Chrome) -> Dict[str, Any]: initiatorType: entry.initiatorType })); """) - metrics['resources'] = resource_timing + metrics["resources"] = resource_timing return metrics + def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: """Analyze page content and structure.""" return driver.execute_script(""" @@ -73,12 +75,18 @@ def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: }; """) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Profile website performance") 
parser.add_argument("--url", help="URL to open", required=True) parser.add_argument("--wait-for", help="CSS selector to wait for", default=None) parser.add_argument("--timeout", help="Timeout in seconds", type=int, default=30) args = parser.parse_args() + + # Normalize URL by adding https:// if protocol is missing + if "://" not in args.url: + args.url = f"https://{args.url}" + profiler: Profiler = Profiler() try: @@ -117,10 +125,10 @@ def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: content_analysis = analyze_page_content(driver) # Get console logs - console_logs = driver.get_log('browser') + console_logs = driver.get_log("browser") # Get network logs - network_logs = driver.get_log('performance') + network_logs = driver.get_log("performance") # Add all metrics to profiler profiler.extra = { @@ -131,9 +139,7 @@ def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: "network_logs": network_logs, "title": driver.title, "url": driver.current_url, - "status_code": driver.execute_script( - "return window.performance.getEntries()[0].responseStatus" - ), + "status_code": driver.execute_script("return window.performance.getEntries()[0].responseStatus"), } except Exception as e: profiler.track_error("metrics", str(e)) @@ -156,4 +162,4 @@ def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: except: pass - print(json.dumps(profiler.as_dict())) \ No newline at end of file + print(json.dumps(profiler.as_dict())) From 5236e95474a7f82d4d52f1a2168394ab7607748b Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 21 Jan 2025 08:50:30 -0500 Subject: [PATCH 3/7] fix: keep settings simple for first iteration --- dyana/loaders/website/settings.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/dyana/loaders/website/settings.yml b/dyana/loaders/website/settings.yml index f60d3e8..059f21e 100644 --- a/dyana/loaders/website/settings.yml +++ 
b/dyana/loaders/website/settings.yml @@ -1,13 +1,7 @@ -description: Opens a website in a headless browser and profiles its performance. +description: Opens a website in a headless browser. +network: true args: - name: url description: URL to open. required: true - - name: wait-for - description: CSS selector to wait for before profiling. - required: false - - name: timeout - description: Timeout in seconds for page load and element wait. - required: false - default: "30" From 14e9dd507a6df005dc6ce76664b6cab2ba15057e Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 21 Jan 2025 08:58:34 -0500 Subject: [PATCH 4/7] fix: keep to sixty default timeout --- dyana/loaders/website/settings.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dyana/loaders/website/settings.yml b/dyana/loaders/website/settings.yml index 059f21e..fcb20c3 100644 --- a/dyana/loaders/website/settings.yml +++ b/dyana/loaders/website/settings.yml @@ -1,7 +1,14 @@ -description: Opens a website in a headless browser. -network: true +description: Opens a website in a headless browser and profiles its performance. args: - name: url description: URL to open. required: true + - name: wait-for + description: CSS selector to wait for before profiling. + required: false + - name: timeout + description: Timeout in seconds for page load and element wait. 
+ required: false + type: int + default: "60" From 5d251fa9a97f550975ffb78a7a8e16a963ae25d3 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 21 Jan 2025 08:59:49 -0500 Subject: [PATCH 5/7] chore: add network true to website settings --- dyana/loaders/website/settings.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/dyana/loaders/website/settings.yml b/dyana/loaders/website/settings.yml index fcb20c3..601b005 100644 --- a/dyana/loaders/website/settings.yml +++ b/dyana/loaders/website/settings.yml @@ -1,4 +1,5 @@ description: Opens a website in a headless browser and profiles its performance. +network: true args: - name: url From b6003bd60b0151e4fed52273dd081be860a70d75 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 21 Jan 2025 09:09:14 -0500 Subject: [PATCH 6/7] fix: ci checks for lint and typechecks --- dyana/loaders/website/main.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/dyana/loaders/website/main.py b/dyana/loaders/website/main.py index 1dc4b94..b2ef3af 100644 --- a/dyana/loaders/website/main.py +++ b/dyana/loaders/website/main.py @@ -2,19 +2,20 @@ import json import shutil import time -from typing import Dict, Any +import typing +from typing import Any from selenium import webdriver +from selenium.common.exceptions import TimeoutException from selenium.webdriver.chrome.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By -from selenium.common.exceptions import TimeoutException +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait from dyana import Profiler # type: ignore[attr-defined] -def collect_performance_metrics(driver: webdriver.Chrome) -> Dict[str, Any]: +def 
collect_performance_metrics(driver: webdriver.Chrome) -> dict[str, Any]: """Collect detailed performance metrics using Chrome DevTools Protocol.""" metrics = {} @@ -61,9 +62,11 @@ def collect_performance_metrics(driver: webdriver.Chrome) -> Dict[str, Any]: return metrics -def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: +def analyze_page_content(driver: webdriver.Chrome) -> dict[str, Any]: """Analyze page content and structure.""" - return driver.execute_script(""" + return typing.cast( + dict[str, Any], + driver.execute_script(""" return { 'elements': document.getElementsByTagName('*').length, 'images': document.getElementsByTagName('img').length, @@ -73,7 +76,8 @@ def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: 'iframes': document.getElementsByTagName('iframe').length, 'documentSize': document.documentElement.innerHTML.length, }; - """) + """), + ) if __name__ == "__main__": @@ -159,7 +163,7 @@ def analyze_page_content(driver: webdriver.Chrome) -> Dict[str, Any]: try: driver.quit() profiler.track_memory("after_quit") - except: + except Exception: pass print(json.dumps(profiler.as_dict())) From f8e94f3203c7f22f79dc64d05ea50ef56f5d01bd Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 21 Jan 2025 09:26:43 -0500 Subject: [PATCH 7/7] chore: rm network service as agreed --- dyana/loaders/website/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dyana/loaders/website/main.py b/dyana/loaders/website/main.py index b2ef3af..d60b557 100644 --- a/dyana/loaders/website/main.py +++ b/dyana/loaders/website/main.py @@ -99,7 +99,6 @@ def analyze_page_content(driver: webdriver.Chrome) -> dict[str, Any]: chrome_options.add_argument("--headless=new") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--window-size=1920,1080") - chrome_options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess") # Enable performance 
logging chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"})