sat-beams-scraper.py
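
"""Scrape satbeams.com's list of active satellites and write each satellite's
details (status, position, NORAD id, operator, launch info, etc.) to its own
text file, driving the site with Selenium and parsing rows with BeautifulSoup."""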
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import os

website = "https://www.satbeams.com/satellites?status=active"
t_body = '//*[@id="sat_grid"]/tbody/tr[@class="class_tr"]'
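
# `t_body` matches every row of the "active satellites" grid; each row is
# expected to carry a link of the form /satellites?norad=<id> pointing at
# that satellite's detail page.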


def safe_find_next(element, label):
    """Return the stripped text of the node that follows the `label` string,
    or "N/A" if the label or its successor is missing."""
    try:
        next_elem = element.find(string=label)
        if next_elem:
            next_elem = next_elem.find_next()
            if next_elem:
                return next_elem.text.strip()
        return "N/A"
    except Exception as e:
        print(f"Error finding next element for label '{label}': {e}")
        return "N/A"
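

# Helper for fields whose value sits inside an <a> tag on the detail page;
# it returns "N/A" when the label, the link, or its text is missing rather
# than raising, consolidating the one-line conditionals used further down.
def safe_find_next_link(element, label):
    marker = element.find(string=label)
    if marker:
        link = marker.find_next('a')
        if link:
            return link.get_text(strip=True) or "N/A"
    return "N/A"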


try:
    driver = webdriver.Chrome()
    driver.get(website)
    driver.implicitly_wait(5)
    tr_body_elements = driver.find_elements(By.XPATH, t_body)

    # List to store ALL URLs, duplicates included
    hrefs = []
    # Set to store unique URLs
    unique_hrefs = set()

    # Iterate over all grid rows
    for tr in tr_body_elements:
        try:
            # Wait for the detail-page link inside this row
            a_tag = WebDriverWait(tr, 10).until(
                EC.presence_of_element_located((By.XPATH, './/a[@class="link" and contains(@href, "/satellites?norad=")]'))
            )
            href_value = a_tag.get_attribute('href')
            hrefs.append(href_value)
            unique_hrefs.add(href_value)
        except Exception as e:
            print(f"Error finding a tag in tr: {e}")

    print(f"Number of unique hrefs: {len(unique_hrefs)}")

    # Visit each unique href
    for href in unique_hrefs:
        try:
            # Navigate to the satellite's detail page
            driver.get(href)
            # Wait for the new page to load completely
            time.sleep(5)
            # Extract data from the new page, e.g. the page title
            page_title = driver.title
            print(f"Visited {href}, Page title: {page_title}")

            # The second row of the details grid holds the satellite's data
            individual_sat_details = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="sat_grid1"]/tbody/tr[2]'))
            )
            # Extract the row's HTML and parse it with BeautifulSoup
            data_text = individual_sat_details.get_attribute('innerHTML')
            soup = BeautifulSoup(data_text, 'html.parser')
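
            # The detail row lays fields out as a "Label:" string followed by
            # the value node, which is the structure safe_find_next and
            # safe_find_next_link walk with find_next().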

            satellite_data = {}
            # Extract the labelled fields: plain-text values via
            # safe_find_next, link-wrapped values via safe_find_next_link
            satellite_data['Satellite Name'] = safe_find_next(soup, "Satellite Name:")
            satellite_name = satellite_data['Satellite Name']
            satellite_data['Status'] = safe_find_next(soup, "Status:")
            satellite_data['Position'] = safe_find_next(soup, "Position:")
            satellite_data['NORAD'] = safe_find_next_link(soup, "NORAD:")
            satellite_data['Cospar number'] = safe_find_next_link(soup, "Cospar number:")
            satellite_data['Operator'] = safe_find_next_link(soup, "Operator:")
            satellite_data['Launch date'] = safe_find_next(soup, "Launch date:")
            satellite_data['Launch site'] = safe_find_next_link(soup, "Launch site:")
            satellite_data['Launch vehicle'] = safe_find_next_link(soup, "Launch vehicle:")
            satellite_data['Launch mass (kg)'] = safe_find_next(soup, "Launch mass (kg):")
            satellite_data['Dry mass (kg)'] = safe_find_next(soup, "Dry mass (kg):")
            satellite_data['Manufacturer'] = safe_find_next_link(soup, "Manufacturer:")
            satellite_data['Model (bus)'] = safe_find_next_link(soup, "Model (bus):")
            satellite_data['Orbit'] = safe_find_next(soup, "Orbit:")
            satellite_data['Expected lifetime'] = safe_find_next(soup, "Expected lifetime:")

            # Sanitize the satellite name so it is safe as a directory/file name
            sanitized_name = "".join(c if c.isalnum() or c in " ._-" else "_" for c in satellite_name)

            # Ensure the output directory exists
            os.makedirs(sanitized_name, exist_ok=True)
            file_path = os.path.join(sanitized_name, f"{sanitized_name}.txt")
            with open(file_path, 'w') as file:
                for key, value in satellite_data.items():
                    file.write(f'{key}: {value}\n')
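
            # Each satellite ends up with its own folder holding a "key: value"
            # text file, e.g. "ASTRA 1KR/ASTRA 1KR.txt" (name illustrative).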

            # Go back to the initial page
            driver.back()
            # Optional: wait for the initial page to load completely
            time.sleep(5)
        except Exception as e:
            print(f"Error visiting href {href}: {e}")
finally:
    driver.quit()
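
# To run: `python sat-beams-scraper.py` with Chrome installed. Selenium 4.6+
# should fetch a matching chromedriver via Selenium Manager automatically;
# older setups need a chromedriver binary on PATH.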