-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sourcery Starbot ⭐ refactored Liebmann5/Web_Scraper #2
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,7 +106,7 @@ def check_similarity(self, doc1, doc2, key, label, max_similarity): | |
max_similarity = similarity | ||
best_match = key | ||
if max_similarity == 1.0: | ||
return self.handle_match(key, label), max_similarity | ||
return self.handle_match(best_match, label), max_similarity | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
return best_match, max_similarity | ||
|
||
def find_the_bestest_match(self, label): #aka - "find_best_match" | ||
|
@@ -138,8 +138,7 @@ def get_synonyms(self, word): | |
synonyms = [] | ||
|
||
for syn in wordnet.synsets(word): | ||
for lemma in syn.lemmas(): | ||
synonyms.append(lemma.name()) | ||
synonyms.extend(lemma.name() for lemma in syn.lemmas()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
#TODO: DOUBLE CHECK THIS!!!!! Your asking for the synonyms of `phone number`?!?!?!?! Do we really want the synonyms for the key and not the label?!?!?! | ||
if word.lower() in self.custom_synonyms: | ||
#for custom_syn in self.custom_synonyms[word]: | ||
|
@@ -153,14 +152,14 @@ def get_synonyms(self, word): | |
|
||
print("self.custom_synonyms = ", end="") | ||
print(self.custom_synonyms) | ||
|
||
|
||
print("synonyms = ", end="") | ||
print(synonyms) | ||
print("\n--------------------") | ||
|
||
time.sleep(2) | ||
|
||
return synonyms | ||
|
||
#*Just for me to see what it does!! | ||
|
@@ -174,10 +173,7 @@ def jaccard_similarity(self, sentence1, sentence2): | |
print(f"union = {union}") | ||
jaccard_similarity = (len(intersection) / len(union)) | ||
print(f"jaccard_similarity = {jaccard_similarity}") | ||
if jaccard_similarity > 90: | ||
return True | ||
else: | ||
return False | ||
return jaccard_similarity > 90 | ||
Comment on lines -177 to +176
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def submit_job_application(self, submit_button): | ||
|
||
|
@@ -192,32 +188,6 @@ def submit_job_application(self, submit_button): | |
self.keep_jobs_applied_to_info() | ||
#self.sessions_applied_to_info | ||
return | ||
|
||
|
||
|
||
|
||
|
||
#submit_button_index = self.form_input_details.get('KEY-NAME') | ||
#submit_button = self.extract_css(submit_button_index['HTML']) | ||
|
||
''' | ||
submit_button = self.extract_css(submit_button['HTML']) | ||
|
||
self.browser.find_element(By.CSS_SELECTOR, submit_button).click() | ||
|
||
WebDriverWait(self.browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".response-message"))) | ||
|
||
response_message = self.browser.find_element(By.CSS_SELECTOR, ".response-message").text | ||
if "success" in response_message.lower(): | ||
self.keep_jobs_applied_to_info() | ||
print("Form submission was successful!") | ||
else: | ||
print("Form submission failed!") | ||
|
||
error_messages = self.driver.find_elements(By.CSS_SELECTOR, ".error-message") | ||
for error_message in error_messages: | ||
print(f"Error: {error_message.text}") | ||
''' | ||
Comment on lines -195 to -220
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ):
|
||
|
||
#TODO: Add call to oxylabs captcha!!!!! | ||
|
||
|
@@ -236,26 +206,20 @@ def is_special_case(self, input_data): | |
if label == 'select': | ||
select_element = self.browser.find_element(label) | ||
is_multiple_choice = select_element.get_attribute('multiple') is not None | ||
if is_multiple_choice is True: | ||
if is_multiple_choice: | ||
self.form_input_extended['text'] = 'is_multiple_choice' | ||
elif is_multiple_choice is False: | ||
pass | ||
Comment on lines -239 to -242
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
elif label == 'checkbox': | ||
self.form_input_extended['checkbox'] = True | ||
self.form_input_extended = 'is_multiple_choice' | ||
elif label == 'radio': | ||
self.form_input_extended['radio'] = True | ||
elif label == 'file': | ||
self.form_input_extended['file'] = True | ||
elif label in ['text', 'textarea', 'button']: | ||
self.form_input_extended['text'] = True | ||
else: | ||
if label == 'text' or label == 'textarea': | ||
self.form_input_extended['text'] = True | ||
elif label == 'button': | ||
self.form_input_extended['text'] = True | ||
|
||
else: | ||
print("There has been an error father...") | ||
print("label = ", label) | ||
print("There has been an error father...") | ||
print("label = ", label) | ||
return | ||
|
||
#TODO: Once we submit the application confirm that here and then save everything!!! | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,51 +53,42 @@ def get_label(self, input_element): | |
input_element_str = str(input_element).lower() | ||
if 'button' in input_element_str and 'submit application' in input_element_str: | ||
return 'Submit Application' | ||
|
||
if input_element.get('type') == 'radio': | ||
label = self.find_radio_label(input_element) | ||
return label | ||
|
||
return self.find_radio_label(input_element) | ||
if input_element.get('type') == 'checkbox': | ||
div_parent, parents_text = self.get_div_parent(input_element) | ||
if div_parent == 'None' or parents_text == 'None': | ||
pass | ||
elif div_parent and parents_text: | ||
#return div_parent, parents_text | ||
checkbox_values = [div_parent, parents_text] | ||
return checkbox_values | ||
|
||
return [div_parent, parents_text] | ||
Comment on lines -56 to +64
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ):
|
||
label = None | ||
|
||
# Case 1: Check if the label is a direct previous sibling of the input element | ||
label = input_element.find_previous_sibling('label') | ||
|
||
# Case 2: Check if the label is inside a parent container | ||
if not label: | ||
parent = input_element.find_parent() | ||
if parent: | ||
if parent := input_element.find_parent(): | ||
label = parent.find('label') | ||
|
||
# Case 3: Check if the label is associated using the "for" attribute | ||
if not label: | ||
input_id = input_element.get('id') | ||
if input_id: | ||
if input_id := input_element.get('id'): | ||
label = input_element.find_previous('label', attrs={'for': input_id}) | ||
|
||
# Case 4: Check if the input element is a child of a label element | ||
if not label: | ||
parent_label = input_element.find_parent('label') | ||
if parent_label: | ||
if parent_label := input_element.find_parent('label'): | ||
label = parent_label | ||
|
||
# Case 5: Check if a label is inside a parent container of the input element | ||
if not label: | ||
parent = input_element.find_parent() | ||
if parent: | ||
if parent := input_element.find_parent(): | ||
label = parent.find('label') | ||
|
||
# Case 6: Checks if the input element has an 'aria-label' meaning it's dynamic so goes & searches | ||
# all previous label containers to see if any have text values that are equal to the aria-label' | ||
# all previous label containers to see if any have text values that are equal to the aria-label' | ||
if not label: | ||
if 'aria-label' in input_element.attrs: | ||
aria_label_match = None | ||
|
@@ -106,30 +97,35 @@ def get_label(self, input_element): | |
if parent_label.text.strip() == aria_label_value: | ||
aria_label_match = True | ||
if aria_label_match: | ||
dynamic_label = aria_label_value + " (dynamic " + input_element.get('type') + ")" | ||
if dynamic_label: | ||
if ( | ||
dynamic_label := f"{aria_label_value} (dynamic " | ||
+ input_element.get('type') | ||
+ ")" | ||
): | ||
return dynamic_label | ||
elif aria_label_match == None: | ||
elif aria_label_match is None: | ||
return aria_label_value | ||
|
||
# Case 7: Checks if the input element's style attribute is equal to 'display: none;' meaning it's | ||
# dynamic so goes & searches for the most previous label container to specify its text value is dynamic | ||
if not label: | ||
if input_element.get('style') == 'display: none;': | ||
previous_input = input_element.find_previous('input') | ||
if previous_input: | ||
if previous_input := input_element.find_previous('input'): | ||
parent_label = previous_input.find_previous('label') | ||
dynamic_label = parent_label.text.strip() + " (dynamic " + input_element.get('type') + ")" | ||
if dynamic_label: | ||
if ( | ||
dynamic_label := f"{parent_label.text.strip()} (dynamic " | ||
+ input_element.get('type') | ||
+ ")" | ||
): | ||
return dynamic_label | ||
|
||
# Case 8: Special case for Resume/CV | ||
if not label and self.one_resume_label == False: | ||
found_attach = False | ||
parent_label = input_element.find_previous('label') | ||
label = parent_label | ||
self.one_resume_label = True | ||
|
||
current_element = input_element | ||
while current_element: | ||
if isinstance(current_element, NavigableString) and 'attach' in str(current_element).lower(): | ||
|
@@ -138,8 +134,7 @@ def get_label(self, input_element): | |
current_element = current_element.next_sibling | ||
# Traverse up from the specific_element and find the label tag | ||
if found_attach: | ||
label_tag = input_element.find_previous('label') | ||
if label_tag: | ||
if label_tag := input_element.find_previous('label'): | ||
# Check if the immediate child is a text value | ||
first_child = label_tag.contents[0] | ||
if isinstance(first_child, NavigableString) and first_child.strip(): | ||
|
@@ -156,8 +151,10 @@ def get_label(self, input_element): | |
|
||
# Check if the label contains a nested div element with the class "application-label" (case for Input 18) | ||
if label: | ||
app_label = label.find(lambda tag: 'class' in tag.attrs and 'application-label' in tag['class']) | ||
if app_label: | ||
if app_label := label.find( | ||
lambda tag: 'class' in tag.attrs | ||
and 'application-label' in tag['class'] | ||
): | ||
label = app_label | ||
|
||
if label: | ||
|
@@ -174,9 +171,7 @@ def get_label(self, input_element): | |
|
||
return label_text | ||
|
||
# Case 6: Check if the input_element has a placeholder attribute | ||
placeholder = input_element.get('placeholder') | ||
if placeholder: | ||
if placeholder := input_element.get('placeholder'): | ||
return f"Placeholder ~ {placeholder}" | ||
|
||
return None | ||
|
@@ -185,13 +180,12 @@ def find_radio_label(self, element, stop_level=5): | |
current_level = 0 | ||
while (current_level <= stop_level): | ||
print(f"Level {current_level}:") | ||
if current_level == 0 or current_level == 5: | ||
if current_level == 0: | ||
print(element.prettify()) | ||
if current_level == 5: | ||
sauce = element.next_element.get_text(strip=True) | ||
print(sauce) | ||
return sauce | ||
if current_level == 0: | ||
print(element.prettify()) | ||
elif current_level == 5: | ||
sauce = element.next_element.get_text(strip=True) | ||
print(sauce) | ||
return sauce | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
element = element.parent | ||
current_level += 1 | ||
|
||
|
@@ -224,9 +218,9 @@ def get_div_parent(self, input_element): | |
|
||
def get_form_input_details(self, url): | ||
self.one_resume_label = False | ||
|
||
print("\nget_form_input_details()") | ||
print("URL = " + url) | ||
print(f"URL = {url}") | ||
Comment on lines -227 to +223
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
page = requests.get(url) | ||
soup = BeautifulSoup(page.content, 'lxml') | ||
|
||
|
@@ -259,9 +253,7 @@ def get_form_input_details(self, url): | |
values = [] | ||
if input_type == 'select': | ||
options = field.find_all('option') | ||
for option in options: | ||
values.append(option.text.strip()) | ||
|
||
values.extend(option.text.strip() for option in options) | ||
if input_type == 'radio': | ||
radio_name = field.get('name') | ||
if radio_name in processed_radios: | ||
|
@@ -270,10 +262,10 @@ def get_form_input_details(self, url): | |
radio_group = soup.find_all('input', {'name': radio_name}) | ||
values = [radio.get('value') for radio in radio_group] | ||
input_html = ''.join([str(radio).strip() for radio in radio_group]) | ||
|
||
# Call get_label for the entire radio button group | ||
input_label = self.get_label(field) | ||
|
||
elif input_type == 'checkbox': | ||
if field in processed_radios: | ||
continue | ||
|
@@ -290,7 +282,7 @@ def get_form_input_details(self, url): | |
for index, input_element in enumerate(checkbox_group): | ||
parent_label = input_element.find_previous('label') | ||
if input_element.get('type') == 'text': | ||
values.append(parent_label.text.strip() + "(dynamic)") | ||
values.append(f"{parent_label.text.strip()}(dynamic)") | ||
continue | ||
values.append(parent_label.text.strip()) | ||
processed_radios.add(input_element) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function `validate_job_data` refactored with the following changes: (`invert-any-all`)