Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sourcery Starbot ⭐ refactored Liebmann5/Web_Scraper #2

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions DataCollection/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@
allowed_experience_levels = ['Entry', '', 'Staff', 'Senior', 'Lead', 'Principal']

def validate_job_data(data):
if not all(field in data for field in expected_data):
if any(field not in data for field in expected_data):
return False, 'Invalid data format'

if data['Employment Type'] not in allowed_employment_types:
return False, 'Invalid Employment Type'

if data['Experience Level'] not in allowed_experience_levels:
return False, 'Invalid Experience Level'

#TODO: Add more checks, like ensuring it's within the user's country!!!

Comment on lines -22 to +32
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function validate_job_data refactored with the following changes:

return True, ''

#TODO: REMEMBER IT'S NOT A DB!! So create a method that adds things together if `Industry=Employment Type= Location=Experience Level`
Expand Down
14 changes: 5 additions & 9 deletions DataCollection/signature_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,12 @@ def sign_data(data, private_key_path):
password=None,
backend=default_backend()
)

# Create a signature
signature = private_key.sign(

return private_key.sign(
data,
padding.PSS(
mgf=padding.MGF1(hashes.SHA256()),
salt_length=padding.PSS.MAX_LENGTH
salt_length=padding.PSS.MAX_LENGTH,
),
hashes.SHA256()
)

# return signature.hex() #used this when I didn't have any default_backend code!!!
return signature
hashes.SHA256(),
)
293 changes: 137 additions & 156 deletions Legit/CompanyOpeningsAndApplications.py

Large diffs are not rendered by default.

60 changes: 12 additions & 48 deletions Legit/Form/Four/FormFour.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def check_similarity(self, doc1, doc2, key, label, max_similarity):
max_similarity = similarity
best_match = key
if max_similarity == 1.0:
return self.handle_match(key, label), max_similarity
return self.handle_match(best_match, label), max_similarity
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function check_similarity refactored with the following changes:

return best_match, max_similarity

def find_the_bestest_match(self, label): #aka - "find_best_match"
Expand Down Expand Up @@ -138,8 +138,7 @@ def get_synonyms(self, word):
synonyms = []

for syn in wordnet.synsets(word):
for lemma in syn.lemmas():
synonyms.append(lemma.name())
synonyms.extend(lemma.name() for lemma in syn.lemmas())
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function get_synonyms refactored with the following changes:

#TODO: DOUBLE CHECK THIS!!!!! You're asking for the synonyms of `phone number`?!?!?!?! Do we really want the synonyms for the key and not the label?!?!?!
if word.lower() in self.custom_synonyms:
#for custom_syn in self.custom_synonyms[word]:
Expand All @@ -153,14 +152,14 @@ def get_synonyms(self, word):

print("self.custom_synonyms = ", end="")
print(self.custom_synonyms)


print("synonyms = ", end="")
print(synonyms)
print("\n--------------------")

time.sleep(2)

return synonyms

#*Just for me to see what it does!!
Expand All @@ -174,10 +173,7 @@ def jaccard_similarity(self, sentence1, sentence2):
print(f"union = {union}")
jaccard_similarity = (len(intersection) / len(union))
print(f"jaccard_similarity = {jaccard_similarity}")
if jaccard_similarity > 90:
return True
else:
return False
return jaccard_similarity > 90
Comment on lines -177 to +176
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function jaccard_similarity refactored with the following changes:


def submit_job_application(self, submit_button):

Expand All @@ -192,32 +188,6 @@ def submit_job_application(self, submit_button):
self.keep_jobs_applied_to_info()
#self.sessions_applied_to_info
return





#submit_button_index = self.form_input_details.get('KEY-NAME')
#submit_button = self.extract_css(submit_button_index['HTML'])

'''
submit_button = self.extract_css(submit_button['HTML'])

self.browser.find_element(By.CSS_SELECTOR, submit_button).click()

WebDriverWait(self.browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".response-message")))

response_message = self.browser.find_element(By.CSS_SELECTOR, ".response-message").text
if "success" in response_message.lower():
self.keep_jobs_applied_to_info()
print("Form submission was successful!")
else:
print("Form submission failed!")

error_messages = self.driver.find_elements(By.CSS_SELECTOR, ".error-message")
for error_message in error_messages:
print(f"Error: {error_message.text}")
'''
Comment on lines -195 to -220
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function submit_job_application refactored with the following changes:

This removes the following comments ( why? ):

#submit_button = self.extract_css(submit_button_index['HTML'])
#submit_button_index = self.form_input_details.get('KEY-NAME')


#TODO: Add call to oxylabs captcha!!!!!

Expand All @@ -236,26 +206,20 @@ def is_special_case(self, input_data):
if label == 'select':
select_element = self.browser.find_element(label)
is_multiple_choice = select_element.get_attribute('multiple') is not None
if is_multiple_choice is True:
if is_multiple_choice:
self.form_input_extended['text'] = 'is_multiple_choice'
elif is_multiple_choice is False:
pass
Comment on lines -239 to -242
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function is_special_case refactored with the following changes:

elif label == 'checkbox':
self.form_input_extended['checkbox'] = True
self.form_input_extended = 'is_multiple_choice'
elif label == 'radio':
self.form_input_extended['radio'] = True
elif label == 'file':
self.form_input_extended['file'] = True
elif label in ['text', 'textarea', 'button']:
self.form_input_extended['text'] = True
else:
if label == 'text' or label == 'textarea':
self.form_input_extended['text'] = True
elif label == 'button':
self.form_input_extended['text'] = True

else:
print("There has been an error father...")
print("label = ", label)
print("There has been an error father...")
print("label = ", label)
return

#TODO: Once we submit the application confirm that here and then save everything!!!
Expand Down
92 changes: 42 additions & 50 deletions Legit/Form/One/FormOne.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,51 +53,42 @@ def get_label(self, input_element):
input_element_str = str(input_element).lower()
if 'button' in input_element_str and 'submit application' in input_element_str:
return 'Submit Application'

if input_element.get('type') == 'radio':
label = self.find_radio_label(input_element)
return label

return self.find_radio_label(input_element)
if input_element.get('type') == 'checkbox':
div_parent, parents_text = self.get_div_parent(input_element)
if div_parent == 'None' or parents_text == 'None':
pass
elif div_parent and parents_text:
#return div_parent, parents_text
checkbox_values = [div_parent, parents_text]
return checkbox_values

return [div_parent, parents_text]
Comment on lines -56 to +64
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function get_label refactored with the following changes:

This removes the following comments ( why? ):

# Case 6: Check if the input_element has a placeholder attribute
#return div_parent, parents_text

label = None

# Case 1: Check if the label is a direct previous sibling of the input element
label = input_element.find_previous_sibling('label')

# Case 2: Check if the label is inside a parent container
if not label:
parent = input_element.find_parent()
if parent:
if parent := input_element.find_parent():
label = parent.find('label')

# Case 3: Check if the label is associated using the "for" attribute
if not label:
input_id = input_element.get('id')
if input_id:
if input_id := input_element.get('id'):
label = input_element.find_previous('label', attrs={'for': input_id})

# Case 4: Check if the input element is a child of a label element
if not label:
parent_label = input_element.find_parent('label')
if parent_label:
if parent_label := input_element.find_parent('label'):
label = parent_label

# Case 5: Check if a label is inside a parent container of the input element
if not label:
parent = input_element.find_parent()
if parent:
if parent := input_element.find_parent():
label = parent.find('label')

# Case 6: Checks if the input element has an 'aria-label' meaning it's dynamic so goes & searches
# all previous label containers to see if any have text values that are equal to the 'aria-label'
# all previous label containers to see if any have text values that are equal to the 'aria-label'
if not label:
if 'aria-label' in input_element.attrs:
aria_label_match = None
Expand All @@ -106,30 +97,35 @@ def get_label(self, input_element):
if parent_label.text.strip() == aria_label_value:
aria_label_match = True
if aria_label_match:
dynamic_label = aria_label_value + " (dynamic " + input_element.get('type') + ")"
if dynamic_label:
if (
dynamic_label := f"{aria_label_value} (dynamic "
+ input_element.get('type')
+ ")"
):
return dynamic_label
elif aria_label_match == None:
elif aria_label_match is None:
return aria_label_value

# Case 7: Checks if the input element's style attribute is equal to 'display: none;' meaning it's
# dynamic so goes & searches for the most previous label container to specify its text value is dynamic
if not label:
if input_element.get('style') == 'display: none;':
previous_input = input_element.find_previous('input')
if previous_input:
if previous_input := input_element.find_previous('input'):
parent_label = previous_input.find_previous('label')
dynamic_label = parent_label.text.strip() + " (dynamic " + input_element.get('type') + ")"
if dynamic_label:
if (
dynamic_label := f"{parent_label.text.strip()} (dynamic "
+ input_element.get('type')
+ ")"
):
return dynamic_label

# Case 8: Special case for Resume/CV
if not label and self.one_resume_label == False:
found_attach = False
parent_label = input_element.find_previous('label')
label = parent_label
self.one_resume_label = True

current_element = input_element
while current_element:
if isinstance(current_element, NavigableString) and 'attach' in str(current_element).lower():
Expand All @@ -138,8 +134,7 @@ def get_label(self, input_element):
current_element = current_element.next_sibling
# Traverse up from the specific_element and find the label tag
if found_attach:
label_tag = input_element.find_previous('label')
if label_tag:
if label_tag := input_element.find_previous('label'):
# Check if the immediate child is a text value
first_child = label_tag.contents[0]
if isinstance(first_child, NavigableString) and first_child.strip():
Expand All @@ -156,8 +151,10 @@ def get_label(self, input_element):

# Check if the label contains a nested div element with the class "application-label" (case for Input 18)
if label:
app_label = label.find(lambda tag: 'class' in tag.attrs and 'application-label' in tag['class'])
if app_label:
if app_label := label.find(
lambda tag: 'class' in tag.attrs
and 'application-label' in tag['class']
):
label = app_label

if label:
Expand All @@ -174,9 +171,7 @@ def get_label(self, input_element):

return label_text

# Case 6: Check if the input_element has a placeholder attribute
placeholder = input_element.get('placeholder')
if placeholder:
if placeholder := input_element.get('placeholder'):
return f"Placeholder ~ {placeholder}"

return None
Expand All @@ -185,13 +180,12 @@ def find_radio_label(self, element, stop_level=5):
current_level = 0
while (current_level <= stop_level):
print(f"Level {current_level}:")
if current_level == 0 or current_level == 5:
if current_level == 0:
print(element.prettify())
if current_level == 5:
sauce = element.next_element.get_text(strip=True)
print(sauce)
return sauce
if current_level == 0:
print(element.prettify())
elif current_level == 5:
sauce = element.next_element.get_text(strip=True)
print(sauce)
return sauce
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function find_radio_label refactored with the following changes:

  • Hoist conditional out of nested conditional [×2] (hoist-if-from-if)
  • Simplify conditional into switch-like form (switch)

element = element.parent
current_level += 1

Expand Down Expand Up @@ -224,9 +218,9 @@ def get_div_parent(self, input_element):

def get_form_input_details(self, url):
self.one_resume_label = False

print("\nget_form_input_details()")
print("URL = " + url)
print(f"URL = {url}")
Comment on lines -227 to +223
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function get_form_input_details refactored with the following changes:

page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')

Expand Down Expand Up @@ -259,9 +253,7 @@ def get_form_input_details(self, url):
values = []
if input_type == 'select':
options = field.find_all('option')
for option in options:
values.append(option.text.strip())

values.extend(option.text.strip() for option in options)
if input_type == 'radio':
radio_name = field.get('name')
if radio_name in processed_radios:
Expand All @@ -270,10 +262,10 @@ def get_form_input_details(self, url):
radio_group = soup.find_all('input', {'name': radio_name})
values = [radio.get('value') for radio in radio_group]
input_html = ''.join([str(radio).strip() for radio in radio_group])

# Call get_label for the entire radio button group
input_label = self.get_label(field)

elif input_type == 'checkbox':
if field in processed_radios:
continue
Expand All @@ -290,7 +282,7 @@ def get_form_input_details(self, url):
for index, input_element in enumerate(checkbox_group):
parent_label = input_element.find_previous('label')
if input_element.get('type') == 'text':
values.append(parent_label.text.strip() + "(dynamic)")
values.append(f"{parent_label.text.strip()}(dynamic)")
continue
values.append(parent_label.text.strip())
processed_radios.add(input_element)
Expand Down
Loading