Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

standard-search: Fix parsing of description and divs with the results #93

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 27 additions & 14 deletions googleapi/modules/standard_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self):
self.index = None # What index on this page it was on
self.number_of_results = None # The total number of results the query returned
self.is_pdf = None # This boolean is true if google thinks this result leads to a PDF file

def __repr__(self):
name = self._limit_str_size(self.name, 55)
description = self._limit_str_size(self.description, 49)
Expand Down Expand Up @@ -72,7 +72,17 @@ def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_per

if html:
soup = BeautifulSoup(html, "html.parser")

divs = soup.findAll("div", attrs={"class": "g"})
if len(divs) == 0:
divs = soup.select("#main > div")
if len(divs) == 1:
divs = soup.findAll("div", attrs={"data-hveid": "CAsQAA"})
else:
divs = divs[2:]

if len(divs) == 0:
divs = soup.select("body > div:nth-child(3) > div")[1:]

results_div = soup.find("div", attrs={"id": "resultStats"})
number_of_results = _get_number_of_results(results_div)
Expand All @@ -92,12 +102,13 @@ def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_per
res.cached = _get_cached(li)
res.number_of_results = number_of_results
res.is_pdf = _get_is_pdf(li)
if void is True:
if res.description is None:
continue

if void is True and (res.description is None or res.link is None):
continue

results.append(res)
j += 1

return results


Expand Down Expand Up @@ -188,15 +199,17 @@ def _get_description(li):

TODO: There are some text encoding problems to resolve."""

sdiv = li.find("div", attrs={"class": "IsZvec"})
if sdiv:
stspan = sdiv.find("span", attrs={"class": "aCOpRe"})
if stspan is not None:
# return stspan.text.encode("utf-8").strip()
return stspan.text.strip()
else:
return None
# div.IsZvec span.aCOpRE is an old selection which can be deleted
# after Google is not using it any more
description = li.select_one("div.BNeawe div.BNeawe")
if description == None:
description = li.select_one("div.IsZvec span.aCOpRe")
if description == None:
description = li.find("span", attrs={"class": "qXLe6d"})
if description == None:
description = li.find("div", attrs={"class": "I5aSse"})

return description.text.strip() if description != None else None

def _get_thumb():
"""Return the link to a thumbnail of the website."""
Expand All @@ -216,7 +229,7 @@ def _get_is_pdf(li):
"""Return if the link is marked by google as PDF"""
sdiv = li.find("span", attrs={"class": "ZGwO7 C0kchf NaCKVc"})
return True if sdiv else False

def _get_number_of_results(results_div):
"""Return the total number of results of the google search.
Note that the returned value will be the same for all the GoogleResult
Expand Down