Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix #2
Browse files Browse the repository at this point in the history
- Fix Bing search
- Add Google and Startpage scrapers (change in ui settings)
- Fix headers + add header randomizer
- Completely remove Brainly from the project due to new WAF + slow processing time
- Fix delay in cursor restoring after OCR
- Manually calculate selection rectangle for OCR
- Change minimum value for window transparency slider to 5%
- Upgrade Pillow and pytesseract to fix security concerns
- Add requests_html, fake_headers, and their dependencies to requirements.txt
daijro committed Apr 11, 2022

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent d88428a commit 692e8fb
Showing 7 changed files with 221 additions and 138 deletions.
2 changes: 1 addition & 1 deletion config.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"quizlet": true,
"quizizz": true,
"brainly": false,
"search_engine": 0,
"hide_show_key": "Ctrl+D",
"ocr_key": "Ctrl+Shift+X",
"paste_key": "Ctrl+Shift+V",
23 changes: 13 additions & 10 deletions gui.pyw
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@ import tkinter as tk
root = tk.Tk()
root.withdraw()

from scraper import Searchify
from scraper import Searchify, SearchEngine
from textshot import *
from windoweffect import WindowEffect

@@ -96,17 +96,13 @@ class UI(QMainWindow):
self.status_label = self.findChild(QtWidgets.QLabel, "status_label")
self.quizlet_button = self.findChild(QtWidgets.QPushButton, "quizlet_button")
self.quizizz_button = self.findChild(QtWidgets.QPushButton, "quizizz_button")
self.brainly_button = self.findChild(QtWidgets.QPushButton, "brainly_button")
self.settings_button = self.findChild(QtWidgets.QPushButton, "settings_button")

self.quizlet_button.setChecked(self.conf['quizlet'])
self.quizizz_button.setChecked(self.conf['quizizz'])
self.brainly_button.setChecked(self.conf['brainly'])

self.quizlet_button.toggled.connect(lambda: self.updatejson('quizlet'))
self.quizizz_button.toggled.connect(lambda: self.updatejson('quizizz'))
self.brainly_button.toggled.connect(lambda: self.updatejson('brainly'))


self.settings_button.clicked.connect(lambda: self.stackedWidget.setCurrentIndex(1))

@@ -115,7 +111,6 @@ class UI(QMainWindow):

self.quizizz_button.setIcon(QtGui.QIcon(resource_path("img\\quizizz.png")))
self.quizlet_button.setIcon(QtGui.QIcon(resource_path("img\\quizlet.png")))
self.brainly_button.setIcon(QtGui.QIcon(resource_path("img\\brainly.png")))
self.titleIcon.setPixmap(QtGui.QPixmap(resource_path("img\\search.png")))


@@ -203,6 +198,11 @@ class UI(QMainWindow):

self.setting_on_top.setChecked(self.conf['on_top'])
self.setting_on_top.toggled.connect(lambda: self.set_window_on_top())

self.search_engine_combo = self.findChild(QtWidgets.QComboBox, "search_engine_combo")
self.search_engine_combo.setCurrentIndex(self.conf['search_engine'])
self.search_engine = SearchEngine(self.search_engine_combo.currentText().lower())
self.search_engine_combo.currentIndexChanged.connect(lambda: self.run_search_engine())

# window theme
self.themeInput = self.findChild(QtWidgets.QComboBox, "themeInput")
@@ -291,7 +291,7 @@ class UI(QMainWindow):
font_size = self.font_size.value()

# icon sizes
for obj in [self.quizizz_button, self.quizlet_button, self.brainly_button]:
for obj in [self.quizizz_button, self.quizlet_button]:
obj.setIconSize(QtCore.QSize(font_size*2, font_size*2))


@@ -313,6 +313,10 @@ class UI(QMainWindow):

# calling scraper and adding to ui

def run_search_engine(self):
    """Rebuild the search engine from the combo box and save the selection.

    Connected to ``search_engine_combo.currentIndexChanged``: the lowercased
    text of the selected entry is the engine name handed to ``SearchEngine``,
    and the ``search_engine`` config key is written back through
    ``updatejson``.
    """
    engine_name = self.search_engine_combo.currentText().lower()
    self.search_engine = SearchEngine(engine_name)
    self.updatejson('search_engine')

def run_searcher(self):
query = self.search_bar.text().strip()

@@ -328,14 +332,13 @@ class UI(QMainWindow):

if self.quizizz_button.isChecked(): sites.append('quizizz')
if self.quizlet_button.isChecked(): sites.append('quizlet')
if self.brainly_button.isChecked(): sites.append('brainly')

if not sites:
self.status_label.setText('Please select at least one site.')
self.search_frame.setEnabled(True)
return

searchify = Searchify(query, sites)
searchify = Searchify(query, sites, self.search_engine)

t = Thread(target=searchify.main)
t.daemon = True
@@ -530,7 +533,6 @@ class UI(QMainWindow):
# keybinds
"quizlet": lambda: self.quizlet_button.isChecked(),
"quizizz": lambda: self.quizizz_button.isChecked(),
"brainly": lambda: self.brainly_button.isChecked(),
"hide_show_key": lambda: self.hide_show_key.keySequence().toString(),
"ocr_key": lambda: self.ocr_key.keySequence().toString(),
"paste_key": lambda: self.paste_key.keySequence().toString(),
@@ -550,6 +552,7 @@ class UI(QMainWindow):
"hide_taskbar": lambda: self.setting_hide_taskbar.isChecked(),
"theme": lambda: self.themeInput.currentIndex(),
"font_size": lambda: self.font_size.value(),
"search_engine": lambda: self.search_engine_combo.currentIndex(),
}

def updatejson(self, key):
Binary file removed img/brainly.png
Binary file not shown.
26 changes: 24 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,22 +1,44 @@
appdirs==1.4.4
beautifulsoup4==4.10.0
bs4==0.0.1
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.9
colorama==0.4.4
cssselect==1.1.0
fake-headers==1.0.2
fake-useragent==0.1.11
gevent==21.12.0
greenlet==1.1.2
grequests==0.6.0
html5lib==1.1
idna==3.3
importlib-metadata==4.11.3
keyboard==0.13.5
Pillow==8.4.0
lxml==4.8.0
packaging==21.3
parse==1.19.0
Pillow==9.1.0
pycparser==2.21
pyee==8.2.2
pyparsing==3.0.8
pyperclip==1.8.2
pyppeteer==1.0.2
PyQt5==5.15.6
PyQt5-Qt5==5.15.2
PyQt5-sip==12.9.0
pytesseract==0.3.8
pyquery==1.4.3
pytesseract==0.3.9
pywin32==303
requests==2.26.0
requests-html==0.10.0
six==1.16.0
soupsieve==2.3.1
tqdm==4.64.0
urllib3==1.26.7
w3lib==1.22.0
webencodings==0.5.1
websockets==10.2
zipp==3.8.0
zope.event==4.5.0
zope.interface==5.4.0
186 changes: 94 additions & 92 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,124 @@
import grequests
import json
from bs4 import BeautifulSoup
from difflib import SequenceMatcher
import json
import grequests
from requests_html import HTMLSession
from fake_headers import Headers
import re
import sys
import time
from urllib.parse import urlencode
from threading import Thread


headers = {
"Connection": "keep-alive",
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53',
"Sec-Fetch-Site": "same-origin",
"Sec-Ch-Ua": "\"(Not(A:Brand\";v=\"8\", \"Chromium\";v=\"99\"",
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": "\"Windows\"",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Sec-Fetch-Site": "none",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-User": "?1",
"Sec-Fetch-Dest": "document",
"Referer": "https://www.bing.com/",
"Accept-Language": "en-US,en;q=0.9"
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "en-US,en;q=0.9",
"Connection": "close"
}


# Runs of characters that are NOT whitespace, a-z, 0-9 or one of . , ( ).
# Inside a character class these punctuation marks need no escaping; the
# previous pattern doubled the backslashes in a raw string, which made a
# literal backslash an allowed character instead of whitespace.
_SLUG_JUNK = re.compile(r'[^\sa-z0-9.,()]+')


def get_text(x):
    """Return the visible text of the HTML fragment *x*, stripped of outer whitespace."""
    return BeautifulSoup(x, features='lxml').get_text().strip()


def sluggify(a):
    """Normalise *a* for fuzzy comparison.

    Lowercases the text, replaces every run of characters outside
    ``[a-z0-9.,()]`` and whitespace with a single space, then collapses all
    whitespace runs to one space and trims the ends.
    """
    return ' '.join(_SLUG_JUNK.sub(' ', a.lower()).split())


def similar(a, b):
    """Return the ``SequenceMatcher`` ratio (0.0-1.0) of the slugged forms of *a* and *b*."""
    return SequenceMatcher(None, sluggify(a), sluggify(b)).ratio()


def remove_duplicates(a):
    """Return the unique items of *a* as a list, keeping first-seen order.

    ``dict.fromkeys`` is used instead of ``set`` so the result order is
    deterministic from run to run; items must be hashable (as before).
    """
    return list(dict.fromkeys(a))

def _make_headers():
    """Return a fresh header dict: the module-level ``headers`` overlaid with generated ones.

    Keys produced by ``Headers(...).generate()`` take precedence over the
    static defaults; the module-level ``headers`` dict itself is not modified.
    """
    merged = dict(headers)
    generated = Headers(headers=True, browser='chrome', os='windows').generate()
    merged.update(generated)
    return merged


class SearchBing:
class SearchEngine:
    """Fetch search-result pages for ``<query> site:<site>.com`` from one engine.

    ``google`` and ``bing`` are queried with plain GET requests described by
    ``_web_engines``. ``startpage`` first needs hidden form fields scraped
    from its home page; ``__init__`` starts that scrape on a daemon thread
    and ``startpage_get_page`` joins it before posting.
    """

    # Class-level template only; each instance works on a private copy
    # (see __init__) so per-engine tweaks cannot leak between instances.
    headers = headers.copy()

    def __init__(self, engine_name):
        """Create a session for *engine_name* ('google', 'bing' or 'startpage')."""
        self.sess = HTMLSession()
        self.engine_name = engine_name
        # _init_startpage() mutates the headers in place. Without a private
        # copy the Startpage Referer / Sec-Fetch-Site values would end up in
        # the shared class dict and be sent by every later Bing/Google instance.
        self.headers = dict(self.headers)
        self._web_engines = {  # simple scrapers using get requests
            'google': ('https://www.google.com/search?', 'q', {'aqs': 'chrome..69i57.888j0j1', 'sourceid': 'chrome', 'ie': 'UTF-8'}),
            'bing': ('https://www.bing.com/search?', 'q', {'pq': ''}),
        }
        if engine_name == 'startpage':
            print('Starting startpage instance...')
            self.t = Thread(target=self._init_startpage)
            self.t.daemon = True
            self.t.start()

    def find_items(self, soup, args):
        """Map each name in *args* to the value of the hidden ``<input>`` with that name.

        Raises TypeError if an input is missing from *soup* and KeyError if it
        has no ``value`` attribute.
        """
        return {i: soup.find('input', {'type': 'hidden', 'name': i})['value'] for i in args}

    def get_startpage_items(self, r):
        """Build the Startpage POST form data from the hidden inputs found in response *r*."""
        soup = BeautifulSoup(r.text, 'lxml')
        return {'query': None, 'cat': 'web', **self.find_items(soup, ['lui', 'language', 'sc', 'abp'])}

    def _init_startpage(self):
        """Scrape the initial form fields from the home page, then switch to same-origin headers."""
        self._startpage_data = self.get_startpage_items(self.sess.get('https://www.startpage.com/', headers=self.headers))
        self.headers.update({"Sec-Fetch-Site": "same-origin", 'Referer': 'https://www.startpage.com/'})

    def _refresh_startpage(self, resp):
        """Replace the stored form fields with those found in results page *resp*.

        The previous fields are kept when the page does not contain the
        expected hidden inputs, so one odd response cannot break later searches.
        """
        try:
            self._startpage_data = self.get_startpage_items(resp)
        except (AttributeError, KeyError, TypeError):
            print('Could not refresh startpage form fields; keeping previous ones')

    def startpage_get_page(self, query, sites):
        """POST one Startpage search per site and return ``{site: response}``.

        A response is ``None`` when its request failed (``grequests.map``
        behaviour). The last response is then parsed on a background thread
        to refresh the form fields for the next search.
        """
        # Wait for the initial scrape (or the previous refresh) to finish.
        self.t.join()
        resps = grequests.map([
            grequests.post(
                'https://www.startpage.com/sp/search',
                headers=self.headers,
                data={**self._startpage_data, **{'query': f'{query} site:{site}.com'}},
            )
            for site in sites
        ])
        # The refresh previously targeted get_startpage_items directly and its
        # return value was thrown away, so the stored fields never changed.
        # Also skip it when there is nothing usable to parse.
        if resps and resps[-1] is not None:
            self.t = Thread(target=self._refresh_startpage, args=(resps[-1],))
            self.t.daemon = True
            self.t.start()
        return dict(zip(sites, resps))

    def get_page(self, query, sites):
        """Return ``{site: response}`` for *query* restricted to each ``<site>.com``.

        Raises KeyError when ``engine_name`` is neither 'startpage' nor a key
        of ``_web_engines``.
        """
        if self.engine_name == 'startpage':
            return self.startpage_get_page(query, sites)
        base_url, query_key, extra_params = self._web_engines[self.engine_name]
        reqs = [
            grequests.get(
                base_url + urlencode({query_key: f'{query} site:{site}.com', **extra_params}),
                headers=self.headers, session=self.sess
            )
            for site in sites
        ]
        return dict(zip(sites, grequests.map(reqs, size=len(sites))))


class SearchWeb:
"""
search bing for query
search web for query
"""
def __init__(self, query, sites):
def __init__(self, query, sites, engine):
self.query = query
self.links = None
self.sites = sites
self.engine = engine
self._regex_objs = {
'quizlet': re.compile('https?://quizlet.com/\d+/[a-z0-9\\-]+/'),
'quizizz': re.compile('https?://quizizz.com/admin/quiz/[a-f0-9]+/[a-z\\-]+'),
'brainly': re.compile('https?://brainly.com/question/\d+'),
}

def search(self):
"""
search bing for query
search web for query
"""
resps = dict(zip(
self.sites,
grequests.map([
grequests.get(
'https://www.bing.com/search?'
+ urlencode({'q': self.query + f' site:{site}.com'}),
headers=headers,
)
for site in self.sites
], size=len(self.sites))
))

resps = self.engine.get_page(self.query, self.sites)
self.links = {
site: remove_duplicates(re.findall(self._regex_objs[site], resps[site].text))
for site in self.sites
}



class QuizizzScraper:
def __init__(self, links, query):
self.links = links
@@ -74,7 +127,7 @@ def __init__(self, links, query):
self.query = query

def async_requests(self, links):
reqs = [grequests.get(u, headers=headers) for u in links]
reqs = [grequests.get(u, headers=_make_headers()) for u in links]
self.resps = grequests.map(reqs, size=len(reqs))

def parse_links(self):
@@ -136,7 +189,7 @@ def __init__(self, links, query):
self._regex_obj = re.compile('\\= \\{"alphabeticalIsDifferent.*\\}; QLoad\\(')

def async_requests(self, links):
reqs = [grequests.get(u, headers=headers) for u in links]
reqs = [grequests.get(u, headers=_make_headers()) for u in links]
self.resps = grequests.map(reqs, size=len(reqs))

def parse_links(self):
@@ -170,61 +223,6 @@ def quizlet_parser(self, resp):
)



class BrainlyScraper:
def __init__(self, links, query):
self.links = links
self.resps = None
self.brainlys = []
self.query = query

def async_requests(self, links):
reqs = [grequests.get(u, headers=headers) for u in links]
self.resps = grequests.map(reqs, size=len(reqs))

def parse_links(self):
self.async_requests(self.links)
for resp in self.resps:
try:
self.brainlys.append(self.brainly_parser(resp))
except Exception as e:
print('exception', e, resp.url)
# pass # skip over any errors
return self.brainlys


def brainly_parser(self, resp):
data = json.loads(BeautifulSoup(resp.text, features='lxml').find('script', type="application/ld+json").string)[0]
answers = []
if 'acceptedAnswer' in data['mainEntity']:
answers += data['mainEntity']['acceptedAnswer']
if 'suggestedAnswer' in data['mainEntity']:
answers += data['mainEntity']['suggestedAnswer']

return max(
(
{
'question': data['name'].strip(),
'answer': get_text(i['text'])
.replace('Answer:', 'Answer: ')
.replace('Explanation:', '\nExplanation: ')
+ '\nUpvotes: '
+ str(i['upvoteCount']),
'similarity': (
similar(data['name'], self.query),
True,
i['upvoteCount'],
),
'url': resp.url,
}
for i in answers
),
key=lambda x: x['similarity'],
)




class TimeLogger:
def __init__(self):
self.elapsed_total = time.time()
@@ -261,19 +259,17 @@ def print_timers(self):





class Searchify:
def __init__(self, query, sites):
def __init__(self, query, sites, engine):
self.query = query
self.sites = sites
self.engine = engine
self.timer = TimeLogger()
self.flashcards = []
self.links = []
self.site_scrapers = {
'quizlet': QuizletScraper,
'quizizz': QuizizzScraper,
'brainly': BrainlyScraper,
}

def main(self):
@@ -306,8 +302,8 @@ def _flashcard_thread(self, site_scraper, links, site_name):


def get_links(self):
self.timer.start('bing search')
search_bing = SearchBing(self.query, self.sites)
self.timer.start('web search')
search_bing = SearchWeb(self.query, self.sites, self.engine)
search_bing.search()
self.timer.end()
self.links = search_bing.links
@@ -328,10 +324,11 @@ def sort_flashcards(self): # sourcery skip: for-index-replacement
if __name__ == '__main__' and len(sys.argv) > 1:
# argument parsing
import argparse
parser = argparse.ArgumentParser(description='Search Bing for flashcards')
parser = argparse.ArgumentParser(description='Search the web for flashcards')
parser.add_argument('--query', '-q', help='query to search for', default=None)
parser.add_argument('--output', '-o', help='output file', default=None)
parser.add_argument('--sites', '-s', help='question sources quizlet,quizizz,brainly (comma seperated list)', default='quizlet,quizizz,brainly')
parser.add_argument('--sites', '-s', help='question sources quizlet,quizizz (comma seperated list)', default='quizlet,quizizz')
parser.add_argument('--engine', '-e', help='search engine to use (google, bing)', default='bing')
args = parser.parse_args()

if args.output:
@@ -348,15 +345,20 @@ def sort_flashcards(self): # sourcery skip: for-index-replacement
flashcards = [] # create flashcard list

sites = args.sites.lower().split(',') # get list of sites
engine_name = args.engine.lower().strip() # get search engine

# start search engine
engine = SearchEngine(engine_name)

# run search
s = Searchify(
query=args.query,
sites=sites,
engine=engine,
)
s.main()

write(json.dumps(s.flashcards, indent=4))
print(str(len(s.flashcards))+ ' flashcards found')
print(f'{len(s.flashcards)} flashcards found')

s.timer.print_timers()
9 changes: 8 additions & 1 deletion textshot.py
Original file line number Diff line number Diff line change
@@ -134,8 +134,15 @@ def mouseReleaseEvent(self, event):
return super().mouseReleaseEvent(event)

self.hide()
QtWidgets.QApplication.restoreOverrideCursor()
QtWidgets.QApplication.processEvents()
shot = self.screen.copy(QtCore.QRect(self.start, self.end))

shot = self.screen.copy(
min(self.start.x(), self.end.x()),
min(self.start.y(), self.end.y()),
abs(self.start.x() - self.end.x()),
abs(self.start.y() - self.end.y()),
)
self.processImage(shot)
self.quit_app()
print('done')
113 changes: 81 additions & 32 deletions window.ui
Original file line number Diff line number Diff line change
@@ -204,28 +204,6 @@
</property>
</widget>
</item>
<item>
<widget class="QPushButton" name="brainly_button">
<property name="minimumSize">
<size>
<width>30</width>
<height>30</height>
</size>
</property>
<property name="text">
<string>Brainly</string>
</property>
<property name="checkable">
<bool>true</bool>
</property>
<property name="checked">
<bool>false</bool>
</property>
<property name="flat">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</item>
<item row="0" column="0" colspan="2">
@@ -404,9 +382,9 @@
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>379</width>
<height>631</height>
<y>-77</y>
<width>656</width>
<height>465</height>
</rect>
</property>
<layout class="QGridLayout" name="gridLayout_8">
@@ -775,26 +753,97 @@
<bool>true</bool>
</property>
<layout class="QGridLayout" name="gridLayout_6">
<item row="0" column="0">
<widget class="QCheckBox" name="setting_search_ocr">
<item row="1" column="0">
<widget class="QCheckBox" name="setting_search_paste">
<property name="text">
<string>Search after running OCR</string>
<string>Search after pasting</string>
</property>
<property name="checked">
<bool>true</bool>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="setting_search_paste">
<item row="0" column="0">
<widget class="QCheckBox" name="setting_search_ocr">
<property name="text">
<string>Search after pasting</string>
<string>Search after running OCR</string>
</property>
<property name="checked">
<bool>true</bool>
</property>
</widget>
</item>
<item row="2" column="0" colspan="2">
<layout class="QGridLayout" name="gridLayout_5">
<item row="0" column="2">
<widget class="QComboBox" name="search_engine_combo">
<property name="minimumSize">
<size>
<width>150</width>
<height>0</height>
</size>
</property>
<item>
<property name="text">
<string>Bing</string>
</property>
</item>
<item>
<property name="text">
<string>Google</string>
</property>
</item>
<item>
<property name="text">
<string>Startpage</string>
</property>
</item>
</widget>
</item>
<item row="0" column="0">
<widget class="QLabel" name="label_8">
<property name="minimumSize">
<size>
<width>100</width>
<height>0</height>
</size>
</property>
<property name="text">
<string>Search engine:</string>
</property>
</widget>
</item>
<item row="0" column="3">
<spacer name="horizontalSpacer_7">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>40</width>
<height>20</height>
</size>
</property>
</spacer>
</item>
<item row="0" column="1">
<spacer name="horizontalSpacer_8">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="sizeType">
<enum>QSizePolicy::Fixed</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>70</width>
<height>20</height>
</size>
</property>
</spacer>
</item>
</layout>
</item>
</layout>
</widget>
</item>
@@ -972,7 +1021,7 @@
<string>Window transparency</string>
</property>
<property name="minimum">
<number>15</number>
<number>5</number>
</property>
<property name="maximum">
<number>100</number>

0 comments on commit 692e8fb

Please sign in to comment.