From cf86c5b5af931cbd644f7313b9dedfee0dde15e4 Mon Sep 17 00:00:00 2001 From: lina Date: Wed, 16 Nov 2022 08:39:06 -0700 Subject: [PATCH 1/6] feat: getid method --- cheinsteinpy/parsers/pageParser.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cheinsteinpy/parsers/pageParser.py b/cheinsteinpy/parsers/pageParser.py index d4ab121..8f6b2ec 100644 --- a/cheinsteinpy/parsers/pageParser.py +++ b/cheinsteinpy/parsers/pageParser.py @@ -9,6 +9,13 @@ def checkLink(link): linkCheck = {"isChapter": isChapter} return linkCheck +def getId(link): + item = re.search(r'chegg.com/homework-help/questions-and-answers/(.*?)-q(\d+)', link) + if item: + return item.group(2) + else: + return None + def parsePage(data, isChapter): if isChapter: chapter = data["data"]["textbook_solution"]["chapter"][0] From cba9bc2c92c581a8bb1df8c97325bb717e03c696 Mon Sep 17 00:00:00 2001 From: lina Date: Wed, 16 Nov 2022 09:26:39 -0700 Subject: [PATCH 2/6] feat: request graphql method --- cheinsteinpy/requestPage.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cheinsteinpy/requestPage.py b/cheinsteinpy/requestPage.py index b24210a..85d4e37 100644 --- a/cheinsteinpy/requestPage.py +++ b/cheinsteinpy/requestPage.py @@ -57,6 +57,28 @@ def requestChapter(url, cookie, userAgent, html): response = requests.post(url=url, headers=headers, json=query, data=None) return response.json() +def requestGraphQl(cookie, userAgent, data): + headers = { + 'apollographql-client-name': 'chegg-web', + 'apollographql-client-version': 'main-474a3766-3331951797', + 'content-type': 'application/json', + "Accept-Encoding": "gzip, deflate, br", + 'Accept-Language': 'en-US,en;q=0.5', + 'authorization': 'Basic TnNZS3dJMGxMdVhBQWQwenFTMHFlak5UVXAwb1l1WDY6R09JZVdFRnVvNndRRFZ4Ug==', + 'cookie': cookie, + 'upgrade-insecure-requests': '1', + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'sec-fetch-site': 'cross-site', + 'sec-fetch-mode': 
'navigate', + 'sec-fetch-user': '?1', + 'sec-fetch-dest': 'document', + 'Upgrade-Insecure-Requests': '1', + 'user-agent': userAgent, + } + url = "https://gateway.chegg.com/one-graph/graphql" + response = requests.post(url=url, headers=headers, json=data, data=None) + return response.text + def requestEnhancedAnswer(url, cookie, userAgent, html): headers = { 'Host': 'www.chegg.com', From 9544fbfc909ff7912b03a90a86ddef0df397f7de Mon Sep 17 00:00:00 2001 From: lina Date: Wed, 16 Nov 2022 09:43:04 -0700 Subject: [PATCH 3/6] fix: answer grabbing --- cheinsteinpy/api.py | 36 +++++++++++++++++++++++++++--- cheinsteinpy/parsers/pageParser.py | 4 +--- cheinsteinpy/requestPage.py | 2 +- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/cheinsteinpy/api.py b/cheinsteinpy/api.py index eff16e8..30493bd 100644 --- a/cheinsteinpy/api.py +++ b/cheinsteinpy/api.py @@ -18,6 +18,22 @@ def checkLink(url): """ return pageParser.checkLink(url)["isChapter"] +def getId(url): + """ + Gets the id of the question. + + Parameters + ---------- + url : str + The url to check. + + Returns + ------- + id : str + The id of the question. + """ + return pageParser.getId(url) + def answer(url, cookie, userAgent): """ Gets answer data from Chegg. 
@@ -39,17 +55,31 @@ def answer(url, cookie, userAgent): """ cookieStr = cookieParser.parseCookie(cookie) isChapter = pageParser.checkLink(url)["isChapter"] - htmlData = requestPage.requestWebsite(url, cookieStr, userAgent) if isChapter: # await asyncio.sleep(6) + htmlData = requestPage.requestWebsite(url, cookieStr, userAgent) htmlRaw = requestPage.requestChapter(url, cookieStr, userAgent, htmlData) else: - htmlRaw = htmlData + qid = pageParser.getId(url) + data = { + "operationName":"QnaPageQuestionByLegacyId", + "variables": { + "id":int(qid) + }, + "extensions": { + "persistedQuery": { + "version":1, + "sha256Hash":"26efed323ef07d1759f67adadd2832ac85d7046b7eca681fe224d7824bab0928" + } + } + } + raw = requestPage.requestGraphQl(cookieStr, userAgent, data) + htmlRaw = raw["data"]["questionByLegacyId"]["htmlAnswers"][0]["answerData"]["html"] dataRaw = pageParser.parsePage(htmlRaw, isChapter) if isChapter: data = dataRaw[1] else: - data = dataRaw[1] + data = dataRaw parsedAnswer = answerParser.getAnswer(data, isChapter) if isChapter: answer = parsedAnswer[0] diff --git a/cheinsteinpy/parsers/pageParser.py b/cheinsteinpy/parsers/pageParser.py index 8f6b2ec..ddd9f8b 100644 --- a/cheinsteinpy/parsers/pageParser.py +++ b/cheinsteinpy/parsers/pageParser.py @@ -27,6 +27,4 @@ def parsePage(data, isChapter): return questionjson, solutionjson else: soup = bs(data, "html.parser") - questionhtml = soup.find("div", {"class": "question-body-text"}) - answerhtml = soup.find("div", {"class": "answer-given-body"}) - return questionhtml, answerhtml \ No newline at end of file + return soup \ No newline at end of file diff --git a/cheinsteinpy/requestPage.py b/cheinsteinpy/requestPage.py index 85d4e37..07b79ac 100644 --- a/cheinsteinpy/requestPage.py +++ b/cheinsteinpy/requestPage.py @@ -77,7 +77,7 @@ def requestGraphQl(cookie, userAgent, data): } url = "https://gateway.chegg.com/one-graph/graphql" response = requests.post(url=url, headers=headers, json=data, data=None) - return 
response.text + return response.json() def requestEnhancedAnswer(url, cookie, userAgent, html): headers = { From 79c3b616e1d1c3cac43294435bf5f15cc1ae567c Mon Sep 17 00:00:00 2001 From: lina Date: Wed, 16 Nov 2022 10:15:09 -0700 Subject: [PATCH 4/6] fix: question grabbing --- cheinsteinpy/api.py | 20 +++++++++++++++++--- cheinsteinpy/parsers/questionParser.py | 14 +++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/cheinsteinpy/api.py b/cheinsteinpy/api.py index 30493bd..5c8ef2a 100644 --- a/cheinsteinpy/api.py +++ b/cheinsteinpy/api.py @@ -108,17 +108,31 @@ def question(url, cookie, userAgent): """ cookieStr = cookieParser.parseCookie(cookie) isChapter = pageParser.checkLink(url)["isChapter"] - htmlData = requestPage.requestWebsite(url, cookieStr, userAgent) if isChapter: # await asyncio.sleep(6) + htmlData = requestPage.requestWebsite(url, cookieStr, userAgent) htmlRaw = requestPage.requestChapter(url, cookieStr, userAgent, htmlData) else: - htmlRaw = htmlData + qid = pageParser.getId(url) + data = { + "operationName":"QnaPageQuestionByLegacyId", + "variables": { + "id":int(qid) + }, + "extensions": { + "persistedQuery": { + "version":1, + "sha256Hash":"26efed323ef07d1759f67adadd2832ac85d7046b7eca681fe224d7824bab0928" + } + } + } + raw = requestPage.requestGraphQl(cookieStr, userAgent, data) + htmlRaw = raw["data"]["questionByLegacyId"]["content"]["body"] dataRaw = pageParser.parsePage(htmlRaw, isChapter) if isChapter: data = dataRaw[0] else: - data = dataRaw[0] + data = dataRaw parsedQuestion = questionParser.getQuestion(data, isChapter) if isChapter: question = parsedQuestion diff --git a/cheinsteinpy/parsers/questionParser.py b/cheinsteinpy/parsers/questionParser.py index ecda5cc..ad50ed0 100644 --- a/cheinsteinpy/parsers/questionParser.py +++ b/cheinsteinpy/parsers/questionParser.py @@ -32,9 +32,17 @@ def getQuestion(dataRaw, isChapter): img = dataRaw.find_all("img") for i in img: url = i["src"] - i.replace_with(url) + 
i.replace_with(" " + url + " ") + if "<div>" or "</div>" in str(dataRaw): + div = dataRaw.find_all("div") + for i in div: + i.replaceWithChildren() + if "
" in str(dataRaw): + br = dataRaw.find_all("br") + for i in br: + i.replace_with("") questionList = [] - for k in dataRaw.contents[1:-1]: + for k in dataRaw.contents: txt = k.text questionList.append(txt) questionList = [x for x in questionList if x] @@ -42,5 +50,5 @@ def getQuestion(dataRaw, isChapter): questionList = questionList[1:] if questionList[-1] == "\n": questionList = questionList[:-1] - questionList = " ".join(questionList) + questionList = (" ".join((" ".join(questionList)).split(" "))).strip() return questionList \ No newline at end of file From 8d095d92e8776af433c9518fa2406c5e16ba73c4 Mon Sep 17 00:00:00 2001 From: lina Date: Wed, 16 Nov 2022 10:21:54 -0700 Subject: [PATCH 5/6] chore: update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2eba3dc..a21b146 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="cheinsteinpy", - version="0.1.6", + version="0.1.7", author="jckli & okinahiru", description="A Python library to get information from Chegg.", long_description=long_description, From 7cb789e8288fd936f16b5b112e1cfa0a9851ac1a Mon Sep 17 00:00:00 2001 From: lina Date: Fri, 2 Dec 2022 18:07:26 -0700 Subject: [PATCH 6/6] chore: add some files --- Pipfile | 13 +++++++++ Pipfile.lock | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++ test.py | 30 ++++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 test.py diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..ee5fd68 --- /dev/null +++ b/Pipfile @@ -0,0 +1,13 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +beautifulsoup4 = "*" +requests = "*" + +[dev-packages] + +[requires] +python_version = "3.10" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..336588c --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,77 @@ +{ + "_meta": { + "hash": { + "sha256": 
"73352366ebbba8a1b3952ae48124bd44d7414d5f62dd76c10b00d5772e972461" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.10" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "beautifulsoup4": { + "hashes": [ + "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", + "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" + ], + "index": "pypi", + "version": "==4.11.1" + }, + "certifi": { + "hashes": [ + "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14", + "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382" + ], + "markers": "python_version >= '3.6'", + "version": "==2022.9.24" + }, + "charset-normalizer": { + "hashes": [ + "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", + "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2.1.1" + }, + "idna": { + "hashes": [ + "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", + "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" + ], + "markers": "python_version >= '3.5'", + "version": "==3.4" + }, + "requests": { + "hashes": [ + "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", + "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" + ], + "index": "pypi", + "version": "==2.28.1" + }, + "soupsieve": { + "hashes": [ + "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", + "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" + ], + "markers": "python_version >= '3.6'", + "version": "==2.3.2.post1" + }, + "urllib3": { + "hashes": [ + "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e", + "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997" + ], + "markers": 
"python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'", + "version": "==1.26.12" + } + }, + "develop": {} +} diff --git a/test.py b/test.py new file mode 100644 index 0000000..101bacd --- /dev/null +++ b/test.py @@ -0,0 +1,30 @@ +from cheinsteinpy.parsers import pageParser, cookieParser +from cheinsteinpy import requestPage, api + +link = "https://www.chegg.com/homework-help/questions-and-answers/question-1-die-rolled-10-times-let-x-number-times-six-appears-10-rolls-part-probability-4--q63245414" +userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" +with open("cookie.txt", 'r') as f: + cookieTxt = f.read() + +# cookieStr = cookieParser.parseCookie(cookieTxt) + +answer = api.question(link, cookieTxt, userAgent) +print(answer) +""" +payload = { + "operationName":"QnaPageQuestionByLegacyId", + "variables": { + "id":88129188 + }, + "extensions": { + "persistedQuery": { + "version":1, + "sha256Hash":"26efed323ef07d1759f67adadd2832ac85d7046b7eca681fe224d7824bab0928" + } + } +} + +#data = requestPage.requestWebsite(link, cookieStr, userAgent) +data = requestPage.requestGraphQl(cookieStr, userAgent, payload) +print(data) +""" \ No newline at end of file