from urllib import request
import json, re, os, mechanicalsoup
from datetime import datetime
import lxml
from lxml import html
import requests
from defs import ServerRefusal
from urllib.parse import urlparse, parse_qs
from bs4 import UnicodeDammit
import yaml
__logged_in__ = False
username = None
password = None
name_in_archive = False
index_html_generation = False
premium = False
browser = mechanicalsoup.Browser(
    #soup_config={"features":"html.parser"}, Maybe don't use the soup TODO try this with those bad outlines
    user_agent="Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
)
with open(r'config.yaml') as file:
    config_yaml = yaml.load(file, Loader=yaml.FullLoader)
    username = config_yaml['username']
    password = config_yaml['password']
    premium = config_yaml['premium']
    name_in_archive = config_yaml['name_in_archive']
    index_html_generation = config_yaml['index_html_generation']
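#The keys read above imply a config.yaml shaped roughly like this
#(a sketch with placeholder values, not taken from the repo):
#  username: your_writing_com_username
#  password: your_writing_com_password
#  premium: false
#  name_in_archive: false
#  index_html_generation: true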
'''Saves session cookies (run once you have a logged-in account)'''
def __save_session():
    print("Saving session.")
    #Dump the cookie jar as a JSON dictionary
    dump = json.dumps(browser.session.cookies.get_dict())
    f = open('session','w+')
    f.writelines(dump)
    f.close()
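#The resulting 'session' file is a single line of JSON mapping cookie names
#to values, e.g. {"cookie_name": "cookie_value"} (names illustrative).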
'''Attempts to resume the session cookies from before. If no cookies exist, log in.'''
def __reload_session():
    if os.path.isfile('session'):
        #Open session file
        f = open('session','r+')
        pmud = json.loads(f.readline())
        #Iterate the K/V pairs back into cookies (will overwrite)
        for key,value in pmud.items():
            browser.session.cookies.set(key,value)
        f.close()
        #TODO can maybe check if this comes with a login form
        #Disable the dynamic interactives just in case
        if premium:
            browser.get("https://www.writing.com/main/my_account?action=set_q_i3&ajax=setDynaOffOn&val=-1")
    else:
        __log_in()
'''Logs in to writing.com and saves the session cookies.'''
def __log_in():
    global username, password, browser
    print("Logging in...")
    #Clear the browser by recreating it
    browser = mechanicalsoup.Browser(
        user_agent="Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0"
    )
    login_p = browser.get('https://www.writing.com/main/login.php')
    login_form = login_p.soup.find("form",method="post",action="https://www.Writing.Com/main/login.php")
    username_form = login_form.find("input",type="text")
    password_form = login_form.find("input",type="password")
    username_form['value'] = username
    password_form['value'] = password
    landing_p = browser.submit(login_form,login_p.url)
    #If the landing page still contains a login form, the login failed
    if landing_p.soup.find("form",method="post",action="https://www.Writing.Com/main/login.php") is not None:
        print("Error logging in. Invalid credentials?")
        return
    print("Submitted login form.")
    #Disable the dynamic interactives just in case
    if premium:
        browser.get("https://www.writing.com/main/my_account?action=set_q_i3&ajax=setDynaOffOn&val=-1")
    #Assume login successful.
    __save_session()
'''Reload the session upon import!'''
__reload_session()
'''A copy of get_page without retrying and without utf-8 decoding, since we're only interested in links anyway. Required because the search page will break apart multibyte utf-8 codes in the middle.'''
def get_page_search(url):
    response = browser.get(url)
    tree = html.fromstring(response.content.decode("latin-1"))
    return tree
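#Note: latin-1 maps every possible byte to a character, so the decode above
#can never raise on the broken multibyte sequences these search pages produce.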
'''Uses the soup browser with the logged-in session to return an xpathable tree.'''
def get_page(url, encoding="utf-8"):
    #Try 5 times; the requests can sometimes fail with requests.exceptions.ConnectionError
    for tries in range(5):
        try:
            response = browser.get(url)
            #If not logged in (the page contains a login form)
            #TODO Lock this? Multiple scrapers will trigger this at a time, and all try to log in
            if response.soup.find("form",method="post",action="https://www.Writing.Com/main/login.php") is not None:
                #Log in and try again
                __log_in()
                continue
            break
        except requests.exceptions.ConnectionError as e:
            pass
        if (tries == 4):
            raise ServerRefusal('Could not connect to server after 5 attempts')
    #html.fromstring will not output unicode if the entire response isn't unicode
    tree = html.fromstring(detwingle(response.content))
    #Handle all of writing.com's redirect links here
    links = tree.xpath("//a[starts-with(@href, 'https://www.Writing.Com/main/redirect')]")
    for link in links:
        parse = urlparse(link.attrib['href'])
        query = parse_qs(parse.query)
        link.attrib['href'] = query['redirect_url'][0]
    return tree
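#Example usage (hypothetical item URL and xpath, purely illustrative):
#  tree = get_page("https://www.writing.com/main/view_item/item_id/1234567")
#  title = tree.xpath("//title/text()")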
#Fixing another BS4 bug: it maps the wrong character for á
#https://groups.google.com/g/beautifulsoup/c/H5E660vcYl4/m/UwXgO1rBHwAJ
UnicodeDammit.WINDOWS_1252_TO_UTF8[0xe1] = b'\xc3\xa1'
#Taken from BS4.
#Hacked a bit to handle codes like \xe9 "é", which would trigger the multi-byte
#logic but isn't actually multi-byte, by checking that the following bytes are
#valid UTF-8 continuation bytes.
def detwingle(in_bytes):
    byte_chunks = []
    chunk_start = 0
    pos = 0
    while pos < len(in_bytes):
        byte = in_bytes[pos]
        actually_unicode = False
        if (byte >= UnicodeDammit.FIRST_MULTIBYTE_MARKER
            and byte <= UnicodeDammit.LAST_MULTIBYTE_MARKER):
            # This might be the start of a UTF-8 multibyte character.
            # Skip to the end if so.
            for start, end, size in UnicodeDammit.MULTIBYTE_MARKERS_AND_SIZES:
                if byte >= start and byte <= end:
                    #My hack: make sure the next bytes really are UTF-8
                    #continuation bytes (0x80-0xbf); a bounds check guards
                    #against an IndexError on a truncated sequence at the end
                    actually_unicode = True
                    for i in range(1, size):
                        if (pos + i >= len(in_bytes)
                            or not (in_bytes[pos + i] >= 0x80 and in_bytes[pos + i] < 0xc0)):
                            actually_unicode = False
                            break
                    if actually_unicode:
                        pos += size
                        break
        if actually_unicode:
            continue
        if byte >= 0x80 and byte in UnicodeDammit.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])
            # Now translate the Windows-1252 character into UTF-8
            # and add it as another chunk.
            byte_chunks.append(UnicodeDammit.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1
    if chunk_start == 0:
        # The string is unchanged.
        return in_bytes
    else:
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
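#Illustration of the assumed behavior: a lone Windows-1252 \xe9 is translated,
#while an already-valid UTF-8 sequence is left untouched:
#  detwingle(b"caf\xe9 caf\xc3\xa9") == b"caf\xc3\xa9 caf\xc3\xa9"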