# helpers.py
import urllib.request, urllib.parse, urllib.error
import nltk
import re
from bs4 import BeautifulSoup
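# Note: nltk.sent_tokenize / nltk.pos_tag below rely on nltk data that has to
# be downloaded once, e.g.:
#   nltk.download('punkt')
#   nltk.download('averaged_perceptron_tagger')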
# get the copy from a given url
# tokenize the content using nltk
# tags = ["JJ", "NN", "NNS", "RBR", "RBS", "VBN", "NPP", "NNP"]
tags = ['NNP']
tags_danish = ['JJ', 'NN', 'NNS', 'VBP', 'VB']
# get site markup
def get_site_markup(url):
    markup = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(markup, 'html.parser')
    # remove header and footer copy as it doesn't have brandish copy
    # todo: remove tags that have footer
    if soup.find('footer') is not None:
        soup.find('footer').decompose()
    if soup.find('header') is not None:
        soup.find('header').decompose()
    # tags = soup(['h1', 'h2', 'h3', 'h4', 'h5', 'p', 'a'])
    copy = soup.get_text(' ', strip=True)
    return copy
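# Illustrative usage (example.com is only a placeholder URL):
#   get_site_markup('https://example.com')
#   -> a single whitespace-joined string of the page's visible text,
#      with any <header> and <footer> content stripped out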
social = ['email', 'facebook', 'instagram']
business = ['subscribe', 'login', 'menu', 'wishlist', 'faq', 'connect', 'account', 'company', 'tweet', 'checkout', 'free', 'total', 'payments', 'customer', 'cart', 'buy', 'policy', 'return', 'sale', 'customerservice', 'shipping', 'chat', 'purchase', 'privacy', 'inc', 'ordering']
dev = ['error', 'submit', 'sitemap']
# strip everything that isn't a letter or whitespace, then remove the
# boilerplate words above (matched case-insensitively, as substrings)
def clean_copy(copy):
    cleaned = re.sub(r'[^a-zA-Z\s]', ' ', copy)
    for w in social + business + dev:
        cleaned = re.sub(w, ' ', cleaned, flags=re.IGNORECASE)
    return cleaned
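# Illustrative behaviour (spacing shown loosely; every stripped character or
# boilerplate word becomes a single space):
#   clean_copy('Subscribe to Acme! 20% off your cart.')
#   -> roughly ' to Acme  off your  '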
# take copy, split it into sentences, word-tokenize them, and POS-tag each word
def tokenize(copy):
    ss = nltk.sent_tokenize(copy)
    tokenized_sent = [nltk.word_tokenize(sent) for sent in ss]
    pos_sentences = [nltk.pos_tag(sent) for sent in tokenized_sent]
    return pos_sentences
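# Illustrative shape of the return value (exact tags depend on the nltk
# tagger; 'Acme' is just placeholder copy):
#   tokenize("Acme builds rockets. Visit Acme today.")
#   -> one list per sentence, each a list of (word, POS) tuples, e.g.
#      [[('Acme', 'NNP'), ('builds', 'VBZ'), ('rockets', 'NNS'), ('.', '.')],
#       [('Visit', 'VB'), ('Acme', 'NNP'), ('today', 'NN'), ('.', '.')]]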
# keep the unique, lowercased words whose POS tag is in the selection set
def match_tokens(token_array, nltk_selection_set=['NNP']):
    copy_set = set()
    for token_set in token_array:
        for token in token_set:
            if token[1] in nltk_selection_set:
                word = token[0].strip().lower()
                copy_set.add(word)
    copy = list(copy_set)
    return copy
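# Illustrative chain (the result order is not guaranteed, it comes from a set):
#   match_tokens(tokenize("Acme builds rockets. Visit Acme today."), ['NNP'])
#   -> e.g. ['acme']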
# split copy into number_para paragraphs of para_size words each,
# repeating the copy when there aren't enough words to fill them
def make_para(copy, para_size, number_para):
    copyArr = copy.split(' ')
    para_size = int(para_size)
    number_para = int(number_para)
    total = number_para * para_size
    # create enough copy by repeating what we already have
    while total > len(copyArr):
        copyArr = copyArr + copyArr
    # parse into the requested number of paragraphs of the defined length
    paras = []
    para = 0
    for i in range(number_para):
        paras.append(' '.join(copyArr[para: para + para_size]))
        para = para + para_size
    return paras
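# Illustrative output:
#   make_para('alpha beta gamma', 2, 3)
#   -> ['alpha beta', 'gamma alpha', 'beta gamma']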
# Get root domain
# def url_set_to_scrape(base_url):
#     # store of previously searched urls
#     searched = []
#     # get domain url : https://example.com/page/100 -> https://example.com
#     domain = re.findall(r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}', base_url)
#     get_links(base_url, domain[0])
# # get link on given page
# def get_links(url, domain_url):
#     links_on_page = []
#     markup = urllib.request.urlopen(url).read()
#     soup = BeautifulSoup(markup, 'html.parser')
#     tags = soup('a')
#     for tag in tags:
#         if domain_url in tag:
#             links_on_page.append(tag)
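# A minimal end-to-end sketch of how these helpers compose; the URL and the
# paragraph sizes are placeholders, not part of the original module, and the
# call needs network access plus the nltk data noted above.
if __name__ == '__main__':
    raw = get_site_markup('https://example.com')
    words = match_tokens(tokenize(clean_copy(raw)), tags)
    # stitch the matched words into 2 paragraphs of 30 words each
    print(make_para(' '.join(words), 30, 2))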