Skip to content

Commit

Permalink
minor fixes + refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
nikitajz committed Sep 23, 2017
1 parent 9de0d84 commit fd38a12
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 6 deletions.
3 changes: 2 additions & 1 deletion sandbox.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from bs4 import BeautifulSoup
import requests

u = "http://03online.com/news/bespokoit_allergiya/2017-9-22-337414"
# u = "http://03online.com/news/bespokoit_allergiya/2017-9-22-337414"
u = "http://03online.com/news/nizkiy_ig_e_no_vysypaniya_ne_prohodyat/2017-9-22-337601"
rm = requests.get(u)
soup = BeautifulSoup(rm.text, 'html.parser')

Expand Down
1 change: 1 addition & 0 deletions sandbox_question_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
rm = requests.get(u)
soup = BeautifulSoup(rm.text, 'html.parser')


def get_question_links_per_page(tag):
"""
Given the area tag return list of links for all questions on this page.
Expand Down
26 changes: 21 additions & 5 deletions scrape03online.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def __init__(self, question_block):
self.chronic_condition = content.select('div.extra-info.top > div')[3].select('i')[0].text
self.text = content.select('div.text')[0].text
self.cat_title = question_block.select('div.question-content > div.extra-info.bottom > div.cat > a')[0].text
self.cat_link = question_block.select('div.question-content > div.extra-info.bottom > div.cat > a')[0]['href']
self.cat_link = the_url + question_block.select('div.question-content > div.extra-info.bottom > div.cat > a')[0]['href']
self.published_date = question_block.select('div.question-content > div.extra-info.bottom > div.cat > a')[1]['href'].lstrip('/news/')


Expand Down Expand Up @@ -113,9 +113,9 @@ def get_question_links_per_page(tag):
for p in range(1, max_page + 1):
u = link.replace('/1-', '/{}-'.format(p))
print(u)
rq = requests.get(u) # request questions block page
rqb = requests.get(u) # request questions block page
# print(rq.ok)
sq = BeautifulSoup(rq.text, 'html.parser')
sq = BeautifulSoup(rqb.text, 'html.parser')
question_links = get_question_links_per_page(sq)
time.sleep(timeout)
return question_links
Expand All @@ -127,13 +127,29 @@ def get_question_links_per_page(tag):
v['question_links'] = get_question_links(v['link'], v['max_page'])
doctors[d] = v

with open('doctors_basic.json', 'w') as outfile:
json.dump(doctors, outfile)


def scrape_question(page_text):
    """
    Parse the HTML of a single question page into a Question.

    :param page_text: str, HTML text of the question page
    :return: Question built from the first question block found on the page
    :raises IndexError: if the page contains no question block
    """
    sq = BeautifulSoup(page_text, 'html.parser')
    question_blocks = sq.select("#content_main > div.divide-block.question-block")
    if not question_blocks:
        # Same exception type the bare [0] lookup would raise, but with a
        # message that says what was missing.
        raise IndexError("no question block found in the page")
    return Question(question_blocks[0])


# For every entry in `doctors`, download each linked question page, parse it
# and store the resulting Question objects under the entry's 'questions' key.
for d, v in doctors.items():
    questions = []
    for l in v['question_links']:
        print("Collection questions for page", l)
        rq = requests.get(l)
        # Parse the page that was just fetched and keep the result; it is a
        # single Question, so it is appended rather than extended.
        questions.append(scrape_question(rq.text))
    v['questions'] = questions
    doctors[d] = v
Expand Down
25 changes: 25 additions & 0 deletions test_scraping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from scrape03online import scrape_question
import unittest


class TestClass(unittest.TestCase):
    """Checks scrape_question against a question page stored on disk."""

    def test_question_scraping(self):
        """Parse the stored page and compare its title, name and text."""
        print("Reading question page file.")
        # The fixture holds Cyrillic text; decode it explicitly so the result
        # does not depend on the platform's default encoding.
        # NOTE(review): assumes the fixture was saved as UTF-8 - confirm.
        with open('resources/allergolog_1-0-23.html', encoding='utf-8') as f:
            page = f.read()

        title = 'Низкий ig e, но высыпания не проходят'
        name = 'Татьяна, Г. Рыбное'
        text = 'Здравствуйте, ребенку 5 лет часто высыпания на коже чаще под коленками, на попе, кожа шершавая, после антигистаминных стихает, но последнее время не проходит, и ещё вокруг глаз. Сдали анализы на некоторых паразитов отрицательно, и на общий ig e результат 4,21 ке/л референсные значения 0,00-52,00. Не низкий ли уровень ig e, и почему не стихает дерматит?'

        # NOTE(review): the expected values below are declared but not yet
        # asserted; add assertions once they are verified against the fixture.
        sex = 'Мужской'
        age = '5 лет'
        chronic_condition = 'не указаны'
        cat_title = 'Аллерголог'
        cat_link = 'http://03online.com/news/allergolog/1-0-23'
        published_date = '2017-09-22'

        print("Scraping the page")
        q = scrape_question(page)
        self.assertEqual(title, q.title)
        self.assertEqual(name, q.name)
        self.assertEqual(text, q.text)

0 comments on commit fd38a12

Please sign in to comment.