minor fixes + refactoring

nikitajz · Sep 23, 2017 · fd38a12 · fd38a12
1 parent 9de0d84
commit fd38a12
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 6 deletions.
diff --git a/sandbox.py b/sandbox.py
@@ -1,7 +1,8 @@
 from bs4 import BeautifulSoup
 import requests
 
-u = "http://03online.com/news/bespokoit_allergiya/2017-9-22-337414"
+# u = "http://03online.com/news/bespokoit_allergiya/2017-9-22-337414"
+u = "http://03online.com/news/nizkiy_ig_e_no_vysypaniya_ne_prohodyat/2017-9-22-337601"
 rm = requests.get(u)
 soup = BeautifulSoup(rm.text, 'html.parser')
 

diff --git a/sandbox_question_block.py b/sandbox_question_block.py
@@ -6,6 +6,7 @@
 rm = requests.get(u)
 soup = BeautifulSoup(rm.text, 'html.parser')
 
+
 def get_question_links_per_page(tag):
     """
     Given the area tag return list of links for all questions on this page.

diff --git a/scrape03online.py b/scrape03online.py
@@ -83,7 +83,7 @@ def __init__(self, question_block):
         self.chronic_condition = content.select('div.extra-info.top > div')[3].select('i')[0].text
         self.text = content.select('div.text')[0].text
         self.cat_title = question_block.select('div.question-content > div.extra-info.bottom > div.cat > a')[0].text
-        self.cat_link = question_block.select('div.question-content > div.extra-info.bottom > div.cat > a')[0]['href']
+        self.cat_link = the_url + question_block.select('div.question-content > div.extra-info.bottom > div.cat > a')[0]['href']
         self.published_date = question_block.select('div.question-content > div.extra-info.bottom > div.cat > a')[1]['href'].lstrip('/news/')
 
 
@@ -113,9 +113,9 @@ def get_question_links_per_page(tag):
     for p in range(1, max_page + 1):
         u = link.replace('/1-', '/{}-'.format(p))
         print(u)
-        rq = requests.get(u)  # request questions block page
+        rqb = requests.get(u)  # request questions block page
         # print(rq.ok)
-        sq = BeautifulSoup(rq.text, 'html.parser')
+        sq = BeautifulSoup(rqb.text, 'html.parser')
         question_links = get_question_links_per_page(sq)
         time.sleep(timeout)
     return question_links
@@ -127,13 +127,29 @@ def get_question_links_per_page(tag):
     v['question_links'] = get_question_links(v['link'], v['max_page'])
     doctors[d] = v
 
+with open('doctors_basic.json', 'w') as outfile:  # snapshot of `doctors` (incl. question_links) written before the per-question scraping loop runs
+    json.dump(doctors, outfile)
+
+
+def scrape_question(page_text):
+    """
+    Parse the HTML of a single question page into a Question object.
+    :param page_text: str, HTML source of the question page
+    :return: Question
+    """
+
+    sq = BeautifulSoup(page_text, 'html.parser')
+    question_raw = sq.select("#content_main > div.divide-block.question-block")[0]  # first matching block; [0] raises IndexError when nothing matches
+    question = Question(question_raw)
+    return question
+
 
 for d, v in doctors.items():
     questions = []
     for l in v['question_links']:
         print("Collection questions for page", l)
-        question_tags = soup.select("#content_main > div.divide-block.question-block")[0]
-        q = Question(question_tags)
+        rq = requests.get(l)  # fetch the individual question page
+        q = scrape_question(rq.text)  # bind the parsed Question: `q` is used on the next line and was previously left undefined
         questions.extend(q)
         v['questions'] = questions
         doctors[d] = v

diff --git a/test_scraping.py b/test_scraping.py
@@ -0,0 +1,25 @@
+from scrape03online import scrape_question
+import unittest
+
+
+class TestClass(unittest.TestCase):
+    def test_question_scraping(self):  # parses a locally stored question page with scrape_question and compares selected fields
+        print("Reading question page file.")
+        with open('resources/allergolog_1-0-23.html') as f:  # local HTML fixture, so the test itself makes no HTTP request
+            page = f.read()
+
+        title = 'Низкий ig e, но высыпания не проходят'
+        name = 'Татьяна, Г. Рыбное'
+        sex = 'Мужской'  # NOTE(review): sex, age, chronic_condition, cat_title, cat_link and published_date are defined but never asserted below
+        age = '5 лет'
+        chronic_condition = 'не указаны'
+        text = 'Здравствуйте, ребенку 5 лет часто высыпания на коже чаще под коленками, на попе, кожа шершавая, после антигистаминных стихает, но последнее время не проходит, и ещё вокруг глаз. Сдали анализы на некоторых паразитов отрицательно, и на общий ig e результат 4,21 ке/л референсные значения 0,00-52,00. Не низкий ли уровень ig e, и почему не стихает дерматит?'
+        cat_title = 'Аллерголог'
+        cat_link = 'http://03online.com/news/allergolog/1-0-23'
+        published_date = '2017-09-22'
+
+        print("Scraping the page")
+        q = scrape_question(page)
+        self.assertEqual(title, q.title)  # only title, name and text are checked against the parsed Question
+        self.assertEqual(name, q.name)
+        self.assertEqual(text, q.text)