
It may help! #2

Open
Mallikarjun2k opened this issue Jan 2, 2022 · 1 comment

@Mallikarjun2k

import csv
import os
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_links(tag, suffixes):
    # Collect article links from a tag's main page and its archive sub-pages.
    url = 'https://medium.com/tag/' + tag
    urls = [url + '/' + s for s in suffixes]
    links = []
    for url in urls:
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')
        articles = soup.find_all('div', {'class': 'postArticle-readMore'})
        for i in articles:
            links.append(i.a.get('href'))
    return links

def get_article(links):
    # Download each article and extract its title, author, and body text.
    articles = []
    for link in links:
        try:
            article = {}
            data = requests.get(link)
            soup = BeautifulSoup(data.content, 'html.parser')

            title = soup.find_all('title')[0].get_text()
            article['title'] = unicodedata.normalize('NFKD', title)

            author = soup.find_all('meta', {'name': 'author'})[0].get('content')
            article['author'] = unicodedata.normalize('NFKD', author)

            # Concatenate every <p> tag into one newline-separated text field.
            paras = soup.find_all('p')
            text = ''
            nxt_line = '\n'
            for para in paras:
                text += unicodedata.normalize('NFKD', para.get_text()) + nxt_line
            article['text'] = text

            articles.append(article)

        except KeyboardInterrupt:
            print('Exiting')
            os._exit(0)
        except Exception:
            # Skip articles that fail to load or lack a title/author tag.
            continue

    return articles

def save_articles(articles, csv_file, is_write=True):
    # 'w' starts a fresh file and writes the header; 'a+' appends without one.
    csv_columns = ['title', 'author', 'text']
    print(csv_file)
    mode = 'w' if is_write else 'a+'
    with open(csv_file, mode, encoding='UTF-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='|')
        if is_write:
            writer.writeheader()
        for data in articles:
            writer.writerow(data)

if __name__ == '__main__':
    is_write = True
    tags = input('Write tags in space separated format.\n').split(' ')
    file_name = input('Write destination file name.\n')
    if len(file_name.split('.')) == 1:
        file_name += '.csv'
    suffixes = ['', 'latest', 'archive/2000',
                'archive/2010', 'archive/2011', 'archive/2012', 'archive/2013',
                'archive/2014', 'archive/2015', 'archive/2016', 'archive/2017',
                'archive/2018']
    for tag in tags:
        links = get_links(tag, suffixes)
        articles = get_article(links)
        save_articles(articles, file_name, is_write)
        is_write = False
    # Remove duplicate rows; the file was written with '|' as the delimiter.
    articles = pd.read_csv(file_name, delimiter='|')
    articles = articles.drop_duplicates()
    articles.to_csv(file_name, sep='|', index=False)
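For reference, the script is interactive: it prompts for space-separated tags and a destination file name (appending .csv if no extension is given), scrapes each tag's main, latest, and yearly archive pages, appends everything to one pipe-delimited CSV, and finally deduplicates the rows with pandas.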

@danielkang1296

How do I correctly read the file produced by the scraping process? Please help me.
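In case it helps with reading the output: the script writes the CSV with delimiter='|', so the default comma separator won't parse it. A minimal sketch, assuming the output was saved as medium_articles.csv (hypothetical name):

import pandas as pd

# The scraper writes '|'-separated columns: title, author, text.
# Pass sep='|' explicitly; the default comma would mis-parse the rows.
articles = pd.read_csv('medium_articles.csv', sep='|')  # hypothetical file name

print(articles.columns.tolist())  # expected: ['title', 'author', 'text']
print(articles['title'].head())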
