-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsucharita_comments_extraction_code.py
53 lines (47 loc) · 1.87 KB
/
sucharita_comments_extraction_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# pip install selenium
# pip install beautifulsoup4
# pip install webdriver-manager
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
def ScrapComment(url):
    """Scrape the title and comments of a YouTube video page.

    Opens *url* in headless Firefox, scrolls down in 200-px steps so the
    page lazy-loads its comments, parses the fully rendered HTML with
    BeautifulSoup, prints the title and comment list, and writes the
    comments to 'scrappedfile_sucharita.csv' (overwritten on each call).

    Args:
        url: Full URL of the YouTube video to scrape.
    """
    option = webdriver.FirefoxOptions()
    option.add_argument("--headless")
    # Selenium 4 removed the `executable_path` keyword argument; the
    # driver binary path must now be wrapped in a Service object.
    driver = webdriver.Firefox(
        service=Service(GeckoDriverManager().install()), options=option
    )
    try:
        driver.get(url)
        prev_h = 0
        while True:
            # Ask the page for its current total height; it grows as
            # YouTube lazy-loads more comments while we scroll.
            height = driver.execute_script("""
                function getActualHeight() {
                    return Math.max(
                        Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
                        Math.max(document.body.offsetHeight, document.documentElement.offsetHeight),
                        Math.max(document.body.clientHeight, document.documentElement.clientHeight)
                    );
                }
                return getActualHeight();
            """)
            driver.execute_script(f"window.scrollTo({prev_h},{prev_h + 200})")
            # fix the time sleep value according to your network connection
            time.sleep(1)
            prev_h += 200
            if prev_h >= height:
                break
        page_source = driver.page_source
    finally:
        # Always release the browser, even if scrolling/loading fails.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    title_text_div = soup.select_one('#container h1')
    # select_one returns None when the selector matches nothing; keep
    # title as None in that case instead of raising AttributeError.
    title = title_text_div and title_text_div.text
    comment_div = soup.select("#content #content-text")
    comment_list = [x.text for x in comment_div]
    print(title, comment_list)
    # Avoid shadowing the builtin `dict`; index=False keeps the CSV
    # free of a spurious row-number column.
    df = pd.DataFrame({"Comments": comment_list})
    df.to_csv('scrappedfile_sucharita.csv', index=False)
if __name__ == "__main__":
    # Candidate videos; only the first one is scraped, since each run
    # overwrites the same output CSV.
    video_urls = [
        "https://www.youtube.com/watch?v=lPvXZz7m9sI",
        "https://www.youtube.com/watch?v=uEawmeO2gOY",
    ]
    ScrapComment(video_urls[0])