-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWords Cloud Graph Code
40 lines (32 loc) · 1.29 KB
/
Words Cloud Graph Code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import PyPDF2
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt
# Newline-separated list of PDF file paths to process (one path per line).
file_paths_str = r"C:/Users/DELL/Desktop/Research Data/2019.1.30.pdf"


def parse_file_paths(paths_str):
    """Split a newline-separated string into a list of file paths.

    Each line is stripped of surrounding whitespace (including a stray
    carriage return) and blank lines are skipped, so a trailing newline
    never produces an empty path that would fail in ``open()``.

    Args:
        paths_str: One or more file paths separated by newlines.

    Returns:
        The non-empty paths, in their original order.
    """
    stripped_lines = (line.strip() for line in paths_str.split('\n'))
    return [path for path in stripped_lines if path]


def extract_pdf_text(pdf_file_path):
    """Return the text of every page of one PDF, joined by single spaces.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts. Pages are separated by a space so the
        last word of one page does not fuse with the first word of the next.
    """
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ''` guards against a page that yields no text at all.
        # The join runs inside the `with` so the file is still open while
        # the pages are being read.
        return ' '.join(page.extract_text() or '' for page in pdf_reader.pages)


def main():
    """Build a word cloud from the configured PDFs and display it."""
    file_paths = parse_file_paths(file_paths_str)

    # Collect the text of all files; join once instead of repeated `+=`.
    all_text = ' '.join(extract_pdf_text(path) for path in file_paths)

    # Clean up the text data (further processing can be done as needed).
    all_text = all_text.replace('\n', ' ')

    if not all_text.strip():
        # Fail with a clear message rather than an error from deep inside
        # the word-cloud library when there is nothing to plot.
        raise SystemExit("No text could be extracted from the given PDF file(s).")

    # Generate a word cloud, ignoring common words and Twitter/URL residue.
    stop_words = set(STOPWORDS) | {"https", "co", "RT"}
    wordcloud = WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
        stopwords=stop_words,
    ).generate(all_text)

    # Display the word cloud.
    plt.figure()
    # NOTE(review): the title says 2024 but the input file name is dated
    # 2019 - confirm which year is intended.
    plt.title("Word Cloud for 2024")
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


if __name__ == "__main__":
    main()