markdown.py
import os
import argparse
from pathlib import Path
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from dotenv import load_dotenv, find_dotenv

# Load the OpenAI credentials and model name from a local .env file.
_ = load_dotenv(find_dotenv())
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
MODEL = os.environ['MODEL']

# Short words that stay lowercase when title-casing chapter names.
LOWERCASE_WORDS = {"in", "with", "the", "a", "an", "and", "but", "or", "nor", "for", "on", "at", "to", "by", "from", "of", "not"}
def should_include_title(title):
    """Skip front/back-matter sections that are not real chapters."""
    exclude_words = ["contents", "guide", "dedication", "preface", "acknowledgments", "outline", "revision", "who should read", "references", "o’reilly", "how to contact us"]
    return not any(title.lower().startswith(word) for word in exclude_words)
def correct_case(title):
    """Title-case an underscore-separated title, keeping short connecting words lowercase."""
    words = title.split('_')
    corrected_words = []
    for i, word in enumerate(words):
        if i == 0 or word.lower() not in LOWERCASE_WORDS:
            corrected_words.append(word.capitalize())
        else:
            corrected_words.append(word.lower())
    return ' '.join(corrected_words)
def create_book_dir(path):
    book_name = os.path.splitext(os.path.basename(path))[0]
    book_dir = os.path.join(os.getcwd(), "books", book_name)
    os.makedirs(book_dir, exist_ok=True)
    return book_dir
def summarize_chapter(chapter_text, prompt_text):
    # Split the chapter into token-bounded chunks so it fits the model's context window.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=MODEL,
        chunk_size=10000,
        chunk_overlap=100,
    )
    docs = text_splitter.create_documents([chapter_text])
    llm = ChatOpenAI(model=MODEL, temperature=0, openai_api_key=OPENAI_API_KEY)
    # prompt_text must contain a {context} placeholder; create_stuff_documents_chain
    # fills it with the concatenated chunk contents.
    prompt = ChatPromptTemplate.from_messages(
        [("system", prompt_text)]
    )
    chain = create_stuff_documents_chain(llm, prompt)
    # Stream the summary token by token so progress is visible as it is generated.
    summary = ""
    for token in chain.stream({"context": docs}):
        summary += token
        print(token, end="", flush=True)
    return summary
def extract_chapters(path):
    metadata = {
        "source": str(path.name),
        "path": str(path),
        "created": path.stat().st_ctime,
        "last_modified": path.stat().st_mtime,
        "last_accessed": path.stat().st_atime
    }
    with open(str(path), encoding="UTF-8") as f:
        text = f.read()
    doc = Document(page_content=text, metadata=metadata)
    # Split on level-1 and level-2 Markdown headers; each section keeps its header text as metadata.
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2")
        ]
    )
    chapters = []
    sections = markdown_splitter.split_text(doc.page_content)
    for section in sections:
        section.metadata.update(doc.metadata)
        if "Header 1" in section.metadata:
            section.metadata["Title"] = section.metadata["Header 1"]
        elif "Header 2" in section.metadata:
            section.metadata["Title"] = section.metadata["Header 2"]
        title = section.metadata.get("Title", "").strip()
        if title and should_include_title(title):
            # Merge sections that share a title into a single chapter document.
            existing_chapter = next((chapter for chapter in chapters if chapter.metadata["Title"] == title), None)
            if existing_chapter:
                existing_chapter.page_content += "\n\n" + section.page_content
            else:
                chapters.append(section)
    return chapters
def list_chapters(path):
    chapters = extract_chapters(path)
    for i, chapter in enumerate(chapters, 1):
        chapter_title = correct_case(chapter.metadata["Title"].replace(' ', '_'))
        print(f"{i}. {chapter_title}")
def summarize(path, chapter_number=None):
    book_dir = create_book_dir(path)
    chapters = extract_chapters(path)
    # The summarization prompt lives in prompt.txt next to the script.
    with open("prompt.txt", 'r', encoding='utf-8') as file:
        prompt_text = file.read()
    for i, chapter in enumerate(chapters, 1):
        chapter_title = correct_case(chapter.metadata["Title"].replace(' ', '_'))
        if chapter_number and chapter_number != i:
            continue
        print(f"Summarizing: {chapter_title}")
        summary = summarize_chapter(chapter.page_content, prompt_text)
        chapter_file_name = f"Chapter_{i}_{chapter_title.replace(' ', '_')}.md"
        chapter_path = os.path.join(book_dir, chapter_file_name)
        with open(chapter_path, 'w', encoding='utf-8') as f:
            f.write(f"# {chapter_title}\n\n")
            f.write(summary)
        print(f"\n\nSaved summary to: {chapter_path}\n")
        # When summarizing the whole book, ask before moving on to the next chapter.
        if not chapter_number:
            continue_summarizing = input("Do you want to continue with the next chapter? (y/n): ")
            if continue_summarizing != 'y':
                break
    print("\nSummarization complete.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Tool for managing and summarizing Markdown files.")
parser.add_argument("--list-chapters", type=str, help="List all chapters in the Markdown file")
parser.add_argument("--summarize", type=str, help="Summarize one or all chapters from the Markdown file")
parser.add_argument("--chapter", type=int, help="Number of the specific chapter to summarize")
args = parser.parse_args()
if args.list_chapters:
list_chapters(Path(args.list_chapters))
elif args.summarize:
summarize(Path(args.summarize), args.chapter)
else:
parser.print_help()
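
# Example usage (a sketch; "book.md" is a placeholder for any book exported to
# Markdown, and the script assumes a .env file providing OPENAI_API_KEY and MODEL
# plus a prompt.txt containing the {context} placeholder expected by
# create_stuff_documents_chain):
#
#   python markdown.py --list-chapters book.md
#   python markdown.py --summarize book.md                # asks before each next chapter
#   python markdown.py --summarize book.md --chapter 3    # summarize only chapter 3
#
# Summaries are written to ./books/<book_name>/Chapter_<n>_<Title>.md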