-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathprocess-pdf.rb
109 lines (98 loc) · 3.28 KB
/
process-pdf.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
require "fileutils"
require "json"
require "pathname"

require "pdf-reader"
# Renders the pages of a PDF to JPEG files by running the external
# `pdftoppm` command. The command line is echoed to stdout before it runs.
#
# @param pdf_file [#to_s] path of the PDF to render
# @param output_prefix [#to_s] path prefix handed to pdftoppm for the image files
# @return [nil]
# @raise [RuntimeError] if pdftoppm cannot be started or exits unsuccessfully
def generate_pdf_images(pdf_file, output_prefix)
  command = ["pdftoppm", "-jpeg", "-scale-to", "1280", pdf_file.to_s, output_prefix.to_s]
  puts command.join(" ")

  # Passing the arguments as a list bypasses the shell, so paths containing
  # spaces or shell metacharacters reach pdftoppm unchanged.
  return if system(*command)

  raise "pdftoppm failed for #{pdf_file}"
end
# Collects the URI hyperlinks attached to a page.
#
# @param page [PDF::Reader::Page] page whose :Annots entry is inspected
# @param reader [PDF::Reader] reader whose object table resolves references
# @return [Array<Hash>] one `{ rect:, url: }` hash per URI link annotation;
#   empty when the page has no annotations
def extract_links(page, reader)
  objects = reader.objects

  # page.attributes holds references to the actual annotation objects (and the
  # array itself may be a reference), so resolve everything before reading it.
  annotations = objects.deref(page.attributes[:Annots]) || []

  annotations
    .map { |entry| objects.deref(entry) }
    # Keep Link annotations only; skip entries that did not resolve to a dictionary.
    .select { |annotation| annotation.is_a?(Hash) && annotation[:Subtype] == :Link }
    # A Link may carry no action at all (e.g. a destination inside the document),
    # so the action has to be checked before its type is read.
    .map { |annotation| [annotation, objects.deref(annotation[:A])] }
    .select { |_annotation, action| action.is_a?(Hash) && action[:S] == :URI }
    # Extract the annotation's position and the target URL.
    .map do |annotation, action|
      { rect: objects.deref(annotation[:Rect]), url: objects.deref(action[:URI]) }
    end
end
# Converts a PDF rectangle into inline CSS offsets expressed as percentages
# of the page size.
#
# @param rect [Array<Numeric>] [left x, bottom y, right x, top y], all measured
#   from the bottom-left origin; corners given in the opposite order are accepted
# @param width [Numeric] page width
# @param height [Numeric] page height
# @return [String] declarations such as "top:9.09%;left:11.76%;bottom:88.38%;right:67.32%;"
def rectangle_to_css_position(rect, width, height)
  left_x, right_x = [rect[0], rect[2]].minmax
  bottom_y, top_y = [rect[1], rect[3]].minmax

  # Divide as floats: with Integer coordinates and page sizes, `/` would
  # truncate every ratio to 0 or 1.
  page_width = width.to_f
  page_height = height.to_f

  offsets = {
    top: (page_height - top_y) / page_height,
    left: left_x / page_width,
    bottom: bottom_y / page_height,
    right: (page_width - right_x) / page_width
  }

  offsets.map { |side, ratio| "#{side}:#{(ratio * 100).round(2)}%;" }.join
end
# Reads a PDF and builds the data written for one slide deck.
#
# @param pdf_file [String, Pathname] PDF to read
# @param image_prefix [String] text placed before the page number in each image name
# @param image_ext [String] text placed after the page number in each image name
# @return [Hash] `{ title:, pages: }`; each page is a hash with :text, :width,
#   :height, :image and :links (`{ url:, position_str: }` entries)
def parse_pdf(pdf_file, image_prefix, image_ext)
  reader = PDF::Reader.new(pdf_file)
  source_pages = reader.pages

  # pdftoppm names its images with the 1-based page number, zero-padded to the
  # number of digits in the page count (9 pages -> "9", 10 pages -> "01".."10").
  pad_width = source_pages.length.to_s.length

  pages = source_pages.each_with_index.map do |page, idx|
    page_number = (idx + 1).to_s.rjust(pad_width, "0")
    links = extract_links(page, reader).map do |link|
      {
        url: link[:url],
        position_str: rectangle_to_css_position(link[:rect], page.width, page.height)
      }
    end

    {
      # Collapse every run of whitespace (including newlines) into one space.
      text: page.text.gsub(/\s+/, " "),
      width: page.width,
      height: page.height,
      image: "#{image_prefix}#{page_number}#{image_ext}",
      links: links
    }
  end

  # The document information hash is keyed by PDF names such as :Title; the
  # lowercase key is still consulted for backward compatibility.
  info = reader.info || {}
  declared_title = [info[:Title], info[:title]].find do |value|
    value.is_a?(String) && !value.strip.empty?
  end

  # Fall back to the first page's text; nil when the PDF has no pages.
  title = declared_title || pages.first&.fetch(:text)
  { title: title, pages: pages }
end
# Synchronises the generated slide pages and slide images with the PDFs found
# in assets/pdf, relative to this script's directory:
#
# * deletes _slides/*.md pages and assets/slide-images/* entries whose PDF no
#   longer exists,
# * renders images for every PDF that has no image directory yet,
# * writes a _slides/<name>.md page for every PDF that has none yet.
#
# @return [void]
# @raise [RuntimeError] if image generation fails for a PDF
def main
  dir = Pathname(__dir__)
  pdf_dir = dir.join("assets/pdf")
  output_dir = dir.join("_slides")
  slide_image_dir = dir.join("assets/slide-images")

  # Create the output locations up front so a tree without them does not fail
  # while listing or writing.
  output_dir.mkpath
  slide_image_dir.mkpath

  pdf_files = pdf_dir.glob("*.pdf")
  slide_basenames = pdf_files.map { |pdf| pdf.basename(".pdf").to_s }

  # Remove pages that belong to deleted PDFs.
  output_dir.glob("*.md").each do |page_file|
    page_file.delete unless slide_basenames.include?(page_file.basename(".md").to_s)
  end

  # Remove images that belong to deleted PDFs, keeping the .gitkeep placeholder.
  slide_image_dir.children.each do |entry|
    entry_name = entry.basename.to_s
    next if entry_name == ".gitkeep" || slide_basenames.include?(entry_name)

    entry.rmtree
  end

  # Generate whatever is still missing for each PDF.
  pdf_files.each do |pdf_file|
    pdf_name = pdf_file.basename(".pdf").to_s

    slide_image_target_dir = slide_image_dir.join(pdf_name)
    unless slide_image_target_dir.exist?
      slide_image_target_dir.mkdir
      begin
        generate_pdf_images(pdf_file, "#{slide_image_target_dir}/slide")
      rescue StandardError
        # The directory's existence is what marks a PDF as rendered; drop it on
        # failure so the next run retries instead of skipping this PDF.
        slide_image_target_dir.rmtree
        raise
      end
    end

    out_markdown_file = output_dir.join("#{pdf_name}.md")
    next if out_markdown_file.exist?

    data = parse_pdf(pdf_file, "#{pdf_name}/slide-", ".jpg")
    # The page consists solely of front matter holding the JSON data.
    out_markdown_file.write("---\n#{JSON.pretty_generate(data)}\n---\n")
  end
end
# Script entry point: run the synchronisation as soon as the file is executed.
main