Skip to content

Commit

Permalink
Init
Browse files Browse the repository at this point in the history
  • Loading branch information
earthboundkid committed Dec 18, 2023
1 parent ae3ae71 commit 54f31f8
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 2 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# CI workflow: run the Go test suite on every push and pull request.
name: Go

on: [ push, pull_request ]
jobs:

  build:
    name: Build
    runs-on: ubuntu-latest
    steps:

      # Check out first so setup-go can see go.sum when it computes its
      # dependency cache key.
      - name: Check out code into the Go module directory
        uses: actions/checkout@v4

      - name: Set up Go 1.x
        uses: actions/setup-go@v5
        with:
          # Quoted so the leading caret is always read as a plain string.
          go-version: '^1.21'
        id: go

      - name: Get dependencies
        run: go mod download

      - name: Test
        run: go test -v ./...
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,13 @@
# pdfcopy
Quickie script to take screenshots of URLs for copyright submissions
# pdfcopy [![GoDoc](https://godoc.org/github.com/spotlightpa/pdfcopy?status.svg)](https://godoc.org/github.com/spotlightpa/pdfcopy) [![Go Report Card](https://goreportcard.com/badge/github.com/spotlightpa/pdfcopy)](https://goreportcard.com/report/github.com/spotlightpa/pdfcopy)

Quickie script to take screenshots of URLs for copyright submissions.

## Installation

First install [Go](https://go.dev).

If you just want to install the binary to your current directory and don't care about the source code, run:

```bash
GOBIN="$(pwd)" go install github.com/spotlightpa/pdfcopy@latest
```
180 changes: 180 additions & 0 deletions app/app.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
package app

import (
"context"
"crypto/md5"
"errors"
"flag"
"fmt"
"io"
"io/fs"
"log"
"os"
"os/exec"
"os/signal"
"path/filepath"

"github.com/carlmjohnson/csv"
"github.com/carlmjohnson/flagx"
"github.com/carlmjohnson/flagx/lazyio"
"github.com/carlmjohnson/flowmatic"
"github.com/carlmjohnson/versioninfo"
)

// AppName is the program's name. It names the flag set, prefixes log lines,
// and is the name handed to flagx.ParseEnv when reading options from the
// environment.
const AppName = "pdfcopy"

// CLI is the program entry point. It parses args into a fresh appEnv and
// runs it. A parse failure is returned as-is; a failure while running is
// also written to standard error before being returned.
func CLI(args []string) error {
	env := new(appEnv)
	if parseErr := env.ParseArgs(args); parseErr != nil {
		return parseErr
	}
	runErr := env.Exec()
	if runErr != nil {
		fmt.Fprintf(os.Stderr, "Error: %v\n", runErr)
	}
	return runErr
}

// ParseArgs populates app from the command-line arguments in args (without
// the program name) and then from the environment via flagx.ParseEnv.
//
// It registers the -src, -dst, -temp, -workers and -silent flags plus the
// version flag added by versioninfo.AddFlag, and installs a logger that
// writes to standard error unless -silent is given. Any error from flag or
// environment parsing is returned unchanged.
func (app *appEnv) ParseArgs(args []string) error {
	fl := flag.NewFlagSet(AppName, flag.ContinueOnError)
	src := lazyio.FileOrURL(lazyio.StdIO, nil)
	app.src = src
	fl.Var(src, "src", "source file or URL")
	fl.StringVar(&app.dst, "dst", "output.pdf", "destination `filepath`")
	fl.StringVar(&app.temp, "temp", "", "temporary `filepath` for downloads and intermediate PDFs")
	fl.IntVar(&app.maxProcs, "workers", 10, "number of workers")
	app.Logger = log.New(os.Stderr, AppName+" ", log.LstdFlags)
	// The callback discards log output, so the help text must say the flag
	// turns logging off rather than on.
	flagx.BoolFunc(fl, "silent", "don't log debug output", func() error {
		app.Logger.SetOutput(io.Discard)
		return nil
	})
	fl.Usage = func() {
		// Print AppName rather than a hard-coded name so the banner always
		// matches the flag set and the log prefix.
		fmt.Fprintf(fl.Output(), `%[1]s - %[2]s
Download stuff and screenshot it
Usage:
	%[1]s [options]
Options:
`, AppName, versioninfo.Version)
		fl.PrintDefaults()
	}
	versioninfo.AddFlag(fl)
	if err := fl.Parse(args); err != nil {
		return err
	}
	if err := flagx.ParseEnv(fl, AppName); err != nil {
		return err
	}
	return nil
}

// appEnv carries the configuration and shared state for one run of the
// program. ParseArgs fills it in and Exec consumes it.
type appEnv struct {
	// src is the CSV input listing the URLs to capture (the -src flag).
	src io.ReadCloser
	// temp is the working directory for screenshots and per-URL PDFs
	// (the -temp flag).
	temp string
	// dst is the path of the merged output PDF (the -dst flag).
	dst string
	// maxProcs is the worker count passed to flowmatic.Each (the -workers flag).
	maxProcs int
	// Logger receives progress messages; its methods are promoted onto appEnv.
	*log.Logger
}

// Exec runs the whole job: it reads the URL list, makes sure a working
// directory exists, builds one PDF per URL with up to app.maxProcs
// concurrent workers, and finally merges the per-URL PDFs into app.dst
// with pdftk.
//
// The merged document lists pages in the order the URLs first appear in
// the input. Only PDFs belonging to the current URL list are merged, so
// leftovers from an earlier run in a reused -temp directory are ignored.
//
// It returns an error if the list cannot be read or is empty, if the
// working directory cannot be created, if any URL fails to build, or if
// pdftk fails. An interrupt signal cancels the context given to every
// external command.
func (app *appEnv) Exec() (err error) {
	// Open list of URLs
	urls, err := app.readURLs()
	if err != nil {
		return err
	}

	// Drop repeated URLs, keeping first-seen order. Output files are named
	// after a hash of the URL, so duplicates would have two workers writing
	// the same files at once.
	seen := make(map[string]bool, len(urls))
	unique := urls[:0]
	for _, u := range urls {
		if seen[u] {
			continue
		}
		seen[u] = true
		unique = append(unique, u)
	}
	if len(unique) == 0 {
		// Without this check pdftk would be invoked with no input files.
		return errors.New("no URLs found in source")
	}

	if app.temp == "" {
		// Make temp directory. It is deliberately left in place afterwards
		// so its path (logged below) can be passed back via -temp to reuse
		// finished work.
		if app.temp, err = os.MkdirTemp("", ""); err != nil {
			return err
		}
	} else if err = os.MkdirAll(app.temp, 0o755); err != nil {
		// A user-supplied directory may not exist yet.
		return fmt.Errorf("creating temp directory %q: %w", app.temp, err)
	}
	app.Logger.Printf("tempdir %q", app.temp)

	// Cancel outstanding commands when the user interrupts the program.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
	defer stop()

	err = flowmatic.Each(app.maxProcs, unique, func(url string) error {
		return app.buildPDF(ctx, url)
	})
	if err != nil {
		return err
	}

	// Once they're all done:
	// pdftk <pdfs in input order> cat output merged.pdf
	// The file name must stay in sync with the naming used by buildPDF.
	args := make([]string, 0, len(unique)+3)
	for _, u := range unique {
		name := fmt.Sprintf("%0x.pdf", md5.Sum([]byte(u)))
		args = append(args, filepath.Join(app.temp, name))
	}
	args = append(args, "cat", "output", app.dst)
	cmd := exec.CommandContext(ctx, "pdftk", args...)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	app.Logger.Printf("pdftk cat %s", app.dst)
	if err := cmd.Run(); err != nil {
		return fmt.Errorf("merging PDFs into %q: %w", app.dst, err)
	}

	return nil
}

// readURLs reads app.src as CSV and returns the value of the "url" field
// from every row, in input order. The source is closed once it has been
// read, whether or not reading succeeded.
//
// It returns a nil slice and an error if reading the CSV fails or, when
// reading succeeded, if closing the source fails.
func (app *appEnv) readURLs() ([]string, error) {
	var urls []string
	fr := csv.NewFieldReader(app.src)
	for fr.Scan() {
		urls = append(urls, fr.Field("url"))
	}
	readErr := fr.Err()
	// Release the underlying file or HTTP body; nothing reads from src
	// after this point.
	closeErr := app.src.Close()
	if readErr != nil {
		return nil, fmt.Errorf("reading URL list: %w", readErr)
	}
	if closeErr != nil {
		return nil, fmt.Errorf("closing URL list: %w", closeErr)
	}
	return urls, nil
}

// buildPDF produces the PDF for one URL inside app.temp.
//
// Files are named after the hex MD5 of the URL (<hash>.png and <hash>.pdf).
// MD5 is used only to derive a stable file name, not for security. Each
// step is skipped when its output file already exists, so an interrupted
// run can be resumed with the same -temp directory:
//
//  1. shot-scraper captures the page's #content element to <hash>.png.
//  2. convert turns <hash>.png into <hash>.pdf.
//
// ctx cancels either external command. Every returned error names the
// file and URL involved and wraps the underlying cause.
func (app *appEnv) buildPDF(ctx context.Context, url string) error {
	hash := md5.Sum([]byte(url))
	png := fmt.Sprintf("%0x.png", hash)
	pdf := fmt.Sprintf("%0x.pdf", hash)

	// Skip the screenshot if the PNG is already on disk.
	_, err := os.Stat(filepath.Join(app.temp, png))
	switch {
	case err == nil:
		app.Logger.Printf("have %s", png)
	case !errors.Is(err, fs.ErrNotExist):
		return fmt.Errorf("checking for %q from %q: %w", png, url, err)
	default:
		app.Logger.Printf("start %0x from %q", hash, url)
		// TODO retry in loop
		cmd := exec.CommandContext(ctx, "shot-scraper", "--reduced-motion",
			// TODO figure out whether to use #content or not
			"-s", "#content",
			"-p", "16", "--output", png, url)
		// Relative output names resolve against the working directory.
		cmd.Dir = app.temp
		cmd.Stderr = os.Stderr
		cmd.Stdout = os.Stdout
		if err := cmd.Run(); err != nil {
			return fmt.Errorf("problem with %q from %q: %w", png, url, err)
		}
	}

	// Skip the conversion if the PDF is already on disk.
	_, err = os.Stat(filepath.Join(app.temp, pdf))
	switch {
	case err == nil:
		app.Logger.Printf("have %s", pdf)
		return nil
	case !errors.Is(err, fs.ErrNotExist):
		return fmt.Errorf("checking for %q from %q: %w", pdf, url, err)
	default:
		cmd := exec.CommandContext(ctx, "convert", png, pdf)
		cmd.Dir = app.temp
		// Surface convert's diagnostics the same way as shot-scraper's, so a
		// failure is explained by more than an exit status.
		cmd.Stderr = os.Stderr
		cmd.Stdout = os.Stdout
		if err := cmd.Run(); err != nil {
			return fmt.Errorf("problem converting %q to %q from %q: %w", png, pdf, url, err)
		}
	}
	app.Logger.Printf("done %0x from %q", hash, url)
	return nil
}
13 changes: 13 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
module github.com/spotlightpa/pdfcopy

go 1.21

require (
github.com/carlmjohnson/csv v1.20.0
github.com/carlmjohnson/exitcode v0.20.2
github.com/carlmjohnson/flagx v0.22.2
github.com/carlmjohnson/flowmatic v0.23.4
github.com/carlmjohnson/versioninfo v0.22.5
)

require github.com/carlmjohnson/deque v0.23.1 // indirect
14 changes: 14 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
github.com/carlmjohnson/be v0.22.3 h1:XwpxXz+wHvZ6O+i/IxcVQaGinsDkF99bpq0VXno6Voc=
github.com/carlmjohnson/be v0.22.3/go.mod h1:KAgPUh0HpzWYZZI+IABdo80wTgY43YhbdsiLYAaSI/Q=
github.com/carlmjohnson/csv v1.20.0 h1:/QBKcZdJY0FMerK6m+uMgZRwmQ33Uj5NBqkqcitstcY=
github.com/carlmjohnson/csv v1.20.0/go.mod h1:I2Bsj5YPd8aGaLfXelcf4ANJpFlrPYkgpJnwCsTdNbI=
github.com/carlmjohnson/deque v0.23.1 h1:X2HOJM9xcglY03deMZ0oZ1V2xtbqYV7dJDnZiSZN4Ak=
github.com/carlmjohnson/deque v0.23.1/go.mod h1:LF5NJjICBrEOPx84pxPL4nCimy5n9NQjxKi5cXkh+8U=
github.com/carlmjohnson/exitcode v0.20.2 h1:vE6rmkCGNA4kO4m1qwWIa77PKlUBVg46cNjs22eAOXE=
github.com/carlmjohnson/exitcode v0.20.2/go.mod h1:MZ6ThCDx517DQcrpYnnns1pLh8onjFl+B/AsrOrdmpc=
github.com/carlmjohnson/flagx v0.22.2 h1:UXf7gL4Ffv5RIH/HKp8CGNzDyopgezFLrDO1m4F8jWc=
github.com/carlmjohnson/flagx v0.22.2/go.mod h1:obobISvBnxgEXPLBITVXhRUOlSlzza1SGt34M64CPJc=
github.com/carlmjohnson/flowmatic v0.23.4 h1:SfK6f+zKUlw4aga1ph+7/csqVeUAWnBxfqKN5gvQzzs=
github.com/carlmjohnson/flowmatic v0.23.4/go.mod h1:Jpvyl591Dvkt9chYpnVupjxlKvqkZ9CtCmqL4wfQD7U=
github.com/carlmjohnson/versioninfo v0.22.5 h1:O00sjOLUAFxYQjlN/bzYTuZiS0y6fWDQjMRvwtKgwwc=
github.com/carlmjohnson/versioninfo v0.22.5/go.mod h1:QT9mph3wcVfISUKd0i9sZfVrPviHuSF+cUtLjm2WSf8=
12 changes: 12 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package main

import (
"os"

"github.com/carlmjohnson/exitcode"
"github.com/spotlightpa/pdfcopy/app"
)

func main() {
exitcode.Exit(app.CLI(os.Args[1:]))
}

0 comments on commit 54f31f8

Please sign in to comment.