diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 0000000..07a59c6 --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,24 @@ +name: Go + +on: [ push, pull_request ] +jobs: + + build: + name: Build + runs-on: ubuntu-latest + steps: + + - name: Set up Go 1.x + uses: actions/setup-go@v2 + with: + go-version: ^1.21 + id: go + + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Get dependencies + run: go mod download + + - name: Test + run: go test -v ./... diff --git a/README.md b/README.md index 1acafb5..67d8806 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,13 @@ -# pdfcopy -Quickie script to take screenshots of URLs for copyright submissions +# pdfcopy [![GoDoc](https://godoc.org/github.com/spotlightpa/pdfcopy?status.svg)](https://godoc.org/github.com/spotlightpa/pdfcopy) [![Go Report Card](https://goreportcard.com/badge/github.com/spotlightpa/pdfcopy)](https://goreportcard.com/report/github.com/spotlightpa/pdfcopy) + +Quickie script to take screenshots of URLs for copyright submissions. + +## Installation + +First install [Go](http://golang.org). + +If you just want to install the binary to your current directory and don't care about the source code, run + +```bash +GOBIN="$(pwd)" go install github.com/spotlightpa/pdfcopy@latest +``` diff --git a/app/app.go b/app/app.go new file mode 100644 index 0000000..06783ca --- /dev/null +++ b/app/app.go @@ -0,0 +1,180 @@ +package app + +import ( + "context" + "crypto/md5" + "errors" + "flag" + "fmt" + "io" + "io/fs" + "log" + "os" + "os/exec" + "os/signal" + "path/filepath" + + "github.com/carlmjohnson/csv" + "github.com/carlmjohnson/flagx" + "github.com/carlmjohnson/flagx/lazyio" + "github.com/carlmjohnson/flowmatic" + "github.com/carlmjohnson/versioninfo" +) + +const AppName = "pdfcopy" + +func CLI(args []string) error { + var app appEnv + err := app.ParseArgs(args) + if err != nil { + return err + } + if err = app.Exec(); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + } + return err +} + +func (app *appEnv) ParseArgs(args []string) error { + fl := flag.NewFlagSet(AppName, flag.ContinueOnError) + src := lazyio.FileOrURL(lazyio.StdIO, nil) + app.src = src + fl.Var(src, "src", "source file or URL") + fl.StringVar(&app.dst, "dst", "output.pdf", "destination `filepath`") + fl.StringVar(&app.temp, "temp", "", "temporary `filepath` for downloads and intermediate PDFs") + fl.IntVar(&app.maxProcs, "workers", 10, "number of workers") + app.Logger = log.New(os.Stderr, AppName+" ", log.LstdFlags) + flagx.BoolFunc(fl, "silent", "log debug output", func() error { + app.Logger.SetOutput(io.Discard) + return nil + }) + fl.Usage = func() { + fmt.Fprintf(fl.Output(), `copyrightpdfs - %s + +Download stuff and screenshot it + +Usage: + + copyrightpdfs [options] + +Options: +`, versioninfo.Version) + fl.PrintDefaults() + } + versioninfo.AddFlag(fl) + if err := fl.Parse(args); err != nil { + return err + } + if err := flagx.ParseEnv(fl, AppName); err != nil { + return err + } + return nil +} + +type appEnv struct { + src io.ReadCloser + temp, dst string + maxProcs int + *log.Logger +} + +func (app *appEnv) Exec() (err error) { + // Open list of URLs + urls, err := app.readURLs() + if err != nil { + return err + } + if app.temp == "" { + // Make temp directory + tempdir, err := os.MkdirTemp("", "") + if err != nil { + return err + } + app.temp = tempdir + } + app.Logger.Printf("tempdir %q", app.temp) + + // Start some Flowmatic groups + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) + defer stop() + + err = flowmatic.Each(app.maxProcs, urls, func(url string) error { + return app.buildPDF(ctx, url) + }) + if err != nil { + return err + } + + // Once they're all done' + // pdftk ./*.pdf cat output merged.pdf + // TODO: The order of PDFs is random. Fix that somehow + args, err := filepath.Glob(filepath.Join(app.temp, "*.pdf")) + if err != nil { + return err + } + args = append(args, "cat", "output", app.dst) + cmd := exec.CommandContext(ctx, "pdftk", args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + app.Logger.Printf("pdftk cat %s", app.dst) + if err := cmd.Run(); err != nil { + return err + } + + return err +} + +func (app *appEnv) readURLs() ([]string, error) { + var urls []string + fr := csv.NewFieldReader(app.src) + for fr.Scan() { + urls = append(urls, fr.Field("url")) + } + return urls, fr.Err() +} + +func (app *appEnv) buildPDF(ctx context.Context, url string) error { + hash := md5.Sum([]byte(url)) + png := fmt.Sprintf("%0x.png", hash) + pdf := fmt.Sprintf("%0x.pdf", hash) + + // Skip if stat file + _, err := os.Stat(filepath.Join(app.temp, png)) + switch { + case err == nil: + app.Logger.Printf("have %s", png) + case !errors.Is(err, fs.ErrNotExist): + return err + default: + app.Logger.Printf("start %0x from %q", hash, url) + // TODO retry in loop + cmd := exec.CommandContext(ctx, "shot-scraper", "--reduced-motion", + // TODO figure out whether to use #content or not + "-s", "#content", + "-p", "16", "--output", png, url) + cmd.Dir = app.temp + cmd.Stderr = os.Stderr + cmd.Stdout = os.Stdout + if err := cmd.Run(); err != nil { + // mark this up + return fmt.Errorf("problem with %q from %q: %w", png, url, err) + } + } + // Skip if stat file + _, err = os.Stat(filepath.Join(app.temp, pdf)) + switch { + case err == nil: + app.Logger.Printf("have %s", pdf) + return nil + case !errors.Is(err, fs.ErrNotExist): + return err + default: + cmd := exec.CommandContext(ctx, "convert", png, pdf) + cmd.Dir = app.temp + if err := cmd.Run(); err != nil { + return err + } + } + app.Logger.Printf("done %0x from %q", hash, url) + return nil +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..4ad05f7 --- /dev/null +++ b/go.mod @@ -0,0 +1,13 @@ +module github.com/spotlightpa/pdfcopy + +go 1.21 + +require ( + github.com/carlmjohnson/csv v1.20.0 + github.com/carlmjohnson/exitcode v0.20.2 + github.com/carlmjohnson/flagx v0.22.2 + github.com/carlmjohnson/flowmatic v0.23.4 + github.com/carlmjohnson/versioninfo v0.22.5 +) + +require github.com/carlmjohnson/deque v0.23.1 // indirect diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..c5e88de --- /dev/null +++ b/go.sum @@ -0,0 +1,14 @@ +github.com/carlmjohnson/be v0.22.3 h1:XwpxXz+wHvZ6O+i/IxcVQaGinsDkF99bpq0VXno6Voc= +github.com/carlmjohnson/be v0.22.3/go.mod h1:KAgPUh0HpzWYZZI+IABdo80wTgY43YhbdsiLYAaSI/Q= +github.com/carlmjohnson/csv v1.20.0 h1:/QBKcZdJY0FMerK6m+uMgZRwmQ33Uj5NBqkqcitstcY= +github.com/carlmjohnson/csv v1.20.0/go.mod h1:I2Bsj5YPd8aGaLfXelcf4ANJpFlrPYkgpJnwCsTdNbI= +github.com/carlmjohnson/deque v0.23.1 h1:X2HOJM9xcglY03deMZ0oZ1V2xtbqYV7dJDnZiSZN4Ak= +github.com/carlmjohnson/deque v0.23.1/go.mod h1:LF5NJjICBrEOPx84pxPL4nCimy5n9NQjxKi5cXkh+8U= +github.com/carlmjohnson/exitcode v0.20.2 h1:vE6rmkCGNA4kO4m1qwWIa77PKlUBVg46cNjs22eAOXE= +github.com/carlmjohnson/exitcode v0.20.2/go.mod h1:MZ6ThCDx517DQcrpYnnns1pLh8onjFl+B/AsrOrdmpc= +github.com/carlmjohnson/flagx v0.22.2 h1:UXf7gL4Ffv5RIH/HKp8CGNzDyopgezFLrDO1m4F8jWc= +github.com/carlmjohnson/flagx v0.22.2/go.mod h1:obobISvBnxgEXPLBITVXhRUOlSlzza1SGt34M64CPJc= +github.com/carlmjohnson/flowmatic v0.23.4 h1:SfK6f+zKUlw4aga1ph+7/csqVeUAWnBxfqKN5gvQzzs= +github.com/carlmjohnson/flowmatic v0.23.4/go.mod h1:Jpvyl591Dvkt9chYpnVupjxlKvqkZ9CtCmqL4wfQD7U= +github.com/carlmjohnson/versioninfo v0.22.5 h1:O00sjOLUAFxYQjlN/bzYTuZiS0y6fWDQjMRvwtKgwwc= +github.com/carlmjohnson/versioninfo v0.22.5/go.mod h1:QT9mph3wcVfISUKd0i9sZfVrPviHuSF+cUtLjm2WSf8= diff --git a/main.go b/main.go new file mode 100644 index 0000000..6d28033 --- /dev/null +++ b/main.go @@ -0,0 +1,12 @@ +package main + +import ( + "os" + + "github.com/carlmjohnson/exitcode" + "github.com/spotlightpa/pdfcopy/app" +) + +func main() { + exitcode.Exit(app.CLI(os.Args[1:])) +}