Skip to content

Commit

Permalink
feat: some scraper cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
ecshreve committed Jun 23, 2023
1 parent 074b468 commit 4943066
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 24 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ jepp_scraper/
cache/
.env
*/**/certs
dump.*
17 changes: 11 additions & 6 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@ dotenv: ['.env', '{{.ENV}}/.env.', '{{.HOME}}/.env']

tasks:
default:
desc: List all tasks and descriptions.
cmds:
- task --list-all

build:
desc: Builds both the scraper and server binaries.
deps:
- go:build:scraper
- go:build:server

test:
desc: Runs go tests
cmds:
- go test github.com/ecshreve/jepp/...

Expand All @@ -17,6 +25,9 @@ tasks:
- bin/scrape
sources:
- cmd/scrape/*.go
- pkg/scraper/*.go
- pkg/models/*.go
- pkg/utils/*.go
cmds:
- go build -o bin/scrape github.com/ecshreve/jepp/cmd/scrape

Expand All @@ -41,12 +52,6 @@ tasks:
cmds:
- swag fmt -d cmd/server,pkg/server,pkg/models,pkg/utils
- swag init --parseVendor -d cmd/server,pkg/server,pkg/models,pkg/utils

build:
desc: Builds both the scraper and server binaries.
deps:
- go:build:scraper
- go:build:server

scrape:
desc: Runs the scraper.
Expand Down
20 changes: 9 additions & 11 deletions cmd/scrape/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,25 @@ import (
"os"

"github.com/ecshreve/jepp/pkg/models"
"github.com/ecshreve/jepp/pkg/scraper"
log "github.com/sirupsen/logrus"
)

func main() {
if os.Getenv("JEPP_LOCAL_DEV") != "true" {
log.Fatal("this script should only be run in a local development environment")
}
log.SetLevel(log.InfoLevel)
log.Info("Starting Jepp scraper...")

models.GetDBHandle()

// Change loop values to scrape different seasons.
for i := 38; i > 38; i-- {
log.Infof("scraping season %d ", i)
gamesForSeason, err := models.GetGamesBySeason(int64(i))
if err != nil {
for i := 15; i > 10; i-- {
if err := scraper.ScrapeSeason(int64(i)); err != nil {
log.Fatal(err)
}

cluesForSeason := 0
for i, game := range gamesForSeason {
cluesForSeason += scrapeAndFillCluesForGame(nil, game.GameID)
log.Infof("%d/%d games updated", i, len(gamesForSeason))
}
log.Infof("inserted %d clues and %d games for season %d", cluesForSeason, len(gamesForSeason), i)
}

log.Info("...done scraping")
}
6 changes: 4 additions & 2 deletions pkg/models/category.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,14 @@ func GetCategory(categoryID int64) (*Category, error) {
}

func GetCategoryByName(categoryName string) (*Category, error) {
var c Category
query := fmt.Sprintf("SELECT * FROM category WHERE name='%s' ORDER BY category_id DESC LIMIT 1", categoryName)

if err := db.Get(&c, "SELECT category_id, name FROM category WHERE name=? LIMIT 1", categoryName); err != nil {
c := Category{}
if err := db.Get(&c, query); err != nil {
return nil, oops.Wrapf(err, "could not get category for name %s", categoryName)
}

log.Debugf("category: %+v", c)
return &c, nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/models/game.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func GetGames() ([]Game, error) {

// GetGamesBySeason returns a list of games in the database for a given season.
func GetGamesBySeason(seasonID int64) ([]Game, error) {
query := fmt.Sprintf("SELECT * FROM game WHERE season_id=%d", seasonID)
query := fmt.Sprintf("SELECT * FROM game WHERE season_id=%d ORDER BY game_date DESC", seasonID)

games := []Game{}
if err := db.Select(&games, query); err != nil {
Expand Down
9 changes: 6 additions & 3 deletions cmd/scrape/game.go → pkg/scraper/game.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package scraper

import (
"fmt"
Expand All @@ -9,7 +9,7 @@ import (
log "github.com/sirupsen/logrus"
)

// scrapeGame scrapes a game from j-archive.com
// scrapeGameClues scrapes a game from j-archive.com.
func scrapeGameClues(gameID int64) (map[int64]*mods.Clue, map[int64]string) {
clueMap := map[int64]*mods.Clue{}
clueStrings := map[int64]string{}
Expand All @@ -34,6 +34,7 @@ func scrapeGameClues(gameID int64) (map[int64]*mods.Clue, map[int64]string) {
clueStrings[clueId] = cid
})

// collect and parse the categories for single jepp
c.OnHTML("div[id=jeopardy_round]", func(e *colly.HTMLElement) {
cc := []string{}
e.ForEach("td.category_name", func(_ int, el *colly.HTMLElement) {
Expand All @@ -42,6 +43,7 @@ func scrapeGameClues(gameID int64) (map[int64]*mods.Clue, map[int64]string) {
cats[mods.Jeopardy] = append(cats[mods.Jeopardy], cc...)
})

// collect and parse the categories for double jepp
c.OnHTML("div[id=double_jeopardy_round]", func(e *colly.HTMLElement) {
cc := []string{}
e.ForEach("td.category_name", func(_ int, el *colly.HTMLElement) {
Expand All @@ -50,6 +52,7 @@ func scrapeGameClues(gameID int64) (map[int64]*mods.Clue, map[int64]string) {
cats[mods.DoubleJeopardy] = append(cats[mods.DoubleJeopardy], cc...)
})

// collect and parse the categories for final jepp
c.OnHTML("div[id=final_jeopardy_round]", func(e *colly.HTMLElement) {
cc := []string{}
e.ForEach("td.category_name", func(_ int, el *colly.HTMLElement) {
Expand Down Expand Up @@ -81,7 +84,7 @@ func scrapeAndFillCluesForGame(db *mods.JeppDB, gid int64) int {

for clueID, clue := range clues {
actual, err := mods.GetCategoryByName(cats[clueID])
if err != nil {
if actual != nil {
clue.CategoryID = actual.CategoryID
continue
}
Expand Down
24 changes: 24 additions & 0 deletions pkg/scraper/scraper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package scraper

import (
"github.com/ecshreve/jepp/pkg/models"
"github.com/samsarahq/go/oops"
log "github.com/sirupsen/logrus"
)

// ScrapeSeason scrapes clues for every game of season i that is already
// present in the database.
//
// The games are looked up with models.GetGamesBySeason, and
// scrapeAndFillCluesForGame is called once per game with a nil db handle.
// The int returned by each call is summed and reported in the final log line
// as the number of clues inserted for the season.
//
// It returns a wrapped error if the games for the season cannot be loaded,
// and nil otherwise. Per-game scraping does not surface an error here:
// scrapeAndFillCluesForGame only returns a count.
func ScrapeSeason(i int64) error {
	log.Infof("scraping season %d ", i)
	gamesForSeason, err := models.GetGamesBySeason(i)
	if err != nil {
		return oops.Wrapf(err, "failed to get games for season %d", i)
	}

	totalGames := len(gamesForSeason)
	cluesForSeason := 0
	// The range index gets its own name so it does not shadow the season
	// parameter i, which is still needed for the summary log below.
	for idx, game := range gamesForSeason {
		cluesForSeason += scrapeAndFillCluesForGame(nil, game.GameID)
		// idx is 0-based; report the number of games completed so far so the
		// progress line ends at totalGames/totalGames.
		log.Infof("%d/%d games updated", idx+1, totalGames)
	}
	log.Infof("inserted %d clues and %d games for season %d", cluesForSeason, totalGames, i)

	return nil
}
2 changes: 1 addition & 1 deletion cmd/scrape/season.go → pkg/scraper/season.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package scraper

import (
"fmt"
Expand Down

0 comments on commit 4943066

Please sign in to comment.