Skip to content

Commit

Permalink
Clean up GhanaWeb
Browse files Browse the repository at this point in the history
  • Loading branch information
kwalcock committed May 5, 2024
1 parent a6d3090 commit e9401c0
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import org.clulab.habitus.scraper.scrapers.article.CorpusArticleScraper
object ArticleScraperApp extends App {
val term = "sitemap"
val corpusFileName = args.lift(0).getOrElse(s"./scraper/corpora/ghana/$term/articlecorpus-filtered.txt")
val baseDirName = args.lift(1).getOrElse(s"../corpora/ghana/$term/articles")
val baseDirName = args.lift(1).getOrElse("/home/kwa/data/Corpora/habitus-project/corpora/ghana-sitemap/articlesonly")
val corpus = PageCorpus(corpusFileName)
val scraper = new CorpusArticleScraper(corpus)
val browser: Browser = new HabitusBrowser()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import org.clulab.utils.{FileUtils, Sourcer, StringUtils}
import scala.util.Using

object FilterLogApp extends App {
val inFileName = args.lift(0).getOrElse("/home/kwa/Projects/clulab/habitus-project/habitus/ArticleScraperApp-adomonline.log")
val inFileName = args.lift(0).getOrElse("/home/kwa/Projects/clulab/habitus-project/habitus/ArticleScraperApp-ghanaweb.log")
val outFileName = args.lift(1).getOrElse("/home/kwa/Projects/clulab/habitus-project/habitus/articlecorpus.txt")

Using.resources(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package org.clulab.habitus.scraper.apps

import org.clulab.habitus.scraper.Cleaner
import org.clulab.utils.{FileUtils, Sourcer}

import java.net.URL
import scala.util.Using

/**
  * Reports downloaded article files that have no matching entry in the corpus file.
  *
  * Each non-blank line of the corpus file is treated as a URL and converted to the
  * file name expected for its download.  Every "html" file found under the base
  * directory whose name is not among those expected names is printed as extra.
  *
  * Command-line arguments (both optional):
  *   - `args(0)`: the corpus file, one URL per line
  *   - `args(1)`: the directory searched for downloaded files
  */
object VerifyDownloadsApp extends App {
  val term = "sitemap"
  val corpusFileName = args.lift(0).getOrElse(s"./scraper/corpora/ghana/$term/articlecorpus-filtered.txt")
  val baseDirName = args.lift(1).getOrElse("/home/kwa/data/Corpora/habitus-project/corpora/ghana-sitemap/articlesonly/www_ghanaweb_com")
  val cleaner = new Cleaner()

  /**
    * Converts one corpus line into the file name expected for its download.
    *
    * @param line a URL in string form
    * @return the cleaned file part of the URL with ".html" appended
    * @throws java.net.MalformedURLException if `line` cannot be parsed as a URL
    */
  def extractFile(line: String): String = {
    val url = new URL(line)

    // NOTE(review): presumably Cleaner.clean maps the URL's path (and query) to the
    // same name the scraper uses when it saves the page -- confirm against Cleaner.
    cleaner.clean(url.getFile) + ".html"
  }

  // The set is fully built inside the block, so the source can be closed safely on exit.
  val corpusFiles: Set[String] = Using.resource(Sourcer.sourceFromFilename(corpusFileName)) { source =>
    source
      .getLines()
      .map(_.trim)
      // A blank line (e.g., a trailing one) is not a URL and would otherwise abort the run.
      .filter(_.nonEmpty)
      .map(extractFile)
      .toSet
  }

  FileUtils.findFiles(baseDirName, "html").foreach { file =>
    val name = file.getName

    if (!corpusFiles(name))
      println(s"$name is extra!")
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import org.clulab.habitus.scraper.scrapes.ArticleScrape
import org.json4s.jackson.JsonMethods
import org.json4s.{DefaultFormats, JObject}

import scala.util.Try

class GhanaWebArticleScraper extends PageArticleScraper(GhanaWebDomain) {
implicit val formats: DefaultFormats.type = DefaultFormats

Expand All @@ -21,9 +23,19 @@ class GhanaWebArticleScraper extends PageArticleScraper(GhanaWebDomain) {
}
.map { element =>
val json = element.innerHtml.replace("\t", " ")
val jObject = JsonMethods.parse(json).asInstanceOf[JObject]

jObject
val jObjectTry1 = Try { JsonMethods.parse(json).asInstanceOf[JObject] }
val jObjectTry2 = jObjectTry1.orElse(Try { JsonMethods.parse(json.filter(c => c >= 0x20)).asInstanceOf[JObject] })
val jObjectTry3 = jObjectTry2.orElse(Try { JsonMethods.parse(json.replace("\\", "\\\\")).asInstanceOf[JObject] })
val jObjectTry4 = jObjectTry3.orElse { Try {
val json2 = json
.replace("\\\",", "\",") // They have escaped the trailing quotes in the json
.replace("\\/", "/") // And forward slashes as well.

JsonMethods.parse(json2)
}}

jObjectTry4.get
}
val dateline = jObjectOpt.map { jObject =>
((jObject \ "@graph")(0) \ "datePublished").extract[String]
Expand All @@ -44,6 +56,8 @@ class GhanaWebArticleScraper extends PageArticleScraper(GhanaWebDomain) {
.filter(_.nonEmpty)
.mkString("\n\n")

if (text.isEmpty)
println("Why?")
ArticleScrape(page.url, Some(title), Some(dateline), bylineOpt, text)
}
}

0 comments on commit e9401c0

Please sign in to comment.