diff --git a/belief_pipeline/tpi_main.py b/belief_pipeline/tpi_main.py
index 25183bfa..e250ff07 100644
--- a/belief_pipeline/tpi_main.py
+++ b/belief_pipeline/tpi_main.py
@@ -19,9 +19,9 @@ def get_in_and_out() -> Tuple[str, str]:
 if __name__ == "__main__":
     belief_model_name: str = "maxaalexeeva/belief-classifier_mturk_unmarked-trigger_bert-base-cased_2023-4-26-0-34"
     sentiment_model_name: str = "hriaz/finetuned_beliefs_sentiment_classifier_experiment1"
-    locations_file_name: str = "./belief_pipeline/GH.tsv"
-    input_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4.tsv"
-    output_file_name: str = "../corpora/ghana-elasticsearch/ghana-elasticsearch-4a.tsv"
+    locations_file_name: str = "./belief_pipeline/UG.tsv"
+    input_file_name: str = "../corpora/uganda/interview/interviews.tsv"
+    output_file_name: str = "../corpora/uganda/interview/interviews-a.tsv"
     # input_file_name, output_file_name = get_in_and_out()
     pipeline = Pipeline(
         TpiInputStage(input_file_name),
diff --git a/scraper/src/main/scala/org/clulab/habitus/scraper/apps/ArticleScraperApp.scala b/scraper/src/main/scala/org/clulab/habitus/scraper/apps/ArticleScraperApp.scala
index 9f2de90c..029cdb0c 100644
--- a/scraper/src/main/scala/org/clulab/habitus/scraper/apps/ArticleScraperApp.scala
+++ b/scraper/src/main/scala/org/clulab/habitus/scraper/apps/ArticleScraperApp.scala
@@ -6,9 +6,9 @@ import org.clulab.habitus.scraper.corpora.PageCorpus
 import org.clulab.habitus.scraper.scrapers.article.CorpusArticleScraper
 
 object ArticleScraperApp extends App {
-  val term = "sitemap"
-  val corpusFileName = args.lift(0).getOrElse(s"./scraper/corpora/ghana/$term/articlecorpus-filtered.txt")
-  val baseDirName = args.lift(1).getOrElse(s"../corpora/ghana/$term/articles")
+  val term = "interview"
+  val corpusFileName = args.lift(0).getOrElse(s"./scraper/corpora/uganda/$term/articlecorpus.txt")
+  val baseDirName = args.lift(1).getOrElse(s"../corpora/uganda/$term/articles")
   val corpus = PageCorpus(corpusFileName)
   val scraper = new CorpusArticleScraper(corpus)
   val browser: Browser = new HabitusBrowser()
diff --git a/scraper/src/main/scala/org/clulab/habitus/scraper/domains/InterviewDomain.scala b/scraper/src/main/scala/org/clulab/habitus/scraper/domains/InterviewDomain.scala
new file mode 100644
index 00000000..9d95d98e
--- /dev/null
+++ b/scraper/src/main/scala/org/clulab/habitus/scraper/domains/InterviewDomain.scala
@@ -0,0 +1,4 @@
+package org.clulab.habitus.scraper.domains
+
+// TODO: confirm these Domain arguments; interviews are read from local .iview files, not from a website.
+object InterviewDomain extends Domain("", "file", ".iview")
diff --git a/scraper/src/main/scala/org/clulab/habitus/scraper/scrapers/article/ArticleScraper.scala b/scraper/src/main/scala/org/clulab/habitus/scraper/scrapers/article/ArticleScraper.scala
index 7e2a6e7e..736a122e 100644
--- a/scraper/src/main/scala/org/clulab/habitus/scraper/scrapers/article/ArticleScraper.scala
+++ b/scraper/src/main/scala/org/clulab/habitus/scraper/scrapers/article/ArticleScraper.scala
@@ -58,7 +58,8 @@ class CorpusArticleScraper(val corpus: PageCorpus) {
     new MiningReviewArticleScraper(),
     new MiningArticleScraper(),
     new PdfFileArticleScraper(),
-    new MailFileArticleScraper()
+    new MailFileArticleScraper(),
+    new InterviewFileArticleScraper()
   )
 
   def getPageScraper(page: Page): PageArticleScraper = {
diff --git a/scraper/src/main/scala/org/clulab/habitus/scraper/scrapers/article/InterviewFileArticleScraper.scala b/scraper/src/main/scala/org/clulab/habitus/scraper/scrapers/article/InterviewFileArticleScraper.scala
new file mode 100644
index 00000000..9909e9cd
--- /dev/null
+++ b/scraper/src/main/scala/org/clulab/habitus/scraper/scrapers/article/InterviewFileArticleScraper.scala
@@ -0,0 +1,97 @@
+package org.clulab.habitus.scraper.scrapers.article
+
+import net.ruippeixotog.scalascraper.browser.Browser
+import org.clulab.habitus.scraper.Page
+import org.clulab.habitus.scraper.domains.InterviewDomain
+import org.clulab.habitus.scraper.scrapes.ArticleScrape
+import org.clulab.utils.FileUtils
+import org.clulab.wm.eidoscommon.utils.FileEditor
+import org.json4s.DefaultFormats
+
+import java.io.File
+import scala.util.Using
+
+/** Optional title, dateline, and byline recorded for one interview file. */
+case class InterviewMetadata(titleOpt: Option[String], datelineOpt: Option[String], bylineOpt: Option[String])
+
+/** Scrapes interview transcripts that are stored as local .iview files.
+  *
+  * The text comes from the file itself and the metadata from the table in
+  * the companion object, keyed by file name.
+  */
+class InterviewFileArticleScraper extends PageArticleScraper(InterviewDomain) {
+  implicit val formats: DefaultFormats.type = DefaultFormats
+
+  /** Builds the scrape for one interview.
+    *
+    * @param browser not used by this scraper
+    * @param page page whose URL names the interview file
+    * @param iviewLocationName path of the .iview file to read the text from
+    * @return the scrape, with whatever metadata is known for the file
+    */
+  def scrape(browser: Browser, page: Page, iviewLocationName: String): ArticleScrape = {
+    // The first character of the URL's file part is dropped, presumably the leading slash.
+    val file = page.url.getFile.drop(1)
+    val text = FileUtils.getTextFromFile(iviewLocationName)
+    val metadata = InterviewFileArticleScraper.metadataMap(file)
+
+    ArticleScrape(page.url, metadata.titleOpt, metadata.datelineOpt, metadata.bylineOpt, text)
+  }
+
+  /** Derives the locations used for a page.
+    *
+    * @return the sub directory (here the base directory itself), the file
+    *         name taken from the page URL, and the path of the .iview file
+    */
+  def readIview(page: Page, baseDirName: String): (String, String, String) = {
+    // See PdfFileArticleScraper for example of how these were derived
+    // from the non-file versions.
+    val subDirName = baseDirName
+    val file = page.url.getFile.drop(1)
+    val iviewLocationName = s"$baseDirName/$file"
+
+    (subDirName, file, iviewLocationName)
+  }
+
+  /** Scrapes the page and writes the result to the .iview location,
+    * once with the extension set to txt and once with it set to json.
+    */
+  override def scrapeTo(browser: Browser, page: Page, baseDirName: String): Unit = {
+    // Only the file location is needed here.
+    val (_, _, iviewLocationName) = readIview(page, baseDirName)
+    val scraped = scrape(browser, page, iviewLocationName)
+    val text = scraped.toText
+    val textLocationName = FileEditor(new File(iviewLocationName)).setExt("txt").get
+    val jsonLocationName = FileEditor(new File(iviewLocationName)).setExt("json").get
+    val json = scraped.toJson
+
+    Using.resource(FileUtils.printWriterFromFile(textLocationName)) { printWriter =>
+      printWriter.println(text)
+    }
+
+    Using.resource(FileUtils.printWriterFromFile(jsonLocationName)) { printWriter =>
+      printWriter.println(json)
+    }
+  }
+}
+
+object InterviewFileArticleScraper {
+  /** Metadata by interview file name; unlisted files get empty metadata. */
+  val metadataMap: Map[String, InterviewMetadata] = Map(
+    "TranscriptUGA-E0006_original.iview" -> InterviewMetadata(
+      Some("Interview with NBS TV Karamoja, original version"),
+      Some("2024-01-26"),
+      Some("UGA-E0006")
+    ),
+    "TranscriptUGA-E0006_case.iview" -> InterviewMetadata(
+      Some("Interview with NBS TV Karamoja, distinguished by case"),
+      Some("2024-01-26"),
+      Some("UGA-E0006")
+    ),
+    "TranscriptUGA-E0006_respondent.iview" -> InterviewMetadata(
+      Some("Interview with NBS TV Karamoja, respondent only"),
+      Some("2024-01-26"),
+      Some("UGA-E0006")
+    )
+  ).withDefaultValue(InterviewMetadata(None, None, None))
+}
diff --git a/src/main/scala/org/clulab/habitus/apps/tpi/Step1OutputEidos.scala b/src/main/scala/org/clulab/habitus/apps/tpi/Step1OutputEidos.scala
index 846f6bbd..4070deff 100644
--- a/src/main/scala/org/clulab/habitus/apps/tpi/Step1OutputEidos.scala
+++ b/src/main/scala/org/clulab/habitus/apps/tpi/Step1OutputEidos.scala
@@ -16,7 +16,7 @@ import scala.util.Using
 
 object Step1OutputEidos extends App {
   implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
-  val baseDirectoryName = args.lift(0).getOrElse("../corpora/ghana-regulations")
+  val baseDirectoryName = args.lift(0).getOrElse("../corpora/uganda/interview/articles")
   val inAndOutFiles = new File(baseDirectoryName)
       .listFilesByWildcard("*.json", recursive = true)
       .map { inFile =>
diff --git a/src/main/scala/org/clulab/habitus/apps/tpi/Step2InputEidos.scala b/src/main/scala/org/clulab/habitus/apps/tpi/Step2InputEidos.scala
index 514d71b6..78345421 100644
--- a/src/main/scala/org/clulab/habitus/apps/tpi/Step2InputEidos.scala
+++ b/src/main/scala/org/clulab/habitus/apps/tpi/Step2InputEidos.scala
@@ -18,8 +18,9 @@ import scala.util.Using
 object Step2InputEidos extends App with Logging {
   implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
   val contextWindow = 3
-  val baseDirectory = "../corpora/ghana-regulations/unknown"
-  val outputFileName = "../corpora/ghana-regulations/ghana-regulations.tsv"
+  val baseDirectory = "../corpora/uganda/interview/articles"
+  // Written where belief_pipeline/tpi_main.py reads its input.
+  val outputFileName = "../corpora/uganda/interview/interviews.tsv"
   val deserializer = new JLDDeserializer()
 
   def jsonFileToJsonld(jsonFile: File): File =
diff --git a/src/main/scala/org/clulab/habitus/apps/tpi/Step3InterpretDates.scala b/src/main/scala/org/clulab/habitus/apps/tpi/Step3InterpretDates.scala
index cc8c95ea..7274e41b 100644
--- a/src/main/scala/org/clulab/habitus/apps/tpi/Step3InterpretDates.scala
+++ b/src/main/scala/org/clulab/habitus/apps/tpi/Step3InterpretDates.scala
@@ -7,9 +7,9 @@ import org.clulab.wm.eidoscommon.utils.TsvReader
 import scala.util.Using
 
 object Step3InterpretDates extends App with Logging {
-  val inputFileName = "../corpora/ghana-regulations/ghana-regulations-2.tsv"
-  val outputFileName = "../corpora/ghana-regulations/ghana-regulations-3.tsv"
-  val expectedColumnCount = 21
+  val inputFileName = "../corpora/uganda/interview/interviews-a.tsv"
+  val outputFileName = "../corpora/uganda/interview/interviews-b.tsv"
+  val expectedColumnCount = 22
 
   Using.resource(Sourcer.sourceFromFilename(inputFileName)) { inputSource =>
     Using.resource(FileUtils.printWriterFromFile(outputFileName)) { printWriter =>
diff --git a/src/main/scala/org/clulab/habitus/apps/tpi/Step4FindNearestLocation.scala b/src/main/scala/org/clulab/habitus/apps/tpi/Step4FindNearestLocation.scala
index dadff2d5..65c1140e 100644
--- a/src/main/scala/org/clulab/habitus/apps/tpi/Step4FindNearestLocation.scala
+++ b/src/main/scala/org/clulab/habitus/apps/tpi/Step4FindNearestLocation.scala
@@ -31,9 +31,9 @@ object Step4FindNearestLocation extends App with Logging {
     val header = "prevLocation\tprevDistance\tnextLocation\tnextDistance"
   }
 
-  val inputFileName = "../corpora/ghana-regulations/ghana-regulations-3.tsv"
-  val outputFileName = "../corpora/ghana-regulations/ghana-regulations-4.tsv"
-  val expectedColumnCount = 22
+  val inputFileName = "../corpora/uganda/interview/interviews-b.tsv"
+  val outputFileName = "../corpora/uganda/interview/interviews-c.tsv"
+  val expectedColumnCount = 23
   val tsvReader = new TsvReader()
   var articleIndex = 0
 
@@ -41,7 +41,7 @@ object Step4FindNearestLocation extends App with Logging {
     val locationAndIndexes = lines.map { line =>
       val columns = tsvReader.readln(line)
       val sentenceIndex = columns(3).toInt
-      val sentenceLocation = columns(19)
+      val sentenceLocation = columns(20)
       LocationAndIndex(sentenceLocation, sentenceIndex)
     }.toList
 