From c6adea8bfd118eed342f207fc06f83e15bc0a7a9 Mon Sep 17 00:00:00 2001
From: Keith Alcock <github@keithalcock.com>
Date: Tue, 21 Jan 2025 17:56:32 -0700
Subject: [PATCH] Include the logging from heuristics

---
 belief_pipeline/pandas_output_stage.py           |  4 ----
 belief_pipeline/pipeline.py                      | 13 +++++++++++++
 belief_pipeline/tpi_input_stage.py               |  4 ----
 belief_pipeline/tpi_location_stage_with_patch.py |  4 ++++
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/belief_pipeline/pandas_output_stage.py b/belief_pipeline/pandas_output_stage.py
index cb5d9302..ae560e8d 100644
--- a/belief_pipeline/pandas_output_stage.py
+++ b/belief_pipeline/pandas_output_stage.py
@@ -7,10 +7,6 @@ def __init__(self, file_name: str) -> None:
         # if not os.path.exists(file_name):
         #     os.makedirs(file_name) # find the directory it's in, not use the entire file
 
-    def log(self, message: str):
-        with open("output.txt", "a", encoding="utf-8", newline="\n") as file:
-            print(message, file=file)
-
     def write(self, text):
         nl_count = text.count("\n") + 1
         self.log(str(nl_count))
diff --git a/belief_pipeline/pipeline.py b/belief_pipeline/pipeline.py
index 7a2f7c46..1f67f949 100644
--- a/belief_pipeline/pipeline.py
+++ b/belief_pipeline/pipeline.py
@@ -5,6 +5,13 @@ class PipelineStage():
     def _init__(self) -> None:
         pass
 
+    def log(self, message: str):  # Default log sink: appends message to "log.txt" through logToFile.
+        self.logToFile("log.txt", message)
+
+    def logToFile(self, filename: str, message: str):  # Append message as one line to the file named filename.
+        with open(filename, "a", encoding="utf-8", newline="\n") as file:  # "a" appends (creating the file if needed); newline="\n" writes LF untranslated; a relative filename resolves against the current working directory.
+            print(message, file=file)  # print() supplies the trailing newline
+
 class OuterStage(PipelineStage):
     def __init__(self) -> None:
         super().__init__()
@@ -17,6 +24,9 @@ def __init__(self, dir_name: str) -> None:
     def run(self) -> DataFrame:
         pass
 
+    def log(self, message: str):  # Sends this stage's messages to "input.txt"; presumably overrides the base-class log (enclosing class header is outside this hunk -- confirm).
+        self.logToFile("input.txt", message)
+
 class OutputStage(OuterStage):
     def __init__(self, file_name: str) -> None:
         super().__init__()
@@ -25,6 +35,9 @@ def __init__(self, file_name: str) -> None:
     def run(self, data_frame: DataFrame):
         pass
 
+    def log(self, message: str):  # Overrides the inherited log so output-stage messages are appended to "output.txt".
+        self.logToFile("output.txt", message)
+
 class InnerStage(PipelineStage):
     def __init__(self) -> None:
         super().__init__()
diff --git a/belief_pipeline/tpi_input_stage.py b/belief_pipeline/tpi_input_stage.py
index 7d5f3431..84562251 100644
--- a/belief_pipeline/tpi_input_stage.py
+++ b/belief_pipeline/tpi_input_stage.py
@@ -51,10 +51,6 @@ def mk_data_frame(self, file_name: str, sep: str) -> DataFrame:
                 print("There is an empty sentence!")
                 data_frame["sentence"][index] = "" # What should be done?
         return data_frame
-    
-    def log(self, message: str):
-        with open("input.txt", "a", encoding="utf-8", newline="\n") as file:
-            print(message, file=file)
 
     def read(self) -> StringIO:
         # In Python, the line separator is preserved.
diff --git a/belief_pipeline/tpi_location_stage_with_patch.py b/belief_pipeline/tpi_location_stage_with_patch.py
index ac16868e..0a9ddacf 100644
--- a/belief_pipeline/tpi_location_stage_with_patch.py
+++ b/belief_pipeline/tpi_location_stage_with_patch.py
@@ -3,6 +3,7 @@
 from tqdm import tqdm
 
 import itertools
+import os
 import pandas
 import re
 import spacy
@@ -68,6 +69,9 @@ def __init__(self, locations_file_name: str) -> None:
         # message on the console about lost data, probably from the extra column that we're not using here:
         # ParserWarning: Length of header or names does not match length of data. This leads to a loss of data
         # with index_col=False.
+        self.log("locations_file_name " + locations_file_name)
+        self.log("working directory " + os.getcwd())
+
         locations_data_frame = pandas.read_csv(locations_file_name, sep="\t", encoding="utf-8", index_col=False, names=[
             "geonameid", "name", "asciiname", "alternatenames", "latitude", "longitude", "unk1", "unk2", "country_code",
             "cc2", "unk3", "unk4", "unk5", "unk6", "population", "elevation", "unk7", "timezone", "unk8" #, "notes"