From c6adea8bfd118eed342f207fc06f83e15bc0a7a9 Mon Sep 17 00:00:00 2001
From: Keith Alcock
Date: Tue, 21 Jan 2025 17:56:32 -0700
Subject: [PATCH] Include the logging from heuristics

---
 belief_pipeline/pandas_output_stage.py           |  4 ----
 belief_pipeline/pipeline.py                      | 13 +++++++++++++
 belief_pipeline/tpi_input_stage.py               |  4 ----
 belief_pipeline/tpi_location_stage_with_patch.py |  4 ++++
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/belief_pipeline/pandas_output_stage.py b/belief_pipeline/pandas_output_stage.py
index cb5d9302..ae560e8d 100644
--- a/belief_pipeline/pandas_output_stage.py
+++ b/belief_pipeline/pandas_output_stage.py
@@ -7,10 +7,6 @@ def __init__(self, file_name: str) -> None:
         # if not os.path.exists(file_name):
         # os.makedirs(file_name) # find the directory it's in, not use the entire file
 
-    def log(self, message: str):
-        with open("output.txt", "a", encoding="utf-8", newline="\n") as file:
-            print(message, file=file)
-
     def write(self, text):
         nl_count = text.count("\n") + 1
         self.log(str(nl_count))
diff --git a/belief_pipeline/pipeline.py b/belief_pipeline/pipeline.py
index 7a2f7c46..1f67f949 100644
--- a/belief_pipeline/pipeline.py
+++ b/belief_pipeline/pipeline.py
@@ -5,6 +5,13 @@ class PipelineStage():
     def _init__(self) -> None:
         pass
 
+    def log(self, message: str):
+        self.logToFile("log.txt", message)
+
+    def logToFile(self, filename: str, message: str):
+        with open(filename, "a", encoding="utf-8", newline="\n") as file:
+            print(message, file=file)
+
 class OuterStage(PipelineStage):
     def __init__(self) -> None:
         super().__init__()
@@ -17,6 +24,9 @@ def __init__(self, dir_name: str) -> None:
     def run(self) -> DataFrame:
         pass
 
+    def log(self, message: str):
+        self.logToFile("input.txt", message)
+
 class OutputStage(OuterStage):
     def __init__(self, file_name: str) -> None:
         super().__init__()
@@ -25,6 +35,9 @@ def __init__(self, file_name: str) -> None:
     def run(self, data_frame: DataFrame):
         pass
 
+    def log(self, message: str):
+        self.logToFile("output.txt", message)
+
 class InnerStage(PipelineStage):
     def __init__(self) -> None:
         super().__init__()
diff --git a/belief_pipeline/tpi_input_stage.py b/belief_pipeline/tpi_input_stage.py
index 7d5f3431..84562251 100644
--- a/belief_pipeline/tpi_input_stage.py
+++ b/belief_pipeline/tpi_input_stage.py
@@ -51,10 +51,6 @@ def mk_data_frame(self, file_name: str, sep: str) -> DataFrame:
                 print("There is an empty sentence!")
                 data_frame["sentence"][index] = "" # What should be done?
         return data_frame
-    
-    def log(self, message: str):
-        with open("input.txt", "a", encoding="utf-8", newline="\n") as file:
-            print(message, file=file)
 
     def read(self) -> StringIO:
         # In Python, the line separator is preserved.
diff --git a/belief_pipeline/tpi_location_stage_with_patch.py b/belief_pipeline/tpi_location_stage_with_patch.py
index ac16868e..0a9ddacf 100644
--- a/belief_pipeline/tpi_location_stage_with_patch.py
+++ b/belief_pipeline/tpi_location_stage_with_patch.py
@@ -3,6 +3,7 @@
 from tqdm import tqdm
 
 import itertools
+import os
 import pandas
 import re
 import spacy
@@ -68,6 +69,9 @@ def __init__(self, locations_file_name: str) -> None:
         # message on the console about lost data, probably from the extra column that we're not using here:
         # ParserWarning: Length of header or names does not match length of data. This leads to a loss of data
         # with index_col=False.
+        self.log("locations_file_name " + locations_file_name)
+        self.log("working directory " + os.getcwd())
+
         locations_data_frame = pandas.read_csv(locations_file_name, sep="\t", encoding="utf-8", index_col=False, names=[
             "geonameid", "name", "asciiname", "alternatenames", "latitude", "longitude", "unk1", "unk2", "country_code", "cc2",
             "unk3", "unk4", "unk5", "unk6", "population", "elevation", "unk7", "timezone", "unk8" #, "notes"