Skip to content

Commit

Permalink
logging and lower yield check
Browse files: browse the repository at this point in the history
  • Loading branch information
rymarczy committed Oct 3, 2024
1 parent a2ecf97 commit 121deb2
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/lamp_py/ingestion/convert_gtfs_rt.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def process_files(self) -> Iterable[pyarrow.table]:
process_logger.log_complete()

def yield_check(
- self, process_logger: ProcessLogger, min_rows: int = 5_000_000
+ self, process_logger: ProcessLogger, min_rows: int = 2_000_000
) -> Iterable[pyarrow.table]:
"""
yield all tables in the data_parts map that have been sufficiently
Expand Down Expand Up @@ -393,17 +393,25 @@ def make_hash_dataset(
:param table: pyarrow Table
:param local_path: path to local parquet file
"""
logger = ProcessLogger("make_hash_dataset")
logger.log_start()

table = hash_gtfs_rt_table(table)
logger.add_metadata(step="complete hash_gtfs_rt_table")
out_ds = pd.dataset(table)
logger.add_metadata(step="create out_ds(table)")

if self.sync_with_s3(local_path):
hash_gtfs_rt_parquet(local_path)
logger.add_metadata(step="complete hash_gtfs_rt_parquet")
out_ds = pd.dataset(
[
pd.dataset(table),
pd.dataset(local_path),
]
)
logger.add_metadata(step="create out_ds(local+table)")
logger.log_complete()

return out_ds

Expand Down

0 comments on commit 121deb2

Please sign in to comment.