From c959ac46eec7ee3e5204078dd3eb586c37a0982f Mon Sep 17 00:00:00 2001 From: Quang Date: Sun, 1 Dec 2024 20:56:46 +0700 Subject: [PATCH] Add logic to RTransparentPublication --- .../4d66a5dc782b_create_funder_column.py | 30 +++++++++++++++++++ dsst_etl/logger.py | 23 ++++++++++++++ dsst_etl/models.py | 3 ++ dsst_etl/upload_rtransparent_data.py | 25 +++++++--------- scripts/run_upload_rtransparent_data.py | 5 ---- 5 files changed, 66 insertions(+), 20 deletions(-) create mode 100644 alembic/versions/4d66a5dc782b_create_funder_column.py create mode 100644 dsst_etl/logger.py diff --git a/alembic/versions/4d66a5dc782b_create_funder_column.py b/alembic/versions/4d66a5dc782b_create_funder_column.py new file mode 100644 index 0000000..e67e3c5 --- /dev/null +++ b/alembic/versions/4d66a5dc782b_create_funder_column.py @@ -0,0 +1,30 @@ +"""Create funder column + +Revision ID: 4d66a5dc782b +Revises: 845c59592898 +Create Date: 2024-12-01 20:47:57.932627 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '4d66a5dc782b' +down_revision: Union[str, None] = '845c59592898' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('rtransparent_publication', sa.Column('funder', sa.String(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('rtransparent_publication', 'funder') + # ### end Alembic commands ### diff --git a/dsst_etl/logger.py b/dsst_etl/logger.py new file mode 100644 index 0000000..fcfd1c7 --- /dev/null +++ b/dsst_etl/logger.py @@ -0,0 +1,23 @@ +import logging +import sys + +# Create a logger +logger = logging.getLogger("dsst_etl") +logger.setLevel(logging.DEBUG) # Set the log level to DEBUG for detailed output + +# Create a console handler +console_handler = logging.StreamHandler(sys.stdout) +console_handler.setLevel(logging.DEBUG) + +# Create a formatter and set it for the handler +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +console_handler.setFormatter(formatter) + +# Add the handler to the logger +logger.addHandler(console_handler) + +# Optionally, add a file handler if you want to log to a file +file_handler = logging.FileHandler("dsst_etl.log") +file_handler.setLevel(logging.INFO) +file_handler.setFormatter(formatter) +logger.addHandler(file_handler) diff --git a/dsst_etl/models.py b/dsst_etl/models.py index 0ffde88..293aeaa 100644 --- a/dsst_etl/models.py +++ b/dsst_etl/models.py @@ -59,6 +59,8 @@ class RTransparentPublication(Base): __tablename__ = "rtransparent_publication" id = Column(Integer, primary_key=True) + title = Column(String) + author = Column(String) # Mandatory fields is_open_code = Column(Boolean, nullable=True) @@ -243,6 +245,7 @@ class RTransparentPublication(Base): title = Column(String, nullable=True) is_data_pred = Column(Boolean, nullable=True) is_code_pred = Column(Boolean, nullable=True) + funder = Column(String, nullable=True) work_id = Column(Integer, ForeignKey("works.id"), nullable=True) provenance_id = Column(Integer, ForeignKey("provenance.id"), nullable=True) diff --git a/dsst_etl/upload_rtransparent_data.py b/dsst_etl/upload_rtransparent_data.py index a6f1d86..3ba3e62 100644 --- a/dsst_etl/upload_rtransparent_data.py +++ b/dsst_etl/upload_rtransparent_data.py @@ -1,13 +1,11 @@ -import logging - +import numpy as np import pandas as pd import sqlalchemy +from tqdm import tqdm +from dsst_etl.logger import logger from dsst_etl.models import Provenance, RTransparentPublication, Works -# Configure logging -logger = logging.getLogger(__name__) - class RTransparentDataUploader: """ @@ -41,22 +39,19 @@ def upload_data(self, file_path, n_rows=1000): logger.info(f"Read {len(data)} rows from {file_path}") logger.info(f"Processing {n_rows} rows at a time") logger.info("Starting to process data") - print(data.columns) # Process data in chunks - for start in range(0, len(data), n_rows): + for start in tqdm(range(0, len(data), n_rows), desc="Processing data"): chunk = data.iloc[start : start + n_rows] - # Create entries for RTransparentPublication publications = [] for _, row in chunk.iterrows(): - publication = RTransparentPublication( - is_open_code=row.get("is_open_code"), - is_open_data=row.get("is_open_data"), - year=row.get("year"), - filename=row.get("filename"), - # Add other fields as necessary - ) + # Convert numpy.ndarray to string + row_dict = row.to_dict() + if isinstance(row_dict.get("funder"), np.ndarray): + row_dict["funder"] = ", ".join(row_dict["funder"].tolist()) + + publication = RTransparentPublication(**row_dict) publications.append(publication) # Create and reference entries in Works and Provenance as needed diff --git a/scripts/run_upload_rtransparent_data.py b/scripts/run_upload_rtransparent_data.py index 5ece720..b2278c3 100644 --- a/scripts/run_upload_rtransparent_data.py +++ b/scripts/run_upload_rtransparent_data.py @@ -3,11 +3,6 @@ from dsst_etl.db import get_db_session from dsst_etl.upload_rtransparent_data import RTransparentDataUploader -logger = logging.getLogger(__name__) -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s' - ) def main(): # Set up argument parser