Add logic to RTransparentPublication
quang-ng committed Dec 1, 2024
1 parent 9a66fa5 commit c959ac4
Showing 5 changed files with 66 additions and 20 deletions.
30 changes: 30 additions & 0 deletions alembic/versions/4d66a5dc782b_create_funder_column.py
@@ -0,0 +1,30 @@
"""Create funder column

Revision ID: 4d66a5dc782b
Revises: 845c59592898
Create Date: 2024-12-01 20:47:57.932627

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '4d66a5dc782b'
down_revision: Union[str, None] = '845c59592898'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column('rtransparent_publication', sa.Column('funder', sa.String(), nullable=True))
    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column('rtransparent_publication', 'funder')
    # ### end Alembic commands ###
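
A minimal sketch of applying this revision from Python, in case it helps while reviewing: it uses Alembic's standard command API, and the alembic.ini path is an assumption (running alembic upgrade head from the project root does the same thing).

from alembic import command
from alembic.config import Config

# Assumed location of the project's Alembic configuration.
config = Config("alembic.ini")
# Apply up to this revision; pass "head" to apply every pending migration instead.
command.upgrade(config, "4d66a5dc782b")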
23 changes: 23 additions & 0 deletions dsst_etl/logger.py
@@ -0,0 +1,23 @@
import logging
import sys

# Create a logger
logger = logging.getLogger("dsst_etl")
logger.setLevel(logging.DEBUG) # Set the log level to DEBUG for detailed output

# Create a console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.DEBUG)

# Create a formatter and set it for the handler
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(console_handler)

# Optionally, add a file handler if you want to log to a file
file_handler = logging.FileHandler("dsst_etl.log")
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
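
A short usage sketch of the shared logger, matching how upload_rtransparent_data.py imports it below: modules pick up the already-configured handlers instead of calling logging.basicConfig themselves.

from dsst_etl.logger import logger

logger.debug("shown on stdout only (the file handler is set to INFO)")
logger.info("shown on stdout and appended to dsst_etl.log")

Because Python caches module imports, the handlers above are attached exactly once no matter how many modules import the logger.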
3 changes: 3 additions & 0 deletions dsst_etl/models.py
@@ -59,6 +59,8 @@ class RTransparentPublication(Base):
    __tablename__ = "rtransparent_publication"

    id = Column(Integer, primary_key=True)
    title = Column(String)
    author = Column(String)

    # Mandatory fields
    is_open_code = Column(Boolean, nullable=True)
@@ -243,6 +245,7 @@ class RTransparentPublication(Base):
    title = Column(String, nullable=True)
    is_data_pred = Column(Boolean, nullable=True)
    is_code_pred = Column(Boolean, nullable=True)
    funder = Column(String, nullable=True)

    work_id = Column(Integer, ForeignKey("works.id"), nullable=True)
    provenance_id = Column(Integer, ForeignKey("provenance.id"), nullable=True)
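
For illustration, a hedged sketch of constructing a row that exercises the new columns; every value here is made up, and the remaining fields come from the surrounding class definition.

from dsst_etl.models import RTransparentPublication

# Hypothetical values; funder is stored as a single string on the model.
pub = RTransparentPublication(
    title="Example publication",
    author="Example Author",
    is_open_code=True,
    is_open_data=False,
    funder="NIMH, NSF",
)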
25 changes: 10 additions & 15 deletions dsst_etl/upload_rtransparent_data.py
@@ -1,13 +1,11 @@
import logging

import numpy as np
import pandas as pd
import sqlalchemy
from tqdm import tqdm

from dsst_etl.logger import logger
from dsst_etl.models import Provenance, RTransparentPublication, Works

# Configure logging
logger = logging.getLogger(__name__)


class RTransparentDataUploader:
    """
@@ -41,22 +39,19 @@ def upload_data(self, file_path, n_rows=1000):
        logger.info(f"Read {len(data)} rows from {file_path}")
        logger.info(f"Processing {n_rows} rows at a time")
        logger.info("Starting to process data")
        print(data.columns)

        # Process data in chunks
        for start in range(0, len(data), n_rows):
        for start in tqdm(range(0, len(data), n_rows), desc="Processing data"):
            chunk = data.iloc[start : start + n_rows]

            # Create entries for RTransparentPublication
            publications = []
            for _, row in chunk.iterrows():
                publication = RTransparentPublication(
                    is_open_code=row.get("is_open_code"),
                    is_open_data=row.get("is_open_data"),
                    year=row.get("year"),
                    filename=row.get("filename"),
                    # Add other fields as necessary
                )
                # Convert numpy.ndarray to string
                row_dict = row.to_dict()
                if isinstance(row_dict.get("funder"), np.ndarray):
                    row_dict["funder"] = ", ".join(row_dict["funder"].tolist())

                publication = RTransparentPublication(**row_dict)
                publications.append(publication)

            # Create and reference entries in Works and Provenance as needed
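
The substantive change in this hunk: rather than copying a handful of columns by hand, the whole row is splatted into the model, and array-valued funder cells are flattened to a comma-separated string first (the source data presumably stores multiple funders per paper as a numpy array). A standalone sketch of that conversion with made-up values:

import numpy as np
import pandas as pd

row = pd.Series({"funder": np.array(["NIMH", "NSF"]), "year": 2020})
row_dict = row.to_dict()
if isinstance(row_dict.get("funder"), np.ndarray):
    row_dict["funder"] = ", ".join(row_dict["funder"].tolist())
print(row_dict["funder"])  # -> "NIMH, NSF"

Note that RTransparentPublication(**row_dict) only works if every column in the input frame matches a mapped attribute (SQLAlchemy raises a TypeError for unknown keyword arguments), which is presumably why title, author, and funder are added to the model in this same commit.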
5 changes: 0 additions & 5 deletions scripts/run_upload_rtransparent_data.py
@@ -3,11 +3,6 @@
from dsst_etl.db import get_db_session
from dsst_etl.upload_rtransparent_data import RTransparentDataUploader

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
)

def main():
    # Set up argument parser
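
Only the removed logging setup is visible in this hunk, so the rest of main() is not shown. Purely as a hedged sketch of what it presumably wires together — the uploader's constructor and the input path are assumptions; only upload_data(file_path, n_rows=1000) is confirmed by the hunk header above:

from dsst_etl.db import get_db_session
from dsst_etl.upload_rtransparent_data import RTransparentDataUploader

def main():
    session = get_db_session()                    # import confirmed above; return value assumed
    uploader = RTransparentDataUploader(session)  # constructor signature is an assumption
    uploader.upload_data("rtransparent_output.feather", n_rows=1000)  # file path is made up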
