Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: sgm-gharchive - updates and fixes for generating parquet from gharchive records + KubeCon SLC Demo #127

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions db/sgm-gharchive/cncf-consolidate-gz.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,14 @@ set -euox pipefail
# -d, --dry-run Perform a dry run without actual concatenation
# -v, --verbose Enable verbose output
# -h, --help Display this help text
./consolidate-gz.sh --source ~/gharchive-cncf/debug.cncf.all \
--target ~/gharchive-cncf/debug.cncf.byrepo \
--verbose
# --dry-run

# Consolidate the 2024 daily output under /p/gha-parquet-daily/2024 into
# /p/gha-parquet-cncf. Only stdout is piped to tee, so whatever the script
# prints there is shown on the console and also saved to
# consolidate-gz-2024.log; anything written to stderr is not captured in
# that log file.
./consolidate-gz.sh --source /p/gha-parquet-daily/2024 \
--target /p/gha-parquet-cncf \
--verbose | tee consolidate-gz-2024.log



# ./consolidate-gz.sh --source ~/gharchive-cncf/debug.cncf.all \
# --target ~/gharchive-cncf/debug.cncf.byrepo \
# --verbose
# --dry-run
54 changes: 54 additions & 0 deletions db/sgm-gharchive/gharchive-gz-to-hour2day.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
@echo off
REM ===========================================================================
REM gharchive-gz-to-hour2day.bat
REM
REM Runs gharchive-gz-hour2day.py for one source path, sends everything the
REM Python process prints (stdout and stderr) to a log file, and reports how
REM long the run took.
REM
REM Usage:
REM   gharchive-gz-to-hour2day.bat source target orgfile [logs] [pylogs] [workers]
REM
REM Exit code: the exit code of the Python process, or 1 when only the help
REM text was shown.
REM ===========================================================================

REM Keep this script's variables out of the caller's environment.
setlocal

REM Decide whether to show help BEFORE doing any work, so that asking for
REM help (or forgetting a required argument) never starts a conversion.
if "%~1"=="/?" goto help
if "%~1"=="" goto help
if "%~2"=="" goto help
if "%~3"=="" goto help

echo ***
echo *** BEGIN
echo ***

REM The time variable pads hours below 10 with a space; replace it with a
REM zero so the HH field can be used in arithmetic further down.
set "start=%time: =0%"

REM Get command line arguments (the ~ modifier strips surrounding quotes)
set "source=%~1"
set "target=%~2"
set "orgfile=%~3"
set "logs=%~4"
set "pylogs=%~5"
set "workers=%~6"

REM Default log names are tagged with the first 8 characters of the LAST
REM component of the source path. Using the leaf name instead of the start
REM of the whole path keeps drive letters and backslashes out of the file
REM name; for a source given as a bare name the result is unchanged.
set "sourcename="
for %%I in ("%source%") do set "sourcename=%%~nxI"
if not defined sourcename set "sourcename=source"
set "sourcetag=%sourcename:~0,8%"

if not defined logs set "logs=%target%\gharchive-gz-hour2day-%sourcetag%.csv"
if not defined pylogs set "pylogs=%target%\gharchive-gz-hour2day-pylog-%sourcetag%.log"
if not defined workers set "workers=55"

REM Paths are quoted so that directories containing spaces survive. Pass
REM paths without a trailing backslash: a backslash directly before the
REM closing quote is commonly parsed as an escaped quote.
python gharchive-gz-hour2day.py --source "%source%" --target "%target%" --org-file "%orgfile%" --log-results "%logs%" --verbose --workers %workers% > "%pylogs%" 2>&1
set "rc=%errorlevel%"

REM Elapsed whole seconds from the HH:MM:SS fields. Each field is prefixed
REM with 1 so that 08 and 09 are not rejected as invalid octal numbers; the
REM prefix cancels out in every subtraction.
set "stop=%time: =0%"
set /a "secs=(1%stop:~0,2% - 1%start:~0,2%) * 3600 + (1%stop:~3,2% - 1%start:~3,2%) * 60 + (1%stop:~6,2% - 1%start:~6,2%)"
REM A negative value means the run crossed midnight (runs under 24h only).
if %secs% lss 0 set /a "secs+=86400"
echo Duration: %secs% seconds

echo ***
echo *** END
echo ***

REM Hand the Python exit code back to the caller.
exit /b %rc%

:help
echo Usage:
echo   %0 source target orgfile [logs] [pylogs] [workers]
echo Required:
echo   source:   Path to source data
echo   target:   Path for output data
echo   orgfile:  Path to org list file
echo Optional:
echo   logs:     Path to save results log CSV (default is target\gharchive-gz-hour2day-NAME.csv)
echo   pylogs:   Path to save python output log (default is target\gharchive-gz-hour2day-pylog-NAME.log)
echo             NAME is the first 8 characters of the last component of source
echo   workers:  Number of workers (default is 55)
exit /b 1
100 changes: 70 additions & 30 deletions db/sgm-gharchive/gharchive-gz-to-parquet.bat
Original file line number Diff line number Diff line change
@@ -1,54 +1,94 @@
@echo off

echo ***
echo *** BEGIN
echo *** BEGIN: GHArchive GZ to Parquet Conversion
echo *** Started at: %date% %time%
echo ***

set "start=%time%"

REM Get command line arguments
REM Validate required arguments
if "%~1"=="" goto help
if "%~2"=="" goto help
if "%~3"=="" goto help

REM Get command line arguments
set "source=%~1"
set "target=%~2"
set "orgfile=%~3"
set "logs=%~4"
set "pylogs=%~5"
set "workers=%~6"

if "%logs%"=="" (
set "logs=%target%\gharchive-gz-hour2day-%source:~0,8%.csv"
REM Set optional arguments with defaults
REM The leaf name of the source path is computed on its own line, outside any
REM parenthesised block: percent-variables inside a block are expanded when
REM the whole block is parsed, so a value set and read in the same block
REM would still be empty when the default name is built.
for %%I in ("%source%") do set "sourcename=%%~nxI"
set "logs=%~4"
if not defined logs set "logs=%target%\gharchive-gz-to-parquet-%sourcename%.csv"

if "%pylogs%"=="" (
set "pylogs=%target%\gharchive-gz-hour2day-pylog-%source:~0,8%.log"
REM Python log path: argument 5 when supplied, otherwise a name derived from
REM the leaf name of the source path. The leaf name is computed on its own
REM line because percent-variables inside a parenthesised block are expanded
REM when the block is parsed, before a set in that same block has run.
for %%I in ("%source%") do set "sourcename=%%~nxI"
set "pylogs=%~5"
if not defined pylogs set "pylogs=%target%\gharchive-gz-to-parquet-pylog-%sourcename%.log"

if "%workers%"=="" (
set workers=55
REM Worker count: take argument 6 when it is supplied, otherwise use 55.
set "workers=%~6"
if not defined workers set workers=55

python gharchive-gz-hour2day.py --source %source% --target %target% --org-file %orgfile% --log-results %logs% --verbose --workers %workers% > %pylogs% 2>&1
REM Run conversion
echo Running conversion with:
echo Source: %source%
echo Target: %target%
echo Org File: %orgfile%
echo Logs: %logs%
echo Python Logs: %pylogs%
echo Workers: %workers%
echo.

set "stop=%time%"
set /a "secs=1%stop:~6,2% - 1%start:~6,2% + (1%stop:~3,2% - 1%start:~3,2%) * 60 + (1%stop:~0,2% - 1%start:~0,2%) / 60"
echo Duration: %secs% seconds
if not exist "%target%" (
echo Creating target directory: %target%
mkdir "%target%"
)

@echo off
echo ***
echo *** END
echo ***
REM Run the converter with its output left on the console (no redirection).
REM NOTE(review): nothing in this script writes to the pylogs file any more;
REM the path is only computed and displayed. Restore a redirect or drop the
REM option if a Python log file is still wanted.
REM Path arguments are quoted so directories containing spaces survive. Pass
REM paths without a trailing backslash: a backslash directly before the
REM closing quote is commonly parsed as an escaped quote.
echo python gharchive-gz-to-parquet.py --source "%source%" --target "%target%" --org-file "%orgfile%" --log-results "%logs%" --verbose --workers %workers%
python gharchive-gz-to-parquet.py --source "%source%" --target "%target%" --org-file "%orgfile%" --log-results "%logs%" --verbose --workers %workers%

REM Help text
if "%~1"=="/?" goto help
REM Calculate duration in whole seconds from the HH:MM:SS fields.
REM The time variable pads hours below 10 with a space, so spaces are
REM replaced by zeros in both timestamps before the fields are used.
set "stop=%time: =0%"
set "start=%start: =0%"
REM Each field is prefixed with 1 so that 08 and 09 are not rejected as
REM invalid octal numbers; the prefix cancels out in every subtraction.
REM Hours count for 3600 seconds each, minutes for 60.
set /a "secs=(1%stop:~0,2% - 1%start:~0,2%) * 3600 + (1%stop:~3,2% - 1%start:~3,2%) * 60 + (1%stop:~6,2% - 1%start:~6,2%)"
REM A negative value means the run crossed midnight (runs under 24h only).
if %secs% lss 0 set /a "secs+=86400"
echo.
echo ***
echo *** END: Conversion completed
echo *** Duration: %secs% seconds
echo *** Finished at: %date% %time%
echo ***
exit /b

:help
echo GHArchive GZ to Parquet Conversion Script
echo.
echo Usage:
echo %0 source target orgfile [logs] [pylogs] [workers]
echo Required:
echo source: Path to source data
echo target: Path for output data
echo orgfile: Path to org list file
echo Optional:
echo logs: Path to save results log CSV (default is gharchive-gz-hour2day-source-target.csv)
echo pylogs: Path to save python output log (default is gharchive-gz-hour2day-pylog-source-target.log)
echo workers: Number of workers (default is 55)
echo %~nx0 source target orgfile [logs] [pylogs] [workers]
echo.
echo Required arguments:
echo source Path to source GZ data files
echo target Path for output Parquet files
echo orgfile Path to organization list file
echo.
echo Optional arguments:
echo   logs      Path to save results log CSV
echo             (default: target\gharchive-gz-to-parquet-SOURCENAME.csv)
echo   pylogs    Path to save Python output log
echo             (default: target\gharchive-gz-to-parquet-pylog-SOURCENAME.log)
echo             SOURCENAME is the last component of the source path
echo workers Number of parallel workers (default: 55)
echo.
echo Example:
echo %~nx0 "p:\gha-raw-daily\2024" "p:\gha-parquet-daily\2024" "org-list-cncf.txt"

REM exit /b 1 returns error code 1 to calling process to indicate help was shown
REM due to missing required arguments or explicit help request
exit /b 1
Loading
Loading