diff --git a/myenv/bin/Activate.ps1 b/myenv/bin/Activate.ps1
new file mode 100644
index 0000000..b49d77b
--- /dev/null
+++ b/myenv/bin/Activate.ps1
@@ -0,0 +1,247 @@
+<#
+.Synopsis
+Activate a Python virtual environment for the current PowerShell session.
+
+.Description
+Pushes the python executable for a virtual environment to the front of the
+$Env:PATH environment variable and sets the prompt to signify that you are
+in a Python virtual environment. Makes use of the command line switches as
+well as the `pyvenv.cfg` file values present in the virtual environment.
+
+.Parameter VenvDir
+Path to the directory that contains the virtual environment to activate. The
+default value for this is the parent of the directory that the Activate.ps1
+script is located within.
+
+.Parameter Prompt
+The prompt prefix to display when this virtual environment is activated. By
+default, this prompt is the name of the virtual environment folder (VenvDir)
+surrounded by parentheses and followed by a single space (i.e. '(.venv) ').
+
+.Example
+Activate.ps1
+Activates the Python virtual environment that contains the Activate.ps1 script.
+
+.Example
+Activate.ps1 -Verbose
+Activates the Python virtual environment that contains the Activate.ps1 script,
+and shows extra information about the activation as it executes.
+
+.Example
+Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
+Activates the Python virtual environment located in the specified location.
+
+.Example
+Activate.ps1 -Prompt "MyPython"
+Activates the Python virtual environment that contains the Activate.ps1 script,
+and prefixes the current prompt with the specified string (surrounded in
+parentheses) while the virtual environment is active.
+
+.Notes
+On Windows, it may be required to enable this Activate.ps1 script by setting the
+execution policy for the user. You can do this by issuing the following PowerShell
+command:
+
+PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
+
+For more information on Execution Policies:
+https://go.microsoft.com/fwlink/?LinkID=135170
+
+#>
+Param(
+    [Parameter(Mandatory = $false)]
+    [String]
+    $VenvDir,
+    [Parameter(Mandatory = $false)]
+    [String]
+    $Prompt
+)
+
+<# Function declarations --------------------------------------------------- #>
+
+<#
+.Synopsis
+Remove all shell session elements added by the Activate script, including the
+virtual environment's Python executable directory that was prepended to the
+PATH variable.
+
+.Parameter NonDestructive
+If present, do not remove this function from the global namespace for the
+session.
+
+#>
+function global:deactivate ([switch]$NonDestructive) {
+    # Revert to original values
+
+    # The prior prompt:
+    if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
+        Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
+        Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
+    }
+
+    # The prior PYTHONHOME:
+    if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
+        Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
+        Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
+    }
+
+    # The prior PATH:
+    if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
+        Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
+        Remove-Item -Path Env:_OLD_VIRTUAL_PATH
+    }
+
+    # Just remove the VIRTUAL_ENV altogether:
+    if (Test-Path -Path Env:VIRTUAL_ENV) {
+        Remove-Item -Path env:VIRTUAL_ENV
+    }
+
+    # Just remove VIRTUAL_ENV_PROMPT altogether.
+    if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
+        Remove-Item -Path env:VIRTUAL_ENV_PROMPT
+    }
+
+    # Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
+    if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
+        Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
+    }
+
+    # Leave deactivate function in the global namespace if requested:
+    if (-not $NonDestructive) {
+        Remove-Item -Path function:deactivate
+    }
+}
+
+<#
+.Description
+Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
+given folder, and returns them in a map.
+
+For each line in the pyvenv.cfg file, if that line can be parsed into exactly
+two strings separated by `=` (with any amount of whitespace surrounding the =)
+then it is considered a `key = value` line. The left hand string is the key,
+the right hand is the value.
+
+If the value starts with a `'` or a `"` then the first and last character is
+stripped from the value before being captured.
+
+.Parameter ConfigDir
+Path to the directory that contains the `pyvenv.cfg` file.
+#>
+function Get-PyVenvConfig(
+    [String]
+    $ConfigDir
+) {
+    Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
+
+    # Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
+    $pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
+
+    # An empty map will be returned if no config file is found.
+    $pyvenvConfig = @{ }
+
+    if ($pyvenvConfigPath) {
+
+        Write-Verbose "File exists, parse `key = value` lines"
+        $pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
+
+        $pyvenvConfigContent | ForEach-Object {
+            $keyval = $PSItem -split "\s*=\s*", 2
+            if ($keyval[0] -and $keyval[1]) {
+                $val = $keyval[1]
+
+                # Remove extraneous quotations around a string value.
+                if ("'""".Contains($val.Substring(0, 1))) {
+                    $val = $val.Substring(1, $val.Length - 2)
+                }
+
+                $pyvenvConfig[$keyval[0]] = $val
+                Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
+            }
+        }
+    }
+    return $pyvenvConfig
+}
+
+
+<# Begin Activate script --------------------------------------------------- #>
+
+# Determine the containing directory of this script
+$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
+$VenvExecDir = Get-Item -Path $VenvExecPath
+
+Write-Verbose "Activation script is located in path: '$VenvExecPath'"
+Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
+Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
+
+# Set values required in priority: CmdLine, ConfigFile, Default
+# First, get the location of the virtual environment, it might not be
+# VenvExecDir if specified on the command line.
+if ($VenvDir) {
+    Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
+}
+else {
+    Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
+    $VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
+    Write-Verbose "VenvDir=$VenvDir"
+}
+
+# Next, read the `pyvenv.cfg` file to determine any required value such
+# as `prompt`.
+$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
+
+# Next, set the prompt from the command line, or the config file, or
+# just use the name of the virtual environment folder.
+if ($Prompt) {
+    Write-Verbose "Prompt specified as argument, using '$Prompt'"
+}
+else {
+    Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
+    if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
+        Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
+        $Prompt = $pyvenvCfg['prompt'];
+    }
+    else {
+        Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
+        Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
+        $Prompt = Split-Path -Path $venvDir -Leaf
+    }
+}
+
+Write-Verbose "Prompt = '$Prompt'"
+Write-Verbose "VenvDir='$VenvDir'"
+
+# Deactivate any currently active virtual environment, but leave the
+# deactivate function in place.
+deactivate -nondestructive
+
+# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
+# that there is an activated venv.
+$env:VIRTUAL_ENV = $VenvDir
+
+if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
+
+    Write-Verbose "Setting prompt to '$Prompt'"
+
+    # Set the prompt to include the env name
+    # Make sure _OLD_VIRTUAL_PROMPT is global
+    function global:_OLD_VIRTUAL_PROMPT { "" }
+    Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
+    New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
+
+    function global:prompt {
+        Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
+        _OLD_VIRTUAL_PROMPT
+    }
+    $env:VIRTUAL_ENV_PROMPT = $Prompt
+}
+
+# Clear PYTHONHOME, saving the old value so deactivate can restore it
+if (Test-Path -Path Env:PYTHONHOME) {
+    Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
+    Remove-Item -Path Env:PYTHONHOME
+}
+
+# Add the venv to the PATH
+Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
+$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
diff --git a/myenv/bin/activate b/myenv/bin/activate
new file mode 100644
index 0000000..8439835
--- /dev/null
+++ b/myenv/bin/activate
@@ -0,0 +1,69 @@
+# This file must be used with "source bin/activate" *from bash*
+# you cannot run it directly
+
+deactivate () {
+    # reset old environment variables
+    if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
+        PATH="${_OLD_VIRTUAL_PATH:-}"
+        export PATH
+        unset _OLD_VIRTUAL_PATH
+    fi
+    if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
+        PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
+        export PYTHONHOME
+        unset _OLD_VIRTUAL_PYTHONHOME
+    fi
+
+    # This should detect bash and zsh, which have a hash command that must
+    # be called to get it to forget past commands. Without forgetting
+    # past commands the $PATH changes we made may not be respected
+    if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
+        hash -r 2> /dev/null
+    fi
+
+    if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
+        PS1="${_OLD_VIRTUAL_PS1:-}"
+        export PS1
+        unset _OLD_VIRTUAL_PS1
+    fi
+
+    unset VIRTUAL_ENV
+    unset VIRTUAL_ENV_PROMPT
+    if [ ! "${1:-}" = "nondestructive" ] ; then
+    # Self destruct!
+        unset -f deactivate
+    fi
+}
+
+# unset irrelevant variables (undo any earlier activation, keep the deactivate function)
+deactivate nondestructive
+
+VIRTUAL_ENV="/Users/sahithikasim/guac-alytics/myenv"
+export VIRTUAL_ENV
+
+_OLD_VIRTUAL_PATH="$PATH"
+PATH="$VIRTUAL_ENV/bin:$PATH"
+export PATH
+
+# unset PYTHONHOME if set
+# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
+# could use `if (set -u; : $PYTHONHOME) ;` in bash
+if [ -n "${PYTHONHOME:-}" ] ; then
+    _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
+    unset PYTHONHOME
+fi
+
+if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
+    _OLD_VIRTUAL_PS1="${PS1:-}"
+    PS1="(myenv) ${PS1:-}"
+    export PS1
+    VIRTUAL_ENV_PROMPT="(myenv) "
+    export VIRTUAL_ENV_PROMPT
+fi
+
+# This should detect bash and zsh, which have a hash command that must
+# be called to get it to forget past commands. Without forgetting
+# past commands the $PATH changes we made may not be respected
+if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
+    hash -r 2> /dev/null
+fi
diff --git a/myenv/bin/activate.csh b/myenv/bin/activate.csh
new file mode 100644
index 0000000..8e4476b
--- /dev/null
+++ b/myenv/bin/activate.csh
@@ -0,0 +1,26 @@
+# This file must be used with "source bin/activate.csh" *from csh*.
+# You cannot run it directly.
+# Created by Davide Di Blasi .
+# Ported to Python 3.3 venv by Andrew Svetlov
+
+alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
+
+# Unset irrelevant variables.
+deactivate nondestructive
+
+setenv VIRTUAL_ENV "/Users/sahithikasim/guac-alytics/myenv"
+
+set _OLD_VIRTUAL_PATH="$PATH"
+setenv PATH "$VIRTUAL_ENV/bin:$PATH"
+
+
+set _OLD_VIRTUAL_PROMPT="$prompt"
+
+if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
+    set prompt = "(myenv) $prompt"
+    setenv VIRTUAL_ENV_PROMPT "(myenv) "
+endif
+
+alias pydoc python -m pydoc
+
+rehash
diff --git a/myenv/bin/activate.fish b/myenv/bin/activate.fish
new file mode 100644
index 0000000..2aa114d
--- /dev/null
+++ b/myenv/bin/activate.fish
@@ -0,0 +1,69 @@
+# This file must be used with "source <venv>/bin/activate.fish" *from fish*
+# (https://fishshell.com/); you cannot run it directly.
+
+function deactivate -d "Exit virtual environment and return to normal shell environment"
+    # reset old environment variables
+    if test -n "$_OLD_VIRTUAL_PATH"
+        set -gx PATH $_OLD_VIRTUAL_PATH
+        set -e _OLD_VIRTUAL_PATH
+    end
+    if test -n "$_OLD_VIRTUAL_PYTHONHOME"
+        set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
+        set -e _OLD_VIRTUAL_PYTHONHOME
+    end
+
+    if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
+        set -e _OLD_FISH_PROMPT_OVERRIDE
+        # prevents error when using nested fish instances (Issue #93858)
+        if functions -q _old_fish_prompt
+            functions -e fish_prompt
+            functions -c _old_fish_prompt fish_prompt
+            functions -e _old_fish_prompt
+        end
+    end
+
+    set -e VIRTUAL_ENV
+    set -e VIRTUAL_ENV_PROMPT
+    if test "$argv[1]" != "nondestructive"
+        # Self-destruct!
+        functions -e deactivate
+    end
+end
+
+# Unset irrelevant variables (undo any earlier activation, keep the function).
+deactivate nondestructive
+
+set -gx VIRTUAL_ENV "/Users/sahithikasim/guac-alytics/myenv"
+
+set -gx _OLD_VIRTUAL_PATH $PATH
+set -gx PATH "$VIRTUAL_ENV/bin" $PATH
+
+# Unset PYTHONHOME if set.
+if set -q PYTHONHOME
+    set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
+    set -e PYTHONHOME
+end
+
+if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
+    # fish uses a function instead of an env var to generate the prompt.
+
+    # Save the current fish_prompt function as the function _old_fish_prompt.
+    functions -c fish_prompt _old_fish_prompt
+
+    # With the original prompt function renamed, we can override with our own.
+    function fish_prompt
+        # Save the return status of the last command.
+        set -l old_status $status
+
+        # Output the venv prompt; color taken from the blue of the Python logo.
+        printf "%s%s%s" (set_color 4B8BBE) "(myenv) " (set_color normal)
+
+        # Restore the return status of the previous command.
+        echo "exit $old_status" | .
+        # Output the original/"old" prompt.
+        _old_fish_prompt
+    end
+
+    set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
+    set -gx VIRTUAL_ENV_PROMPT "(myenv) "
+end
diff --git a/myenv/bin/normalizer b/myenv/bin/normalizer
new file mode 100755
index 0000000..f57dd66
--- /dev/null
+++ b/myenv/bin/normalizer
@@ -0,0 +1,8 @@
+#!/Users/sahithikasim/guac-alytics/myenv/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from charset_normalizer.cli.normalizer import cli_detect
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(cli_detect())
diff --git a/myenv/bin/pip b/myenv/bin/pip
new file mode 100755
index 0000000..2b726cd
--- /dev/null
+++ b/myenv/bin/pip
@@ -0,0 +1,8 @@
+#!/Users/sahithikasim/guac-alytics/myenv/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pip._internal.cli.main import main
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/myenv/bin/pip3 b/myenv/bin/pip3
new file mode 100755
index
0000000..2b726cd
--- /dev/null
+++ b/myenv/bin/pip3
@@ -0,0 +1,8 @@
+#!/Users/sahithikasim/guac-alytics/myenv/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pip._internal.cli.main import main
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/myenv/bin/pip3.10 b/myenv/bin/pip3.10
new file mode 100755
index 0000000..2b726cd
--- /dev/null
+++ b/myenv/bin/pip3.10
@@ -0,0 +1,8 @@
+#!/Users/sahithikasim/guac-alytics/myenv/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+from pip._internal.cli.main import main
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(main())
diff --git a/myenv/bin/python b/myenv/bin/python
new file mode 120000
index 0000000..b8a0adb
--- /dev/null
+++ b/myenv/bin/python
@@ -0,0 +1 @@
+python3
\ No newline at end of file
diff --git a/myenv/bin/python3 b/myenv/bin/python3
new file mode 120000
index 0000000..1ec499c
--- /dev/null
+++ b/myenv/bin/python3
@@ -0,0 +1 @@
+/Library/Frameworks/Python.framework/Versions/3.10/bin/python3
\ No newline at end of file
diff --git a/myenv/bin/python3.10 b/myenv/bin/python3.10
new file mode 120000
index 0000000..b8a0adb
--- /dev/null
+++ b/myenv/bin/python3.10
@@ -0,0 +1 @@
+python3
\ No newline at end of file
diff --git a/myenv/pyvenv.cfg b/myenv/pyvenv.cfg
new file mode 100644
index 0000000..31f7c69
--- /dev/null
+++ b/myenv/pyvenv.cfg
@@ -0,0 +1,3 @@
+home = /Library/Frameworks/Python.framework/Versions/3.10/bin
+include-system-site-packages = false
+version = 3.10.10
diff --git a/scripts/buildinfo_main.py b/scripts/buildinfo_main.py
index feab54e..3eaf09f 100644
--- a/scripts/buildinfo_main.py
+++ b/scripts/buildinfo_main.py
@@ -2,8 +2,8 @@
 import headerparser
 import os
 import re
-from calendar import monthrange
-from database.buildinfo_db_init import open_db, init_db, close_db, insert_build
+from ingestion.database.buildinfo_db_init import open_db, init_db, close_db, insert_build
+from ingestion.constants import LOC,DB_LOC
 import progressbar
 from dateutil.parser import parse as du_parse
@@ -173,9 +173,7 @@ def populate_db(location, db_location):
 if __name__ == "__main__":
-    location = '/data/yellow/vineet/raw_data/buildinfo_data'
-    db_location = '/data/yellow/vineet/database/bi_multi_tables.db'
-    init_db(db_location) # Initializing the database
-    populate_db(location, db_location) # Populating the data
+    init_db(DB_LOC) # Initializing the database
+    populate_db(LOC, DB_LOC) # Populating the data
     t_out = time.time()
     print('Program run time in seconds:', t_out - t_in, '(s)')
diff --git a/scripts/ingestion/constants.py b/scripts/ingestion/constants.py
index a25d068..9891e1f 100644
--- a/scripts/ingestion/constants.py
+++ b/scripts/ingestion/constants.py
@@ -1,11 +1,17 @@
 #!/usr/bin/env python
 # encoding: utf-8
+import os
-DB_LOC = '/data/yellow/vineet/database/bi_multi_tables.db'
-MAINTAINER_INST_LOC = "https://popcon.debian.org/maint/by_inst"
-MAINTAINER_TEXT_FILE = '/data/yellow/vineet/python_files/new_scripts/database_creation/maintainer.txt'
-MAINTAINER_CSV_FILE = '/data/yellow/vineet/python_files/new_scripts/database_creation/maintainer.csv'
-REGEX = "(\ )+"
-INST_LOC = 'https://popcon.debian.org/by_inst'
-POPCON_CSV = '/data/yellow/vineet/python_files/new_scripts/data_pre_processing/today.csv'
-POPCON_TEXT = '/data/yellow/vineet/python_files/new_scripts/data_pre_processing/today.txt'
\ No newline at end of file
+# Check if the environment variables exist, otherwise use defaults
+LOC = os.getenv('LOC', './buildinfo_data/')
+DB_LOC = os.getenv('DB_LOC', './bi_multi_tables.db')
+MAINTAINER_INST_LOC = os.getenv('MAINTAINER_INST_LOC', "https://popcon.debian.org/maint/by_inst")
+MAINTAINER_TEXT_FILE = os.getenv('MAINTAINER_TEXT_FILE', 'maintainer.txt')
+MAINTAINER_CSV_FILE = os.getenv('MAINTAINER_CSV_FILE', 'maintainer.csv')
+REGEX = os.getenv('REGEX', r"(\ )+")  # raw string: "\ " is an invalid escape in a normal literal
+INST_LOC = os.getenv('INST_LOC', 'https://popcon.debian.org/by_inst')
+POPCON_CSV = os.getenv('POPCON_CSV', 'today.csv')
+POPCON_TEXT = os.getenv('POPCON_TEXT', 'today.txt')
+POPCON_DATA = os.getenv('POPCON_DATA', './popcon/{}/{}/{}')
+POPCON = os.getenv('POPCON', 'https://popcon.debian.org')
+BUILDINFO = os.getenv('BUILDINFO', 'https://buildinfos.debian.net/ftp-master.debian.org/buildinfo/{}/{}/{}/')
diff --git a/scripts/ingestion/database/buildinfo_db_init.py b/scripts/ingestion/database/buildinfo_db_init.py
index 2181c2c..dbb6973 100644
--- a/scripts/ingestion/database/buildinfo_db_init.py
+++ b/scripts/ingestion/database/buildinfo_db_init.py
@@ -1,9 +1,14 @@
 #!/usr/bin/env python
 import sqlite3
-
-LOCATION = '/data/yellow/vineet/database/bi_multi_tables.db'
-
-def init_db(location = LOCATION):
+import sys
+import os
+# Get the absolute path of the parent directory
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+# Add the parent directory to the Python path
+sys.path.append(parent_dir)
+from constants import DB_LOC
+
+def init_db(location = DB_LOC):
     conn = sqlite3.connect(location)
     cur = conn.cursor()
@@ -32,7 +37,7 @@ def init_db(location = LOCATION):
         version varchar,
         architecture varchar)""")
-    cur.execute("""CREATE TABLE IF NOT EXISTS dependency_table (
+    cur.execute("""CREATE TABLE IF NOT EXISTS dependency_table(
         buildinfo_id INTEGER,
         binary_id INTEGER,
         FOREIGN KEY(buildinfo_id) references buildinfo_table(buildinfo_id),
@@ -51,7 +56,7 @@ def init_db(location = LOCATION):
     conn.commit()
     conn.close()
-def open_db(location = LOCATION):
+def open_db(location = DB_LOC):
     conn = sqlite3.connect(location)
     return conn
@@ -68,41 +73,56 @@ def close_db(conn = None):
 def insert_build(cur, result, build_time, deps,output):
     # Inserting data into source_table
-
-    cur.execute(INSERT_SOURCE, (result['Source'], result['Version']))
-
-    # see https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.lastrowid
-    # (warning, it's nonportable or threadsafe)
-    cur.execute('''select source_id from source_table where source_name='{}' and version='{}' '''.format(result['Source'],result['Version']))
-    id1 = cur.lastrowid
-
-    # inserting data into buildinfo_table
-    cur.execute(INSERT_BUILD,
-                (id1,
-                result['Architecture'], result['Build-Origin'], result['Build-Architecture'],
-                build_time, result['Build-Path'], result['Environment']))
-
-    cur.execute('''select buildinfo_id from buildinfo_table where type='{}' and source_id='{}' '''.format(result['Architecture'],id1))
-    # build_id=cur.fetchone()[0]
+    """Insert one parsed buildinfo record and the rows that depend on it.
+
+    Nothing is inserted when ``result['Source']`` is None. No commit is
+    issued here; every statement runs on the cursor that is passed in.
+
+    Args:
+        cur: sqlite3 cursor to execute the statements on.
+        result: mapping read with the keys 'Source', 'Version',
+            'Architecture', 'Build-Origin', 'Build-Architecture',
+            'Build-Path' and 'Environment'.
+        build_time: value passed to INSERT_BUILD together with the fields
+            above.
+        deps: iterable of (package, name, version) tuples; may be falsy.
+        output: iterable of (binary, md5, sha1, sha256) tuples; may be falsy.
+    """
+    if result['Source'] is not None:
+        cur.execute(INSERT_SOURCE, (result['Source'], result['Version']))
+
+        # Look the id up by value, binding the values as parameters: building
+        # the SQL with str.format() broke on names containing a quote and
+        # allowed SQL injection.
+        cur.execute('''select source_id from source_table where source_name=? and version=?''',
+                    (result['Source'], result['Version']))
+        id1 = cur.fetchone()[0]
+        # inserting data into buildinfo_table
+        cur.execute(INSERT_BUILD,
+                    (id1,
+                    result['Architecture'], result['Build-Origin'], result['Build-Architecture'],
+                    build_time, result['Build-Path'], result['Environment']))
+
+        cur.execute('''select buildinfo_id from buildinfo_table where type=? and source_id=?''',
+                    (result['Architecture'], id1))
+        build_id = cur.fetchone()[0]
+
+        # we insert the binaries we just created
+        binary_ids = []
+        if output:
+            for binary,md5,sha1,sha256 in output:
+                cur.execute(INSERT_BINARY, (binary, result['Version'], result['Architecture']))
+                binary_id = cur.lastrowid
+                cur.execute(INSERT_OUTPUT, (build_id, binary_id, str(md5), str(sha1), str(sha256)))
+                binary_ids.append(binary_id)
+
+        # we get the ids as we update the binary table with the dependencies
+        # inserting dependencies into binary table
+        if deps:
+            for package, name, version in deps:
+                cur.execute(INSERT_BINARY, (name, version, result['Architecture']))
+                binary_ids.append(cur.lastrowid)
 
-    build_id = cur.lastrowid
-
-    # we insert the binaries we just created
-    binary_ids = []
-    if output:
-        for binary,md5,sha1,sha256 in output:
-            cur.execute(INSERT_BINARY, (binary, result['Version'], result['Architecture']))
-            binary_id = cur.lastrowid
-            cur.execute(INSERT_OUTPUT, (build_id, binary_id, str(md5), str(sha1), str(sha256)))
-            binary_ids.append(binary_id)
-
-
-    # we get the ids as we update the binary table with the dependencies
-    # inserting dependencies into binary table
-    if deps:
-        for package, name, version in deps:
-            cur.execute(INSERT_BINARY, (name, version, result['Architecture']))
-            binary_ids.append(cur.lastrowid)
-
-    for binary_id in binary_ids:
-        cur.execute(INSERT_DEPENDENCY, (build_id, binary_id))
+        for binary_id in binary_ids:
+            cur.execute(INSERT_DEPENDENCY, (build_id, binary_id))
+
diff --git a/scripts/ingestion/database/maintainer_db_init.py b/scripts/ingestion/database/maintainer_db_init.py
index ee42749..d1ab326 100644
--- a/scripts/ingestion/database/maintainer_db_init.py
+++ b/scripts/ingestion/database/maintainer_db_init.py
@@ -1,12 +1,17 @@
 #!/usr/bin/env python
-import sqlite3
+import sys
+import os
+# Get the absolute path of the parent directory
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+# Add the parent directory to the Python path
+sys.path.append(parent_dir)
 from constants import DB_LOC
+import sqlite3
-conn = sqlite3.connect(DB_LOC)
-cur = conn.cursor()
-
-def db_init():
-    cur.execute("""CREATE TABLE IF NOT EXISTS maintainer(
+def db_init(location=DB_LOC):
+    conn = sqlite3.connect(location)
+    cursor = conn.cursor()
+    cursor.execute("""CREATE TABLE IF NOT EXISTS maintainer(
         name text primary key,
         inst integer,
         vote integer,
@@ -14,4 +19,4 @@ def db_init():
         recent integer,
         no_files integer
         )""")
-
+    return conn,cursor
diff --git a/scripts/ingestion/database/popcon_db_init.py
b/scripts/ingestion/database/popcon_db_init.py
index 8fc3feb..ded1ed4 100644
--- a/scripts/ingestion/database/popcon_db_init.py
+++ b/scripts/ingestion/database/popcon_db_init.py
@@ -1,11 +1,16 @@
 #!/usr/bin/env python
 import sqlite3
+import sys
+import os
+# Get the absolute path of the parent directory
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+# Add the parent directory to the Python path
+sys.path.append(parent_dir)
 from constants import DB_LOC
-conn = sqlite3.connect(DB_LOC)
-cursor = conn.cursor()
-
-def db_init():
+def db_init(location=DB_LOC):
+    conn = sqlite3.connect(location)
+    cursor = conn.cursor()
     cursor.execute("""CREATE TABLE IF NOT EXISTS popularity_table(
         name text primary key,
         date date,
@@ -18,3 +23,5 @@ def db_init():
         inst_norm varchar,
         vote_norm varchar
         )""")
+
+    return conn,cursor
\ No newline at end of file
diff --git a/scripts/ingestion/parsers/download_buildinfo.py b/scripts/ingestion/parsers/download_buildinfo.py
new file mode 100644
index 0000000..cb16912
--- /dev/null
+++ b/scripts/ingestion/parsers/download_buildinfo.py
@@ -0,0 +1,52 @@
+import calendar
+import os
+from bs4 import BeautifulSoup as bs
+import requests
+import time
+import sys
+# Get the absolute path of the parent directory
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+# Add the parent directory to the Python path
+sys.path.append(parent_dir)
+from constants import LOC, BUILDINFO
+
+# Download every .buildinfo file linked from the per-day index pages under
+# BUILDINFO into LOC/<year>/<month>/<day>/.
+years = ['2017','2018','2019' ,'2020', '2021', '2022', '2023'] # Add more years if needed
+months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
+
+for year in years:
+    for month in months:
+        # monthrange() knows about leap years; a fixed 28 skipped 29 Feb 2020.
+        days_in_month = calendar.monthrange(int(year), int(month))[1]
+        for day in range(1, days_in_month + 1):
+            day_str = str(day).zfill(2) # Add leading zero for single-digit days
+            directory_path = os.path.join(LOC, str(year), str(month), day_str)
+            os.makedirs(directory_path, exist_ok=True)
+            time.sleep(5)
+
+            url = BUILDINFO.format(year,month,day_str)
+            page = requests.get(url)
+            soup = bs(page.content,"html.parser")
+
+            # href=True skips anchors without an href, which would otherwise
+            # yield None and make the substring test raise TypeError.
+            all_links = [a['href'] for a in soup.find_all('a', href=True)
+                         if '.buildinfo' in a['href']]
+
+            for j in all_links:
+                page = requests.get(url + j)
+                # NOTE(review): round-tripping plain text through BeautifulSoup
+                # re-serialises it as HTML (e.g. '&' becomes '&amp;') - confirm
+                # this is intended; page.text would keep the file verbatim.
+                data = str(bs(page.content,"html.parser"))
+
+                # Keep only the second blank-line-separated section.
+                if '\n\n' in data:
+                    data = data.split('\n\n')[1]
+                data = data.replace('-----BEGIN PGP SIGNATURE-----','')
+
+                # Write via the full path instead of os.chdir(), so the
+                # process working directory is never changed.
+                with open(os.path.join(directory_path, j), 'w') as g:
+                    g.write(data)
diff --git a/scripts/ingestion/parsers/maintainer_parser.py b/scripts/ingestion/parsers/maintainer_parser.py
index 932c782..95007cf 100644
--- a/scripts/ingestion/parsers/maintainer_parser.py
+++ b/scripts/ingestion/parsers/maintainer_parser.py
@@ -25,18 +25,18 @@ def parse_line(line):
     return (rank, name, inst, vote, old, recent, no_files)
 # Define the main parser function
-def parser():
+def parser(textfile = MAINTAINER_TEXT_FILE):
     # Check if the file exists locally, if not, download it
-    if os.path.exists(MAINTAINER_TEXT_FILE):
+    if os.path.exists(textfile):
         print("Using local copy of the file.")
     else:
         print("Downloading file.")
         response = requests.get(MAINTAINER_INST_LOC)
-        with open(MAINTAINER_TEXT_FILE, "wt") as fp:
+        with open(textfile, "wt") as fp:
             fp.write(response.text)
     # Read the file and convert it to a CSV format
-    with open(MAINTAINER_TEXT_FILE, "rt") as fp, open(MAINTAINER_CSV_FILE,'w', encoding="utf-8",newline='') as fdout:
+    with open(textfile, "rt") as fp, open(MAINTAINER_CSV_FILE,'w', encoding="utf-8",newline='') as fdout:
         # Initialize the CSV writer and write the header row
         wr = csv.DictWriter(fdout, fieldnames=['rank','name', 'inst', 'vote', 'old', 'recent', 'no_files'], extrasaction='ignore')
         wr.writeheader()
@@ -44,9 +44,14 @@ def parser():
         # Loop through the lines in the file and write them to the CSV file
         for line in fp:
-            # Skip empty lines and lines starting with #
-            if not line.strip() or line.startswith("#"):
+            # Skip empty lines, '#' comments, the dashed separator and the Total row
+            if not line.strip() or line.startswith("#") or line.startswith("--------") or line.startswith("Total"):
                 continue
             # Parse the line and write it to the CSV file
             parsed_line = parse_line(line)
-            wr.writerow(dict(zip(['rank','name', 'inst', 'vote', 'old', 'recent', 'no_files'], parsed_line)))
\ No newline at end of file
+            wr.writerow(dict(zip(['rank','name', 'inst', 'vote', 'old', 'recent', 'no_files'], parsed_line)))
+
+    # fp and fdout were already closed when the with block above exited.
+    os.remove(textfile)
+    # The CSV is returned as an open read handle; the caller must close it.
+    return open(MAINTAINER_CSV_FILE, "r")
\ No newline at end of file
diff --git a/scripts/ingestion/parsers/popcon_compare.py b/scripts/ingestion/parsers/popcon_compare.py
index a0dfc0a..1d61eff 100644
--- a/scripts/ingestion/parsers/popcon_compare.py
+++ b/scripts/ingestion/parsers/popcon_compare.py
@@ -6,15 +6,18 @@
 import shutil
 from bs4 import BeautifulSoup as bs
 from datetime import datetime,date
-from constants import POPCON_CSV,POPCON_TEXT,INST_LOC,DB_LOC,REGEX
-from database.popcon_db_init import target_con,conn,cursor
+import sys
+import os
+# Get the absolute path of the parent directory
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+# Add the parent directory to the Python path
+sys.path.append(parent_dir)
+from constants import POPCON_CSV,POPCON_TEXT,INST_LOC,DB_LOC,REGEX,POPCON_DATA,POPCON
+from database.popcon_db_init import db_init
 x=datetime.now()
 print('On ', datetime.strftime(x,"%m/%d/%Y"))
-#Connecting to database
-target_con(DB_LOC)
-
 def parse_line(line):
     # Header format
     # rank name inst vote old recent no-files (maintainer)
@@ -38,8 +41,7 @@ def parse_line(line):
     return (rank, name, inst, vote, old, recent, no_files, maintainer)
 def parser():
-    url1 = 'https://popcon.debian.org'
-    page1 = requests.get(url1)
+    page1 = requests.get(POPCON)
     data1 = bs(page1.content,"html.parser")
     value = data1.find('td',class_='stats-cell')
     d = str(value)
@@ -75,7 +77,7 @@ def parser():
         o=csv.writer(fdout)
         data = data.split("\n")
         for line in data[:len(data)-3]:
-            if len(line) < 1 or line[0]=='#':
+            if len(line) < 1 or line[0]=='#' or line.startswith("--------") or line.startswith("Total"):
                 continue
             line = parse_line(line)
             o.writerow(line)
@@ -88,9 +90,10 @@ def parser():
     year=date.today().year
     mon=date.today().month
     day=date.today().day
-
-    # Saving data of each day
-    shutil.copyfile(POPCON_CSV,'/data/yellow/vineet/raw_data/popularity_contest/{}/{}/{}'.format(year,mon,day))
+    source_data = POPCON_DATA.format(year, mon,day)
+    os.makedirs(source_data, exist_ok=True)
+    destination_file = os.path.join(source_data, 'data.csv')
+    shutil.copyfile(POPCON_CSV, destination_file)
     # Comparing the data from csv file and existing table and updating its values
     with open(POPCON_CSV, 'r',encoding= 'unicode_escape') as file:
@@ -122,5 +125,7 @@ def parser():
     file.close()
     os.remove(POPCON_CSV)
     conn.close()
-
-parser()
+
+if __name__ == "__main__":
+    conn,cursor = db_init(DB_LOC)
+    parser()
diff --git a/scripts/ingestion/parsers/popcon_parser.py b/scripts/ingestion/parsers/popcon_parser.py
index 194b127..c16ebc8 100644
--- a/scripts/ingestion/parsers/popcon_parser.py
+++ b/scripts/ingestion/parsers/popcon_parser.py
@@ -29,30 +29,33 @@ def parse_line(line):
     return (rank, name, inst, vote, old, recent, no_files, maintainer)
-def parser():
+def parser(textfile=POPCON_TEXT):
     # Check if local copy of file exists, otherwise download
-    if os.path.exists(POPCON_TEXT):
+    if os.path.exists(textfile):
         print("using local copy of the file")
     else:
         print("downloading file")
         response = requests.get(INST_LOC)
-        with open(POPCON_TEXT, "wt") as fp:
+        with open(textfile, "wt") as fp:
             fp.write(response.text)
-    with open(POPCON_TEXT, "rt") as fp:
+    with open(textfile, "rt") as fp:
         data = fp.read()
     with open(POPCON_CSV,'w', encoding="utf-8",newline='') as fdout:
-        wr = csv.DictWriter(fdout, fieldnames=['sno','Name', 'inst', 'vote', 'old', 'recent', 'no_files', 'maintainer'], extrasaction='ignore') # ignore unwanted fields
-        o=csv.writer(fdout)
+        wr=csv.writer(fdout)
         data = data.split("\n")
         for line in data[:len(data)-3]:
             # Skip empty or commented lines
-            if len(line) < 1 or line[0]=='#':
+            if len(line) < 1 or line[0]=='#' or line.startswith("--------") or line.startswith("Total"):
                 continue
             line = parse_line(line)
-            o.writerow(line)
+            wr.writerow(line)
+    # fp and fdout were already closed when their with blocks exited.
+    os.remove(textfile)
+    # The CSV is returned as an open read handle; the caller must close it.
+    return open(POPCON_CSV, "r")
diff --git a/scripts/maintainer_main.py b/scripts/maintainer_main.py
index 77b7164..a5b6156 100644
--- a/scripts/maintainer_main.py
+++ b/scripts/maintainer_main.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python
 import csv
-from constants import MAINTAINER_CSV_FILE
-from database.maintainer_db_init import db_init,conn,cur
-from parsers.maintainer_parser import parser
+import os
+from ingestion.constants import DB_LOC, MAINTAINER_TEXT_FILE
+from ingestion.database.maintainer_db_init import db_init
+from ingestion.parsers.maintainer_parser import parser
-def init():
+def init(maintainer_file):
     """ Inserting new values into the table """
-    with open(MAINTAINER_CSV_FILE, 'r',encoding= 'unicode_escape') as file:
+    with open(maintainer_file, 'r',encoding= 'unicode_escape') as file:
         data = csv.reader(file,delimiter=',')
         no_records = 0
         for row in data:
@@ -16,16 +17,17 @@ def init():
             o=row[4]
             r=row[5]
             no=row[6]
-            cur.execute('''INSERT OR REPLACE INTO maintainer VALUES (?,?,?,?,?,?)''',(n,i,v,o,r,no))
+            cursor.execute('''INSERT OR REPLACE INTO maintainer VALUES (?,?,?,?,?,?)''',(n,i,v,o,r,no))
             conn.commit()
             no_records += 1
         print(no_records, ' checked')
         print('Closing file')
+    conn.close()
 if __name__ == "__main__":
-    db_init() # Initialize the database
-    parser() # Parse the data
-    init() # Inserts the records into table
-
-conn.close()
+    with parser(MAINTAINER_TEXT_FILE) as parsed: # Parse the data; closes the handle parser() returns
+        maintainer_file = parsed.name
+    conn,cursor = db_init(DB_LOC) # Initialize the database
+    init(maintainer_file) # Inserts the records into table
+    os.remove(maintainer_file)
diff --git a/scripts/popcon_main.py b/scripts/popcon_main.py
index 6b9fbe5..3310038 100644
--- a/scripts/popcon_main.py
+++ b/scripts/popcon_main.py
@@ -1,15 +1,15 @@
 import csv
+import os
 from datetime import datetime
-import sqlite3
-from database.popcon_db_init import db_init,conn,cursor
-from constants import POPCON_CSV
-from parsers.popcon_parser import parser
+from ingestion.database.popcon_db_init import db_init
+from ingestion.constants import DB_LOC,POPCON_TEXT
+from ingestion.parsers.popcon_parser import parser
-no_records = 0
-def popcon():
-    with open(POPCON_CSV, 'r',encoding= 'unicode_escape') as file:
+def popcon(popcon_file):
+    with open(popcon_file, 'r',encoding= 'unicode_escape') as file:
         data = csv.reader(file,delimiter=',')
         x=datetime.now()
+        no_records=0
         for row in data:
             name=row[1]
             date=x
@@ -23,10 +23,13 @@ def popcon():
             conn.commit()
             no_records += 1
+    conn.close()
+    print ('\n{} records transferred'.format(no_records))
+
 if __name__ == "__main__":
-    db_init()
-    parser()
-    popcon()
+    with parser(POPCON_TEXT) as parsed: # Parse the data; closes the handle parser() returns
+        popcon_file = parsed.name
+    conn,cursor = db_init(DB_LOC) # Initialize the database
+    popcon(popcon_file) # Inserts the records into table
+    os.remove(popcon_file)
-conn.close()
-print ('\n{} records transferred'.format(no_records))