diff --git a/notebook-example/three/.gitignore b/notebook-example/three/.gitignore
new file mode 100644
index 0000000..eb35bfe
--- /dev/null
+++ b/notebook-example/three/.gitignore
@@ -0,0 +1,2 @@
+.venv
+dist/
diff --git a/notebook-example/three/build.sh b/notebook-example/three/build.sh
new file mode 100755
index 0000000..f0ccc0c
--- /dev/null
+++ b/notebook-example/three/build.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+set -e
+
+DIST=dist
+FILES=notebooks
+
+# https://stackoverflow.com/a/76208774
+
+# We don't want anything lingering from a previous build.
+# Environments affect the build, so use the one
+# we create here... for the build.
+# conda deactivate >/dev/null 2>&1
+# deactivate >/dev/null 2>&1
+
+echo "Removing venv"
+rm -rf .venv
+sleep 2
+echo "Creating clean venv"
+python3 -m venv .venv
+source .venv/bin/activate
+
+if [[ "$VIRTUAL_ENV" != "" ]]
+then
+    INVENV=1
+    echo "In environment $VIRTUAL_ENV"
+    sleep 2
+else
+    INVENV=0
+fi
+
+pip install --upgrade pip
+pip install --no-cache-dir -r requirements.txt
+
+echo "Cleaning up for install"
+rm -f *.db
+rm -rf $DIST
+rm -rf $FILES/__pycache__
+
+sleep 1
+
+echo "Launching build"
+sleep 1
+jupyter lite init
+jupyter lite build
+
+if [[ $RUN = 1 ]]; then
+    pushd $DIST
+    python -m http.server 8080
+    popd
+fi
\ No newline at end of file
diff --git a/notebook-example/three/jupyter_lite_config.json b/notebook-example/three/jupyter_lite_config.json
new file mode 100644
index 0000000..52846e5
--- /dev/null
+++ b/notebook-example/three/jupyter_lite_config.json
@@ -0,0 +1,6 @@
+{
+  "LiteBuildConfig": {
+    "contents": ["notebooks"],
+    "output_dir": "dist"
+  }
+}
\ No newline at end of file
diff --git a/notebook-example/three/notebooks/README.md b/notebook-example/three/notebooks/README.md
new file mode 100644
index 0000000..565cbc8
--- /dev/null
+++ b/notebook-example/three/notebooks/README.md
@@ -0,0 +1 @@
+Bye.
\ No newline at end of file
diff --git a/notebook-example/three/notebooks/api_key.py b/notebook-example/three/notebooks/api_key.py
new file mode 100644
index 0000000..4b7ca60
--- /dev/null
+++ b/notebook-example/three/notebooks/api_key.py
@@ -0,0 +1,12 @@
+##############################################
+# To obtain a key for use with the FAC:
+#
+# 1. Visit https://api.data.gov/signup/
+# 2. Enter your name and email address.
+# 3. You will receive a key by email.
+# 4. Copy that key.
+# 5. Paste your key between the quotes below,
+#    being careful not to add spaces before or
+#    after the key.
+
+FAC_API_KEY = ""
diff --git a/notebook-example/three/notebooks/fac.py b/notebook-example/three/notebooks/fac.py
new file mode 100644
index 0000000..eec4ef9
--- /dev/null
+++ b/notebook-example/three/notebooks/fac.py
@@ -0,0 +1,19 @@
+import os
+import pyodide_http
+import requests
+pyodide_http.patch_all()
+
+from api_key import FAC_API_KEY
+
+##############################################
+# This overrides the FAC_API_KEY variable
+# so that we can test the notebooks locally.
+# It has no effect in the WWW environment.
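+#
+# (Hypothetical local example: launching with
+#   FAC_API_KEY=abc123 jupyter lab
+# makes the override below take effect.)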
+if os.getenv("FAC_API_KEY") not in [None, ""]:
+    FAC_API_KEY = os.getenv("FAC_API_KEY")
+
+def get(endpoint, params=None):
+    # Build the default inside the function; a mutable
+    # default argument would be shared across calls.
+    params = params if params is not None else {"limit": 1}
+    return requests.get(f"https://api.fac.gov/{endpoint}",
+                        params=params)
\ No newline at end of file
diff --git a/notebook-example/three/notebooks/files/treasury.alns b/notebook-example/three/notebooks/files/treasury.alns
new file mode 100644
index 0000000..a7e30e0
--- /dev/null
+++ b/notebook-example/three/notebooks/files/treasury.alns
@@ -0,0 +1,23 @@
+21.003
+21.004
+21.006
+21.008
+21.009
+21.010
+21.011
+21.012
+21.014
+21.015
+21.016
+21.017
+21.018
+21.019
+21.020
+21.021
+21.023
+21.024
+21.027
+21.028
+21.029
+21.031
+21.032
\ No newline at end of file
diff --git a/notebook-example/three/notebooks/findings_by_aln.ipynb b/notebook-example/three/notebooks/findings_by_aln.ipynb
new file mode 100644
index 0000000..29d28b1
--- /dev/null
+++ b/notebook-example/three/notebooks/findings_by_aln.ipynb
@@ -0,0 +1,43 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sqlite3 ships with Python, so it is not pip-installed here.\n",
+    "%pip install pyodide-http requests peewee openpyxl pandas\n",
+    "from libraries import findings_by_aln as fba\n",
+    "from fac import FAC_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Findings per ALN\n",
+    "\n",
+    "There was a 5-day period when search was not available. During that time, we published Excel workbooks that gave Federal users a set of spreadsheets tracking findings on a per-ALN basis."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the API key for the library\n",
+    "fba.set_api_key(FAC_API_KEY)\n",
+    "# Changing the date generates a different workbook in the `xlsx` directory\n",
+    "fba.findings_by_aln(\"2024-06-17\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebook-example/three/notebooks/libraries/__init__.py b/notebook-example/three/notebooks/libraries/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/notebook-example/three/notebooks/libraries/aln.py b/notebook-example/three/notebooks/libraries/aln.py
new file mode 100644
index 0000000..32444dd
--- /dev/null
+++ b/notebook-example/three/notebooks/libraries/aln.py
@@ -0,0 +1,133 @@
+import re
+import logging
+
+class ALN:
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger("accuracy_alns")
+
+    def __init__(self, agency, program=None):
+        self.agency = agency
+        self.program = program
+
+    def __repr__(self):
+        if self.program:
+            return f"{self.agency}.{self.program}"
+        else:
+            return f"{self.agency}"
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __eq__(self, other):
+        return (self.agency == other.agency
+                and self.program == other.program)
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def streq(self, string_aln):
+        parts = string_aln.split(".")
+        return (self.agency == parts[0]
+                and self.program == parts[1])
+
+    def is_valid(self):
+        return (
+            self.is_all_numeric()
+            or self.is_u_program()
+            or self.is_rd_program()
+            or self.is_gsa_migration()
+            or self.is_alpha_program()
+        )
+
+    def category(self):
+        if self.is_all_numeric():
+            return "NUMERIC"
+        elif self.is_u_program():
+            return "U"
+        elif self.is_rd_program():
+            return "RD"
+        elif self.is_alpha_program():
+            return "ALPHA"
+        elif self.is_gsa_migration():
+            return "GSA"
+        else:
+            ALN.logger.warning(f"UNK ALN: {self.agency} {self.program}")
+            return "UNK"
+
+    def is_numeric_agency(self):
+        try:
+            int(self.agency)
+            return True
+        except (TypeError, ValueError):
+            return False
+
+    def is_numeric_program(self):
+        try:
+            int(self.program)
+            return True
+        except (TypeError, ValueError):
+            return False
+
+    def is_all_numeric(self):
+        return self.is_numeric_agency() and self.is_numeric_program()
+
+    def is_u_program(self):
+        return self.is_numeric_agency() and bool(re.match(r"^U[0-9]{2}$", self.program))
+
+    def is_rd_program(self):
+        return self.is_numeric_agency() and bool(re.match(r"^RD([0-9]{1})?$", self.program))
+
+    def is_gsa_migration(self):
+        return self.is_numeric_agency() and bool(re.match(r"^GSA_MIGRATION$", self.program))
+
+    def is_alpha_program(self):
+        return (
+            self.is_numeric_agency()
+            and bool(re.match("^[0-9]{2}$", self.agency))
+            and bool(re.match("^[0-9]{3}([A-Z])?$", self.program)))
+
+######################################
+# TESTS
+######################################
+
+def test_is_numeric_agency():
+    assert ALN("10", "ABC").is_numeric_agency() is True
+    assert ALN("AB", "ABC").is_numeric_agency() is False
+
+numeric_programs_valid = [
+    ALN("10", "123"),
+    ALN("10", "000")
+]
+numeric_programs_invalid = [
+    ALN("10", "ABC"),
+    ALN("AB", "ABC")
+]
+def test_is_numeric_program():
+    for aln in numeric_programs_valid:
+        assert aln.is_numeric_program() is True
+    for aln in numeric_programs_invalid:
+        assert aln.is_numeric_program() is False
+
+rd_alns_valid = [
+    ALN("93", "RD"),
+    ALN("93", "RD1")
+]
+def test_is_rd_program():
+    for aln in rd_alns_valid:
+        assert aln.is_rd_program() is True
+
+def test_validity():
+    for aln in rd_alns_valid:
+        assert aln.is_valid() is True
+    for aln in numeric_programs_valid:
+        assert aln.is_valid() is True
+    for aln in numeric_programs_invalid:
+        assert aln.is_valid() is False
+    assert ALN("11", "123").is_valid() is True
+    assert ALN("92", "RD1").is_valid() is True
+    assert ALN("92", "RD").is_valid() is True
+    assert ALN("92", "RDX").is_valid() is False
+    assert ALN("84", "483A").is_valid() is True
+    assert ALN("84", "483AB").is_valid() is False
+    assert ALN("84", "48A").is_valid() is False
+    assert ALN("21", "U23").is_valid() is True
+    assert ALN("45", "GSA_MIGRATION").is_valid() is True
diff --git a/notebook-example/three/notebooks/libraries/findings_by_aln.py b/notebook-example/three/notebooks/libraries/findings_by_aln.py
new file mode 100644
index 0000000..0408b8a
--- /dev/null
+++ b/notebook-example/three/notebooks/libraries/findings_by_aln.py
@@ -0,0 +1,394 @@
+import time
+from openpyxl import Workbook
+from types import SimpleNamespace
+from openpyxl.styles import PatternFill
+from playhouse.shortcuts import model_to_dict
+# from rich.table import Table
+# from rich.console import Console
+# from rich import print
+
+
+from libraries.findings_models import (
+    DailyGenerals,
+    DailyFindings,
+    DailyMetadata,
+    get_unique_agency_numbers,
+    get_unique_cog_overs,
+    setup_database
+)
+
+import libraries.findings_util as findings_util
+
+from libraries.findings_util import (
+    op,
+    string_to_datetime,
+    fetch_from_api,
+    today,
+    get_query_count,
+    rm,
+    path_based_on_ext,
+    adjust_columns,
+    cog_over,
+    convert_bools,
+)
+
+from libraries.findings_const import (
+    FAC_API_BASE
+)
+
+import logging
+logger = logging.getLogger(__name__)
+
+# https://stackoverflow.com/questions/17755996/how-to-make-a-list-as-the-default-value-for-a-dictionary
+
+# A result is a single award for a single day.
+
+def set_api_key(key):
+    findings_util.set_api_key(key)
+
+
+findings_fields_to_keep = set([
+    "report_id",
+    "auditee_name",
+    "auditee_uei",
+    "cog_over",
+    "award_reference",
+    "reference_number",
+    "is_modified_opinion",
+    "is_other_matters",
+    "is_material_weakness",
+    "is_significant_deficiency",
+    "is_other_findings",
+    "is_questioned_costs",
+    "is_repeat_finding",
+    "prior_finding_ref_numbers",
+])
+
+awards_fields_to_keep = set([
+    "report_id",
+    "reference_number",
+    "award_reference",
+    "auditee_name",
+    "aln",
+    "cog_over",
+    "federal_program_name",
+    "amount_expended",
+    "is_direct",
+    "is_major",
+    "is_passthrough_award",
+    "passthrough_amount",
+])
+
+yes_fill = PatternFill(start_color="FFD700",
+                       end_color="FFD700", fill_type="solid")
+
+
+class QParam():
+    def __init__(self, date):
+        self.date = date
+
+
+class Result():
+    def __init__(self, d):
+        self.data = SimpleNamespace(**d)
+
+    def add(self, key, value):
+        # SimpleNamespace does not support item assignment;
+        # set the attribute instead.
+        setattr(self.data, key, value)
+
+    def __str__(self):
+        return f"{self.data.report_id}"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class FAC():
+    # Takes a list of parameter objects.
+    def __init__(self, params):
+        # TODO: Remove any dates we have already run and
+        # cached locally.
+        self.params = params
+        self.results = []
+
+    # Fetch the general results.
+    # Must start here.
+    def general(self, report_id=None):
+        po: QParam
+
+        for po in self.params:
+            payload = {
+                "fac_accepted_date": op("eq", po.date),
+                "select": ",".join([
+                    "report_id",
+                    "auditee_name",
+                    "cognizant_agency",
+                    "oversight_agency",
+                    "auditee_uei",
+                ])
+            }
+
+            jres = fetch_from_api("general", payload)
+
+            for res in jres:
+                if DailyGenerals.select().where(DailyGenerals.report_id == res["report_id"]):
+                    logger.debug(f"Skipping {res['report_id']}")
+                else:
+                    d = {"report_id": res["report_id"],
+                         "date": po.date,
+                         "auditee_name": res["auditee_name"],
+                         "cog_over": cog_over(res["cognizant_agency"], res["oversight_agency"]),
+                         "auditee_uei": res["auditee_uei"],
+                         }
+                    self.results.append(DailyGenerals.create(**d))
+
+    # Now, populate with the findings. This tells us which we need, and
+    # which to remove.
+    def findings(self, report_id=None):
+        print("FINDINGS")
+        # console = Console()
+        # We should only do things where we have not fetched.
+        if report_id:
+            gq = DailyGenerals.select().where(DailyGenerals.report_id == report_id)
+        else:
+            gq = DailyGenerals.select().where(DailyGenerals.findings_count.is_null())
+        for dg in gq:
+            print(f"\tfindings {dg.report_id}")
+            jres = fetch_from_api("findings", {
+                "report_id": op("eq", dg.report_id)
+            })
+            for res in jres:
+                ## print(f"res {res}")
+                res["cog_over"] = dg.cog_over
+                res = res | model_to_dict(dg)
+                # We only need a subset of the keys
+                # that come back from the API query.
+                to_delete = set(res.keys()).difference(findings_fields_to_keep)
+                for k in to_delete:
+                    del res[k]
+                # Make sure booleans are booleans...
+                # Peewee does not treat 'N' as False.
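+                # convert_bools maps the API's 'Y'/'YES'/'T'/'TRUE' values
+                # to True and 'N'/'NO'/'F'/'FALSE' to False.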
+                res = convert_bools(res)
+                # console.log(res)
+                dfq = (DailyFindings
+                       .select()
+                       .where((DailyFindings.report_id == dg.report_id)
+                              & (DailyFindings.award_reference == res["award_reference"])
+                              & (DailyFindings.reference_number == res["reference_number"])))
+                if dfq.exists():
+                    for df in dfq:
+                        print(
+                            f"\tUpdating {dg.report_id} {res['award_reference']} {res['reference_number']}")
+                        (df
+                         .update(**res)
+                         .where((DailyFindings.report_id == dg.report_id)
+                                & (DailyFindings.award_reference == res["award_reference"])
+                                & (DailyFindings.reference_number == res["reference_number"]))
+                         .execute())
+                else:
+                    print(
+                        f"\tCreating {dg.report_id} {res['award_reference']} {res['reference_number']}")
+                    DailyFindings.create(**res)
+            dg.date_retrieved = today()
+            dg.findings_count = len(jres)
+            dg.save()
+
+    def awards(self, report_id=None):
+        print("AWARDS")
+        # console = Console()
+
+        if report_id:
+            gq = DailyGenerals.select().where(DailyGenerals.report_id == report_id)
+        else:
+            gq = DailyGenerals.select().where(DailyGenerals.awards_count.is_null())
+        for dg in gq:
+            # For each general...
+            dfq = (DailyFindings
+                   .select()
+                   .where(DailyFindings.report_id == dg.report_id))
+            awards_count = 0
+            # We already have findings loaded.
+            # These are the awards that we care about.
+            for df in dfq:
+                # Now, for each row we find, we need to
+                # look up more award info.
+                jres = fetch_from_api("federal_awards", {
+                    "report_id": op("eq", dg.report_id),
+                    "award_reference": op("eq", df.award_reference)
+                })
+                awards_count += 1
+                # What comes back are federal awards results.
+                for res in jres:
+                    # Update the appropriate record.
+                    res["aln"] = (res["federal_agency_prefix"] +
+                                  "." + res["federal_award_extension"])
+                    # We only need a subset of the keys
+                    # that come back from the API query.
+                    res = res | model_to_dict(dg)
+                    to_delete = set(res.keys()).difference(
+                        awards_fields_to_keep)
+                    for k in to_delete:
+                        del res[k]
+                    res = convert_bools(res)
+                    # Update the row in question.
+                    print(f"\tUpdating awards for {df.report_id} {df.award_reference} {df.reference_number}")
+                    (df
+                     .update(**res)
+                     .where((DailyFindings.report_id == dg.report_id)
+                            & (DailyFindings.award_reference == df.award_reference)
+                            & (DailyFindings.reference_number == df.reference_number))
+                     .execute())
+            dg.awards_count = awards_count
+            dg.save()
+
+    def _add_sheets(self, wb, iter, query):
+        # get_unique_agency_numbers()
+        for iter_value in iter:
+            ws = wb.create_sheet(f"{iter_value}")
+            # Put headers on the sheets.
+            for obj in query(iter_value):
+                as_d = model_to_dict(obj)
+                ws.append(list(as_d.keys()))
+                break
+            # Now the values.
+            for obj in query(iter_value):
+                as_d = model_to_dict(obj)
+                ws.append(list(as_d.values()))
+            adjust_columns(ws)
+
+    def _cleanup_sheet(self, ws):
+        boolean_columns = ["K", "L", "M", "O", "P", "Q", "R", "S", "T", "U"]
+        # Tries to go through a sheet and:
+        # 1. Hyperlink all the report IDs,
+        # 2. Clean up all the booleans.
+        # The columns are hard-coded to the order
+        # they appear from the dump into the sheet.
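+        # If the field order of DailyFindings changes, update
+        # boolean_columns to match.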
+        try:
+            report_ids = []
+            for cell in ws["B"]:
+                if ("GSAFAC" in cell.value) or ("CENSUS" in cell.value):
+                    report_ids.append(cell.value)
+                    cell.hyperlink = f"https://app.fac.gov/dissemination/report/pdf/{cell.value}"
+                else:
+                    pass
+            for ndx, cell in enumerate(ws["C"][1:]):
+                cell.hyperlink = f"https://app.fac.gov/dissemination/summary/{report_ids[ndx]}"
+            for bool_column in boolean_columns:
+                for cell in ws[bool_column]:
+                    if cell.value == 1:
+                        cell.value = "YES"
+                    elif cell.value == 0:
+                        cell.value = "NO"
+                    else:
+                        pass
+            for bool_column in boolean_columns:
+                for cell in ws[bool_column]:
+                    if cell.value == "YES":
+                        cell.fill = yes_fill
+        except Exception:
+            # Sheets with no data (or unexpected values) are left as-is.
+            pass
+
+    def _remove_default_sheet(self, wb):
+        # Try removing the default sheet.
+        try:
+            del wb['Sheet']
+        except KeyError:
+            pass
+
+    def to_xlsx(self):
+        print("TO XLSX")
+        wb = Workbook()
+        self._add_sheets(
+            wb,
+            get_unique_agency_numbers(),
+            lambda iter_value:
+            (DailyFindings
+             .select()
+             .where(DailyFindings.aln.startswith(iter_value)))
+        )
+        self._add_sheets(
+            wb,
+            get_unique_cog_overs(),
+            lambda iter_value:
+            (DailyFindings
+             .select()
+             .where(DailyFindings.cog_over == iter_value))
+        )
+
+        # Hyperlink the report IDs.
+        for sheet in wb.worksheets:
+            self._cleanup_sheet(sheet)
+
+        self._remove_default_sheet(wb)
+
+        return wb
+
+
+# @click.command()
+# @click.argument('acceptance_date', default="2024-03-02")
+# @click.option("--clean", is_flag=True, show_default=True, default=False,)
+# @click.option("--omit-generals", is_flag=True, show_default=True, default=False,)
+# @click.option("--omit-findings", is_flag=True, show_default=True, default=False,)
+# @click.option("--omit-awards", is_flag=True, show_default=True, default=False,)
+# @click.option("--report-id", default=None,)
+def findings_by_aln(acceptance_date,
+                    clean=True,
+                    omit_generals=False,
+                    omit_findings=False,
+                    omit_awards=False,
+                    report_id=None):
+    acceptance_date = string_to_datetime(acceptance_date)
+    db_filename = f"{acceptance_date.strftime('%Y-%m-%d')}.sqlite"
+    workbook_filename = f"{acceptance_date.strftime('%Y-%m-%d')}-findings.xlsx"
+    # Possibly remove work products.
+    # If we're only running part of the generation, then
+    # do not clean things. That's an error on the user's part.
+    if clean and not any([omit_generals, omit_findings, omit_awards]):
+        rm(path_based_on_ext(db_filename))
+
+    setup_database(db_filename)
+
+    qparams = []
+    qparams.append(QParam(acceptance_date.date()))
+    fac = FAC(qparams)
+
+    g0 = g1 = 0
+    f0 = f1 = 0
+    a0 = a1 = 0
+
+    t0 = time.time()
+    if omit_generals:
+        print("Skipping general generation")
+    else:
+        g0 = time.time()
+        fac.general(report_id=report_id)
+        g1 = time.time()
+
+    if omit_findings:
+        print("Skipping findings generation")
+    else:
+        f0 = time.time()
+        fac.findings(report_id=report_id)
+        f1 = time.time()
+
+    if omit_awards:
+        print("Skipping award generation")
+    else:
+        a0 = time.time()
+        fac.awards(report_id=report_id)
+        a1 = time.time()
+    t1 = time.time()
+
+    try:
+        wb = fac.to_xlsx()
+        rm(path_based_on_ext(workbook_filename))
+        wb.save(path_based_on_ext(workbook_filename))
+        DailyMetadata.create(
+            date_retrieved=today(),
+            queries_used=get_query_count(),
+            time_elapsed=t1-t0,
+            time_general=g1-g0,
+            time_findings=f1-f0,
+            time_awards=a1-a0,
+        )
+    except Exception:
+        print(f"{acceptance_date} NO FINDINGS, NO WORKBOOK")
diff --git a/notebook-example/three/notebooks/libraries/findings_const.py b/notebook-example/three/notebooks/libraries/findings_const.py
new file mode 100644
index 0000000..53c7168
--- /dev/null
+++ b/notebook-example/three/notebooks/libraries/findings_const.py
@@ -0,0 +1,3 @@
+FAC_API_BASE = "https://api.fac.gov"
+MAX_RESULTS = 4_000_000
+STEP_SIZE = 20000
diff --git a/notebook-example/three/notebooks/libraries/findings_models.py b/notebook-example/three/notebooks/libraries/findings_models.py
new file mode 100644
index 0000000..552b8f3
--- /dev/null
+++ b/notebook-example/three/notebooks/libraries/findings_models.py
@@ -0,0 +1,79 @@
+from peewee import *
+from libraries.findings_util import (
+    path_based_on_ext
+)
+
+proxy = DatabaseProxy()  # Create a proxy for our db.
+
+# We're going to need to cache things.
+# So, a local DB makes sense.
+# The table design...
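+# (One denormalized row per finding; see DailyFindings below.)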
+# It will pull from General, Findings, and Federal Awards.
+
+class DailyMetadata(Model):
+    date_retrieved = DateField(null=True)
+    queries_used = IntegerField(null=True)
+    time_elapsed = IntegerField(null=True)
+    time_general = IntegerField(null=True)
+    time_findings = IntegerField(null=True)
+    time_awards = IntegerField(null=True)
+
+    class Meta:
+        database = proxy
+
+class DailyGenerals(Model):
+    report_id = TextField(unique=True)  # PK
+    auditee_name = TextField()
+    auditee_uei = TextField()
+    date = DateField()
+    date_retrieved = DateField(null=True)
+    findings_count = IntegerField(null=True)
+    awards_count = IntegerField(null=True)
+    cog_over = TextField(null=True)
+
+    class Meta:
+        database = proxy
+
+
+class DailyFindings(Model):
+    report_id = TextField()
+    auditee_name = TextField()
+    auditee_uei = TextField()
+    award_reference = TextField(null=True)
+    reference_number = TextField(null=True)
+    aln = TextField(null=True)
+    cog_over = TextField(null=True)
+    federal_program_name = TextField(null=True)
+    amount_expended = IntegerField(null=True)
+    is_direct = BooleanField(null=True)
+    is_major = BooleanField(null=True)
+    is_passthrough_award = BooleanField(null=True)
+    passthrough_amount = IntegerField(null=True)
+    is_modified_opinion = BooleanField(null=True)
+    is_other_matters = BooleanField(null=True)
+    is_material_weakness = BooleanField(null=True)
+    is_significant_deficiency = BooleanField(null=True)
+    is_other_findings = BooleanField(null=True)
+    is_questioned_costs = BooleanField(null=True)
+    is_repeat_finding = BooleanField(null=True)
+    prior_finding_ref_numbers = TextField(null=True)
+
+    class Meta:
+        database = proxy
+
+
+def get_unique_agency_numbers():
+    ans = set()
+    for df in DailyFindings.select():
+        ans.add(df.aln.split(".")[0])
+    return sorted(list(ans))
+
+def get_unique_cog_overs():
+    cogs = set()
+    for df in DailyFindings.select():
+        cogs.add(df.cog_over)
+    return sorted(list(cogs))
+
+def setup_database(filename):
+    # Set up the SQLite database behind the proxy.
+    # NOTE: the filename parameter is currently unused;
+    # the data lives in an in-memory database.
+    db = SqliteDatabase(':memory:')
+    proxy.initialize(db)
+    db.create_tables([DailyMetadata, DailyGenerals, DailyFindings])
diff --git a/notebook-example/three/notebooks/libraries/findings_util.py b/notebook-example/three/notebooks/libraries/findings_util.py
new file mode 100644
index 0000000..477053f
--- /dev/null
+++ b/notebook-example/three/notebooks/libraries/findings_util.py
@@ -0,0 +1,93 @@
+from datetime import datetime
+import os
+import requests
+
+from libraries.findings_const import (
+    FAC_API_BASE
+)
+import logging
+logger = logging.getLogger(__name__)
+
+API_KEY = ""
+
+def set_api_key(key):
+    global API_KEY
+    API_KEY = key
+
+def get_api_key():
+    return API_KEY
+
+def op(op, value):
+    return f"{op}.{value}"
+
+
+def string_to_datetime(strdate):
+    parts = strdate.split("-")
+    return datetime(int(parts[0]), int(parts[1]), int(parts[2]))
+
+def today():
+    return datetime.now().strftime('%Y-%m-%d')
+
+query_count = 0
+
+def get_query_count():
+    return query_count
+
+def fetch_from_api(table, payload):
+    global query_count
+    query_count += 1
+    payload = payload | {"api_key": get_api_key()}
+
+    res = requests.get(f"{FAC_API_BASE}/{table}",
+                       params=payload)
+    jres = res.json()
+    if len(jres) == 0:
+        print(f"No results found for {table}")
+    return jres
+
+
+def rm(filename):
+    try:
+        os.remove(filename)
+    except FileNotFoundError:
+        pass
+
+def path_based_on_ext(the_file):
+    filename, file_extension = os.path.splitext(the_file)
+    try:
+        # Create the extension-named directory that the
+        # returned path points into.
+        os.mkdir(file_extension[1:])
+    except FileExistsError:
+        pass
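+    # e.g. "2024-06-17-findings.xlsx" -> "xlsx/2024-06-17-findings.xlsx"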
+    return os.path.join(file_extension[1:], f"{filename}{file_extension}")
+
+
+def convert_bools(res):
+    for k in res.keys():
+        if res[k] in ['Y', "TRUE", "T", "YES"]:
+            res[k] = True
+        elif res[k] in ["N", "NO", "FALSE", "F"]:
+            res[k] = False
+    return res
+
+
+def adjust_columns(ws):
+    for col in ws.columns:
+        max_length = 0
+        column = col[0].column_letter  # Get the column name
+        for cell in col:
+            try:  # Necessary to avoid error on empty cells
+                if len(str(cell.value)) > max_length:
+                    max_length = len(str(cell.value))
+            except Exception:
+                pass
+        adjusted_width = (max_length + 2) * 1.2
+        ws.column_dimensions[column].width = adjusted_width
+    return ws
+
+def cog_over(c, o):
+    if c:
+        return f"COG-{c}"
+    else:
+        return f"OVER-{o}"
diff --git a/notebook-example/three/notebooks/libraries/sum_over_alns.py b/notebook-example/three/notebooks/libraries/sum_over_alns.py
new file mode 100644
index 0000000..8696578
--- /dev/null
+++ b/notebook-example/three/notebooks/libraries/sum_over_alns.py
@@ -0,0 +1,296 @@
+import os
+import requests
+import sys
+import datetime
+#from alive_progress import alive_bar
+from .aln import ALN
+import pandas as pd
+from openpyxl import Workbook
+from openpyxl.utils.dataframe import dataframe_to_rows
+import time
+
+# https://stackoverflow.com/questions/17755996/how-to-make-a-list-as-the-default-value-for-a-dictionary
+from collections import defaultdict
+from pprint import pprint
+from collections import namedtuple as NT
+
+FAC_API_BASE = "https://api.fac.gov"
+# This hard-overrides using the local data.
+# It involves leaving out some audits, but it is faster,
+# and avoids key limit issues while testing.
+# FAC_API_BASE = "http://localhost:3000"
+# FAC_API_KEY = os.getenv("API_GOV_KEY")
+MAX_RESULTS = 4_000_000
+STEP_SIZE = 20000
+
+# Basic headers; intended for use locally as well as remotely.
+def BASE_HEADERS(api_key):
+    return {
+        "X-API-Key": api_key
+    }
+
+
+def load_aln_list(fname):
+    alns = set()
+    with open(fname, 'r') as fp:
+        for line in fp:
+            line = line.strip()
+            parts = line.split(".")
+            if len(parts) == 1:
+                alns.add(ALN(parts[0]))
+            else:
+                alns.add(ALN(parts[0], parts[1]))
+    return list(alns)
+
+
+def op(op, value):
+    return f"{op}.{value}"
+
+
+def string_to_datetime(strdate):
+    parts = strdate.split("-")
+    return datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
+
+
+memoize_dates = {}
+
+
+def get_date(report_id, api_key=""):
+    if memoize_dates.get(report_id, False):
+        return string_to_datetime(memoize_dates.get(report_id))
+    payload = {
+        "report_id": op("eq", report_id),
+        "select": ",".join(["report_id", "fac_accepted_date"]),
+        "api_key": api_key,
+    }
+    res = requests.get(f"{FAC_API_BASE}/general",
+                       params=payload,
+                       headers=BASE_HEADERS(api_key))
+    jres = res.json()
+    if len(jres) == 0:
+        print(f"NO DATE FOUND FOR {report_id}")
+        sys.exit()
+    the_date = jres[0]["fac_accepted_date"]
+    memoize_dates[report_id] = the_date
+    the_date = string_to_datetime(the_date)
+    return the_date
+
+
+def calculate_for_aln(aln,
+                      audit_year="2023",
+                      before_acceptance="2023-06-28",
+                      api_key=""):
+    # What report IDs does this ALN appear in?
+    # aln : report_id
+    aln_to_report_ids = defaultdict(list)
+    # What is the total direct amount on that ALN?
+    # aln : total
+    aln_to_total = defaultdict(lambda: 0)
+    # How many times do we see this ALN?
+    # aln : count
+    aln_to_count = defaultdict(lambda: 0)
+    aln_dates = defaultdict(list)
+    before_acceptance = string_to_datetime(before_acceptance)
+
+    # We begin by finding this ALN in the federal_awards table.
+    payload = {
+        # The limit must equal the offset step below; otherwise
+        # records are skipped and the loop ends after one page.
+        "limit": STEP_SIZE,
+        "federal_agency_prefix": op("eq", aln.agency),
+        "audit_year": op("eq", audit_year),
+        "is_direct": op("eq", "Y"),
+        "select": ",".join(["report_id", "amount_expended", "is_direct", "federal_agency_prefix", "federal_award_extension"]),
+        "api_key": api_key,
+    }
+    # If they included a program, and not just an agency number...
+    if aln.program:
+        payload["federal_award_extension"] = op("eq", aln.program)
+
+    url = f"{FAC_API_BASE}/federal_awards"
+
+    for start in range(0, MAX_RESULTS, STEP_SIZE):
+        payload["offset"] = start
+
+        res = requests.get(url,
+                           params=payload)
+        jres = res.json()
+        len_jres = len(jres)
+        # print(f"[{payload['offset']} -> {payload['offset'] + (STEP_SIZE-1)}] Retrieved {len_jres} results...")
+        if jres == []:
+            break
+        elif "code" in jres:
+            print("ERROR: ")
+            pprint(jres)
+            break
+        else:
+            for r in jres:
+                this_date = get_date(r["report_id"], api_key)
+                r["fac_accepted_date"] = this_date
+                if this_date < before_acceptance:
+                    aln_to_report_ids[aln].append(r["report_id"])
+                    aln_to_count[aln] = aln_to_count.get(aln, 0) + 1
+                    aln_dates[aln].append(this_date)
+                    if r["is_direct"] == "Y":
+                        aln_to_total[aln] = aln_to_total.get(
+                            aln, 0) + r["amount_expended"]
+            if len_jres < STEP_SIZE:
+                break
+
+    # return (str(aln), aln_to_report_ids, aln_to_total, aln_to_count)
+    return Results(audit_year, str(aln), aln_to_report_ids[aln], aln_to_total[aln], aln_to_count[aln])
+
+
+def fac_weight_fun(reports, awards, dollars):
+    v = (0.485 * reports) + (0.485 * awards) + (0.03 * dollars)
+    return round(v, 3)
+
+
+class Results():
+    def __init__(self, audit_year, aln, report_ids, total_dollars, award_count):
+        self.audit_year = audit_year
+        self.aln = aln
+        self.report_ids = set(report_ids)
+        self.total_dollars = total_dollars
+        self.award_count = award_count
+
+    def __str__(self):
+        return f"{self.aln} rids: {len(self.report_ids)} $: {self.total_dollars} awards: {self.award_count}"
+
+    def __repr__(self):
+        return self.__str__()
+
+    def to_csv(self):
+        return f"{self.audit_year},{self.aln},{len(self.report_ids)},{self.award_count},{self.total_dollars}"
+
+
+class ResultSummary():
+    def __init__(self, agency_number):
+        self.agency_number = agency_number
+        self.results = defaultdict(list)
+        self.alns = defaultdict(list)
+        self.report_counts = defaultdict(list)
+        self.award_counts = defaultdict(list)
+        self.total_dollars = defaultdict(list)
+        self.pct_of_reports = defaultdict(list)
+        self.pct_of_awards = defaultdict(list)
+        self.pct_of_dollars = defaultdict(list)
+        self.fac_weights = defaultdict(list)
+
+    def add_result(self, audit_year, r):
+        self.results[audit_year].append(r)
+
+    def prep_report(self):
+        for ay, rs in self.results.items():
+            for r in rs:
+                self.alns[ay].append(r.aln)
+                self.report_counts[ay].append(len(r.report_ids))
+                self.award_counts[ay].append(r.award_count)
+                self.total_dollars[ay].append(int(r.total_dollars))
+            try:
+                self.pct_of_reports[ay] = list(
+                    map(lambda n: round(n / sum(self.report_counts[ay])*100, 3), self.report_counts[ay]))
+                self.pct_of_awards[ay] = list(
+                    map(lambda n: round(n / sum(self.award_counts[ay])*100, 3), self.award_counts[ay]))
+                self.pct_of_dollars[ay] = list(
+                    map(lambda n: round(n / sum(self.total_dollars[ay])*100, 3), self.total_dollars[ay]))
+                self.fac_weights[ay] = list(map(fac_weight_fun,
+                                                self.pct_of_reports[ay],
+                                                self.pct_of_awards[ay],
+                                                self.pct_of_dollars[ay]))
+            except ZeroDivisionError:
+                print("REPORT COUNTS", self.report_counts)
+                print("AWARD COUNTS", self.award_counts)
+                print("TOTAL DOLLARS", self.total_dollars)
+
+    def report_as_xlsx(self):
+        self.prep_report()
+        wb = Workbook()
+        ws = wb.create_sheet("Overview")
+        df = pd.DataFrame({
+            "note": [
+                "All values rounded to 3 places.",
+                "FAC weight is (0.485 * pct_rpt) + (0.485 * pct_awd) + (0.03 * pct_$)",
+                "FAC weight can be used for estimating opdiv contribution, if desired.",
+            ]
+        })
+        for r in dataframe_to_rows(df, index=True, header=True):
+            ws.append(r)
+
+        for ay, _ in self.results.items():
+            ws = wb.create_sheet(f"AY{ay}")
+            df = pd.DataFrame(
+                {
+                    "aln": self.alns[ay],
+                    "report_count": self.report_counts[ay],
+                    "award_count": self.award_counts[ay],
+                    "total_dollars": self.total_dollars[ay],
+                    "pct_of_reports": self.pct_of_reports[ay],
+                    "pct_of_awards": self.pct_of_awards[ay],
+                    "pct_of_dollars": self.pct_of_dollars[ay],
+                    "fac_weight": self.fac_weights[ay]
+                }
+            )
+            for r in dataframe_to_rows(df, index=True, header=True):
+                ws.append(r)
+        del wb['Sheet']
+        wb.save(f"agency-{self.agency_number}-distribution.xlsx")
+
+
+def get_alns_by_agency_number(audit_year, agency_number, api_key="NO_API_KEY"):
+    payload = {
+        "federal_agency_prefix": op("eq", agency_number),
+        "select": "federal_award_extension",
+        "audit_year": op("eq", audit_year),
+    }
+    url = f"{FAC_API_BASE}/federal_awards"
+    all_alns = set()
+
+    for start in range(0, MAX_RESULTS, STEP_SIZE):
+        payload = payload | {
+            "offset": start,
+            "api_key": api_key,
+        }
+        res = requests.get(url,
+                           params=payload)
+        jres = res.json()
+        if jres == []:
+            break
+        elif "code" in jres:
+            print("ERROR: ")
+            pprint(jres)
+            break
+        else:
+            # Don't bother with another call if we had fewer than the max.
+            for r in jres:
+                all_alns.add(ALN(agency_number, r["federal_award_extension"]))
+            if len(jres) < STEP_SIZE:
+                break
+
+    return all_alns
+
+# @click.command()
+# @click.argument('list_of_alns')
+# @click.option('--audit-years', default="2023", help='Audit year')
+# @click.option('--before-acceptance', default="2023-06-28", help="Acceptance date")
+# @click.option("--distinct-alns-for-agency", default=None, help="Each distinct aln under an agency number.")
+def sum_over_alns(list_of_alns, audit_years, before_acceptance, distinct_alns_for_agency, api_key):
+    RS = ResultSummary(distinct_alns_for_agency)
+
+    for audit_year in list(map(lambda y: int(y), audit_years.split(","))):
+        if distinct_alns_for_agency:
+            alns = get_alns_by_agency_number(
+                audit_year,
+                distinct_alns_for_agency,
+                api_key
+            )
+        else:
+            alns = load_aln_list(list_of_alns)
+        for ndx, aln in enumerate(sorted(alns, key=lambda a: f"{a.agency}.{a.program}")):
+            print(f"Calculating for {aln} [{ndx + 1} of {len(alns)}]")
+            result = calculate_for_aln(aln,
+                                       audit_year=audit_year,
+                                       before_acceptance=before_acceptance,
+                                       api_key=api_key,
+                                       )
+            print(result)
+            RS.add_result(audit_year, result)
+
+    RS.report_as_xlsx()
diff --git a/notebook-example/three/notebooks/sum_alns.ipynb b/notebook-example/three/notebooks/sum_alns.ipynb
new file mode 100644
index 0000000..b1b718f
--- /dev/null
+++ b/notebook-example/three/notebooks/sum_alns.ipynb
@@ -0,0 +1,93 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First, install packages required by the notebook\n",
+    "%pip install pyodide-http requests pandas openpyxl\n",
+    "# Import the FAC support library\n",
+    "import fac\n",
+    "# Import the code for this demo\n",
+    "from libraries import sum_over_alns as soa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Does the API work?\n",
+    "\n",
+    "The first thing we'll check is whether the API works.\n",
+    "\n",
+    "If it does, we'll get back one record from the `general` API endpoint.\n",
+    "\n",
+    "If it doesn't, you'll see an error of some kind. In that case: 1) obtain an API key, and 2) copy the key you receive into the file `api_key.py`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "payload = { \"api_key\": fac.FAC_API_KEY }\n",
+    "query = payload | { \"limit\": 1 }\n",
+    "r = fac.get(\"general\", params = query)\n",
+    "print(r.text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Calculate the dollars per ALN\n",
+    "\n",
+    "This demonstrates a calculation using the FAC API.\n",
+    "\n",
+    "Given a list of ALNs, it:\n",
+    "\n",
+    "1. Looks up all awards with those ALNs, and\n",
+    "2. Adds up the direct funding on those awards.\n",
+    "\n",
+    "In this example, we're using a list of ALNs from Treasury. To test it with another list, create a file called `my_agency.alns` in the `files` folder, with one ALN per line. Then, change the code below to use `my_agency.alns` instead of `treasury.alns`. You can also change the year and date to explore other points in history.\n",
+    "\n",
+    "Note that this takes a while to run."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "soa.sum_over_alns(\"files/treasury.alns\", \"2023\", \"2024-06-19\", None, api_key=fac.FAC_API_KEY)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook-example/three/notebooks/traffic_per_week.ipynb b/notebook-example/three/notebooks/traffic_per_week.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/notebook-example/three/notebooks/util/clear_local_storage.ipynb b/notebook-example/three/notebooks/util/clear_local_storage.ipynb new file mode 100644 index 0000000..9eaea50 --- /dev/null +++ b/notebook-example/three/notebooks/util/clear_local_storage.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Clear local storage\n", + "\n", + "Found [here](https://github.com/jupyterlite/jupyterlite/issues/407#issuecomment-1353088447)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(\"\"\"\n", + "\n", + "\n", + "\"\"\"))\n", + " " + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook-example/three/notebooks/xlsx/README.md b/notebook-example/three/notebooks/xlsx/README.md new file mode 100644 index 0000000..17c4201 --- /dev/null +++ b/notebook-example/three/notebooks/xlsx/README.md @@ -0,0 +1 @@ +This is for output files from demonstration scripts. 
diff --git a/notebook-example/three/overrides.json b/notebook-example/three/overrides.json
new file mode 100644
index 0000000..0bbb9d9
--- /dev/null
+++ b/notebook-example/three/overrides.json
@@ -0,0 +1,14 @@
+{
+  "@jupyterlab/notebook-extension:panel": {
+    "toolbar": [
+      {
+        "name": "download",
+        "label": "Download",
+        "args": {},
+        "command": "docmanager:download",
+        "icon": "ui-components:download",
+        "rank": 50
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/notebook-example/three/requirements.txt b/notebook-example/three/requirements.txt
new file mode 100644
index 0000000..71432d5
--- /dev/null
+++ b/notebook-example/three/requirements.txt
@@ -0,0 +1,62 @@
+# jupyterlite-core
+# jupyterlite-pyodide-kernel
+# libarchive-c
+# jupyter_server
+# jupyterlab_server
+# jupyter-datatables
+# jupyterlab-filesystem-access
+# pyodide-http
+# requests
+
+# Core modules (mandatory)
+jupyterlite-core==0.3.0
+jupyterlab~=4.1.6
+notebook~=7.1.2
+
+
+# Python kernel (optional)
+jupyterlite-pyodide-kernel==0.3.2
+
+# JavaScript kernel (optional)
+jupyterlite-javascript-kernel==0.3.0
+
+# P5 kernel (optional)
+jupyterlite-p5-kernel==0.1.0
+
+# JupyterLab: Fasta file renderer (optional)
+jupyterlab-fasta>=3.3.0,<4
+# JupyterLab: GeoJSON file renderer (optional)
+jupyterlab-geojson>=3.4.0,<4
+# JupyterLab: guided tour (optional)
+# TODO: re-enable after https://github.com/jupyterlab-contrib/jupyterlab-tour/issues/82
+# jupyterlab-tour
+# JupyterLab: dark theme
+jupyterlab-night
+# JupyterLab: Miami nights theme (optional)
+jupyterlab_miami_nights
+
+# Python: ipywidgets library for Jupyter notebooks (optional)
+ipywidgets>=8.1.1,<9
+# Python: ipyevents library for Jupyter notebooks (optional)
+ipyevents>=2.0.1
+# Python: interactive Matplotlib library for Jupyter notebooks (optional)
+ipympl>=0.8.2
+# Python: ipycanvas library for Jupyter notebooks (optional)
+ipycanvas>=0.9.1
+# Python: ipyleaflet library for Jupyter notebooks (optional)
+ipyleaflet
+
+# Python: plotting libraries (optional)
+plotly>=5,<6
+bqplot
+
+# Language packs
+# https://github.com/jupyterlab/language-packs/tree/main/language-packs
+jupyterlab-language-pack-es-ES
+jupyterlab-language-pack-zh-CN
+jupyterlab-language-pack-vi-VN
+jupyterlab-language-pack-fr-FR
+
+openpyxl
+pysqlite3
+peewee
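
A quick local smoke test for the pieces above (a minimal sketch; it assumes pyodide-http and requests are installed locally -- they are commented out in requirements.txt -- and that a real key is in api_key.py or exported as FAC_API_KEY; pyodide_http.patch_all() is a no-op outside Pyodide):

    # smoke_test.py (hypothetical), run from notebook-example/three/notebooks/
    import fac

    # Mirrors the first cell of sum_alns.ipynb: fetch one record from
    # the `general` endpoint to confirm the key and the helper work.
    query = {"api_key": fac.FAC_API_KEY, "limit": 1}
    r = fac.get("general", params=query)
    print(r.status_code)
    print(r.text)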