Skip to content
This repository has been archived by the owner on Nov 14, 2021. It is now read-only.

Commit

Permalink
Merge pull request #111 from the-scouts/reports-pt-3
Browse files Browse the repository at this point in the history
  • Loading branch information
AA-Turner authored Aug 23, 2021
2 parents 727e3f5 + 97bdef7 commit 7bd89be
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 134 deletions.
12 changes: 6 additions & 6 deletions compass/core/_scrapers/member_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@

if TYPE_CHECKING:
from collections.abc import Collection
from collections.abc import Iterator
from collections.abc import Iterable

from compass.core.util.client import Client

Expand Down Expand Up @@ -192,7 +192,7 @@ def get_personal_tab(client: Client, membership_number: int, /) -> ci.MemberDeta
"""
tree = _get_member_profile_tab(client, membership_number, "Personal")
details: dict[str, Union[None, int, str, datetime.date, ci.AddressData, dict[str, str]]] = dict()
details: dict[str, Union[None, int, str, datetime.date, ci.AddressData, dict[str, str]]] = {}

# ### Extractors
# ## Core:
Expand Down Expand Up @@ -415,7 +415,7 @@ def _membership_duration(dates: Collection[tuple[datetime.date, datetime.date]])
return round(membership_duration_days / 365.2425, 3) # Leap year except thrice per 400 years.


def _reduce_date_list(dates: Collection[tuple[datetime.date, datetime.date]]) -> Iterator[tuple[datetime.date, datetime.date]]:
def _reduce_date_list(dates: Collection[tuple[datetime.date, datetime.date]]) -> Iterable[tuple[datetime.date, datetime.date]]:
"""Reduce list of start and end dates to disjoint ranges.
Iterate through date pairs and get longest consecutive date ranges. Returns
Expand Down Expand Up @@ -599,7 +599,7 @@ def _process_role_data(role: html.HtmlElement) -> tuple[int, dict[str, Union[Non
"""Parses a personal learning plan from a LXML row element containing data."""
child_nodes = list(role)

role_data: dict[str, Union[None, str, int, datetime.date]] = dict()
role_data: dict[str, Union[None, str, int, datetime.date]] = {}
role_number = int(role.get("data-ng_mrn"))
role_data["role_number"] = role_number
role_data["role_title"] = child_nodes[0].text_content()
Expand Down Expand Up @@ -648,8 +648,8 @@ def _compile_ongoing_learning(training_plps: TYPES_TRAINING_PLPS, tree: html.Htm
"""
# Handle GDPR (Get latest GDPR date)
training_ogl: TYPES_TRAINING_OGL = dict()
gdpr_generator: Iterator[datetime.date] = (
training_ogl: TYPES_TRAINING_OGL = {}
gdpr_generator = (
module["validated_date"]
for plp in training_plps.values()
for module in plp
Expand Down
231 changes: 105 additions & 126 deletions compass/core/_scrapers/reports.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from __future__ import annotations

from pathlib import Path
import re
import time
from typing import Literal, TYPE_CHECKING
from typing import cast, Literal, TYPE_CHECKING

from lxml import html
import requests
Expand All @@ -12,74 +11,80 @@
from compass.core.logger import logger
from compass.core.settings import Settings
from compass.core.util import auth_header
from compass.core.util import context_managers

if TYPE_CHECKING:
from compass.core.util.auth_header import TYPE_AUTH_IDS
from compass.core.util.client import Client

# TODO move to schema.reports if created
# TODO remove location from start, to keep list small
_report_types: dict[str, int] = {
# group reports
"Group Appointments Report": 59,
# district reports
"District Appointments Report": 50,
"District Member Directory Report": 51,
# "District Member Directory 18 To 25 Years": ,
"District Permits Report": 70,
"District Disclosure Report": 78,
"District Training Report": 79,
"District Awards Report": 94,
"District Disclosure Management Report": 102,
# county reports
"County/Area/Region Appointments Report": 48,
"County/Area/Region Member Directory Report": 49,
"County/Area/Region Member Directory 18 To 25 Years": 53,
"County/Area/Region Permits Report": 69,
"County/Area/Region Disclosure Report": 77,
"County/Area/Region Training Report": 80,
"County/Area/Region Awards Report": 95,
"County Disclosure Management Report": 101,
# region reports
"Region Member Directory": 37,
"Region Appointments Report": 52,
"Region Permit Report": 72,
"Region Disclosure Report": 76,
"Region Training Report": 84,
"Region Disclosure Management Report": 100,
_report_ids_appointments: dict[ci.TYPES_UNIT_LEVELS, int] = {
"Group": 59,
"District": 50,
"County": 48,
"Region": 52,
}
_report_ids_member_directory: dict[ci.TYPES_UNIT_LEVELS, int] = {
"District": 51,
"County": 49,
"Region": 37,
}
_report_ids_18_25_member_directory: dict[ci.TYPES_UNIT_LEVELS, int] = {
"County": 53,
}
_report_ids_permits: dict[ci.TYPES_UNIT_LEVELS, int] = {
"District": 70,
"County": 69,
"Region": 72,
}
_report_ids_disclosure: dict[ci.TYPES_UNIT_LEVELS, int] = {
"District": 78,
"County": 77,
"Region": 76,
}
_report_ids_training: dict[ci.TYPES_UNIT_LEVELS, int] = {
"District": 79,
"County": 80,
"Region": 84,
}
_report_ids_awards: dict[ci.TYPES_UNIT_LEVELS, int] = {
"District": 94,
"County": 95,
}
_report_ids_disclosure_management: dict[ci.TYPES_UNIT_LEVELS, int] = {
"District": 102,
"County": 101,
"Region": 100,
}
TYPES_REPORTS = Literal[
# group
"Group Appointments Report",
# district
"District Appointments Report",
"District Member Directory Report",
"District Permits Report",
"District Disclosure Report",
"District Training Report",
"District Awards Report",
"District Disclosure Management Report",
# county
"County/Area/Region Appointments Report",
"County/Area/Region Member Directory Report",
"County/Area/Region Member Directory 18 To 25 Years",
"County/Area/Region Permits Report",
"County/Area/Region Disclosure Report",
"County/Area/Region Training Report",
"County/Area/Region Awards Report",
"County Disclosure Management Report",
# region
"Region Member Directory",
"Region Appointments Report",
"Region Permit Report",
"Region Disclosure Report",
"Region Training Report",
"Region Disclosure Management Report",
"Appointments Report",
"Member Directory Report",
"18-25 Member Directory Report",
"Permits Report",
"Disclosure Report",
"Training Report",
"Awards Report",
"Disclosure Management Report",
]
_report_ids: dict[TYPES_REPORTS, dict[ci.TYPES_UNIT_LEVELS, int]] = {
"Appointments Report": _report_ids_appointments,
"Member Directory Report": _report_ids_member_directory,
"18-25 Member Directory Report": _report_ids_18_25_member_directory,
"Permits Report": _report_ids_permits,
"Disclosure Report": _report_ids_disclosure,
"Training Report": _report_ids_training,
"Awards Report": _report_ids_awards,
"Disclosure Management Report": _report_ids_disclosure_management,
}


def export_report(client: Client, auth_ids: TYPE_AUTH_IDS, report_type: TYPES_REPORTS, stream: bool = False) -> bytes:
def export_report(
client: Client,
report_type: TYPES_REPORTS,
hierarchy_level: ci.TYPES_HIERARCHY_LEVELS,
auth_ids: TYPE_AUTH_IDS,
stream: bool = False,
) -> str:
"""Exports report as CSV from Compass.
See `Reports.get_report` for an overview of the export process
Expand All @@ -99,10 +104,14 @@ def export_report(client: Client, auth_ids: TYPE_AUTH_IDS, report_type: TYPES_RE
reports an HTTP 5XX status code
"""
if report_type not in _report_types:
types = [*_report_types.keys()]
if report_type not in _report_ids:
types = [*_report_ids]
raise ci.CompassReportError(f"{report_type} is not a valid report type. Valid report types are {types}") from None
report_number = _report_types[report_type]
report_level_map = _report_ids[report_type]
if hierarchy_level not in report_level_map:
raise ci.CompassReportError(f"Requested report does not exist for hierarchy level: {hierarchy_level}.")
hierarchy_level = cast(ci.TYPES_UNIT_LEVELS, hierarchy_level)
report_number = report_level_map[hierarchy_level]

# Get token for report type & role running said report:
run_report_url = _get_report_token(client, auth_ids, report_number)
Expand All @@ -114,13 +123,14 @@ def export_report(client: Client, auth_ids: TYPE_AUTH_IDS, report_type: TYPES_RE
# Update form data & set location selection:
_update_form_data(client, report_page, run_report_url, report_number)

# Export the report:
# Get report export URL:
logger.info("Exporting report")
export_url = _extract_report_export_url(report_page.decode("UTF-8"))

time_string = time.strftime("%Y-%m-%d %H-%M-%S") # colons are illegal on windows
filename = f"Compass Export - {report_type} - {time_string}.csv"
csv_export = _download_report(client, export_url, streaming=stream, filename=filename)
# Download report to CSV:
start = time.time()
csv_export = _download_report(client, export_url, streaming=stream)
logger.debug(f"Downloading took {time.time() - start:.2f}s")

# start = time.time()
# TODO TRAINING REPORT ETC.
Expand All @@ -142,16 +152,15 @@ def export_report(client: Client, auth_ids: TYPE_AUTH_IDS, report_type: TYPES_RE


def _get_report_token(client: Client, auth_ids: TYPE_AUTH_IDS, report_number: int) -> str:
params = {
"pReportNumber": str(report_number),
"pMemberRoleNumber": str(auth_ids[1]), # auth IDs are membership number, role number, 'jk'
}
logger.debug("Getting report token")
response = auth_header.auth_header_get(
auth_ids,
client,
f"{Settings.web_service_path}/ReportToken",
params=params,
params={
"pReportNumber": str(report_number),
"pMemberRoleNumber": str(auth_ids[1]), # auth IDs are membership number, role number, 'jk'
},
)
_error_status(response)

Expand All @@ -174,26 +183,27 @@ def _update_form_data(client: Client, report_page: bytes, run_report: str, repor
form_data = {el.name: el.value for el in tree.forms[0].inputs if el.get("type") not in {"checkbox", "image"}}

# Appointments Reports
if report_number == 52:
if report_number in {48, 52}: # County, Region
form_data = _form_data_appointments(form_data, tree)

# Compass does user-agent sniffing in reports!!! This does seem to be the
# only place that *requires* a Mozilla/5 type UA.
# Including the MicrosoftAjax pair lets us check errors quickly. In reality
# we don't care about the output of this POST, just that it doesn't fail.
report = client.post(
updated_report_page = client.post(
run_report,
data=form_data,
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)", "X-MicrosoftAjax": "Delta=true"},
)

# Check error state
_error_status(report, msg="Updating report locations failed!")
if "compass.scouts.org.uk%2fError.aspx|" in report.text:
_error_status(updated_report_page, msg="Updating report locations failed!")
if "compass.scouts.org.uk%2fError.aspx|" in updated_report_page.text:
raise ci.CompassReportError("Compass Error!")


def _form_data_appointments(form_data: dict[str, str], tree: html.HtmlElement) -> dict[str, str | None]:
"""Select all units/locations."""
additional_form_data = {
"ReportViewer1$ctl10": "ltr",
"ReportViewer1$ctl11": "standards",
Expand All @@ -205,34 +215,15 @@ def _form_data_appointments(form_data: dict[str, str], tree: html.HtmlElement) -
"__ASYNCPOST": "true",
} # TODO this may not be needed. Test.

# ReportViewer1$ctl04$ctl05$txtValue - County Label
# ReportViewer1$ctl04$ctl07$txtValue - District Label
# ReportViewer1$ctl04$ctl09$txtValue - Role Statuses
# ReportViewer1$ctl04$ctl15$txtValue - Columns Label

numbered_counties = _parse_drop_down_list(tree, "ReportViewer1_ctl04_ctl05_divDropDown") # Counties
numbered_districts = _parse_drop_down_list(tree, "ReportViewer1_ctl04_ctl07_divDropDown") # Districts
numbered_role_statuses = _parse_drop_down_list(tree, "ReportViewer1_ctl04_ctl09_divDropDown") # Role Statuses
numbered_column_names = _parse_drop_down_list(tree, "ReportViewer1_ctl04_ctl15_divDropDown") # Report Fields

# # Export regional roles only
# form_data["ReportViewer1$ctl04$ctl05$txtValue"] = "Regional Roles"
# form_data["ReportViewer1$ctl04$ctl05$divDropDown$ctl01$HiddenIndices"] = "0"

# Export all districts
form_data["ReportViewer1$ctl04$ctl05$txtValue"] = ", ".join(numbered_counties.values())
form_data["ReportViewer1$ctl04$ctl05$divDropDown$ctl01$HiddenIndices"] = ",".join(numbered_counties.keys())
form_data["ReportViewer1$ctl04$ctl07$txtValue"] = ", ".join(numbered_districts.values())
form_data["ReportViewer1$ctl04$ctl07$divDropDown$ctl01$HiddenIndices"] = ",".join(numbered_districts.keys())

# TODO this may not be needed. Test.
# update text values of role statuses and column names from default indices
form_data["ReportViewer1$ctl04$ctl09$txtValue"] = _get_defaults_labels(
form_data, "ReportViewer1$ctl04$ctl09$divDropDown$ctl01$HiddenIndices", numbered_role_statuses
)
form_data["ReportViewer1$ctl04$ctl15$txtValue"] = _get_defaults_labels(
form_data, "ReportViewer1$ctl04$ctl15$divDropDown$ctl01$HiddenIndices", numbered_column_names
)
# report level - 1 (e.g. county -> district)
numbered_levels_children = _parse_drop_down_list(tree, "ReportViewer1_ctl04_ctl05_divDropDown")
form_data["ReportViewer1$ctl04$ctl05$txtValue"] = ", ".join(numbered_levels_children.values())
form_data["ReportViewer1$ctl04$ctl05$divDropDown$ctl01$HiddenIndices"] = ",".join(numbered_levels_children.keys())

# report level - 2 (e.g. county -> group)
numbered_levels_grandchildren = _parse_drop_down_list(tree, "ReportViewer1_ctl04_ctl07_divDropDown")
form_data["ReportViewer1$ctl04$ctl07$txtValue"] = ", ".join(numbered_levels_grandchildren.values())
form_data["ReportViewer1$ctl04$ctl07$divDropDown$ctl01$HiddenIndices"] = ",".join(numbered_levels_grandchildren.keys())

return form_data | additional_form_data

Expand All @@ -242,33 +233,21 @@ def _extract_report_export_url(report_page: str) -> str:
cut = report_page[start:].removeprefix('ExportUrlBase":"')
end = cut.index('"')
full_url = cut[:end].encode().decode("unicode-escape")
return f"{full_url}CSV"
return f"{Settings.base_url}/{full_url}CSV"


def _download_report(client: Client, url_path: str, streaming: bool, filename: str | None = None) -> bytes:
start = time.time()
url = f"{Settings.base_url}/{url_path}"

# actually do the download
if streaming:
csv_export = b""
with client.get(url, stream=True) as r:
_error_status(r)
for chunk in r.iter_content(chunk_size=None): # chunk_size=None with stream=True yields data in whatever sizes it arrives
csv_export += chunk
else:
csv_export = client.get(url).content

logger.debug(f"Exporting took {time.time() - start:.2f}s")

# maybe save to disk
if filename is not None:
logger.info("Saving report")
with context_managers.filesystem_guard("Unable to write report export"):
Path(filename).write_bytes(csv_export)
logger.info("Report Saved")
def _download_report(client: Client, url: str, streaming: bool) -> str:
# standard download
if not streaming:
return client.get(url).content.decode("utf-8-sig") # report is returned with Byte Order Mark

return csv_export
# streaming download
csv_export = b""
with client.get(url, stream=True) as r:
_error_status(r)
for chunk in r.iter_content(chunk_size=None): # chunk_size=None with stream=True yields data in whatever sizes it arrives
csv_export += chunk
return csv_export.decode("utf-8-sig") # report is returned with Byte Order Mark


def _error_status(response: requests.Response, /, msg: str = "Request to Compass failed!") -> None:
Expand Down
5 changes: 3 additions & 2 deletions compass/core/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ def __init__(self, session: ci.Logon):
"""Constructor for Reports."""
self.auth_ids = session.membership_number, session.role_number, session._jk
self.client = session._client
self.hierarchy_level = session.hierarchy.level

def get_report(self, report_type: TYPES_REPORTS) -> bytes:
def get_report(self, report_type: TYPES_REPORTS) -> str:
"""Exports report as CSV from Compass.
Exporting a report is of course surprisingly complicated. The process
Expand Down Expand Up @@ -53,4 +54,4 @@ def get_report(self, report_type: TYPES_REPORTS) -> bytes:
reports an HTTP 5XX status code
"""
return export_report(self.client, self.auth_ids, report_type, stream=False)
return export_report(self.client, report_type, self.hierarchy_level, self.auth_ids, stream=False)

0 comments on commit 7bd89be

Please sign in to comment.