diff --git a/README.md b/README.md index 21456b5..e66abe3 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,17 @@ This repository contains a lightning-fast [Python 3 module](epss) and a series o - Idempotently download daily sets of EPSS scores1 in JSON, JSONL, CSV, or [Apache Parquet](https://parquet.apache.org/)2 format - Explore EPSS scores using [Polars](https://pola.rs/), a lightning-fast dataframe library written in Rust -- Optionally drop unchanged scores +- Optionally drop unchanged scores3 - Optionally disable TLS certificate validation when downloading scores (i.e. to support environments where TLS MitM is being performed) -- [Easily](examples/get-scores-as-polars-dataframe.py) [switch](examples/get-changed-scores-as-polars-dataframe.py) between different versions3 of the [EPSS model](https://www.first.org/epss/model) +- [Easily](examples/get-scores-as-polars-dataframe.py) [switch](examples/get-changed-scores-as-polars-dataframe.py) between different versions4 of the [EPSS model](https://www.first.org/epss/model) 1. By default, EPSS scores will be downloaded from 2023-03-07 onward, as this is the date when the outputs of EPSS v3 (v2023.03.01) were first published. 2. Apache Parquet is the default file format. -3. EPSS has undergone 3 major revisions: [EPSS v1](https://arxiv.org/abs/1908.04856), EPSS v2 (v2022.01.01), and [EPSS v3 (v2023.03.01)](https://arxiv.org/abs/2302.14172) where the first, second, and third revisions all contain major improvements. +3. The [Cyentia Institute](https://www.cyentia.com/research/) [publishes](https://www.first.org/epss/data_stats) sets of EPSS scores partitioned by date on a daily basis in GZIP compressed CSV format. + +4. EPSS has undergone 3 major revisions: [EPSS v1](https://arxiv.org/abs/1908.04856), EPSS v2 (v2022.01.01), and [EPSS v3 (v2023.03.01)](https://arxiv.org/abs/2302.14172) where the first, second, and third revisions all contain major improvements. 
## Background @@ -78,27 +80,53 @@ git+git+https://github.com/owner/repo@main ### Command line interface -### Python - -#### Determine the minimum and maximum dates for which EPSS scores are available +#### Listing scores published between two dates -For example, if using EPSS v3 (v2023.03.01): +To list1 all scores published since 2024 without dropping unchanged scores2: -```python -from epss.client import PolarsClient +```bash +poetry run epss scores -a 2024-01-01 --no-drop-unchanged | head +``` -client = PolarsClient( - include_v1_scores=False, - include_v2_scores=False, - include_v3_scores=True, -) -min_date, max_date = client.get_date_range() +```text +shape: (7_992_196, 4) +┌──────────────────┬─────────┬────────────┬────────────┐ +│ cve ┆ epss ┆ percentile ┆ date │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ f64 ┆ f64 ┆ date │ +╞══════════════════╪═════════╪════════════╪════════════╡ +│ CVE-2019-2725 ┆ 0.97572 ┆ 1.0 ┆ 2024-01-01 │ +│ CVE-2019-1653 ┆ 0.97567 ┆ 1.0 ┆ 2024-01-01 │ +│ CVE-2015-7297 ┆ 0.97564 ┆ 0.99999 ┆ 2024-01-01 │ +│ CVE-2014-6271 ┆ 0.97564 ┆ 0.99999 ┆ 2024-01-01 │ +... +``` -print(f'Min date: {min_date}') -print(f'Max date: {max_date}') +```bash +poetry run epss scores -a 2024-01-01 --drop-unchanged | head ``` ```text -Min date: 2021-04-14 -Max date: 2022-02-03 +shape: (33_592, 4) +┌──────────────────┬─────────┬────────────┬────────────┐ +│ cve ┆ epss ┆ percentile ┆ date │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ f64 ┆ f64 ┆ date │ +╞══════════════════╪═════════╪════════════╪════════════╡ +│ CVE-2019-1653 ┆ 0.97555 ┆ 0.99998 ┆ 2024-01-03 │ +│ CVE-2020-14750 ┆ 0.97544 ┆ 0.99995 ┆ 2024-01-03 │ +│ CVE-2013-2423 ┆ 0.97512 ┆ 0.99983 ┆ 2024-01-03 │ +│ CVE-2019-19781 ┆ 0.97485 ┆ 0.99967 ┆ 2024-01-03 │ +... ``` + +1. When querying historical sets of EPSS scores, any scores that have not already been downloaded will be downloaded automatically to a configurable working directory3. You do not have to explicitly download EPSS scores before querying them. + +2. 
Unchanged scores are dropped by default - this behaviour can be toggled using the `--drop-unchanged/--no-drop-unchanged` flags. + +3. If a working directory is not explicitly provided, scores will be written to a folder named `476c9b0d-79c6-4b7e-a31a-e18cec3d6444/epss/scores-by-date` within the system's temporary directory (e.g. `/var/folders/ps/c0fn47n54sg08wck9_x9qncr0000gp/T/476c9b0d-79c6-4b7e-a31a-e18cec3d6444/epss/scores-by-date/`). + +#### Download scores published between two dates + + + diff --git a/epss/cli.py b/epss/cli.py index d7e0966..37e69b9 100644 --- a/epss/cli.py +++ b/epss/cli.py @@ -74,6 +74,7 @@ def main( @click.option('--max-date', '-b', help='Maximum date') @click.option('--output-file', '-o', help='Output file') @click.option('--output-format', '-f', type=click.Choice(OUTPUT_FORMATS), help='Output format') +@click.option('--drop-unchanged/--no-drop-unchanged', 'drop_unchanged_scores', default=True, show_default=True, help='Drop unchanged scores') @click.option('--download', is_flag=True, help="Don't write to an output file or the console, just download the data") @click.pass_context def get_scores_cli( @@ -84,6 +85,7 @@ def get_scores_cli( max_date: Optional[str], output_file: Optional[str], output_format: Optional[str], + drop_unchanged_scores: bool, download: bool): """ Get scores @@ -104,7 +106,11 @@ def get_scores_cli( workdir=workdir, min_date=min_date, max_date=max_date, + drop_unchanged_scores=drop_unchanged_scores, ) + df = df.sort(by=['cve'], descending=True) + df = df.sort(by=['epss'], descending=True, maintain_order=True) + df = df.sort(by=['date'], descending=False, maintain_order=True) write_output(df, output_file, output_format) diff --git a/epss/client.py b/epss/client.py index 543ec85..2515580 100644 --- a/epss/client.py +++ b/epss/client.py @@ -208,6 +208,10 @@ def get_scores( drop_unchanged_scores: bool = True) -> pl.DataFrame: min_date, max_date = self.get_date_range(min_date, max_date) + + # Start one day before the requested minimum date so the first requested date can be compared against the previous day. This is necessary to avoid listing all scores at the beginning of the
requested timeframe. + if drop_unchanged_scores: + min_date -= datetime.timedelta(days=1) if min_date == max_date: return self.get_scores_by_date(workdir=workdir, date=min_date, query=query) @@ -222,13 +226,14 @@ def get_scores( dfs = executor.map(lambda date: resolver(date=date), dates) if drop_unchanged_scores is False: df = pl.concat(dfs) - else: - first = next(dfs) + else: + first = get_changed_scores(next(dfs), next(dfs)) changes = executor.map(lambda e: get_changed_scores(*e), util.iter_pairwise(dfs)) df = pl.concat(itertools.chain([first], changes)) - df = df.sort(by=['date', 'cve'], descending=False) + df = df.sort(by=['cve'], descending=True) + df = df.sort(by=['date'], descending=False, maintain_order=True) return df def get_scores_by_date( @@ -255,7 +260,8 @@ def get_scores_by_date( if 'cve' not in df.columns: raise ValueError(f'The dataframe for {date.isoformat()} does not contain a `cve` column (columns: {df.columns})') - df = df.sort(by=['cve'], descending=False) + df = df.sort(by=['cve'], descending=True) + df = df.sort(by=['date'], descending=False, maintain_order=True) return df def filter_scores(self, df: pl.DataFrame, query: Query) -> pl.DataFrame: @@ -459,6 +465,9 @@ def get_changed_scores(a: pl.DataFrame, b: pl.DataFrame) -> pl.DataFrame: ) df = df.filter(pl.col('epss_change') != 0) df = df.drop('prev_epss', 'epss_change') + + df = df.sort(by=['cve'], descending=True) + df = df.sort(by=['date'], descending=False, maintain_order=True) return df diff --git a/examples/get-date-range-by-model-version.py b/examples/get-date-range-by-model-version.py new file mode 100644 index 0000000..551ab14 --- /dev/null +++ b/examples/get-date-range-by-model-version.py @@ -0,0 +1,11 @@ +from epss.client import PolarsClient + +client = PolarsClient( + include_v1_scores=False, + include_v2_scores=False, + include_v3_scores=True, +) +min_date, max_date = client.get_date_range() + +print(f'Min date: {min_date}') +print(f'Max date: {max_date}')