diff --git a/README.md b/README.md
index 21456b5..e66abe3 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,17 @@ This repository contains a lightning-fast [Python 3 module](epss) and a series o
- Idempotently download daily sets of EPSS scores1 in JSON, JSONL, CSV, or [Apache Parquet](https://parquet.apache.org/)2 format
- Explore EPSS scores using [Polars](https://pola.rs/), a lightning-fast dataframe library written in Rust
-- Optionally drop unchanged scores
+- Optionally drop unchanged scores3
- Optionally disable TLS certificate validation when downloading scores (i.e. to support environments where TLS MitM is being performed)
-- [Easily](examples/get-scores-as-polars-dataframe.py) [switch](examples/get-changed-scores-as-polars-dataframe.py) between different versions3 of the [EPSS model](https://www.first.org/epss/model)
+- [Easily](examples/get-scores-as-polars-dataframe.py) [switch](examples/get-changed-scores-as-polars-dataframe.py) between different versions4 of the [EPSS model](https://www.first.org/epss/model)
1. By default, EPSS scores will be downloaded from 2023-03-07 onward, as this is the date when the outputs of EPSS v3 (v2023.03.01) were first published.
2. Apache Parquet is the default file format.
-3. EPSS has undergone 3 major revisions: [EPSS v1](https://arxiv.org/abs/1908.04856), EPSS v2 (v2022.01.01), and [EPSS v3 (v2023.03.01)](https://arxiv.org/abs/2302.14172) where the first, second, and third revisions all contain major improvements.
+3. The [Cyentia Institute](https://www.cyentia.com/research/) [publishes](https://www.first.org/epss/data_stats) a new set of EPSS scores every day, partitioned by date, in GZIP-compressed CSV format.
+
+4. EPSS has undergone 3 major revisions: [EPSS v1](https://arxiv.org/abs/1908.04856), EPSS v2 (v2022.01.01), and [EPSS v3 (v2023.03.01)](https://arxiv.org/abs/2302.14172), each of which contains major improvements.
## Background
@@ -78,27 +80,53 @@ git+git+https://github.com/owner/repo@main
### Command line interface
-### Python
-
-#### Determine the minimum and maximum dates for which EPSS scores are available
+#### Listing scores published between two dates
-For example, if using EPSS v3 (v2023.03.01):
+To list1 all scores published on or after 2024-01-01 without dropping unchanged scores2:
-```python
-from epss.client import PolarsClient
+```bash
+poetry run epss scores -a 2024-01-01 --no-drop-unchanged | head
+```
-client = PolarsClient(
- include_v1_scores=False,
- include_v2_scores=False,
- include_v3_scores=True,
-)
-min_date, max_date = client.get_date_range()
+```text
+shape: (7_992_196, 4)
+┌──────────────────┬─────────┬────────────┬────────────┐
+│ cve ┆ epss ┆ percentile ┆ date │
+│ --- ┆ --- ┆ --- ┆ --- │
+│ str ┆ f64 ┆ f64 ┆ date │
+╞══════════════════╪═════════╪════════════╪════════════╡
+│ CVE-2019-2725 ┆ 0.97572 ┆ 1.0 ┆ 2024-01-01 │
+│ CVE-2019-1653 ┆ 0.97567 ┆ 1.0 ┆ 2024-01-01 │
+│ CVE-2015-7297 ┆ 0.97564 ┆ 0.99999 ┆ 2024-01-01 │
+│ CVE-2014-6271 ┆ 0.97564 ┆ 0.99999 ┆ 2024-01-01 │
+...
+```
-print(f'Min date: {min_date}')
-print(f'Max date: {max_date}')
+```bash
+poetry run epss scores -a 2024-01-01 --drop-unchanged | head
```
```text
-Min date: 2021-04-14
-Max date: 2022-02-03
+shape: (33_592, 4)
+┌──────────────────┬─────────┬────────────┬────────────┐
+│ cve ┆ epss ┆ percentile ┆ date │
+│ --- ┆ --- ┆ --- ┆ --- │
+│ str ┆ f64 ┆ f64 ┆ date │
+╞══════════════════╪═════════╪════════════╪════════════╡
+│ CVE-2019-1653 ┆ 0.97555 ┆ 0.99998 ┆ 2024-01-03 │
+│ CVE-2020-14750 ┆ 0.97544 ┆ 0.99995 ┆ 2024-01-03 │
+│ CVE-2013-2423 ┆ 0.97512 ┆ 0.99983 ┆ 2024-01-03 │
+│ CVE-2019-19781 ┆ 0.97485 ┆ 0.99967 ┆ 2024-01-03 │
+...
```
+
+1. When querying historical sets of EPSS scores, any scores that have not already been downloaded will be downloaded automatically to a configurable working directory3. You do not have to explicitly download EPSS scores before querying them.
+
+2. Unchanged scores are dropped by default - this behaviour can be toggled using the `--drop-unchanged/--no-drop-unchanged` flags.
+
+3. If a working directory is not explicitly provided, scores will be written to a folder named `476c9b0d-79c6-4b7e-a31a-e18cec3d6444/epss/scores-by-date` within the system's temporary directory (e.g. `/var/folders/ps/c0fn47n54sg08wck9_x9qncr0000gp/T/476c9b0d-79c6-4b7e-a31a-e18cec3d6444/epss/scores-by-date/`).
+
+#### Downloading scores published between two dates
+
+
+
diff --git a/epss/cli.py b/epss/cli.py
index d7e0966..37e69b9 100644
--- a/epss/cli.py
+++ b/epss/cli.py
@@ -74,6 +74,7 @@ def main(
@click.option('--max-date', '-b', help='Maximum date')
@click.option('--output-file', '-o', help='Output file')
@click.option('--output-format', '-f', type=click.Choice(OUTPUT_FORMATS), help='Output format')
+@click.option('--drop-unchanged/--no-drop-unchanged', 'drop_unchanged_scores', default=True, show_default=True, help='Drop unchanged scores')
@click.option('--download', is_flag=True, help="Don't write to an output file or the console, just download the data")
@click.pass_context
def get_scores_cli(
@@ -84,6 +85,7 @@ def get_scores_cli(
max_date: Optional[str],
output_file: Optional[str],
output_format: Optional[str],
+ drop_unchanged_scores: bool,
download: bool):
"""
Get scores
@@ -104,7 +106,11 @@ def get_scores_cli(
workdir=workdir,
min_date=min_date,
max_date=max_date,
+ drop_unchanged_scores=drop_unchanged_scores,
)
+ df = df.sort(by=['cve'], descending=True)
+ df = df.sort(by=['epss'], descending=True)
+ df = df.sort(by=['date'], descending=False)
write_output(df, output_file, output_format)
diff --git a/epss/client.py b/epss/client.py
index 543ec85..2515580 100644
--- a/epss/client.py
+++ b/epss/client.py
@@ -208,6 +208,10 @@ def get_scores(
drop_unchanged_scores: bool = True) -> pl.DataFrame:
min_date, max_date = self.get_date_range(min_date, max_date)
+
+ # This is necessary to avoid listing all scores at the beginning of the requested timeframe.
+ if drop_unchanged_scores:
+ min_date -= datetime.timedelta(days=-1)
if min_date == max_date:
return self.get_scores_by_date(workdir=workdir, date=min_date, query=query)
@@ -222,13 +226,14 @@ def get_scores(
dfs = executor.map(lambda date: resolver(date=date), dates)
if drop_unchanged_scores is False:
df = pl.concat(dfs)
- else:
- first = next(dfs)
+ else:
+ first = get_changed_scores(next(dfs), next(dfs))
changes = executor.map(lambda e: get_changed_scores(*e), util.iter_pairwise(dfs))
df = pl.concat(itertools.chain([first], changes))
- df = df.sort(by=['date', 'cve'], descending=False)
+ df = df.sort(by=['cve'], descending=True)
+ df = df.sort(by=['date'], descending=False)
return df
def get_scores_by_date(
@@ -255,7 +260,8 @@ def get_scores_by_date(
if 'cve' not in df.columns:
raise ValueError(f'The dataframe for {date.isoformat()} does not contain a `cve` column (columns: {df.columns})')
- df = df.sort(by=['cve'], descending=False)
+ df = df.sort(by=['cve'], descending=True)
+ df = df.sort(by=['date'], descending=False)
return df
def filter_scores(self, df: pl.DataFrame, query: Query) -> pl.DataFrame:
@@ -459,6 +465,9 @@ def get_changed_scores(a: pl.DataFrame, b: pl.DataFrame) -> pl.DataFrame:
)
df = df.filter(pl.col('epss_change') != 0)
df = df.drop('prev_epss', 'epss_change')
+
+ df = df.sort(by=['cve'], descending=True)
+ df = df.sort(by=['date'], descending=False)
return df
diff --git a/examples/get-date-range-by-model-version.py b/examples/get-date-range-by-model-version.py
new file mode 100644
index 0000000..551ab14
--- /dev/null
+++ b/examples/get-date-range-by-model-version.py
@@ -0,0 +1,11 @@
+from epss.client import PolarsClient
+
+client = PolarsClient(
+ include_v1_scores=False,
+ include_v2_scores=False,
+ include_v3_scores=True,
+)
+min_date, max_date = client.get_date_range()
+
+print(f'Min date: {min_date}')
+print(f'Max date: {max_date}')