diff --git a/README.md b/README.md
index 7eb31a7..4c07bf9 100644
--- a/README.md
+++ b/README.md
@@ -2,15 +2,74 @@
This repository contains a lightning-fast [Python 3 module](epss) and a series of [bash scripts](scripts) that are designed to make it easy for anyone to work with the daily outputs of the [Exploit Prediction Scoring System (EPSS)](https://www.first.org/epss/).
-🚧👷 Under construction 🏗️🚧
-
## Features
-- Explore EPSS scores using [Polars](https://pola.rs/), a lightning-fast dataframe library written in Rust
-- Idempotently download daily sets of EPSS scores1 in JSON, JSONL, CSV, or [Parquet](https://parquet.apache.org/) format2
-- Easily identify changes between two or more sets of EPSS scores
-- Translate sets of EPSS scores into sparse matrices to allow for the easy identification of changes to one or more computer security vulnerabilities on a daily or per CVE ID basis.
+- Idempotently download daily sets of EPSS scores<sup>1</sup> in JSON, JSONL, CSV, or [Apache Parquet](https://parquet.apache.org/)<sup>2</sup> format
+- Explore EPSS scores as either sparse or dense matrices using [Polars](https://pola.rs/), a lightning-fast dataframe library written in Rust
+- Easily switch between different versions<sup>3</sup> of the [EPSS model](https://www.first.org/epss/model) (see the sketch below, as well as the examples for [reading scores](examples/get-scores-as-polars-dataframe.py) and [reading changed scores](examples/get-changed-scores-as-polars-dataframe.py))
1. By default, EPSS scores will be downloaded from 2023-03-07 onward, as this is the date when the outputs of EPSS v3 (v2023.03.01) were first published.
2. Apache Parquet is the default file format.
+
+3. The EPSS model has had three major releases: [EPSS v1](https://arxiv.org/abs/1908.04856), EPSS v2 (v2022.01.01), and [EPSS v3 (v2023.03.01)](https://arxiv.org/abs/2302.14172), each a significant improvement over its predecessor.
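+
+For example, the Polars-backed client exposes a flag per model version that controls which scores are included. The following sketch mirrors the bundled example scripts and keeps only EPSS v3 scores:
+
+```
+from epss.client import PolarsClient
+
+# Only include scores produced by EPSS v3 (v2023.03.01).
+client = PolarsClient(
+    include_v1_scores=False,
+    include_v2_scores=False,
+    include_v3_scores=True,
+)
+```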
+
+## Background
+
+The Exploit Prediction Scoring System (EPSS) is a probabilistic [model](https://www.first.org/epss/model) that is designed to predict the likelihood of a given computer security vulnerability being exploited somewhere in the wild within the next 30 days.
+
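+As a concrete illustration, a score of 0.9 means that the model estimates a 90% chance of exploitation activity being observed within the next 30 days. The sketch below shows how a set of scores might be filtered with Polars; the `epss` and `percentile` column names are assumptions based on the columns used in the official EPSS CSV files, and the values shown are purely illustrative:
+
+```
+import polars as pl
+
+# Illustrative values only; real data comes from the daily EPSS publications.
+df = pl.DataFrame({
+    'cve': ['CVE-2021-44228', 'CVE-2019-0001'],
+    'epss': [0.97, 0.02],        # predicted probability of exploitation within 30 days
+    'percentile': [0.99, 0.45],  # rank relative to all other scored CVEs
+})
+
+# Keep only CVEs with at least a 50% predicted probability of exploitation.
+high_risk = df.filter(pl.col('epss') >= 0.5)
+print(high_risk)
+```
+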
+The first version of the EPSS model was released in 2021, and it has since undergone two major revisions.
+
+The first version of the EPSS model used logistic regression, but subsequent models have used [gradient-boosted decision trees](https://en.wikipedia.org/wiki/Gradient_boosting) ([XGBoost](https://en.wikipedia.org/wiki/XGBoost)) to make predictions.
+
+For additional information on EPSS and its applications, please consult the following resources:
+
+- [Exploit Prediction Scoring System (EPSS)](https://arxiv.org/abs/1908.04856)
+- [Enhancing Vulnerability Prioritization: Data-Driven Exploit Predictions with Community-Driven Insights](https://arxiv.org/abs/2302.14172)
+
+Additional resources:
+
+- [Daily analysis of EPSS scores](https://www.first.org/epss/data_stats)
+- [The Exploit Prediction Scoring System (EPSS) Explained](https://www.splunk.com/en_us/blog/learn/epss-exploit-prediction-scoring-system.html)
+- [F5 Labs Joins the Exploit Prediction Scoring System as a Data Partner](https://www.f5.com/labs/articles/cisotociso/f5-labs-joins-the-exploit-prediction-scoring-system-as-a-data-partner)
+
+## Usage
+
+### Developers
+
+This package is not currently available on PyPI, but it can easily be added to your project in one of two ways:
+
+- Using `poetry`<sup>1</sup>:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git
+```
+
+By branch:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git#main
+```
+
+By tag:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git#v3.0.0
+```
+
+- Using `requirements.txt`:
+
+By tag:
+
+```
+git+https://github.com/whitfieldsdad/epss.git@v3.0.0
+```
+
+By branch:
+
+```
+git+https://github.com/whitfieldsdad/epss.git@main
+```
+
+1. Using Poetry for dependency management, and adding this project as a dependency without explicitly specifying a branch or tag, is the recommended approach.
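+
+Once installed, EPSS scores can be read into a Polars dataframe with a few lines of code. The following sketch is condensed from the scripts in [examples](examples):
+
+```
+from epss.client import PolarsClient
+
+import os
+import tempfile
+
+# Scores are cached under a local working directory, so repeated runs are idempotent.
+WORKDIR = os.path.join(tempfile.gettempdir(), 'epss')
+
+# Only include scores produced by EPSS v3 (v2023.03.01).
+client = PolarsClient(
+    include_v1_scores=False,
+    include_v2_scores=False,
+    include_v3_scores=True,
+)
+
+# Set `drop_unchanged_scores=False` to also keep scores that did not change from one day to the next.
+df = client.get_scores(workdir=WORKDIR, drop_unchanged_scores=True)
+print(df)
+```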
diff --git a/epss/client.py b/epss/client.py
index 8039285..ce234ad 100644
--- a/epss/client.py
+++ b/epss/client.py
@@ -35,9 +35,10 @@ def get_scores(
workdir: str,
min_date: Optional[TIME] = None,
max_date: Optional[TIME] = None,
- query: Optional[Query] = None) -> Any:
+ query: Optional[Query] = None,
+ drop_unchanged_scores: bool = True) -> Any:
"""
- Returns a dataframe containing EPSS scores published between the specified dates including scores that have not changed since the last calculation.
+ Returns a dataframe containing EPSS scores published between the specified dates.
The dataframe will be sorted by date and CVE ID in descending order.
"""
@@ -54,31 +55,6 @@ def get_scores_by_date(
The dataframe will be sorted by CVE ID in descending order.
"""
raise NotImplementedError()
-
- def get_score_changelog(
- self,
- workdir: str,
- min_date: Optional[TIME] = None,
- max_date: Optional[TIME] = None,
- query: Optional[Query] = None) -> Any:
- """
- Returns a dataframe containing the changes to EPSS scores published between the specified dates.
-
- The dataframe will be sorted by date and CVE ID in descending order.
- """
- raise NotImplementedError()
-
- def get_score_changelog_by_date(
- self,
- workdir: str,
- date: Optional[TIME] = None,
- query: Optional[Query] = None) -> Any:
- """
- Returns a dataframe containing the changes to EPSS scores published on the specified date.
-
- The dataframe will be sorted by CVE ID in descending order.
- """
- raise NotImplementedError()
@dataclass()
@@ -229,27 +205,31 @@ def get_scores(
workdir: str,
min_date: Optional[TIME] = None,
max_date: Optional[TIME] = None,
- query: Optional[Query] = None) -> pl.DataFrame:
+ query: Optional[Query] = None,
+ drop_unchanged_scores: bool = True) -> pl.DataFrame:
min_date, max_date = self.get_date_range(min_date, max_date)
logger.info('Reading scores for %s - %s', min_date.isoformat(), max_date.isoformat())
- self.download_scores(
- workdir=workdir,
- min_date=min_date,
- max_date=max_date,
- )
- if min_date != max_date:
- f = functools.partial(
+ if min_date == max_date:
+ df = self.get_scores_by_date(workdir=workdir, date=min_date, query=query)
+ else:
+ resolver = functools.partial(
self.get_scores_by_date,
workdir=workdir,
query=query,
)
+ dates = tuple(self.iter_dates(min_date, max_date))
with concurrent.futures.ThreadPoolExecutor() as executor:
- dfs = executor.map(lambda date: f(date=date), self.iter_dates(min_date, max_date))
+ dfs = executor.map(lambda d: resolver(date=d), dates)
+
+ # If `drop_unchanged_scores` is True, only include scores that have changed since the last calculation.
+ if drop_unchanged_scores:
+ first = next(dfs)
+ changes = executor.map(lambda e: get_changed_scores(*e), util.iter_pairwise(dfs))
+ dfs = itertools.chain([first], changes)
+
df = pl.concat(dfs)
- else:
- df = self.get_scores_by_date(workdir=workdir, date=min_date, query=query)
df = df.sort(by=['date', 'cve'], descending=True)
return df
@@ -281,44 +261,6 @@ def get_scores_by_date(
df = df.sort(by=['cve'], descending=True)
return df
- def get_score_changelog(
- self,
- workdir: str,
- min_date: Optional[TIME] = None,
- max_date: Optional[TIME] = None,
- query: Optional[Query] = None) -> pl.DataFrame:
-
- dates = self.iter_dates(min_date, max_date)
- f = functools.partial(
- self.get_scores_by_date,
- workdir=workdir,
- query=query,
- )
- with concurrent.futures.ThreadPoolExecutor() as executor:
- dfs = executor.map(lambda d: f(date=d), dates)
- first = next(dfs)
- changes = executor.map(lambda e: get_changed_scores(*e), util.iter_pairwise(dfs))
- dfs = itertools.chain([first], changes)
- df = pl.concat(dfs)
- df = df.sort(by=['date', 'cve'], descending=True)
- return df
-
- def get_score_changelog_by_date(
- self,
- workdir: str,
- date: Optional[TIME] = None,
- query: Optional[Query] = None) -> pl.DataFrame:
-
- date = util.parse_date(date)
- previous_date = date - datetime.timedelta(days=1)
- if previous_date < self.min_date:
- raise ValueError(f'No scores available for {previous_date.isoformat()}')
-
- a = self.get_scores_by_date(workdir=workdir, date=date, query=query)
- b = self.get_scores_by_date(workdir=workdir, date=previous_date, query=query)
- d = get_changed_scores(a, b)
- return d
-
def filter_scores(self, df: pl.DataFrame, query: Query) -> pl.DataFrame:
min_date, max_date = self.get_date_range()
df = df.filter(pl.col('date') >= min_date)
diff --git a/examples/get-changed-scores-as-polars-dataframe.py b/examples/get-changed-scores-as-polars-dataframe.py
new file mode 100644
index 0000000..288b5da
--- /dev/null
+++ b/examples/get-changed-scores-as-polars-dataframe.py
@@ -0,0 +1,21 @@
+from epss.client import PolarsClient
+
+import polars as pl
+import logging
+import tempfile
+import os
+
+cfg = pl.Config()
+cfg.set_tbl_rows(-1) # Unlimited output length
+
+logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(levelname)s %(name)s %(message)s')
+
+WORKDIR = os.path.join(tempfile.gettempdir(), 'epss')
+
+client = PolarsClient(
+ include_v1_scores=False,
+ include_v2_scores=False,
+ include_v3_scores=True,
+)
+df = client.get_scores(workdir=WORKDIR, drop_unchanged_scores=True)
+print(df)
diff --git a/examples/get-scores-as-polars-dataframe.py b/examples/get-scores-as-polars-dataframe.py
new file mode 100644
index 0000000..5fe8392
--- /dev/null
+++ b/examples/get-scores-as-polars-dataframe.py
@@ -0,0 +1,21 @@
+from epss.client import PolarsClient
+
+import polars as pl
+import logging
+import tempfile
+import os
+
+cfg = pl.Config()
+cfg.set_tbl_rows(-1) # Unlimited output length
+
+logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(levelname)s %(name)s %(message)s')
+
+WORKDIR = os.path.join(tempfile.gettempdir(), 'epss')
+
+client = PolarsClient(
+ include_v1_scores=False,
+ include_v2_scores=False,
+ include_v3_scores=True,
+)
+df = client.get_scores(workdir=WORKDIR, drop_unchanged_scores=False)
+print(df)