diff --git a/README.md b/README.md
index 7eb31a7..4c07bf9 100644
--- a/README.md
+++ b/README.md
@@ -2,15 +2,74 @@
 
 This repository contains a lightning-fast [Python 3 module](epss) and a series of [bash scripts](scripts) that are designed to make it easy for anyone to work with the daily outputs of the [Exploit Prediction Scoring System (EPSS)](https://www.first.org/epss/).
 
-🚧👷 Under construction 🏗️🚧
-
 ## Features
 
-- Explore EPSS scores using [Polars](https://pola.rs/), a lightning-fast dataframe library written in Rust
-- Idempotently download daily sets of EPSS scores1 in JSON, JSONL, CSV, or [Parquet](https://parquet.apache.org/) format2
-- Easily identify changes between two or more sets of EPSS scores
-- Translate sets of EPSS scores into sparse matrices to allow for the easy identification of changes to one or more computer security vulnerabilities on a daily or per CVE ID basis.
+- Idempotently download daily sets of EPSS scores1 in JSON, JSONL, CSV, or [Apache Parquet](https://parquet.apache.org/)2 format
+- Explore EPSS scores as either sparse or dense matrices using [Polars](https://pola.rs/), a lightning-fast dataframe library written in Rust
+- [Easily](examples/get-scores-as-polars-dataframe.py) [switch](examples/get-changed-scores-as-polars-dataframe.py) between different versions3 of the [EPSS model](https://www.first.org/epss/model)
 
 1. By default, EPSS scores will be downloaded from 2023-03-07 onward, as this is the date when the outputs of EPSS v3 (v2023.03.01) were first published.
 
 2. Apache Parquet is the default file format.
+
+3. EPSS has undergone 3 major revisions: [EPSS v1](https://arxiv.org/abs/1908.04856), EPSS v2 (v2022.01.01), and [EPSS v3 (v2023.03.01)](https://arxiv.org/abs/2302.14172), each of which introduced major improvements over its predecessor.
+
+## Background
+
+The Exploit Prediction Scoring System (EPSS) is a probabilistic [model](https://www.first.org/epss/model) that is designed to predict the likelihood of a given computer security vulnerability being exploited somewhere in the wild within the next 30 days.
+
+The first version of the EPSS model was released in 2021, and it has since undergone two major revisions.
+
+The first version of the EPSS model used logistic regression, but subsequent models have used [gradient-boosted decision trees](https://en.wikipedia.org/wiki/Gradient_boosting) ([XGBoost](https://en.wikipedia.org/wiki/XGBoost)) to make predictions.
+
+For additional information on EPSS and its applications, please consult the following resources:
+
+- [Exploit Prediction Scoring System (EPSS)](https://arxiv.org/abs/1908.04856)
+- [Enhancing Vulnerability Prioritization: Data-Driven Exploit Predictions with Community-Driven Insights](https://arxiv.org/abs/2302.14172)
+
+Additional resources:
+
+- [Daily analysis of EPSS scores](https://www.first.org/epss/data_stats)
+- [The Exploit Prediction Scoring System (EPSS) Explained](https://www.splunk.com/en_us/blog/learn/epss-exploit-prediction-scoring-system.html#:~:text=In%20short%2C%20EPSS%20allows%20us,vulnerability%20might%20be%20if%20exploited.)
+- [F5 Labs Joins the Exploit Prediction Scoring System as a Data Partner](https://www.f5.com/labs/articles/cisotociso/f5-labs-joins-the-exploit-prediction-scoring-system-as-a-data-partner)
+
+## Usage
+
+### Developers
+
+This package is not currently available on PyPI, but can easily be added to your project in one of two ways:
+
+- Using `poetry`1:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git
+```
+
+By branch:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git#main
+```
+
+By tag:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git#v3.0.0
+```
+
+
+- Using `requirements.txt`:
+
+By tag:
+
+```
+git+https://github.com/whitfieldsdad/epss@v3.0.0
+```
+
+By branch:
+
+```
+git+https://github.com/whitfieldsdad/epss@main
+```
+
+1. Using Poetry for dependency management, and adding this project as a dependency without explicitly specifying a branch or tag, is the recommended approach.
diff --git a/epss/client.py b/epss/client.py
index 8039285..ce234ad 100644
--- a/epss/client.py
+++ b/epss/client.py
@@ -35,9 +35,10 @@ def get_scores(
         workdir: str,
         min_date: Optional[TIME] = None,
         max_date: Optional[TIME] = None,
-        query: Optional[Query] = None) -> Any:
+        query: Optional[Query] = None,
+        drop_unchanged_scores: bool = True) -> Any:
         """
-        Returns a dataframe containing EPSS scores published between the specified dates including scores that have not changed since the last calculation.
+        Returns a dataframe containing EPSS scores published between the specified dates.
 
         The dataframe will be sorted by date and CVE ID in descending order.
         """
@@ -54,31 +55,6 @@ def get_scores_by_date(
         The dataframe will be sorted by CVE ID in descending order.
         """
         raise NotImplementedError()
-
-    def get_score_changelog(
-        self,
-        workdir: str,
-        min_date: Optional[TIME] = None,
-        max_date: Optional[TIME] = None,
-        query: Optional[Query] = None) -> Any:
-        """
-        Returns a dataframe containing the changes to EPSS scores published between the specified dates.
-
-        The dataframe will be sorted by date and CVE ID in descending order.
-        """
-        raise NotImplementedError()
-
-    def get_score_changelog_by_date(
-        self,
-        workdir: str,
-        date: Optional[TIME] = None,
-        query: Optional[Query] = None) -> Any:
-        """
-        Returns a dataframe containing the changes to EPSS scores published on the specified date.
-
-        The dataframe will be sorted by CVE ID in descending order.
- """ - raise NotImplementedError() @dataclass() @@ -229,27 +205,31 @@ def get_scores( workdir: str, min_date: Optional[TIME] = None, max_date: Optional[TIME] = None, - query: Optional[Query] = None) -> pl.DataFrame: + query: Optional[Query] = None, + drop_unchanged_scores: bool = True) -> pl.DataFrame: min_date, max_date = self.get_date_range(min_date, max_date) logger.info('Reading scores for %s - %s', min_date.isoformat(), max_date.isoformat()) - self.download_scores( - workdir=workdir, - min_date=min_date, - max_date=max_date, - ) - if min_date != max_date: - f = functools.partial( + if min_date == max_date: + df = self.get_scores_by_date(workdir=workdir, date=min_date, query=query) + else: + resolver = functools.partial( self.get_scores_by_date, workdir=workdir, query=query, ) + dates = tuple(self.iter_dates(min_date, max_date)) with concurrent.futures.ThreadPoolExecutor() as executor: - dfs = executor.map(lambda date: f(date=date), self.iter_dates(min_date, max_date)) + dfs = executor.map(lambda d: resolver(date=d), dates) + + # If `drop_unchanged_scores` is True, only include scores that have changed since the last calculation. + if drop_unchanged_scores: + first = next(dfs) + changes = executor.map(lambda e: get_changed_scores(*e), util.iter_pairwise(dfs)) + dfs = itertools.chain([first], changes) + df = pl.concat(dfs) - else: - df = self.get_scores_by_date(workdir=workdir, date=min_date, query=query) df = df.sort(by=['date', 'cve'], descending=True) return df @@ -281,44 +261,6 @@ def get_scores_by_date( df = df.sort(by=['cve'], descending=True) return df - def get_score_changelog( - self, - workdir: str, - min_date: Optional[TIME] = None, - max_date: Optional[TIME] = None, - query: Optional[Query] = None) -> pl.DataFrame: - - dates = self.iter_dates(min_date, max_date) - f = functools.partial( - self.get_scores_by_date, - workdir=workdir, - query=query, - ) - with concurrent.futures.ThreadPoolExecutor() as executor: - dfs = executor.map(lambda d: f(date=d), dates) - first = next(dfs) - changes = executor.map(lambda e: get_changed_scores(*e), util.iter_pairwise(dfs)) - dfs = itertools.chain([first], changes) - df = pl.concat(dfs) - df = df.sort(by=['date', 'cve'], descending=True) - return df - - def get_score_changelog_by_date( - self, - workdir: str, - date: Optional[TIME] = None, - query: Optional[Query] = None) -> pl.DataFrame: - - date = util.parse_date(date) - previous_date = date - datetime.timedelta(days=1) - if previous_date < self.min_date: - raise ValueError(f'No scores available for {previous_date.isoformat()}') - - a = self.get_scores_by_date(workdir=workdir, date=date, query=query) - b = self.get_scores_by_date(workdir=workdir, date=previous_date, query=query) - d = get_changed_scores(a, b) - return d - def filter_scores(self, df: pl.DataFrame, query: Query) -> pl.DataFrame: min_date, max_date = self.get_date_range() df = df.filter(pl.col('date') >= min_date) diff --git a/examples/get-changed-scores-as-polars-dataframe.py b/examples/get-changed-scores-as-polars-dataframe.py new file mode 100644 index 0000000..288b5da --- /dev/null +++ b/examples/get-changed-scores-as-polars-dataframe.py @@ -0,0 +1,21 @@ +from epss.client import PolarsClient + +import polars as pl +import logging +import tempfile +import os + +cfg = pl.Config() +cfg.set_tbl_rows(-1) # Unlimited output length + +logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(levelname)s %(name)s %(message)s') + +WORKDIR = os.path.join(tempfile.gettempdir(), 'epss') + +client = PolarsClient( + 
+    include_v1_scores=False,
+    include_v2_scores=False,
+    include_v3_scores=True,
+)
+df = client.get_scores(workdir=WORKDIR, drop_unchanged_scores=True)
+print(df)
diff --git a/examples/get-scores-as-polars-dataframe.py b/examples/get-scores-as-polars-dataframe.py
new file mode 100644
index 0000000..5fe8392
--- /dev/null
+++ b/examples/get-scores-as-polars-dataframe.py
@@ -0,0 +1,21 @@
+from epss.client import PolarsClient
+
+import polars as pl
+import logging
+import tempfile
+import os
+
+cfg = pl.Config()
+cfg.set_tbl_rows(-1) # Unlimited output length
+
+logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(levelname)s %(name)s %(message)s')
+
+WORKDIR = os.path.join(tempfile.gettempdir(), 'epss')
+
+client = PolarsClient(
+    include_v1_scores=False,
+    include_v2_scores=False,
+    include_v3_scores=True,
+)
+df = client.get_scores(workdir=WORKDIR, drop_unchanged_scores=False)
+print(df)
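Note: the refactored `PolarsClient.get_scores` above depends on two helpers that are not shown in this diff, `util.iter_pairwise` and `get_changed_scores`. The sketch below is only an illustration of how such helpers could behave, not the repository's implementation; it assumes the per-day frames expose `cve` and `epss` columns and that each pair is ordered `(previous, current)`.

```
import itertools
from typing import Iterable, Iterator, Tuple, TypeVar

import polars as pl

T = TypeVar('T')


def iter_pairwise(iterable: Iterable[T]) -> Iterator[Tuple[T, T]]:
    # Yield overlapping pairs (s0, s1), (s1, s2), ... (same idea as itertools.pairwise).
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)


def get_changed_scores(previous: pl.DataFrame, current: pl.DataFrame) -> pl.DataFrame:
    # Keep only the rows of `current` whose EPSS score differs from the score recorded
    # in `previous` for the same CVE ID; CVE IDs that are new in `current` are kept too.
    old = previous.select('cve', pl.col('epss').alias('previous_epss'))
    merged = current.join(old, on='cve', how='left')
    changed = merged.filter(
        pl.col('previous_epss').is_null() | (pl.col('epss') != pl.col('previous_epss'))
    )
    return changed.drop('previous_epss')
```

With helpers along these lines, chaining the first day's frame with the pairwise diffs (as the new `drop_unchanged_scores` branch of `get_scores` does) yields one full snapshot followed by per-day changes only.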