Update README

whitfieldsdad · Feb 4, 2024 · e4ba4a9 · e4ba4a9
1 parent 4da890f
commit e4ba4a9
Show file tree

Hide file tree

Showing 4 changed files with 125 additions and 82 deletions.
diff --git a/README.md b/README.md
@@ -2,15 +2,74 @@
 
 This repository contains a lightning-fast [Python 3 module](epss) and a series of [bash scripts](scripts) that are designed to make it easy for anyone to work with the daily outputs of the [Exploit Prediction Scoring System (EPSS)](https://www.first.org/epss/).
 
-🚧👷 Under construction 🏗️🚧 
-
 ## Features
 
-- Explore EPSS scores using [Polars](https://pola.rs/), a lightning-fast dataframe library written in Rust
-- Idempotently download daily sets of EPSS scores<sub>1</sub> in JSON, JSONL, CSV, or [Parquet](https://parquet.apache.org/) format<sub>2</sub>
-- Easily identify changes between two or more sets of EPSS scores
-- Translate sets of EPSS scores into sparse matrices to allow for the easy identification of changes to one or more computer security vulnerabilities on a daily or per CVE ID basis.
+- Idempotently download daily sets of EPSS scores<sub>1</sub> in JSON, JSONL, CSV, or [Apache Parquet](https://parquet.apache.org/)<sub>2</sub> format
+- Explore EPSS scores as either sparse or dense matrices using [Polars](https://pola.rs/), a lightning-fast dataframe library written in Rust
+- [Easily](examples/get-scores-as-polars-dataframe.py) [switch](examples/get-changed-scores-as-polars-dataframe.py) between different versions<sub>3</sub> of the [EPSS model](https://www.first.org/epss/model)
 
 <sub>1. By default, EPSS scores will be downloaded from 2023-03-07 onward, as this is the date when the outputs of EPSS v3 (v2023.03.01) were first published.</sub>
 
 <sub>2. Apache Parquet is the default file format.</sub>
+
+<sub>3. There have been three major versions of the EPSS model: [EPSS v1](https://arxiv.org/abs/1908.04856), EPSS v2 (v2022.01.01), and [EPSS v3 (v2023.03.01)](https://arxiv.org/abs/2302.14172). Each version introduced major improvements over its predecessor.</sub>
+
+## Background
+
+The Exploit Prediction Scoring System (EPSS) is a probabilistic [model](https://www.first.org/epss/model) that is designed to predict the likelihood of a given computer security vulnerability being exploited somewhere in the wild within the next 30 days.
+
+The first version of the EPSS model was released in 2021, and it has since undergone two major revisions.
+
+The first version of the EPSS model used logistic regression, but subsequent models have used [gradient-boosted decision trees](https://en.wikipedia.org/wiki/Gradient_boosting) ([XGBoost](https://en.wikipedia.org/wiki/XGBoost)) to make predictions.
+
+For additional information on EPSS and its applications, please consult the following resources:
+
+- [Exploit Prediction Scoring System (EPSS)](https://arxiv.org/abs/1908.04856)
+- [Enhancing Vulnerability Prioritization: Data-Driven Exploit Predictions with Community-Driven Insights](https://arxiv.org/abs/2302.14172)
+
+Additional resources:
+
+- [Daily analysis of EPSS scores](https://www.first.org/epss/data_stats)
+- [The Exploit Prediction Scoring System (EPSS) Explained](https://www.splunk.com/en_us/blog/learn/epss-exploit-prediction-scoring-system.html#:~:text=In%20short%2C%20EPSS%20allows%20us,vulnerability%20might%20be%20if%20exploited.)
+- [F5 Labs Joins the Exploit Prediction Scoring System as a Data Partner](https://www.f5.com/labs/articles/cisotociso/f5-labs-joins-the-exploit-prediction-scoring-system-as-a-data-partner)
+
+## Usage
+
+### Developers
+
+This package is not currently available on PyPI, but it can easily be added to your project in one of two ways:
+
+- Using `poetry`<sub>1</sub>:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git
+```
+
+By branch:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git#main
+```
+
+By tag:
+
+```
+poetry add git+https://github.com/whitfieldsdad/epss.git#v3.0.0
+```
+
+
+- Using `requirements.txt`:
+
+By tag:
+
+```
+git+https://github.com/whitfieldsdad/epss.git@v3.0.0
+```
+
+By branch:
+
+```
+git+https://github.com/whitfieldsdad/epss.git@main
+```
+
+<sub>1. The recommended approach is to manage dependencies with Poetry and to add this project as a dependency without explicitly specifying a branch or tag.</sub>
diff --git a/epss/client.py b/epss/client.py
@@ -35,9 +35,10 @@ def get_scores(
             workdir: str,
             min_date: Optional[TIME] = None, 
             max_date: Optional[TIME] = None,
-            query: Optional[Query] = None) -> Any:
+            query: Optional[Query] = None,
+            drop_unchanged_scores: bool = True) -> Any:
         """
-        Returns a dataframe containing EPSS scores published between the specified dates including scores that have not changed since the last calculation.        
+        Returns a dataframe containing EPSS scores published between the specified dates.
         
         The dataframe will be sorted by date and CVE ID in descending order.
         """
@@ -54,31 +55,6 @@ def get_scores_by_date(
         The dataframe will be sorted by CVE ID in descending order.
         """
         raise NotImplementedError()
-
-    def get_score_changelog(
-            self,
-            workdir: str,
-            min_date: Optional[TIME] = None, 
-            max_date: Optional[TIME] = None,
-            query: Optional[Query] = None) -> Any:
-        """
-        Returns a dataframe containing the changes to EPSS scores published between the specified dates.
-
-        The dataframe will be sorted by date and CVE ID in descending order.
-        """
-        raise NotImplementedError()
-
-    def get_score_changelog_by_date(
-            self,
-            workdir: str, 
-            date: Optional[TIME] = None,
-            query: Optional[Query] = None) -> Any:
-        """
-        Returns a dataframe containing the changes to EPSS scores published on the specified date.
-
-        The dataframe will be sorted by CVE ID in descending order.
-        """
-        raise NotImplementedError()
 
 
 @dataclass()
@@ -229,27 +205,31 @@ def get_scores(
             workdir: str,
             min_date: Optional[TIME] = None, 
             max_date: Optional[TIME] = None,
-            query: Optional[Query] = None) -> pl.DataFrame:
+            query: Optional[Query] = None,
+            drop_unchanged_scores: bool = True) -> pl.DataFrame:
 
         min_date, max_date = self.get_date_range(min_date, max_date)
         logger.info('Reading scores for %s - %s', min_date.isoformat(), max_date.isoformat())
 
-        self.download_scores(
-            workdir=workdir, 
-            min_date=min_date, 
-            max_date=max_date,
-        )
-        if min_date != max_date:
-            f = functools.partial(
+        if min_date == max_date:
+            df = self.get_scores_by_date(workdir=workdir, date=min_date, query=query)
+        else:
+            resolver = functools.partial(
                 self.get_scores_by_date,
                 workdir=workdir,
                 query=query,
             )
+            dates = tuple(self.iter_dates(min_date, max_date))
             with concurrent.futures.ThreadPoolExecutor() as executor:
-                dfs = executor.map(lambda date: f(date=date), self.iter_dates(min_date, max_date))
+                dfs = executor.map(lambda d: resolver(date=d), dates)
+
+                # If `drop_unchanged_scores` is True, only include scores that have changed since the last calculation.
+                if drop_unchanged_scores:
+                    first = next(dfs)
+                    changes = executor.map(lambda e: get_changed_scores(*e), util.iter_pairwise(dfs))
+                    dfs = itertools.chain([first], changes)
+
                 df = pl.concat(dfs)
-        else:
-            df = self.get_scores_by_date(workdir=workdir, date=min_date, query=query)
 
         df = df.sort(by=['date', 'cve'], descending=True)
         return df
@@ -281,44 +261,6 @@ def get_scores_by_date(
         df = df.sort(by=['cve'], descending=True)
         return df
 
-    def get_score_changelog(
-            self,
-            workdir: str, 
-            min_date: Optional[TIME] = None, 
-            max_date: Optional[TIME] = None,
-            query: Optional[Query] = None) -> pl.DataFrame:
-
-        dates = self.iter_dates(min_date, max_date)
-        f = functools.partial(
-            self.get_scores_by_date,
-            workdir=workdir,
-            query=query,
-        )
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            dfs = executor.map(lambda d: f(date=d), dates)
-            first = next(dfs)
-            changes = executor.map(lambda e: get_changed_scores(*e), util.iter_pairwise(dfs))
-            dfs = itertools.chain([first], changes)
-            df = pl.concat(dfs)
-            df = df.sort(by=['date', 'cve'], descending=True)
-            return df
-
-    def get_score_changelog_by_date(
-            self,
-            workdir: str, 
-            date: Optional[TIME] = None,
-            query: Optional[Query] = None) -> pl.DataFrame:
-
-        date = util.parse_date(date)
-        previous_date = date - datetime.timedelta(days=1)
-        if previous_date < self.min_date:
-            raise ValueError(f'No scores available for {previous_date.isoformat()}')
-
-        a = self.get_scores_by_date(workdir=workdir, date=date, query=query)
-        b = self.get_scores_by_date(workdir=workdir, date=previous_date, query=query)
-        d = get_changed_scores(a, b)
-        return d
-
     def filter_scores(self, df: pl.DataFrame, query: Query) -> pl.DataFrame:
         min_date, max_date = self.get_date_range()
         df = df.filter(pl.col('date') >= min_date)

diff --git a/examples/get-changed-scores-as-polars-dataframe.py b/examples/get-changed-scores-as-polars-dataframe.py
@@ -0,0 +1,21 @@
+from epss.client import PolarsClient
+
+import polars as pl
+import logging
+import tempfile
+import os
+
+cfg = pl.Config()
+cfg.set_tbl_rows(-1)    # Unlimited output length
+
+logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(levelname)s %(name)s %(message)s')
+
+WORKDIR = os.path.join(tempfile.gettempdir(), 'epss')
+
+client = PolarsClient(
+    include_v1_scores=False,
+    include_v2_scores=False,
+    include_v3_scores=True,
+)
+df = client.get_scores(workdir=WORKDIR, drop_unchanged_scores=True)
+print(df)
diff --git a/examples/get-scores-as-polars-dataframe.py b/examples/get-scores-as-polars-dataframe.py
@@ -0,0 +1,21 @@
+from epss.client import PolarsClient
+
+import polars as pl
+import logging
+import tempfile
+import os
+
+cfg = pl.Config()
+cfg.set_tbl_rows(-1)    # Unlimited output length
+
+logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(levelname)s %(name)s %(message)s')
+
+WORKDIR = os.path.join(tempfile.gettempdir(), 'epss')
+
+client = PolarsClient(
+    include_v1_scores=False,
+    include_v2_scores=False,
+    include_v3_scores=True,
+)
+df = client.get_scores(workdir=WORKDIR, drop_unchanged_scores=False)
+print(df)