vipyrsec · jonathan-d-zhang · Aug 2, 2024 · Aug 2, 2024 · Aug 4, 2024 · Aug 4, 2024
diff --git a/.github/workflows/mdbook.yaml b/.github/workflows/mdbook.yaml
@@ -0,0 +1,58 @@
+# Builds the mdBook documentation on every pull request and on pushes to
+# `main`; pushes to `main` additionally deploy the result to GitHub Pages.
+name: Build mdbook
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  build-book:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: "Checkout repository"
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1
+
+      # Downloads the pinned mdBook release binary into ./bin.
+      - name: "Install mdbook"
+        run: |
+          # Fail the step if any stage of the download pipeline fails.
+          set -euo pipefail
+          mkdir -p bin
+          # --fail makes curl exit non-zero on HTTP errors instead of piping an
+          # error page into tar.
+          curl --fail -sSL https://github.com/rust-lang/mdBook/releases/download/v0.4.40/mdbook-v0.4.40-x86_64-unknown-linux-gnu.tar.gz | tar -xz --directory=bin
+
+      - name: "Build book"
+        run: bin/mdbook build
+
+      # `book.toml` sets `build-dir = "target/book"`, so that is what gets published.
+      - name: "Upload artifact"
+        uses: actions/upload-pages-artifact@0252fc4ba7626f0298f0cf00902a25c6afc77fa8  # v3.0.0
+        with:
+          path: ./target/book
+
+  book-deploy:
+    needs: build-book
+
+    # Only deploy from `main`; pull requests stop after the build job.
+    if: github.ref == 'refs/heads/main'
+
+    # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+    permissions:
+      contents: read
+      pages: write
+      id-token: write
+
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Deploy docs to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@87c3283f01cd6fe19a0ab93a23b2f6fcba5a8e42  # v4.0.3
diff --git a/.github/workflows/rust-ci.yaml b/.github/workflows/rust-ci.yaml
@@ -144,10 +144,12 @@ jobs:
       - name: Remove '.lock' file
         run: rm ./target/doc/.lock
 
+      # Commented out to test mdbook
+      #
       # `cargo doc` builds the docs without a top level `index.html`, so add
       # a barebones one to redirect to the generated one.
-      - name: Add redirect index.html
-        run: echo '<meta http-equiv="refresh" content="0; url=dragonfly_client_rs">' > target/doc/index.html
+      # - name: "Add redirect index.html"
+      #   run: echo '<meta http-equiv="refresh" content="0; url=dragonfly_client_rs">' > target/doc/index.html
 
       - name: Upload artifact
         uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa  # v3.0.1

diff --git a/README.md b/README.md
@@ -1,180 +1,44 @@
 # dragonfly-client-rs
 
-Modular compute nodes capable of scanning packages and sending results upstream
-to a control server, written in Rust.
+`dragonfly-client-rs` uses [YARA](https://virustotal.github.io/yara/) to scan code pulled from the [Python Package
+Index](https://pypi.org/) (PyPI). It polls for work from [`dragonfly-mainframe`](https://github.com/vipyrsec/dragonfly-mainframe).
 
-## Set up
+## Running `dragonfly-client-rs`
 
-This section goes over how to set up a client instance locally and via Docker.
-
-> Refer to the [Environment variables](#environment-variables) section for
-> information on what environment variables are necessary.
-
-### Local
-
-#### Requirements
-
-- [Rust](https://www.rust-lang.org/tools/install)
-- [YARA](https://yara.readthedocs.io/en/stable/gettingstarted.html#compiling-and-installing-yara)
-
-#### 1. Set the appropriate environment variable pointing to the YARA installation
-
-```bash
-export YARA_LIBRARY_PATH='/path/to/yara/libs'
-```
-
-#### 2. Build the binary with `cargo`
-
-```bash
-cargo build --release
-```
-
-#### 3. Run the built binary
-
-```bash
-./target/release/dragonfly-client-rs
-```
-
-### Docker
-
-#### Requirements
-
-- [Docker Engine](https://docs.docker.com/engine/install/)
-
-#### 1. Build and tag the image
-
-```bash
-docker build --tag vipyrsec/dragonfly-client-rs:latest .
-```
-
-#### 2. Run the container
-
-```bash
-docker run --name dragonfly-client-rs vipyrsec/dragonfly-client-rs:latest
-```
-
-### Docker Compose
-
-#### Requirements
+## Requirements
 
 - [Docker Engine](https://docs.docker.com/engine/install/)
 - [Docker Compose](https://docs.docker.com/compose/install/)
+- [Environment Variables](#environment-variables)
 
-#### Run the service
+## Run
 
 ```bash
 docker compose up
 ```
 
-### How it works: Overview
-
-The follow is a brief overview of how the client works. A more extensive
-writeup can be found towards the bottom of this page.
-
-The client is comprised of a few discrete components, each running
-independently. These are the scanning threadpool, the loader thread, and the
-sender thread.
+Note: to build and run without Docker Compose, see [build locally](docs/building-locally.md).
 
-- The Scanning Threadpool - Downloads and scans the releases.
-- The Loader Thread - This thread is responsible for requesting jobs from the API and submitting them to the threadpool.
-
-### Performance, efficiency, and optimization
-
-The client aims to be highly configurable to suit a variety of host machines.
-The environment variables of most value in this regard are as follows:
-
-- `DRAGONFLY_THREADS` defaults to the number of available parallelism, or
-  1 if it could not be determined. [This
-  page](https://doc.rust-lang.org/stable/std/thread/fn.available_parallelism.html)
-  explains in detail how this is calculated, but in short, it is often the
-  number of compute cores a machine has. The client will spawn this many
-  threads in a threadpool executor to perform concurrent scanning of files.
-- `DRAGONFLY_LOAD_DURATION` defaults to `60` seconds. This is the frequency
-  with which the loader thread will send an HTTP API request to the Dragonfly
-  API requesting N amount of jobs (defined by `DRAGONFLY_BULK_SIZE`).
-- `DRAGONFLY_BULK_SIZE` defaults to `20`. This is the amount of jobs the loader
-  thread will request from the API at once. Setting this too high may mean the
-  scanner threads can't keep up, but setting this too low may mean that
-  more CPU time is wasted by idling. `DRAGONFLY_MAX_SCAN_SIZE` defaults to
-- `128000000`. The maximum size of downloaded distributions, in bytes. Setting
-  this too high may cause clients with low memory to run out of memory and
-  crash, setting it too low may mean most packages are not scanned (due to
-  being above the size limit).
-
-Many of these options have disadvantages to setting these options to any
-extreme (too high or too low), so it's important to tweak it to a good middle
-ground that works best in your environment. However, we have tried our best to
-provide sensible defaults that will work reasonably efficiently: 20 jobs are
-requested from the API every 60 seconds.
-
-### How it works: Detailed Breakdown
-
-This section attempts to describe in detail how the client works under the
-hood, and how the various configuration parameters come into play.
-
-The client can be broken down into a few discrete components: The scanner
-threads, the loader thread, the sender thread. We will first explore in detail
-the workings of each of these components in isolation and then how they all fit
-together.
-
-The scanner thread(s) are what do most of the heavy lifting. They use bindings
-to the C YARA library, and most of this code can be found in `scanner.rs`. The
-way this program models PyPI data structure is as so: There are "packages" (or
-"releases") which is a name/version specifier combination. These "packages" are
-comprised of several "distributions" in the form of gzipped tarballs or wheels
-(which behave similarly to zip files, hence the use of the `zip` crate). Each
-distribution is comprised of a flat sequence of files (the hierarchical nature
-of the traditional file/folder system has been flatted for our use case). The
-main entry point interface to the scanner logic is via the
-`scan_all_distribution`. This loops over the download URLs of each distribution
-of the given job, and attempts to download them. The maximum size of these
-downloads, in bytes, is controlled by the `DRAGONFLY_MAX_SIZE` environment
-variable (128MB by default) Then, for each distribution downloaded, we loop
-over each file in that distribution, load it into memory, and apply the
-compiled YARA rules stored in memory against the file contents (this is done by
-the underlying C YARA library). Then, the results of each files is stored in
-a "distribution scan result" struct that represents the scan results of
-a single distribution. This process is repeated for all the distributions in
-a package, and are aggregated into a "package scan result" struct. This model
-highly reflects PyPI's model of "package -> distributions -> files". This
-process allows us to start with the download URLs of each distribution of
-a package, and end with the scan results of each file of each distribution of
-the given package.
+### Environment variables
 
-The loader thread's primary responsibility is to request a bunch of jobs from
-the API and spawn threadpool tasks on a timer. It will perform a "bulk job
-request" (`POST /jobs`) API request to retrieve N jobs from the API, where
-N can be configured via the `DRAGONFLY_BULK_SIZE` environment variable. The
-client will make these bulk requests at an interval defined by
-the`DRAGONFLY_LOAD_DURATION` environment variable. The jobs returned by the API
-endpoint will then be spawned as tasks in the threadpool. This process repeats for
-the duration of the program.
+Variables without a default are **required**. For more information on how to use these, see [tuning](docs/tuning.md).
 
-The client starts up by first authenticating with Auth0 to obtain an access
-token. It then stores this access token in a shared-state thread
-synchronization primitive that allows multiple concurrent readers but only one
-writer. This new access token is used to fetch the YARA rules from the
-Dragonfly API. The source code of the YARA rules is compiled (very much like
-compiling regex) and stored in the shared state. Then, the necessary threads
-are spawned. Once the threadpool task has finished scanning, it will send
-it's results over the Dragonfly HTTP API.
+| Variable                  | Default                          | Description                                  |
+| ------------------------- | -------------------------------- | -------------------------------------------- |
+| `DRAGONFLY_BASE_URL`      | `https://dragonfly.vipyrsec.com` | The base API URL for the mainframe server    |
+| `DRAGONFLY_AUTH0_DOMAIN`  | `vipyrsec.us.auth0.com`          | The auth0 domain that requests go to         |
+| `DRAGONFLY_AUDIENCE`      | `https://dragonfly.vipyrsec.com` | Auth0 Audience field                         |
+| `DRAGONFLY_CLIENT_ID`     |                                  | Auth0 client ID                              |
+| `DRAGONFLY_CLIENT_SECRET` |                                  | Auth0 client secret                          |
+| `DRAGONFLY_USERNAME`      |                                  | Provisioned username                         |
+| `DRAGONFLY_PASSWORD`      |                                  | Provisioned password                         |
+| `DRAGONFLY_LOAD_DURATION` | `60`                             | Seconds to wait between each API job request |
+| `DRAGONFLY_MAX_SCAN_SIZE` | `128_000_000`                    | Maximum distribution size in bytes to scan   |
 
-### Environment variables
+## Building docs locally
 
-Below are a list of environment variables that need to be configured, and what
-they do
+We use [`mdbook`](https://rust-lang.github.io/mdBook/index.html) for docs as well as [`cargo
+doc`](https://doc.rust-lang.org/cargo/commands/cargo-doc.html). The easiest way to build locally:
 
-<!-- markdownlint-disable MD013 -->
-| Variable                  | Default                          | Description                                                                     |
-| ------------------------- | -------------------------------- | ------------------------------------------------------------------------------- |
-| `DRAGONFLY_BASE_URL`      | `https://dragonfly.vipyrsec.com` | The base API URL for the mainframe server                                       |
-| `DRAGONFLY_AUTH0_DOMAIN`  | `vipyrsec.us.auth0.com`          | The auth0 domain that requests go to                                            |
-| `DRAGONFLY_AUDIENCE`      | `https://dragonfly.vipyrsec.com` | Auth0 Audience field                                                            |
-| `DRAGONFLY_CLIENT_ID`     |                                  | Auth0 client ID                                                                 |
-| `DRAGONFLY_CLIENT_SECRET` |                                  | Auth0 client secret                                                             |
-| `DRAGONFLY_USERNAME`      |                                  | Provisioned username                                                            |
-| `DRAGONFLY_PASSWORD`      |                                  | Provisioned password                                                            |
-| `DRAGONFLY_THREADS`       | Available parallelism / `1`      | Attempts to auto-detect the amount of threads, or defaults to 1 if not possible |
-| `DRAGONFLY_LOAD_DURATION` | 60                               | Seconds to wait between each API job request                                    |
-| `DRAGONFLY_BULK_SIZE`     | 20                               | The amount of jobs to request at once                                           |
-<!-- markdownlint-enable MD013 -->
+- For `docs/`: `mdbook serve`
+- For doc comments: `cargo doc --document-private-items --no-deps`
diff --git a/book.toml b/book.toml
@@ -0,0 +1,13 @@
+[book]  # mdBook configuration for the project documentation
+authors = ["Vipyr Security"]
+language = "en"
+multilingual = false
+src = "docs"  # Markdown sources (including SUMMARY.md) live in docs/
+title = "Docs"
+description = "Documentation for the Dragonfly Client"
+
+[rust]
+edition = "2021"
+
+[build]
+build-dir = "target/book"  # must match the path uploaded by .github/workflows/mdbook.yaml
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
@@ -0,0 +1,5 @@
+# Summary
+
+- [Overview](./overview.md)
+- [Running locally without Docker](./building-locally.md)
+- [Tuning Environment Variables](./tuning.md)
diff --git a/docs/building-locally.md b/docs/building-locally.md
@@ -0,0 +1,22 @@
+# How to Build and Run `dragonfly-client-rs` Locally
+## Requirements
+
+- [Rust](https://www.rust-lang.org/tools/install)
+- [YARA](https://yara.readthedocs.io/en/stable/gettingstarted.html#compiling-and-installing-yara)
+
+## Set the appropriate environment variable pointing to the YARA installation
+```bash
+export YARA_LIBRARY_PATH='/path/to/yara/libs'
+```
+
+## Build the binary with `cargo`
+
+```bash
+cargo build --release
+```
+
+## Run the built binary
+
+```bash
+./target/release/dragonfly-client-rs
+```
diff --git a/docs/overview.md b/docs/overview.md
@@ -0,0 +1,24 @@
+# Overview
+
+`dragonfly-client-rs` uses [YARA](https://virustotal.github.io/yara/) to scan
+code pulled from the [Python Package Index](https://pypi.org/) (PyPI). It polls
+for work from
+[`dragonfly-mainframe`][1].
+
+`dragonfly-client-rs` runs a main loop which does the following:
+
+* Authenticate using OAuth2
+* Fetch a job from [`dragonfly-mainframe`][1], which consists of a package to
+  scan
+* Scan the package
+* Report the results
+
+## Scanning
+
+Packages are scanned using YARA.
+
+## Reporting Results
+
+An HTTP request is sent to [`dragonfly-mainframe`][1].
+
+[1]: https://github.com/vipyrsec/dragonfly-mainframe
diff --git a/docs/tuning.md b/docs/tuning.md
@@ -0,0 +1,17 @@
+# Tuning `dragonfly-client-rs`
+
+Describes the configuration options in more detail.
+
+## `DRAGONFLY_LOAD_DURATION`
+
+Defaults to `60` seconds.
+
+The time to wait between failed job requests, in seconds.
+
+## `DRAGONFLY_MAX_SCAN_SIZE`
+Defaults to `128_000_000` (128 MB).
+
+The maximum size of downloaded distributions, in bytes. Setting this too high
+may cause clients with low memory to run out of memory and crash; setting it
+too low may mean most packages are not scanned (due to being above the size
+limit).
diff --git a/src/app_config.rs b/src/app_config.rs
@@ -8,9 +8,7 @@ use serde::{Deserialize, Serialize};
 #[derive(Serialize, Deserialize)]
 pub struct AppConfig {
     pub base_url: String,
-    pub threads: usize,
     pub load_duration: u64,
-    pub bulk_size: usize,
     pub auth0_domain: String,
     pub client_id: String,
     pub client_secret: String,
@@ -23,10 +21,6 @@ pub struct AppConfig {
 
 impl Default for AppConfig {
     fn default() -> Self {
-        let available_parallelism = std::thread::available_parallelism()
-            .map(usize::from)
-            .unwrap_or(1);
-
         #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
         AppConfig {
             base_url: String::from("https://dragonfly.vipyrsec.com"),
@@ -37,8 +31,6 @@ impl Default for AppConfig {
             client_secret: String::new(),
             username: String::new(),
             password: String::new(),
-            threads: available_parallelism,
-            bulk_size: 20,
             load_duration: 60,
             max_scan_size: 1.28e+8 as u64, // 128 MB
         }