Skip to content

Commit

Permalink
feat: Add synthetic data and model generation for performance testing
Browse files Browse the repository at this point in the history
  • Loading branch information
KingMob committed Jun 21, 2024
1 parent a36e228 commit 7d15919
Showing 1 changed file with 203 additions and 0 deletions.
203 changes: 203 additions & 0 deletions perf/gensql/query/perf/synthetic.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
(ns gensql.query.perf.synthetic
"For generating artificial data and models, of varying ecological validity."
(:require [clojure.math :as math]
[gensql.inference.gpm.crosscat :as xcat]
[gensql.query.db :as db]
[medley.core :as medley]))

;; Name prefixes for generated views and columns. `name-vec` appends a
;; zero-padded index to these to build the actual names.
(def ^:const view-prefix "view-")
(def ^:const col-prefix "col-")

;; Fallback hyperparameters. `generate-db` merges these underneath the
;; caller-supplied options, so any of them can be overridden per call; they
;; are dynamic, so they can also be rebound with `binding`.
;; The alphas are used, respectively, as the :alpha hyper of categorical
;; columns, the :alpha of each view's local latents, and the global :alpha.
(def ^:dynamic *default-categorical-alpha* 0.01)
(def ^:dynamic *default-local-alpha* 0.01)
(def ^:dynamic *default-global-alpha* 0.01)
;; m, r, s and nu are passed through as the hypers of gaussian columns.
(def ^:dynamic *default-m* 1)
(def ^:dynamic *default-r* 2)
(def ^:dynamic *default-s* 3)
(def ^:dynamic *default-nu* 4)

;; Names under which `generate-db` registers the synthetic table and model.
(def ^:dynamic *default-table-name* "table")
(def ^:dynamic *default-model-name* "model")

(defn num-digits-needed
  "Returns how many decimal digits are needed to write every index in
  [0, max-n), i.e. the zero-padded width to use for `max-n` numbered names.
  Use for formatting names without too many zeros all over the place.

  Always returns at least 1. For `max-n` <= 1 the logarithm is zero (or
  undefined), and a width of 0 would produce an invalid format directive
  such as \"%00d\", which makes `format` throw."
  [max-n]
  (if (<= max-n 1)
    1
    (int (math/ceil (math/log10 max-n)))))

(defn numbered-format
  "Builds a `format` pattern consisting of `prefix` followed by a zero-padded
  integer directive, e.g. \"prefix-%02d\". The padding width is whatever
  `num-digits-needed` reports for `n`."
  [prefix n]
  (let [width (num-digits-needed n)
        directive (str "%0" width "d")]
    (str prefix directive)))

(defn ^:private name-vec
  "Returns a vector of `num-names` strings, each made of `prefix` followed by
  a zero-padded index counting up from 0, e.g. [\"view-0\" \"view-1\"
  \"view-2\"]."
  [prefix num-names]
  (let [pattern (numbered-format prefix num-names)]
    (into []
          (map (fn [idx] (format pattern idx)))
          (range num-names))))

(defn generate-data
  "Generates a synthetic dataset with `num-rows` rows and `num-columns`
  columns.

  `p-categorical` is the proportion of columns that are categorical; the rest
  are gaussian. Note that this is an enforced proportion, not a probability:
  round(num-columns * p-categorical) columns are categorical, placed at
  shuffled positions. Each categorical column gets between 2 and 10
  (inclusive) possible values.

  Returns a map with:
    :data         - vector of rows, each a map of column name -> value
    :column-types - vector of maps with :name and :type (:categorical or
                    :gaussian), plus :values for categorical columns"
  [num-rows num-columns p-categorical]
  (let [col-names (name-vec col-prefix num-columns)
        num-categorical-cols (int (math/round (* num-columns p-categorical)))
        generate-category-values (fn []
                                   ;; rand-int's upper bound is exclusive, so
                                   ;; 9 is required to reach 10 values.
                                   (name-vec "val-" (+ 2 (rand-int 9))))
        column-types (->> (range num-columns)
                          (map #(if (< % num-categorical-cols)
                                  {:type :categorical
                                   :values (generate-category-values)}
                                  {:type :gaussian}))
                          ;; Shuffle before naming so that the categorical
                          ;; columns are not always the lowest-numbered ones.
                          shuffle
                          (mapv #(assoc %2 :name %1) col-names))
        generate-value (fn [{:keys [type values]}]
                         (case type
                           ;; not really gaussian, but shouldn't matter for
                           ;; benchmarks
                           :gaussian (rand-int 100)
                           :categorical (rand-nth values)))
        generate-row (fn []
                       (zipmap col-names
                               (map generate-value column-types)))
        data (vec (repeatedly num-rows generate-row))]
    {:data data
     :column-types column-types}))


(defn ^:private partition-evenly
  "Divides xs into n groups whose sizes differ by at most one: every group has
  m elements, and the leftover elements are handed out one apiece to some of
  the groups, making those m+1.
  Neither the order of the groups nor the order inside a group is guaranteed."
  [n xs]
  (assert (<= n (count xs)))
  (let [total (count xs)
        base-size (int (math/floor (/ total n)))
        num-leftover (mod total n)
        [divisible leftovers] (split-at (* base-size n) xs)
        base-groups (partition base-size divisible)
        ;; Each leftover element is attached to one of the first groups.
        enlarged (map conj (take num-leftover base-groups) leftovers)
        untouched (drop num-leftover base-groups)]
    (reduce conj enlarged untouched)))


(defn spec-views
  "Builds the views portion of an XCat spec: a map of view name ->
  {:hypers {column-name -> hyperparameters}}.
  Columns are divided across the views as evenly as possible. That is not
  realistic, but it keeps benchmarks consistent.
  Categorical columns get {:alpha categorical-alpha}; every other column gets
  the gaussian hypers {:m m :r r :s s :nu nu}."
  [num-views column-types {:keys [categorical-alpha m r s nu]}]
  (let [hypers-for (fn [{col-type :type}]
                     (if (= :categorical col-type)
                       {:alpha categorical-alpha}
                       {:m m :r r :s s :nu nu}))
        view-hypers (fn [cols]
                      (into {} (map (juxt :name hypers-for)) cols))
        views (for [cols (partition-evenly num-views column-types)]
                {:hypers (view-hypers cols)})]
    (zipmap (name-vec view-prefix num-views) views)))


(defn latents-local
  "Creates the local (per-view) XCat latents for the model.

  Row indices 0..(count data)-1 are split as evenly as possible into
  `num-clusters-per-view` clusters, and every view receives that same
  assignment.

  Returns a map of view name ->
    {:alpha  alpha
     :counts {cluster-name -> number of rows in the cluster}
     :y      {row-index -> cluster-name}}"
  [data num-views num-clusters-per-view alpha]
  (let [view-names (name-vec view-prefix num-views)
        cluster-names (name-vec "clust-" num-clusters-per-view)
        clusters (zipmap cluster-names
                         (partition-evenly num-clusters-per-view
                                           (range (count data))))
        ;; The assignment is identical for every view, so build it once rather
        ;; than once per view; constructing :y is linear in the row count.
        view-latents {:alpha alpha
                      :counts (medley/map-vals count clusters)
                      :y (into {}
                               (for [[cl-name row-nums] clusters
                                     row-num row-nums]
                                 [row-num cl-name]))}]
    (zipmap view-names (repeat view-latents))))


(defn generate-model
  "Builds a CrossCat model for `data` via `xcat/construct-xcat-from-latents`.
  The spec (views and column types) and the latents (global alpha plus
  per-view cluster assignments) are synthesized from the arguments instead of
  being learned. The allowed values of each categorical column are passed
  along as the :options.
  NB: only categorical and gaussian primitives are currently supported."
  [data column-types num-views num-clusters-per-view {:keys [local-alpha global-alpha] :as opts}]
  (assert (<= num-clusters-per-view (count data)))
  (let [types-by-col (into {} (map (juxt :name :type)) column-types)
        options (into {}
                      (comp (filter #(= :categorical (:type %)))
                            (map (fn [col] [(:name col) (vec (:values col))])))
                      column-types)
        spec {:views (spec-views num-views column-types opts)
              :types types-by-col}
        latents {:global {:alpha global-alpha}
                 :local (latents-local data num-views num-clusters-per-view local-alpha)}]
    (xcat/construct-xcat-from-latents spec latents data {:options options})))


(defn generate-db
  "Builds a query database holding a fully synthetic table and an XCat model
  over it.
  Intended for benchmarking only: the data is largely random and the model is
  constructed directly, not trained on that data.
  Any hyperparameter option left out (:m, :r, :s, :nu, :categorical-alpha,
  :local-alpha, :global-alpha) falls back to the matching *default-...* var.
  The generated column types are attached to the db under
  :synthetic/column-types.
  Example usage:
  (def db (generate-db {:num-rows 100
                        :num-columns 10
                        :p-categorical 0.6
                        :num-views 3
                        :num-clusters-per-view 2}))"
  [{:keys [num-rows
           num-columns
           p-categorical
           num-views
           num-clusters-per-view
           m
           r
           s
           nu
           categorical-alpha
           local-alpha
           global-alpha]
    :as opts}]
  (let [defaults {:categorical-alpha *default-categorical-alpha*
                  :local-alpha *default-local-alpha*
                  :global-alpha *default-global-alpha*
                  :m *default-m*
                  :r *default-r*
                  :s *default-s*
                  :nu *default-nu*}
        ;; Caller-supplied options take precedence over the defaults.
        opts (merge defaults opts)
        generated (generate-data num-rows num-columns p-categorical)
        data (:data generated)
        column-types (:column-types generated)
        model (generate-model data column-types num-views num-clusters-per-view opts)
        db-with-table (db/with-table (db/empty) *default-table-name* data)
        db-with-model (db/with-model db-with-table *default-model-name* model)]
    (assoc db-with-model :synthetic/column-types column-types)))

0 comments on commit 7d15919

Please sign in to comment.