-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmark-lookup.ts
155 lines (141 loc) · 5.35 KB
/
benchmark-lookup.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import { db } from "@/server/db";
import { kernScores, kernPatterns, kernPatternMatches} from "@/server/db/schema";
import dotenv from 'dotenv';
import { meiToRegex } from "@/common/meiToRegex";
import { desc, sql } from "drizzle-orm";
import { generatePatterns } from "pattern-generator";
import {checkAndReturnPattern, sequentialScan} from "@/server/api/routers/search";
import fs from 'fs';
dotenv.config();
async function benchmarkLookup() {
// this function benchmarks the lookup of each pattern in the db
const patterns = generatePatterns();
const regex_patterns = patterns.map((pattern) => meiToRegex(pattern));
const report_seq_scan = await sequentialPatternSearch(regex_patterns);
const report_inv_index = await invertedIndexPatternSearch(regex_patterns);
const final_report = {
sequential_scan: report_seq_scan,
inverted_index: report_inv_index,
}
return final_report;
}
class MedianTracker {
// class to keep track of the median of a stream of numbers
// using two heaps
// https://www.geeksforgeeks.org/median-of-stream-of-integers-running-integers/
// this isn't actually a heap lol but it's fine
private lowerHeap: number[];
private higherHeap: number[];
constructor() {
this.lowerHeap = [];
this.higherHeap = [];
}
addNum(num: number) {
if (this.lowerHeap.length == 0 || num < this.lowerHeap[0]!) {
this.lowerHeap.push(num);
this.lowerHeap.sort((a, b) => b - a);
} else {
this.higherHeap.push(num);
this.higherHeap.sort((a, b) => a - b);
}
if (this.lowerHeap.length > this.higherHeap.length + 1) {
let num = this.lowerHeap.shift();
this.higherHeap.push(num!);
this.higherHeap.sort((a, b) => a - b);
} else if (this.higherHeap.length > this.lowerHeap.length) {
let num = this.higherHeap.shift();
this.lowerHeap.push(num!);
this.lowerHeap.sort((a, b) => b - a);
}
}
getMedian() {
if (this.lowerHeap.length == 0) {
return 0;
}
if (this.lowerHeap.length == this.higherHeap.length) {
return (this.lowerHeap[0]! + this.higherHeap[0]!) / 2;
} else {
return this.lowerHeap[0];
}
}
}
async function sequentialPatternSearch(patterns: string[]) {
// patterns in regex format
// this function searches for each pattern sequentially
// measurements are all in milliseconds
const measurements = {
min_time: Number.MAX_SAFE_INTEGER,
max_time: Number.MIN_SAFE_INTEGER,
count: 0,
sum_of_times: 0,
}
const medianTracker = new MedianTracker();
const match_count_and_time = [];
for (let pattern of patterns) {
console.log(`searching for pattern ${pattern}`);
let start = Date.now();
let results = await sequentialScan(pattern, db);
let end = Date.now();
let time = end - start;
measurements.min_time = Math.min(measurements.min_time, time);
measurements.max_time = Math.max(measurements.max_time, time);
measurements.sum_of_times += time;
measurements.count += 1;
medianTracker.addNum(time);
match_count_and_time.push([pattern, results.length, time]);
}
report_to_csv("sequential_scan_report.csv", match_count_and_time);
const report = {
min_time: measurements.min_time,
max_time: measurements.max_time,
avg_time: measurements.sum_of_times / measurements.count,
median_time: medianTracker.getMedian(),
}
return report;
}
async function invertedIndexPatternSearch(patterns: string[]) {
// patterns in regex format
// this function searches for each pattern using the inverted index
// measurements are all in milliseconds
const measurements = {
min_time: Number.MAX_SAFE_INTEGER,
max_time: Number.MIN_SAFE_INTEGER,
count: 0,
sum_of_times: 0,
}
const medianTracker = new MedianTracker();
const match_count_and_time = [];
for (let pattern of patterns) {
console.log(`inv index searching for pattern ${pattern}`);
let start = Date.now();
let results = await checkAndReturnPattern(pattern, db);
let end = Date.now();
let time = end - start;
measurements.min_time = Math.min(measurements.min_time, time);
measurements.max_time = Math.max(measurements.max_time, time);
measurements.sum_of_times += time;
measurements.count += 1;
medianTracker.addNum(time);
match_count_and_time.push([pattern, results.length, time]);
}
report_to_csv("inverted_index_report.csv", match_count_and_time);
const report = {
min_time: measurements.min_time,
max_time: measurements.max_time,
avg_time: measurements.sum_of_times / measurements.count,
median_time: medianTracker.getMedian(),
}
return report;
}
function report_to_csv(filename: string, match_count_and_time: any) {
// this function writes the report to a csv file
let csv = "match_count, time_taken\n";
for (let [_, match_count, time_taken] of match_count_and_time) {
csv += `${match_count}, ${time_taken}\n`;
}
fs.writeFileSync(filename, csv);
}
// run the benchmark
let result = await benchmarkLookup();
console.log(result);
process.exit(0);