From 7088a8526118eab64dc37928075b68752f0984f5 Mon Sep 17 00:00:00 2001 From: Matt <77928207+mattzcarey@users.noreply.github.com> Date: Sat, 23 Dec 2023 17:21:32 +0000 Subject: [PATCH] feat: turbopuffer vectorstore --- .gitignore | 2 + packages/crgpt-loader/.env_example | 2 + packages/crgpt-loader/crgpt-loader.ts | 48 ---- packages/crgpt-loader/package.json | 6 +- packages/crgpt-loader/pnpm-lock.yaml | 8 + packages/crgpt-loader/src/constants.ts | 29 +++ packages/crgpt-loader/src/crgpt-loader.ts | 212 ++++++++++++++++++ packages/crgpt-loader/{ => src}/index.ts | 0 .../crgpt-loader/src/lc_wip/githubLoader.ts | 2 + .../src/lc_wip/turbopufferVectorStore.ts | 188 ++++++++++++++++ .../crgpt-loader/src/utils/executeCommand.ts | 23 ++ .../crgpt-loader/src/utils/getEnvVariable.ts | 7 + packages/crgpt-loader/src/utils/index.ts | 3 + packages/crgpt-loader/src/utils/openFile.ts | 11 + packages/crgpt-loader/src/utils/savePage.ts | 27 +++ packages/crgpt-loader/test.ts | 14 +- packages/crgpt-loader/tsconfig.json | 20 ++ 17 files changed, 551 insertions(+), 51 deletions(-) create mode 100644 packages/crgpt-loader/.env_example delete mode 100644 packages/crgpt-loader/crgpt-loader.ts create mode 100644 packages/crgpt-loader/src/constants.ts create mode 100644 packages/crgpt-loader/src/crgpt-loader.ts rename packages/crgpt-loader/{ => src}/index.ts (100%) create mode 100644 packages/crgpt-loader/src/lc_wip/githubLoader.ts create mode 100644 packages/crgpt-loader/src/lc_wip/turbopufferVectorStore.ts create mode 100644 packages/crgpt-loader/src/utils/executeCommand.ts create mode 100644 packages/crgpt-loader/src/utils/getEnvVariable.ts create mode 100644 packages/crgpt-loader/src/utils/index.ts create mode 100644 packages/crgpt-loader/src/utils/openFile.ts create mode 100644 packages/crgpt-loader/src/utils/savePage.ts create mode 100644 packages/crgpt-loader/tsconfig.json diff --git a/.gitignore b/.gitignore index 7d41e374..fcfd91a4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ dist !jest.config.js *.d.ts +**/data + cdk.context.json # nextjs and sst diff --git a/packages/crgpt-loader/.env_example b/packages/crgpt-loader/.env_example new file mode 100644 index 00000000..81707e6f --- /dev/null +++ b/packages/crgpt-loader/.env_example @@ -0,0 +1,2 @@ +OPENAI_API_KEY= +TURBOPUFFER_API_KEY= diff --git a/packages/crgpt-loader/crgpt-loader.ts b/packages/crgpt-loader/crgpt-loader.ts deleted file mode 100644 index 4850cbd2..00000000 --- a/packages/crgpt-loader/crgpt-loader.ts +++ /dev/null @@ -1,48 +0,0 @@ -import { exec } from "child_process"; -import { promises as fsPromises } from "fs"; -import os from "os"; -import join from "path"; - -export class CRGPTLoader { - private link: string; - - constructor(link: string) { - this.link = link; - } - - public async load(): Promise { - try { - // Create a temporary directory using the promises API - const tempDir = await fsPromises.mkdtemp( - join(os.tmpdir(), "CRGPTLoader-") - ); - - // Clone the repository with depth 1 to get only the latest state of the main branch - const cloneCommand = `git clone --depth 1 --filter=blob:none ${this.link} ${tempDir}`; - await this.executeCommand(cloneCommand); - - // Loop through the files in the cloned directory - const files = await fsPromises.readdir(tempDir); - for (const file of files) { - console.log(`${tempDir}/${file}`); - } - - // Delete the cloned files - await this.executeCommand(`rm -rf ${tempDir}`); - } catch (error) { - console.error("Error in CRGPTLoader:", error); - } - } - - private async executeCommand(command: string): Promise { - return new Promise((resolve, reject) => { - exec(command, (error, stdout, stderr) => { - if (error) { - reject(error); - return; - } - resolve(); - }); - }); - } -} diff --git a/packages/crgpt-loader/package.json b/packages/crgpt-loader/package.json index 0bd0aa34..85cd67c7 100644 --- a/packages/crgpt-loader/package.json +++ b/packages/crgpt-loader/package.json @@ -2,9 +2,10 @@ "name": "crgpt-loader", "version": "0.0.1", "description": "", - "main": "index.js", + "main": "dist/index.js", + "types": "dist/index.d.ts", "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" + "test": "ts-node test.ts" }, "keywords": [ "github", @@ -16,6 +17,7 @@ "author": "Matt Carey", "license": "MIT", "dependencies": { + "dotenv": "^16.3.1", "ignore": "^5.3.0", "langchain": "^0.0.204" }, diff --git a/packages/crgpt-loader/pnpm-lock.yaml b/packages/crgpt-loader/pnpm-lock.yaml index a276ba78..9b6e153b 100644 --- a/packages/crgpt-loader/pnpm-lock.yaml +++ b/packages/crgpt-loader/pnpm-lock.yaml @@ -5,6 +5,9 @@ settings: excludeLinksFromLockfile: false dependencies: + dotenv: + specifier: ^16.3.1 + version: 16.3.1 ignore: specifier: ^5.3.0 version: 5.3.0 @@ -489,6 +492,11 @@ packages: md5: 2.3.0 dev: false + /dotenv@16.3.1: + resolution: {integrity: sha512-IPzF4w4/Rd94bA9imS68tZBaYyBWSCE47V1RGuMrB94iyTOIEwRmVL2x/4An+6mETpLrKJ5hQkB8W4kFAadeIQ==} + engines: {node: '>=12'} + dev: false + /event-target-shim@5.0.1: resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==} engines: {node: '>=6'} diff --git a/packages/crgpt-loader/src/constants.ts b/packages/crgpt-loader/src/constants.ts new file mode 100644 index 00000000..e62af2c0 --- /dev/null +++ b/packages/crgpt-loader/src/constants.ts @@ -0,0 +1,29 @@ +export const removeFolders = [ + "node_modules", + ".git", + ".github", + ".vscode", + ".idea", + "dist", + "build", + "out", + "coverage", + "tmp", + "temp", + "log", + "logs", +]; + +export const lockFiles = ["package-lock.json", "pnpm-lock.yaml", "yarn.lock"]; + +export const removeFoldersCommand = (dir: string): string => { + return `find ${dir} -type d \\( ${removeFolders + .map((folder) => `-name '${folder}'`) + .join(" -o ")} \\) -exec rm -rf {} +`; +}; + +export const removeFilesCommand = (dir: string): string => { + return `find ${dir} -type f \\( ${lockFiles + .map((file) => `-name '${file}'`) + .join(" -o ")} \\) -delete`; +}; diff --git a/packages/crgpt-loader/src/crgpt-loader.ts b/packages/crgpt-loader/src/crgpt-loader.ts new file mode 100644 index 00000000..2c5de44d --- /dev/null +++ b/packages/crgpt-loader/src/crgpt-loader.ts @@ -0,0 +1,212 @@ +import axios, { AxiosResponse } from "axios"; +import dotenv from "dotenv"; +import { promises as fsPromises } from "fs"; +import { Document } from "langchain/document"; +import { OpenAIEmbeddings } from "langchain/embeddings/openai"; +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import os from "os"; +import path from "path"; +import { removeFilesCommand, removeFoldersCommand } from "./constants"; +import { executeCommand, openFile, savePage } from "./utils"; + +dotenv.config(); + +export class CRGPTLoader { + private link: string; + private embeddings: OpenAIEmbeddings; + + constructor(link: string) { + this.link = link; + this.embeddings = new OpenAIEmbeddings(); + } + + private extractRepoName(): string { + return this.link.split("/").slice(-1)[0]; + } + + private async splitDocuments(documents: Document[]): Promise { + const splitter = new RecursiveCharacterTextSplitter({ + chunkSize: 1500, + }); + + return splitter.splitDocuments(documents); + } + + private async buildDocument( + filePath: string + ): Promise> { + return new Document({ + pageContent: await openFile(filePath), + metadata: { + source: filePath, + }, + }); + } + + private async getEmbeddings(documents: Document[]): Promise { + return this.embeddings.embedDocuments( + documents.map((doc) => doc.pageContent) + ); + } + + private async storeDocuments( + documents: Document[], + embeddings: Number[][], + indexName = this.extractRepoName() + ): Promise { + try { + const ids = documents.map((_, index) => index); + const attributes = { + source: documents.map((doc) => doc.metadata.source), + pageContent: documents.map((doc) => doc.pageContent), + }; + + const apiEndpoint = `https://api.turbopuffer.com/v1/vectors/${indexName}`; + const headers = { + Authorization: `Bearer ${process.env.TURBOPUFFER_API_KEY}`, + "Content-Type": "application/json", + }; + + await axios.post( + apiEndpoint, + { + ids, + vectors: embeddings, + attributes, + }, + { headers } + ); + } catch (error) { + console.error("Error storing documents:", error); + throw error; + } + } + + public async load(): Promise { + try { + const tempDir = await this.cloneRepository(); + await this.removeUnwantedFilesAndFolders(tempDir); + + const documents = await this.createDocuments(tempDir); + + const chunks = await this.splitDocuments(documents); + const embeddings = await this.getEmbeddings(chunks); + + await this.storeDocuments(chunks, embeddings); + console.log("Documents stored"); + + await this.cleanup(tempDir); + } catch (error) { + console.error("Error in CRGPTLoader:", error); + } + } + + private async cloneRepository(): Promise { + const tempDir = await fsPromises.mkdtemp( + path.join(os.tmpdir(), "CRGPTLoader-") + ); + const cloneCommand = `git clone --depth 1 ${this.link} ${tempDir}`; + await executeCommand(cloneCommand); + return tempDir; + } + + private async removeUnwantedFilesAndFolders(tempDir: string): Promise { + try { + await executeCommand(removeFoldersCommand(tempDir)); + await executeCommand(removeFilesCommand(tempDir)); + } catch (error) { + console.error("Error removing files or folders:", error); + } + } + + private async cleanup(tempDir: string): Promise { + await executeCommand(`rm -rf ${tempDir}`); + } + + private async createDocuments( + directory: string + ): Promise[]> { + const entries = await fsPromises.readdir(directory, { + withFileTypes: true, + }); + const documents: Document<{ source: string }>[] = []; + + for (const entry of entries) { + const fullPath = path.join(directory, entry.name); + if (entry.isDirectory()) { + documents.push(...(await this.createDocuments(fullPath))); + } else if (entry.isFile()) { + try { + const document = await this.buildDocument(fullPath); + documents.push(document); + } catch (error) { + console.error(`Error reading file ${entry.name}:`, error); + } + } + } + + return documents; + } + + public async read(): Promise { + const namespace = this.extractRepoName(); + let nextCursor = null; + const dataDir = "./data"; + let pageIndex = 0; + + do { + try { + const response = await this.fetchPage(namespace, nextCursor); + + if (response.status === 202) { + // Data not ready, wait and retry + await new Promise((resolve) => setTimeout(resolve, 5000)); // wait for 5 seconds + continue; + } + + const { ids, vectors, attributes, next_cursor } = response.data; + savePage(dataDir, pageIndex, ids, vectors, attributes); + + nextCursor = next_cursor; + pageIndex++; + } catch (error) { + console.error("Error fetching data:", error); + throw error; + } + } while (nextCursor); + } + + private async fetchPage( + namespace: string, + cursor: string | null + ): Promise { + const apiEndpoint = `https://api.turbopuffer.com/v1/vectors/${namespace}`; + const params = cursor ? { cursor } : {}; + + return axios.get(apiEndpoint, { + headers: { Authorization: `Bearer ${process.env.TURBOPUFFER_API_KEY}` }, + params, + maxContentLength: Infinity, + maxBodyLength: Infinity, + }); + } + + public async delete(indexName = this.extractRepoName()): Promise { + try { + // Set up the API endpoint and headers + const apiEndpoint = `https://api.turbopuffer.com/v1/vectors/${indexName}`; + const headers = { + Authorization: `Bearer ${process.env.TURBOPUFFER_API_KEY}`, + }; + + // Make the DELETE request + const response = await axios.delete(apiEndpoint, { headers }); + + // Log the response status + console.log("Delete response:", response.data); + } catch (error) { + console.error("Error deleting documents:", error); + throw error; + } + } +} diff --git a/packages/crgpt-loader/index.ts b/packages/crgpt-loader/src/index.ts similarity index 100% rename from packages/crgpt-loader/index.ts rename to packages/crgpt-loader/src/index.ts diff --git a/packages/crgpt-loader/src/lc_wip/githubLoader.ts b/packages/crgpt-loader/src/lc_wip/githubLoader.ts new file mode 100644 index 00000000..6f82e681 --- /dev/null +++ b/packages/crgpt-loader/src/lc_wip/githubLoader.ts @@ -0,0 +1,2 @@ +//WIP GitHub loader integration using UNIX commands + diff --git a/packages/crgpt-loader/src/lc_wip/turbopufferVectorStore.ts b/packages/crgpt-loader/src/lc_wip/turbopufferVectorStore.ts new file mode 100644 index 00000000..943efe96 --- /dev/null +++ b/packages/crgpt-loader/src/lc_wip/turbopufferVectorStore.ts @@ -0,0 +1,188 @@ +import { Document } from "langchain/document"; +import { Embeddings } from "langchain/embeddings/base"; +import { VectorStore } from "langchain/vectorstores/base"; + +interface TurboPufferHeaders { + headers: { + Authorization: string; + "Content-Type": string; + }; +} + +enum TurboPufferDistanceMetric { + Cosine = "cosine_distance", + Euclidean = "euclidean_squared", +} + +interface TurboPufferQueryResult { + dist: number; + id: number; + vector: number[]; + attributes: Record; +} + +export class TurboPuffer extends VectorStore { + get lc_secrets(): { [key: string]: string } { + return { + apiKey: "TURBOPUFFER_API_KEY", + }; + } + + get lc_aliases(): { [key: string]: string } { + return { + apiKey: "turbopuffer_api_key", + }; + } + + private apiKey: string; + private namespace: string; + private apiEndpoint = "https://api.turbopuffer.com/v1/"; + + public _vectorstoreType(): string { + return "turbopuffer"; + } + + constructor( + embeddings: Embeddings, + args: { + apiKey?: string; + namespace?: string; + } + ) { + super(embeddings, args); + + const apiKey = args.apiKey ?? process.env["TURBOPUFFER_API_KEY"]; + if (!apiKey) { + throw new Error("TurboPuffer api key is not provided."); + } + this.apiKey = apiKey; + this.namespace = args.namespace ?? "default"; + } + + getJsonHeader(): TurboPufferHeaders { + return { + headers: { + Authorization: `Bearer ${this.apiKey}`, + "Content-Type": "application/json", + }, + }; + } + + async addVectors( + vectors: number[][], + documents: Document>[], + options?: { ids?: number[] } + ): Promise { + try { + if (options?.ids && options.ids.length !== vectors.length) { + throw new Error( + "Number of ids provided does not match number of vectors" + ); + } + + if (documents.length !== vectors.length) { + throw new Error( + "Number of documents provided does not match number of vectors" + ); + } + + if (documents.length === 0) { + throw new Error("No documents provided"); + } + + const docIds = options?.ids ?? documents.map((_, index) => index); + + const attributes = { + source: documents.map((doc) => doc.metadata.source), + pageContent: documents.map((doc) => doc.pageContent), + }; + + const data = { + docIds, + vectors, + attributes, + }; + + await fetch(`${this.apiEndpoint}/vectors/${this.namespace}`, { + method: "POST", + headers: this.getJsonHeader().headers, + body: JSON.stringify(data), + }); + } catch (error) { + console.error("Error storing vectors:", error); + throw error; + } + } + + async addDocuments( + documents: Document>[], + options?: { ids?: number[] } + ): Promise { + const vectors = await this.embeddings.embedDocuments( + documents.map((doc) => doc.pageContent) + ); + + return this.addVectors(vectors, documents, options); + } + + async queryVectors( + query: number[], + k: number, + distanceMetric: TurboPufferDistanceMetric, + includeAttributes?: string[], + includeVector?: boolean, + // See https://turbopuffer.com/docs/reference/query for more info + filters?: Record + ): Promise { + const data = { + query, + k, + distanceMetric, + filters, + includeAttributes, + includeVector, + }; + + const response = await fetch( + `${this.apiEndpoint}/vectors/${this.namespace}/query`, + { + method: "POST", + headers: this.getJsonHeader().headers, + body: JSON.stringify(data), + } + ); + + const json = await response.json(); + + return json.results; + } + + async similaritySearchVectorWithScore( + query: number[], + k: number, + filter?: Record + ): Promise<[Document, number][]> { + const search = await this.queryVectors( + query, + k, + TurboPufferDistanceMetric.Cosine, + ["source", "pageContent"], + false, + filter + ); + + const result: [Document, number][] = search.map((res) => { + return [ + new Document({ + pageContent: res.attributes.pageContent, + metadata: { + source: res.attributes.source, + }, + }), + res.dist, + ]; + }); + + return result; + } +} diff --git a/packages/crgpt-loader/src/utils/executeCommand.ts b/packages/crgpt-loader/src/utils/executeCommand.ts new file mode 100644 index 00000000..0e040cbd --- /dev/null +++ b/packages/crgpt-loader/src/utils/executeCommand.ts @@ -0,0 +1,23 @@ +import { exec } from "child_process"; + +export const executeCommand = (command: string): Promise => { + return new Promise((resolve, reject) => { + exec(command, (error, stdout, stderr) => { + if (error) { + console.error( + `Error executing command: ${command}\nError: ${error.message}\nstderr: ${stderr}` + ); + return reject(error); + } + if (stderr) { + console.warn( + `Command executed with warnings: ${command}\nstderr: ${stderr}` + ); + } + if (stdout) { + console.log(`Command executed: ${command}\nstdout: ${stdout}`); + } + resolve(); + }); + }); +}; diff --git a/packages/crgpt-loader/src/utils/getEnvVariable.ts b/packages/crgpt-loader/src/utils/getEnvVariable.ts new file mode 100644 index 00000000..2eb5a863 --- /dev/null +++ b/packages/crgpt-loader/src/utils/getEnvVariable.ts @@ -0,0 +1,7 @@ +export const getEnvVariable = (name: string): string => { + const value = process.env[name]; + if (!value) { + throw new Error(`Missing environment variable: ${name}`); + } + return value; +}; diff --git a/packages/crgpt-loader/src/utils/index.ts b/packages/crgpt-loader/src/utils/index.ts new file mode 100644 index 00000000..41c6dd44 --- /dev/null +++ b/packages/crgpt-loader/src/utils/index.ts @@ -0,0 +1,3 @@ +export * from "./executeCommand"; +export * from "./openFile"; +export * from "./savePage"; diff --git a/packages/crgpt-loader/src/utils/openFile.ts b/packages/crgpt-loader/src/utils/openFile.ts new file mode 100644 index 00000000..862cd943 --- /dev/null +++ b/packages/crgpt-loader/src/utils/openFile.ts @@ -0,0 +1,11 @@ +import { promises as fsPromises } from "fs"; + +export const openFile = async (filePath: string): Promise => { + try { + const content = await fsPromises.readFile(filePath, "utf8"); + return content; + } catch (error) { + console.error("Error reading file:", error); + throw error; + } +}; diff --git a/packages/crgpt-loader/src/utils/savePage.ts b/packages/crgpt-loader/src/utils/savePage.ts new file mode 100644 index 00000000..1a26daf1 --- /dev/null +++ b/packages/crgpt-loader/src/utils/savePage.ts @@ -0,0 +1,27 @@ +import { existsSync, mkdirSync, writeFileSync } from "fs"; +import { join } from "path"; + +export const savePage = ( + dataDir: string, + pageIndex: number, + ids: number[], + vectors: Number[][], + attributes: Record +) => { + if (!existsSync(dataDir)) { + mkdirSync(dataDir); + } + + writeFileSync( + join(dataDir, `ids_${pageIndex}.json`), + JSON.stringify(ids, null, 2) + ); + writeFileSync( + join(dataDir, `vectors_${pageIndex}.json`), + JSON.stringify(vectors, null, 2) + ); + writeFileSync( + join(dataDir, `attributes_${pageIndex}.json`), + JSON.stringify(attributes, null, 2) + ); +}; diff --git a/packages/crgpt-loader/test.ts b/packages/crgpt-loader/test.ts index 3769abe4..76ef3bad 100644 --- a/packages/crgpt-loader/test.ts +++ b/packages/crgpt-loader/test.ts @@ -1 +1,13 @@ -//some tests \ No newline at end of file +import { CRGPTLoader } from "./src/crgpt-loader"; + +const test = () => { + const loader = new CRGPTLoader( + "https://github.com/mattzcarey/code-review-gpt" + ); + + loader.load(); + loader.read(); + loader.delete(); +}; + +test(); diff --git a/packages/crgpt-loader/tsconfig.json b/packages/crgpt-loader/tsconfig.json new file mode 100644 index 00000000..43760374 --- /dev/null +++ b/packages/crgpt-loader/tsconfig.json @@ -0,0 +1,20 @@ +{ + "compilerOptions": { + "target": "es2017", + "module": "commonjs", + "lib": ["es2017", "dom"], + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "moduleResolution": "node", + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "types": ["node"] + }, + "include": ["src/*.ts", "src/lc_wip/turbopufferVectorStore.ts"], + "exclude": ["./node_modules", "./dist", "./src/**/*.test.ts"], + "engines": { + "node": ">=18" + } +}