diff --git a/packages/word-stats/README.md b/packages/word-stats/README.md new file mode 100644 index 00000000..3cc4a72b --- /dev/null +++ b/packages/word-stats/README.md @@ -0,0 +1,35 @@ +# @kitsuyui/word-stats + +A simple word-stats manipulation library + +## Installation + +### NPM + +```bash +npm install @kitsuyui/word-stats +``` + +### Yarn + +```bash +yarn add @kitsuyui/word-stats +``` + +### PNPM + +```bash +pnpm add @kitsuyui/word-stats +``` + +## Usage + +### convertCase + +```typescript + +``` + +## License + +MIT diff --git a/packages/word-stats/package.json b/packages/word-stats/package.json new file mode 100644 index 00000000..ca6de023 --- /dev/null +++ b/packages/word-stats/package.json @@ -0,0 +1,28 @@ +{ + "name": "@kitsuyui/word-stats", + "version": "0.0.0", + "license": "MIT", + "author": "Yui Kitsu ", + "description": "Word statistics package (count, TF-IDF, etc.)", + "scripts": { + "build": "tsup src/index.ts --clean", + "dev": "pnpm build --watch" + }, + "exports": { + ".": { + "require": { + "type": "./dist/index.d.mts", + "default": "./dist/index.cjs" + }, + "import": { + "type": "./dist/index.d.mts", + "default": "./dist/index.mjs" + } + } + }, + "main": "dist/index.js", + "module": "dist/index.js", + "types": "dist/index.d.ts", + "files": ["dist", "package.json"], + "devDependencies": {} +} diff --git a/packages/word-stats/src/index.spec.ts b/packages/word-stats/src/index.spec.ts new file mode 100644 index 00000000..d42c6bd1 --- /dev/null +++ b/packages/word-stats/src/index.spec.ts @@ -0,0 +1,80 @@ +import { describe, expect, it, jest } from '@jest/globals' + +import { + computeInverseDocumentFrequency, + computeTermFrequencies, + extractUniqueWords, + wordCount, +} from './index' + +describe('wordCount', () => { + it('should count words', () => { + const documents = [ + ['a', 'b', 'c'], + ['a', 'b', 'd'], + ] + const result = wordCount(documents) + expect(result).toEqual({ a: 2, b: 2, c: 1, d: 1 }) + }) +}) + 
+describe('computeTermFrequencies', () => { + it('should calculate tf', () => { + const documents = [ + ['a', 'b', 'c', 'c'], + ['a', 'b', 'd'], + ] + const result = computeTermFrequencies(documents) + expect(result).toEqual([ + { a: 0.25, b: 0.25, c: 0.5 }, + { a: 0.3333333333333333, b: 0.3333333333333333, d: 0.3333333333333333 }, + ]) + }) +}) + +describe('computeInverseDocumentFrequency', () => { + it('should calculate idf', () => { + const documents = [ + ['a', 'b', 'c'], + ['a', 'b', 'd'], + ['a', 'b', 'e'], + ['a', 'b', 'f'], + ['a', 'b', 'g'], + ['a', 'b', 'h'], + ['a', 'b', 'i'], + ['a', 'b', 'j'], + ['a', 'b', 'k'], + ['a', 'b', 'l'], + ] + const result = computeInverseDocumentFrequency(documents) + const tobe = Math.LN10 // 2.302585092994046 + + expect(result).toEqual({ + a: 0.0, + b: 0.0, + c: tobe, + d: tobe, + e: tobe, + f: tobe, + g: tobe, + h: tobe, + i: tobe, + j: tobe, + k: tobe, + l: tobe, + }) + }) +}) + +describe('uniqueWords', () => { + it('should return unique words', () => { + const document = ['a', 'b', 'c', 'a', 'c'] + const result = extractUniqueWords(document) + expect(result).toEqual(new Set(['a', 'b', 'c'])) + }) + it('should return empty set for empty document', () => { + const document: string[] = [] + const result = extractUniqueWords(document) + expect(result).toEqual(new Set()) + }) +}) diff --git a/packages/word-stats/src/index.ts b/packages/word-stats/src/index.ts new file mode 100644 index 00000000..44747535 --- /dev/null +++ b/packages/word-stats/src/index.ts @@ -0,0 +1,115 @@ +type Word = string +type Document = Word[] +type WordScore = { [word: Word]: number } + +/** + * Count the number of words in the documents + * @example + * ```ts + * const documents = [ + * ['a', 'b', 'c'], + * ['a', 'b', 'd'], + * ] + * const result = wordCount(documents) + * result // => { a: 2, b: 2, c: 1, d: 1 } + * ``` + * @param documents - The documents to count words from (array of arrays of words) + * @returns A dictionary of words and their 
/**
 * Count how many times each word occurs across all documents.
 *
 * Counting is done in a `Map`, so words that collide with inherited object
 * members (`constructor`, `toString`, `__proto__`, ...) are counted like any
 * other word.
 *
 * @example
 * ```ts
 * const documents = [
 *   ['a', 'b', 'c'],
 *   ['a', 'b', 'd'],
 * ]
 * const result = wordCount(documents)
 * result // => { a: 2, b: 2, c: 1, d: 1 }
 * ```
 * @param documents - The documents to count words from (array of arrays of words)
 * @returns A dictionary of words and their total occurrence counts
 */
export const wordCount = (
  documents: readonly (readonly string[])[]
): Record<string, number> => {
  const counts = new Map<string, number>()
  for (const words of documents) {
    tallyWords(words, counts)
  }
  return toWordScore(counts)
}

/**
 * Calculate TF (Term Frequency): for every document, the number of
 * occurrences of each word divided by the document's length.
 *
 * @example
 * ```ts
 * const documents = [
 *   ['a', 'b', 'c', 'c'],
 *   ['a', 'b', 'd'],
 * ]
 *
 * const result = computeTermFrequencies(documents)
 * result // => [{ a: 0.25, b: 0.25, c: 0.5 }, { a: 0.3333333333333333, b: 0.3333333333333333, d: 0.3333333333333333 }]
 * ```
 * @param documents - The documents to calculate TF from
 * @returns One dictionary per document (same order) mapping words to their TF;
 *   an empty document yields an empty dictionary
 */
export const computeTermFrequencies = (
  documents: readonly (readonly string[])[]
): Record<string, number>[] =>
  documents.map((words) => {
    const counts = new Map<string, number>()
    tallyWords(words, counts)
    const frequencies = new Map<string, number>()
    // An empty document has no entries in `counts`, so the division below
    // never runs with a zero length.
    for (const [word, count] of counts) {
      frequencies.set(word, count / words.length)
    }
    return toWordScore(frequencies)
  })

/**
 * Calculate IDF (Inverse Document Frequency) as
 * `Math.log(documentCount / documentsContainingWord)` (natural logarithm,
 * no smoothing).
 *
 * @example
 * ```ts
 * const documents = [
 *   ['a', 'b', 'c'],
 *   ['a', 'b', 'd'],
 * ]
 * const result = computeInverseDocumentFrequency(documents)
 * result // => { a: 0.0, b: 0.0, c: 0.6931471805599453, d: 0.6931471805599453 }
 * ```
 * @param documents - The documents to calculate IDF from
 * @returns A dictionary of words and their IDF
 */
export const computeInverseDocumentFrequency = (
  documents: readonly (readonly string[])[]
): Record<string, number> => {
  const documentCount = documents.length
  const documentFrequencies = new Map<string, number>()
  for (const words of documents) {
    // De-duplicate first: a word counts once per document, however often it
    // is repeated inside that document.
    tallyWords(extractUniqueWords(words), documentFrequencies)
  }
  const idfs = new Map<string, number>()
  for (const [word, frequency] of documentFrequencies) {
    idfs.set(word, Math.log(documentCount / frequency))
  }
  return toWordScore(idfs)
}

/**
 * Get the unique words of a single document.
 *
 * @example
 * ```ts
 * const document = ['a', 'b', 'c', 'c', 'a']
 * const result = extractUniqueWords(document)
 * result // => new Set(['a', 'b', 'c'])
 * ```
 * @param document - The document to extract unique words from
 * @returns A set of unique words, in order of first appearance
 */
export const extractUniqueWords = (document: readonly string[]): Set<string> =>
  new Set<string>(document)

/**
 * Add one to the tally of every word yielded by `words`.
 * Mutates `counts` in place so several documents can share one tally.
 */
function tallyWords(words: Iterable<string>, counts: Map<string, number>): void {
  for (const word of words) {
    counts.set(word, (counts.get(word) ?? 0) + 1)
  }
}

/**
 * Convert a word -> score map into a plain object.
 * `Object.fromEntries` defines own data properties, so even a key such as
 * `__proto__` ends up as a regular entry instead of touching the prototype.
 */
function toWordScore(scores: Map<string, number>): Record<string, number> {
  return Object.fromEntries(scores)
}