kitsuyui · kitsuyui · Dec 30, 2024 · Dec 30, 2024
diff --git a/packages/word-stats/README.md b/packages/word-stats/README.md
@@ -0,0 +1,35 @@
+# @kitsuyui/word-stats
+
+A small library for computing word statistics (word counts, TF, IDF)
+
+## Installation
+
+### NPM
+
+```bash
+npm install @kitsuyui/word-stats
+```
+
+### Yarn
+
+```bash
+yarn add @kitsuyui/word-stats
+```
+
+### PNPM
+
+```bash
+pnpm add @kitsuyui/word-stats
+```
+
+## Usage
+
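+### wordCount
+
+```typescript
+import { wordCount } from '@kitsuyui/word-stats'
+
+wordCount([
+  ['a', 'b', 'c'],
+  ['a', 'b', 'd'],
+]) // => { a: 2, b: 2, c: 1, d: 1 }
+```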
+
+## License
+
+MIT
diff --git a/packages/word-stats/package.json b/packages/word-stats/package.json
@@ -0,0 +1,28 @@
+{
+  "name": "@kitsuyui/word-stats",
+  "version": "0.0.0",
+  "license": "MIT",
+  "author": "Yui Kitsu <[email protected]>",
+  "description": "Word statistics package (count, TF-IDF, etc.)",
+  "scripts": {
+    "build": "tsup src/index.ts --clean",
+    "dev": "pnpm build --watch"
+  },
+  "exports": {
+    ".": {
+      "require": {
+        "types": "./dist/index.d.mts",
+        "default": "./dist/index.cjs"
+      },
+      "import": {
+        "types": "./dist/index.d.mts",
+        "default": "./dist/index.mjs"
+      }
+    }
+  },
+  "main": "dist/index.js",
+  "module": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "files": ["dist", "package.json"],
+  "devDependencies": {}
+}
diff --git a/packages/word-stats/src/index.spec.ts b/packages/word-stats/src/index.spec.ts
@@ -0,0 +1,80 @@
+import { describe, expect, it, jest } from '@jest/globals'
+
+import {
+  computeInverseDocumentFrequency,
+  computeTermFrequencies,
+  extractUniqueWords,
+  wordCount,
+} from './index'
+
+// wordCount: occurrences are summed across all documents, not per document.
+describe('wordCount', () => {
+  it('should count words', () => {
+    const counts = wordCount([
+      ['a', 'b', 'c'],
+      ['a', 'b', 'd'],
+    ])
+    expect(counts).toEqual({ a: 2, b: 2, c: 1, d: 1 })
+  })
+})
+
+// computeTermFrequencies: one score map per document, each value being
+// (occurrences of the word) / (number of words in that document).
+describe('computeTermFrequencies', () => {
+  it('should calculate tf', () => {
+    const oneThird = 1 / 3 // 0.3333333333333333
+    const frequencies = computeTermFrequencies([
+      ['a', 'b', 'c', 'c'],
+      ['a', 'b', 'd'],
+    ])
+    expect(frequencies).toEqual([
+      { a: 0.25, b: 0.25, c: 0.5 },
+      { a: oneThird, b: oneThird, d: oneThird },
+    ])
+  })
+})
+
+describe('computeInverseDocumentFrequency', () => {
+  it('should calculate idf', () => {
+    // Ten documents: 'a' and 'b' occur in all of them, every other word in exactly one.
+    const rareWords = ['c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l']
+    const documents = rareWords.map((word) => ['a', 'b', word])
+    const idf = computeInverseDocumentFrequency(documents)
+
+    // log(10 / 10) = 0 for the ubiquitous words, log(10 / 1) = ln 10 for the rest.
+    const expected: Record<string, number> = { a: 0.0, b: 0.0 }
+    for (const word of rareWords) {
+      expected[word] = Math.LN10 // 2.302585092994046
+    }
+
+    expect(idf).toEqual(expected)
+  })
+})
+
+// extractUniqueWords: duplicates collapse into a Set; an empty document yields an empty Set.
+// The suite is labelled with the exported function name, like the other suites in this file.
+describe('extractUniqueWords', () => {
+  it('should return unique words', () => {
+    const document = ['a', 'b', 'c', 'a', 'c']
+    const result = extractUniqueWords(document)
+    expect(result).toEqual(new Set(['a', 'b', 'c']))
+  })
+  it('should return empty set for empty document', () => {
+    const document: string[] = []
+    const result = extractUniqueWords(document)
+    expect(result).toEqual(new Set())
+  })
+})
diff --git a/packages/word-stats/src/index.ts b/packages/word-stats/src/index.ts
@@ -0,0 +1,115 @@
+/** A single word (token) of a document. */
+type Word = string
+/** One document, given as the list of its words. */
+type Document = Word[]
+/** A numeric score per word (used for counts, TF values and IDF values). */
+type WordScore = Record<Word, number>
+
+/**
+ * Count how many times each word occurs across all documents.
+ *
+ * Counting happens in a `Map`, so words that collide with `Object.prototype`
+ * members (`constructor`, `toString`, `__proto__`, ...) are counted like any
+ * other word instead of turning into `NaN` or being dropped.
+ * @example
+ * ```ts
+ * const documents = [
+ *   ['a', 'b', 'c'],
+ *   ['a', 'b', 'd'],
+ * ]
+ * const result = wordCount(documents)
+ * result // => { a: 2, b: 2, c: 1, d: 1 }
+ * ```
+ * @param documents - The documents to count words from (array of arrays of words)
+ * @returns A dictionary of words and their counts (`{}` when there are no words)
+ */
+export const wordCount = (documents: Document[]): WordScore => {
+  const counts = new Map<Word, number>()
+  for (const document of documents) {
+    for (const word of document) {
+      counts.set(word, (counts.get(word) ?? 0) + 1)
+    }
+  }
+  // Object.fromEntries defines own data properties, so even `__proto__`
+  // ends up as a regular key of the returned plain object.
+  return Object.fromEntries(counts)
+}
+
+/**
+ * Calculate TF (Term Frequency) for every document.
+ *
+ * Within a document, the TF of a word is the number of times it occurs
+ * divided by the total number of words in that document. An empty document
+ * produces an empty dictionary.
+ * @example
+ * ```ts
+ * const documents = [
+ *   ['a', 'b', 'c', 'c'],
+ *   ['a', 'b', 'd'],
+ * ]
+ *
+ * const result = computeTermFrequencies(documents)
+ * result // => [{ a: 0.25, b: 0.25, c: 0.5 }, { a: 0.3333333333333333, b: 0.3333333333333333, d: 0.3333333333333333 }]
+ * ```
+ * @param documents - The documents to calculate TF from
+ * @returns An array of dictionaries of words and their TF, one per document, in input order
+ */
+export const computeTermFrequencies = (documents: Document[]): WordScore[] =>
+  documents.map((document) => {
+    const totalWords = document.length
+    // Object.entries reads only own keys and Object.fromEntries defines plain
+    // data properties, so every own key returned by wordCount (even
+    // `__proto__`) is carried over to the result.
+    return Object.fromEntries(
+      Object.entries(wordCount([document])).map(
+        ([word, count]): [Word, number] => [word, count / totalWords]
+      )
+    )
+  })
+
+/**
+ * Calculate IDF (Inverse Document Frequency)
+ *
+ * For each word: `Math.log(N / df)`, where `N` is the number of documents and
+ * `df` is the number of documents containing the word at least once. The
+ * logarithm is the natural log and no smoothing is applied, so a word present
+ * in every document scores `0`.
+ *
+ * Document frequencies are tracked in a `Map`, so words that collide with
+ * `Object.prototype` members (`constructor`, `__proto__`, ...) are handled
+ * like any other word.
+ * @example
+ * ```ts
+ * const documents = [
+ *   ['a', 'b', 'c'],
+ *   ['a', 'b', 'd'],
+ * ]
+ * const result = computeInverseDocumentFrequency(documents)
+ * result // => { a: 0.0, b: 0.0, c: 0.6931471805599453, d: 0.6931471805599453 }
+ * ```
+ * @param documents - The documents to calculate IDF from
+ * @returns A dictionary of words and their IDF (`{}` when there are no words)
+ */
+export const computeInverseDocumentFrequency = (
+  documents: Document[]
+): WordScore => {
+  const documentCount = documents.length
+  // In how many documents does each word appear? Duplicates inside one
+  // document are collapsed first so they only count once.
+  const documentFrequencies = new Map<Word, number>()
+  for (const document of documents) {
+    for (const word of extractUniqueWords(document)) {
+      documentFrequencies.set(word, (documentFrequencies.get(word) ?? 0) + 1)
+    }
+  }
+  return Object.fromEntries(
+    Array.from(documentFrequencies, ([word, count]): [Word, number] => [
+      word,
+      Math.log(documentCount / count),
+    ])
+  )
+}
+
+/**
+ * Get unique words from the document
+ * @example
+ * ```ts
+ * const document = ['a', 'b', 'c', 'c', 'a']
+ * const result = extractUniqueWords(document)
+ * result // => new Set(['a', 'b', 'c'])
+ * ```
+ * @param document - The document (a single array of words) to extract unique words from
+ * @returns A set of unique words, in order of first appearance; empty for an empty document
+ */
+export const extractUniqueWords = (document: Document): Set<Word> =>
+  new Set<Word>(document)