Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement word-stats package #488

Merged
merged 1 commit into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions packages/word-stats/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# @kitsuyui/word-stats

A small library for computing word statistics: word counts, term frequency (TF), and inverse document frequency (IDF).

## Installation

### NPM

```bash
npm install @kitsuyui/word-stats
```

### Yarn

```bash
yarn add @kitsuyui/word-stats
```

### PNPM

```bash
pnpm add @kitsuyui/word-stats
```

## Usage

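### wordCount

```typescript
import { wordCount } from '@kitsuyui/word-stats'

const documents = [
  ['a', 'b', 'c'],
  ['a', 'b', 'd'],
]
wordCount(documents) // => { a: 2, b: 2, c: 1, d: 1 }
```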

## License

MIT
28 changes: 28 additions & 0 deletions packages/word-stats/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
  "name": "@kitsuyui/word-stats",
  "version": "0.0.0",
  "license": "MIT",
  "author": "Yui Kitsu <[email protected]>",
  "description": "Word statistics package (count, TF-IDF, etc.)",
  "scripts": {
    "build": "tsup src/index.ts --clean",
    "dev": "pnpm build --watch"
  },
  "exports": {
    ".": {
      "require": {
        "types": "./dist/index.d.mts",
        "default": "./dist/index.cjs"
      },
      "import": {
        "types": "./dist/index.d.mts",
        "default": "./dist/index.mjs"
      }
    }
  },
  "main": "dist/index.js",
  "module": "dist/index.js",
  "types": "dist/index.d.ts",
  "files": ["dist", "package.json"],
  "devDependencies": {}
}
80 changes: 80 additions & 0 deletions packages/word-stats/src/index.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import { describe, expect, it, jest } from '@jest/globals'

import {
computeInverseDocumentFrequency,
computeTermFrequencies,
extractUniqueWords,
wordCount,
} from './index'

describe('wordCount', () => {
  it('should count words', () => {
    // "a" and "b" occur in both documents, "c" and "d" in one each.
    const counts = wordCount([
      ['a', 'b', 'c'],
      ['a', 'b', 'd'],
    ])
    expect(counts).toEqual({ a: 2, b: 2, c: 1, d: 1 })
  })
})

describe('computeTermFrequencies', () => {
  it('should calculate tf', () => {
    // First document has 4 words ("c" twice), second has 3 distinct words.
    const frequencies = computeTermFrequencies([
      ['a', 'b', 'c', 'c'],
      ['a', 'b', 'd'],
    ])
    const expected = [
      { a: 0.25, b: 0.25, c: 0.5 },
      { a: 0.3333333333333333, b: 0.3333333333333333, d: 0.3333333333333333 },
    ]
    expect(frequencies).toEqual(expected)
  })
})

describe('computeInverseDocumentFrequency', () => {
  it('should calculate idf', () => {
    // Ten documents: "a" and "b" occur in all of them, each of "c".."l" in exactly one.
    const rareWords = ['c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l']
    const documents = rareWords.map((rare) => ['a', 'b', rare])

    // ln(10 / 10) = 0 for the ubiquitous words, ln(10 / 1) = ln 10 for the rare ones.
    const expected: Record<string, number> = { a: 0.0, b: 0.0 }
    for (const rare of rareWords) {
      expected[rare] = Math.LN10 // 2.302585092994046
    }

    expect(computeInverseDocumentFrequency(documents)).toEqual(expected)
  })
})

describe('uniqueWords', () => {
  it('should return unique words', () => {
    // Duplicates of "a" and "c" must collapse into a single entry each.
    const unique = extractUniqueWords(['a', 'b', 'c', 'a', 'c'])
    expect(unique).toEqual(new Set(['a', 'b', 'c']))
  })
  it('should return empty set for empty document', () => {
    const emptyDocument: string[] = []
    expect(extractUniqueWords(emptyDocument)).toEqual(new Set())
  })
})
115 changes: 115 additions & 0 deletions packages/word-stats/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/** A single word (token). */
type Word = string
/**
 * One document, represented as an ordered list of words.
 * NOTE(review): inside this module the name takes precedence over the global
 * DOM `Document` type, if the DOM lib is loaded.
 */
type Document = Word[]
/** Dictionary from a word to a numeric score (a count, TF or IDF value). */
type WordScore = Record<Word, number>

/**
 * Count how many times each word occurs across all of the given documents.
 *
 * Words that collide with members of `Object.prototype` (for example
 * `"constructor"`, `"toString"` or `"__proto__"`) are counted like any other
 * word.
 * @example
 * ```ts
 * const documents = [
 *   ['a', 'b', 'c'],
 *   ['a', 'b', 'd'],
 * ]
 * const result = wordCount(documents)
 * result // => { a: 2, b: 2, c: 1, d: 1 }
 * ```
 * @param documents - The documents to count words from (array of arrays of words)
 * @returns A plain object mapping each distinct word to its total number of
 * occurrences; an empty object when there are no words
 */
export const wordCount = (documents: Document[]): WordScore => {
  // Tally in a Map: unlike a plain object it has no inherited keys, so a
  // lookup for "constructor" yields undefined rather than Object's constructor.
  const counts = new Map<Word, number>()
  for (const document of documents) {
    for (const word of document) {
      counts.set(word, (counts.get(word) ?? 0) + 1)
    }
  }
  // Object.fromEntries defines own data properties, so even "__proto__" is
  // kept as a regular key instead of being swallowed by the prototype setter.
  return Object.fromEntries(counts)
}

/**
 * Compute the term frequency (TF) of every word, separately for each document.
 *
 * The TF of a word is its number of occurrences in a document divided by the
 * total number of words in that document. An empty document yields an empty
 * dictionary.
 * @example
 * ```ts
 * const documents = [
 *   ['a', 'b', 'c', 'c'],
 *   ['a', 'b', 'd'],
 * ]
 *
 * const result = computeTermFrequencies(documents)
 * result // => [{ a: 0.25, b: 0.25, c: 0.5 }, { a: 0.3333333333333333, b: 0.3333333333333333, d: 0.3333333333333333 }]
 * ```
 * @param documents - The documents to calculate TF from
 * @returns One dictionary per document, in input order, mapping each word of
 * that document to its TF
 */
export const computeTermFrequencies = (documents: Document[]): WordScore[] =>
  documents.map((document) => {
    const totalWords = document.length
    // Count within this single document only, then normalise by its length.
    const occurrences = Object.entries(wordCount([document]))
    return Object.fromEntries(
      occurrences.map(([word, count]): [Word, number] => [
        word,
        count / totalWords,
      ])
    )
  })

/**
 * Compute the inverse document frequency (IDF) of every word in the corpus.
 *
 * For each word, IDF is `Math.log(N / df)` (natural logarithm), where `N` is
 * the number of documents and `df` is the number of documents containing the
 * word at least once. No smoothing is applied, so a word present in every
 * document scores `0`.
 *
 * Words that collide with members of `Object.prototype` (for example
 * `"constructor"`) are handled like any other word.
 * @example
 * ```ts
 * const documents = [
 *   ['a', 'b', 'c'],
 *   ['a', 'b', 'd'],
 * ]
 * const result = computeInverseDocumentFrequency(documents)
 * result // => { a: 0.0, b: 0.0, c: 0.6931471805599453, d: 0.6931471805599453 }
 * ```
 * @param documents - The documents to calculate IDF from
 * @returns A dictionary of words and their IDF; empty when there are no words
 */
export const computeInverseDocumentFrequency = (
  documents: Document[]
): WordScore => {
  const documentCount = documents.length
  // Tally document frequencies in a Map so that words matching inherited
  // object members (e.g. "constructor") cannot corrupt the counts.
  const documentFrequencies = new Map<Word, number>()
  for (const document of documents) {
    // Deduplicate first: a word counts at most once per document.
    for (const word of extractUniqueWords(document)) {
      documentFrequencies.set(word, (documentFrequencies.get(word) ?? 0) + 1)
    }
  }
  // Every recorded frequency is at least 1, so the division is always defined.
  return Object.fromEntries(
    Array.from(documentFrequencies, ([word, frequency]): [Word, number] => [
      word,
      Math.log(documentCount / frequency),
    ])
  )
}

/**
 * Collect the distinct words of a single document.
 * @example
 * ```ts
 * const document = ['a', 'b', 'c', 'c', 'a']
 * const result = extractUniqueWords(document)
 * result // => new Set(['a', 'b', 'c'])
 * ```
 * @param document - The document to extract unique words from
 * @returns A set of the document's words, in order of first appearance;
 * an empty set for an empty document
 */
export const extractUniqueWords = (document: Document): Set<Word> =>
  new Set<Word>(document)
Loading