chore: wip

ndabAP · Jan 18, 2025 · 716bc50 · 716bc50
1 parent c1fa941
commit 716bc50
Show file tree

Hide file tree

Showing 36 changed files with 512 additions and 1,633 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2020 Julian Claus
+Copyright (c) 2025 Julian Claus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/Makefile b/Makefile
@@ -15,17 +15,14 @@ build: windows linux darwin
 	@echo version: $(VERSION)
 
+# windows is a convenience target that depends on the $(WINDOWS) binary target.
 windows: $(WINDOWS)
-
-linux: $(LINUX)
-
-darwin: $(DARWIN)
-
+# Builds ./cli/main.go for GOOS=windows GOARCH=amd64 into bin/$(WINDOWS).
+# -s -w omit the symbol table and debug information; -X sets main.version to
+# $(VERSION).
 $(WINDOWS):
 	env GOOS=windows GOARCH=amd64 go build -v -o bin/$(WINDOWS) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go
 
+# linux is a convenience target that depends on the $(LINUX) binary target.
+linux: $(LINUX)
+# Same build as above, for GOOS=linux GOARCH=amd64, written to bin/$(LINUX).
 $(LINUX):
 	env GOOS=linux GOARCH=amd64 go build -v -o bin/$(LINUX) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go
 
+# darwin is a convenience target that depends on the $(DARWIN) binary target.
+darwin: $(DARWIN)
+# Same build as above, for GOOS=darwin GOARCH=amd64, written to bin/$(DARWIN).
 $(DARWIN):
 	env GOOS=darwin GOARCH=amd64 go build -v -o bin/$(DARWIN) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go
 

diff --git a/README.md b/README.md
@@ -1,166 +1,3 @@
-# assocentity
-
-[![Go Report Card](https://goreportcard.com/badge/github.com/ndabAP/assocentity/v13)](https://goreportcard.com/report/github.com/ndabAP/assocentity/v13)
-
-Package assocentity is a social science tool to analyze the relative distance
-from tokens to entities. The motivation is to make conclusions based on the
-distance from interesting tokens to a certain entity and its synonyms.
-
-## Features
-
-- Provide your own tokenizer
-- Provides a default NLP tokenizer (by Google)
-- Define aliases for entities
-- Provides a multi-OS, language-agnostic CLI version
-
-## Installation
-
-```bash
-$ go get github.com/ndabAP/assocentity/v13
-```
-
-## Prerequisites
-
-If you want to analyze human readable texts you can use the provided Natural
-Language tokenizer (powered by Google). To do so, sign-up for a Cloud Natural
-Language API service account key and download the generated JSON file. This
-equals the `credentialsFile` at the example below. You should never commit that
-file.
-
-A possible offline tokenizer would be a white space tokenizer. You also might
-use a parser depending on your purposes.
-
-## Example
-
-We would like to find out how close, on average, each adjective is to a certain
-public person. Let's take George W. Bush and 1,000 NBC news articles as an
-example. "George Bush" is the entity and synonyms are "George Walker Bush" and
-"Bush" and so on. The text is each of the 1,000 NBC news articles.
-
-Defining a text source and setting the entity is the first step. Next, we need
-to instantiate our tokenizer. In this case, we use the provided Google NLP
-tokenizer. Finally, we can calculate our mean distances. We can use
-`assocentity.Distances`, which accepts multiple texts. Notice
-how we pass `tokenize.ADJ` to only include adjectives as part of speech.
-Finally, we can take the mean by passing the result to `assocentity.Mean`.
-
-```go
-// Define texts source and entity
-texts := []string{
-	"Former Presidents Barack Obama, Bill Clinton and ...", // Truncated
-	"At the pentagon on the afternoon of 9/11, ...",
-	"Tony Blair moved swiftly to place his relationship with ...",
-}
-entities := []string{
-	"George Walker Bush",
-	"George Bush",
-	"Bush",
-}
-source := assocentity.NewSource(entities, texts)
-
-// Instantiate the NLP tokenizer (powered by Google)
-nlpTok := nlp.NewNLPTokenizer(credentialsFile, nlp.AutoLang)
-
-// Get the distances to adjectives
-ctx := context.TODO()
-dists, err := assocentity.Distances(ctx, nlpTok, tokenize.ADJ, source)
-if err != nil {
-	// Handle error
-}
-// Get the mean from the distances
-mean := assocentity.Mean(dists)
-```
-
-### Tokenization
-
-If you provide your own tokenizer you must implement the interface with the
-method `Tokenize` and the following signature:
-
-```go
-type Tokenizer interface {
-	Tokenize(ctx context.Context, text string) ([]Token, error)
-}
-```
-
-`Token` is of type:
-
-```go
-type Token struct {
-	PoS  PoS    // Part of speech
-	Text string // Text
-}
-
-// Part of speech
-type PoS int
-```
-
-For example, given the text:
-
-```go
-text := "Punchinello was burning to get me"
-```
-
-The result from `Tokenize` would be:
-
-```go
-[]Token{
-	{
-		Text: "Punchinello",
-		PoS:  tokenize.NOUN,
-	},
-	{
-		Text: "was",
-		PoS:  tokenize.VERB,
-	},
-	{
-		Text: "burning",
-		PoS:  tokenize.VERB,
-	},
-	{
-		Text: "to",
-		PoS:  tokenize.PRT,
-	},
-	{
-		Text: "get",
-		PoS:  tokenize.VERB,
-	},
-	{
-		Text: "me",
-		PoS:  tokenize.PRON,
-	},
-}
-```
-
-## CLI
-
-There is also a language-agnostic terminal version available for either Windows,
-Mac (Darwin) or Linux (only with 64-bit support) if you don't have Go available.
-The application expects the text from "stdin" and accepts the following flags:
-
-| Flag          | Description                                                                                       | Type     | Default |
-| ------------- | ------------------------------------------------------------------------------------------------- | -------- | ------- |
-| `entities`    | Define entities to be searched within input, example: `-entities="Max Payne,Payne"`               | `string` |         |
-| `gog-svc-loc` | Google Clouds NLP JSON service account file, example: `-gog-svc-loc="/home/max/gog-svc-loc.json"` | `string` |         |
-| `op`          | Operation to execute: `-op="mean"`                                                                | `string` | `mean`  |
-| `pos`         | Defines part of speeches to keep, example: `-pos=noun,verb,pron`                                  | `string` | `any`   |
-
-Example:
-
-```bash
-echo "Relax, Max. You're a nice guy." | ./bin/assocentity_linux_amd64_v13.0.0-0-g948274a-dirty -gog-svc-loc=/home/max/.config/assocentity/google-service.json -entities="Max Payne,Payne,Max"
-```
-
-The output is written to "stdout" in appropriate formats.
-
-## Projects using assocentity
-
-- [entityscrape](https://github.com/ndabAP/entityscrape) - Distance between word
-  types (default: adjectives) in news articles and persons
-
-## Author
-
-[Julian Claus](https://www.julian-claus.de) and contributors.
-
-## License
-
-MIT
+- Source.Tokenize() and Source.Sentiment() (different NLP API)
+- Which leader uses more swear words
+- Count(...args)
diff --git a/assocentity.go b/assocentity.go
@@ -1,13 +1,18 @@
 package assocentity
 
 import (
+	"container/list"
 	"context"
-	"math"
+	"slices"
+	"unique"
+
+	"github.com/ndabAP/assocentity/v14/tokenizer"
 )
 
 type (
+	// Tokens is the result of source.Tokenize. entities is assigned by
+	// Tokenize; texts holds one list per tokenized text, in processing order,
+	// whose elements are unique.Make handles of the tokens
 	Tokens struct {
-
+		entities []string
+		texts    []*list.List
 	}
 
 	// source wraps entities and texts
@@ -17,20 +22,45 @@ type (
 	}
 )
 
-// NewSource returns a new source, consisting of entities and texts
+// NewSource builds a source from the given entities and texts. Only the first
+// occurrence of each entity is kept; the order of the entities is preserved
 func NewSource(entities, texts []string) source {
-	entities = slices.Compact(entities)
+	deduped := make([]string, 0)
+	for i := range entities {
+		// Skip entities that were already collected
+		if slices.Contains(deduped, entities[i]) {
+			continue
+		}
+		deduped = append(deduped, entities[i])
+	}
 	return source{
-		Entities: entities,
+		Entities: deduped,
 		Texts:    texts,
 	}
 }
 
-func (s source) Tokenize(ctx context.Context, tokenizer Tokenizer) (Tokens, error) {
+// Tokenize runs the given tokenizer over every text of the source, in order.
+// The result carries the source's entities and, per text, one list whose
+// elements are unique.Make handles of the tokens returned for that text.
+//
+// The context is checked before each text is tokenized. If it is done, the
+// tokens gathered so far are returned together with ctx.Err(). An error from
+// the tokenizer likewise stops the loop and is returned with the partial
+// result
+func (s source) Tokenize(ctx context.Context, tokenizer tokenizer.Tokenizer) (Tokens, error) {
 	var tokens Tokens
-	if ctx.Done() {
-		return tokens, nil
-	}
 
+	tokens.entities = s.Entities
+	tokens.texts = make([]*list.List, 0, len(s.Texts))
+	for _, text := range s.Texts {
+		// Non-blocking cancellation check before each tokenizer call
+		select {
+		case <-ctx.Done():
+			return tokens, ctx.Err()
+		default:
+		}
+
+		tok, err := tokenizer.Tokenize(ctx, text)
+		if err != nil {
+			return tokens, err
+		}
 
-}
+		l := list.New()
+		for _, t := range tok {
+			l.PushBack(unique.Make(t))
+		}
+
+		tokens.texts = append(tokens.texts, l)
+	}
+
+	return tokens, nil
+}