Skip to content

Commit

Permalink
chore: wip
Browse files Browse the repository at this point in the history
  • Loading branch information
ndabAP committed Jan 26, 2025
1 parent 437ef24 commit 72081d1
Show file tree
Hide file tree
Showing 13 changed files with 173 additions and 80 deletions.
4 changes: 0 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +0,0 @@
.idea/
.vscode/
*.json
.env
26 changes: 26 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "NLP test",
"type": "go",
"request": "launch",
"mode": "test",
"program": "${workspaceFolder}/tokenize/nlp",
"env": {
"GOOGLE_NLP_CREDS_PATH": "${input:google_nlp_creds_path}"
},
"args": []
}
],
"inputs": [
{
"id": "google_nlp_creds_path",
"description": "Google Natural Language API credentials path",
"type": "promptString"
}
]
}
4 changes: 2 additions & 2 deletions source_analyses.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func (s source) Analyses(
default:
}

analysis, err := tokenizer.Tokenize(ctx, entity, tokenize.Syntax)
analysis, err := tokenizer.Tokenize(ctx, entity, tokenize.FeatureSyntax)
if err != nil {
return analyses, err
}
Expand Down Expand Up @@ -164,7 +164,7 @@ func (s source) find(
break
}

if *w != *v {
if w != v {
i = 0
found = false
s()
Expand Down
7 changes: 7 additions & 0 deletions source_analyses_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package assocentity

import "testing"

// TestAnalyses is a work-in-progress stub: the test-table type is
// declared, but no cases are populated or executed yet.
func TestAnalyses(t *testing.T) {
	type test struct{}
}
6 changes: 2 additions & 4 deletions tokenize/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@ package tokenize

import (
v1 "cloud.google.com/go/language/apiv1beta2/languagepb"
v2 "cloud.google.com/go/language/apiv2/languagepb"
)

type (
Analysis struct {
Sentiment *v2.Sentiment

Tokens []*Token
Sentiment *Sentiment
Tokens []*Token
}
)

Expand Down
6 changes: 3 additions & 3 deletions tokenize/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ import (
// Feature is a bit mask selecting which analyses a Tokenizer performs.
type Feature int

const (
	// FeatureAll combines every available feature.
	FeatureAll Feature = FeatureSyntax | FeatureSentiment

	// NOTE: FeatureSyntax is the second spec in this block, so iota is
	// 1 here: FeatureSyntax = 1<<1 = 2 and FeatureSentiment = 1<<2 = 4.
	// Bit 0 is intentionally left unused.
	FeatureSyntax Feature = 1 << iota
	FeatureSentiment
)

// Tokenizer tokenizes a text according to the defined features
Expand Down
11 changes: 6 additions & 5 deletions tokenize/nlp/retry/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ func Retry(ctx context.Context, req func() error) error {
// growing delay
const (
retries = 6
backoff = 180.0
backoff = 180 // In seconds
)

var (
delay = 1 // In seconds
try = 0
delay = 1 // In seconds
)
for {
select {
Expand All @@ -46,9 +46,10 @@ func Retry(ctx context.Context, req func() error) error {

// Exponentially back-off
time.Sleep(time.Second * time.Duration(delay))
if delay < backoff {
delay = delay*delay + rand.IntN(10-1) + 1 // x² + jitter
}
delay = min(
backoff,
delay*delay+rand.IntN(10-1)+1, // delay² + jitter[1, 10)
)

try++
continue
Expand Down
72 changes: 41 additions & 31 deletions tokenize/nlp/tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,20 @@ import (
"golang.org/x/sync/errgroup"
)

// AutoLang instructs the tokenizer to let the Natural Language API
// detect the text's language automatically.
const AutoLang = "auto"

// nlp tokenizes a text using Google's Natural Language AI
type nlp struct {
creds string
feats tokenize.Feature
lang string
}

// NewNLP returns a new Google Natural Language AI tokenizer instance. Note that
// NLP has a built-in retrier
func NewNLP(creds, lang string, feats tokenize.Feature) tokenize.Tokenizer {
func NewNLP(creds, lang string) tokenize.Tokenizer {
return nlp{
creds: creds,
feats: feats,
lang: lang,
}
}
Expand All @@ -30,44 +31,53 @@ func NewNLP(creds, lang string, feats tokenize.Feature) tokenize.Tokenizer {
func (nlp nlp) Tokenize(ctx context.Context, text string, feats tokenize.Feature) (tokenize.Analysis, error) {
analysis := tokenize.Analysis{}

g, ctx := errgroup.WithContext(ctx)
fns := make([]func() error, 0)

// Analyse syntax
if feats&tokenize.Syntax != 0 {
g.Go(func() error {
res, err := v1.NewV1(nlp.creds, nlp.lang).Annotations(ctx, text)
if err != nil {
return err
}
syntaxfn := func() error {
res, err := v1.NewV1(nlp.creds, nlp.lang).Syntax(ctx, text)
if err != nil {
return err
}

tokens := make([]*tokenize.Token, 0)
for _, token := range res.GetTokens() {
tokens = append(tokens, &tokenize.Token{
Text: token.GetText(),
PartOfSpeech: token.GetPartOfSpeech(),
DependencyEdge: token.GetDependencyEdge(),
Lemma: token.GetLemma(),
})
}

analysis.Tokens = tokens
return nil
})
analysis.Tokens = res.GetTokens()
return nil
}
if feats&tokenize.FeatureSyntax != 0 {
fns = append(fns, syntaxfn)
}

// Analyse sentiment
if feats&tokenize.Sentiment != 0 {
g.Go(func() error {
res, err := v2.NewV2(nlp.creds, nlp.lang).Sentiment(ctx, text)
if err != nil {
return err
var (
v2feats v2.Features
annotatefn = func(feats v2.Features) func() error {
return func() error {
res, err := v2.NewV2(nlp.creds, nlp.lang).Annotate(ctx, text, feats)
if err != nil {
return err
}

analysis.Sentiment = res.GetDocumentSentiment()
return nil
}
}
)
if feats&tokenize.FeatureSentiment != 0 {
v2feats.ExtractSentiment = true
}
if feats&tokenize.FeatureSentiment != 0 {
fns = append(fns, annotatefn(v2feats))
}

analysis.Sentiment = res.GetDocumentSentiment()
return nil
})
// All features
if feats == tokenize.FeatureAll {
fns = []func() error{syntaxfn, annotatefn(v2feats)}
}

g, ctx := errgroup.WithContext(ctx)
for _, fn := range fns {
g.Go(fn)
}
if err := g.Wait(); err != nil {
return analysis, err
}
Expand Down
57 changes: 57 additions & 0 deletions tokenize/nlp/tokenizer_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package nlp

import (
"context"
"os"
"slices"
"testing"

"github.com/ndabAP/assocentity/v15/tokenize"
)

// TestTokenize runs the tokenizer against the live Google Natural
// Language API, so it needs network access and credentials. It is
// skipped under -short and when GOOGLE_NLP_CREDS_PATH is unset.
func TestTokenize(t *testing.T) {
	if testing.Short() {
		t.SkipNow()
	}

	var (
		ctx   = context.Background()
		creds = os.Getenv("GOOGLE_NLP_CREDS_PATH")

		text = "You can't win this one, Max."
	)
	// Without credentials the API calls can only fail; skip instead of
	// reporting a spurious failure.
	if creds == "" {
		t.Skip("GOOGLE_NLP_CREDS_PATH not set")
	}

	t.Run("syntax", func(t *testing.T) {
		nlp := NewNLP(creds, AutoLang)
		analysis, err := nlp.Tokenize(ctx, text, tokenize.FeatureSyntax)
		if err != nil {
			t.Fatal(err)
		}

		var (
			want = []string{
				"You", "ca", "n't", "win", "this", "one", ",", "Max", ".",
			}
			got = make([]string, 0, len(want))
		)
		for _, token := range analysis.Tokens {
			got = append(got, token.Text.Content)
		}

		// Type parameters are inferred; spelling them out is redundant.
		if !slices.Equal(got, want) {
			t.Errorf("got %v, want %v", got, want)
		}
	})

	t.Run("sentiment", func(t *testing.T) {
		nlp := NewNLP(creds, AutoLang)
		analysis, err := nlp.Tokenize(ctx, text, tokenize.FeatureSentiment)
		if err != nil {
			t.Fatal(err)
		}

		if analysis.Sentiment == nil {
			t.Error("sentiment analysis is nil")
		}
	})
}
18 changes: 6 additions & 12 deletions tokenize/nlp/v1/req.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@ import (
"google.golang.org/api/option"
)

// AutoLang tries to automatically recognize the language
const AutoLang = "auto"

type api struct {
creds string
lang string
Expand All @@ -24,10 +21,10 @@ func NewV1(creds, lang string) api {
}
}

func (v1 api) Annotations(ctx context.Context, text string) (*languagepb.AnnotateTextResponse, error) {
func (v1 api) Syntax(ctx context.Context, text string) (*languagepb.AnalyzeSyntaxResponse, error) {
client, err := language.NewClient(ctx, option.WithCredentialsFile(v1.creds))
if err != nil {
return &languagepb.AnnotateTextResponse{}, err
return &languagepb.AnalyzeSyntaxResponse{}, err
}
defer client.Close()

Expand All @@ -37,18 +34,15 @@ func (v1 api) Annotations(ctx context.Context, text string) (*languagepb.Annotat
},
Type: languagepb.Document_PLAIN_TEXT,
}
if v1.lang != AutoLang {
if v1.lang != "auto" {
doc.Language = v1.lang
}

var res *languagepb.AnnotateTextResponse
var res *languagepb.AnalyzeSyntaxResponse
if err := retry.Retry(ctx, func() error {
res, err = client.AnnotateText(ctx, &languagepb.AnnotateTextRequest{
Document: doc,
Features: &languagepb.AnnotateTextRequest_Features{
ExtractSyntax: true,
},
res, err = client.AnalyzeSyntax(ctx, &languagepb.AnalyzeSyntaxRequest{
EncodingType: languagepb.EncodingType_UTF8,
Document: doc,
})

return err
Expand Down
26 changes: 16 additions & 10 deletions tokenize/nlp/v2/req.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,26 @@ import (
"google.golang.org/api/option"
)

// AutoLang tries to automatically recognize the language
const AutoLang = "auto"

// api holds the configuration for requests against the v2 endpoint of
// the Google Natural Language API.
type api struct {
	creds string // path to the credentials file (passed to option.WithCredentialsFile)
	lang  string // language code set on the document, or "auto" for API-side detection
}

// Features selects which annotations an Annotate request extracts.
type Features struct {
	// ExtractSentiment requests document-level sentiment analysis.
	ExtractSentiment bool
}

// NewV2 returns a client configured for the v2 Natural Language API
// with the given credentials file path and language code.
func NewV2(creds, lang string) api {
	return api{creds: creds, lang: lang}
}

func (v2 api) Sentiment(ctx context.Context, text string) (*languagepb.AnalyzeSentimentResponse, error) {
func (v2 api) Annotate(ctx context.Context, text string, feats Features) (*languagepb.AnnotateTextResponse, error) {
client, err := language.NewClient(ctx, option.WithCredentialsFile(v2.creds))
if err != nil {
return &languagepb.AnalyzeSentimentResponse{}, err
return &languagepb.AnnotateTextResponse{}, err
}
defer client.Close()

Expand All @@ -37,15 +38,20 @@ func (v2 api) Sentiment(ctx context.Context, text string) (*languagepb.AnalyzeSe
},
Type: languagepb.Document_PLAIN_TEXT,
}
if v2.lang != AutoLang {
if v2.lang != "auto" {
doc.LanguageCode = v2.lang
}

var res *languagepb.AnalyzeSentimentResponse
var res *languagepb.AnnotateTextResponse
if err := retry.Retry(ctx, func() error {
res, err = client.AnalyzeSentiment(ctx, &languagepb.AnalyzeSentimentRequest{
Document: doc,
EncodingType: languagepb.EncodingType_UTF8,
f := &languagepb.AnnotateTextRequest_Features{}
if feats.ExtractSentiment {
f.ExtractDocumentSentiment = true
}

res, err = client.AnnotateText(ctx, &languagepb.AnnotateTextRequest{
Document: doc,
Features: f,
})

return err
Expand Down
5 changes: 5 additions & 0 deletions tokenize/sentiment.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package tokenize

import v2 "cloud.google.com/go/language/apiv2/languagepb"

// Sentiment aliases the Google Natural Language API v2 Sentiment
// message, so callers of this package need not import languagepb.
type Sentiment = v2.Sentiment
Loading

0 comments on commit 72081d1

Please sign in to comment.