Commit 3849647
chore: wip
ndabAP committed Jan 25, 2025
1 parent 3a7ec5d commit 3849647
Showing 7 changed files with 65 additions and 49 deletions.
35 changes: 26 additions & 9 deletions lookup_table.go
@@ -8,29 +8,46 @@ import (
 
 type (
 	LookupTable struct {
-		entities map[string][]*tokenize.Token // { "Max Payne": ["Max", "Payne"] }
-		texts    []tokenize.Analyzes          // [["Relax", ",", "Max", "."]]
+		entities map[string][]*tokenize.Token
 
-		lookup []map[string][]pair
+		analyses []tokenize.Analyses
+		lookup   []map[string]pairs
 	}
 
-	pair [2]int
+	pair  [2]int
+	pairs []pair
 
 	iterfunc func(tokens []*tokenize.Token) iter.Seq2[int, *tokenize.Token]
 )
 
-func (table LookupTable) at(i int) (map[string][]pair, bool) {
+func (table LookupTable) at(i int) (map[string]pairs, bool) {
 	if i >= len(table.lookup) {
 		return nil, false
 	}
 
 	return table.lookup[i], true
 }
 
-func (t pair) at(i int) (int, bool) {
-	if i >= len(t) {
-		return 0, false
+func (p pair) at(i int) int {
+	return p[i]
+}
+
+func (p pairs) heads() iter.Seq[int] {
+	return func(yield func(int) bool) {
+		for _, pair := range p {
+			if !yield(pair.at(0)) {
+				return
+			}
+		}
 	}
+}
 
-	return t[i], true
+func (p pairs) tails() iter.Seq[int] {
+	return func(yield func(int) bool) {
+		for _, pair := range p {
+			if !yield(pair.at(1)) {
+				return
+			}
+		}
+	}
 }
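The new heads and tails methods follow Go 1.23's range-over-func convention (iter.Seq), so callers can range directly over entity start or end positions. A minimal, self-contained sketch of the same pattern; pair and pairs are replicas here, since the commit keeps them unexported:

package main

import (
	"fmt"
	"iter"
)

// Replicas of the commit's unexported types: a pair holds an entity
// occurrence's start and end token index, pairs collects every occurrence.
type pair [2]int
type pairs []pair

// heads yields each occurrence's start index, mirroring pairs.heads above.
func (p pairs) heads() iter.Seq[int] {
	return func(yield func(int) bool) {
		for _, pr := range p {
			if !yield(pr[0]) {
				return // consumer stopped early
			}
		}
	}
}

func main() {
	occurrences := pairs{{2, 3}, {7, 8}} // hypothetical coordinates
	for head := range occurrences.heads() {
		fmt.Println(head) // prints 2, then 7
	}
}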
17 changes: 6 additions & 11 deletions lookup_table_vecs.go
@@ -7,18 +7,16 @@ import (
 	"github.com/ndabAP/assocentity/v15/tokenize"
 )
 
-func (table LookupTable) Vecs() {
+func (table LookupTable) Vecs() map[*tokenize.Token][]int {
 	vecs := make(map[*tokenize.Token][]int)
-	// For each text
-	for i, analyzes := range table.texts {
-		tokens := analyzes.Tokens
+	for i, analyses := range table.analyses {
		lookup, ok := table.at(i)
 		if !ok {
 			continue
 		}
 
 		next, stop := iter.Pull(maps.Values(lookup))
-		for j, token := range tokens {
+		for j, token := range analyses.Tokens {
 			if token == tokenize.NilToken {
 				continue
 			}
@@ -27,16 +25,13 @@ func (table LookupTable) Vecs() {
 			if !ok {
 				continue
 			}
-			for _, t := range tuple {
-				i, ok := t.at(0)
-				if !ok {
-					continue
-				}
 
+			for i := range tuple.heads() {
 				vecs[token] = append(vecs[token], i-j)
 			}
 		}
 
 		stop()
 	}
 
+	return vecs
 }
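Vecs now returns the map it builds. Each recorded value is i-j: an entity occurrence's head index i (yielded by heads) minus the current token index j, i.e. a signed token distance. A toy illustration of just that arithmetic, reusing the "Relax, Max." example from the package's comments; this is not the library API, only the distance rule:

package main

import "fmt"

func main() {
	// Tokens: ["Relax", ",", "Max", "."]; assume the entity "Max" has its
	// head at index 2.
	tokens := []string{"Relax", ",", "Max", "."}
	entityHead := 2
	for j, tok := range tokens {
		if j == entityHead {
			continue // skip the entity's own position for illustration
		}
		fmt.Printf("%q -> %d\n", tok, entityHead-j) // "Relax" -> 2, "," -> 1, "." -> -1
	}
}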
4 changes: 0 additions & 4 deletions source.go
@@ -2,8 +2,6 @@ package assocentity
 
 import (
 	"slices"
-
-	"github.com/ndabAP/assocentity/v15/tokenize"
 )
 
 type (
@@ -12,8 +10,6 @@ type (
 		Entities []string // ["Max Payne", "Max", "Payne"]
 		Texts    []string // ["Relax, Max."]
 	}
-
-	WithMut func(*tokenize.Token) *tokenize.Token
 )
 
 // NewSource returns a new source, consisting of entities and texts. Duplicate
46 changes: 27 additions & 19 deletions source_lookup_table.go
@@ -9,12 +9,10 @@ import (
 	"github.com/ndabAP/assocentity/v15/tokenize"
 )
 
-// LookupTable tokenizes the text with the given tokenizer, and optionally
-// mutates a token with the given mutator. If a mutator returns nil, the token
-// will be an empty token to keep dependencies in sync.
+type (
+	WithMut func(*tokenize.Token) *tokenize.Token
+)
+
 // It returns a LookupTable with the tokenized entities and texts as well as
 // the index-based coordinates of each entity
 func (s source) LookupTable(
 	ctx context.Context,
 	tokenizer tokenize.Tokenizer,
@@ -41,20 +39,20 @@
 	}
 
 	// Texts
-	table.texts = make([]tokenize.Analyzes, len(s.Texts))
+	table.analyses = make([]tokenize.Analyses, len(s.Texts))
 	for i, text := range s.Texts {
 		select {
 		case <-ctx.Done():
 			return table, ctx.Err()
 		default:
 		}
 
-		analyzes, err := tokenizer.Tokenize(ctx, text, feats)
+		analyses, err := tokenizer.Tokenize(ctx, text, feats)
 		if err != nil {
 			return table, err
 		}
 
-		for _, t := range analyzes.Tokens {
+		for _, t := range analyses.Tokens {
 			for _, f := range mut {
 				t = f(t)
 			}
@@ -64,31 +62,39 @@
 				t = tokenize.NilToken
 			}
 
-			table.texts[i].Tokens = append(table.texts[i].Tokens, t)
+			table.analyses[i].Tokens = append(table.analyses[i].Tokens, t)
 		}
 	}
 
 	// Lookup table
-	table.lookup = make([]map[string][]pair, 0, len(table.texts))
-	for _, analyzes := range table.texts {
-		tokens := analyzes.Tokens
-		lookup := make(map[string][]pair, len(table.entities))
-
-		i := 0
+	table.lookup = make([]map[string]pairs, 0, len(table.analyses))
+	for _, analyses := range table.analyses {
+		tokens := analyses.Tokens
+
+		var (
+			lookup = make(map[string]pairs, len(table.entities))
+			i      = 0
+		)
 		for {
 			if i == len(tokens)-1 {
 				break
 			}
 
-			if entity, buf, j := s.cmp(tokens[i:], entities, slices.All); j > -1 {
+			if entity, t, j := s.cmp(
+				tokens[i:],
+				entities,
+				slices.All,
+			); j > -1 {
 				if j == -1 {
 					continue
 				}
 
-				entities[entity] = buf
+				// Replace entities with actual tokens
+				entities[entity] = t
 
-				t := pair{i, i + j}
-				lookup[entity] = append(lookup[entity], t)
+				// Append entity coordinates
+				p := pair{i, i + j}
+				lookup[entity] = append(lookup[entity], p)
 
 				// Skip entity
 				switch j {
@@ -104,6 +110,7 @@
 			i++
 		}
 
+		table.entities = entities
 		table.lookup = append(table.lookup, lookup)
 	}
 
@@ -126,6 +133,7 @@ func (s source) cmp(
 			break
 		}
 
+		// Entity buffer
 		buf := make([]*tokenize.Token, 0, len(tok))
 
 		// Entity iterator
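WithMut moves next to its only consumer. Per the doc comment, a mutator that returns nil blanks the token (NilToken) instead of removing it, so token indices stay aligned across texts. A hedged sketch with stand-in types; only the Lemma field is visible in this commit, and the punctuation rule is an assumption:

package main

import (
	"fmt"
	"strings"
)

// Stand-ins for tokenize.Token and WithMut; the real types live in
// github.com/ndabAP/assocentity/v15/tokenize.
type Token struct{ Lemma string }
type WithMut func(*Token) *Token

func main() {
	// Drop punctuation by returning nil (LookupTable would substitute
	// NilToken), lowercase everything else.
	var clean WithMut = func(t *Token) *Token {
		if t.Lemma == "," || t.Lemma == "." {
			return nil
		}
		t.Lemma = strings.ToLower(t.Lemma)
		return t
	}

	for _, t := range []*Token{{"Relax"}, {","}, {"Max"}, {"."}} {
		fmt.Println(clean(t)) // &{relax} <nil> &{max} <nil>
	}
}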
2 changes: 1 addition & 1 deletion tokenize/interface.go
@@ -15,5 +15,5 @@ const (
 
 // Tokenizer tokenizes a text and entities
 type Tokenizer interface {
-	Tokenize(ctx context.Context, text string, feats Feature) (Analyzes, error)
+	Tokenize(ctx context.Context, text string, feats Feature) (Analyses, error)
 }
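Only the return type changes here. For context, the contract is small enough that a toy implementation fits in a few lines: a hypothetical whitespace tokenizer, built only from the exported names this commit shows (Tokenizer, Feature, Analyses, Token.Lemma):

package main

import (
	"context"
	"fmt"
	"strings"

	"github.com/ndabAP/assocentity/v15/tokenize"
)

type whitespace struct{}

// Tokenize splits on whitespace only; it fills in neither sentiment nor
// dependency information.
func (whitespace) Tokenize(ctx context.Context, text string, feats tokenize.Feature) (tokenize.Analyses, error) {
	var a tokenize.Analyses
	for _, field := range strings.Fields(text) {
		a.Tokens = append(a.Tokens, &tokenize.Token{Lemma: field})
	}
	return a, nil
}

// Compile-time check against the interface.
var _ tokenize.Tokenizer = whitespace{}

func main() {
	var feats tokenize.Feature // zero value; real callers pass package constants
	a, _ := whitespace{}.Tokenize(context.Background(), "Relax, Max.", feats)
	fmt.Println(len(a.Tokens)) // 2
}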
4 changes: 2 additions & 2 deletions tokenize/nlp/tokenizer.go
@@ -30,8 +30,8 @@ func NewNLP(creds, lang string, feats tokenize.Feature) tokenize.Tokenizer {
 }
 
 // Tokenize tokenizes a text
-func (nlp *nlp) Tokenize(ctx context.Context, text string, feats tokenize.Feature) (tokenize.Analyzes, error) {
-	analyzes := tokenize.Analyzes{}
+func (nlp *nlp) Tokenize(ctx context.Context, text string, feats tokenize.Feature) (tokenize.Analyses, error) {
+	analyzes := tokenize.Analyses{}
 
 	g, ctx := errgroup.WithContext(ctx)
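The hunk's context shows the NLP tokenizer fanning work out via errgroup.WithContext. A self-contained sketch of that pattern; the two job names echo the Analyses fields (Tokens, Sentiment), the rest is illustrative:

package main

import (
	"context"
	"fmt"

	"golang.org/x/sync/errgroup"
)

func main() {
	// Run analyses concurrently and fail fast: the derived ctx is canceled
	// as soon as any goroutine returns a non-nil error.
	g, ctx := errgroup.WithContext(context.Background())
	results := make([]string, 2)
	for i, job := range []string{"tokens", "sentiment"} {
		g.Go(func() error {
			select {
			case <-ctx.Done():
				return ctx.Err()
			default:
				results[i] = job + ": done"
				return nil
			}
		})
	}
	if err := g.Wait(); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(results)
}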
6 changes: 3 additions & 3 deletions tokenize/token.go
@@ -13,7 +13,7 @@ type (
 		Lemma string
 	}
 
-	Analyzes struct {
+	Analyses struct {
 		Sentiment *v2.Sentiment
 
 		Tokens []*Token
@@ -22,7 +22,7 @@ type (
 
 var NilToken = &Token{}
 
-func (analyzes Analyzes) Relationship(token *Token) *[]v1.DependencyEdge_Label {
+func (analyses Analyses) Relationship(token *Token) *[]v1.DependencyEdge_Label {
 	if token == nil {
 		return nil
 	}
@@ -31,7 +31,7 @@ func (analyzes Analyzes) Relationship(token *Token) *[]v1.DependencyEdge_Label {
 	}
 
 	labels := make([]v1.DependencyEdge_Label, 0)
-	for _, t := range analyzes.Tokens {
+	for _, t := range analyses.Tokens {
 		if t == NilToken {
 			continue
 		}
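Beyond the rename, the loop depends on NilToken being a package-level sentinel pointer: t == NilToken is an identity check, which is why mutators must blank tokens with that exact value rather than a fresh &Token{}. A reduced sketch of the pattern:

package main

import "fmt"

type Token struct{ Lemma string }

// Sentinel as in tokenize: all blanked tokens share this one pointer.
var NilToken = &Token{}

func main() {
	tokens := []*Token{{"Relax"}, NilToken, {"Max"}, NilToken}
	kept := make([]string, 0, len(tokens))
	for _, t := range tokens {
		if t == NilToken { // pointer identity, not deep equality
			continue
		}
		kept = append(kept, t.Lemma)
	}
	fmt.Println(kept) // [Relax Max]
}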
