
chore: wip
ndabAP committed Jan 24, 2025
1 parent 7393adc commit 550d40d
Showing 8 changed files with 195 additions and 136 deletions.
1 change: 0 additions & 1 deletion loc.go

This file was deleted.

21 changes: 21 additions & 0 deletions lookup_table.go
@@ -0,0 +1,21 @@
package assocentity

import (
"iter"

"github.com/ndabAP/assocentity/v15/tokenize"
)

type (
// LookupTable relates tokenized entities to multiple tokenized texts
LookupTable struct {
entities map[string]tokenize.Tokens // { "Max Payne": ["Max", "Payne"] }
texts []tokenize.Tokens // [["Relax", ",", "Max", "."]]

lookup map[string][][]int // { "Max Payne": [[5, 13], [17, 25]] }
}

// iterfunc makes it easy to pass arbitrary iterators over a sequence of
// tokens
iterfunc func(entities []*tokenize.Token) iter.Seq2[int, *tokenize.Token]
)
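
Example (not part of this commit): a minimal sketch of the shape the three LookupTable fields take for one entity and one text. Plain string slices stand in for *tokenize.Token values here, and the index pair in lookup is hypothetical.

package main

import "fmt"

func main() {
	// entities maps the raw entity string to its tokenized form
	entities := map[string][]string{
		"Max Payne": {"Max", "Payne"},
	}
	// texts holds one token slice per input text
	texts := [][]string{
		{"Relax", ",", "Max", ".", "Max", "Payne", "is", "back", "."},
	}
	// lookup records, per entity, index pairs describing where the entity
	// was found in a text (hypothetical values)
	lookup := map[string][][]int{
		"Max Payne": {{4, 5}},
	}
	fmt.Println(entities, texts, lookup)
}
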
59 changes: 4 additions & 55 deletions tokens_vec.go → lookup_table_locs.go
@@ -1,63 +1,12 @@
package assocentity

import (
"iter"
"slices"

"github.com/ndabAP/assocentity/v15/tokenize"
)

func (tokens Tokens) Locs() {
cmp := func(text []*tokenize.Token, iterator iterfunc) int {
var i int = 0
next, stop := iter.Pull(slices.Values(tokens.entities))
defer stop()
for {
found := true
tok, ok := next()
if !ok {
break
}

// Entity iterator
n, s := iter.Pull2(iterator(tok))
// Text iterator
m, t := iter.Pull2(iterator(text))
for {
_, u, ok := m()
if !ok {
s()
t()
break
}
j, v, ok := n()
if !ok {
s()
t()
break
}

// If text and entity iterator values don't match, continue
// with next entity
if u.Text != v.Text {
i = 0
found = false
s()
t()
break
}

i = j
}

if found {
return i
}
}

return -1
}

func (tokens LookupTable) Vecs() {
vecs := make(map[*tokenize.Token][]int)
// For each text
for _, text := range tokens.texts {
@@ -68,7 +17,7 @@ func (tokens Tokens) Locs() {
break
}

if j := cmp(text[i:], slices.All); j > -1 {
if j := entitycmp(text[i:], slices.All); j > -1 {
// Skip entity
switch j {
case -1:
@@ -84,7 +33,7 @@ func (tokens Tokens) Locs() {
token := text[i]
// Search for entity, starting from text token
for k := range slices.All(text[i:]) {
if l := cmp(text[i:], slices.All); l > -1 {
if l := entitycmp(text[i:], slices.All); l > -1 {
// Found entity
vecs[token] = append(vecs[token], i+k)

@@ -110,7 +59,7 @@ func (tokens Tokens) Locs() {
break
}

if j := cmp(text[:l], slices.Backward); j > -1 {
if j := entitycmp(text[:l], slices.Backward); j > -1 {
// Found entity, skip
// Skip entity
switch j {
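
Example (not part of this commit): a self-contained sketch of the pull-iterator comparison idea behind cmp/entitycmp, here matching an entity token sequence against the head of a text token sequence with plain strings (assumes Go 1.23+ for iter and slices.All; names are illustrative).

package main

import (
	"fmt"
	"iter"
	"slices"
)

// matchLen reports the index of the last matched entity token when entity is
// a non-empty prefix of text, and -1 otherwise.
func matchLen(text, entity []string, iterator func([]string) iter.Seq2[int, string]) int {
	next, stop := iter.Pull2(iterator(entity))
	defer stop()
	textNext, textStop := iter.Pull2(iterator(text))
	defer textStop()

	i := -1
	for {
		j, want, ok := next()
		if !ok {
			return i // all entity tokens matched
		}
		_, got, ok := textNext()
		if !ok || got != want {
			return -1 // text exhausted or token mismatch
		}
		i = j
	}
}

func main() {
	text := []string{"Max", "Payne", "is", "back"}
	entity := []string{"Max", "Payne"}
	fmt.Println(matchLen(text, entity, slices.All)) // prints 1
}
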
57 changes: 0 additions & 57 deletions source_coords.go

This file was deleted.

158 changes: 158 additions & 0 deletions source_lookup_table.go
@@ -0,0 +1,158 @@
package assocentity

import (
"context"
"iter"
"maps"
"slices"

"github.com/ndabAP/assocentity/v15/tokenize"
)

// LookupTable tokenizes the texts with the given tokenizer and optionally
// mutates each token with the given mutators. If a mutator returns nil, the
// token is replaced with an empty token to keep dependencies in sync.
//
// It returns a LookupTable with the tokenized entities and texts as well as
// the index-based coordinates of each entity.
func (s source) LookupTable(
ctx context.Context,
tokenizer tokenize.Tokenizer,
mut ...WithMut,
) (LookupTable, error) {
var table LookupTable

// Entities
table.entities = make(map[string]tokenize.Tokens, len(s.Entities))
for _, entity := range s.Entities {
select {
case <-ctx.Done():
return table, ctx.Err()
default:
}

tok, err := tokenizer.Tokenize(ctx, entity)
if err != nil {
return table, err
}

table.entities[entity] = append(table.entities[entity], tok...)
}

// Texts
table.texts = make([]tokenize.Tokens, len(s.Texts))
for i, text := range s.Texts {
select {
case <-ctx.Done():
return table, ctx.Err()
default:
}

tok, err := tokenizer.Tokenize(ctx, text)
if err != nil {
return table, err
}

for _, t := range tok {
for _, m := range mut {
t = m(t)
}
// nil mutations result in an empty token so dependencies stay in
// sync
if t == nil {
t = &tokenize.Token{}
}

table.texts[i] = append(table.texts[i], t)
}
}

// Lookup table
table.lookup = make(map[string][][]int)
for _, tokens := range table.texts {
i := 0
for {
if i == len(tokens)-1 {
break
}

if entity, j := s.cmp(tokens[i:], table.entities, slices.All); j > -1 {
if j == -1 {
continue
}

table.lookup[entity] = append(table.lookup[entity], []int{i, i + j})

// Skip entity
switch j {
case 0:
i += 1
default:
i += j
}

continue
}

i++
}
}

return table, nil
}

func (s source) cmp(
text []*tokenize.Token,
entities map[string]tokenize.Tokens,
iterator iterfunc,
) (string, int) {
// i holds the final index of the matched entity tokens; -1 is returned if no entity matches
var i int = 0
next, stop := iter.Pull2(maps.All(entities))
defer stop()
for {
found := true
entity, tok, ok := next()
if !ok {
break
}

// Entity iterator
n, s := iter.Pull2(iterator(tok))
// Text iterator
m, t := iter.Pull2(iterator(text))
for {
// If no entity is left, cancel
j, v, ok := n()
if !ok {
s()
t()
break
}
// If no text is left, cancel
_, u, ok := m()
if !ok {
s()
t()
break
}

if u.Text != v.Text {
// Continue with next entity
i = 0
found = false
s()
t()
break
}

i = j
}

if found {
return entity, i
}
}

return "", -1
}
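
Example (not part of this commit): a hedged usage sketch of the new LookupTable method. NewSource, delimiter.New and LookupTable appear in this repository and its tests below; the two import paths are assumptions and error handling is trimmed.

package main

import (
	"context"
	"fmt"

	assocentity "github.com/ndabAP/assocentity/v15" // assumed root import path
	"github.com/ndabAP/assocentity/v15/tokenize/delimiter" // assumed package path for the delimiter tokenizer
)

func main() {
	src := assocentity.NewSource(
		[]string{"Max Payne"},                      // entities
		[]string{"Relax, Max. Max Payne is back."}, // texts
	)
	// Whitespace tokenizer, as used in source_test.go
	tok := delimiter.New(func(r rune) bool { return r == ' ' })

	table, err := src.LookupTable(context.Background(), tok)
	if err != nil {
		fmt.Println("tokenize failed:", err)
		return
	}
	// The table's fields are unexported; methods such as Vecs operate on it
	fmt.Printf("%+v\n", table)
}
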
10 changes: 5 additions & 5 deletions source_test.go
@@ -71,7 +71,7 @@ func TestSource_Tokenize(t *testing.T) {
texts []string
tokenizer tokenize.Tokenizer
mut []WithMut
want Tokens
want LookupTable
wantErr bool
}

@@ -81,7 +81,7 @@ func TestSource_Tokenize(t *testing.T) {
entities: []string{},
texts: []string{},
tokenizer: delimiter.New(nil),
want: Tokens{entities: []tokenize.Tokens{}, texts: []tokenize.Tokens{}},
want: LookupTable{entities: []tokenize.Tokens{}, texts: []tokenize.Tokens{}},
},
{
name: "only entities",
@@ -90,7 +90,7 @@ func TestSource_Tokenize(t *testing.T) {
tokenizer: delimiter.New(func(r rune) bool {
return r == ' '
}),
want: Tokens{
want: LookupTable{
entities: []tokenize.Tokens{
{
{Text: &languagepb.TextSpan{Content: "Max"}},
@@ -118,7 +118,7 @@ func TestSource_Tokenize(t *testing.T) {
return false
}
}),
want: Tokens{
want: LookupTable{
entities: []tokenize.Tokens{},
texts: []tokenize.Tokens{
{
@@ -135,7 +135,7 @@ func TestSource_Tokenize(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
s := NewSource(tt.entities, tt.texts)
got, err := s.Coords(context.Background(), tt.tokenizer, tt.mut...)
got, err := s.LookupTable(context.Background(), tt.tokenizer, tt.mut...)
if (err != nil) != tt.wantErr {
t.Errorf("Tokenize() error = %v, wantErr %v", err, tt.wantErr)
return
8 changes: 7 additions & 1 deletion tokenize/token.go
@@ -15,8 +15,8 @@ type (
Tokens []*Token
)

func (tokens Tokens) Dep(i int) *Token {
// Dependency returns the first token whose dependency edge points at the
// given token index as returned by the Tokenizer. Tokens with a nil
// DependencyEdge are skipped.
func (tokens Tokens) Dependency(i int) *Token {
for _, token := range tokens {
if token.DependencyEdge == nil {
continue
}

if token.DependencyEdge.HeadTokenIndex == int32(i) {
return token
}
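
Example (not part of this commit): a hedged fragment showing how the renamed method could be called, assuming tokens is a tokenize.Tokens value whose tokens carry a DependencyEdge populated by the tokenizer; the tail of the function is not shown in this diff, so the no-match return value is not asserted here.

// Finds the first token whose DependencyEdge.HeadTokenIndex equals 2;
// tokens with a nil DependencyEdge are skipped.
if dep := tokens.Dependency(2); dep != nil {
	fmt.Printf("token depending on index 2: %v\n", dep)
}
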