-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
195 additions
and
136 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package assocentity | ||
|
||
import ( | ||
"iter" | ||
|
||
"github.com/ndabAP/assocentity/v15/tokenize" | ||
) | ||
|
||
type (
	// LookupTable wraps entities with multiple texts.
	//
	// It holds the tokenized entities and texts together with the
	// index-based coordinates of each entity occurrence inside the texts.
	LookupTable struct {
		// entities maps an entity string to its tokens,
		// e.g. { "Max Payne": ["Max", "Payne"] }
		entities map[string]tokenize.Tokens
		// texts holds one token slice per input text,
		// e.g. [["Relax", ",", "Max", "."]]
		texts []tokenize.Tokens

		// lookup maps an entity to [start, end] token index pairs,
		// e.g. { "Max Payne": [[5, 13], [17, 25]] }
		lookup map[string][][]int
	}

	// iterfunc allows to easily pass arbitrary iterators to a sequence of
	// tokens
	iterfunc func(entities []*tokenize.Token) iter.Seq2[int, *tokenize.Token]
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
package assocentity | ||
|
||
import ( | ||
"context" | ||
"iter" | ||
"maps" | ||
"slices" | ||
|
||
"github.com/ndabAP/assocentity/v15/tokenize" | ||
) | ||
|
||
// LookupTable tokenizes the text with the given tokenizer, and optionally | ||
// mutates a token with the given mutator. If a mutator returns nil, the token | ||
// will be an empty token to keep dependencies in sync. | ||
|
||
// It returns a LookupTable with the tokenized entites and texts as well as the | ||
// index-based coordinates of each entity | ||
func (s source) LookupTable( | ||
ctx context.Context, | ||
tokenizer tokenize.Tokenizer, | ||
mut ...WithMut, | ||
) (LookupTable, error) { | ||
var table LookupTable | ||
|
||
// Entities | ||
table.entities = make(map[string]tokenize.Tokens, len(s.Entities)) | ||
for _, entity := range s.Entities { | ||
select { | ||
case <-ctx.Done(): | ||
return table, ctx.Err() | ||
default: | ||
} | ||
|
||
tok, err := tokenizer.Tokenize(ctx, entity) | ||
if err != nil { | ||
return table, err | ||
} | ||
|
||
table.entities[entity] = append(table.entities[entity], tok...) | ||
} | ||
|
||
// Texts | ||
table.texts = make([]tokenize.Tokens, len(s.Texts)) | ||
for i, text := range s.Texts { | ||
select { | ||
case <-ctx.Done(): | ||
return table, ctx.Err() | ||
default: | ||
} | ||
|
||
tok, err := tokenizer.Tokenize(ctx, text) | ||
if err != nil { | ||
return table, err | ||
} | ||
|
||
for _, t := range tok { | ||
for _, m := range mut { | ||
t = m(t) | ||
} | ||
// nil mutations result into an empty token so dependencies stay in | ||
// sync | ||
if t == nil { | ||
t = &tokenize.Token{} | ||
} | ||
|
||
table.texts[i] = append(table.texts[i], t) | ||
} | ||
} | ||
|
||
// Lookup table | ||
table.lookup = make(map[string][][]int) | ||
for _, tokens := range table.texts { | ||
i := 0 | ||
for { | ||
if i == len(tokens)-1 { | ||
break | ||
} | ||
|
||
if entity, j := s.cmp(tokens[i:], table.entities, slices.All); j > -1 { | ||
if j == -1 { | ||
continue | ||
} | ||
|
||
table.lookup[entity] = append(table.lookup[entity], []int{i, i + j}) | ||
|
||
// Skip entity | ||
switch j { | ||
case 0: | ||
i += 1 | ||
default: | ||
i += j | ||
} | ||
|
||
continue | ||
} | ||
|
||
i++ | ||
} | ||
} | ||
|
||
return table, nil | ||
} | ||
|
||
func (s source) cmp( | ||
text []*tokenize.Token, | ||
entities map[string]tokenize.Tokens, | ||
iterator iterfunc, | ||
) (string, int) { | ||
// i contains the final index when the entity was found, or -1 if not | ||
var i int = 0 | ||
next, stop := iter.Pull2(maps.All(entities)) | ||
defer stop() | ||
for { | ||
found := true | ||
entity, tok, ok := next() | ||
if !ok { | ||
break | ||
} | ||
|
||
// Entity iterator | ||
n, s := iter.Pull2(iterator(tok)) | ||
// Text iterator | ||
m, t := iter.Pull2(iterator(text)) | ||
for { | ||
// If no entity is left, cancel | ||
j, v, ok := n() | ||
if !ok { | ||
s() | ||
t() | ||
break | ||
} | ||
// If no text is left, cancel | ||
_, u, ok := m() | ||
if !ok { | ||
s() | ||
t() | ||
break | ||
} | ||
|
||
if u.Text != v.Text { | ||
// Continue with next entity | ||
i = 0 | ||
found = false | ||
s() | ||
t() | ||
break | ||
} | ||
|
||
i = j | ||
} | ||
|
||
if found { | ||
return entity, i | ||
} | ||
} | ||
|
||
return "", -1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.