diff --git a/loc.go b/loc.go
deleted file mode 100644
index 686a8dd..0000000
--- a/loc.go
+++ /dev/null
@@ -1 +0,0 @@
-package assocentity
diff --git a/lookup_table.go b/lookup_table.go
new file mode 100644
index 0000000..c4f9119
--- /dev/null
+++ b/lookup_table.go
@@ -0,0 +1,21 @@
+package assocentity
+
+import (
+	"iter"
+
+	"github.com/ndabAP/assocentity/v15/tokenize"
+)
+
+type (
+	// LookupTable maps tokenized entities to their locations in multiple texts
+	LookupTable struct {
+		entities map[string]tokenize.Tokens // { "Max Payne": ["Max", "Payne"] }
+		texts    []tokenize.Tokens          // [["Relax", ",", "Max", "."]]
+
+		lookup map[string][][]int // { "Max Payne": [[5, 13], [17, 25]] }
+	}
+
+	// iterfunc makes it easy to pass arbitrary iterators over a sequence of
+	// tokens
+	iterfunc func(entities []*tokenize.Token) iter.Seq2[int, *tokenize.Token]
+)
diff --git a/tokens_vec.go b/lookup_table_locs.go
similarity index 50%
rename from tokens_vec.go
rename to lookup_table_locs.go
index c92b3bb..46bd70a 100644
--- a/tokens_vec.go
+++ b/lookup_table_locs.go
@@ -1,63 +1,12 @@
 package assocentity
 
 import (
-	"iter"
 	"slices"
 
 	"github.com/ndabAP/assocentity/v15/tokenize"
 )
 
-func (tokens Tokens) Locs() {
-	cmp := func(text []*tokenize.Token, iterator iterfunc) int {
-		var i int = 0
-		next, stop := iter.Pull(slices.Values(tokens.entities))
-		defer stop()
-		for {
-			found := true
-			tok, ok := next()
-			if !ok {
-				break
-			}
-
-			// Entity iterator
-			n, s := iter.Pull2(iterator(tok))
-			// Text iterator
-			m, t := iter.Pull2(iterator(text))
-			for {
-				_, u, ok := m()
-				if !ok {
-					s()
-					t()
-					break
-				}
-				j, v, ok := n()
-				if !ok {
-					s()
-					t()
-					break
-				}
-
-				// If text and entity iterator values don't match, continue
-				// with next entity
-				if u.Text != v.Text {
-					i = 0
-					found = false
-					s()
-					t()
-					break
-				}
-
-				i = j
-			}
-
-			if found {
-				return i
-			}
-		}
-
-		return -1
-	}
-
+func (tokens LookupTable) Vecs() {
 	vecs := make(map[*tokenize.Token][]int)
 	// For each text
 	for _, text := range tokens.texts {
@@ -68,7 +17,7 @@ func (tokens Tokens) Locs() {
 			break
 		}
 
-		if j := cmp(text[i:], slices.All); j > -1 {
+		if j := entitycmp(text[i:], slices.All); j > -1 {
 			// Skip entity
 			switch j {
 			case -1:
@@ -84,7 +33,7 @@ func (tokens Tokens) Locs() {
 		token := text[i]
 		// Search for entity, starting from text token
 		for k := range slices.All(text[i:]) {
-			if l := cmp(text[i:], slices.All); l > -1 {
+			if l := entitycmp(text[i:], slices.All); l > -1 {
 				// Found entity
 				vecs[token] = append(vecs[token], i+k)
 
@@ -110,7 +59,7 @@ func (tokens Tokens) Locs() {
 			break
 		}
 
-		if j := cmp(text[:l], slices.Backward); j > -1 {
+		if j := entitycmp(text[:l], slices.Backward); j > -1 {
 			// Found entity, skip
 			// Skip entity
 			switch j {
diff --git a/source_coords.go b/source_coords.go
deleted file mode 100644
index 20d28e6..0000000
--- a/source_coords.go
+++ /dev/null
@@ -1,57 +0,0 @@
-package assocentity
-
-import (
-	"context"
-
-	"github.com/ndabAP/assocentity/v15/tokenize"
-)
-
-type Loc struct{}
-
-// Coords tokenizes the text with the given tokenizer
-func (s source) Coords(
-	ctx context.Context,
-	tokenizer tokenize.Tokenizer,
-	mut ...WithMut,
-) (Tokens, error) {
-	var tokens Tokens
-
-	// Entities
-	tokens.entities = make([]tokenize.Tokens, len(s.Entities))
-	for i, entity := range s.Entities {
-		tok, err := tokenizer.Tokenize(ctx, entity)
-		if err != nil {
-			return tokens, err
-		}
-
-		tokens.entities[i] = append(tokens.entities[i], tok...)
-	}
-
-	// Texts
-	tokens.texts = make([]tokenize.Tokens, len(s.Texts))
-	for i, text := range s.Texts {
-		select {
-		case <-ctx.Done():
-			return tokens, ctx.Err()
-		default:
-		}
-
-		tok, err := tokenizer.Tokenize(ctx, text)
-		if err != nil {
-			return tokens, err
-		}
-
-		for _, t := range tok {
-			for _, m := range mut {
-				t = m(t)
-			}
-			if t == nil {
-				continue
-			}
-
-			tokens.texts[i] = append(tokens.texts[i], t)
-		}
-	}
-
-	return tokens, nil
-}
diff --git a/source_lookup_table.go b/source_lookup_table.go
new file mode 100644
index 0000000..4acbb7a
--- /dev/null
+++ b/source_lookup_table.go
@@ -0,0 +1,158 @@
+package assocentity
+
+import (
+	"context"
+	"iter"
+	"maps"
+	"slices"
+
+	"github.com/ndabAP/assocentity/v15/tokenize"
+)
+
+// LookupTable tokenizes the entities and texts with the given tokenizer and
+// optionally mutates each token with the given mutators. If a mutator returns
+// nil, the token is replaced with an empty token to keep dependencies in sync.
+//
+// It returns a LookupTable with the tokenized entities and texts as well as
+// the index-based coordinates of each entity.
+func (s source) LookupTable(
+	ctx context.Context,
+	tokenizer tokenize.Tokenizer,
+	mut ...WithMut,
+) (LookupTable, error) {
+	var table LookupTable
+
+	// Entities
+	table.entities = make(map[string]tokenize.Tokens, len(s.Entities))
+	for _, entity := range s.Entities {
+		select {
+		case <-ctx.Done():
+			return table, ctx.Err()
+		default:
+		}
+
+		tok, err := tokenizer.Tokenize(ctx, entity)
+		if err != nil {
+			return table, err
+		}
+
+		table.entities[entity] = append(table.entities[entity], tok...)
+	}
+
+	// Texts
+	table.texts = make([]tokenize.Tokens, len(s.Texts))
+	for i, text := range s.Texts {
+		select {
+		case <-ctx.Done():
+			return table, ctx.Err()
+		default:
+		}
+
+		tok, err := tokenizer.Tokenize(ctx, text)
+		if err != nil {
+			return table, err
+		}
+
+		for _, t := range tok {
+			for _, m := range mut {
+				t = m(t)
+			}
+			// nil mutations result in an empty token so dependencies stay in
+			// sync
+			if t == nil {
+				t = &tokenize.Token{}
+			}
+
+			table.texts[i] = append(table.texts[i], t)
+		}
+	}
+
+	// Lookup table
+	table.lookup = make(map[string][][]int)
+	for _, tokens := range table.texts {
+		i := 0
+		for {
+			if i == len(tokens)-1 {
+				break
+			}
+
+			if entity, j := s.cmp(tokens[i:], table.entities, slices.All); j > -1 {
+				if j == -1 {
+					continue
+				}
+
+				table.lookup[entity] = append(table.lookup[entity], []int{i, i + j})
+
+				// Skip entity
+				switch j {
+				case 0:
+					i += 1
+				default:
+					i += j
+				}
+
+				continue
+			}
+
+			i++
+		}
+	}
+
+	return table, nil
+}
+
+func (s source) cmp(
+	text []*tokenize.Token,
+	entities map[string]tokenize.Tokens,
+	iterator iterfunc,
+) (string, int) {
+	// i tracks the index of the last matched entity token
+	var i int = 0
+	next, stop := iter.Pull2(maps.All(entities))
+	defer stop()
+	for {
+		found := true
+		entity, tok, ok := next()
+		if !ok {
+			break
+		}
+
+		// Entity iterator
+		n, s := iter.Pull2(iterator(tok))
+		// Text iterator
+		m, t := iter.Pull2(iterator(text))
+		for {
+			// If no entity is left, cancel
+			j, v, ok := n()
+			if !ok {
+				s()
+				t()
+				break
+			}
+			// If no text is left, cancel
+			_, u, ok := m()
+			if !ok {
+				s()
+				t()
+				break
+			}
+
+			if u.Text != v.Text {
+				// Continue with next entity
+				i = 0
+				found = false
+				s()
+				t()
+				break
+			}
+
+			i = j
+		}
+
+		if found {
+			return entity, i
+		}
+	}
+
+	return "", -1
+}
diff --git a/source_test.go b/source_test.go
index fb2e8a2..0741d68 100644
--- a/source_test.go
+++ b/source_test.go
@@ -71,7 +71,7 @@ func TestSource_Tokenize(t *testing.T) {
 		texts     []string
 		tokenizer tokenize.Tokenizer
 		mut       []WithMut
-		want      Tokens
+		want      LookupTable
 		wantErr   bool
 	}
 
@@ -81,7 +81,7 @@ func TestSource_Tokenize(t *testing.T) {
 			entities:  []string{},
 			texts:     []string{},
 			tokenizer: delimiter.New(nil),
-			want:      Tokens{entities: []tokenize.Tokens{}, texts: []tokenize.Tokens{}},
+			want:      LookupTable{entities: []tokenize.Tokens{}, texts: []tokenize.Tokens{}},
 		},
 		{
 			name: "only entities",
@@ -90,7 +90,7 @@ func TestSource_Tokenize(t *testing.T) {
 			tokenizer: delimiter.New(func(r rune) bool {
 				return r == ' '
 			}),
-			want: Tokens{
+			want: LookupTable{
 				entities: []tokenize.Tokens{
 					{
 						{Text: &languagepb.TextSpan{Content: "Max"}},
@@ -118,7 +118,7 @@ func TestSource_Tokenize(t *testing.T) {
 					return false
 				}
 			}),
-			want: Tokens{
+			want: LookupTable{
 				entities: []tokenize.Tokens{},
 				texts: []tokenize.Tokens{
 					{
@@ -135,7 +135,7 @@ func TestSource_Tokenize(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			s := NewSource(tt.entities, tt.texts)
-			got, err := s.Coords(context.Background(), tt.tokenizer, tt.mut...)
+			got, err := s.LookupTable(context.Background(), tt.tokenizer, tt.mut...)
 			if (err != nil) != tt.wantErr {
 				t.Errorf("Tokenize() error = %v, wantErr %v", err, tt.wantErr)
 				return
diff --git a/tokenize/token.go b/tokenize/token.go
index c35742d..d4c2e23 100644
--- a/tokenize/token.go
+++ b/tokenize/token.go
@@ -15,8 +15,14 @@ type (
 	Tokens []*Token
 )
 
-func (tokens Tokens) Dep(i int) *Token {
+// Dependency returns the first token whose dependency edge points to the
+// given token index. Tokens without a dependency edge are skipped
+func (tokens Tokens) Dependency(i int) *Token {
 	for _, token := range tokens {
+		if token.DependencyEdge == nil {
+			continue
+		}
+
 		if token.DependencyEdge.HeadTokenIndex == int32(i) {
 			return token
 		}
diff --git a/tokens.go b/tokens.go
deleted file mode 100644
index 8949dbe..0000000
--- a/tokens.go
+++ /dev/null
@@ -1,17 +0,0 @@
-package assocentity
-
-import (
-	"iter"
-
-	"github.com/ndabAP/assocentity/v15/tokenize"
-)
-
-type (
-	// Tokens wraps entities with multiple texts
-	Tokens struct {
-		entities []tokenize.Tokens // [["Max", "Payne"], ["Max"], ["Payne"]]
-		texts    []tokenize.Tokens // [["Relax", ",", "Max", "."]]
-	}
-
-	iterfunc func(entities []*tokenize.Token) iter.Seq2[int, *tokenize.Token]
-)
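
For reference, a minimal usage sketch of the new source.LookupTable API, pieced together from the calls visible in source_test.go. The import path of the delimiter tokenizer is an assumption, since it is not shown in this diff, and the example strings are made up for illustration.

	package main

	import (
		"context"
		"log"

		assocentity "github.com/ndabAP/assocentity/v15"
		// Assumed path; only the delimiter.New constructor appears in the tests
		"github.com/ndabAP/assocentity/v15/tokenize/delimiter"
	)

	func main() {
		// Whitespace tokenizer, as used in source_test.go
		tok := delimiter.New(func(r rune) bool { return r == ' ' })

		// Entities and texts are plain strings; punctuation is pre-separated
		// here so the whitespace tokenizer splits it into its own tokens
		src := assocentity.NewSource(
			[]string{"Max Payne"},
			[]string{"Relax , Max Payne ."},
		)

		// Tokenizes entities and texts and builds the entity lookup table
		table, err := src.LookupTable(context.Background(), tok)
		if err != nil {
			log.Fatal(err)
		}
		_ = table
	}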