Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

output irrelevant docs in FTS #4694

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions extension/fts/src/function/query_fts_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,8 @@ QFTSOutputWriter::QFTSOutputWriter(storage::MemoryManager* mm, QFTSOutput* qFTSO
void QFTSOutputWriter::write(processor::FactorizedTable& scoreFT, nodeID_t docNodeID, uint64_t len,
int64_t docsID) {
bool hasScore = qFTSOutput->scores.contains(docNodeID);
docsVector.setNull(pos, !hasScore);
docsVector.setValue(pos, nodeID_t{(common::offset_t)docsID, bindData.outputTableID});
docsVector.setNull(pos, false /* isNull */);
scoreVector.setNull(pos, !hasScore);
auto k = bindData.config.k;
auto b = bindData.config.b;
Expand All @@ -211,7 +212,6 @@ void QFTSOutputWriter::write(processor::FactorizedTable& scoreFT, nodeID_t docNo
score += log10((numDocs - df + 0.5) / (df + 0.5) + 1) *
((tf * (k + 1) / (tf + k * (1 - b + b * (len / avgDocLen)))));
}
docsVector.setValue(pos, nodeID_t{(common::offset_t)docsID, bindData.outputTableID});
scoreVector.setValue(pos, score);
}
scoreFT.append(vectors);
Expand Down
67 changes: 42 additions & 25 deletions extension/fts/test/test_files/fts_small.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,28 @@
---- ok
-LOG SingleKeyWordUpperCase
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') RETURN _node.ID, score
---- 2
---- 3
0|0.271133
3|0.209476
20|
-LOG SingleKeyWordLowerCase
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice') RETURN _node.ID, score
---- 2
---- 3
0|0.271133
3|0.209476
20|
-LOG QueryEmptyString
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', '') RETURN _node.ID, score
---- 0
---- 3
0|
3|
20|
-LOG QueryStopWord
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'at') RETURN _node.ID, score
---- 0
---- 3
0|
3|
20|
-LOG QuerySingular
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'studys') RETURN _node.ID, score
---- 3
Expand All @@ -49,28 +57,36 @@
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx2', ['content'])
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'toronto') RETURN _node.ID, score
---- 1
---- 3
0|0.565815
3|
20|
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'toronto') RETURN _node.ID, score
---- 1
---- 3
0|0.400747
3|
20|
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx2', 'toronto') RETURN _node.ID, score
---- 1
---- 3
0|0.393753
3|
20|
-LOG DropAndRecreate
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx4', ['content', 'name'])
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx4', 'waterloo') RETURN _node.ID, score
---- 2
---- 3
0|0.192034
3|
20|0.210752
-STATEMENT CALL DROP_FTS_INDEX('doc', 'docIdx4')
---- ok
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx4', ['content', 'name'])
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx4', 'waterloo') RETURN _node.ID, score
---- 2
---- 3
0|0.192034
3|
20|0.210752

-CASE FTSWithParams
Expand All @@ -84,19 +100,19 @@ Binder exception: Unrecognized stemmer 'german1'. Supported stemmers are: ['arab
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx', ['content', 'author', 'name'], stemmer := 'araBic')
---- ok
# Note: the Arabic stemmer doesn't reduce studys/studying -> study
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'study') RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'study') WHERE score is not null RETURN _node.ID, score
---- 1
20|0.437146
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx1', ['content', 'author', 'name'], stemmer := 'frEnch')
---- ok
# Note: the French stemmer doesn't reduce studying -> study
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study') RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study') WHERE score is not null RETURN _node.ID, score
---- 2
0|0.194190
20|0.209476
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx2', ['content', 'author', 'name'], stemmer := 'nOne')
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx2', 'studying') RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx2', 'studying') where score is not null RETURN _node.ID, score
---- 1
3|0.437146
-LOG CreateFTSIncorrectParam
Expand All @@ -109,8 +125,9 @@ Binder exception: Unrecognized optional parameter: test
Binder exception: 25 has data type INT64 but STRING was expected.
-LOG QueryFTSOptionalParam
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study', k := 0.3, B:= 0.5) RETURN _node.ID, score
---- 2
---- 3
0|0.201218
3|
20|0.205603
-LOG QueryFTSOptionalParamError
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study', k := 0.3, c:= 0.5) RETURN _node.ID, score
Expand All @@ -131,36 +148,36 @@ Binder exception: BM25 model requires the Term Frequency Saturation(k) value to
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx', ['content', 'author', 'name'])
---- 0
-LOG QueryFTSConjunctiveSingleKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice', conjunctive := true) RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice', conjunctive := true) where score is not null RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476
-LOG QueryFTSConjunctiveMultiKeywords
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice studying', conjunctive := true) RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice studying', conjunctive := true) where score is not null RETURN _node.ID, score
---- 2
0|0.326304
3|0.268990
-LOG QueryFTSConjunctiveDuplicateKeywords
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice studying alice', conjunctive := true) RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice studying alice', conjunctive := true) where score is not null RETURN _node.ID, score
---- 2
0|0.326304
3|0.268990
-LOG QueryFTSConjunctiveDuplicateSingleKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice alice', conjunctive := true) RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice alice', conjunctive := true) where score is not null RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476
-LOG QueryFTSConjunctiveNotExistSingleKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'carol', conjunctive := true) RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'carol', conjunctive := true) where score is not null RETURN _node.ID, score
---- 0
-LOG QueryFTSConjunctiveNotExistMultiKeywords
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'carol dog', conjunctive := true) RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'carol dog', conjunctive := true) where score is not null RETURN _node.ID, score
---- 0
-LOG QueryFTSConjunctivePartialExistKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice carol', conjunctive := true) RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice carol', conjunctive := true) where score is not null RETURN _node.ID, score
---- 0
-LOG QueryFTSConjunctivePartialExistKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice waterloo', conjunctive := true) RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice waterloo', conjunctive := true) where score is not null RETURN _node.ID, score
---- 1
0|0.465323

Expand All @@ -170,7 +187,7 @@ Binder exception: BM25 model requires the Term Frequency Saturation(k) value to
---- ok
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx', ['content', 'author', 'name'])
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') where score is not null RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476
Expand All @@ -180,7 +197,7 @@ Binder exception: BM25 model requires the Term Frequency Saturation(k) value to
Catalog exception: QUERY_FTS_INDEX function does not exist.
-STATEMENT load extension "${KUZU_ROOT_DIRECTORY}/extension/fts/build/libfts.kuzu_extension"
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') where score is not null RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476
Expand All @@ -191,7 +208,7 @@ Catalog exception: QUERY_FTS_INDEX function does not exist.
---- ok
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx', ['content', 'author', 'name'])
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') WHERE score is not null RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476
Expand All @@ -200,7 +217,7 @@ Catalog exception: QUERY_FTS_INDEX function does not exist.
-IMPORT_DATABASE "${KUZU_EXPORT_DB_DIRECTORY}_fts/small"
-STATEMENT IMPORT DATABASE '${KUZU_EXPORT_DB_DIRECTORY}_fts/small';
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') RETURN _node.ID, score
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') WHERE score is not null RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476
8 changes: 4 additions & 4 deletions extension/fts/test/test_files/ms_passage_small.test
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'contentIdx', ['content'])
---- ok
-LOG QueryKeywords
-STATEMENT CALL query_fts_index('doc', 'contentIdx', 'dispossessed meaning') RETURN _node.id, score order by score, _node.id;
-STATEMENT CALL query_fts_index('doc', 'contentIdx', 'dispossessed meaning') WHERE score is not null RETURN _node.id, score order by score, _node.id;
-CHECK_ORDER
---- 9
389|1.773149
Expand All @@ -23,7 +23,7 @@
109|1.957072
390|2.937742
-LOG QuerySingleKeyWord
-STATEMENT CALL query_fts_index('doc', 'contentIdx', 'normality') RETURN _node.id, score order by score, _node.id;
-STATEMENT CALL query_fts_index('doc', 'contentIdx', 'normality') WHERE score is not null RETURN _node.id, score order by score, _node.id;
-CHECK_ORDER
---- 7
50|1.794529
Expand All @@ -34,13 +34,13 @@
137|2.187176
57|2.269955
-LOG QueryShortQuestion
-STATEMENT CALL query_fts_index('doc', 'contentIdx', 'what is piroxicam used to treat') RETURN _node.id, score order by score, _node.id;
-STATEMENT CALL query_fts_index('doc', 'contentIdx', 'what is piroxicam used to treat') where score is not null RETURN _node.id, score order by score, _node.id;
-CHECK_ORDER
---- 2
67|1.848853
476|2.684744
-LOG QueryLongQuestion
-STATEMENT CALL query_fts_index('doc', 'contentIdx', 'how long does it take to recover from top wisdom teeth removal') RETURN _node.id, score order by score, _node.id;
-STATEMENT CALL query_fts_index('doc', 'contentIdx', 'how long does it take to recover from top wisdom teeth removal') where score is not null RETURN _node.id, score order by score, _node.id;
-CHECK_ORDER
---- 31
62|0.794823
Expand Down
Loading