Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fts bug fix #4677

Merged
merged 2 commits into from
Jan 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion extension/fts/src/fts_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include "catalog/catalog_entry/catalog_entry_type.h"
#include "function/create_fts_index.h"
#include "function/drop_fts_index.h"
#include "function/query_fts_gds.h"
#include "function/query_fts.h"
#include "function/stem.h"
#include "main/client_context.h"
#include "main/database.h"
Expand Down
2 changes: 1 addition & 1 deletion extension/fts/src/function/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ add_library(kuzu_fts_function
create_fts_index.cpp
fts_config.cpp
drop_fts_index.cpp
query_fts_gds.cpp
query_fts.cpp
fts_utils.cpp)

set(FTS_OBJECT_FILES
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "function/query_fts_gds.h"
#include "function/query_fts.h"

#include "binder/binder.h"
#include "binder/expression/expression_util.h"
Expand Down Expand Up @@ -359,6 +359,9 @@ static std::vector<std::string> getTerms(std::string& query, const std::string&
RE2::GlobalReplace(&query, regexPattern, replacePattern);
StringUtils::toLower(query);
auto terms = StringUtils::split(query, " ");
if (stemmer == "none") {
return terms;
}
StemFunction::validateStemmer(stemmer);
auto sbStemmer = sb_stemmer_new(reinterpret_cast<const char*>(stemmer.c_str()), "UTF_8");
std::vector<std::string> result;
Expand All @@ -375,8 +378,6 @@ void QFTSAlgorithm::bind(const GDSBindInput& input, main::ClientContext& context
auto inputTableName = getParamVal(input, 0);
auto indexName = getParamVal(input, 1);
auto query = getParamVal(input, 2);
auto stemmer = "english";
auto terms = getTerms(query, stemmer);

auto& tableEntry =
FTSUtils::bindTable(inputTableName, &context, indexName, FTSUtils::IndexOperation::QUERY);
Expand All @@ -385,6 +386,7 @@ void QFTSAlgorithm::bind(const GDSBindInput& input, main::ClientContext& context
context.getCatalog()
->getIndex(context.getTransaction(), tableEntry.getTableID(), indexName)
->constCast<FTSIndexCatalogEntry>();
auto terms = getTerms(query, ftsIndexEntry.getFTSConfig().stemmer);
auto entry =
context.getCatalog()->getTableCatalogEntry(context.getTransaction(), inputTableName);
auto nodeOutput = bindNodeOutput(input.binder, {entry});
Expand Down
127 changes: 120 additions & 7 deletions extension/fts/test/test_files/fts_small.test
Original file line number Diff line number Diff line change
Expand Up @@ -8,44 +8,157 @@
-STATEMENT load extension "${KUZU_ROOT_DIRECTORY}/extension/fts/build/libfts.kuzu_extension"
---- ok
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx', ['content', 'author', 'name'])
---- ok
-LOG SingleKeyWordUpperCase
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'Alice') RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476
-LOG SingleKeyWordLowerCase
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice') RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476
-LOG QueryEmptyString
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', '') RETURN _node.ID, score
---- 0
-LOG QueryStopWord
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'at') RETURN _node.ID, score
---- 0
-LOG QuerySingular
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'studys') RETURN _node.ID, score
---- 3
0|0.055171
20|0.059514
3|0.059514
-LOG QueryPresentTense
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'studying') RETURN _node.ID, score
---- 3
0|0.055171
20|0.059514
3|0.059514
-LOG QueryWithSpecialChar
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'study ->') RETURN _node.ID, score
---- 3
0|0.055171
20|0.059514
3|0.059514
-LOG MultipleIndexes
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx1', ['content', 'author'])
---- ok
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx2', ['content'])
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'toronto') RETURN _node.ID, score
---- 1
0|0.565815
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'toronto') RETURN _node.ID, score
---- 1
0|0.400747
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx2', 'toronto') RETURN _node.ID, score
---- 1
0|0.393753
-LOG DropAndRecreate
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx4', ['content', 'name'])
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx4', 'waterloo') RETURN _node.ID, score
---- 2
0|0.192034
20|0.210752
-STATEMENT CALL DROP_FTS_INDEX('doc', 'docIdx4')
---- ok
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx4', ['content', 'name'])
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx4', 'waterloo') RETURN _node.ID, score
---- 2
0|0.192034
20|0.210752

-CASE FTSWithParams
-STATEMENT load extension "${KUZU_ROOT_DIRECTORY}/extension/fts/build/libfts.kuzu_extension"
---- ok

-LOG StemmerOption
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx', ['content', 'author', 'name'], stemmer := 'german1')
---- error
Binder exception: Unrecognized stemmer 'german1'. Supported stemmers are: ['arabic, basque, catalan, danish, dutch, english, finnish, french, german, greek, hindi, hungarian, indonesian, irish, italian, lithuanian, nepali, norwegian, porter, portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish'], or use 'none' for no stemming.
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx', ['content', 'author', 'name'], stemmer := 'araBic')
---- ok
#Note: the Arabic stemmer does not reduce 'studys' or 'studying' to 'study'.
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'study') RETURN _node.ID, score
---- 1
20|0.437146
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx1', ['content', 'author', 'name'], stemmer := 'frEnch')
---- ok
#Note: the French stemmer does not reduce 'studying' to 'study'.
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study') RETURN _node.ID, score
---- 2
0|0.194190
20|0.209476
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx2', ['content', 'author', 'name'], stemmer := 'nOne')
---- ok
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx2', 'studying') RETURN _node.ID, score
---- 1
3|0.437146
-LOG CreateFTSIncorrectParam
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx5', ['content', 'author', 'name'], test := 'nOne')
---- error
Binder exception: Unrecognized optional parameter: test
-LOG CreateFTSIncorrectParamType
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx5', ['content', 'author', 'name'], stemmer := 25)
---- error
Binder exception: 25 has data type INT64 but STRING was expected.
-LOG QueryFTSOptionalParam
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study', k := 0.3, B:= 0.5) RETURN _node.ID, score
---- 2
0|0.201218
20|0.205603
-LOG QueryFTSOptionalParamError
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study', k := 0.3, c:= 0.5) RETURN _node.ID, score
---- error
Binder exception: Unrecognized optional parameter: c
-LOG QueryFTSBOutOfRange
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study', k := 0.3, B:= 3.88) RETURN _node.ID, score
---- error
Binder exception: BM25 model requires the Document Length Normalization(b) value to be in the range [0,1].
-LOG QueryFTSKOutOfRange
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx1', 'study', k := -5.3, B:= 0.7) RETURN _node.ID, score
---- error
Binder exception: BM25 model requires the Term Frequency Saturation(k) value to be a positive number.

-CASE fts_conjunctive_case
-STATEMENT load extension "${KUZU_ROOT_DIRECTORY}/extension/fts/build/libfts.kuzu_extension"
---- ok
-STATEMENT CALL CREATE_FTS_INDEX('doc', 'docIdx', ['content', 'author', 'name'])
---- 0
-LOG QueryFTSConjunctiveSingleKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice', conjunctive := true) RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476

-LOG QueryFTSConjunctiveMultiKeywords
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice studying', conjunctive := true) RETURN _node.ID, score
---- 2
0|0.326304
3|0.268990

-LOG QueryFTSConjunctiveDuplicateKeywords
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice studying alice', conjunctive := true) RETURN _node.ID, score
---- 2
0|0.326304
3|0.268990

-LOG QueryFTSConjunctiveDuplicateSingleKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice alice', conjunctive := true) RETURN _node.ID, score
---- 2
0|0.271133
3|0.209476

-LOG QueryFTSConjunctiveNotExistSingleKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'carol', conjunctive := true) RETURN _node.ID, score
---- 0

-LOG QueryFTSConjunctiveNotExistMultiKeywords
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'carol dog', conjunctive := true) RETURN _node.ID, score
---- 0

-LOG QueryFTSConjunctivePartialExistKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice carol', conjunctive := true) RETURN _node.ID, score
---- 0

-LOG QueryFTSConjunctivePartialExistKeyword
-STATEMENT CALL QUERY_FTS_INDEX('doc', 'docIdx', 'alice waterloo', conjunctive := true) RETURN _node.ID, score
---- 1
Expand Down
Loading