From 9f3194c633b66a12907a194fd26ce4d94b373038 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 15 Jan 2025 11:19:36 +0800 Subject: [PATCH 1/7] add logs for chunker parameter parser Signed-off-by: yuye-aws --- .../chunker/ChunkerParameterParser.java | 73 +++++++++++++------ 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java index 52d8eef00..a26371651 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java @@ -19,8 +19,12 @@ public final class ChunkerParameterParser { private ChunkerParameterParser() {} // no instance of this util class /** - * Parse String type parameter. - * Throw IllegalArgumentException if parameter is not a string or an empty string. + * Parses and validates a string parameter from the parameters map. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @return The validated string value from the parameters map + * @throws IllegalArgumentException if the parameter is not a string or is empty */ public static String parseString(final Map parameters, final String fieldName) { Object fieldValue = parameters.get(fieldName); @@ -36,9 +40,13 @@ public static String parseString(final Map parameters, final Str } /** - * Parse String type parameter. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if parameter is not a string or an empty string. + * Parses and validates a string parameter from the parameters map with fallback to a default value. 
+ * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @param defaultValue The default value to return if the parameter is not present + * @return The validated string value from the parameters map if present, otherwise the default value + * @throws IllegalArgumentException if the parameter is present but is not a string or is empty */ public static String parseStringWithDefault(final Map parameters, final String fieldName, final String defaultValue) { if (!parameters.containsKey(fieldName)) { @@ -49,8 +57,12 @@ public static String parseStringWithDefault(final Map parameters } /** - * Parse integer type parameter with default value. - * Throw IllegalArgumentException if the parameter is not an integer. + * Parses and validates an integer value from the parameters map. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @return The parsed integer value from the parameters map + * @throws IllegalArgumentException if the parameter is not an integer or is empty */ public static int parseInteger(final Map parameters, final String fieldName) { String fieldValueString = parameters.get(fieldName).toString(); @@ -64,9 +76,13 @@ public static int parseInteger(final Map parameters, final Strin } /** - * Parse integer type parameter with default value. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if the parameter is not an integer. + * Parses and validates an integer parameter from the parameters map with fallback to a default value. 
+ * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @param defaultValue The default value to return if the parameter is not present + * @return The integer value from the parameters map if present, otherwise the default value + * @throws IllegalArgumentException if the parameter is present but cannot be converted to an integer */ public static int parseIntegerWithDefault(final Map parameters, final String fieldName, final int defaultValue) { if (!parameters.containsKey(fieldName)) { @@ -77,9 +93,12 @@ public static int parseIntegerWithDefault(final Map parameters, } /** - * Parse integer type parameter with positive value. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if the parameter is not a positive integer. + * Parses and validates a positive integer parameter from the parameters map. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @return The parsed positive integer value + * @throws IllegalArgumentException if the parameter is not a positive integer or cannot be converted to an integer */ public static int parsePositiveInteger(final Map parameters, final String fieldName) { int fieldValueInt = parseInteger(parameters, fieldName); @@ -90,9 +109,13 @@ public static int parsePositiveInteger(final Map parameters, fin } /** - * Parse integer type parameter with positive value. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if the parameter is not a positive integer. + * Parses and validates a positive integer parameter from the parameters map with fallback to a default value. 
+ * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @param defaultValue The default value to return if the parameter is not present + * @return The positive integer value from the parameters map if present, otherwise the default value + * @throws IllegalArgumentException if the parameter is present but is not a positive integer */ public static int parsePositiveIntegerWithDefault( final Map parameters, @@ -107,8 +130,12 @@ public static int parsePositiveIntegerWithDefault( } /** - * Parse double type parameter. - * Throw IllegalArgumentException if parameter is not a double. + * Parses and validates a double value from the parameters map. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @return The parsed double value + * @throws IllegalArgumentException if the parameter cannot be converted to a double */ public static double parseDouble(final Map parameters, final String fieldName) { String fieldValueString = parameters.get(fieldName).toString(); @@ -122,9 +149,13 @@ public static double parseDouble(final Map parameters, final Str } /** - * Parse double type parameter. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if parameter is not a double. + * Parses and validates a double value from the parameters map with fallback to a default value. 
+ * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @param defaultValue The default value to return if the parameter is not present + * @return The double value from the parameters map if present, otherwise the default value + * @throws IllegalArgumentException if the parameter is present but cannot be converted to a double */ public static double parseDoubleWithDefault(final Map parameters, final String fieldName, final double defaultValue) { if (!parameters.containsKey(fieldName)) { From bed5c32c3a87619b0f5ffface5a64b8d88b43bd2 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 15 Jan 2025 11:24:10 +0800 Subject: [PATCH 2/7] add logs for delimiter chunker Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 0f3d66c55..83dae96c5 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -16,12 +16,25 @@ */ public final class DelimiterChunker implements Chunker { + /** + * The identifier for the delimiter chunking algorithm. + */ public static final String ALGORITHM_NAME = "delimiter"; + /** + * The parameter field name for specifying the delimiter. + */ public static final String DELIMITER_FIELD = "delimiter"; + /** + * The default delimiter value used when none is specified. + * Uses two consecutive newline characters to split on paragraph boundaries. + */ public static final String DEFAULT_DELIMITER = "\n\n"; + /** + * The delimiter string used for text chunking. 
+ */ private String delimiter; public DelimiterChunker(final Map parameters) { From d1fc86732e4ed5c63445e3cebae4203d7689163b Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 15 Jan 2025 11:40:46 +0800 Subject: [PATCH 3/7] add comments for delimiter chunker Signed-off-by: yuye-aws --- .../chunker/FixedTokenLengthChunker.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 614ea33f9..3b364814a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -24,13 +24,22 @@ */ public final class FixedTokenLengthChunker implements Chunker { + /** The identifier for the fixed token length chunking algorithm. */ public static final String ALGORITHM_NAME = "fixed_token_length"; - // field name for each parameter + /** Field name for the analysis registry configuration parameter. */ public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry"; + + /** Field name for specifying the maximum number of tokens per chunk. */ public static final String TOKEN_LIMIT_FIELD = "token_limit"; + + /** Field name for specifying the overlap rate between consecutive chunks. */ public static final String OVERLAP_RATE_FIELD = "overlap_rate"; + + /** Field name for specifying the maximum token count allowed in the input text. */ public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; + + /** Field name for specifying the tokenizer to be used for text analysis. 
*/ public static final String TOKENIZER_FIELD = "tokenizer"; // default values for each non-runtime parameter @@ -57,6 +66,10 @@ public final class FixedTokenLengthChunker implements Chunker { private double overlapRate; private final AnalysisRegistry analysisRegistry; + /** + * Constructor that initializes the fixed token length chunker with the specified parameters. + * @param parameters a map with non-runtime parameters to be parsed + */ public FixedTokenLengthChunker(final Map parameters) { parseParameters(parameters); this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD); From d90eab1f6f9c53485e7715637cf1b2afdd4aad5e Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 15 Jan 2025 11:41:01 +0800 Subject: [PATCH 4/7] update comments for delimiter chunker Signed-off-by: yuye-aws --- .../processor/chunker/DelimiterChunker.java | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 83dae96c5..0cee22d97 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -16,27 +16,22 @@ */ public final class DelimiterChunker implements Chunker { - /** - * The identifier for the delimiter chunking algorithm. - */ + /** The identifier for the delimiter chunking algorithm. */ public static final String ALGORITHM_NAME = "delimiter"; - /** - * The parameter field name for specifying the delimiter. - */ + /** The parameter field name for specifying the delimiter. */ public static final String DELIMITER_FIELD = "delimiter"; - /** - * The default delimiter value used when none is specified. - * Uses two consecutive newline characters to split on paragraph boundaries. 
- */ + /** The default delimiter value used when none is specified. Uses two consecutive newline characters to split on paragraph boundaries. */ public static final String DEFAULT_DELIMITER = "\n\n"; - /** - * The delimiter string used for text chunking. - */ + /** The delimiter string used for text chunking. */ private String delimiter; + /** + * Constructor that initializes the delimiter chunker with the specified parameters. + * @param parameters a map with non-runtime parameters to be parsed + */ public DelimiterChunker(final Map parameters) { parseParameters(parameters); } From f11e2f9c362b24861383ea8ffd713300615c1ca0 Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 15 Jan 2025 11:43:27 +0800 Subject: [PATCH 5/7] update comments for chunker interface Signed-off-by: yuye-aws --- .../opensearch/neuralsearch/processor/chunker/Chunker.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 3fa2eeb7c..903e88bd2 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -13,9 +13,16 @@ */ public interface Chunker { + /** Field name for specifying the maximum chunk limit in the configuration. */ String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + + /** Field name for tracking the count of chunked strings. */ String CHUNK_STRING_COUNT_FIELD = "chunk_string_count"; + + /** Default maximum number of chunks allowed (100). */ int DEFAULT_MAX_CHUNK_LIMIT = 100; + + /** Special value (-1) indicating that chunk limiting is disabled. 
 */ int DISABLED_MAX_CHUNK_LIMIT = -1; /** From 2b340764c423122eb410fc7e1a0b3381e808407f Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 15 Jan 2025 11:45:21 +0800 Subject: [PATCH 6/7] add comments for chunker factory Signed-off-by: yuye-aws --- .../neuralsearch/processor/chunker/ChunkerFactory.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index aab9eaa3e..6b4ddd594 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -25,8 +25,16 @@ private ChunkerFactory() {} // no instance of this factory class DelimiterChunker::new ); + /** Set of supported chunker algorithm types */ public static Set CHUNKER_ALGORITHMS = CHUNKERS_CONSTRUCTORS.keySet(); + /** + * Creates a new Chunker instance based on the specified type and parameters. 
+ * + * @param type the type of chunker to create + * @param parameters configuration parameters for the chunker + * @return a new Chunker instance configured with the given parameters + */ public static Chunker create(final String type, final Map<String, Object> parameters) { Function<Map<String, Object>, Chunker> chunkerConstructionFunction = CHUNKERS_CONSTRUCTORS.get(type); // chunkerConstructionFunction is not null because we have validated the type in text chunking processor From 4f7bb26a7ee71977914dc0f23b83f1f44ba147bd Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Wed, 15 Jan 2025 11:50:58 +0800 Subject: [PATCH 7/7] update comments for chunker interface Signed-off-by: yuye-aws --- .../org/opensearch/neuralsearch/processor/chunker/Chunker.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 903e88bd2..f8f496291 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -49,6 +49,7 @@ public interface Chunker { * @param chunkResultSize the size of chunking result * @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize * @param chunkStringCount runtime chunk_string_count, used to check with chunkResultSize + * @return true if chunk limiting is enabled and adding the new chunks would reach or exceed the limit, false otherwise */ static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int chunkStringCount) { return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit;