diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index 3fa2eeb7c..f8f496291 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -13,9 +13,16 @@ */ public interface Chunker { + /** Field name for specifying the maximum chunk limit in the configuration. */ String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + + /** Field name for tracking the count of chunked strings. */ String CHUNK_STRING_COUNT_FIELD = "chunk_string_count"; + + /** Default maximum number of chunks allowed (100). */ int DEFAULT_MAX_CHUNK_LIMIT = 100; + + /** Special value (-1) indicating that chunk limiting is disabled. */ int DISABLED_MAX_CHUNK_LIMIT = -1; /** @@ -42,6 +49,7 @@ public interface Chunker { * @param chunkResultSize the size of chunking result * @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize * @param chunkStringCount runtime chunk_string_count, used to check with chunkResultSize + * @return true if chunk limiting is enabled and chunkResultSize plus chunkStringCount reaches or exceeds runtimeMaxChunkLimit, false otherwise */ static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int chunkStringCount) { return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit; diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java index aab9eaa3e..6b4ddd594 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerFactory.java @@ -25,8 +25,16 @@ private ChunkerFactory() {} // no instance of this factory class DelimiterChunker::new ); + /** Set of supported chunker algorithm types. */ 
public static Set CHUNKER_ALGORITHMS = CHUNKERS_CONSTRUCTORS.keySet(); + /** + * Creates a new Chunker instance based on the specified type and parameters. + * + * @param type the type of chunker to create + * @param parameters configuration parameters for the chunker + * @return a new Chunker instance configured with the given parameters + */ public static Chunker create(final String type, final Map parameters) { Function, Chunker> chunkerConstructionFunction = CHUNKERS_CONSTRUCTORS.get(type); // chunkerConstructionFunction is not null because we have validated the type in text chunking processor diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java index 52d8eef00..a26371651 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerParameterParser.java @@ -19,8 +19,12 @@ public final class ChunkerParameterParser { private ChunkerParameterParser() {} // no instance of this util class /** - * Parse String type parameter. - * Throw IllegalArgumentException if parameter is not a string or an empty string. + * Parses and validates a string parameter from the parameters map. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @return The validated string value from the parameters map + * @throws IllegalArgumentException if the parameter is not a string or is empty */ public static String parseString(final Map parameters, final String fieldName) { Object fieldValue = parameters.get(fieldName); @@ -36,9 +40,13 @@ public static String parseString(final Map parameters, final Str } /** - * Parse String type parameter. - * Return default value if the parameter is missing. 
- * Throw IllegalArgumentException if parameter is not a string or an empty string. + * Parses and validates a string parameter from the parameters map with fallback to a default value. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @param defaultValue The default value to return if the parameter is not present + * @return The validated string value from the parameters map if present, otherwise the default value + * @throws IllegalArgumentException if the parameter is present but is not a string or is empty */ public static String parseStringWithDefault(final Map parameters, final String fieldName, final String defaultValue) { if (!parameters.containsKey(fieldName)) { @@ -49,8 +57,12 @@ public static String parseStringWithDefault(final Map parameters } /** - * Parse integer type parameter with default value. - * Throw IllegalArgumentException if the parameter is not an integer. + * Parses and validates an integer value from the parameters map. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @return The parsed integer value from the parameters map + * @throws IllegalArgumentException if the parameter is not an integer or is empty */ public static int parseInteger(final Map parameters, final String fieldName) { String fieldValueString = parameters.get(fieldName).toString(); @@ -64,9 +76,13 @@ public static int parseInteger(final Map parameters, final Strin } /** - * Parse integer type parameter with default value. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if the parameter is not an integer. + * Parses and validates an integer parameter from the parameters map with fallback to a default value. 
+ * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @param defaultValue The default value to return if the parameter is not present + * @return The integer value from the parameters map if present, otherwise the default value + * @throws IllegalArgumentException if the parameter is present but cannot be converted to an integer */ public static int parseIntegerWithDefault(final Map parameters, final String fieldName, final int defaultValue) { if (!parameters.containsKey(fieldName)) { @@ -77,9 +93,12 @@ public static int parseIntegerWithDefault(final Map parameters, } /** - * Parse integer type parameter with positive value. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if the parameter is not a positive integer. + * Parses and validates a positive integer parameter from the parameters map. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @return The parsed positive integer value + * @throws IllegalArgumentException if the parameter is not a positive integer or cannot be converted to an integer */ public static int parsePositiveInteger(final Map parameters, final String fieldName) { int fieldValueInt = parseInteger(parameters, fieldName); @@ -90,9 +109,13 @@ public static int parsePositiveInteger(final Map parameters, fin } /** - * Parse integer type parameter with positive value. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if the parameter is not a positive integer. + * Parses and validates a positive integer parameter from the parameters map with fallback to a default value. 
+ * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @param defaultValue The default value to return if the parameter is not present + * @return The positive integer value from the parameters map if present, otherwise the default value + * @throws IllegalArgumentException if the parameter is present but is not a positive integer */ public static int parsePositiveIntegerWithDefault( final Map parameters, @@ -107,8 +130,12 @@ public static int parsePositiveIntegerWithDefault( } /** - * Parse double type parameter. - * Throw IllegalArgumentException if parameter is not a double. + * Parses and validates a double value from the parameters map. + * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @return The parsed double value + * @throws IllegalArgumentException if the parameter cannot be converted to a double */ public static double parseDouble(final Map parameters, final String fieldName) { String fieldValueString = parameters.get(fieldName).toString(); @@ -122,9 +149,13 @@ public static double parseDouble(final Map parameters, final Str } /** - * Parse double type parameter. - * Return default value if the parameter is missing. - * Throw IllegalArgumentException if parameter is not a double. + * Parses and validates a double value from the parameters map with fallback to a default value. 
+ * + * @param parameters The map containing chunking parameters + * @param fieldName The name of the field to extract from the parameters map + * @param defaultValue The default value to return if the parameter is not present + * @return The double value from the parameters map if present, otherwise the default value + * @throws IllegalArgumentException if the parameter is present but cannot be converted to a double */ public static double parseDoubleWithDefault(final Map parameters, final String fieldName, final double defaultValue) { if (!parameters.containsKey(fieldName)) { diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index 0f3d66c55..0cee22d97 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -16,14 +16,22 @@ */ public final class DelimiterChunker implements Chunker { + /** The identifier for the delimiter chunking algorithm. */ public static final String ALGORITHM_NAME = "delimiter"; + /** The parameter field name for specifying the delimiter. */ public static final String DELIMITER_FIELD = "delimiter"; + /** The default delimiter value used when none is specified. Uses two consecutive newline characters to split on paragraph boundaries. */ public static final String DEFAULT_DELIMITER = "\n\n"; + /** The delimiter string used for text chunking. */ private String delimiter; + /** + * Constructor that initializes the delimiter chunker with the specified parameters. 
+ * @param parameters a map with non-runtime parameters to be parsed + */ public DelimiterChunker(final Map parameters) { parseParameters(parameters); } diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 614ea33f9..3b364814a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -24,13 +24,22 @@ */ public final class FixedTokenLengthChunker implements Chunker { + /** The identifier for the fixed token length chunking algorithm. */ public static final String ALGORITHM_NAME = "fixed_token_length"; - // field name for each parameter + /** Field name for the analysis registry configuration parameter. */ public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry"; + + /** Field name for specifying the maximum number of tokens per chunk. */ public static final String TOKEN_LIMIT_FIELD = "token_limit"; + + /** Field name for specifying the overlap rate between consecutive chunks. */ public static final String OVERLAP_RATE_FIELD = "overlap_rate"; + + /** Field name for specifying the maximum token count allowed in the input text. */ public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; + + /** Field name for specifying the tokenizer to be used for text analysis. */ public static final String TOKENIZER_FIELD = "tokenizer"; // default values for each non-runtime parameter @@ -57,6 +66,10 @@ public final class FixedTokenLengthChunker implements Chunker { private double overlapRate; private final AnalysisRegistry analysisRegistry; + /** + * Constructor that initializes the fixed token length chunker with the specified parameters. 
+ * @param parameters a map with non-runtime parameters to be parsed + */ public FixedTokenLengthChunker(final Map parameters) { parseParameters(parameters); this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD);