Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance: Add comments to chunking-related code #1106

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,16 @@
*/
public interface Chunker {

/** Field name for specifying the maximum chunk limit in the configuration. */
String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";

/** Field name for tracking the count of chunked strings. */
String CHUNK_STRING_COUNT_FIELD = "chunk_string_count";

/** Default maximum number of chunks allowed (100). */
int DEFAULT_MAX_CHUNK_LIMIT = 100;

/** Special value (-1) indicating that chunk limiting is disabled. */
int DISABLED_MAX_CHUNK_LIMIT = -1;

/**
Expand All @@ -42,6 +49,7 @@ public interface Chunker {
* @param chunkResultSize the size of chunking result
* @param runtimeMaxChunkLimit runtime max_chunk_limit; the check is skipped when it equals DISABLED_MAX_CHUNK_LIMIT
* @param chunkStringCount runtime chunk_string_count, added to chunkResultSize before comparing with runtimeMaxChunkLimit
* @return true if chunk limiting is enabled and chunkResultSize + chunkStringCount reaches or exceeds runtimeMaxChunkLimit, false otherwise
*/
static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int chunkStringCount) {
return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,16 @@ private ChunkerFactory() {} // no instance of this factory class
DelimiterChunker::new
);

/**
* Set of supported chunker algorithm types, taken from the keys of {@code CHUNKERS_CONSTRUCTORS}.
* Declared {@code final} so that other classes cannot reassign the reference.
*/
public static final Set<String> CHUNKER_ALGORITHMS = CHUNKERS_CONSTRUCTORS.keySet();

/**
* Creates a new Chunker instance based on the specified type and parameters.
*
* @param type the type of chunker to create
* @param parameters configuration parameters for the chunker
* @return a new Chunker instance configured with the given parameters
*/
public static Chunker create(final String type, final Map<String, Object> parameters) {
Function<Map<String, Object>, Chunker> chunkerConstructionFunction = CHUNKERS_CONSTRUCTORS.get(type);
// chunkerConstructionFunction is not null because we have validated the type in text chunking processor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ public final class ChunkerParameterParser {
private ChunkerParameterParser() {} // no instance of this util class

/**
* Parse String type parameter.
* Throw IllegalArgumentException if parameter is not a string or an empty string.
* Parses and validates a string parameter from the parameters map.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @return The validated string value from the parameters map
* @throws IllegalArgumentException if the parameter is not a string or is empty
*/
public static String parseString(final Map<String, Object> parameters, final String fieldName) {
Object fieldValue = parameters.get(fieldName);
Expand All @@ -36,9 +40,13 @@ public static String parseString(final Map<String, Object> parameters, final Str
}

/**
* Parse String type parameter.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if parameter is not a string or an empty string.
* Parses and validates a string parameter from the parameters map with fallback to a default value.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @param defaultValue The default value to return if the parameter is not present
* @return The validated string value from the parameters map if present, otherwise the default value
* @throws IllegalArgumentException if the parameter is present but is not a string or is empty
*/
public static String parseStringWithDefault(final Map<String, Object> parameters, final String fieldName, final String defaultValue) {
if (!parameters.containsKey(fieldName)) {
Expand All @@ -49,8 +57,12 @@ public static String parseStringWithDefault(final Map<String, Object> parameters
}

/**
* Parse integer type parameter.
* Throw IllegalArgumentException if the parameter is not an integer.
* Parses and validates an integer value from the parameters map.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @return The parsed integer value from the parameters map
* @throws IllegalArgumentException if the parameter is not an integer or is empty
*/
public static int parseInteger(final Map<String, Object> parameters, final String fieldName) {
String fieldValueString = parameters.get(fieldName).toString();
Expand All @@ -64,9 +76,13 @@ public static int parseInteger(final Map<String, Object> parameters, final Strin
}

/**
* Parse integer type parameter with default value.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if the parameter is not an integer.
* Parses and validates an integer parameter from the parameters map with fallback to a default value.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @param defaultValue The default value to return if the parameter is not present
* @return The integer value from the parameters map if present, otherwise the default value
* @throws IllegalArgumentException if the parameter is present but cannot be converted to an integer
*/
public static int parseIntegerWithDefault(final Map<String, Object> parameters, final String fieldName, final int defaultValue) {
if (!parameters.containsKey(fieldName)) {
Expand All @@ -77,9 +93,12 @@ public static int parseIntegerWithDefault(final Map<String, Object> parameters,
}

/**
* Parse integer type parameter with positive value.
* This variant takes no default value; use parsePositiveIntegerWithDefault when a fallback is needed.
* Throw IllegalArgumentException if the parameter is not a positive integer.
* Parses and validates a positive integer parameter from the parameters map.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @return The parsed positive integer value
* @throws IllegalArgumentException if the parameter is not a positive integer or cannot be converted to an integer
*/
public static int parsePositiveInteger(final Map<String, Object> parameters, final String fieldName) {
int fieldValueInt = parseInteger(parameters, fieldName);
Expand All @@ -90,9 +109,13 @@ public static int parsePositiveInteger(final Map<String, Object> parameters, fin
}

/**
* Parse integer type parameter with positive value.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if the parameter is not a positive integer.
* Parses and validates a positive integer parameter from the parameters map with fallback to a default value.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @param defaultValue The default value to return if the parameter is not present
* @return The positive integer value from the parameters map if present, otherwise the default value
* @throws IllegalArgumentException if the parameter is present but is not a positive integer
*/
public static int parsePositiveIntegerWithDefault(
final Map<String, Object> parameters,
Expand All @@ -107,8 +130,12 @@ public static int parsePositiveIntegerWithDefault(
}

/**
* Parse double type parameter.
* Throw IllegalArgumentException if parameter is not a double.
* Parses and validates a double value from the parameters map.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @return The parsed double value
* @throws IllegalArgumentException if the parameter cannot be converted to a double
*/
public static double parseDouble(final Map<String, Object> parameters, final String fieldName) {
String fieldValueString = parameters.get(fieldName).toString();
Expand All @@ -122,9 +149,13 @@ public static double parseDouble(final Map<String, Object> parameters, final Str
}

/**
* Parse double type parameter.
* Return default value if the parameter is missing.
* Throw IllegalArgumentException if parameter is not a double.
* Parses and validates a double value from the parameters map with fallback to a default value.
*
* @param parameters The map containing chunking parameters
* @param fieldName The name of the field to extract from the parameters map
* @param defaultValue The default value to return if the parameter is not present
* @return The double value from the parameters map if present, otherwise the default value
* @throws IllegalArgumentException if the parameter is present but cannot be converted to a double
*/
public static double parseDoubleWithDefault(final Map<String, Object> parameters, final String fieldName, final double defaultValue) {
if (!parameters.containsKey(fieldName)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,22 @@
*/
public final class DelimiterChunker implements Chunker {

/** The identifier for the delimiter chunking algorithm. */
public static final String ALGORITHM_NAME = "delimiter";

/** The parameter field name for specifying the delimiter. */
public static final String DELIMITER_FIELD = "delimiter";

/** The default delimiter value used when none is specified. Uses two consecutive newline characters to split on paragraph boundaries. */
public static final String DEFAULT_DELIMITER = "\n\n";

/** The delimiter string used for text chunking. */
private String delimiter;

/**
* Builds a delimiter chunker by handing the supplied parameter map to {@code parseParameters}.
*
* @param parameters a map with non-runtime parameters to be parsed
*/
public DelimiterChunker(final Map<String, Object> parameters) {
    this.parseParameters(parameters);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,22 @@
*/
public final class FixedTokenLengthChunker implements Chunker {

/** The identifier for the fixed token length chunking algorithm. */
public static final String ALGORITHM_NAME = "fixed_token_length";

// field name for each parameter
/** Field name for the analysis registry configuration parameter. */
public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry";

/** Field name for specifying the maximum number of tokens per chunk. */
public static final String TOKEN_LIMIT_FIELD = "token_limit";

/** Field name for specifying the overlap rate between consecutive chunks. */
public static final String OVERLAP_RATE_FIELD = "overlap_rate";

/** Field name for specifying the maximum token count allowed in the input text. */
public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count";

/** Field name for specifying the tokenizer to be used for text analysis. */
public static final String TOKENIZER_FIELD = "tokenizer";

// default values for each non-runtime parameter
Expand All @@ -57,6 +66,10 @@ public final class FixedTokenLengthChunker implements Chunker {
private double overlapRate;
private final AnalysisRegistry analysisRegistry;

/**
* Constructor that initializes the fixed token length chunker with the specified parameters.
* @param parameters a map with non-runtime parameters to be parsed
*/
public FixedTokenLengthChunker(final Map<String, Object> parameters) {
parseParameters(parameters);
this.analysisRegistry = (AnalysisRegistry) parameters.get(ANALYSIS_REGISTRY_FIELD);
Expand Down
Loading