Skip to content

Commit

Permalink
Store index metadata file for Lucene text indexes (#13948)
Browse files Browse the repository at this point in the history
* add lucene.properties metadata file per lucene index

* add javadoc

* improve javadoc
  • Loading branch information
itschrispeck authored Oct 7, 2024
1 parent 41b9a60 commit 0b7dae6
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ public LuceneTextIndexCreator(String column, File segmentIndexDir, boolean commi
// to V3 if segmentVersion is set to V3 in SegmentGeneratorConfig.
_indexFile = getV1TextIndexFile(segmentIndexDir);

// write properties file for the immutable segment
if (_commitOnClose) {
TextIndexUtils.writeConfigToPropertiesFile(_indexFile, config);
}

Analyzer luceneAnalyzer = TextIndexUtils.getAnalyzer(config);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(luceneAnalyzer);
indexWriterConfig.setRAMBufferSizeMB(config.getLuceneMaxBufferSizeMB());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,15 @@ public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextInde
// TODO: consider using a threshold of num docs per segment to decide between building
// mapping file upfront on segment load v/s on-the-fly during query processing
_docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher);
// If the properties file exists, use the analyzer properties and query parser class from the properties file
File propertiesFile = new File(indexFile, V1Constants.Indexes.LUCENE_TEXT_INDEX_PROPERTIES_FILE);
if (propertiesFile.exists()) {
config = TextIndexUtils.getUpdatedConfigFromPropertiesFile(propertiesFile, config);
}
_analyzer = TextIndexUtils.getAnalyzer(config);
_queryParserClass = config.getLuceneQueryParserClass();
_queryParserClassConstructor =
TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(_queryParserClass);
TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(_queryParserClass);
LOGGER.info("Successfully read lucene index for {} from {}", _column, indexDir);
} catch (Exception e) {
LOGGER.error("Failed to instantiate Lucene text index reader for column {}, exception {}", column,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,22 @@
import java.util.Map;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import org.apache.commons.configuration2.PropertiesConfiguration;
import org.apache.commons.configuration2.ex.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.V1Constants;
import org.apache.pinot.segment.spi.V1Constants.Indexes;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.apache.pinot.segment.spi.store.SegmentDirectoryPaths;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.env.CommonsConfigurationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -279,19 +284,72 @@ public static Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTy
// Fail-fast if the specified query parser class is not a QueryParserBase (or a subclass of it)
final Class<?> queryParserClass = Class.forName(queryParserClassName);
if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {
throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
+ " is not assignable from " + QueryParserBase.class.getName());
throw new ReflectiveOperationException(
"The specified lucene query parser class " + queryParserClassName + " is not assignable from "
+ QueryParserBase.class.getName());
}
// Fail-fast if the query parser does not have the required constructor used by this class
try {
queryParserClass.getConstructor(String.class, Analyzer.class);
} catch (NoSuchMethodException ex) {
throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ " is not assignable from does not have the required constructor method with parameter type "
+ "[String.class, Analyzer.class]"
);
+ " is not assignable from does not have the required constructor method with parameter type "
+ "[String.class, Analyzer.class]");
}

return (Constructor<QueryParserBase>) queryParserClass.getConstructor(String.class, Analyzer.class);
}

/**
 * Persists the Lucene-related settings of a text index config to a properties file inside the given directory.
 * The stored keys are luceneAnalyzerClass, luceneAnalyzerClassArgs, luceneAnalyzerClassArgTypes, and
 * luceneQueryParserClass. Every analyzer arg and arg type is passed through
 * {@link CommonsConfigurationUtils#replaceSpecialCharacterInPropertyValue} before it is stored.
 *
 * @param indexDir directory in which the properties file, named
 *                 {@link V1Constants.Indexes#LUCENE_TEXT_INDEX_PROPERTIES_FILE}, is written
 * @param config text index config whose Lucene settings are written
 */
public static void writeConfigToPropertiesFile(File indexDir, TextIndexConfig config) {
  // Run each analyzer argument through the special-character replacement helper before storing it
  List<String> analyzerArgs = new ArrayList<>();
  for (String rawArg : config.getLuceneAnalyzerClassArgs()) {
    analyzerArgs.add(CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(rawArg));
  }

  // Same treatment for the argument types
  List<String> analyzerArgTypes = new ArrayList<>();
  for (String rawArgType : config.getLuceneAnalyzerClassArgTypes()) {
    analyzerArgTypes.add(CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(rawArgType));
  }

  PropertiesConfiguration luceneProperties = new PropertiesConfiguration();
  luceneProperties.setProperty(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS, config.getLuceneAnalyzerClass());
  luceneProperties.setProperty(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS, analyzerArgs);
  luceneProperties.setProperty(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES, analyzerArgTypes);
  luceneProperties.setProperty(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS,
      config.getLuceneQueryParserClass());

  CommonsConfigurationUtils.saveToFile(luceneProperties,
      new File(indexDir, V1Constants.Indexes.LUCENE_TEXT_INDEX_PROPERTIES_FILE));
}

/**
 * Builds a TextIndexConfig based on the given config, with its Lucene settings replaced by the values stored in
 * the properties file. The replaced settings are luceneAnalyzerClass, luceneAnalyzerClassArgs,
 * luceneAnalyzerClassArgTypes, and luceneQueryParserClass. An args or arg types list that is absent from the file
 * is replaced by an empty list; every stored list entry is passed through
 * {@link CommonsConfigurationUtils#recoverSpecialCharacterInPropertyValue} before use.
 *
 * @param file properties file to read from
 * @param config config that serves as the base for the returned config
 * @return a new TextIndexConfig carrying the Lucene settings read from the file
 * @throws ConfigurationException declared for failures while loading the properties file
 */
public static TextIndexConfig getUpdatedConfigFromPropertiesFile(File file, TextIndexConfig config)
    throws ConfigurationException {
  PropertiesConfiguration storedProperties = CommonsConfigurationUtils.fromFile(file);
  List<String> storedArgs =
      storedProperties.getList(String.class, FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS);
  List<String> storedArgTypes =
      storedProperties.getList(String.class, FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES);

  // A missing key yields a null list; in that case the override is an empty list
  List<String> analyzerArgs = new ArrayList<>();
  if (storedArgs != null) {
    for (String storedArg : storedArgs) {
      analyzerArgs.add(CommonsConfigurationUtils.recoverSpecialCharacterInPropertyValue(storedArg));
    }
  }
  List<String> analyzerArgTypes = new ArrayList<>();
  if (storedArgTypes != null) {
    for (String storedArgType : storedArgTypes) {
      analyzerArgTypes.add(CommonsConfigurationUtils.recoverSpecialCharacterInPropertyValue(storedArgType));
    }
  }

  String analyzerClass = storedProperties.getString(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS);
  String queryParserClass = storedProperties.getString(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS);

  return new TextIndexConfigBuilder(config)
      .withLuceneAnalyzerClass(analyzerClass)
      .withLuceneAnalyzerClassArgs(analyzerArgs)
      .withLuceneAnalyzerClassArgTypes(analyzerArgTypes)
      .withLuceneQueryParserClass(queryParserClass)
      .build();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.segment.local.segment.store;

import java.io.File;
import java.util.Arrays;
import org.apache.commons.configuration2.ex.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.spi.V1Constants;
import org.apache.pinot.segment.spi.index.TextIndexConfig;
import org.testng.annotations.Test;

import static org.testng.Assert.assertEquals;


/**
 * Tests for the properties-file helpers in {@link TextIndexUtils}.
 */
public class TextIndexUtilsTest {
  // Scratch directory under the system temp dir that holds the properties file written by the test
  private static final File TEMP_DIR = new File(FileUtils.getTempDirectory(), "TextIndexUtilsTest");

  /**
   * Writes a config containing special characters in the analyzer args to a properties file, reads it back, and
   * checks that the config read back equals the one written.
   */
  @Test
  public void testRoundTripProperties()
      throws ConfigurationException {
    // Start from a clean slate so a stale properties file left by an earlier run cannot satisfy the read below
    FileUtils.deleteQuietly(TEMP_DIR);
    try {
      TextIndexConfig config =
          new TextIndexConfigBuilder().withLuceneAnalyzerClass("org.apache.lucene.analysis.core.KeywordAnalyzer")
              .withLuceneAnalyzerClassArgs(
                  Arrays.asList(" \\,.\n\t()[]{}\"':=-_$\\?@&|#+/", "\\,.()[]{}\"':=-_$\\?@&|#+"))
              .withLuceneAnalyzerClassArgTypes(Arrays.asList("java.lang.String", "java.lang.String"))
              .withLuceneQueryParserClass("org.apache.pinot.utils.lucene.queryparser.FakeQueryParser").build();

      TextIndexUtils.writeConfigToPropertiesFile(TEMP_DIR, config);
      TextIndexConfig readConfig = TextIndexUtils.getUpdatedConfigFromPropertiesFile(
          new File(TEMP_DIR, V1Constants.Indexes.LUCENE_TEXT_INDEX_PROPERTIES_FILE), config);
      assertEquals(readConfig, config);
    } finally {
      // Do not leave test artifacts behind in the shared temp directory, whether the test passed or failed
      FileUtils.deleteQuietly(TEMP_DIR);
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ public static class Indexes {
public static final String LUCENE_V9_TEXT_INDEX_FILE_EXTENSION = ".lucene.v9.index";
public static final String LUCENE_V99_FST_INDEX_FILE_EXTENSION = ".lucene.v99.fst";
public static final String LUCENE_V99_TEXT_INDEX_FILE_EXTENSION = ".lucene.v99.index";
public static final String LUCENE_TEXT_INDEX_PROPERTIES_FILE = "lucene.properties";
public static final String VECTOR_INDEX_FILE_EXTENSION = ".vector.index";
public static final String VECTOR_HNSW_INDEX_FILE_EXTENSION = ".vector.hnsw.index";
public static final String VECTOR_V99_INDEX_FILE_EXTENSION = ".vector.v99.index";
Expand Down

0 comments on commit 0b7dae6

Please sign in to comment.