From 0b7dae61db27ae69057050044820ef6caae720a0 Mon Sep 17 00:00:00 2001 From: Christopher Peck <27231838+itschrispeck@users.noreply.github.com> Date: Mon, 7 Oct 2024 13:44:49 -0700 Subject: [PATCH] Store index metadata file for Lucene text indexes (#13948) * add lucene.properties metadata file per lucene index * add javadoc * improve javadoc --- .../impl/text/LuceneTextIndexCreator.java | 5 ++ .../readers/text/LuceneTextIndexReader.java | 7 +- .../local/segment/store/TextIndexUtils.java | 68 +++++++++++++++++-- .../segment/store/TextIndexUtilsTest.java | 51 ++++++++++++++ .../apache/pinot/segment/spi/V1Constants.java | 1 + 5 files changed, 126 insertions(+), 6 deletions(-) create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/TextIndexUtilsTest.java diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java index d38e2c04fe8e..53c50b30e0e0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java @@ -118,6 +118,11 @@ public LuceneTextIndexCreator(String column, File segmentIndexDir, boolean commi // to V3 if segmentVersion is set to V3 in SegmentGeneratorConfig. _indexFile = getV1TextIndexFile(segmentIndexDir); + // write properties file for the immutable segment + if (_commitOnClose) { + TextIndexUtils.writeConfigToPropertiesFile(_indexFile, config); + } + Analyzer luceneAnalyzer = TextIndexUtils.getAnalyzer(config); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(luceneAnalyzer); indexWriterConfig.setRAMBufferSizeMB(config.getLuceneMaxBufferSizeMB()); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java index ff5690d0430d..b49006f1cd6b 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java @@ -93,10 +93,15 @@ public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextInde // TODO: consider using a threshold of num docs per segment to decide between building // mapping file upfront on segment load v/s on-the-fly during query processing _docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher); + // If the properties file exists, use the analyzer properties and query parser class from the properties file + File propertiesFile = new File(indexFile, V1Constants.Indexes.LUCENE_TEXT_INDEX_PROPERTIES_FILE); + if (propertiesFile.exists()) { + config = TextIndexUtils.getUpdatedConfigFromPropertiesFile(propertiesFile, config); + } _analyzer = TextIndexUtils.getAnalyzer(config); _queryParserClass = config.getLuceneQueryParserClass(); _queryParserClassConstructor = - TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(_queryParserClass); + TextIndexUtils.getQueryParserWithStringAndAnalyzerTypeConstructor(_queryParserClass); LOGGER.info("Successfully read lucene index for {} from {}", _column, indexDir); } catch (Exception e) { LOGGER.error("Failed to instantiate Lucene text index reader for column {}, exception {}", column, diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java index 2d8abe40f454..350ad6856e52 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java @@ -28,17 +28,22 @@ import java.util.Map; import java.util.stream.Collectors; import javax.annotation.Nullable; +import org.apache.commons.configuration2.PropertiesConfiguration; +import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.commons.io.FileUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.queryparser.classic.QueryParserBase; import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator; +import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder; +import org.apache.pinot.segment.spi.V1Constants; import org.apache.pinot.segment.spi.V1Constants.Indexes; import org.apache.pinot.segment.spi.index.TextIndexConfig; import org.apache.pinot.segment.spi.store.SegmentDirectoryPaths; import org.apache.pinot.spi.config.table.FSTType; import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.env.CommonsConfigurationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -279,19 +284,72 @@ public static Constructor getQueryParserWithStringAndAnalyzerTy // Fail-fast if the query parser is specified class is not QueryParseBase class final Class queryParserClass = Class.forName(queryParserClassName); if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) { - throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName - + " is not assignable from " + QueryParserBase.class.getName()); + throw new ReflectiveOperationException( + "The specified lucene query parser class " + queryParserClassName + " is not assignable from " + + QueryParserBase.class.getName()); } // Fail-fast if the query parser does not have the required constructor used by this class try { queryParserClass.getConstructor(String.class, Analyzer.class); } catch (NoSuchMethodException ex) { throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName - + " is not assignable from does not have the required constructor method with parameter type " - + "[String.class, Analyzer.class]" - ); + + " is not assignable from does not have the required constructor method with parameter type " + + "[String.class, Analyzer.class]"); } return (Constructor) queryParserClass.getConstructor(String.class, Analyzer.class); } + + /** + * Writes the config to the properties file. Configs saved include luceneAnalyzerClass, luceneAnalyzerClassArgs, + * luceneAnalyzerClassArgTypes, and luceneQueryParserClass. + * + * @param indexDir directory where the properties file is saved + * @param config config to write to the properties file + */ + public static void writeConfigToPropertiesFile(File indexDir, TextIndexConfig config) { + PropertiesConfiguration properties = new PropertiesConfiguration(); + List escapedLuceneAnalyzerClassArgs = config.getLuceneAnalyzerClassArgs().stream() + .map(CommonsConfigurationUtils::replaceSpecialCharacterInPropertyValue).collect(Collectors.toList()); + List escapedLuceneAnalyzerClassArgTypes = config.getLuceneAnalyzerClassArgTypes().stream() + .map(CommonsConfigurationUtils::replaceSpecialCharacterInPropertyValue).collect(Collectors.toList()); + + properties.setProperty(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS, config.getLuceneAnalyzerClass()); + properties.setProperty(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS, escapedLuceneAnalyzerClassArgs); + properties.setProperty(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES, escapedLuceneAnalyzerClassArgTypes); + properties.setProperty(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS, config.getLuceneQueryParserClass()); + + File propertiesFile = new File(indexDir, V1Constants.Indexes.LUCENE_TEXT_INDEX_PROPERTIES_FILE); + CommonsConfigurationUtils.saveToFile(properties, propertiesFile); + } + + /** + * Returns an updated TextIndexConfig, overriding the values in the config with the values in the properties file. + * The configs overwritten include luceneAnalyzerClass, luceneAnalyzerClassArgs, luceneAnalyzerClassArgTypes, + * and luceneQueryParserClass. + * + * @param file properties file to read from + * @param config config to update + * @return updated TextIndexConfig + */ + public static TextIndexConfig getUpdatedConfigFromPropertiesFile(File file, TextIndexConfig config) + throws ConfigurationException { + PropertiesConfiguration properties = CommonsConfigurationUtils.fromFile(file); + List luceneAnalyzerClassArgs = + properties.getList(String.class, FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS); + List luceneAnalyzerClassArgTypes = + properties.getList(String.class, FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES); + List recoveredLuceneAnalyzerClassArgs = luceneAnalyzerClassArgs == null ? new ArrayList<>() + : luceneAnalyzerClassArgs.stream().map(CommonsConfigurationUtils::recoverSpecialCharacterInPropertyValue) + .collect(Collectors.toList()); + List recoveredLuceneAnalyzerClassArgTypes = luceneAnalyzerClassArgTypes == null ? new ArrayList<>() + : luceneAnalyzerClassArgTypes.stream().map(CommonsConfigurationUtils::recoverSpecialCharacterInPropertyValue) + .collect(Collectors.toList()); + + return new TextIndexConfigBuilder(config).withLuceneAnalyzerClass( + properties.getString(FieldConfig.TEXT_INDEX_LUCENE_ANALYZER_CLASS)) + .withLuceneAnalyzerClassArgs(recoveredLuceneAnalyzerClassArgs) + .withLuceneAnalyzerClassArgTypes(recoveredLuceneAnalyzerClassArgTypes) + .withLuceneQueryParserClass(properties.getString(FieldConfig.TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS)).build(); + } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/TextIndexUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/TextIndexUtilsTest.java new file mode 100644 index 000000000000..f818a39c7f96 --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/TextIndexUtilsTest.java @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.store; + +import java.io.File; +import java.util.Arrays; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder; +import org.apache.pinot.segment.spi.V1Constants; +import org.apache.pinot.segment.spi.index.TextIndexConfig; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + + +public class TextIndexUtilsTest { + private static final File TEMP_DIR = new File(FileUtils.getTempDirectory(), "TextIndexUtilsTest"); + + @Test + public void testRoundTripProperties() + throws ConfigurationException { + TextIndexConfig config = + new TextIndexConfigBuilder().withLuceneAnalyzerClass("org.apache.lucene.analysis.core.KeywordAnalyzer") + .withLuceneAnalyzerClassArgs( + Arrays.asList(" \\,.\n\t()[]{}\"':=-_$\\?@&|#+/", "\\,.()[]{}\"':=-_$\\?@&|#+")) + .withLuceneAnalyzerClassArgTypes(Arrays.asList("java.lang.String", "java.lang.String")) + .withLuceneQueryParserClass("org.apache.pinot.utils.lucene.queryparser.FakeQueryParser").build(); + + TextIndexUtils.writeConfigToPropertiesFile(TEMP_DIR, config); + TextIndexConfig readConfig = TextIndexUtils.getUpdatedConfigFromPropertiesFile( + new File(TEMP_DIR, V1Constants.Indexes.LUCENE_TEXT_INDEX_PROPERTIES_FILE), config); + assertEquals(readConfig, config); + } +} diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java index 85db6bb83961..50d5ce7d1153 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java @@ -58,6 +58,7 @@ public static class Indexes { public static final String LUCENE_V9_TEXT_INDEX_FILE_EXTENSION = ".lucene.v9.index"; public static final String LUCENE_V99_FST_INDEX_FILE_EXTENSION = ".lucene.v99.fst"; public static final String LUCENE_V99_TEXT_INDEX_FILE_EXTENSION = ".lucene.v99.index"; + public static final String LUCENE_TEXT_INDEX_PROPERTIES_FILE = "lucene.properties"; public static final String VECTOR_INDEX_FILE_EXTENSION = ".vector.index"; public static final String VECTOR_HNSW_INDEX_FILE_EXTENSION = ".vector.hnsw.index"; public static final String VECTOR_V99_INDEX_FILE_EXTENSION = ".vector.v99.index";