-
Notifications
You must be signed in to change notification settings - Fork 690
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SOLR-17192: Add "field-limiting" URP to catch ill-designed schemas #2395
Changes from 15 commits
d022a81
343e7b0
6c7c8ea
5512246
2840fe1
cab33e0
2882169
f5a4f05
df64c44
05f5e9e
6b7dc1b
aae56d5
d3f14e4
6831440
a07df3e
32a8e4f
86e3166
9773505
2588b9c
f864df5
5eb7d13
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.solr.update.processor; | ||
|
||
import java.io.IOException; | ||
import java.lang.invoke.MethodHandles; | ||
import java.util.Locale; | ||
import org.apache.solr.common.SolrException; | ||
import org.apache.solr.request.SolrQueryRequest; | ||
import org.apache.solr.update.AddUpdateCommand; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class NumFieldLimitingUpdateRequestProcessor extends UpdateRequestProcessor { | ||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); | ||
|
||
private SolrQueryRequest req; | ||
private int fieldThreshold; | ||
private int currentNumFields; | ||
private boolean warnOnly; | ||
|
||
public NumFieldLimitingUpdateRequestProcessor( | ||
SolrQueryRequest req, | ||
UpdateRequestProcessor next, | ||
int fieldThreshold, | ||
int currentNumFields, | ||
boolean warnOnly) { | ||
super(next); | ||
this.req = req; | ||
this.fieldThreshold = fieldThreshold; | ||
this.currentNumFields = currentNumFields; | ||
this.warnOnly = warnOnly; | ||
} | ||
|
||
public void processAdd(AddUpdateCommand cmd) throws IOException { | ||
if (coreExceedsFieldLimit()) { | ||
throwExceptionOrLog(cmd.getPrintableId()); | ||
} else { | ||
if (log.isDebugEnabled()) { | ||
log.debug( | ||
"Allowing document {}, since current core is under the 'maxFields' limit (numFields={}, maxFields={})", | ||
cmd.getPrintableId(), | ||
currentNumFields, | ||
fieldThreshold); | ||
} | ||
} | ||
super.processAdd(cmd); | ||
} | ||
|
||
protected boolean coreExceedsFieldLimit() { | ||
return currentNumFields > fieldThreshold; | ||
} | ||
|
||
protected void throwExceptionOrLog(String id) { | ||
final String messageSuffix = warnOnly ? "Blocking update of document " + id : ""; | ||
final String message = | ||
String.format( | ||
Locale.ROOT, | ||
"Current core has %d fields, exceeding the max-fields limit of %d. %s", | ||
currentNumFields, | ||
fieldThreshold, | ||
messageSuffix); | ||
if (warnOnly) { | ||
log.warn(message); | ||
} else { | ||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, message); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.solr.update.processor; | ||
|
||
import org.apache.solr.common.util.NamedList; | ||
import org.apache.solr.core.SolrCore; | ||
import org.apache.solr.request.SolrQueryRequest; | ||
import org.apache.solr.response.SolrQueryResponse; | ||
import org.apache.solr.search.SolrIndexSearcher; | ||
import org.apache.solr.util.plugin.SolrCoreAware; | ||
|
||
/** | ||
* This factory generates an UpdateRequestProcessor which fails update requests once a core has | ||
* exceeded a configurable maximum number of fields. Meant as a safeguard to help users notice | ||
* potentially-dangerous schema design before performance and stability problems start to occur. | ||
* | ||
* <p>The URP uses the core's {@link SolrIndexSearcher} to judge the current number of fields. | ||
* Accordingly, it undercounts the number of fields in the core - missing all fields added since the | ||
* previous searcher was opened. As such, the URP's request-blocking is "best effort" - it cannot be | ||
* relied on as a precise limit on the number of fields. | ||
* | ||
* <p>Additionally, the field-counting includes all documents present in the index, including any | ||
* deleted docs that haven't yet been purged via segment merging. Note that this can differ | ||
* significantly from the number of fields defined in managed-schema.xml - especially when dynamic | ||
* fields are enabled. The only way to reduce this field count is to delete documents and wait until | ||
* the deleted documents have been removed by segment merges. Users may of course speed up this | ||
* process by tweaking Solr's segment-merging, triggering an "optimize" operation, etc. | ||
* | ||
* <p>{@link NumFieldLimitingUpdateRequestProcessorFactory} accepts two configuration parameters: | ||
* | ||
* <ul> | ||
* <li><code>maxFields</code> - (required) The maximum number of fields before update requests | ||
* should be aborted. Once this limit has been exceeded, additional update requests will fail | ||
* until fields have been removed or the "maxFields" is increased. | ||
* <li><code>warnOnly</code> - (optional) If <code>true</code> then the URP logs verbose warnings | ||
* about the limit being exceeded but doesn't abort update requests. Defaults to <code>false | ||
* </code> if not specified | ||
* </ul> | ||
* | ||
* @since 9.6.0 | ||
gerlowskija marked this conversation as resolved.
Show resolved
Hide resolved
|
||
*/ | ||
public class NumFieldLimitingUpdateRequestProcessorFactory extends UpdateRequestProcessorFactory | ||
implements SolrCoreAware { | ||
|
||
private static final String MAXIMUM_FIELDS_PARAM = "maxFields"; | ||
private static final String WARN_ONLY_PARAM = "warnOnly"; | ||
|
||
private NumFieldsMonitor numFieldsMonitor; | ||
private int maximumFields; | ||
private boolean warnOnly; | ||
|
||
@Override | ||
public void inform(final SolrCore core) { | ||
// register a commit callback for monitoring the number of fields in the schema | ||
numFieldsMonitor = new NumFieldsMonitor(core); | ||
core.getUpdateHandler().registerCommitCallback(numFieldsMonitor); | ||
core.registerNewSearcherListener(numFieldsMonitor); | ||
} | ||
|
||
@Override | ||
public void init(NamedList<?> args) { | ||
warnOnly = args.indexOf(WARN_ONLY_PARAM, 0) > 0 ? args.getBooleanArg(WARN_ONLY_PARAM) : false; | ||
|
||
if (args.indexOf(MAXIMUM_FIELDS_PARAM, 0) < 0) { | ||
throw new IllegalArgumentException( | ||
"The " | ||
+ MAXIMUM_FIELDS_PARAM | ||
+ " parameter is required for " | ||
+ getClass().getName() | ||
+ ", but no value was provided."); | ||
} | ||
final Object rawMaxFields = args.get(MAXIMUM_FIELDS_PARAM); | ||
gerlowskija marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (rawMaxFields == null || !(rawMaxFields instanceof Integer)) { | ||
throw new IllegalArgumentException( | ||
MAXIMUM_FIELDS_PARAM + " must be configured as a non-null <int>"); | ||
} | ||
maximumFields = (Integer) rawMaxFields; | ||
if (maximumFields <= 0) { | ||
throw new IllegalArgumentException(MAXIMUM_FIELDS_PARAM + " must be a positive integer"); | ||
} | ||
} | ||
|
||
@Override | ||
public UpdateRequestProcessor getInstance( | ||
SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { | ||
|
||
// TODO Should we skip to the next URP if running in SolrCloud and *not* a leader? | ||
return new NumFieldLimitingUpdateRequestProcessor( | ||
req, next, maximumFields, numFieldsMonitor.getCurrentNumFields(), warnOnly); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Woah, okay, I wasn't expecting this. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Personally, I find the NumFieldsMonitor approach preferable to using a SIS here directly:
If there are other votes I'm happy to go with whatever the majority considers simpler. But otherwise I'll stick with NumFieldsMonitor approach.
I don't love "NumFieldLimitingURP" as a name, but I like that the name itself suggests why a user would want to use it. "Want to cap the fields in your index? Then use this thing..." BlockNewDocs captures what the URP does, but doesn't capture the "why", which makes it less discoverable to users IMO (And you could also interpret it as distinguishing between "new" and "existing" docs, which isn't the case.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Unless that abstraction shows utility here, I don't think it's worth another class.
That's not expensive!
Sorry, what? I didn't mean rename the factory (it's well named!). I think the URP should be a small inner class of this factory named whatever makes sense. Not public. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To be clear, I am proposing that the whole NumFieldsMonitor thing (and how we init it, etc.) be replaced by no more than one line of code. :-) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
NumFieldsMonitor is more LOC and more Java classes, but IMO both of those are red-herrings in terms of complexity and maintenance burden. My subjective opinion is that an unfamiliar reader would have an easier time grokking NumFieldsMonitor than they would parsing through the reference-checking logic that (as I understand it) the alternative would entail. That said I'm surprised to hear your confidence in "one line". My understanding of what you proposed, based on your earlier comment here, sounded like a good bit more than that. It'd need to: hold onto a weak-reference to the previous searcher, do some reference equality check on each invocation, fetch the updated number of fields, update the weak-ref if the current searcher is now different, etc. If using If I'm misunderstanding your suggestion and overcomplicating it in my head, and it's really just one line, please share 😄 . I'm happy to take a commit on this branch if you've got a minute! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Multiple ideas have been discussed, so it's a bit confusing. My epiphany from your recent changes is that we only need to know how many fields exist at the time that the URP is instantiated, unlike the first incarnation. This ought to be a major simplifier and it nullifies my idea around wanting a WeakReference to a SolrIndexSearcher. May I do the change to your PR here in one commit that you might revert if you don't like it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Absolutely, that'd be great. Thanks @dsmiley ! EDIT - oops, I see I responded here too late and you've already created a PR against my fork. Will go take a look at that. In future, you're welcome to push directly to my dev branch, I don't mind. Whatever you prefer. |
||
} | ||
|
||
public int getFieldThreshold() { | ||
return maximumFields; | ||
} | ||
|
||
public boolean getWarnOnly() { | ||
return warnOnly; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.solr.update.processor; | ||
|
||
import org.apache.solr.core.AbstractSolrEventListener; | ||
import org.apache.solr.core.SolrCore; | ||
import org.apache.solr.search.SolrIndexSearcher; | ||
import org.apache.solr.util.RefCounted; | ||
|
||
public class NumFieldsMonitor extends AbstractSolrEventListener { | ||
dsmiley marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
private int numFields; | ||
|
||
public NumFieldsMonitor(SolrCore core) { | ||
super(core); | ||
this.numFields = -1; | ||
} | ||
|
||
@Override | ||
public void postCommit() { | ||
dsmiley marked this conversation as resolved.
Show resolved
Hide resolved
|
||
RefCounted<SolrIndexSearcher> indexSearcher = null; | ||
try { | ||
indexSearcher = getCore().getSearcher(); | ||
gerlowskija marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Get the number of fields directly from the IndexReader instead of the Schema object to also | ||
// include the fields that are missing from the Schema file (dynamic/deleted etc.) | ||
numFields = indexSearcher.get().getFieldInfos().size(); | ||
} finally { | ||
if (indexSearcher != null) { | ||
indexSearcher.decref(); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public void newSearcher(SolrIndexSearcher newSearcher, SolrIndexSearcher currentSearcher) { | ||
numFields = newSearcher.getFieldInfos().size(); | ||
} | ||
|
||
public int getCurrentNumFields() { | ||
return numFields; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
<?xml version="1.0" encoding="UTF-8" ?> | ||
<!-- | ||
Licensed to the Apache Software Foundation (ASF) under one or more | ||
contributor license agreements. See the NOTICE file distributed with | ||
this work for additional information regarding copyright ownership. | ||
The ASF licenses this file to You under the Apache License, Version 2.0 | ||
(the "License"); you may not use this file except in compliance with | ||
the License. You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
--> | ||
<schema name="minimal" version="1.1"> | ||
<fieldType name="string" class="solr.StrField"/> | ||
<fieldType name="int" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> | ||
<fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> | ||
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true" /> | ||
<dynamicField name="*" type="string" indexed="true" stored="true"/> | ||
<!-- for versioning --> | ||
<field name="_version_" type="long" indexed="true" stored="true"/> | ||
<field name="_root_" type="string" indexed="true" stored="true" multiValued="false" required="false"/> | ||
<field name="id" type="string" indexed="true" stored="true"/> | ||
<dynamicField name="*_s" type="string" indexed="true" stored="true" /> | ||
<uniqueKey>id</uniqueKey> | ||
</schema> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
<?xml version="1.0" ?> | ||
|
||
<!-- | ||
Licensed to the Apache Software Foundation (ASF) under one or more | ||
contributor license agreements. See the NOTICE file distributed with | ||
this work for additional information regarding copyright ownership. | ||
The ASF licenses this file to You under the Apache License, Version 2.0 | ||
(the "License"); you may not use this file except in compliance with | ||
the License. You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
--> | ||
|
||
<!-- Minimal solrconfig.xml with /select, /admin and /update only --> | ||
|
||
<config> | ||
|
||
<dataDir>${solr.data.dir:}</dataDir> | ||
|
||
<directoryFactory name="DirectoryFactory" | ||
class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/> | ||
<schemaFactory class="ClassicIndexSchemaFactory"/> | ||
|
||
<luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion> | ||
|
||
<updateHandler class="solr.DirectUpdateHandler2"> | ||
<commitWithin> | ||
<softCommit>${solr.commitwithin.softcommit:true}</softCommit> | ||
</commitWithin> | ||
<updateLog class="${solr.ulog:solr.UpdateLog}"></updateLog> | ||
</updateHandler> | ||
|
||
<requestHandler name="/select" class="solr.SearchHandler"> | ||
<lst name="defaults"> | ||
<str name="echoParams">explicit</str> | ||
<str name="indent">true</str> | ||
<str name="df">text</str> | ||
</lst> | ||
</requestHandler> | ||
|
||
<updateProcessor class="solr.NumFieldLimitingUpdateRequestProcessorFactory" name="max-fields"> | ||
<int name="maxFields">${solr.test.maxFields:1234}</int> | ||
</updateProcessor> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why declare this separately instead of in the one chain that uses it? It's not obvious, when looking at the chain, that there is this additional processor. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tbh I wasn't aware that you could declare processors (that require config) directly in a processor chain. I can't find an example of that. Looking through several other solrconfig.xml files for URPs that require configuration...they all look similar to what I've done here afaict. (See the _default configset Solr ships and the default configset in the 'core' modules tests...both look similar to the example here) Maybe I'm misunderstanding you or missing something though - can you point to an example of how you'd prefer to see this declared, and a reason it's preferable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Having no-config vs config doesn't conceptually change where the element can be placed.
Can be in the same places. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've updated this to be in line with your request. Though I'm still curious if there's a reason to prefer this going forward, or whether it's subjective. I've not paid much attention historically to how these appear in our configsets, but naively I'd assume that more composability is better. |
||
<updateProcessor class="solr.AddSchemaFieldsUpdateProcessorFactory" name="add-schema-fields"> | ||
<lst name="typeMapping"> | ||
<str name="valueClass">java.lang.String</str> | ||
<str name="fieldType">text_general</str> | ||
<lst name="copyField"> | ||
<str name="dest">*_str</str> | ||
<int name="maxChars">256</int> | ||
</lst> | ||
<!-- Use as default mapping instead of defaultFieldType --> | ||
<bool name="default">true</bool> | ||
</lst> | ||
</updateProcessor> | ||
<updateRequestProcessorChain name="add-unknown-fields-to-the-schema" default="true" | ||
processor="max-fields"> | ||
<processor class="solr.LogUpdateProcessorFactory"/> | ||
<processor class="solr.DistributedUpdateProcessorFactory"/> | ||
<processor class="solr.RunUpdateProcessorFactory"/> | ||
</updateRequestProcessorChain> | ||
|
||
<indexConfig> | ||
<mergeScheduler class="${solr.mscheduler:org.apache.lucene.index.ConcurrentMergeScheduler}"/> | ||
</indexConfig> | ||
</config> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
javadoc needed, even if one line linking to a factory for further info.
Or don't make public :-)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missed this earlier; done!