Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SOLR-16995: Associate with each replica type a property for "numReplicas" #2039

Merged
merged 5 commits into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,11 @@
package org.apache.solr.cloud;

import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.solr.cloud.api.collections.CollectionHandlingUtils;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.MapWriter;
import org.apache.solr.common.SolrException;
Expand All @@ -48,11 +47,12 @@

final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final EnumSet<Replica.Type> leaderEligibleReplicaTypes =
CollectionHandlingUtils.leaderEligibleReplicaTypes();

private final CoreContainer cc;
private final SyncStrategy syncStrategy;
private final DistributedClusterStateUpdater distributedClusterStateUpdater;
private final EnumSet<Replica.Type> leaderEligibleReplicaTypes;

private volatile boolean isClosed = false;

Expand All @@ -68,10 +68,6 @@ public ShardLeaderElectionContext(
this.cc = cc;
this.syncStrategy = new SyncStrategy(cc);
this.distributedClusterStateUpdater = zkController.getDistributedClusterStateUpdater();
leaderEligibleReplicaTypes =
Arrays.stream(Replica.Type.values())
.filter(t -> t.leaderEligible)
.collect(Collectors.toCollection(() -> EnumSet.noneOf(Replica.Type.class)));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,7 @@
import static org.apache.solr.cloud.api.collections.CollectionHandlingUtils.SKIP_CREATE_REPLICA_IN_CLUSTER_STATE;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.PULL_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.TLOG_REPLICAS;
import static org.apache.solr.common.params.CollectionAdminParams.COLL_CONF;
import static org.apache.solr.common.params.CollectionAdminParams.FOLLOW_ALIASES;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA;
Expand All @@ -38,7 +35,6 @@
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -135,19 +131,7 @@ List<ZkNodeProps> addReplica(
int timeout = message.getInt(TIMEOUT, 10 * 60); // 10 minutes
boolean parallel = message.getBool("parallel", false);

ReplicaCount numReplicas =
new ReplicaCount(
message.getInt(NRT_REPLICAS, 0),
message.getInt(TLOG_REPLICAS, 0),
message.getInt(PULL_REPLICAS, 0));
if (numReplicas.total() <= 0) {
Replica.Type replicaType =
Replica.Type.valueOf(
message
.getStr(ZkStateReader.REPLICA_TYPE, Replica.Type.NRT.name())
.toUpperCase(Locale.ROOT));
numReplicas.increment(replicaType);
}
ReplicaCount numReplicas = ReplicaCount.fromMessage(message, null, 1);

int totalReplicas = numReplicas.total();
if (totalReplicas > 1) {
Expand Down Expand Up @@ -450,7 +434,7 @@ public static List<ReplicaPosition> buildReplicaPositions(
// the same node, but we've got to accommodate.
positions = new ArrayList<>(totalReplicas);
int i = 0;
for (Replica.Type type : Replica.Type.values()) {
for (Replica.Type type : numReplicas.keySet()) {
for (int j = 0; j < numReplicas.get(type); j++) {
positions.add(new ReplicaPosition(collectionName, sliceName, i++, type, node));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,15 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.SolrServerException;
Expand Down Expand Up @@ -107,20 +109,24 @@ public class CollectionHandlingUtils {

// Immutable Maps are null-hostile, so build our own
public static final Map<String, Object> COLLECTION_PROPS_AND_DEFAULTS =
Collections.unmodifiableMap(
Utils.makeMap(
CollectionStateProps.DOC_ROUTER,
DocRouter.DEFAULT_NAME,
CollectionStateProps.REPLICATION_FACTOR,
"1",
CollectionStateProps.NRT_REPLICAS,
"1",
CollectionStateProps.TLOG_REPLICAS,
"0",
CollectionStateProps.PER_REPLICA_STATE,
null,
CollectionStateProps.PULL_REPLICAS,
"0"));
Collections.unmodifiableMap(makeCollectionPropsAndDefaults());

private static Map<String, Object> makeCollectionPropsAndDefaults() {
Map<String, Object> propsAndDefaults =
Utils.makeMap(
CollectionStateProps.DOC_ROUTER,
(Object) DocRouter.DEFAULT_NAME,
CollectionStateProps.REPLICATION_FACTOR,
"1",
CollectionStateProps.PER_REPLICA_STATE,
null);
for (Replica.Type replicaType : Replica.Type.values()) {
propsAndDefaults.put(
replicaType.numReplicasPropertyName,
replicaType == Replica.Type.defaultType() ? "1" : "0");
}
return propsAndDefaults;
}

public static final Random RANDOM;

Expand All @@ -135,6 +141,20 @@ public class CollectionHandlingUtils {
}
}

/** Returns names of properties that are used to specify a number of replicas of a given type. */
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't these two methods numReplicasProperties and leaderEligibleReplicaTypes belong in Replica.Type?

Copy link
Contributor Author

@pvcnt pvcnt Nov 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the reason for not doing it was to avoid exposing more code in solrj while it is only used in Solr itself.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. But I'd still value more a cleaner object oriented design vs packaging/deployment concerns. SolrJ exposes a lot of internals to clients so within that framework I think it makes sense to group code related to the same concepts.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @murblanc . It's really tiny amounts of code any way.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really would prefer not to bloat Replica.Type with methods, especially static ones, that are not used there. Since those methods are used only from API/Command layer, I believe that having them in a class named CollectionHandlingUtils makes sense. And since they are static (this is a different story for non-static methods, I added one as requested elsewhere in this PR) I do not see the difference it makes to have them here or there. So I definitely prefer not adding random util methods to a place where they do not belong.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why you say "do not belong".
Except standard Java classes, these two static methods depend only on Replica.Type.
If you don't want to put them in the enum put them in Replica, but move them close to the code they're related to.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They do not belong there because they are not used in solrj, but in solr-core.

public static Set<String> numReplicasProperties() {
return Arrays.stream(Replica.Type.values())
.map(t -> t.numReplicasPropertyName)
.collect(Collectors.toSet());
}

/** Returns replica types that are eligible to be leader. */
public static EnumSet<Replica.Type> leaderEligibleReplicaTypes() {
return Arrays.stream(Replica.Type.values())
.filter(t -> t.leaderEligible)
.collect(Collectors.toCollection(() -> EnumSet.noneOf(Replica.Type.class)));
}

static boolean waitForCoreNodeGone(
String collectionName,
String shard,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@

package org.apache.solr.cloud.api.collections;

import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.PULL_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
import static org.apache.solr.common.cloud.ZkStateReader.TLOG_REPLICAS;
import static org.apache.solr.common.params.CollectionAdminParams.ALIAS;
import static org.apache.solr.common.params.CollectionAdminParams.COLL_CONF;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA;
Expand Down Expand Up @@ -139,7 +135,6 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList<Objec
// fail fast if parameters are wrong or incomplete
List<String> shardNames = populateShardNames(message, router);
ReplicaCount numReplicas = getNumReplicas(message);
numReplicas.validate();

DocCollection newColl = null;
final String collectionPath = DocCollection.getCollectionPath(collectionName);
Expand Down Expand Up @@ -590,12 +585,22 @@ public static List<String> populateShardNames(ZkNodeProps message, String router
return shardNames;
}

private static ReplicaCount getNumReplicas(ZkNodeProps message) {
int numTlogReplicas = message.getInt(TLOG_REPLICAS, 0);
int numNrtReplicas =
message.getInt(
NRT_REPLICAS, message.getInt(REPLICATION_FACTOR, numTlogReplicas > 0 ? 0 : 1));
return new ReplicaCount(numNrtReplicas, numTlogReplicas, message.getInt(PULL_REPLICAS, 0));
private ReplicaCount getNumReplicas(ZkNodeProps message) {
ReplicaCount numReplicas = ReplicaCount.fromMessage(message);
boolean hasLeaderEligibleReplica = numReplicas.hasLeaderReplica();
if (!hasLeaderEligibleReplica && !numReplicas.contains(Replica.Type.defaultType())) {
// Ensure that there is at least one replica that can become leader if the user did
// not force a replica count.
numReplicas.put(Replica.Type.defaultType(), 1);
} else if (!hasLeaderEligibleReplica) {
// This can still fail if the user manually forced "0" replica counts.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could simplify by removing the else branch here and keeping only the numLeaderEligibleReplicas == 0 in the if condition, basically adding a default replica (assumed to be leader eligible) if the user passed 0 leaders in the request.

BTW the numReplicas.validate() call in existing code (line 142) was useless since upon return from getNumReplicas there was always at least one leader replica.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This slightly changes the existing behaviour. As noted in the comment, if a user explicitly passes nrtReplicas=0 to the create collection command, it would fail. With your suggestion, we would silently force nrtReplicas=1. I don't mind changing the behaviour, but I tried very hard not to.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And yes ReplicaCount#validate was useless, that's why I removed it. ;)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if user passes only 0 counts for all replicas it's ok to set the default replica count to 1.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current behaviour is:

  • if I pass no parameters, I will have 1 NRT replica
  • if I pass pullReplicas=1, I will have 1 NRT and 1 PULL replica
  • if I pass nrtReplicas=0, I will have an error

What you are suggesting is to change the third situation, and instead to silently transform the user request into nrtReplicas=1. The error is triggered if the default replica type is explicitly set to 0, not if all replica types are set to 0.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, then it works because if you pass createReplicas=false alone, this will default tonrtReplicas=1. I am pretty sure that if you explicitly pass createReplicas=false&nrtReplicas=0 the call is rejected.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This succeeds: ```
curl -X POST http://localhost:8983/api/collections -H 'Content-Type: application/json' -d '
{
"name": "techproducts_v2",
"config": "_default",
"numShards": 1,
"createReplicas": false
}'

I run Solr with `docker run --rm -p 8983:8983 solr:9 -c`

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but this fails:

curl -X POST http://localhost:8983/api/collections -H 'Content-Type: application/json' -d '
{
"name": "techproducts_v2",
"config": "_default",
"numShards": 1,
"createReplicas": false,
"nrtReplicas":0
}'

with:

{"responseHeader":{"status":400,"QTime":28},"error":{"metadata":{"error-class":"org.apache.solr.common.SolrException","root-error-class":"org.apache.solr.common.SolrException"},"msg":"nrtReplicas + tlogReplicas must be greater than 0","code":400}}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may be a bug (what is the purpose of validating the number of replicas if not creating any replicas?), but the goal of this PR is not to change this behaviour.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a small change in behavior for an edge-case. Wouldn't break any existing user.

throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
"Unexpected number of replicas ("
+ numReplicas
+ "), there must be at least one leader-eligible replica");
}
return numReplicas;
}

String getConfigName(String coll, ZkNodeProps message) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,22 @@
package org.apache.solr.cloud.api.collections;

import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.PULL_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.TLOG_REPLICAS;
import static org.apache.solr.common.params.CollectionAdminParams.FOLLOW_ALIASES;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;

import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.cloud.DistributedClusterStateUpdater;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.ReplicaCount;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CommonAdminParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -72,8 +66,14 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList<Objec
}
DocCollection collection = clusterState.getCollection(collectionName);

ReplicaCount numReplicas = getNumReplicas(collection, message);
numReplicas.validate();
ReplicaCount numReplicas = ReplicaCount.fromMessage(message, collection, 1);
if (!numReplicas.hasLeaderReplica()) {
throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
"Unexpected number of replicas ("
+ numReplicas
+ "), there must be at least one leader-eligible replica");
}

if (ccc.getDistributedClusterStateUpdater().isDistributedStateUpdate()) {
// The message has been crafted by CollectionsHandler.CollectionOperation.CREATESHARD_OP and
Expand All @@ -97,33 +97,28 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList<Objec
CollectionHandlingUtils.waitForNewShard(collectionName, sliceName, ccc.getZkStateReader());

String async = message.getStr(ASYNC);
ZkNodeProps addReplicasProps =
new ZkNodeProps(
Map<String, Object> addReplicasProps =
Utils.makeMap(
COLLECTION_PROP,
collectionName,
(Object) collectionName,
murblanc marked this conversation as resolved.
Show resolved Hide resolved
SHARD_ID_PROP,
sliceName,
ZkStateReader.NRT_REPLICAS,
String.valueOf(numReplicas.get(Replica.Type.NRT)),
ZkStateReader.TLOG_REPLICAS,
String.valueOf(numReplicas.get(Replica.Type.TLOG)),
ZkStateReader.PULL_REPLICAS,
String.valueOf(numReplicas.get(Replica.Type.PULL)),
CollectionHandlingUtils.CREATE_NODE_SET,
message.getStr(CollectionHandlingUtils.CREATE_NODE_SET),
CommonAdminParams.WAIT_FOR_FINAL_STATE,
Boolean.toString(waitForFinalState));
numReplicas.writeProps(addReplicasProps);

Map<String, Object> propertyParams = new HashMap<>();
CollectionHandlingUtils.addPropertyParams(message, propertyParams);
addReplicasProps = addReplicasProps.plus(propertyParams);
if (async != null) addReplicasProps.getProperties().put(ASYNC, async);
CollectionHandlingUtils.addPropertyParams(message, addReplicasProps);
if (async != null) {
addReplicasProps.put(ASYNC, async);
}
final NamedList<Object> addResult = new NamedList<>();
try {
new AddReplicaCmd(ccc)
.addReplica(
clusterState,
addReplicasProps,
new ZkNodeProps(addReplicasProps),
pvcnt marked this conversation as resolved.
Show resolved Hide resolved
addResult,
() -> {
@SuppressWarnings("unchecked")
Expand Down Expand Up @@ -160,15 +155,4 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList<Objec

log.info("Finished create command on all shards for collection: {}", collectionName);
}

private static ReplicaCount getNumReplicas(DocCollection collection, ZkNodeProps message) {
return new ReplicaCount(
message.getInt(
NRT_REPLICAS,
message.getInt(
REPLICATION_FACTOR,
collection.getInt(NRT_REPLICAS, collection.getInt(REPLICATION_FACTOR, 1)))),
message.getInt(PULL_REPLICAS, collection.getInt(PULL_REPLICAS, 0)),
message.getInt(TLOG_REPLICAS, collection.getInt(TLOG_REPLICAS, 0)));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.solr.cloud.api.collections;

import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionAdminParams.FOLLOW_ALIASES;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA;
Expand Down Expand Up @@ -308,7 +307,7 @@ private void migrateKey(
CREATE.toLower(),
NAME,
tempSourceCollectionName,
NRT_REPLICAS,
Replica.Type.defaultType().numReplicasPropertyName,
1,
CollectionHandlingUtils.NUM_SLICES,
1,
Expand Down Expand Up @@ -394,7 +393,7 @@ private void migrateKey(
ccc.getSolrCloudManager().getDistribStateManager(),
zkStateReader.getClusterState().getCollection(tempSourceCollectionName),
tempSourceSlice.getName(),
Replica.Type.NRT);
Replica.Type.defaultType());
props = new HashMap<>();
props.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower());
props.put(COLLECTION_PROP, tempSourceCollectionName);
Expand Down
Loading
Loading