
Redriving blocked operations to child shards
vikasvb90 committed May 13, 2024
1 parent 0c51aed commit df7ca8b
Showing 16 changed files with 153 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default.

@@ -0,0 +1,22 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.action;

import org.opensearch.OpenSearchException;

/**
* Exception thrown when blocked or waiting writes fail because the primary shard has been split.
*
* @opensearch.internal
*/
public class PrimaryShardSplitException extends OpenSearchException {
public PrimaryShardSplitException(String msg) {
super(msg);
}
}
@@ -42,15 +42,18 @@
import org.opensearch.action.ActionRunnable;
import org.opensearch.action.DocWriteRequest;
import org.opensearch.action.DocWriteResponse;
import org.opensearch.action.PrimaryShardSplitException;
import org.opensearch.action.RoutingMissingException;
import org.opensearch.action.admin.indices.create.AutoCreateAction;
import org.opensearch.action.admin.indices.create.CreateIndexRequest;
import org.opensearch.action.admin.indices.create.CreateIndexResponse;
import org.opensearch.action.index.IndexRequest;
import org.opensearch.action.ingest.IngestActionForwarder;
import org.opensearch.action.support.ActionFilters;
import org.opensearch.action.support.ActiveShardCount;
import org.opensearch.action.support.AutoCreateIndex;
import org.opensearch.action.support.HandledTransportAction;
import org.opensearch.action.support.WriteRequest;
import org.opensearch.action.update.TransportUpdateAction;
import org.opensearch.action.update.UpdateRequest;
import org.opensearch.action.update.UpdateResponse;
@@ -526,6 +529,9 @@ private final class BulkOperation extends ActionRunnable<BulkResponse> {
@Override
protected void doRun() {
assert bulkRequest != null;
final ActiveShardCount bulkActiveShardCount = bulkRequest.waitForActiveShards();
final WriteRequest.RefreshPolicy refreshPolicy = bulkRequest.getRefreshPolicy();
final TimeValue timeout = bulkRequest.timeout();
final ClusterState clusterState = observer.setAndGetObservedState();
if (handleBlockExceptions(clusterState)) {
return;
@@ -677,6 +683,48 @@ public void onResponse(BulkShardResponse bulkShardResponse) {

@Override
public void onFailure(Exception e) {
if (e instanceof PrimaryShardSplitException) {
try {
reDriveOnChildShards();
return;
} catch (Exception ex) {
logger.error("Unexpected error occurred while attempting to re-drive " +
"bulk request after primary split", ex);
}
}
setBulkItemFailures(e);
if (counter.decrementAndGet() == 0) {
finishHim();
}
}

private void reDriveOnChildShards() {
BulkRequest bulkRequestForRedrive = new BulkRequest();
bulkRequestForRedrive.waitForActiveShards(bulkActiveShardCount);
bulkRequestForRedrive.timeout(timeout);
bulkRequestForRedrive.setRefreshPolicy(refreshPolicy);
requests.forEach(request -> bulkRequestForRedrive.add(request.request()));
ActionListener<BulkResponse> reDriveListener = ActionListener.wrap(reDriveResponses -> {
reDriveResponses.forEach(reDriveResponse -> {
if (reDriveResponse != null) {
responses.set(reDriveResponse.getItemId(), reDriveResponse);
}
});
if (counter.decrementAndGet() == 0) {
publishResponse();
}
}, reDriveException -> {
// We don't expect any item-level failures here, so we fail all items.
setBulkItemFailures(reDriveException);
if (counter.decrementAndGet() == 0) {
finishHim();
}
});
new BulkOperation(task, bulkRequestForRedrive, reDriveListener, responses, startTimeNanos,
indicesThatCannotBeCreated).run();
}

private void setBulkItemFailures(Exception e) {
// create failures for all relevant requests
for (BulkItemRequest request : requests) {
final String indexName = concreteIndices.getConcreteIndex(request.index()).getName();
@@ -690,14 +738,14 @@ public void onFailure(Exception e) {
docStatusStats.inc(bulkItemResponse.status());
responses.set(request.id(), bulkItemResponse);
}

if (counter.decrementAndGet() == 0) {
finishHim();
}
}

private void finishHim() {
indicesService.addDocStatusStats(docStatusStats);
publishResponse();
}

private void publishResponse() {
listener.onResponse(
new BulkResponse(
responses.toArray(new BulkItemResponse[responses.length()]),
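The re-drive logic above catches the new PrimaryShardSplitException at the shard-batch level, rebuilds a BulkRequest that carries over the original wait-for-active-shards, timeout and refresh policy (captured earlier in doRun()), re-runs a fresh BulkOperation, and merges the per-item responses back into the shared responses array by item id, reusing the same completion counter. Below is a minimal, self-contained sketch of that retry-and-merge shape in plain Java; BulkSettings, ItemResult, PrimarySplitException and the callback signatures are hypothetical stand-ins, not OpenSearch APIs.

import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReferenceArray;
import java.util.function.BiConsumer;
import java.util.function.Consumer;

// Hypothetical stand-ins for the carried-over bulk settings and per-item results.
record BulkSettings(String refreshPolicy, long timeoutMillis, int waitForActiveShards) {}
record ItemResult(int itemId, boolean failed, String detail) {}

// Stand-in for the PrimaryShardSplitException introduced by this commit.
class PrimarySplitException extends RuntimeException {
    PrimarySplitException(String msg) { super(msg); }
}

public class BulkRedriveSketch {
    private final AtomicReferenceArray<ItemResult> responses;   // one slot per bulk item, like "responses" above
    private final AtomicInteger pendingShardBatches;            // like "counter" above

    BulkRedriveSketch(int totalItems, int shardBatches) {
        this.responses = new AtomicReferenceArray<>(totalItems);
        this.pendingShardBatches = new AtomicInteger(shardBatches);
    }

    /** Called when one shard-level batch fails as a whole. */
    void onShardBatchFailure(
        List<Integer> itemIds,
        Exception e,
        BulkSettings originalSettings,
        BiConsumer<BulkSettings, Consumer<List<ItemResult>>> reDriveOnChildShards
    ) {
        if (e instanceof PrimarySplitException) {
            // Rebuild the request with the original refresh policy, timeout and
            // wait-for-active-shards, then re-run it; routing now resolves to child shards.
            reDriveOnChildShards.accept(originalSettings, redriven -> {
                redriven.forEach(r -> responses.set(r.itemId(), r));   // merge results back by item id
                finishBatch();
            });
            return;
        }
        // Any other failure marks every item of this batch as failed.
        itemIds.forEach(id -> responses.set(id, new ItemResult(id, true, e.getMessage())));
        finishBatch();
    }

    private void finishBatch() {
        if (pendingShardBatches.decrementAndGet() == 0) {
            System.out.println("bulk complete: " + responses.length() + " item slots");
        }
    }
}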
@@ -201,7 +201,7 @@ protected Request routedBasedOnClusterVersion(long routedBasedOnClusterVersion)
return (Request) this;
}

long routedBasedOnClusterVersion() {
public long routedBasedOnClusterVersion() {
return routedBasedOnClusterVersion;
}

@@ -37,6 +37,7 @@
import org.opensearch.ExceptionsHelper;
import org.opensearch.OpenSearchException;
import org.opensearch.action.ActionListenerResponseHandler;
import org.opensearch.action.PrimaryShardSplitException;
import org.opensearch.action.UnavailableShardsException;
import org.opensearch.action.support.ActionFilters;
import org.opensearch.action.support.ActiveShardCount;
@@ -566,6 +567,11 @@ void runWithPrimaryShardReference(final PrimaryShardReference primaryShardRefere
}

if (primaryShardReference.isRelocated()) {
if (primaryShardReference.routingEntry().splitting()) {
// This means the shard was being split and was in the relocation hand-off stage when the replication op arrived on the primary.
// Write ops, specifically, will now be retried and routed to the respective child shards.
throw new PrimaryShardSplitException("Primary shard is already split. Cannot perform replication operation on parent primary.");
}
primaryShardReference.close(); // release shard operation lock as soon as possible
setPhase(replicationTask, "primary_delegation");
// delegate primary phase to relocation target
@@ -1021,7 +1027,12 @@ protected void doRun() {
assert request.waitForActiveShards() != ActiveShardCount.DEFAULT
: "request waitForActiveShards must be set in resolveRequest";

final ShardRouting primary = state.getRoutingTable().shardRoutingTable(request.shardId()).primaryShard();
final ShardRouting primary;
if (indexMetadata.isParentShard(request.shardId().id()) && indexMetadata.isNonServingShard(request.shardId().id())) {
throw new PrimaryShardSplitException("Primary shard is already split. Cannot perform replication operation on parent primary.");
} else {
primary = state.getRoutingTable().shardRoutingTable(request.shardId()).primaryShard();
}
if (primary == null || primary.active() == false) {
logger.trace(
"primary shard [{}] is not yet active, scheduling a retry: action [{}], request [{}], "
@@ -1211,14 +1222,17 @@ void finishAsFailed(Exception failure) {
}

void finishWithUnexpectedFailure(Exception failure) {
logger.warn(
() -> new ParameterizedMessage(
"unexpected error during the primary phase for action [{}], request [{}]",
actionName,
request
),
failure
);
if (!(failure instanceof PrimaryShardSplitException)) {
// Skip logging in case of a primary shard split, since write requests will be retried.
logger.warn(
() -> new ParameterizedMessage(
"unexpected error during the primary phase for action [{}], request [{}]",
actionName,
request
),
failure
);
}
if (finished.compareAndSet(false, true)) {
setPhase(task, "failed");
listener.onFailure(failure);
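The hunks above add two guards against a write reaching a parent primary that has already been split: one when the primary reference reports it is relocated while routingEntry().splitting() is true, and one in the reroute phase when the shard id resolves to a parent that is no longer serving; finishWithUnexpectedFailure additionally skips the warning log for this case because the write will simply be retried. A rough, self-contained sketch of that failure-handling shape follows; PrimarySplitSignal and all other names are hypothetical stand-ins, not the real OpenSearch types.

import java.util.function.Consumer;
import java.util.logging.Logger;

// Stand-in for PrimaryShardSplitException; thrown to push the write onto the retry path.
class PrimarySplitSignal extends RuntimeException {
    PrimarySplitSignal(String msg) { super(msg); }
}

public class PrimaryPhaseSketch {
    private static final Logger logger = Logger.getLogger("primary-phase");

    static void runOnPrimary(boolean primaryIsSplitParent, Runnable op, Consumer<Exception> onFailure) {
        try {
            if (primaryIsSplitParent) {
                // The parent primary has already handed off to its child shards:
                // fail fast so the operation is re-routed instead of running on a stale shard.
                throw new PrimarySplitSignal("primary already split; retry will route to child shards");
            }
            op.run();
        } catch (Exception e) {
            if (!(e instanceof PrimarySplitSignal)) {
                // Only genuinely unexpected failures deserve a warning; the split signal
                // is part of the normal retry path, mirroring finishWithUnexpectedFailure above.
                logger.warning("unexpected error during the primary phase: " + e);
            }
            onFailure.accept(e);
        }
    }

    public static void main(String[] args) {
        runOnPrimary(true, () -> System.out.println("never runs"), e -> System.out.println("failed with: " + e.getMessage()));
    }
}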
@@ -370,7 +370,7 @@ public String toString() {
.append(indexMetadata.getAliasesVersion())
.append("]\n");
for (int shard = 0; shard < indexMetadata.getNumberOfShards(); shard++) {
if (indexMetadata.primaryTerm(shard) != IndexMetadata.SPLIT_PARENT_TERM) {
if (indexMetadata.isServingShard(shard)) {
sb.append(TAB).append(TAB).append(shard).append(": ");
sb.append("p_term [").append(indexMetadata.primaryTerm(shard)).append("], ");
sb.append("isa_ids ").append(indexMetadata.inSyncAllocationIds(shard)).append("\n");
@@ -934,15 +934,19 @@ public boolean isParentShard(Integer shardId) {
return parentToChildShardsMetadata.containsKey(shardId);
}

public boolean isNonServingShard(Integer shardId) {
return primaryTerms[shardId] == SPLIT_PARENT_TERM;
}

public boolean isServingShard(Integer shardId) {
return primaryTerms[shardId] != SPLIT_PARENT_TERM;
}

public SplitMetadata getSplitMetadata(Integer shardId) {
assert isParentShard(shardId);
return parentToChildShardsMetadata.get(shardId);
}

public Map<Integer, SplitMetadata> getParentToChildShardsMetadata() {
return parentToChildShardsMetadata;
}

public List<Integer> getChildShardIds(int shardId) {
assert isParentShard(shardId);
return new ArrayList<>(parentToChildShardsMetadata.get(shardId).getChildShards());
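The new isNonServingShard / isServingShard helpers centralize the SPLIT_PARENT_TERM check on the per-shard primary-term array, next to the existing parent-to-child split metadata. The compact sketch below mirrors that bookkeeping with simplified types; the sentinel value and the SplitMetadata shape are invented for illustration, since their definitions are not part of this diff.

import java.util.List;
import java.util.Map;

// Simplified stand-in for the split-related bookkeeping in IndexMetadata.
public class SplitShardTable {
    static final long SPLIT_PARENT_TERM = -2L;        // invented sentinel; the real constant's value is not shown in the diff
    private final long[] primaryTerms;                // one primary term per shard id
    private final Map<Integer, List<Integer>> parentToChildShards;

    SplitShardTable(long[] primaryTerms, Map<Integer, List<Integer>> parentToChildShards) {
        this.primaryTerms = primaryTerms;
        this.parentToChildShards = parentToChildShards;
    }

    boolean isParentShard(int shardId) {
        return parentToChildShards.containsKey(shardId);
    }

    // A parent whose primary term was replaced by the sentinel no longer serves traffic.
    boolean isNonServingShard(int shardId) {
        return primaryTerms[shardId] == SPLIT_PARENT_TERM;
    }

    boolean isServingShard(int shardId) {
        return primaryTerms[shardId] != SPLIT_PARENT_TERM;
    }

    List<Integer> getChildShardIds(int shardId) {
        assert isParentShard(shardId);
        return List.copyOf(parentToChildShards.get(shardId));
    }

    public static void main(String[] args) {
        // Shard 0 was split into shards 1 and 2; its term carries the sentinel.
        SplitShardTable table = new SplitShardTable(
            new long[] { SPLIT_PARENT_TERM, 3L, 3L },
            Map.of(0, List.of(1, 2))
        );
        System.out.println(table.isServingShard(0) + " " + table.getChildShardIds(0)); // false [1, 2]
    }
}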
@@ -452,12 +452,11 @@ public ShardId shardId(ClusterState clusterState, String index, String id, @Null
}

public static int generateShardId(IndexMetadata indexMetadata, @Nullable String id, @Nullable String routing) {
return generateShardId(indexMetadata, id, routing, (shardId) ->
indexMetadata.primaryTerm(shardId) == IndexMetadata.SPLIT_PARENT_TERM);
return generateShardId(indexMetadata, id, routing, indexMetadata::isNonServingShard);
}

public static int generateShardId(IndexMetadata indexMetadata, @Nullable String id, @Nullable String routing,
Predicate<Integer> canIncludeRecoveringChildShardIds) {
Predicate<Integer> shouldIncludeChildShards) {
final String effectiveRouting;
final int partitionOffset;

@@ -475,12 +474,11 @@ public static int generateShardId(IndexMetadata indexMetadata, @Nullable String
partitionOffset = 0;
}

return calculateShardIdOfChild(indexMetadata, effectiveRouting, partitionOffset, canIncludeRecoveringChildShardIds);
return calculateShardIdOfChild(indexMetadata, effectiveRouting, partitionOffset, shouldIncludeChildShards);
}

private static int calculateScaledShardId(IndexMetadata indexMetadata, String effectiveRouting, int partitionOffset) {
return calculateShardIdOfChild(indexMetadata, effectiveRouting, partitionOffset, (shardId) ->
indexMetadata.primaryTerm(shardId) == IndexMetadata.SPLIT_PARENT_TERM);
return calculateShardIdOfChild(indexMetadata, effectiveRouting, partitionOffset, indexMetadata::isNonServingShard);
}

private static int calculateShardIdOfChild(IndexMetadata indexMetadata, String effectiveRouting, int partitionOffset,
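generateShardId now takes a Predicate<Integer> (renamed to shouldIncludeChildShards) and passes indexMetadata::isNonServingShard by default, so routing can skip a split parent and resolve to one of its children. The helper below is only a guess at that shape, since calculateShardIdOfChild itself is not shown in this hunk: it hashes the routing value, and if the resulting parent shard is flagged non-serving it re-hashes over that parent's children. The plain hashCode here is just a stand-in for the real routing hash.

import java.util.List;
import java.util.Map;
import java.util.function.Predicate;

public class ChildRoutingSketch {

    // Illustrative only: the real calculateShardIdOfChild is not part of this diff.
    static int route(
        String routing,
        int numShards,
        Map<Integer, List<Integer>> childrenOfParent,
        Predicate<Integer> shouldIncludeChildShards   // e.g. indexMetadata::isNonServingShard in the diff
    ) {
        int parentShard = Math.floorMod(routing.hashCode(), numShards);   // stand-in for the routing hash
        if (shouldIncludeChildShards.test(parentShard)) {
            // The parent was split and no longer serves traffic: pick one of its children deterministically.
            List<Integer> children = childrenOfParent.get(parentShard);
            return children.get(Math.floorMod(routing.hashCode(), children.size()));
        }
        return parentShard;
    }

    public static void main(String[] args) {
        Map<Integer, List<Integer>> children = Map.of(0, List.of(2, 3));   // shard 0 split into shards 2 and 3
        Predicate<Integer> nonServing = shardId -> shardId == 0;
        System.out.println(route("user-42", 2, children, nonServing));     // resolves to 2 or 3, never 0
    }
}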
@@ -79,14 +79,14 @@ final class ShardSplittingQuery extends Query {
private final IndexMetadata indexMetadata;
private final int shardId;
private final BitSetProducer nestedParentBitSetProducer;
private final Predicate<Integer> canIncludeRecoveringChildShardIds;
private final Predicate<Integer> shouldIncludeChildShards;

ShardSplittingQuery(IndexMetadata indexMetadata, int shardId, boolean hasNested,
Predicate<Integer> canIncludeRecoveringChildShardIds) {
Predicate<Integer> shouldIncludeChildShards) {
this.indexMetadata = indexMetadata;
this.shardId = shardId;
this.nestedParentBitSetProducer = hasNested ? newParentDocBitSetProducer(indexMetadata.getCreationVersion()) : null;
this.canIncludeRecoveringChildShardIds = canIncludeRecoveringChildShardIds;
this.shouldIncludeChildShards = shouldIncludeChildShards;
}

@Override
@@ -107,7 +107,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
indexMetadata,
Uid.decodeId(ref.bytes, ref.offset, ref.length),
null,
canIncludeRecoveringChildShardIds
shouldIncludeChildShards
);
return shardId == targetShardId;
};
@@ -151,7 +151,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
};
// in the _routing case we first go and find all docs that have a routing value and mark the ones we have to delete
findSplitDocs(RoutingFieldMapper.NAME, ref -> {
int targetShardId = OperationRouting.generateShardId(indexMetadata, null, ref.utf8ToString(), canIncludeRecoveringChildShardIds);
int targetShardId = OperationRouting.generateShardId(indexMetadata, null, ref.utf8ToString(), shouldIncludeChildShards);
return shardId == targetShardId;
}, leafReader, maybeWrapConsumer.apply(bitSet::set));

@@ -292,7 +292,7 @@ boolean matches(int doc) throws IOException {
leftToVisit = 2;
leafReader.storedFields().document(doc, this);
assert id != null : "docID must not be null - we might have hit a nested document";
int targetShardId = OperationRouting.generateShardId(indexMetadata, id, routing, canIncludeRecoveringChildShardIds);
int targetShardId = OperationRouting.generateShardId(indexMetadata, id, routing, shouldIncludeChildShards);
return targetShardId != shardId;
}
}
@@ -229,8 +229,7 @@ public static void addIndices(

Tuple<Boolean, Directory> addIndexDirectoryTuple = new Tuple<>(true, statsDirectory);
addIndices(indexRecoveryStats, indexSort, sources, maxSeqNo, maxUnsafeAutoIdTimestamp, indexMetadata,
shardId, split, hasNested, addIndexDirectoryTuple, (childShardId) ->
indexMetadata.primaryTerm(childShardId) == IndexMetadata.SPLIT_PARENT_TERM,
shardId, split, hasNested, addIndexDirectoryTuple, indexMetadata::isNonServingShard,
IndexWriterConfig.OpenMode.CREATE);
}

@@ -245,7 +244,7 @@ public static void addIndices(
boolean split,
boolean hasNested,
Tuple<Boolean, Directory> addIndexDirectoryTuple,
Predicate<Integer> canIncludeRecoveringChildShardIds,
Predicate<Integer> shouldIncludeChildShards,
IndexWriterConfig.OpenMode openMode
) throws IOException {
assert sources.length > 0;
@@ -269,7 +268,7 @@ public static void addIndices(
}
indexRecoveryStats.setFileDetailsComplete();
if (split) {
writer.deleteDocuments(new ShardSplittingQuery(indexMetadata, shardId, hasNested, canIncludeRecoveringChildShardIds));
writer.deleteDocuments(new ShardSplittingQuery(indexMetadata, shardId, hasNested, shouldIncludeChildShards));
}
/*
* We set the maximum sequence number and the local checkpoint on the target to the maximum of the maximum sequence numbers on
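In the recovery path above, addIndices copies the parent shard's segments into the target and then runs writer.deleteDocuments(new ShardSplittingQuery(...)) so that only documents routed to this child remain. The sketch below shows the same idea with stock Lucene APIs and a toy routing function; it is not the ShardSplittingQuery implementation (which works off _id/_routing fields and nested-document bitsets), just an illustration of deleting the documents that belong to sibling child shards after a copy.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteBuffersDirectory;

public class SplitCleanupSketch {

    // Toy routing: stand-in for OperationRouting.generateShardId(...).
    static int targetChildShard(String id, int numChildShards) {
        return Math.floorMod(id.hashCode(), numChildShards);
    }

    public static void main(String[] args) throws Exception {
        int thisChildShard = 0;
        int numChildShards = 2;
        String[] ids = { "doc-1", "doc-2", "doc-3", "doc-4" };

        ByteBuffersDirectory dir = new ByteBuffersDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
            // Pretend these docs were bulk-copied from the parent shard's segments.
            for (String id : ids) {
                Document doc = new Document();
                doc.add(new StringField("_id", id, Field.Store.YES));
                writer.addDocument(doc);
            }
            // Delete every document that routes to a sibling child shard.
            for (String id : ids) {
                if (targetChildShard(id, numChildShards) != thisChildShard) {
                    writer.deleteDocuments(new Term("_id", id));
                }
            }
            writer.commit();
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            System.out.println("docs kept by child shard " + thisChildShard + ": " + reader.numDocs());
        }
    }
}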
@@ -886,7 +886,8 @@ public void handleRecoveryDone(ReplicationState state, ShardRouting shardRouting
}

public void handleChildRecoveriesDone(ShardRouting sourceShardRouting, long primaryTerm, RecoverySource recoverySource) {
shardStateAction.childShardsStarted(sourceShardRouting, primaryTerm, "after " + recoverySource, SHARD_STATE_ACTION_LISTENER);

shardStateAction.childShardsStarted(sourceShardRouting, primaryTerm, "after " + recoverySource, SHARD_STATE_ACTION_LISTENER);
}

private void failAndRemoveShard(
@@ -227,8 +227,7 @@ && isTargetSameHistory()
sendSnapshotStep
);
}, onFailure);

finalizeStepAndCompleteFuture(startingSeqNo, sendSnapshotStep, sendFileStep, prepareEngineStep, onFailure);
finalizeStepAndCompleteFuture(startingSeqNo, sendSnapshotStep, sendFileStep, prepareEngineStep, new StepListener<>(), onFailure);
}

/**
@@ -216,9 +216,9 @@ protected void finalizeStepAndCompleteFuture(
StepListener<List<SendSnapshotResult>> sendSnapshotStep,
StepListener<SendFileResult> sendFileStep,
StepListener<TimeValue> prepareEngineStep,
StepListener<Void> finalizeStep,
Consumer<Exception> onFailure
) {
final StepListener<Void> finalizeStep = new StepListener<>();
// Recovery target can trim all operations >= startingSeqNo as we have sent all these operations in the phase 2
final long trimAboveSeqNo = startingSeqNo - 1;
sendSnapshotStep.whenComplete(sendSnapshotResult -> {
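The last two hunks change finalizeStepAndCompleteFuture so the finalize StepListener is supplied by the caller instead of being created inside the method; the existing call site simply passes new StepListener<>(). Presumably this lets other recovery paths, such as the child-shard recovery this commit introduces, attach their own completion hook. Below is a small sketch of that inversion using CompletableFuture as a stand-in for StepListener; all names are illustrative and not OpenSearch APIs.

import java.util.concurrent.CompletableFuture;

public class InjectableFinalizeStepSketch {

    // Before: the finalize step was created internally and invisible to callers.
    static void finalizeAndComplete(Runnable finalizeWork) {
        finalizeAndComplete(finalizeWork, new CompletableFuture<>());
    }

    // After: the caller supplies the step and can attach its own continuation to it.
    static void finalizeAndComplete(Runnable finalizeWork, CompletableFuture<Void> finalizeStep) {
        try {
            finalizeWork.run();
            finalizeStep.complete(null);
        } catch (Exception e) {
            finalizeStep.completeExceptionally(e);
        }
    }

    public static void main(String[] args) {
        CompletableFuture<Void> finalizeStep = new CompletableFuture<>();
        finalizeStep.thenRun(() -> System.out.println("caller notified after finalize"));
        finalizeAndComplete(() -> System.out.println("finalizing recovery"), finalizeStep);
    }
}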