Skip to content

Commit

Permalink
Log local hot-threads when shard creation fails due to shardLock (ela…
Browse files Browse the repository at this point in the history
…stic#120495)

Closes: ES-10493
  • Loading branch information
nicktindall authored Jan 22, 2025
1 parent 682e46a commit dd2fb5b
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,14 @@ public void testShardLockFailure() throws Exception {
var ignored1 = internalCluster().getInstance(NodeEnvironment.class, node).shardLock(shardId, "blocked for test");
var mockLog = MockLog.capture(IndicesClusterStateService.class);
) {

mockLog.addExpectation(
new MockLog.SeenEventExpectation(
"hot threads",
"org.elasticsearch.indices.cluster.IndicesClusterStateService",
Level.WARN,
"[testindex][0]: acquire shard lock for create"
)
);
mockLog.addExpectation(new MockLog.LoggingExpectation() {
private final CountDownLatch countDownLatch = new CountDownLatch(1);
int debugMessagesSeen = 0;
Expand Down Expand Up @@ -147,6 +154,14 @@ public void testShardLockTimeout() throws Exception {
var ignored1 = internalCluster().getInstance(NodeEnvironment.class, node).shardLock(shardId, "blocked for test");
var mockLog = MockLog.capture(IndicesClusterStateService.class);
) {
mockLog.addExpectation(
new MockLog.SeenEventExpectation(
"hot threads",
"org.elasticsearch.indices.cluster.IndicesClusterStateService",
Level.WARN,
"[testindex][0]: acquire shard lock for create"
)
);
mockLog.addExpectation(
new MockLog.SeenEventExpectation(
"timeout message",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.RunOnce;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.common.util.concurrent.ThrottledTaskRunner;
import org.elasticsearch.core.Nullable;
Expand Down Expand Up @@ -74,6 +75,7 @@
import org.elasticsearch.indices.recovery.RecoveryFailedException;
import org.elasticsearch.indices.recovery.RecoveryState;
import org.elasticsearch.injection.guice.Inject;
import org.elasticsearch.monitor.jvm.HotThreads;
import org.elasticsearch.repositories.RepositoriesService;
import org.elasticsearch.search.SearchService;
import org.elasticsearch.snapshots.SnapshotShardsService;
Expand Down Expand Up @@ -688,6 +690,7 @@ private void createShard(ShardRouting shardRouting, ClusterState state) {
primaryTerm,
0,
0L,
new RunOnce(() -> HotThreads.logLocalCurrentThreads(logger, Level.WARN, shardId + ": acquire shard lock for create")),
ActionListener.runBefore(new ActionListener<>() {
@Override
public void onResponse(Boolean success) {
Expand Down Expand Up @@ -740,6 +743,7 @@ private void createShardWhenLockAvailable(
long primaryTerm,
int iteration,
long delayMillis,
RunOnce dumpHotThreads,
ActionListener<Boolean> listener
) {
try {
Expand All @@ -763,8 +767,9 @@ private void createShardWhenLockAvailable(
listener.onFailure(e);
return;
}
final Level level = (iteration + 25) % 30 == 0 ? Level.WARN : Level.DEBUG;
logger.log(
(iteration + 25) % 30 == 0 ? Level.WARN : Level.DEBUG,
level,
"""
shard lock for [{}] has been unavailable for at least [{}/{}ms], \
attempting to create shard while applying cluster state [version={},uuid={}], will retry in [{}]: [{}]""",
Expand All @@ -776,6 +781,9 @@ private void createShardWhenLockAvailable(
shardLockRetryInterval,
e.getMessage()
);
if (level == Level.WARN) {
dumpHotThreads.run();
}
// TODO could we instead subscribe to the shard lock and trigger the retry exactly when it is released rather than polling?
threadPool.scheduleUnlessShuttingDown(
shardLockRetryInterval,
Expand Down Expand Up @@ -813,6 +821,7 @@ private void createShardWhenLockAvailable(
shardLockRetryTimeout.millis(),
shardRouting
);
dumpHotThreads.run();
listener.onFailure(
new ElasticsearchTimeoutException("timed out while waiting to acquire shard lock for " + shardRouting)
);
Expand Down Expand Up @@ -841,6 +850,7 @@ private void createShardWhenLockAvailable(
primaryTerm,
iteration + 1,
newDelayMillis,
dumpHotThreads,
listener
);

Expand Down

0 comments on commit dd2fb5b

Please sign in to comment.