From dd2fb5bd465aa268bdac27005caef6b26b37dd86 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 22 Jan 2025 17:49:18 +1030 Subject: [PATCH] Log local hot-threads when shard creation fails due to shardLock (#120495) Closes: ES-10493 --- .../indices/cluster/ShardLockFailureIT.java | 17 ++++++++++++++++- .../cluster/IndicesClusterStateService.java | 12 +++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/cluster/ShardLockFailureIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/cluster/ShardLockFailureIT.java index 0ce3ca53e1c1f..6f0a18adbfe82 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/indices/cluster/ShardLockFailureIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/indices/cluster/ShardLockFailureIT.java @@ -72,7 +72,14 @@ public void testShardLockFailure() throws Exception { var ignored1 = internalCluster().getInstance(NodeEnvironment.class, node).shardLock(shardId, "blocked for test"); var mockLog = MockLog.capture(IndicesClusterStateService.class); ) { - + mockLog.addExpectation( + new MockLog.SeenEventExpectation( + "hot threads", + "org.elasticsearch.indices.cluster.IndicesClusterStateService", + Level.WARN, + "[testindex][0]: acquire shard lock for create" + ) + ); mockLog.addExpectation(new MockLog.LoggingExpectation() { private final CountDownLatch countDownLatch = new CountDownLatch(1); int debugMessagesSeen = 0; @@ -147,6 +154,14 @@ public void testShardLockTimeout() throws Exception { var ignored1 = internalCluster().getInstance(NodeEnvironment.class, node).shardLock(shardId, "blocked for test"); var mockLog = MockLog.capture(IndicesClusterStateService.class); ) { + mockLog.addExpectation( + new MockLog.SeenEventExpectation( + "hot threads", + "org.elasticsearch.indices.cluster.IndicesClusterStateService", + Level.WARN, + "[testindex][0]: acquire shard lock for create" + ) + ); mockLog.addExpectation( new MockLog.SeenEventExpectation( "timeout message", diff --git a/server/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java b/server/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java index eae8e8ac132db..0e9901bc05682 100644 --- a/server/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java +++ b/server/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java @@ -42,6 +42,7 @@ import org.elasticsearch.common.util.concurrent.AbstractRunnable; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.common.util.concurrent.RunOnce; import org.elasticsearch.common.util.concurrent.ThreadContext; import org.elasticsearch.common.util.concurrent.ThrottledTaskRunner; import org.elasticsearch.core.Nullable; @@ -74,6 +75,7 @@ import org.elasticsearch.indices.recovery.RecoveryFailedException; import org.elasticsearch.indices.recovery.RecoveryState; import org.elasticsearch.injection.guice.Inject; +import org.elasticsearch.monitor.jvm.HotThreads; import org.elasticsearch.repositories.RepositoriesService; import org.elasticsearch.search.SearchService; import org.elasticsearch.snapshots.SnapshotShardsService; @@ -688,6 +690,7 @@ private void createShard(ShardRouting shardRouting, ClusterState state) { primaryTerm, 0, 0L, + new RunOnce(() -> HotThreads.logLocalCurrentThreads(logger, Level.WARN, shardId + ": acquire shard lock for create")), ActionListener.runBefore(new ActionListener<>() { @Override public void onResponse(Boolean success) { @@ -740,6 +743,7 @@ private void createShardWhenLockAvailable( long primaryTerm, int iteration, long delayMillis, + RunOnce dumpHotThreads, ActionListener listener ) { try { @@ -763,8 +767,9 @@ private void createShardWhenLockAvailable( listener.onFailure(e); return; } + final Level level = (iteration + 25) % 30 == 0 ? Level.WARN : Level.DEBUG; logger.log( - (iteration + 25) % 30 == 0 ? Level.WARN : Level.DEBUG, + level, """ shard lock for [{}] has been unavailable for at least [{}/{}ms], \ attempting to create shard while applying cluster state [version={},uuid={}], will retry in [{}]: [{}]""", @@ -776,6 +781,9 @@ private void createShardWhenLockAvailable( shardLockRetryInterval, e.getMessage() ); + if (level == Level.WARN) { + dumpHotThreads.run(); + } // TODO could we instead subscribe to the shard lock and trigger the retry exactly when it is released rather than polling? threadPool.scheduleUnlessShuttingDown( shardLockRetryInterval, @@ -813,6 +821,7 @@ private void createShardWhenLockAvailable( shardLockRetryTimeout.millis(), shardRouting ); + dumpHotThreads.run(); listener.onFailure( new ElasticsearchTimeoutException("timed out while waiting to acquire shard lock for " + shardRouting) ); @@ -841,6 +850,7 @@ private void createShardWhenLockAvailable( primaryTerm, iteration + 1, newDelayMillis, + dumpHotThreads, listener );