-
Notifications
You must be signed in to change notification settings - Fork 211
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Send RetryInfo on OTel Timeouts (#4294)
DataPrepper sends `RESOURCE_EXHAUSTED` gRPC responses whenever a buffer is full or a circuit breaker is active. These statuses do not contain retry information. In the OpenTelemetry protocol, a missing retry info implies a non-retryable error, which leads to dropped messages, e.g. in the OTel collector. To apply proper back pressure in these scenarios, a retry info is added to the status. The implementation uses exponential backoff: it starts with a minimum delay on the first timeout or circuit breaker activation. If the next such event happens within twice the last delay after the previous event, the delay is doubled until a maximum delay is reached. The maximum delay is used from then on, until a sufficiently long period (the maximum delay) passes without an event. Then the delay is reset to the minimum. --------- Signed-off-by: Karsten Schnitter <[email protected]> Signed-off-by: Tomas Longo <[email protected]> Co-authored-by: David Venable <[email protected]>
- Loading branch information
1 parent
059e1c5
commit 2595076
Showing
25 changed files
with
1,009 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
49 changes: 49 additions & 0 deletions
49
...gins/armeria-common/src/main/java/org/opensearch/dataprepper/GrpcRetryInfoCalculator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package org.opensearch.dataprepper; | ||
|
||
import com.google.rpc.RetryInfo; | ||
|
||
import java.time.Duration; | ||
import java.time.Instant; | ||
import java.util.concurrent.atomic.AtomicReference; | ||
|
||
class GrpcRetryInfoCalculator { | ||
|
||
private final Duration minimumDelay; | ||
private final Duration maximumDelay; | ||
|
||
private final AtomicReference<Instant> lastTimeCalled; | ||
private final AtomicReference<Duration> nextDelay; | ||
|
||
GrpcRetryInfoCalculator(Duration minimumDelay, Duration maximumDelay) { | ||
this.minimumDelay = minimumDelay; | ||
this.maximumDelay = maximumDelay; | ||
// Create a cushion so that the calculator treats a first quick exception (after prepper startup) as normal request (e.g. does not calculate a backoff) | ||
this.lastTimeCalled = new AtomicReference<>(Instant.now().minus(maximumDelay)); | ||
this.nextDelay = new AtomicReference<>(minimumDelay); | ||
} | ||
|
||
private static RetryInfo createProtoResult(Duration delay) { | ||
return RetryInfo.newBuilder().setRetryDelay(mapDuration(delay)).build(); | ||
} | ||
|
||
private static Duration minDuration(Duration left, Duration right) { | ||
return left.compareTo(right) <= 0 ? left : right; | ||
} | ||
|
||
private static com.google.protobuf.Duration.Builder mapDuration(Duration duration) { | ||
return com.google.protobuf.Duration.newBuilder().setSeconds(duration.getSeconds()).setNanos(duration.getNano()); | ||
} | ||
|
||
RetryInfo createRetryInfo() { | ||
Instant now = Instant.now(); | ||
// Is the last time we got called longer ago than the next delay? | ||
if (lastTimeCalled.getAndSet(now).isBefore(now.minus(nextDelay.get()))) { | ||
// Use minimum delay and reset the saved delay | ||
nextDelay.set(minimumDelay); | ||
return createProtoResult(minimumDelay); | ||
} | ||
Duration delay = nextDelay.getAndUpdate(d -> minDuration(maximumDelay, d.multipliedBy(2))); | ||
return createProtoResult(delay); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
83 changes: 83 additions & 0 deletions
83
.../armeria-common/src/test/java/org/opensearch/dataprepper/GrpcRetryInfoCalculatorTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
package org.opensearch.dataprepper; | ||
|
||
import com.google.rpc.RetryInfo; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import java.time.Duration; | ||
|
||
import static org.hamcrest.MatcherAssert.assertThat; | ||
import static org.hamcrest.Matchers.equalTo; | ||
|
||
public class GrpcRetryInfoCalculatorTest { | ||
|
||
@Test | ||
public void testMinimumDelayOnFirstCall() { | ||
RetryInfo retryInfo = new GrpcRetryInfoCalculator(Duration.ofMillis(100), Duration.ofSeconds(1)).createRetryInfo(); | ||
|
||
assertThat(retryInfo.getRetryDelay().getNanos(), equalTo(100_000_000)); | ||
assertThat(retryInfo.getRetryDelay().getSeconds(), equalTo(0L)); | ||
} | ||
|
||
@Test | ||
public void testExponentialBackoff() { | ||
GrpcRetryInfoCalculator calculator = | ||
new GrpcRetryInfoCalculator(Duration.ofSeconds(1), Duration.ofSeconds(10)); | ||
RetryInfo retryInfo1 = calculator.createRetryInfo(); | ||
RetryInfo retryInfo2 = calculator.createRetryInfo(); | ||
RetryInfo retryInfo3 = calculator.createRetryInfo(); | ||
RetryInfo retryInfo4 = calculator.createRetryInfo(); | ||
|
||
assertThat(retryInfo1.getRetryDelay().getSeconds(), equalTo(1L)); | ||
assertThat(retryInfo2.getRetryDelay().getSeconds(), equalTo(1L)); | ||
assertThat(retryInfo3.getRetryDelay().getSeconds(), equalTo(2L)); | ||
assertThat(retryInfo4.getRetryDelay().getSeconds(), equalTo(4L)); | ||
} | ||
|
||
@Test | ||
public void testUsesMaximumAsLongestDelay() { | ||
GrpcRetryInfoCalculator calculator = | ||
new GrpcRetryInfoCalculator(Duration.ofSeconds(1), Duration.ofSeconds(2)); | ||
RetryInfo retryInfo1 = calculator.createRetryInfo(); | ||
RetryInfo retryInfo2 = calculator.createRetryInfo(); | ||
RetryInfo retryInfo3 = calculator.createRetryInfo(); | ||
|
||
assertThat(retryInfo1.getRetryDelay().getSeconds(), equalTo(1L)); | ||
assertThat(retryInfo2.getRetryDelay().getSeconds(), equalTo(1L)); | ||
assertThat(retryInfo3.getRetryDelay().getSeconds(), equalTo(2L)); | ||
} | ||
|
||
@Test | ||
public void testResetAfterDelayWearsOff() throws InterruptedException { | ||
int minDelayNanos = 1_000_000; | ||
GrpcRetryInfoCalculator calculator = | ||
new GrpcRetryInfoCalculator(Duration.ofNanos(minDelayNanos), Duration.ofSeconds(1)); | ||
|
||
RetryInfo retryInfo1 = calculator.createRetryInfo(); | ||
RetryInfo retryInfo2 = calculator.createRetryInfo(); | ||
RetryInfo retryInfo3 = calculator.createRetryInfo(); | ||
sleep(retryInfo3); | ||
RetryInfo retryInfo4 = calculator.createRetryInfo(); | ||
|
||
assertThat(retryInfo1.getRetryDelay().getNanos(), equalTo(minDelayNanos)); | ||
assertThat(retryInfo2.getRetryDelay().getNanos(), equalTo(minDelayNanos)); | ||
assertThat(retryInfo3.getRetryDelay().getNanos(), equalTo(minDelayNanos * 2)); | ||
assertThat(retryInfo4.getRetryDelay().getNanos(), equalTo(minDelayNanos)); | ||
} | ||
|
||
@Test | ||
public void testQuickFirstExceptionDoesNotTriggerBackoffCalculationEvenWithLongMinDelay() throws InterruptedException { | ||
GrpcRetryInfoCalculator calculator = | ||
new GrpcRetryInfoCalculator(Duration.ofSeconds(10), Duration.ofSeconds(20)); | ||
|
||
RetryInfo retryInfo1 = calculator.createRetryInfo(); | ||
RetryInfo retryInfo2 = calculator.createRetryInfo(); | ||
|
||
assertThat(retryInfo1.getRetryDelay().getSeconds(), equalTo(10L)); | ||
assertThat(retryInfo2.getRetryDelay().getSeconds(), equalTo(10L)); | ||
} | ||
|
||
private void sleep(RetryInfo retryInfo) throws InterruptedException { | ||
// make sure we let enough time pass by adding a few milliseconds on top | ||
Thread.sleep((retryInfo.getRetryDelay().getNanos() / 1_000_000) + 200 ); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.