RE
Retry Patterns
Resilience core v1.0.0
Retry Patterns
Overview
Retry patterns handle transient failures by automatically re-attempting failed operations. This skill covers retry strategies, exponential backoff, jitter, and when retries are appropriate.
Key Concepts
Retry Strategies
┌─────────────────────────────────────────────────────────────┐
│ Retry Strategies │
├─────────────────────────────────────────────────────────────┤
│ │
│ 1. Fixed Interval: │
│ ──▶ fail ──(1s)──▶ retry ──(1s)──▶ retry │
│ Simple but can cause thundering herd │
│ │
│ 2. Exponential Backoff: │
│ ──▶ fail ──(1s)──▶ fail ──(2s)──▶ fail ──(4s)──▶ │
│ Reduces load during outages │
│ │
│ 3. Exponential Backoff + Jitter: │
│ ──▶ fail ──(0.8s)──▶ fail ──(2.3s)──▶ fail ──(3.7s)──▶│
│ Prevents synchronized retries │
│ │
│ 4. Decorrelated Jitter: │
│ sleep = min(cap, random(base, sleep * 3)) │
│ Best distribution for high contention │
│ │
│ │
│ Jitter Comparison (1000 clients): │
│ ┌─────────────────────────────────────────┐ │
│ │ │ │
│ │ No jitter: ████████ (at same time) │ │
│ │ Full jitter: ░░░░░░░░ (spread out) │ │
│ │ Equal jitter: ▒▒▒▒▒▒▒▒ (half spread) │ │
│ │ │ │
│ └─────────────────────────────────────────┘ │
│ │
│ When to Retry: │
│ ✓ Transient network errors │
│ ✓ Rate limiting (429) │
│ ✓ Service unavailable (503) │
│ ✓ Timeouts (may need idempotency) │
│ │
│ When NOT to Retry: │
│ ✗ Client errors (400, 401, 403, 404) │
│ ✗ Validation failures │
│ ✗ Non-idempotent operations (without key) │
│ │
└─────────────────────────────────────────────────────────────┘
Best Practices
1. Use Exponential Backoff with Jitter
Prevents thundering herd and reduces server load.
2. Set Maximum Retry Attempts
Avoid infinite retry loops.
3. Only Retry Transient Failures
Don’t retry validation or authorization errors.
4. Ensure Idempotency
A retried operation must be safe to repeat: executing it more than once has to produce the same outcome as executing it once.
5. Log Retry Attempts
Track retry frequency for monitoring.
Code Examples
Example 1: Exponential Backoff with Jitter
/**
 * Runs an operation up to {@code maxAttempts} times, sleeping between attempts
 * for a capped exponential delay with full jitter.
 *
 * <p>Only failures accepted by {@link #isRetryable(Exception)} are retried;
 * any other exception is rethrown immediately.
 */
public class RetryWithBackoff {
    private final int maxAttempts;
    private final Duration initialDelay;
    private final Duration maxDelay;
    private final double multiplier;
    private final Random random = new Random();

    /**
     * @param maxAttempts  total number of calls allowed, including the first one; at least 1
     * @param initialDelay base delay before the first retry; not negative
     * @param maxDelay     upper bound applied to every computed delay; not negative
     * @param multiplier   growth factor applied per failed attempt; at least 1.0
     * @throws IllegalArgumentException if any argument is null or out of range
     */
    public RetryWithBackoff(int maxAttempts, Duration initialDelay, Duration maxDelay, double multiplier) {
        if (maxAttempts < 1) {
            throw new IllegalArgumentException("maxAttempts must be >= 1 but was " + maxAttempts);
        }
        if (initialDelay == null || initialDelay.isNegative()) {
            throw new IllegalArgumentException("initialDelay must be non-null and non-negative: " + initialDelay);
        }
        if (maxDelay == null || maxDelay.isNegative()) {
            throw new IllegalArgumentException("maxDelay must be non-null and non-negative: " + maxDelay);
        }
        if (!(multiplier >= 1.0)) { // written this way so NaN is rejected too
            throw new IllegalArgumentException("multiplier must be >= 1.0 but was " + multiplier);
        }
        this.maxAttempts = maxAttempts;
        this.initialDelay = initialDelay;
        this.maxDelay = maxDelay;
        this.multiplier = multiplier;
    }

    /**
     * Calls {@code operation} until it succeeds, fails with a non-retryable
     * exception, or the attempt limit is reached.
     *
     * @param operation the work to perform
     * @return the value returned by the first successful call
     * @throws RetryExhaustedException if every attempt failed with a retryable exception;
     *                                 the last failure is attached as the cause
     * @throws InterruptedException    if the thread is interrupted while waiting between
     *                                 attempts; the interrupt flag is set again before rethrowing
     * @throws Exception               the original exception when it is not retryable
     */
    public <T> T execute(Callable<T> operation) throws Exception {
        Exception lastException = null;
        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
            try {
                return operation.call();
            } catch (Exception e) {
                if (!isRetryable(e)) {
                    throw e;
                }
                lastException = e;
            }
            if (attempt == maxAttempts) {
                break; // no point waiting when there is no further attempt
            }
            Duration delay = calculateDelay(attempt);
            log.warn("Attempt {} failed, retrying in {}ms", attempt, delay.toMillis());
            pause(delay);
        }
        throw new RetryExhaustedException("All " + maxAttempts + " attempts failed", lastException);
    }

    /** Sleeps for {@code delay}, restoring the interrupt flag if the sleep is interrupted. */
    private static void pause(Duration delay) throws InterruptedException {
        try {
            Thread.sleep(delay.toMillis());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw e;
        }
    }

    /**
     * Computes the wait before the next attempt: {@code initialDelay * multiplier^(attempt - 1)},
     * capped at {@code maxDelay}, then scaled by a uniform random factor in [0, 1) (full jitter).
     *
     * @param attempt number of attempts that have failed so far (1-based)
     */
    private Duration calculateDelay(int attempt) {
        // A huge double saturates to Long.MAX_VALUE when cast, so the cap below still applies.
        long exponentialDelay = (long) (initialDelay.toMillis() * Math.pow(multiplier, attempt - 1));
        long cappedDelay = Math.min(exponentialDelay, maxDelay.toMillis());
        long jitteredDelay = (long) (random.nextDouble() * cappedDelay);
        return Duration.ofMillis(jitteredDelay);
    }

    /**
     * Decorrelated jitter: {@code sleep = min(cap, random_between(base, previous_sleep * 3))}.
     *
     * @param attempt       not used by this strategy; kept for signature parity with
     *                      {@link #calculateDelay(int)}
     * @param previousDelay the delay used before the previous attempt, in milliseconds
     */
    private Duration calculateDecorrelatedDelay(int attempt, long previousDelay) {
        long base = initialDelay.toMillis();
        long cap = maxDelay.toMillis();
        // Guard the multiplication against overflow, and keep the upper bound at or above
        // the base so the random range is never inverted (e.g. when previousDelay is 0).
        long tripled = previousDelay > Long.MAX_VALUE / 3 ? Long.MAX_VALUE : previousDelay * 3;
        long upper = Math.max(base, tripled);
        long delay = base + (long) (random.nextDouble() * (upper - base));
        return Duration.ofMillis(Math.min(cap, delay));
    }

    /**
     * Decides whether a failure is worth another attempt: HTTP 429/503/504 carried by
     * {@code HttpClientErrorException}, any {@link IOException}, or a {@link TimeoutException}.
     */
    private boolean isRetryable(Exception e) {
        if (e instanceof HttpClientErrorException httpError) {
            int status = httpError.getStatusCode().value();
            // NOTE(review): if HttpClientErrorException is a 4xx-only type (as in Spring),
            // 503/504 will arrive as a different exception class and never reach this
            // check - confirm and widen the instanceof to the common parent if so.
            return status == 429 || status == 503 || status == 504;
        }
        // java.net.ConnectException extends IOException, so it is covered here as well.
        return e instanceof IOException || e instanceof TimeoutException;
    }

    /** Default policy: 3 attempts, 100 ms initial delay, 10 s cap, delay doubling each time. */
    public static RetryWithBackoff standard() {
        return new RetryWithBackoff(3, Duration.ofMillis(100), Duration.ofSeconds(10), 2.0);
    }
}
Example 2: Resilience4j Retry Configuration
@Configuration
public class RetryConfig {
@Bean
public RetryRegistry retryRegistry() {
// Default configuration
io.github.resilience4j.retry.RetryConfig defaultConfig =
io.github.resilience4j.retry.RetryConfig.custom()
.maxAttempts(3)
.waitDuration(Duration.ofMillis(500))
.enableExponentialBackoff()
.exponentialBackoffMultiplier(2)
.randomizedWaitEnabled(true) // Adds jitter
.retryExceptions(
IOException.class,
TimeoutException.class,
RetryableException.class
)
.ignoreExceptions(
BusinessException.class,
ValidationException.class
)
.retryOnResult(response ->
response instanceof HttpResponse<?> http &&
http.statusCode() == 503
)
.build();
RetryRegistry registry = RetryRegistry.of(defaultConfig);
// Custom configuration for specific service
registry.addConfiguration("aggressive",
io.github.resilience4j.retry.RetryConfig.custom()
.maxAttempts(5)
.waitDuration(Duration.ofMillis(100))
.build()
);
registry.addConfiguration("conservative",
io.github.resilience4j.retry.RetryConfig.custom()
.maxAttempts(2)
.waitDuration(Duration.ofSeconds(1))
.build()
);
return registry;
}
}
/**
 * Client that posts requests to {@code /api/resource} through a Resilience4j
 * {@code Retry} named "external-service", logging every retry, success and failure.
 */
@Service
public class ExternalServiceClient implements AutoCloseable {
    private final Retry retry;
    private final RestClient restClient;

    // One scheduler shared by all async calls. Creating an executor per call, as the
    // previous version did, leaks a thread on every invocation because nothing ever
    // shuts it down. The thread is a daemon so it cannot keep the JVM alive.
    private final ScheduledExecutorService retryScheduler =
        Executors.newSingleThreadScheduledExecutor(runnable -> {
            Thread thread = new Thread(runnable, "external-service-retry-scheduler");
            thread.setDaemon(true);
            return thread;
        });

    /**
     * @param registry   registry from which the "external-service" retry is obtained
     * @param restClient HTTP client used for the actual calls
     */
    public ExternalServiceClient(RetryRegistry registry, RestClient restClient) {
        this.retry = registry.retry("external-service");
        this.restClient = restClient;
        // Log retry events
        retry.getEventPublisher()
            .onRetry(event -> log.warn(
                "Retry attempt {} for {}, waiting {}ms",
                event.getNumberOfRetryAttempts(),
                event.getName(),
                event.getWaitInterval().toMillis()
            ))
            .onSuccess(event -> log.debug(
                "Succeeded after {} attempts",
                event.getNumberOfRetryAttempts()
            ))
            .onError(event -> log.error(
                "Failed after {} attempts: {}",
                event.getNumberOfRetryAttempts(),
                event.getLastThrowable().getMessage(),
                event.getLastThrowable() // trailing throwable keeps the stack trace in the log
            ));
    }

    /**
     * Posts {@code request} synchronously, retrying according to the configured policy.
     *
     * @return the deserialized response body
     */
    public ApiResponse callService(ApiRequest request) {
        return Retry.decorateSupplier(retry, () ->
            restClient.post()
                .uri("/api/resource")
                .body(request)
                .retrieve()
                .body(ApiResponse.class)
        ).get();
    }

    /**
     * Posts {@code request} asynchronously; retries are scheduled on the shared
     * {@code retryScheduler} instead of a freshly created executor.
     *
     * @return a future completed with the response body or with the final failure
     */
    public CompletableFuture<ApiResponse> callServiceAsync(ApiRequest request) {
        return Retry.decorateCompletionStage(
            retry,
            retryScheduler,
            () -> CompletableFuture.supplyAsync(() ->
                restClient.post()
                    .uri("/api/resource")
                    .body(request)
                    .retrieve()
                    .body(ApiResponse.class)
            )
        ).get().toCompletableFuture();
    }

    /**
     * Stops the retry scheduler; retries that are still pending are abandoned.
     * NOTE(review): presumably the container invokes this on shutdown because the bean
     * is AutoCloseable - confirm for the Spring version in use.
     */
    @Override
    public void close() {
        retryScheduler.shutdownNow();
    }
}
Example 3: Retry with Rate Limit Handling
/**
 * Retry helper that honours a server-supplied {@code Retry-After} value when the
 * operation fails with {@code RateLimitException}, and falls back to exponential
 * backoff otherwise. Also offers a variant gated by a client-side rate limiter.
 */
public class RateLimitAwareRetry {

    /**
     * Client-side limiter: 100 permits per 1-second refresh period, waiting up to
     * 5 seconds for a permit.
     */
    private final RateLimiter clientRateLimiter = RateLimiter.of(
        "client",
        RateLimiterConfig.custom()
            .limitForPeriod(100)
            .limitRefreshPeriod(Duration.ofSeconds(1))
            .timeoutDuration(Duration.ofSeconds(5))
            .build()
    );

    /**
     * Calls {@code operation} up to {@code maxAttempts} times.
     *
     * @param operation   the work to perform
     * @param maxAttempts total number of calls allowed, including the first; at least 1
     * @return the value returned by the first successful call
     * @throws IllegalArgumentException if {@code maxAttempts} is less than 1
     * @throws RetryExhaustedException  if the last allowed attempt was rate limited
     * @throws InterruptedException     if interrupted while waiting; the interrupt flag is
     *                                  set again before rethrowing
     * @throws Exception                the original exception when it is not retryable or
     *                                  when it occurs on the last allowed attempt
     */
    public <T> T executeWithRateLimitHandling(
            Callable<T> operation,
            int maxAttempts) throws Exception {
        if (maxAttempts < 1) {
            throw new IllegalArgumentException("maxAttempts must be >= 1 but was " + maxAttempts);
        }
        int attempt = 0;
        Exception lastException = null;
        while (attempt < maxAttempts) {
            try {
                return operation.call();
            } catch (RateLimitException e) {
                attempt++;
                lastException = e;
                if (attempt >= maxAttempts) {
                    break; // out of attempts: fail now instead of waiting for nothing
                }
                final int failedAttempts = attempt;
                // Use Retry-After if available; compute the backoff only when it is needed.
                Duration waitTime = parseRetryAfter(e.getRetryAfterHeader())
                    .orElseGet(() -> calculateExponentialBackoff(failedAttempts));
                log.warn("Rate limited, waiting {}ms before retry {}/{}",
                    waitTime.toMillis(), attempt, maxAttempts);
                pause(waitTime);
            } catch (Exception e) {
                // Non-rate-limit errors get standard retry
                attempt++;
                lastException = e;
                if (!isRetryable(e) || attempt >= maxAttempts) {
                    throw e;
                }
                pause(calculateExponentialBackoff(attempt));
            }
        }
        throw new RetryExhaustedException("Rate limit retries exhausted", lastException);
    }

    /**
     * Sleeps for {@code waitTime}, treating a negative duration as zero because
     * {@code Thread.sleep} rejects negative values. Restores the interrupt flag if
     * the sleep is interrupted.
     */
    private static void pause(Duration waitTime) throws InterruptedException {
        long millis = Math.max(0L, waitTime.toMillis());
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw e;
        }
    }

    /**
     * Parses a {@code Retry-After} value given either as a number of seconds or as
     * an RFC 1123 date.
     *
     * @param retryAfterHeader raw header value; may be {@code null}
     * @return the wait duration, never negative (a date in the past or a negative
     *         number yields {@link Duration#ZERO}); empty when the header is absent
     *         or cannot be parsed
     */
    private Optional<Duration> parseRetryAfter(String retryAfterHeader) {
        if (retryAfterHeader == null) {
            return Optional.empty();
        }
        String value = retryAfterHeader.trim();
        Duration parsed;
        try {
            // Try parsing as seconds
            parsed = Duration.ofSeconds(Long.parseLong(value));
        } catch (NumberFormatException notANumber) {
            // Try parsing as HTTP date
            try {
                ZonedDateTime retryAt = ZonedDateTime.parse(
                    value,
                    DateTimeFormatter.RFC_1123_DATE_TIME
                );
                parsed = Duration.between(ZonedDateTime.now(), retryAt);
            } catch (RuntimeException unparseable) {
                return Optional.empty();
            }
        }
        return Optional.of(parsed.isNegative() ? Duration.ZERO : parsed);
    }

    /**
     * Acquires a permit from the client-side limiter, then runs the operation with up
     * to 3 attempts.
     *
     * @throws IllegalStateException if no permit was granted by the limiter
     * @throws Exception             see {@link #executeWithRateLimitHandling(Callable, int)}
     */
    public <T> T executeWithClientSideLimit(Callable<T> operation) throws Exception {
        // acquirePermission() reports a denied permit through its return value;
        // ignoring it would let the call through and defeat the limiter.
        if (!clientRateLimiter.acquirePermission()) {
            throw new IllegalStateException(
                "Client-side rate limiter 'client' did not grant a permit");
        }
        return executeWithRateLimitHandling(operation, 3);
    }
}
Example 4: Retry with Timeout Budget
/**
 * Retry helpers bounded by time: sequential retries within a total time budget,
 * and hedged (parallel) requests that race against a slow first attempt.
 */
public class BudgetedRetry {
    private final Duration totalTimeout;
    private final Duration perAttemptTimeout;
    private final int maxAttempts;

    /**
     * @param totalTimeout      overall time budget shared by all attempts
     * @param perAttemptTimeout upper bound for a single attempt
     * @param maxAttempts       maximum number of attempts; at least 1
     * @throws IllegalArgumentException if a duration is null or {@code maxAttempts < 1}
     */
    public BudgetedRetry(Duration totalTimeout, Duration perAttemptTimeout, int maxAttempts) {
        if (totalTimeout == null || perAttemptTimeout == null) {
            throw new IllegalArgumentException("totalTimeout and perAttemptTimeout must not be null");
        }
        if (maxAttempts < 1) {
            throw new IllegalArgumentException("maxAttempts must be >= 1 but was " + maxAttempts);
        }
        this.totalTimeout = totalTimeout;
        this.perAttemptTimeout = perAttemptTimeout;
        this.maxAttempts = maxAttempts;
    }

    /**
     * Retries {@code operation} until it succeeds, the attempt limit is reached, or
     * the total time budget is used up. Each attempt is limited to the smaller of
     * the per-attempt timeout and the budget that is left.
     *
     * @return the value returned by the first successful attempt
     * @throws TimeoutException     when the budget or the attempts are exhausted; the last
     *                              failure, if any, is attached as the cause
     * @throws InterruptedException if interrupted; the interrupt flag is set again first
     * @throws Exception            the operation's own exception when it is not retryable
     */
    public <T> T executeWithBudget(Callable<T> operation) throws Exception {
        Instant deadline = Instant.now().plus(totalTimeout);
        int attempt = 0;
        Exception lastException = null;
        while (attempt < maxAttempts) {
            Duration remaining = Duration.between(Instant.now(), deadline);
            if (remaining.isNegative() || remaining.isZero()) {
                break;
            }
            attempt++; // counted only once the attempt is really going to run
            // Use smaller of per-attempt timeout and remaining budget
            Duration attemptTimeout = remaining.compareTo(perAttemptTimeout) < 0
                ? remaining
                : perAttemptTimeout;
            try {
                return executeWithTimeout(operation, attemptTimeout);
            } catch (TimeoutException e) {
                lastException = e;
                // The timed-out attempt already consumed budget, so retry without backoff.
                log.warn("Attempt {} timed out after {}ms, {}ms remaining in budget",
                    attempt, attemptTimeout.toMillis(),
                    Math.max(0L, Duration.between(Instant.now(), deadline).toMillis()));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw e;
            } catch (Exception e) {
                if (!isRetryable(e)) {
                    throw e;
                }
                lastException = e;
                if (attempt >= maxAttempts) {
                    break; // nothing left to wait for
                }
                // Clamp the backoff against the budget left NOW, not the value measured
                // before the attempt started.
                Duration left = Duration.between(Instant.now(), deadline);
                Duration backoff = calculateBackoff(attempt);
                Duration actualWait = backoff.compareTo(left) < 0 ? backoff : left;
                if (actualWait.toMillis() > 0) {
                    pause(actualWait);
                }
            }
        }
        TimeoutException exhausted =
            new TimeoutException("Budget exhausted after " + attempt + " attempts");
        if (lastException != null) {
            exhausted.initCause(lastException); // TimeoutException has no cause constructor
        }
        throw exhausted;
    }

    /** Sleeps for {@code wait}, restoring the interrupt flag if the sleep is interrupted. */
    private static void pause(Duration wait) throws InterruptedException {
        try {
            Thread.sleep(wait.toMillis());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw e;
        }
    }

    /**
     * Runs {@code operation} on a dedicated single-thread executor and waits at most
     * {@code timeout} for it. A failure of the operation is rethrown as the
     * operation's own exception rather than wrapped in {@link ExecutionException},
     * so callers can classify it. The executor is always shut down, which interrupts
     * a still-running operation.
     */
    private <T> T executeWithTimeout(Callable<T> operation, Duration timeout)
            throws Exception {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        try {
            Future<T> future = executor.submit(operation);
            return future.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
        } catch (ExecutionException e) {
            Throwable cause = e.getCause();
            if (cause instanceof Exception failure) {
                throw failure;
            }
            if (cause instanceof Error error) {
                throw error;
            }
            throw e;
        } finally {
            executor.shutdownNow();
        }
    }

    /**
     * Hedged requests - start parallel request if first is slow.
     *
     * <p>The first request starts immediately; hedge {@code i} starts after
     * {@code hedgeDelay * i} unless a result is already available. The first request
     * to succeed wins. If every request (the first plus all {@code maxHedges}
     * hedges) fails, the call fails as soon as the last one has failed instead of
     * waiting for the total timeout.
     *
     * @param operation  the work to perform; may run several times concurrently
     * @param hedgeDelay spacing between successive hedge requests
     * @param maxHedges  number of additional requests allowed besides the first; not negative
     * @return the first successful result
     * @throws IllegalArgumentException if {@code maxHedges} is negative
     * @throws RuntimeException         if all requests failed, the total timeout elapsed, or
     *                                  the thread was interrupted (flag restored); the
     *                                  underlying exception is the cause
     */
    public <T> T executeWithHedging(
            Callable<T> operation,
            Duration hedgeDelay,
            int maxHedges) {
        if (maxHedges < 0) {
            throw new IllegalArgumentException("maxHedges must be >= 0 but was " + maxHedges);
        }
        int totalRequests = maxHedges + 1;
        ExecutorService executor = Executors.newFixedThreadPool(totalRequests);
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        CompletableFuture<T> result = new CompletableFuture<>();
        AtomicInteger failures = new AtomicInteger(0);
        try {
            // Start first request immediately
            submitHedge(executor, operation, result, failures, totalRequests);
            // Schedule hedges. Exactly maxHedges tasks are scheduled and each starts at
            // most one request, so no separate launch counter is needed.
            for (int i = 1; i <= maxHedges; i++) {
                final int hedgeNum = i;
                scheduler.schedule(
                    () -> {
                        if (!result.isDone()) {
                            log.debug("Starting hedge request {}", hedgeNum);
                            submitHedge(executor, operation, result, failures, totalRequests);
                        }
                    },
                    hedgeDelay.toMillis() * i,
                    TimeUnit.MILLISECONDS
                );
            }
            return result.get(totalTimeout.toMillis(), TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("All hedged requests failed", e);
        } catch (Exception e) {
            throw new RuntimeException("All hedged requests failed", e);
        } finally {
            // Stop scheduling first so no new hedge is handed to a dead executor.
            scheduler.shutdownNow();
            executor.shutdownNow();
        }
    }

    /**
     * Submits one request. A success completes {@code result}; a failure is only
     * recorded, unless it is the {@code totalRequests}-th failure, in which case no
     * request can succeed any more and {@code result} is completed exceptionally.
     */
    private <T> void submitHedge(
            ExecutorService executor,
            Callable<T> operation,
            CompletableFuture<T> result,
            AtomicInteger failures,
            int totalRequests) {
        executor.submit(() -> {
            try {
                result.complete(operation.call());
            } catch (Exception e) {
                log.debug("Hedge attempt failed", e);
                // Completing an already-completed future is a no-op, so a late failure
                // can never overwrite a success.
                if (failures.incrementAndGet() >= totalRequests) {
                    result.completeExceptionally(e);
                }
            }
        });
    }
}
Example 5: Annotation-Based Retry
/**
 * Marks a method or type whose invocations should be retried on failure.
 * The retention is RUNTIME so the settings can be read reflectively.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.METHOD, ElementType.TYPE})
public @interface Retryable {

    /** Maximum number of invocations, counting the first one. Defaults to 3. */
    int maxAttempts() default 3;

    /** Base delay before the first retry. Defaults to 1000. */
    long delay() default 1_000L;

    /** Factor by which the delay grows after each failed attempt. Defaults to 2.0. */
    double multiplier() default 2.0d;

    /** Exception types that should trigger a retry. Defaults to an empty list. */
    Class<? extends Exception>[] include() default {};

    /** Exception types that must never trigger a retry. Defaults to an empty list. */
    Class<? extends Exception>[] exclude() default {};
}
/**
 * Aspect that re-invokes methods annotated with {@code @Retryable} when they fail,
 * waiting an exponentially growing, jittered delay between attempts.
 */
@Aspect
@Component
public class RetryAspect {

    /**
     * Invokes the intercepted method until it returns, fails with a throwable that
     * must not be retried, or {@code maxAttempts} invocations have failed.
     *
     * @param pjp       the intercepted invocation
     * @param retryable the retry settings declared on the method
     * @return whatever the intercepted method returns
     * @throws IllegalArgumentException if {@code maxAttempts} is less than 1; previously
     *                                  this skipped the method entirely and ended in
     *                                  {@code throw null}
     * @throws InterruptedException     if interrupted while waiting; the interrupt flag is
     *                                  set again and the method's failure is attached as
     *                                  a suppressed exception
     * @throws Throwable                the last failure of the intercepted method
     */
    @Around("@annotation(retryable)")
    public Object retry(ProceedingJoinPoint pjp, Retryable retryable) throws Throwable {
        int maxAttempts = retryable.maxAttempts();
        long delay = retryable.delay();
        double multiplier = retryable.multiplier();
        if (maxAttempts < 1) {
            throw new IllegalArgumentException(
                "@Retryable maxAttempts must be >= 1 but was " + maxAttempts
                    + " on " + pjp.getSignature().getName());
        }
        for (int attempt = 1; ; attempt++) {
            try {
                return pjp.proceed();
            } catch (Throwable t) {
                if (attempt >= maxAttempts || !shouldRetry(t, retryable)) {
                    throw t;
                }
                long waitTime = (long) (delay * Math.pow(multiplier, attempt - 1));
                // Thread.sleep rejects negative values, so a negative delay waits 0 ms.
                waitTime = Math.max(0L, addJitter(waitTime));
                log.warn("Method {} failed, attempt {}/{}, retrying in {}ms",
                    pjp.getSignature().getName(), attempt, maxAttempts, waitTime);
                try {
                    Thread.sleep(waitTime);
                } catch (InterruptedException interrupted) {
                    Thread.currentThread().interrupt();
                    interrupted.addSuppressed(t); // keep the real failure visible
                    throw interrupted;
                }
            }
        }
    }

    /**
     * Decides whether {@code t} qualifies for another attempt. Exclusions win over
     * inclusions; when {@code include} is non-empty only the listed types are
     * retried; otherwise {@link IOException} and {@link RuntimeException} are.
     */
    private boolean shouldRetry(Throwable t, Retryable retryable) {
        // Check exclusions first
        for (Class<? extends Exception> excluded : retryable.exclude()) {
            if (excluded.isInstance(t)) {
                return false;
            }
        }
        // Check inclusions
        Class<? extends Exception>[] included = retryable.include();
        if (included.length > 0) {
            for (Class<? extends Exception> candidate : included) {
                if (candidate.isInstance(t)) {
                    return true;
                }
            }
            return false;
        }
        // Default: retry on IOException and RuntimeException
        return t instanceof IOException || t instanceof RuntimeException;
    }

    /** Scales {@code delay} by a random factor in [0.5, 1.5). */
    private long addJitter(long delay) {
        return (long) (delay * (0.5 + Math.random()));
    }
}
// Usage
/**
 * Usage example: a service method retried up to 3 times on I/O or timeout failures,
 * starting from a delay of 500 and doubling it, and never retried on validation errors.
 */
@Service
public class PaymentService {

    // NOTE(review): this retries a payment call - presumably the gateway de-duplicates
    // repeated submissions of the same payment; confirm before relying on it.
    @Retryable(
        include = {IOException.class, TimeoutException.class},
        exclude = {ValidationException.class},
        maxAttempts = 3,
        delay = 500,
        multiplier = 2.0
    )
    public PaymentResult processPayment(Payment payment) {
        PaymentResult outcome = externalPaymentGateway.process(payment);
        return outcome;
    }
}
Anti-Patterns
❌ Retrying Non-Idempotent Operations
// WRONG - every pass through the loop issues a fresh charge, so the card may be
// charged multiple times
while (failed) {
chargeCard(amount); // Not idempotent! Nothing ties the repeated calls to one charge
}
// ✅ CORRECT - use idempotency key, passing the same key on every retry
// NOTE(review): presumably the receiving side de-duplicates on this key - confirm.
// Both loops are schematic: they show neither how `failed` is updated nor any backoff.
while (failed) {
chargeCard(idempotencyKey, amount);
}
❌ Immediate Retries Without Backoff
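Retrying immediately, with no delay between attempts, can cause a thundering herd and make an ongoing outage worse; always wait between attempts, using backoff with jitter.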