DI
Failure Modes
Distributed Systems — Advanced (v1.0.0)
Distributed System Failure Modes
Overview
Distributed systems fail in complex ways: partial failures, network partitions, Byzantine faults, and cascading failures. This skill covers failure classification, detection, and mitigation strategies.
Key Concepts
Failure Taxonomy
┌─────────────────────────────────────────────────────────────┐
│ Failure Categories │
├─────────────────────────────────────────────────────────────┤
│ │
│ 1. Crash Failures: │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Node ────▶ ✗ (stops responding) │ │
│ │ • Fail-stop: others can detect │ │
│ │ • Fail-silent: may or may not be detected │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
│ 2. Network Failures: │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ ┌─────┐ ┌─────┐ │ │
│ │ │ A │────✗─────│ B │ Partition │ │
│ │ └─────┘ └─────┘ │ │
│ │ │ │
│ │ A ──▶ B (delay) ──▶ B Message delay │ │
│ │ A ──▶ B ──✗ Message loss │ │
│ │ A ──▶ B ──▶ B ──▶ B Message duplication │ │
│ │ A ──▶ B ──▶ A ──▶ B Message reordering │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
│ 3. Byzantine Failures: │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Node behaves arbitrarily: │ │
│ │ • Sends different values to different nodes │ │
│ │ • Lies about its state │ │
│ │ • Corrupts data intentionally │ │
│ │ Requires 3f+1 nodes to tolerate f Byzantine faults │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
│ 4. Cascading Failures: │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ │ │
│ │ Service A fails ──▶ Service B overloaded │ │
│ │ ──▶ Service C times out │ │
│ │ ──▶ System collapse │ │
│ │ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
│ 5. Partial Failures: │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Part of the system works, part doesn't │ │
│ │ • Some replicas available, some not │ │
│ │ • Request succeeds on one path, fails on another │ │
│ │ Most challenging: uncertain state │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
Best Practices
1. Design for Failure
Assume everything will fail; build resilience in.
2. Fail Fast
Detect failures quickly; don’t wait for timeouts.
3. Limit Blast Radius
Isolate failures to prevent cascading.
4. Implement Graceful Degradation
Continue operating with reduced functionality.
5. Test Failure Scenarios
Use chaos engineering to validate resilience.
Code Examples
Example 1: Failure Detection
/**
 * Detects node failures using a Phi Accrual failure detector.
 *
 * Heartbeats are recorded per node; a periodic task recomputes each node's
 * suspicion level (phi) and notifies listeners when it crosses
 * {@link #PHI_THRESHOLD}. Node state lives in a ConcurrentHashMap, so
 * heartbeats may be recorded from any thread.
 *
 * NOTE(review): {@code log} and {@code failureListeners} are not declared in
 * this file — assumed provided elsewhere (e.g. Lombok {@code @Slf4j} and a
 * listener collection); confirm.
 */
public class FailureDetector {

    private final Map<String, NodeHealth> nodeHealth = new ConcurrentHashMap<>();
    private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);

    /** Phi at or above this value marks a node as suspected failed. */
    private static final double PHI_THRESHOLD = 8.0;

    /**
     * Registers the cluster nodes and starts the once-per-second
     * suspicion-level check.
     *
     * @param nodes identifiers of the nodes to monitor
     */
    public void start(List<String> nodes) {
        for (String node : nodes) {
            nodeHealth.put(node, new NodeHealth(node));
        }
        // Periodic heartbeat check: re-evaluate every node's phi each second.
        scheduler.scheduleAtFixedRate(
            this::checkHeartbeats,
            0, 1, TimeUnit.SECONDS
        );
    }

    /**
     * Stops the background checker. Without this, the scheduler's threads
     * keep running (and keep the JVM alive) after the detector is discarded.
     */
    public void stop() {
        scheduler.shutdownNow();
    }

    /** Records a heartbeat from {@code node}; heartbeats from unknown nodes are ignored. */
    public void recordHeartbeat(String node) {
        NodeHealth health = nodeHealth.get(node);
        if (health != null) {
            health.recordHeartbeat();
        }
    }

    /**
     * Phi Accrual suspicion level for a node: higher means more likely
     * failed. Unknown nodes are maximally suspect.
     */
    public double suspicionLevel(String node) {
        NodeHealth health = nodeHealth.get(node);
        if (health == null) {
            return Double.MAX_VALUE;
        }
        long timeSinceLastHeartbeat = System.currentTimeMillis() - health.lastHeartbeat;
        return health.calculatePhi(timeSinceLastHeartbeat);
    }

    /** True when the node's suspicion level is below the failure threshold. */
    public boolean isAvailable(String node) {
        return suspicionLevel(node) < PHI_THRESHOLD;
    }

    /** Snapshot of the nodes currently considered available. */
    public List<String> getAvailableNodes() {
        return nodeHealth.keySet().stream()
            .filter(this::isAvailable)
            .collect(Collectors.toList());
    }

    // Periodic task: flag every node whose phi has crossed the threshold.
    private void checkHeartbeats() {
        for (Map.Entry<String, NodeHealth> entry : nodeHealth.entrySet()) {
            String node = entry.getKey();
            double phi = suspicionLevel(node);
            if (phi >= PHI_THRESHOLD) {
                handleSuspectedFailure(node, phi);
            }
        }
    }

    private void handleSuspectedFailure(String node, double phi) {
        log.warn("Node {} suspected failed (phi={})", node, phi);
        // Notify listeners; they may remove the node from routing,
        // start failover, or alert operations.
        failureListeners.forEach(l -> l.onSuspectedFailure(node));
    }
}
/**
 * Per-node heartbeat history backing the Phi Accrual suspicion calculation.
 *
 * Keeps a sliding window of inter-heartbeat intervals; phi measures how far
 * the current silence deviates from the observed interval distribution. A
 * logistic function is used as a cheap approximation of the normal CDF.
 */
class NodeHealth {

    // Duplicated from FailureDetector, whose PHI_THRESHOLD is private and was
    // not in scope here (this previously did not compile).
    private static final double PHI_THRESHOLD = 8.0;

    /** Silence longer than this counts as failure before enough samples exist. */
    private static final long WARMUP_TIMEOUT_MS = 30_000;

    private final String nodeId;
    private final CircularBuffer<Long> heartbeatIntervals;
    // Wall-clock time of the most recent heartbeat; read without a lock by
    // FailureDetector, hence volatile.
    volatile long lastHeartbeat;

    public NodeHealth(String nodeId) {
        this.nodeId = nodeId;
        this.heartbeatIntervals = new CircularBuffer<>(1000);
        this.lastHeartbeat = System.currentTimeMillis();
    }

    /** Records a heartbeat, folding the interval since the previous one into the window. */
    public synchronized void recordHeartbeat() {
        long now = System.currentTimeMillis();
        long interval = now - lastHeartbeat;
        heartbeatIntervals.add(interval);
        lastHeartbeat = now;
    }

    /**
     * Computes phi for the given silence duration: 0 for a healthy node,
     * growing as the silence becomes implausible given past intervals.
     *
     * @param timeSinceLastHeartbeat milliseconds since the last heartbeat
     */
    public double calculatePhi(long timeSinceLastHeartbeat) {
        if (heartbeatIntervals.size() < 10) {
            // Not enough samples for a distribution: fall back to a fixed timeout.
            return timeSinceLastHeartbeat > WARMUP_TIMEOUT_MS ? PHI_THRESHOLD + 1 : 0;
        }
        double mean = heartbeatIntervals.stream()
            .mapToLong(Long::longValue)
            .average()
            .orElse(1000);
        double variance = heartbeatIntervals.stream()
            .mapToDouble(i -> Math.pow(i - mean, 2))
            .average()
            .orElse(0);
        double stdDev = Math.sqrt(variance);
        if (stdDev == 0) {
            // Perfectly regular heartbeats: avoid division by zero; any
            // silence beyond the mean interval is treated as failure.
            return timeSinceLastHeartbeat > mean ? PHI_THRESHOLD + 1 : 0;
        }
        // Normalize the silence, then map through a logistic approximation
        // of the normal CDF.
        double y = (timeSinceLastHeartbeat - mean) / stdDev;
        // Clamp the tail so log10(0) cannot produce -Infinity for large y.
        double tail = Math.max(1 - cdf(y), 1e-12);
        return -Math.log10(tail);
    }

    // Logistic function approximating the standard normal CDF.
    private double cdf(double y) {
        return 1.0 / (1.0 + Math.exp(-y));
    }
}
Example 2: Cascading Failure Prevention
/**
 * Layered protection against cascading failures using Resilience4j
 * primitives — rate limiting, bulkheads, and circuit breakers — plus
 * priority-based load shedding and dependency-aware health checks.
 */
public class CascadePreventionService {

    private final Map<String, ServiceHealth> services = new ConcurrentHashMap<>();
    private final CircuitBreakerRegistry circuitBreakers;
    private final BulkheadRegistry bulkheads;
    private final RateLimiterRegistry rateLimiters;

    /**
     * Executes {@code operation} behind three protection layers, returning
     * {@code fallback}'s value whenever a layer rejects the call.
     *
     * @param serviceName name of the downstream service (must be registered)
     * @param operation   the guarded call
     * @param fallback    degraded-mode result when the call is rejected
     * @throws IllegalArgumentException if the service is not registered
     */
    public <T> T executeProtected(
        String serviceName,
        Supplier<T> operation,
        Supplier<T> fallback) {
        ServiceHealth health = requireHealth(serviceName);
        // Layer 1: Rate limiting — shed excess traffic before it queues up.
        RateLimiter rateLimiter = rateLimiters.rateLimiter(serviceName);
        if (!rateLimiter.acquirePermission()) {
            health.recordRateLimited();
            return fallback.get();
        }
        // Layer 2: Bulkhead — cap concurrent calls so one slow dependency
        // cannot exhaust shared resources.
        Bulkhead bulkhead = bulkheads.bulkhead(serviceName);
        // Layer 3: Circuit breaker — fail fast while the service is down.
        CircuitBreaker circuitBreaker = circuitBreakers.circuitBreaker(serviceName);
        try {
            return Bulkhead.decorateSupplier(bulkhead,
                CircuitBreaker.decorateSupplier(circuitBreaker, () -> {
                    T result = operation.get();
                    health.recordSuccess();
                    return result;
                })
            ).get();
        } catch (BulkheadFullException e) {
            health.recordBulkheadRejection();
            return fallback.get();
        } catch (CallNotPermittedException e) {
            health.recordCircuitOpen();
            return fallback.get();
        } catch (Exception e) {
            // A genuine failure of the operation itself: record and rethrow.
            health.recordFailure(e);
            throw e;
        }
    }

    /**
     * Adaptive load shedding: accepts or rejects a request based on the
     * service's current load factor and the request's priority (higher
     * number = more important; >= 10 is never shed).
     */
    public boolean shouldAcceptRequest(String serviceName, int priority) {
        ServiceHealth health = requireHealth(serviceName);
        double loadFactor = health.getCurrentLoadFactor();
        // Accept all high-priority requests unconditionally.
        if (priority >= 10) {
            return true;
        }
        if (loadFactor > 0.9) {
            // Severe load - only high priority survives.
            return priority >= 8;
        } else if (loadFactor > 0.7) {
            // High load - shed low-priority traffic probabilistically; the
            // acceptance probability decays linearly from 1.0 at load 0.7
            // to 0.0 at load 0.9.
            double acceptProbability = 1.0 - ((loadFactor - 0.7) / 0.2);
            return priority >= 5 || Math.random() < acceptProbability;
        }
        return true;
    }

    /**
     * Health check that also inspects each dependency's circuit-breaker
     * state and bulkhead headroom; the service is UP only if it and all
     * dependencies are healthy.
     */
    public HealthCheckResult checkHealth(String serviceName) {
        ServiceHealth health = requireHealth(serviceName);
        List<DependencyHealth> dependencies = health.getDependencies().stream()
            .map(dep -> new DependencyHealth(
                dep,
                circuitBreakers.circuitBreaker(dep).getState().toString(),
                bulkheads.bulkhead(dep).getMetrics().getAvailableConcurrentCalls()
            ))
            .collect(Collectors.toList());
        boolean healthy = health.isHealthy() &&
            dependencies.stream().allMatch(DependencyHealth::isHealthy);
        return new HealthCheckResult(
            serviceName,
            healthy ? Status.UP : Status.DOWN,
            health.getMetrics(),
            dependencies
        );
    }

    // Looks up the tracked health for a service; failing loudly here beats
    // the NullPointerException a raw map lookup used to produce.
    private ServiceHealth requireHealth(String serviceName) {
        ServiceHealth health = services.get(serviceName);
        if (health == null) {
            throw new IllegalArgumentException("Unknown service: " + serviceName);
        }
        return health;
    }
}
/**
 * Mutable per-service health counters consumed by CascadePreventionService.
 *
 * NOTE(review): the recording methods that CascadePreventionService invokes
 * (recordSuccess, recordFailure, recordRateLimited, ...) plus getCapacity()
 * and the constructor are not visible in this file — presumably defined
 * elsewhere; confirm.
 */
class ServiceHealth {

    private final String name;
    private final AtomicLong successCount = new AtomicLong();
    private final AtomicLong failureCount = new AtomicLong();
    private final AtomicLong activeRequests = new AtomicLong();
    private final SlidingWindowCounter recentErrors;

    /**
     * Fraction of capacity currently in use (0.0 = idle, 1.0 = saturated).
     * A non-positive capacity is reported as fully loaded rather than
     * letting the division produce Infinity or NaN.
     */
    public double getCurrentLoadFactor() {
        long active = activeRequests.get();
        int capacity = getCapacity();
        if (capacity <= 0) {
            return 1.0;
        }
        return (double) active / capacity;
    }

    /** Fraction of recorded calls that failed; 0 when nothing has been recorded. */
    public double getErrorRate() {
        long total = successCount.get() + failureCount.get();
        if (total == 0) return 0;
        return (double) failureCount.get() / total;
    }

    /** Healthy = under 50% errors and under 95% of capacity in use. */
    public boolean isHealthy() {
        return getErrorRate() < 0.5 && getCurrentLoadFactor() < 0.95;
    }
}
Example 3: Partial Failure Handling
/**
 * Patterns for handling partial failure in distributed calls: resolving the
 * uncertain state left by a timeout, and rolling back multi-step workflows
 * with the saga pattern.
 *
 * NOTE(review): {@code executor}, {@code log}, {@code sleep(Duration)} and
 * {@code alertOps(...)} are not defined in this file — presumably provided
 * elsewhere; confirm.
 */
public class PartialFailureHandler {

    /**
     * Runs {@code operation} with a timeout. A timeout does NOT prove
     * failure — the work may still have completed remotely — so on timeout
     * the uncertain state is resolved via {@code statusChecker} instead of
     * assuming either outcome.
     *
     * @param operationId   identifier used to query the operation's status
     * @param operation     the work to run asynchronously
     * @param timeout       maximum time to wait for a direct result
     * @param statusChecker returns the result if the operation is known complete
     * @return the operation's result, obtained directly or via status check
     */
    public <T> T executeWithUncertaintyHandling(
        String operationId,
        Callable<T> operation,
        Duration timeout,
        Function<String, Optional<T>> statusChecker) {
        Future<T> future = executor.submit(operation);
        try {
            return future.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
        } catch (TimeoutException e) {
            // Operation timed out - uncertain state
            log.warn("Operation {} timed out, checking status", operationId);
            return handleUncertainState(operationId, future, statusChecker);
        } catch (ExecutionException e) {
            // Unwrap so callers see the operation's own failure, not the wrapper.
            throw new RuntimeException(e.getCause());
        } catch (InterruptedException e) {
            // Restore the interrupt flag for callers further up the stack.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Resolves an operation whose outcome is unknown: poll for completion
     * (3 tries, 1 s apart), then try to cancel, and finally surface the
     * uncertainty explicitly rather than guessing success or failure.
     */
    private <T> T handleUncertainState(
        String operationId,
        Future<T> future,
        Function<String, Optional<T>> statusChecker) {
        // Strategy 1: Check if operation actually completed
        for (int attempt = 0; attempt < 3; attempt++) {
            Optional<T> result = statusChecker.apply(operationId);
            if (result.isPresent()) {
                log.info("Operation {} completed despite timeout", operationId);
                return result.get();
            }
            sleep(Duration.ofSeconds(1));
        }
        // Strategy 2: Cancel if possible
        boolean cancelled = future.cancel(true);
        if (cancelled) {
            log.info("Cancelled operation {}", operationId);
        }
        // Strategy 3: Return uncertain result
        throw new UncertainOperationException(
            "Operation " + operationId + " state is uncertain"
        );
    }

    /**
     * Saga pattern for distributed transactions with compensation.
     *
     * Executes steps in order; if one fails, runs each completed step's
     * compensation in reverse order. Compensation failures are logged and
     * escalated for manual intervention but do not stop the rollback.
     *
     * @throws SagaFailedException after rollback, carrying the original cause
     */
    public void executeSaga(List<SagaStep> steps) {
        List<SagaStep> completedSteps = new ArrayList<>();
        try {
            for (SagaStep step : steps) {
                log.info("Executing saga step: {}", step.getName());
                step.execute();
                // Only steps that finished execute() need compensating later.
                completedSteps.add(step);
            }
        } catch (Exception e) {
            log.error("Saga failed at step, compensating", e);
            // Compensate in reverse order
            Collections.reverse(completedSteps);
            for (SagaStep step : completedSteps) {
                try {
                    log.info("Compensating step: {}", step.getName());
                    step.compensate();
                } catch (Exception compensationError) {
                    log.error("Compensation failed for step: {}", step.getName(),
                        compensationError);
                    // Log for manual intervention
                    alertOps("Saga compensation failed", step, compensationError);
                }
            }
            throw new SagaFailedException("Saga rolled back", e);
        }
    }
}
/**
 * One step of a saga: a forward action plus a compensating action that
 * undoes it when a later step fails.
 */
interface SagaStep {
    /** Human-readable step name, used in saga logs. */
    String getName();
    /** Performs the step's forward action. */
    void execute() throws Exception;
    /** Undoes the effect of a previously successful {@link #execute()}. */
    void compensate() throws Exception;
}
/**
 * Saga step that creates an order and compensates by cancelling it.
 *
 * The order request is now an injected field (the original referenced an
 * undeclared {@code orderRequest} and never initialized {@code orderService},
 * so it did not compile).
 */
class OrderSagaStep implements SagaStep {

    private final OrderService orderService;
    private final OrderRequest orderRequest;
    // Set by execute(); null means the order was never created.
    private String orderId;

    OrderSagaStep(OrderService orderService, OrderRequest orderRequest) {
        this.orderService = orderService;
        this.orderRequest = orderRequest;
    }

    @Override
    public String getName() {
        return "CreateOrder";
    }

    /** Creates the order, remembering its id for possible compensation. */
    @Override
    public void execute() throws Exception {
        orderId = orderService.createOrder(orderRequest);
    }

    /** Cancels the order only if execute() got far enough to create one. */
    @Override
    public void compensate() throws Exception {
        if (orderId != null) {
            orderService.cancelOrder(orderId);
        }
    }
}
Example 4: Network Partition Handling
/**
 * Detects network partitions, applies a configurable response strategy, and
 * reconciles divergent data once the partition heals.
 *
 * NOTE(review): {@code canReach}, {@code disableWrites},
 * {@code enableOptimisticWrites}, {@code updateRoutingTable},
 * {@code fetchDataFromNode}, {@code pushDataToNode}, {@code getLocalData},
 * {@code storeData}, {@code conflictResolver} and {@code log} are not
 * defined in this file — assumed provided elsewhere; confirm.
 */
public class PartitionHandler {

    private final ClusterMembership membership;
    private final PartitionDetector detector;

    /**
     * Detect network partition using gossip protocol: probe every known
     * member and split the membership into reachable/unreachable sets.
     */
    public PartitionStatus detectPartition() {
        Set<String> reachable = new HashSet<>();
        Set<String> unreachable = new HashSet<>();
        for (String node : membership.getAllNodes()) {
            if (canReach(node)) {
                reachable.add(node);
            } else {
                unreachable.add(node);
            }
        }
        if (unreachable.isEmpty()) {
            return PartitionStatus.healthy();
        }
        // Check if we're in minority partition.
        // Majority = floor(n/2) + 1; assumes canReach counts this node as
        // reachable — TODO confirm.
        boolean inMinority = reachable.size() < membership.getAllNodes().size() / 2 + 1;
        return new PartitionStatus(
            reachable,
            unreachable,
            inMinority,
            Instant.now()
        );
    }

    /**
     * Handle partition based on strategy: stop writes (CP), continue only
     * with quorum (CP), or keep accepting writes for later reconciliation (AP).
     */
    public void handlePartition(PartitionStatus status, PartitionStrategy strategy) {
        if (!status.isPartitioned()) {
            return;
        }
        log.warn("Network partition detected. Reachable: {}, Unreachable: {}",
            status.getReachable(), status.getUnreachable());
        switch (strategy) {
            case STOP_IF_MINORITY:
                if (status.isInMinority()) {
                    // Minority side must not accept writes or it will diverge.
                    log.error("In minority partition, stopping writes");
                    disableWrites();
                }
                break;
            case CONTINUE_WITH_QUORUM:
                if (status.hasQuorum()) {
                    log.info("Have quorum, continuing with available nodes");
                    updateRoutingTable(status.getReachable());
                } else {
                    disableWrites();
                }
                break;
            case OPTIMISTIC_WRITES:
                // AP choice: availability over consistency; conflicts are
                // resolved in reconcileAfterPartition().
                log.warn("Continuing with optimistic writes, may need reconciliation");
                enableOptimisticWrites();
                break;
        }
    }

    /**
     * Reconcile after partition heals: exchange data with each rejoining
     * node in both directions. A failure for one node does not abort
     * reconciliation with the others.
     */
    public void reconcileAfterPartition(Set<String> rejoiningNodes) {
        log.info("Reconciling with rejoining nodes: {}", rejoiningNodes);
        for (String node : rejoiningNodes) {
            try {
                // Get their view of data
                List<VersionedData> theirData = fetchDataFromNode(node);
                // Merge with our data
                for (VersionedData data : theirData) {
                    reconcileData(data);
                }
                // Push our data to them
                pushDataToNode(node, getLocalData());
            } catch (Exception e) {
                log.error("Reconciliation failed for node: {}", node, e);
            }
        }
    }

    /**
     * Merges one remote record using causal ordering of versions:
     * keep whichever version happened-after the other; concurrent versions
     * (neither happened-before the other) go to the conflict resolver.
     */
    private void reconcileData(VersionedData theirData) {
        VersionedData ourData = getLocalData(theirData.getKey());
        if (ourData == null) {
            // We don't have it, accept theirs
            storeData(theirData);
        } else if (theirData.getVersion().happenedBefore(ourData.getVersion())) {
            // Ours is newer, keep ours
        } else if (ourData.getVersion().happenedBefore(theirData.getVersion())) {
            // Theirs is newer, accept theirs
            storeData(theirData);
        } else {
            // Concurrent - need conflict resolution
            VersionedData resolved = conflictResolver.resolve(ourData, theirData);
            storeData(resolved);
        }
    }
}
/** How a node should behave while a network partition is in effect. */
enum PartitionStrategy {
    STOP_IF_MINORITY,     // CP - stop writes if in minority
    CONTINUE_WITH_QUORUM, // CP - continue only with quorum
    OPTIMISTIC_WRITES     // AP - accept writes, reconcile later
}
Example 5: Chaos Engineering Hooks
/**
 * AOP-based chaos injection: adds latency, failures, or slow responses to
 * methods annotated with {@code @ChaosEnabled}, driven by a per-method
 * ChaosConfig. Intended for resilience testing only.
 *
 * NOTE(review): {@code log} is not declared here — presumably Lombok
 * {@code @Slf4j}; confirm.
 */
@Component
public class ChaosMonkey {

    private final Map<String, ChaosConfig> chaosConfigs = new ConcurrentHashMap<>();
    private final Random random = new Random();

    /**
     * Wraps every {@code @ChaosEnabled} method. Each configured fault is
     * rolled independently, so a single call can experience both latency
     * and a failure.
     */
    @Around("@annotation(ChaosEnabled)")
    public Object injectChaos(ProceedingJoinPoint pjp) throws Throwable {
        String methodName = pjp.getSignature().toShortString();
        ChaosConfig config = chaosConfigs.get(methodName);
        if (config == null || !config.isEnabled()) {
            return pjp.proceed();
        }
        // Inject latency before the call proceeds.
        if (config.getLatencyMs() > 0 && random.nextDouble() < config.getLatencyProbability()) {
            // Random.nextInt(0) throws IllegalArgumentException, so only
            // add jitter when a positive jitter bound is configured.
            int jitterBound = config.getLatencyJitterMs();
            long delay = config.getLatencyMs()
                + (jitterBound > 0 ? random.nextInt(jitterBound) : 0);
            log.debug("Injecting {}ms latency into {}", delay, methodName);
            Thread.sleep(delay);
        }
        // Inject an outright failure instead of invoking the target.
        if (random.nextDouble() < config.getFailureProbability()) {
            log.debug("Injecting failure into {}", methodName);
            throw config.getExceptionSupplier().get();
        }
        // Inject a slow response: the call succeeds, the reply is delayed.
        if (random.nextDouble() < config.getSlowProbability()) {
            Object result = pjp.proceed();
            Thread.sleep(config.getSlowDelayMs());
            return result;
        }
        return pjp.proceed();
    }

    /** Enables chaos for a target method; logged at WARN on purpose. */
    public void enableChaos(String target, ChaosConfig config) {
        log.warn("Enabling chaos for: {} with config: {}", target, config);
        chaosConfigs.put(target, config);
    }

    /** Disables chaos for one target method. */
    public void disableChaos(String target) {
        chaosConfigs.remove(target);
    }

    /** Kill switch: removes all chaos configuration at once. */
    public void disableAll() {
        chaosConfigs.clear();
    }
}
/**
 * Immutable chaos-injection settings for a single target method.
 * Lombok {@code @Value} generates the getters; {@code @Builder} the builder.
 * Unset builder fields default to 0/false/null, which disables that fault.
 */
@Builder
@Value
class ChaosConfig {
    boolean enabled;
    double failureProbability; // 0.0 - 1.0
    double latencyProbability; // 0.0 - 1.0
    int latencyMs;             // base injected delay, ms
    int latencyJitterMs;       // extra random delay added on top of latencyMs
    double slowProbability;    // 0.0 - 1.0
    int slowDelayMs;           // delay added AFTER a successful call, ms
    Supplier<Exception> exceptionSupplier; // builds the injected failure
    /** Latency-only config: ~ms base delay plus 25% jitter, at the given probability. */
    public static ChaosConfig latency(int ms, double probability) {
        return ChaosConfig.builder()
            .enabled(true)
            .latencyProbability(probability)
            .latencyMs(ms)
            .latencyJitterMs(ms / 4) // NOTE(review): 0 for ms < 4, disabling jitter
            .build();
    }
    /** Failure-only config: throws RuntimeException at the given probability. */
    public static ChaosConfig failure(double probability) {
        return ChaosConfig.builder()
            .enabled(true)
            .failureProbability(probability)
            .exceptionSupplier(() -> new RuntimeException("Chaos failure"))
            .build();
    }
}
Anti-Patterns
❌ Unbounded Retries
// WRONG - can cause cascade
while (true) {
try {
return service.call();
} catch (Exception e) {
// Retry forever
}
}
// ✅ CORRECT - bounded retries with backoff
return Retry.of("service-call", RetryConfig.custom()
.maxAttempts(3)
.waitDuration(Duration.ofMillis(100))
.build()
).executeSupplier(() -> service.call());
❌ Ignoring Timeouts
Always set timeouts on remote calls; never wait forever.