Introduction to SRE
Site Reliability Engineering (SRE) is a discipline that incorporates aspects of software engineering and applies them to infrastructure and operations problems. As defined by Google, the primary goal is to create scalable and highly reliable software systems.
1. The Service Level Hierarchy
Service Level Indicators (SLIs)
SLIs are carefully defined quantitative measures of some aspect of the level of service that is provided.
# Common SLI Categories and Metrics
import numpy as np


class SLIMetrics:
    """
    Key Service Level Indicators
    """

    # Availability SLI
    @staticmethod
    def availability(successful_requests, total_requests):
        """
        Percentage of requests that were successful
        """
        return (successful_requests / total_requests) * 100

    # Latency SLI
    @staticmethod
    def latency_percentile(latencies, percentile=99):
        """
        Response time at a given percentile
        """
        return np.percentile(latencies, percentile)

    # Throughput SLI
    @staticmethod
    def throughput(requests, time_window_seconds):
        """
        Requests per second
        """
        return requests / time_window_seconds

    # Error Rate SLI
    @staticmethod
    def error_rate(errors, total_requests):
        """
        Percentage of requests that resulted in errors
        """
        return (errors / total_requests) * 100
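A quick usage sketch of these helpers with made-up sample data (the request counts and latencies below are purely illustrative):

# Illustrative only: hypothetical request counts and latencies in milliseconds
sample_latencies = [12, 48, 51, 95, 120, 180, 240, 400]

print(SLIMetrics.availability(successful_requests=9990, total_requests=10000))  # 99.9
print(SLIMetrics.latency_percentile(sample_latencies, percentile=99))           # ~388.8
print(SLIMetrics.throughput(requests=10000, time_window_seconds=60))            # ~166.7 req/s
print(SLIMetrics.error_rate(errors=10, total_requests=10000))                   # 0.1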
Service Level Objectives (SLOs)
SLOs are target values or ranges for a service level measured by an SLI.
# SLO Definition Examples
SLO_DEFINITIONS = {
    "api_availability": {
        "sli": "availability",
        "target": 99.9,  # 99.9% of requests succeed
        "window": "30d",
        "description": "99.9% of API requests should be successful"
    },
    "api_latency_p99": {
        "sli": "latency_p99",
        "target": 200,  # 200ms
        "unit": "milliseconds",
        "window": "30d",
        "description": "99% of requests should complete in under 200ms"
    },
    "api_latency_p50": {
        "sli": "latency_p50",
        "target": 50,  # 50ms
        "unit": "milliseconds",
        "window": "30d",
        "description": "50% of requests should complete in under 50ms"
    }
}
# SLO to Error Budget mapping
def calculate_allowed_downtime(slo_target, window_days):
    """
    Calculate allowed downtime based on SLO
    99.9% SLO over 30 days = 43.2 minutes allowed downtime
    99.99% SLO over 30 days = 4.32 minutes allowed downtime
    """
    total_minutes = window_days * 24 * 60
    error_budget_percent = 100 - slo_target
    allowed_downtime = total_minutes * (error_budget_percent / 100)
    return allowed_downtime  # in minutes
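Plugging in the targets from the docstring confirms the numbers:

print(calculate_allowed_downtime(slo_target=99.9, window_days=30))   # 43.2 minutes
print(calculate_allowed_downtime(slo_target=99.99, window_days=30))  # ~4.32 minutes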
Service Level Agreements (SLAs)
SLAs are explicit or implicit contracts with users about the consequences of meeting or missing SLOs.
| SLO Target | Monthly Downtime | Daily Downtime | Common Use Case |
|---|---|---|---|
| 99% | 7.2 hours | 14.4 minutes | Internal tools |
| 99.9% | 43.2 minutes | 1.44 minutes | Web applications |
| 99.99% | 4.32 minutes | 8.64 seconds | Critical infrastructure |
| 99.999% | 26 seconds | 0.86 seconds | Financial systems |
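In code, an SLA can be modeled as an externally committed target plus the contractual consequences attached to it. The sketch below is purely illustrative; the tier structure, credit percentages, and field names are invented, not taken from any real agreement:

# Hypothetical SLA: an external commitment plus consequences for missing it
SLA_DEFINITIONS = {
    "api_availability_sla": {
        "slo_reference": "api_availability",  # internal SLO from SLO_DEFINITIONS above
        "committed_target": 99.5,             # external target, deliberately looser than the 99.9% internal SLO
        "measurement_window": "monthly",
        "consequences": [
            {"availability_below": 99.5, "service_credit_percent": 10},
            {"availability_below": 99.0, "service_credit_percent": 25},
        ],
    }
}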
2. Error Budgets
Error budgets represent the acceptable amount of unreliability based on your SLO.
class ErrorBudget:
    def __init__(self, slo_target: float, window_days: int = 30):
        self.slo_target = slo_target
        self.window_days = window_days
        self.budget = 100 - slo_target  # e.g., 0.1% for 99.9% SLO

    def calculate_remaining(self, current_availability: float) -> dict:
        """
        Calculate remaining error budget
        """
        consumed = self.slo_target - current_availability
        remaining = self.budget - consumed
        return {
            "total_budget": self.budget,
            "consumed": max(0, consumed),
            "remaining": max(0, remaining),
            "remaining_percent": (remaining / self.budget) * 100,
            "burn_rate": consumed / self.budget,
            "is_exhausted": remaining <= 0
        }

    def time_to_exhaustion(self, burn_rate_per_day: float) -> float:
        """
        Calculate days until error budget is exhausted
        """
        if burn_rate_per_day <= 0:
            return float('inf')
        return self.budget / burn_rate_per_day


# Usage example
budget = ErrorBudget(slo_target=99.9, window_days=30)
status = budget.calculate_remaining(current_availability=99.85)
print(f"Error budget remaining: {status['remaining_percent']:.1f}%")
3. SLO-Based Alerting
# Multi-window, multi-burn-rate alerting
# Based on Google's SRE Workbook recommendations
ALERT_CONFIGS = {
    "fast_burn": {
        # Page: consuming 2% of the monthly budget in 1 hour
        "short_window": "5m",
        "long_window": "1h",
        "burn_rate_threshold": 14.4,  # 2% / (1/720) = 14.4x the normal rate
        "severity": "page"
    },
    "slow_burn": {
        # Ticket: consuming 5% of the monthly budget in 6 hours
        "short_window": "30m",
        "long_window": "6h",
        "burn_rate_threshold": 6,  # 5% / (6/720) = 6x the normal rate
        "severity": "ticket"
    },
    "low_burn": {
        # Log: consuming 10% of the monthly budget in 3 days
        "short_window": "6h",
        "long_window": "3d",
        "burn_rate_threshold": 1,  # normal burn rate
        "severity": "log"
    }
}
def calculate_burn_rate(errors, requests, slo_target):
    """
    Calculate burn rate relative to error budget consumption.
    errors and requests are the counts observed over the alert window.
    """
    error_rate = errors / requests if requests > 0 else 0
    error_budget = 1 - (slo_target / 100)
    # Burn rate = (current error rate) / (allowed error rate)
    burn_rate = error_rate / error_budget if error_budget > 0 else 0
    return burn_rate
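To turn ALERT_CONFIGS into an alert decision, both the long and the short window should exceed the burn-rate threshold: the long window establishes that enough budget has actually burned, while the short window confirms the problem is still happening. A minimal sketch, assuming you can fetch per-window error and request counts from your metrics backend (the counts dict below is hypothetical):

def should_alert(counts, slo_target, config):
    """
    Fire only if BOTH the long and the short window exceed the burn-rate
    threshold, per the multi-window, multi-burn-rate pattern.
    counts maps window name -> (errors, requests) for that window.
    """
    long_errors, long_requests = counts[config["long_window"]]
    short_errors, short_requests = counts[config["short_window"]]
    long_burn = calculate_burn_rate(long_errors, long_requests, slo_target)
    short_burn = calculate_burn_rate(short_errors, short_requests, slo_target)
    return (long_burn >= config["burn_rate_threshold"]
            and short_burn >= config["burn_rate_threshold"])

# Hypothetical counts for a 99.9% SLO: a 2% error rate in both windows is a ~20x burn rate
counts = {"1h": (720, 36000), "5m": (60, 3000)}
if should_alert(counts, slo_target=99.9, config=ALERT_CONFIGS["fast_burn"]):
    print("Page: fast burn of error budget detected")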
4. Implementing SRE Practices
Monitoring and Observability
from prometheus_client import Counter, Histogram, Gauge
import time

# Define metrics
REQUEST_COUNT = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status']
)

REQUEST_LATENCY = Histogram(
    'http_request_duration_seconds',
    'HTTP request latency in seconds',
    ['method', 'endpoint'],
    buckets=[.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10]
)

ERROR_BUDGET_REMAINING = Gauge(
    'error_budget_remaining_percent',
    'Remaining error budget as percentage',
    ['service', 'slo_name']
)


# Middleware for automatic metrics collection
async def metrics_middleware(request, call_next):
    start_time = time.time()
    try:
        response = await call_next(request)
        status = response.status_code
    except Exception:
        status = 500
        raise
    finally:
        duration = time.time() - start_time
        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=request.url.path,
            status=status
        ).inc()
        REQUEST_LATENCY.labels(
            method=request.method,
            endpoint=request.url.path
        ).observe(duration)
    return response
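The middleware above follows the ASGI (request, call_next) style, so wiring it up depends on your framework. The sketch below assumes FastAPI and uses prometheus_client's ASGI app to expose a scrape endpoint; the app and route are placeholders:

from fastapi import FastAPI
from prometheus_client import make_asgi_app

app = FastAPI()

# Register the middleware defined above for every HTTP request
app.middleware("http")(metrics_middleware)

# Expose the collected metrics for Prometheus to scrape
app.mount("/metrics", make_asgi_app())

@app.get("/healthz")
async def healthz():
    return {"status": "ok"}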
Incident Management
from datetime import datetime

# Incident, TimelineEntry, Postmortem, and NotificationService are assumed
# domain classes defined elsewhere in the codebase.
class IncidentManager:
    """
    Incident lifecycle management following SRE practices
    """

    def __init__(self):
        self.active_incidents = {}
        self.notification_service = NotificationService()

    async def declare_incident(self, title: str, severity: str, affected_services: list):
        incident = Incident(
            id=self.generate_id(),
            title=title,
            severity=severity,
            affected_services=affected_services,
            status="active",
            timeline=[
                TimelineEntry(
                    timestamp=datetime.now(),
                    action="Incident declared",
                    author="system"
                )
            ]
        )
        self.active_incidents[incident.id] = incident

        # Assign roles
        incident.incident_commander = await self.assign_ic()
        incident.communications_lead = await self.assign_comm_lead()

        # Start communication
        await self.notification_service.broadcast(
            channel="incidents",
            message=self.format_incident_notification(incident)
        )

        # Create status page entry
        await self.update_status_page(incident)
        return incident

    async def run_postmortem(self, incident_id: str):
        incident = self.active_incidents[incident_id]
        postmortem = Postmortem(
            incident_id=incident_id,
            summary=incident.title,
            timeline=incident.timeline,
            impact=self.calculate_impact(incident),
            root_causes=[],
            action_items=[],
            lessons_learned=[]
        )
        return postmortem
5. Toil Reduction
Toil is manual, repetitive, automatable work that scales linearly with service growth.
# Example: automating certificate renewal
from datetime import datetime

# CertManager, Certificate, and RenewalError are assumed to come from the
# certificate tooling used in your environment.
class CertificateAutomation:
    """
    Automated certificate management to reduce toil
    """

    def __init__(self, cert_manager: CertManager):
        self.cert_manager = cert_manager
        self.renewal_threshold_days = 30

    async def check_and_renew_certificates(self):
        """
        Automatically renew certificates approaching expiration
        """
        certs = await self.cert_manager.list_certificates()
        for cert in certs:
            days_until_expiry = (cert.expiry - datetime.now()).days
            if days_until_expiry <= self.renewal_threshold_days:
                try:
                    await self.renew_certificate(cert)
                    await self.notify_success(cert)
                except RenewalError as e:
                    await self.notify_failure(cert, e)

    async def renew_certificate(self, cert: Certificate):
        # Request new certificate
        new_cert = await self.cert_manager.request_certificate(
            domain=cert.domain,
            type=cert.type
        )
        # Deploy to load balancer
        await self.deploy_certificate(new_cert)
        # Verify deployment
        await self.verify_certificate(new_cert)
        return new_cert
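To actually eliminate the toil, the check has to run unattended. A minimal scheduling sketch using asyncio; the daily interval and the CertManager construction are assumptions for illustration:

import asyncio

async def run_renewal_loop(automation: CertificateAutomation, interval_hours: int = 24):
    """Run the certificate check on a fixed interval, e.g. once a day."""
    while True:
        await automation.check_and_renew_certificates()
        await asyncio.sleep(interval_hours * 3600)

# Entry point (CertManager is the assumed client used by the class above)
# asyncio.run(run_renewal_loop(CertificateAutomation(CertManager())))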
Conclusion
SRE is about finding the right balance between reliability and innovation. By defining clear SLOs, managing error budgets, and automating toil, teams can achieve sustainable reliability while maintaining velocity.
Key Takeaways from Google's SRE Books
- Define SLIs that matter to users, not just internal metrics
- Set SLOs that are achievable and meaningful
- Use error budgets to balance reliability with feature velocity
- Implement multi-window, multi-burn-rate alerting
- Conduct blameless postmortems to learn from incidents
- Automate toil to free engineers for strategic work
- Cap operational work at 50% of SRE time so at least half is spent on engineering