Distributed Tracing: Observability in Microservices
Implement OpenTelemetry tracing for debugging distributed systems—but trace explosion overwhelms storage
Distributed Tracing: Observability in Microservices
Distributed tracing tracks requests across microservices. This guide implements OpenTelemetry for production observability.
OpenTelemetry Instrumentation
Auto-instrument Python services:
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
from flask import Flask
import requests
# Initialize tracer: install a TracerProvider as the process-wide provider,
# then obtain the module-level tracer that the functions in this file use to
# open their spans.
trace.set_tracer_provider(TracerProvider())
tracer = trace.get_tracer(__name__)

# Configure Jaeger exporter (agent addressed at localhost:6831).
jaeger_exporter = JaegerExporter(
    agent_host_name="localhost",
    agent_port=6831,
)
# Wrap the exporter in a BatchSpanProcessor and register it on the provider
# returned by trace.get_tracer_provider().
span_processor = BatchSpanProcessor(jaeger_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)

# Create Flask app
app = Flask(__name__)

# Auto-instrument Flask and requests library. The Flask instrumentor is
# applied to this specific app object; the requests instrumentor is applied
# without an argument.
FlaskInstrumentor().instrument_app(app)
RequestsInstrumentor().instrument()
@app.route("/api/order")
def create_order():
    """Handle an order request by calling inventory, payment and shipping in turn.

    The whole request runs inside a "create_order" span that records the
    (fixed) order id and the availability flag reported by the inventory
    service.

    Returns:
        The confirmation payload when the item is available, otherwise an
        error payload paired with HTTP status 400.

    Note:
        The payment and shipping responses are not inspected here; only the
        inventory response influences the result.
    """
    with tracer.start_as_current_span("create_order") as order_span:
        order_span.set_attribute("order.id", 12345)

        # Inventory first: its answer decides whether we continue at all.
        stock = check_inventory()
        in_stock = stock['available']
        order_span.set_attribute("inventory.available", in_stock)

        if in_stock:
            # Happy path: charge, then schedule delivery.
            process_payment()
            schedule_shipping()
            return {"order_id": 12345, "status": "confirmed"}

        # Out of stock: flag the span and answer with a client error.
        order_span.set_attribute("error", True)
        order_span.add_event("Inventory unavailable")
        return {"error": "Out of stock"}, 400
def check_inventory(timeout: float = 5.0):
    """Ask the inventory service about item 123 inside its own span.

    Args:
        timeout: Seconds to wait for the inventory service. Without a
            timeout, ``requests`` waits indefinitely and a stalled
            dependency would pin the calling worker (and keep the span
            open) forever.

    Returns:
        The decoded JSON body of the inventory response.

    Raises:
        requests.Timeout: The service did not answer within ``timeout``.
        requests.HTTPError: The service answered with a 4xx/5xx status.
    """
    with tracer.start_as_current_span("check_inventory"):
        response = requests.get(
            "http://inventory-service/check?item=123",
            timeout=timeout,
        )
        # Fail loudly on an error status instead of trying to interpret an
        # error body as a normal inventory answer.
        response.raise_for_status()
        return response.json()
def process_payment(amount: float = 99.99, timeout: float = 5.0):
    """Charge the payment service inside a "process_payment" span.

    Args:
        amount: Amount to charge. Defaults to the value that was previously
            hard-coded, so existing zero-argument callers are unaffected.
        timeout: Seconds to wait for the payment service. Without a timeout,
            ``requests`` waits indefinitely on a stalled dependency.

    Returns:
        The decoded JSON body of the payment response.

    Raises:
        requests.Timeout: The service did not answer within ``timeout``.
        requests.HTTPError: The service answered with a 4xx/5xx status.
    """
    with tracer.start_as_current_span("process_payment") as span:
        span.set_attribute("payment.amount", amount)
        response = requests.post(
            "http://payment-service/charge",
            json={"amount": amount},
            timeout=timeout,
        )
        # A rejected or failed charge must not look like success to the
        # caller, which does not inspect the returned body.
        response.raise_for_status()
        return response.json()
def schedule_shipping(timeout: float = 5.0):
    """Ask the shipping service to schedule a delivery inside its own span.

    Args:
        timeout: Seconds to wait for the shipping service. Without a
            timeout, ``requests`` waits indefinitely on a stalled dependency.

    Returns:
        The decoded JSON body of the shipping response.

    Raises:
        requests.Timeout: The service did not answer within ``timeout``.
        requests.HTTPError: The service answered with a 4xx/5xx status.
    """
    with tracer.start_as_current_span("schedule_shipping"):
        response = requests.post(
            "http://shipping-service/schedule",
            timeout=timeout,
        )
        # Surface a failed scheduling request instead of returning its
        # error body as if it were a normal result.
        response.raise_for_status()
        return response.json()
# Script entry point: start the Flask app on port 5000 only when this file
# is executed directly, not when it is imported.
if __name__ == "__main__":
    app.run(port=5000)
Custom Span Instrumentation
Add detailed tracing to critical paths:
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
import time
tracer = trace.get_tracer(__name__)
def complex_operation(user_id: int, data: dict):
    """Validate, query and process ``data`` for one user, tracing every step.

    A parent "complex_operation" span wraps three child spans
    ("validate_input", "database_query", "process_data"). The query span
    records its row count and duration and gets a "Slow query detected"
    event when the query takes longer than 100 ms.

    Args:
        user_id: Identifier passed to ``query_database`` and recorded on the
            parent span.
        data: Payload handed to ``heavy_computation``; must be non-empty.

    Returns:
        Whatever ``heavy_computation(data)`` returns.

    Raises:
        ValueError: ``data`` is empty or ``None``.
        Exception: Anything raised by ``query_database`` or
            ``heavy_computation`` propagates unchanged.
    """
    # No try/except here on purpose: start_as_current_span() records the
    # exception and sets an ERROR status on every span an exception passes
    # through (record_exception / set_status_on_exception default to True).
    # Recording it by hand as well would attach the same exception to the
    # parent span twice.
    with tracer.start_as_current_span("complex_operation") as span:
        span.set_attribute("user.id", user_id)
        # Guard len(): a None payload should reach validation and fail with
        # the intended ValueError rather than a TypeError raised here.
        span.set_attribute("data.size", len(data) if data else 0)

        # Step 1: Validate
        with tracer.start_as_current_span("validate_input") as validate_span:
            if not data:
                raise ValueError("Empty data")
            validate_span.add_event("Validation passed")

        # Step 2: Database query
        with tracer.start_as_current_span("database_query") as db_span:
            # perf_counter() is monotonic, so the measured duration cannot
            # go negative or jump when the wall clock is adjusted.
            start = time.perf_counter()
            results = query_database(user_id)
            duration_ms = (time.perf_counter() - start) * 1000

            db_span.set_attribute("db.statement", "SELECT * FROM users WHERE id = ?")
            db_span.set_attribute("db.rows_returned", len(results))
            db_span.set_attribute("db.duration_ms", duration_ms)

            # ⚠️ Slow query detection
            if duration_ms > 100:
                db_span.add_event("Slow query detected", {
                    "threshold_ms": 100,
                    "actual_ms": duration_ms,
                })

        # Step 3: Process data
        with tracer.start_as_current_span("process_data") as process_span:
            processed = heavy_computation(data)
            process_span.set_attribute("output.size", len(processed))

        span.set_status(Status(StatusCode.OK))
        return processed
Trace Sampling
Control trace volume:
from opentelemetry.sdk.trace.sampling import (
TraceIdRatioBased,
ParentBased,
ALWAYS_ON,
ALWAYS_OFF
)
class AdaptiveSampler:
    """Probabilistic sampler with a per-second cap on sampled traces.

    Each call to ``should_sample`` keeps a trace with probability
    ``base_rate`` until ``max_traces_per_sec`` traces have been kept in the
    current one-second window; after that, everything is dropped until the
    window resets. Traces explicitly flagged as errors are always kept and
    do not count against the cap.
    """

    def __init__(self, base_rate: float = 0.1, max_traces_per_sec: int = 1000):
        """Create a sampler.

        Args:
            base_rate: Probability in [0, 1] of keeping a non-error trace.
            max_traces_per_sec: Maximum number of probabilistically sampled
                traces per one-second window.
        """
        self.base_rate = base_rate
        self.max_traces_per_sec = max_traces_per_sec
        self.current_traces_per_sec = 0
        # Monotonic clock: a wall-clock adjustment must not stretch or
        # shrink the rate-limit window.
        self.last_reset = time.monotonic()

    def should_sample(self, trace_id: int, *, is_error: bool = False) -> bool:
        """Decide whether the trace should be kept.

        Args:
            trace_id: Identifier of the trace. It does not influence the
                decision: trace IDs are random and carry no error
                information, so testing one of their bits (as this method
                used to) sampled roughly half of all traffic and bypassed
                both ``base_rate`` and the rate limit.
            is_error: Set to True by the caller when the trace is known to
                contain an error; such traces are always sampled.

        Returns:
            True if the trace should be sampled, False otherwise.
        """
        # Local import, so this class adds no new file-level dependency.
        import random

        now = time.monotonic()
        # Reset counter every second
        if now - self.last_reset > 1.0:
            self.current_traces_per_sec = 0
            self.last_reset = now

        # Always sample errors, regardless of rate limit and base rate.
        if is_error:
            return True

        # Apply rate limit
        if self.current_traces_per_sec >= self.max_traces_per_sec:
            return False

        # Probabilistic sampling
        if random.random() < self.base_rate:
            self.current_traces_per_sec += 1
            return True
        return False
# Parent-based sampling: root spans are kept at a 10% ratio, and child spans
# inherit whatever decision was made for their parent.
_root_sampler = TraceIdRatioBased(0.1)
sampler = ParentBased(root=_root_sampler)
Warnings ⚠️
Trace Explosion: High-traffic systems generate millions of traces per second. Storage costs spiral. The 2035 "Trace Storm" accumulated 50PB of trace data, bankrupting startups.
Performance Overhead: Tracing adds latency (typically 1-5ms per traced operation).
PII Leakage: Traces often capture sensitive data in attributes. GDPR violations are common.
Related Chronicles: The Observability Collapse (2035)
Tools: OpenTelemetry, Jaeger, Zipkin, Honeycomb, Datadog APM
Related Research
Service Mesh with Istio: Secure Microservice Communication
Deploy Istio for traffic management and mTLS—but misconfiguration causes cascading failures
eBPF: Programmable Kernel Observability
Implement eBPF probes for deep system visibility—but kernel bugs cause panics
When Post-Scarcity Destroyed Civilization (Infinite Abundance, Zero Motivation)
Molecular assemblers + fusion power + ASI = post-scarcity. Anything anyone wants, instantly, free. No more work, competition, or achievement. Society collapsed—not from disaster, but from success. Humans can't function without scarcity. Hard science exploring post-scarcity dangers, abundance psychology, and why humans need struggle to thrive.