Monitoring & Observability for ML Systems
Published: November 2025 | 28 min read
The Three Pillars of ML Observability
Effective monitoring of machine learning systems requires going beyond traditional software metrics to include specialized ML-specific telemetry. A comprehensive monitoring strategy should cover:
1. System Metrics
- Resource utilization (CPU, GPU, memory)
- Request rates and latencies
- Error rates and types
- Container/pod health
2. Data Quality
- Feature distributions
- Data drift detection
- Missing values
- Outlier detection
3. Model Performance
- Prediction accuracy
- Business metrics
- Concept drift
- Feature importance shifts
Implementing ML Monitoring with Prometheus and Grafana
# monitoring/metrics.py
from prometheus_client import start_http_server, Gauge, Histogram, Counter
import time
import numpy as np
from typing import Dict, Any, List, Optional
import logging
logger = logging.getLogger(__name__)
class ModelMetrics:
    """Track model-serving metrics and expose them through Prometheus.

    Every metric carries a ``model_name`` label, so several ``ModelMetrics``
    instances (one per model) can live in the same process and report into
    the same set of collectors.
    """

    # Collectors are created once and shared by all instances: registering a
    # metric name twice in prometheus_client's default registry raises
    # ValueError, which would make a second ModelMetrics(...) crash.
    _collectors = None

    def __init__(self, model_name: str, label_names: Optional[List[str]] = None):
        """Bind this instance to ``model_name`` and attach the shared collectors.

        Args:
            model_name: Value used for the ``model_name`` label on every metric.
            label_names: Stored on the instance as ``self.label_names``; no
                metric defined in this class uses it.
        """
        self.model_name = model_name
        self.label_names = label_names or []

        collectors = ModelMetrics._shared_collectors()
        # Common metrics
        self.request_counter = collectors['request_counter']
        self.prediction_latency = collectors['prediction_latency']
        self.prediction_errors = collectors['prediction_errors']
        # Data quality metrics
        self.feature_drift = collectors['feature_drift']
        # Model performance metrics
        self.prediction_distribution = collectors['prediction_distribution']
        # Business metrics
        self.business_metric = collectors['business_metric']

    @classmethod
    def _shared_collectors(cls) -> Dict[str, Any]:
        """Create the Prometheus collectors on first use and return them."""
        # Always cache on ModelMetrics itself (not ``cls``) so subclasses
        # reuse the same collectors instead of registering them again.
        if ModelMetrics._collectors is None:
            ModelMetrics._collectors = {
                'request_counter': Counter(
                    'model_requests_total',
                    'Total number of prediction requests',
                    ['model_name', 'endpoint']
                ),
                'prediction_latency': Histogram(
                    'model_prediction_latency_seconds',
                    'Prediction latency in seconds',
                    ['model_name', 'endpoint'],
                    buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
                ),
                'prediction_errors': Counter(
                    'model_prediction_errors_total',
                    'Total number of prediction errors',
                    ['model_name', 'endpoint', 'error_type']
                ),
                'feature_drift': Gauge(
                    'model_feature_drift',
                    'Feature drift score',
                    ['model_name', 'feature_name']
                ),
                'prediction_distribution': Histogram(
                    'model_prediction_distribution',
                    'Distribution of prediction values',
                    ['model_name'],
                    buckets=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
                ),
                'business_metric': Gauge(
                    'model_business_metric',
                    'Business metric (e.g., revenue, conversion rate)',
                    ['model_name', 'metric_name']
                ),
            }
        return ModelMetrics._collectors

    def record_prediction(
        self,
        endpoint: str,
        features: Dict[str, Any],
        prediction: float,
        label: Optional[float] = None,
        metadata: Optional[Dict[str, Any]] = None,
        latency_seconds: Optional[float] = None
    ) -> bool:
        """Record one prediction and its associated metrics.

        Args:
            endpoint: Value for the ``endpoint`` label.
            features: Mapping of feature name to value. Values that cannot be
                converted to ``float`` are skipped.
            prediction: Value observed into the prediction-distribution
                histogram.
            label: Accepted for interface compatibility; currently unused.
            metadata: Optional dict. If it contains a ``'business_metric'``
                mapping, each ``name -> value`` pair is written to the
                business-metric gauge.
            latency_seconds: Optional prediction latency. When given, it is
                observed into the latency histogram; when omitted, the
                latency histogram is left untouched.

        Returns:
            True if all metrics were recorded, False if recording failed
            (the failure is logged, never raised).
        """
        # Best-effort by design: a metrics failure must not break the
        # prediction path of the caller, so everything is caught and logged.
        try:
            # Increment request counter
            self.request_counter.labels(
                model_name=self.model_name,
                endpoint=endpoint
            ).inc()

            # Record latency only when the caller measured it
            if latency_seconds is not None:
                self.prediction_latency.labels(
                    model_name=self.model_name,
                    endpoint=endpoint
                ).observe(latency_seconds)

            # Record prediction distribution
            self.prediction_distribution.labels(
                model_name=self.model_name
            ).observe(prediction)

            # Record feature drift (simplified example): the gauge holds the
            # latest raw feature value, not a real drift score. In practice,
            # you'd calculate drift using a reference distribution.
            for feature_name, value in features.items():
                try:
                    numeric_value = float(value)
                except (TypeError, ValueError):
                    # A gauge can only hold numbers; skipping keeps one
                    # categorical/missing feature from aborting the rest.
                    continue
                self.feature_drift.labels(
                    model_name=self.model_name,
                    feature_name=feature_name
                ).set(numeric_value)

            # Record business metrics if available
            if metadata and 'business_metric' in metadata:
                for metric_name, metric_value in metadata['business_metric'].items():
                    self.business_metric.labels(
                        model_name=self.model_name,
                        metric_name=metric_name
                    ).set(metric_value)

            return True
        except Exception as e:
            logger.error("Failed to record metrics: %s", e)
            return False

    def record_error(
        self,
        endpoint: str,
        error_type: str,
        error_message: str = "",
        exception: Optional[Exception] = None
    ) -> None:
        """Count and log an error that occurred during prediction.

        Args:
            endpoint: Value for the ``endpoint`` label.
            error_type: Value for the ``error_type`` label.
            error_message: Text included in the log line.
            exception: Optional exception passed to the logger as ``exc_info``.
        """
        self.prediction_errors.labels(
            model_name=self.model_name,
            endpoint=endpoint,
            error_type=error_type
        ).inc()
        logger.error(
            f"Prediction error in {endpoint}: {error_message}",
            exc_info=exception
        )
# Example usage: serve metrics over HTTP and feed them synthetic traffic.
if __name__ == "__main__":
    # Start the Prometheus metrics HTTP server on port 8000.
    start_http_server(8000)

    metrics = ModelMetrics(model_name="fraud_detection")

    # One simulated prediction per second, until the process is stopped.
    while True:
        # Draw a random feature vector for a fake transaction.
        sample_features = dict(
            amount=np.random.normal(100, 50),
            transaction_hour=np.random.randint(0, 24),
            user_risk_score=np.random.uniform(0, 1),
        )
        score = np.random.uniform(0, 1)
        business_values = dict(
            revenue=np.random.uniform(10, 1000),
            conversion_rate=np.random.uniform(0, 1),
        )

        metrics.record_prediction(
            endpoint="/predict",
            features=sample_features,
            prediction=score,
            metadata={'business_metric': business_values},
        )

        # Roughly one iteration in ten also reports a validation error.
        should_fail = np.random.random() < 0.1
        if should_fail:
            try:
                raise ValueError("Invalid input features")
            except Exception as err:
                metrics.record_error(
                    endpoint="/predict",
                    error_type="validation_error",
                    error_message=str(err),
                    exception=err,
                )

        time.sleep(1)
Grafana Dashboard for ML Monitoring
```json
{ "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Dashboard --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": 1, "links": [], "panels": [ { "datasource": "Prometheus", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.5 } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, "id": 2, "options": { "legend": { "calcs": [ "mean", "max", "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "expr": "histogram_quantile(0.99, sum by (le, endpoint) (rate(model_prediction_latency_seconds_bucket{model_name=\"fraud_detection\"}[5m])))", "legendFormat": "{{endpoint}} - p99", "refId": "A" } ], "title": "Prediction Latency (p99)", "type": "timeseries" }, { "datasource": "Prometheus", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.5 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, "id": 3, "options": { "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "pluginVersion": "9.0.0", "targets": [ { "expr": "sum(rate(model_prediction_errors_total{model_name=\"fraud_detection\"}[5m])) / sum(rate(model_requests_total{model_name=\"fraud_detection\"}[5m]))", "legendFormat": "Error Rate", "refId": "A" } ], "title": "Error Rate", "type": "gauge" }, { "datasource": "Prometheus", "fieldConfig": { "defaults": { "color": { "mode":
"palette-classic" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "id": 4, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "expr": "sum(rate(model_requests_total{model_name=\"fraud_detection\"}[5m])) by (endpoint)", "legendFormat": "{{endpoint}}", "refId": "A" } ], "title": "Request Rate", "type": "timeseries" }, { "datasource": "Prometheus", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.8 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, "id": 5, "options": { "legend": { "calcs": [ "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "expr": "model_feature_drift{model_name=\"fraud_detection\"}", "legendFormat": "{{feature_name}}", "refId": "A" } ], "title": "Feature Drift", "type": "timeseries" }, { "datasource": "Prometheus", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "id": 6, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "expr": "model_business_metric{model_name=\"fraud_detection\"}", "legendFormat": "{{metric_name}}", "refId": "A" } ], "title": "Business Metrics", "type": "timeseries" }, { "datasource": "Prometheus", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "thresholds": { "mode":
"absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, "id": 7, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "expr": "sum by (error_type) (rate(model_prediction_errors_total{model_name=\"fraud_detection\"}[5m]))", "legendFormat": "{{error_type}}", "refId": "A" } ], "title": "Error Types", "type": "timeseries" } ], "refresh": "5s", "schemaVersion": 36, "style": "dark", "tags": ["ml", "monitoring"], "templating": { "list": [ { "current": { "selected": false, "text": "fraud_detection", "value": "fraud_detection" }, "hide": 0, "includeAll": false, "label": "Model", "multi": false, "name": "model", "options": [ { "selected": true, "text": "fraud_detection", "value": "fraud_detection" } ], "query": "fraud_detection", "queryValue": "", "skipUrlSync": false, "type": "custom" } ] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ] }, "timezone": "browser", "title": "ML Model Monitoring Dashboard", "version": 1, "weekStart": "" }
```