魔物公寓
45.13M · 2026-03-13
可观测性(Observability)是现代分布式系统的核心能力,它通过三大支柱帮助我们理解系统内部状态:
┌─────────────────────────────────────────────────────────────┐
│ 可观测性体系 │
├─────────────────────────────────────────────────────────────┤
│ │
│ Logs (日志) Metrics (指标) Traces │
│ ───────────── ─────────────── ────────── │
│ What happened? How many/fast? Why slow? │
│ 发生了什么? 数量/速度如何? 为什么慢? │
│ │
│ • 离散事件 • 聚合数据 • 请求流程 │
│ • 详细上下文 • 时序统计 • 调用链条 │
│ • 问题定位 • 趋势分析 • 性能瓶颈 │
│ │
└─────────────────────────────────────────────────────────────┘
/**
* 可观测性三大支柱的协同工作示例
* 场景:一次LLM API调用
*/
// 1. TRACE: 记录请求链路(traceId: abc123)
// Span1: HTTP Request [100ms]
// └─ Span2: LLM API Call [80ms]
// └─ Span3: Token Processing [10ms]
// 2. METRICS: 记录聚合指标
// llm.request.duration.p99 = 150ms
// llm.token.usage.total = 1500
// llm.error.rate = 0.02
// 3. LOGS: 记录详细事件
// [INFO] traceId=abc123 Request received: user_id=u123
// [DEBUG] traceId=abc123 Sending to OpenAI: tokens=500
// [WARN] traceId=abc123 Rate limit approached: 80% capacity
// [INFO] traceId=abc123 Response sent: latency=100ms
| 支柱 | 优势 | 劣势 | 典型使用场景 |
|---|---|---|---|
| Logs | 详细信息丰富 | 存储成本高,查询慢 | 问题复现、错误调试 |
| Metrics | 高效存储,快速查询 | 缺少上下文 | 实时监控、告警触发 |
| Traces | 完整调用链路 | 采样率限制 | 性能分析、依赖排查 |
最佳实践:通过 traceId 关联三者,实现从告警到根因的快速定位:
Metrics 告警 → 找到异常时间点
↓
Traces 定位 → 找到慢请求的 traceId
↓
Logs 分析 → 通过 traceId 查看详细日志
↓
根因确认 → 解决问题
结构化日志是可观测性的基础,它让日志从"人类可读"变为"机器可解析"。
logback-spring.xml 完整配置:
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<!-- 日志格式:JSON 结构化输出 -->
<appender name="JSON_CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder class="net.logstash.logback.encoder.LogstashEncoder">
<includeMdcKeyName>traceId</includeMdcKeyName>
<includeMdcKeyName>userId</includeMdcKeyName>
<includeMdcKeyName>modelName</includeMdcKeyName>
<customFields>{"service":"llm-gateway","env":"${SPRING_PROFILES_ACTIVE}"}</customFields>
</encoder>
</appender>
<!-- 文件输出:按日期和大小滚动 -->
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/application.log</file>
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>logs/application-%d{yyyy-MM-dd}.%i.log.gz</fileNamePattern>
<maxFileSize>100MB</maxFileSize>
<maxHistory>30</maxHistory>
<totalSizeCap>10GB</totalSizeCap>
</rollingPolicy>
<encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
</appender>
<!-- 异步日志:提升性能 -->
<appender name="ASYNC_FILE" class="ch.qos.logback.classic.AsyncAppender">
<queueSize>512</queueSize>
<discardingThreshold>0</discardingThreshold>
<appender-ref ref="FILE"/>
</appender>
<root level="INFO">
<appender-ref ref="JSON_CONSOLE"/>
<appender-ref ref="ASYNC_FILE"/>
</root>
</configuration>
MDC(Mapped Diagnostic Context)让日志携带请求上下文:
import java.io.IOException;
import java.util.UUID;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import org.springframework.stereotype.Component;
/**
 * MDC filter: injects a correlation traceId (plus request path and, when
 * available, the user id) into the SLF4J MDC for every HTTP request, so all
 * log lines emitted while handling the request carry the same context.
 */
@Component
public class MdcFilter implements Filter {

    private static final String TRACE_ID = "traceId";
    private static final String USER_ID = "userId";
    private static final String REQUEST_PATH = "requestPath";
    private static final String BEARER_PREFIX = "Bearer ";

    /**
     * Wraps the downstream chain with MDC context.
     * Reuses an incoming {@code X-Trace-Id} header when present and non-blank
     * (the original only checked for null, letting empty headers through);
     * otherwise generates a fresh 32-char hex id.
     */
    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
                         FilterChain chain) throws IOException, ServletException {
        try {
            HttpServletRequest httpRequest = (HttpServletRequest) request;
            // Propagate upstream trace id when supplied; blank counts as absent.
            String traceId = httpRequest.getHeader("X-Trace-Id");
            if (traceId == null || traceId.trim().isEmpty()) {
                traceId = UUID.randomUUID().toString().replace("-", "");
            }
            MDC.put(TRACE_ID, traceId);
            MDC.put(REQUEST_PATH, httpRequest.getRequestURI());
            // Attach user context when credentials are present.
            String userId = extractUserId(httpRequest);
            if (userId != null) {
                MDC.put(USER_ID, userId);
            }
            chain.doFilter(request, response);
        } finally {
            // Always clear: servlet containers pool threads, and stale MDC
            // entries would bleed into unrelated requests (memory/context leak).
            MDC.clear();
        }
    }

    /**
     * Extracts the user id from the Authorization header.
     *
     * @return the user id, or {@code null} when no credentials are present
     */
    private String extractUserId(HttpServletRequest request) {
        String token = request.getHeader("Authorization");
        if (token != null) {
            return parseUserIdFromToken(token);
        }
        return null;
    }

    /**
     * Placeholder JWT parser. The original snippet called this method without
     * defining it (a compile error). Wire this up to the project's JWT library
     * to read the subject/userId claim; until then it returns {@code null},
     * which simply leaves the userId MDC entry unset.
     */
    private String parseUserIdFromToken(String token) {
        if (token.startsWith(BEARER_PREFIX)) {
            token = token.substring(BEARER_PREFIX.length());
        }
        // TODO(review): decode the JWT payload and return its subject claim.
        return null;
    }
}
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
@Slf4j
@Service
public class LlmService {

    // Number of characters kept visible at each end of a masked string.
    private static final int VISIBLE_CHARS = 10;

    /**
     * Runs one chat completion and emits structured, context-rich log lines
     * for the start, success, and failure paths of the request.
     *
     * @param userId caller identity, included in every log line
     * @param prompt user prompt forwarded to the model
     * @return the model's response text
     */
    public String chat(String userId, String prompt) {
        final long requestedAt = System.currentTimeMillis();
        // Structured start event: who, how big, which model.
        log.info("LLM request started: userId={}, promptLength={}, model={}",
                userId, prompt.length(), "gpt-4");
        try {
            String response = callOpenAI(prompt);
            // Success event carries latency and usage/cost figures.
            log.info("LLM request succeeded: userId={}, duration={}ms, " +
                    "inputTokens={}, outputTokens={}, cost=${}",
                    userId, System.currentTimeMillis() - requestedAt, 500, 300, 0.015);
            return response;
        } catch (RateLimitException e) {
            // Rate-limit failures log everything needed to reason about quota.
            log.error("LLM rate limit exceeded: userId={}, retryAfter={}s, " +
                    "dailyQuota={}, used={}",
                    userId, e.getRetryAfter(), e.getQuota(), e.getUsed(), e);
            throw e;
        } catch (Exception e) {
            // Generic failures still include business context; the prompt is
            // masked so sensitive content never reaches the log store.
            log.error("LLM request failed: userId={}, prompt={}, error={}",
                    userId, maskSensitiveData(prompt), e.getMessage(), e);
            throw e;
        }
    }

    /**
     * Masks a potentially sensitive string for logging: anything of 20 chars
     * or fewer (or null) collapses to "***"; longer values keep only the
     * first and last {@value #VISIBLE_CHARS} characters.
     */
    private String maskSensitiveData(String data) {
        if (data == null || data.length() <= 20) {
            return "***";
        }
        return new StringBuilder(VISIBLE_CHARS * 2 + 3)
                .append(data, 0, VISIBLE_CHARS)
                .append("...")
                .append(data, data.length() - VISIBLE_CHARS, data.length())
                .toString();
    }
}
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
/**
 * Jackson serializer that masks sensitive String fields on output.
 * Values of 4 characters or fewer (but non-null) are fully replaced by "***";
 * longer values keep only the first and last two characters, with '*' filling
 * the middle. Null values are written as JSON null.
 */
public class SensitiveDataSerializer extends JsonSerializer<String> {

    @Override
    public void serialize(String value, JsonGenerator gen, SerializerProvider serializers)
            throws IOException {
        if (value == null) {
            gen.writeNull();
            return;
        }
        final int len = value.length();
        if (len <= 4) {
            // Too short to partially reveal without leaking most of it.
            gen.writeString("***");
            return;
        }
        StringBuilder masked = new StringBuilder(len);
        masked.append(value, 0, 2);
        for (int i = 0; i < len - 4; i++) {
            masked.append('*');
        }
        masked.append(value, len - 2, len);
        gen.writeString(masked.toString());
    }
}
/**
 * Request DTO demonstrating field-level masking via {@code SensitiveDataSerializer}.
 * Lombok {@code @Data} generates getters/setters/equals/hashCode/toString.
 */
@Data
public class UserRequest {

    // Plain identifier; not treated as sensitive here.
    private String userId;

    @JsonSerialize(using = SensitiveDataSerializer.class)
    private String apiKey; // serialized as e.g.: "sk********************yz"

    @JsonSerialize(using = SensitiveDataSerializer.class)
    private String prompt; // serialized as e.g.: "请**********密码"
}
日志输出示例(JSON格式):
{
"timestamp": "2026-03-10T14:23:15.123Z",
"level": "INFO",
"thread": "http-nio-8080-exec-1",
"logger": "com.example.LlmService",
"message": "LLM request succeeded",
"traceId": "a1b2c3d4e5f6",
"userId": "u12345",
"requestPath": "/api/v1/chat",
"duration": 1234,
"inputTokens": 500,
"outputTokens": 300,
"cost": 0.015,
"modelName": "gpt-4",
"service": "llm-gateway",
"env": "production"
}
Metrics 提供系统运行时的量化数据,是实时监控和告警的基础。
Maven 依赖:
<dependencies>
<!-- Spring Boot Actuator -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<!-- Micrometer Prometheus -->
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
</dependencies>
application.yml 配置:
management:
endpoints:
web:
exposure:
include: health,info,metrics,prometheus
metrics:
export:
prometheus:
enabled: true
tags:
application: ${spring.application.name}
env: ${spring.profiles.active}
distribution:
percentiles-histogram:
http.server.requests: true
percentiles:
http.server.requests: 0.5, 0.95, 0.99
sla:
http.server.requests: 100ms,500ms,1s,5s
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import io.micrometer.core.instrument.Gauge;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Micrometer-based metrics collector for LLM traffic: request counts, latency
 * timers, token/cost counters, cache hit ratio, and an in-flight gauge.
 *
 * NOTE(fix): the original class combined {@code @RequiredArgsConstructor} with
 * an explicit constructor taking the same final field, which makes Lombok
 * generate a duplicate constructor and fails to compile. The Lombok annotation
 * has been removed; the explicit constructor is required anyway because it
 * registers the gauge.
 */
@Component
public class LlmMetrics {

    private final MeterRegistry meterRegistry;

    // Backing value for the in-flight gauge; incremented on start, decremented
    // on success/failure.
    private final AtomicInteger activeRequests = new AtomicInteger(0);

    public LlmMetrics(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
        // Gauge: samples the current value on every scrape.
        Gauge.builder("llm.requests.active", activeRequests, AtomicInteger::get)
                .description("Current active LLM requests")
                .register(meterRegistry);
    }

    /**
     * Marks the start of one request: bumps the in-flight gauge and the total
     * request counter, and returns a timer sample to be stopped on completion.
     *
     * @param userId accepted for API compatibility but deliberately NOT used
     *               as a tag: a per-user tag creates one time series per user
     *               (unbounded cardinality), and the documented Prometheus
     *               output for llm_requests_total carries no user label.
     */
    public Timer.Sample startRequest(String model, String userId) {
        activeRequests.incrementAndGet();
        Counter.builder("llm.requests.total")
                .tag("model", model)
                .description("Total LLM requests")
                .register(meterRegistry)
                .increment();
        return Timer.start(meterRegistry);
    }

    /**
     * Records a successful completion: stops the latency timer and accumulates
     * token usage and cost counters.
     */
    public void recordSuccess(Timer.Sample sample, String model,
                              int inputTokens, int outputTokens, double cost) {
        activeRequests.decrementAndGet();
        // Latency, tagged by model and outcome.
        sample.stop(Timer.builder("llm.request.duration")
                .tag("model", model)
                .tag("status", "success")
                .description("LLM request duration")
                .register(meterRegistry));
        // Token consumption, split by direction via the "type" tag.
        Counter.builder("llm.tokens.consumed")
                .tag("model", model)
                .tag("type", "input")
                .description("Input tokens consumed")
                .register(meterRegistry)
                .increment(inputTokens);
        Counter.builder("llm.tokens.consumed")
                .tag("model", model)
                .tag("type", "output")
                .description("Output tokens consumed")
                .register(meterRegistry)
                .increment(outputTokens);
        // Running cost in USD.
        Counter.builder("llm.cost.total")
                .tag("model", model)
                .description("Total cost in USD")
                .register(meterRegistry)
                .increment(cost);
    }

    /**
     * Records a failed request: stops the latency timer with status=failure
     * and bumps the error counter, tagged with a bounded error type
     * (e.g. "rate_limit", "timeout").
     */
    public void recordFailure(Timer.Sample sample, String model, String errorType) {
        activeRequests.decrementAndGet();
        sample.stop(Timer.builder("llm.request.duration")
                .tag("model", model)
                .tag("status", "failure")
                .register(meterRegistry));
        Counter.builder("llm.errors.total")
                .tag("model", model)
                .tag("error_type", errorType)
                .description("Total LLM errors")
                .register(meterRegistry)
                .increment();
    }

    /**
     * Records one cache lookup outcome; hit rate is derived in PromQL from
     * the ratio of result="hit" to the total.
     */
    public void recordCacheHit(boolean hit) {
        Counter.builder("llm.cache.requests")
                .tag("result", hit ? "hit" : "miss")
                .description("Cache hit/miss count")
                .register(meterRegistry)
                .increment();
    }
}
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
 * LlmService variant instrumented with LlmMetrics: every request is timed,
 * counted, and attributed with token usage, cost, and cache outcome.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class MonitoredLlmService {

    private final LlmMetrics metrics;
    private final OpenAiClient openAiClient;
    private final CacheService cacheService;

    /**
     * Serves a chat request with full metrics coverage: cache lookup first,
     * then the LLM call; success and each failure class are recorded.
     */
    public String chat(String userId, String prompt, String model) {
        Timer.Sample sample = metrics.startRequest(model, userId);
        try {
            String cacheKey = generateCacheKey(prompt, model);
            String cachedResponse = cacheService.get(cacheKey);
            if (cachedResponse != null) {
                metrics.recordCacheHit(true);
                // FIX: the original logged the raw prompt here, leaking user
                // content into the log store — log only its length instead.
                log.info("Cache hit: promptLength={}", prompt.length());
                return cachedResponse;
            }
            metrics.recordCacheHit(false);
            ChatResponse response = openAiClient.chat(prompt, model);
            metrics.recordSuccess(
                    sample,
                    model,
                    response.getUsage().getPromptTokens(),
                    response.getUsage().getCompletionTokens(),
                    calculateCost(response.getUsage(), model)
            );
            cacheService.put(cacheKey, response.getContent());
            return response.getContent();
        } catch (RateLimitException e) {
            metrics.recordFailure(sample, model, "rate_limit");
            throw e;
        } catch (Exception e) {
            metrics.recordFailure(sample, model, "unknown");
            throw e;
        }
    }

    /**
     * Estimates the dollar cost of one completion from token usage.
     * Rates are per 1K tokens.
     *
     * TODO(review): the {@code model} parameter is currently ignored — all
     * models are billed at GPT-4 rates. Replace with a per-model rate table
     * sourced from configuration.
     */
    private double calculateCost(Usage usage, String model) {
        double inputCostPer1k = 0.03;   // GPT-4 input rate, USD / 1K tokens
        double outputCostPer1k = 0.06;  // GPT-4 output rate, USD / 1K tokens
        return (usage.getPromptTokens() * inputCostPer1k / 1000) +
               (usage.getCompletionTokens() * outputCostPer1k / 1000);
    }
}
访问 http://localhost:8080/actuator/prometheus 可以看到:
# HELP llm_requests_total Total LLM requests
# TYPE llm_requests_total counter
llm_requests_total{application="llm-gateway",env="prod",model="gpt-4"} 15234.0
# HELP llm_request_duration_seconds LLM request duration
# TYPE llm_request_duration_seconds summary
llm_request_duration_seconds{application="llm-gateway",model="gpt-4",status="success",quantile="0.5"} 1.234
llm_request_duration_seconds{application="llm-gateway",model="gpt-4",status="success",quantile="0.95"} 3.456
llm_request_duration_seconds{application="llm-gateway",model="gpt-4",status="success",quantile="0.99"} 5.678
# HELP llm_tokens_consumed_total Input tokens consumed
# TYPE llm_tokens_consumed_total counter
llm_tokens_consumed_total{model="gpt-4",type="input"} 1500000.0
llm_tokens_consumed_total{model="gpt-4",type="output"} 800000.0
# HELP llm_cost_total_dollars Total cost in USD
# TYPE llm_cost_total_dollars counter
llm_cost_total_dollars{model="gpt-4"} 123.45
# HELP llm_cache_requests_total Cache hit/miss count
# TYPE llm_cache_requests_total counter
llm_cache_requests_total{result="hit"} 8500.0
llm_cache_requests_total{result="miss"} 6734.0
# HELP llm_requests_active Current active LLM requests
# TYPE llm_requests_active gauge
llm_requests_active{application="llm-gateway"} 12.0
分布式链路追踪帮助我们理解请求在系统中的完整流转路径。
Maven 依赖:
<dependencies>
<!-- OpenTelemetry -->
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-api</artifactId>
</dependency>
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-sdk</artifactId>
</dependency>
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-exporter-jaeger</artifactId>
</dependency>
<!-- Spring Boot Starter -->
<dependency>
<groupId>io.opentelemetry.instrumentation</groupId>
<artifactId>opentelemetry-spring-boot-starter</artifactId>
</dependency>
</dependencies>
application.yml 配置:
otel:
service:
name: llm-gateway
exporter:
jaeger:
endpoint: http://localhost:14250
traces:
sampler:
probability: 0.1 # 采样率 10%
metrics:
exporter: prometheus
Spring Boot 自动为以下组件创建 Span:
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.context.Scope;
import io.opentelemetry.api.trace.SpanKind;
import io.opentelemetry.api.trace.StatusCode;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
@Service
@RequiredArgsConstructor
public class TracedLlmService {

    private final Tracer tracer;
    private final OpenAiClient openAiClient;

    // Prompts longer than this are truncated before being sent to the model.
    private static final int MAX_PROMPT_LENGTH = 4000;

    /**
     * Handles one chat request under a manually managed SERVER span.
     * Child operations (preprocess, API call, postprocess) each run in their
     * own span parented via {@code makeCurrent()}.
     */
    public String chat(String prompt, String model) {
        Span parentSpan = tracer.spanBuilder("llm.chat")
                .setSpanKind(SpanKind.SERVER)
                .startSpan();
        try (Scope scope = parentSpan.makeCurrent()) {
            parentSpan.setAttribute("llm.model", model);
            parentSpan.setAttribute("llm.prompt.length", prompt.length());
            String processedPrompt = preprocessPrompt(prompt);
            String response = callLlmApi(processedPrompt, model);
            String finalResponse = postprocessResponse(response);
            parentSpan.setStatus(StatusCode.OK);
            return finalResponse;
        } catch (Exception e) {
            parentSpan.setStatus(StatusCode.ERROR, e.getMessage());
            parentSpan.recordException(e);
            throw e;
        } finally {
            // end() must run even on exception, or the span never exports.
            parentSpan.end();
        }
    }

    /**
     * Preprocess step (trim + truncate) in its own INTERNAL span.
     *
     * BUG FIX: the original computed the truncation bound from the UNTRIMMED
     * prompt ({@code prompt.trim().substring(0, Math.min(prompt.length(), 4000))}),
     * which throws StringIndexOutOfBoundsException whenever trim() shortens a
     * prompt of fewer than 4000 characters. The bound now comes from the
     * trimmed string itself.
     */
    private String preprocessPrompt(String prompt) {
        Span span = tracer.spanBuilder("llm.preprocess")
                .setSpanKind(SpanKind.INTERNAL)
                .startSpan();
        try (Scope scope = span.makeCurrent()) {
            String trimmed = prompt.trim();
            String processed = trimmed.substring(0, Math.min(trimmed.length(), MAX_PROMPT_LENGTH));
            span.setAttribute("prompt.original.length", prompt.length());
            span.setAttribute("prompt.processed.length", processed.length());
            span.setStatus(StatusCode.OK);
            return processed;
        } finally {
            span.end();
        }
    }

    /**
     * Outbound LLM API call in a CLIENT span, annotated with provider, model,
     * token usage, measured latency, and estimated cost. Rate-limit failures
     * record the retry-after hint as a span attribute.
     */
    private String callLlmApi(String prompt, String model) {
        Span span = tracer.spanBuilder("llm.api.call")
                .setSpanKind(SpanKind.CLIENT)
                .startSpan();
        try (Scope scope = span.makeCurrent()) {
            long startTime = System.currentTimeMillis();
            ChatResponse response = openAiClient.chat(prompt, model);
            long duration = System.currentTimeMillis() - startTime;
            span.setAttribute("llm.provider", "openai");
            span.setAttribute("llm.model", model);
            span.setAttribute("llm.tokens.input", response.getUsage().getPromptTokens());
            span.setAttribute("llm.tokens.output", response.getUsage().getCompletionTokens());
            span.setAttribute("llm.latency.ms", duration);
            span.setAttribute("llm.cost.usd", calculateCost(response.getUsage(), model));
            span.setStatus(StatusCode.OK);
            return response.getContent();
        } catch (RateLimitException e) {
            span.setAttribute("error.type", "rate_limit");
            span.setAttribute("retry.after.seconds", e.getRetryAfter());
            span.setStatus(StatusCode.ERROR, "Rate limit exceeded");
            span.recordException(e);
            throw e;
        } finally {
            span.end();
        }
    }

    /**
     * Postprocess step (markdown formatting / content filtering) in its own
     * INTERNAL span.
     */
    private String postprocessResponse(String response) {
        Span span = tracer.spanBuilder("llm.postprocess")
                .setSpanKind(SpanKind.INTERNAL)
                .startSpan();
        try (Scope scope = span.makeCurrent()) {
            String processed = formatMarkdown(response);
            span.setAttribute("response.length", processed.length());
            span.setStatus(StatusCode.OK);
            return processed;
        } finally {
            span.end();
        }
    }
}
import io.opentelemetry.instrumentation.annotations.WithSpan;
import io.opentelemetry.instrumentation.annotations.SpanAttribute;
import org.springframework.stereotype.Service;
@Service
public class AnnotatedLlmService {

    /**
     * {@code @WithSpan} wraps the method in an automatically managed span;
     * each {@code @SpanAttribute} parameter is attached as a span attribute.
     */
    @WithSpan("llm.chat")
    public String chat(
            @SpanAttribute("user.id") String userId,
            @SpanAttribute("llm.prompt") String prompt,
            @SpanAttribute("llm.model") String model) {
        // BUG FIX: the original passed the prompt String where callOpenAI
        // expects the prompt LENGTH (int) — a compile error.
        // NOTE(review): with proxy-based instrumentation, a self-invocation
        // like this may not produce a nested span — verify against the
        // project's OTel setup.
        return callOpenAI(prompt.length(), model);
    }

    /**
     * Annotated outbound call. OpenTelemetry automatically:
     * 1. creates the span, 2. records annotated params as attributes,
     * 3. captures thrown exceptions, 4. times the call, 5. ends the span.
     */
    @WithSpan("llm.api.call")
    public String callOpenAI(
            @SpanAttribute("llm.prompt.length") int promptLength,
            @SpanAttribute("llm.model") String model) {
        return performApiCall(promptLength, model);
    }
}
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.context.Context;
import io.opentelemetry.context.propagation.TextMapGetter;
import io.opentelemetry.context.propagation.TextMapPropagator;
import org.springframework.http.HttpHeaders;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;
/**
 * Propagates the current trace context across service boundaries by injecting
 * W3C {@code traceparent}/{@code tracestate} headers into outgoing HTTP calls.
 */
@Component
public class TracePropagationClient {

    private final RestTemplate restTemplate;
    private final TextMapPropagator propagator;

    /**
     * FIX: the original declared both fields {@code final} with no initializing
     * constructor (and no Lombok annotation), which does not compile. This
     * constructor doubles as the Spring injection point.
     */
    public TracePropagationClient(RestTemplate restTemplate, TextMapPropagator propagator) {
        this.restTemplate = restTemplate;
        this.propagator = propagator;
    }

    /**
     * POSTs {@code payload} to {@code url} with the current span context
     * injected into the headers, so the downstream service can continue the
     * same trace (extracting traceId/spanId on its side).
     */
    public String callDownstreamService(String url, String payload) {
        HttpHeaders headers = new HttpHeaders();
        // Serialize the active context into carrier headers (traceparent etc.).
        propagator.inject(Context.current(), headers, (carrier, key, value) -> {
            carrier.add(key, value);
        });
        return restTemplate.postForObject(url,
                new HttpEntity<>(payload, headers),
                String.class);
    }
}
HTTP Headers 示例:
traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01
tracestate: congo=t61rcWkgMzE
启动 Jaeger:
docker run -d --name jaeger \
  -p 16686:16686 \
  -p 14250:14250 \
  jaegertracing/all-in-one:latest
访问 http://localhost:16686 ,可以看到:
[========= llm.chat (150ms) =========]
|
+-- [== llm.preprocess (5ms) ==]
|
+-- [========= llm.api.call (120ms) =========]
| |
| +-- HTTP POST to api.openai.com (118ms)
|
+-- [== llm.postprocess (10ms) ==]
Grafana 将 Metrics 和 Traces 可视化,提供实时监控能力。
prometheus.yml:
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'llm-gateway'
metrics_path: '/actuator/prometheus'
static_configs:
- targets: ['localhost:8080']
labels:
service: 'llm-gateway'
env: 'production'
Dashboard JSON 配置:
{
"dashboard": {
"title": "LLM Gateway Monitoring",
"panels": [
{
"id": 1,
"title": "QPS (Queries Per Second)",
"type": "graph",
"targets": [
{
"expr": "rate(llm_requests_total[5m])",
"legendFormat": "{{model}}"
}
],
"yaxes": [
{"label": "requests/sec", "format": "short"}
]
},
{
"id": 2,
"title": "Request Latency (P50/P95/P99)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, rate(llm_request_duration_seconds_bucket[5m]))",
"legendFormat": "P50"
},
{
"expr": "histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[5m]))",
"legendFormat": "P95"
},
{
"expr": "histogram_quantile(0.99, rate(llm_request_duration_seconds_bucket[5m]))",
"legendFormat": "P99"
}
],
"yaxes": [
{"label": "seconds", "format": "s"}
],
"alert": {
"conditions": [
{
"evaluator": {"type": "gt", "params": [10]},
"query": {"params": ["P99", "5m", "now"]},
"reducer": {"type": "avg"}
}
],
"message": "P99 latency > 10s for LLM requests"
}
},
{
"id": 3,
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "rate(llm_errors_total[5m]) / rate(llm_requests_total[5m]) * 100",
"legendFormat": "{{error_type}}"
}
],
"yaxes": [
{"label": "error %", "format": "percent"}
],
"alert": {
"conditions": [
{
"evaluator": {"type": "gt", "params": [5]},
"message": "Error rate > 5%"
}
]
}
},
{
"id": 4,
"title": "Token Usage",
"type": "graph",
"targets": [
{
"expr": "rate(llm_tokens_consumed_total{type='input'}[1h])",
"legendFormat": "Input Tokens/h"
},
{
"expr": "rate(llm_tokens_consumed_total{type='output'}[1h])",
"legendFormat": "Output Tokens/h"
}
],
"yaxes": [
{"label": "tokens/hour", "format": "short"}
]
},
{
"id": 5,
"title": "Daily Cost",
"type": "singlestat",
"targets": [
{
"expr": "increase(llm_cost_total_dollars[24h])"
}
],
"format": "currencyUSD",
"thresholds": "50,100",
"colors": ["green", "yellow", "red"],
"alert": {
"conditions": [
{
"evaluator": {"type": "gt", "params": [100]},
"message": "Daily cost exceeded $100"
}
]
}
},
{
"id": 6,
"title": "Cache Hit Rate",
"type": "graph",
"targets": [
{
"expr": "rate(llm_cache_requests_total{result='hit'}[5m]) / rate(llm_cache_requests_total[5m]) * 100",
"legendFormat": "Hit Rate %"
}
],
"yaxes": [
{"label": "hit rate %", "format": "percent"}
]
},
{
"id": 7,
"title": "Active Requests",
"type": "graph",
"targets": [
{
"expr": "llm_requests_active",
"legendFormat": "Active"
}
],
"yaxes": [
{"label": "count", "format": "short"}
]
},
{
"id": 8,
"title": "Model Distribution",
"type": "piechart",
"targets": [
{
"expr": "sum by (model) (rate(llm_requests_total[1h]))",
"legendFormat": "{{model}}"
}
]
}
],
"refresh": "10s",
"time": {
"from": "now-1h",
"to": "now"
}
}
}
# 1. QPS (每秒请求数)
rate(llm_requests_total[5m])
# 2. 平均响应时间
rate(llm_request_duration_seconds_sum[5m]) / rate(llm_request_duration_seconds_count[5m])
# 3. P95 延迟
histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[5m]))
# 4. 错误率
rate(llm_errors_total[5m]) / rate(llm_requests_total[5m])
# 5. 每小时成本
increase(llm_cost_total_dollars[1h])
# 6. 缓存命中率
sum(rate(llm_cache_requests_total{result="hit"}[5m])) /
sum(rate(llm_cache_requests_total[5m]))
# 7. Token 消耗速率
sum(rate(llm_tokens_consumed_total[1h])) by (type)
# 8. 按模型分组的 QPS
sum by (model) (rate(llm_requests_total[5m]))
# 9. 慢请求数量 (>5s)
sum(rate(llm_request_duration_seconds_bucket{le="5"}[5m])) -
sum(rate(llm_request_duration_seconds_bucket{le="10"}[5m]))
# 10. 同比上周 QPS 变化
(rate(llm_requests_total[5m]) - rate(llm_requests_total[5m] offset 7d)) /
rate(llm_requests_total[5m] offset 7d) * 100
# 颜色方案
thresholds:
- value: 0
color: green # 正常
- value: 0.8
color: yellow # 警告
- value: 0.95
color: red # 严重
# 时间范围
time_ranges:
- realtime: 5m # 实时监控
- recent: 1h # 近期趋势
- daily: 24h # 每日总览
- weekly: 7d # 周度对比
# 刷新频率
refresh_interval:
- critical: 10s # 关键指标
- normal: 30s # 常规指标
- cost: 5m # 成本统计
自动化告警帮助团队及时发现和响应问题。
alert_rules.yml:
groups:
- name: llm_gateway_alerts
interval: 30s
rules:
# 1. 错误率告警
- alert: HighErrorRate
expr: |
(
rate(llm_errors_total[5m]) /
rate(llm_requests_total[5m])
) * 100 > 5
for: 5m
labels:
severity: critical
team: llm-platform
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | printf "%.2f" }}% for model {{ $labels.model }}"
runbook: "https://wiki.company.com/runbooks/high-error-rate"
# 2. 高延迟告警
- alert: HighLatency
expr: |
histogram_quantile(0.99,
rate(llm_request_duration_seconds_bucket[5m])
) > 10
for: 5m
labels:
severity: warning
team: llm-platform
annotations:
summary: "P99 latency too high"
description: "P99 latency is {{ $value }}s for model {{ $labels.model }}"
# 3. 成本超预算告警
- alert: DailyCostExceeded
expr: |
increase(llm_cost_total_dollars[24h]) > 100
labels:
severity: warning
team: finance
annotations:
summary: "Daily cost budget exceeded"
description: "Current daily cost: ${{ $value }}, Budget: $100"
# 4. 缓存命中率下降
- alert: LowCacheHitRate
expr: |
(
rate(llm_cache_requests_total{result="hit"}[10m]) /
rate(llm_cache_requests_total[10m])
) < 0.3
for: 10m
labels:
severity: info
team: llm-platform
annotations:
summary: "Cache hit rate below 30%"
description: "Consider reviewing cache strategy"
# 5. 服务不可用
- alert: ServiceDown
expr: up{job="llm-gateway"} == 0
for: 1m
labels:
severity: critical
team: sre
annotations:
summary: "LLM Gateway service is down"
description: "Service {{ $labels.instance }} is unreachable"
# 6. Token 配额告警
- alert: TokenQuotaNearLimit
expr: |
(
rate(llm_tokens_consumed_total[1h]) * 24
) > 5000000 * 0.8
labels:
severity: warning
team: llm-platform
annotations:
summary: "Daily token quota at 80%"
description: "Current usage: {{ $value }} tokens/day"
# 7. 速率限制频繁
- alert: FrequentRateLimits
expr: |
rate(llm_errors_total{error_type="rate_limit"}[5m]) > 1
for: 5m
labels:
severity: warning
team: llm-platform
annotations:
summary: "Frequent rate limit errors"
description: "Consider increasing quota or implementing backoff"
# 8. 活跃请求堆积
- alert: RequestQueueBacklog
expr: llm_requests_active > 100
for: 5m
labels:
severity: warning
team: llm-platform
annotations:
summary: "Request queue backlog detected"
description: "{{ $value }} active requests in queue"
alertmanager.yml:
global:
resolve_timeout: 5m
slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
route:
receiver: 'default'
group_by: ['alertname', 'severity']
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
routes:
# Critical 告警 -> PagerDuty
- match:
severity: critical
receiver: pagerduty
continue: true
# Warning 告警 -> Slack
- match:
severity: warning
receiver: slack-warnings
# Info 告警 -> Email
- match:
severity: info
receiver: email
receivers:
# PagerDuty 配置
- name: 'pagerduty'
pagerduty_configs:
- service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
description: '{{ .GroupLabels.alertname }}: {{ .Annotations.summary }}'
details:
firing: '{{ .Alerts.Firing | len }}'
description: '{{ .Annotations.description }}'
runbook: '{{ .Annotations.runbook }}'
# Slack 配置
- name: 'slack-warnings'
slack_configs:
- channel: '#llm-alerts'
username: 'Prometheus'
icon_emoji: ':warning:'
title: '{{ .GroupLabels.alertname }}'
text: |
*Summary:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Severity:* {{ .GroupLabels.severity }}
<{{ .GeneratorURL }}|View in Prometheus>
send_resolved: true
# Email 配置
- name: 'email'
email_configs:
- to: 'llm-team@company.com'
from: 'alerts@company.com'
smarthost: 'smtp.gmail.com:587'
auth_username: 'alerts@company.com'
auth_password: 'YOUR_PASSWORD'
headers:
Subject: '[{{ .GroupLabels.severity }}] {{ .GroupLabels.alertname }}'
inhibit_rules:
# 服务宕机时,抑制其他告警
- source_match:
alertname: 'ServiceDown'
target_match_re:
alertname: '.*'
equal: ['instance']
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestTemplate;
/**
 * In-application alert dispatch: pushes notifications to Slack (incoming
 * webhook) and PagerDuty (Events v2 API) via the injected RestTemplate.
 */
@Service
@RequiredArgsConstructor
public class AlertService {

    private final RestTemplate restTemplate;

    /**
     * Posts an alert to the team Slack channel, colored by severity.
     */
    public void sendSlackAlert(String title, String message, AlertLevel level) {
        String webhookUrl = "https://hooks.slack.com/services/YOUR/WEBHOOK";
        // Attachment carries the colored body; the top-level text is the title.
        Attachment body = Attachment.builder()
                .color(level.getColor())
                .title(title)
                .text(message)
                .timestamp(Instant.now().getEpochSecond())
                .build();
        SlackMessage notification = SlackMessage.builder()
                .text(title)
                .attachments(List.of(body))
                .build();
        restTemplate.postForEntity(webhookUrl, notification, String.class);
    }

    /**
     * Raises (or re-triggers) a PagerDuty incident keyed by {@code incidentKey}.
     */
    public void triggerPagerDuty(String incidentKey, String description, String severity) {
        String apiUrl = "https://events.pagerduty.com/v2/enqueue";
        Payload details = Payload.builder()
                .summary(description)
                .severity(severity)
                .source("llm-gateway")
                .customDetails(Map.of(
                        "incident_key", incidentKey,
                        "timestamp", Instant.now().toString()
                ))
                .build();
        PagerDutyEvent event = PagerDutyEvent.builder()
                .routingKey("YOUR_ROUTING_KEY")
                .eventAction("trigger")
                .payload(details)
                .build();
        restTemplate.postForEntity(apiUrl, event, String.class);
    }
}
/**
 * Alert severity levels, each mapped to the Slack attachment color keyword
 * used by {@code AlertService} when rendering the notification.
 */
enum AlertLevel {
    INFO("good"),       // green in Slack
    WARNING("warning"), // yellow in Slack
    ERROR("danger");    // red in Slack

    /** Slack attachment color keyword for this severity. */
    private final String color;

    AlertLevel(String color) {
        this.color = color;
    }

    /** @return the Slack color keyword associated with this level */
    public String getColor() {
        return color;
    }
}
┌─────────────────────────────────────────────────────────────────┐
│ LLM Gateway Application │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Logback │ │ Micrometer │ │ OpenTelemetry│ │
│ │ (Logs) │ │ (Metrics) │ │ (Traces) │ │
│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │
└─────────┼──────────────────┼──────────────────┼──────────────────┘
│ │ │
│ JSON logs │ /actuator/ │ OTLP
│ │ prometheus │
▼ ▼ ▼
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ Elasticsearch│ │ Prometheus │ │ Jaeger │
│ (ELK) │ │ │ │ │
│ │ │ Time-series │ │ Trace DB │
│ Log Storage │ │ Metrics │ │ │
└──────┬───────┘ └──────┬───────┘ └──────┬───────┘
│ │ │
│ │ PromQL │ TraceQL
│ │ │
│ ▼ │
│ ┌──────────────┐ │
│ │ AlertManager │ │
│ │ │ │
│ │ Rule Engine │ │
│ └──────┬───────┘ │
│ │ │
│ │ Alerts │
│ ▼ │
│ ┌──────────────┐ │
└─────────►│ Grafana │◄─────────┘
│ │
│ Dashboards │
│ + Alerts │
└──────┬───────┘
│
▼
┌──────────────┐
│ Notification │
│ │
│ Slack/Email/ │
│ PagerDuty │
└──────────────┘
1. Logs 流向:
Application
└─> Logback (JSON format)
└─> Filebeat / Fluentd
└─> Elasticsearch
└─> Kibana (查询分析)
└─> Grafana (可视化)
2. Metrics 流向:
Application
└─> Micrometer
└─> /actuator/prometheus endpoint
└─> Prometheus (scrape)
└─> AlertManager (告警)
└─> Grafana (可视化)
3. Traces 流向:
Application
└─> OpenTelemetry SDK
└─> OTLP Exporter
└─> Jaeger Collector
└─> Jaeger Query
└─> Grafana (Tempo/Jaeger 数据源)
docker-compose.yml:
version: '3.8'
services:
# Prometheus
prometheus:
image: prom/prometheus:latest
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- ./alert_rules.yml:/etc/prometheus/alert_rules.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.enable-lifecycle'
# Grafana
grafana:
image: grafana/grafana:latest
volumes:
- grafana-storage:/var/lib/grafana
- ./grafana/dashboards:/etc/grafana/provisioning/dashboards
- ./grafana/datasources:/etc/grafana/provisioning/datasources
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_ALLOW_SIGN_UP=false
# Jaeger
jaeger:
image: jaegertracing/all-in-one:latest
ports:
- "16686:16686" # UI
- "14250:14250" # gRPC
- "6831:6831/udp" # UDP
environment:
- COLLECTOR_OTLP_ENABLED=true
# Elasticsearch
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:8.10.0
environment:
- discovery.type=single-node
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
ports:
- "9200:9200"
volumes:
- es-data:/usr/share/elasticsearch/data
# Kibana
kibana:
image: docker.elastic.co/kibana/kibana:8.10.0
ports:
- "5601:5601"
depends_on:
- elasticsearch
# AlertManager
alertmanager:
image: prom/alertmanager:latest
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
ports:
- "9093:9093"
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
volumes:
grafana-storage:
es-data:
启动命令:
docker-compose up -d
访问地址:Prometheus http://localhost:9090 、Grafana http://localhost:3000 、Jaeger UI http://localhost:16686 、Kibana http://localhost:5601 、AlertManager http://localhost:9093
任务:为一个聊天接口添加完整的 Logs/Metrics/Traces。
/**
 * Chat endpoint instrumented with all three observability pillars:
 * structured logs with MDC context, Micrometer metrics, and an OTel span.
 */
@RestController
@RequestMapping("/api/v1")
@RequiredArgsConstructor
@Slf4j
public class ChatController {

    private final LlmService llmService;
    private final LlmMetrics metrics;
    private final Tracer tracer;

    @PostMapping("/chat")
    @WithSpan("api.chat")
    public ResponseEntity<ChatResponse> chat(
            @RequestHeader("X-User-Id") String userId,
            @RequestBody ChatRequest request) {
        // 1. MDC context. FIX: the MDC key must be "modelName" to match the
        //    logback <includeMdcKeyName>modelName</includeMdcKeyName>
        //    configuration — the original used "model", which the JSON encoder
        //    would silently drop.
        MDC.put("userId", userId);
        MDC.put("modelName", request.getModel());
        // 2. Start the latency timer and request counter.
        Timer.Sample sample = metrics.startRequest(request.getModel(), userId);
        // 3. Structured start log (prompt content itself is never logged).
        log.info("Chat request received: promptLength={}", request.getPrompt().length());
        // 4. Manual child span around the processing step.
        Span span = tracer.spanBuilder("chat.process").startSpan();
        try (Scope scope = span.makeCurrent()) {
            String response = llmService.chat(userId, request.getPrompt(), request.getModel());
            // TODO(review): 500/300/0.015 are placeholder token/cost figures;
            // surface the real usage from LlmService instead of hard-coding.
            metrics.recordSuccess(sample, request.getModel(), 500, 300, 0.015);
            span.setStatus(StatusCode.OK);
            log.info("Chat request succeeded: responseLength={}", response.length());
            return ResponseEntity.ok(new ChatResponse(response));
        } catch (Exception e) {
            metrics.recordFailure(sample, request.getModel(), e.getClass().getSimpleName());
            span.setStatus(StatusCode.ERROR, e.getMessage());
            span.recordException(e);
            log.error("Chat request failed: error={}", e.getMessage(), e);
            throw e;
        } finally {
            // End the span and clear the MDC on every path so pooled threads
            // don't carry stale context into the next request.
            span.end();
            MDC.clear();
        }
    }
}
验证:
任务:当日成本超过 $50 时发送 Slack 告警。
# alert_rules.yml
- alert: DailyCostWarning
expr: increase(llm_cost_total_dollars[24h]) > 50
for: 5m
labels:
severity: warning
team: finance
annotations:
summary: "Daily LLM cost exceeds $50"
description: 'Current cost: ${{ $value | printf "%.2f" }}'
action: "Review usage patterns in Grafana"
任务:使用 Prometheus 数据分析性能退化。
# 对比本周和上周的 P95 延迟
(
histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[7d]))
-
histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[7d] offset 7d))
) /
histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[7d] offset 7d))
* 100
结果解读:
最后更新:2026-03-09 字数统计:5,000 字 预计阅读时间:40 分钟