爸爸的农家院
56.74M · 2026-03-13
时间:1小时 | 难度:⭐⭐⭐⭐ | Week 4 Day 26
了解各模型的定价是成本优化的第一步。以下是 2026 年 3 月的主流模型价格对比:
| 模型 | 输入价格 ($/1M tokens) | 输出价格 ($/1M tokens) | 性能等级 | 适用场景 |
|---|---|---|---|---|
| GPT-4o | $5.00 | $15.00 | 旗舰级 | 复杂推理、长文本分析 |
| GPT-4o-mini | $0.15 | $0.60 | 经济级 | 简单对话、分类任务 |
| Claude 3.5 Sonnet | $3.00 | $15.00 | 旗舰级 | 代码生成、深度分析 |
| Claude 3 Haiku | $0.25 | $1.25 | 经济级 | 快速响应、简单任务 |
| Gemini 1.5 Pro | $3.50 | $10.50 | 旗舰级 | 多模态、长上下文 |
| Gemini 1.5 Flash | $0.075 | $0.30 | 经济级 | 高频调用、实时场景 |
成本差异分析:
/**
 * Utility component that turns token counts into a monetary cost using per-model pricing.
 *
 * <p>NOTE(review): {@code pricingRepository} is used by {@link #getModelPricing(String)} but is
 * not declared in this class as shown; it has to be supplied (for example as an injected field)
 * before the class compiles.
 */
@Component
public class CostCalculator {

    /** Prices are quoted per this many tokens. */
    private static final BigDecimal TOKENS_PER_PRICE_UNIT = BigDecimal.valueOf(1_000_000);

    /** Number of decimal places kept for each partial (input / output) cost. */
    private static final int COST_SCALE = 8;

    /**
     * Computes the cost of a single call.
     *
     * @param modelName    model whose pricing is looked up
     * @param inputTokens  number of input tokens
     * @param outputTokens number of output tokens
     * @return input cost plus output cost, each rounded half-up to 8 decimal places
     * @throws IllegalArgumentException if no pricing exists for {@code modelName}
     */
    public BigDecimal calculateCost(String modelName, int inputTokens, int outputTokens) {
        ModelPricing pricing = getModelPricing(modelName);
        BigDecimal inputCost = costOf(inputTokens, pricing.getInputPrice());
        BigDecimal outputCost = costOf(outputTokens, pricing.getOutputPrice());
        return inputCost.add(outputCost);
    }

    /**
     * Estimates the cost of a conversation as the cost of one average round multiplied by the
     * number of rounds.
     *
     * @param modelName           model whose pricing is looked up
     * @param averageInputTokens  average input tokens per round
     * @param averageOutputTokens average output tokens per round
     * @param conversationRounds  number of rounds
     * @return estimated total cost
     * @throws IllegalArgumentException if no pricing exists for {@code modelName}
     */
    public BigDecimal estimateConversationCost(
            String modelName,
            int averageInputTokens,
            int averageOutputTokens,
            int conversationRounds) {
        BigDecimal singleRound = calculateCost(modelName, averageInputTokens, averageOutputTokens);
        BigDecimal rounds = BigDecimal.valueOf(conversationRounds);
        return singleRound.multiply(rounds);
    }

    /** Scales a per-million-token price down to the given token count. */
    private static BigDecimal costOf(int tokens, BigDecimal pricePerMillion) {
        return BigDecimal.valueOf(tokens)
                .multiply(pricePerMillion)
                .divide(TOKENS_PER_PRICE_UNIT, COST_SCALE, RoundingMode.HALF_UP);
    }

    /** Looks the pricing entry up through the repository; fails for unknown model names. */
    private ModelPricing getModelPricing(String modelName) {
        return pricingRepository
                .findByModelName(modelName)
                .orElseThrow(() -> new IllegalArgumentException("Unknown model: " + modelName));
    }
}
/**
* JPA entity for one model's pricing, mapped to the {@code model_pricing} table.
* Accessors are generated by Lombok's {@code @Data}.
*/
@Entity
@Table(name = "model_pricing")
@Data
public class ModelPricing {
// Primary key: the model name.
@Id
private String modelName;
@Column(nullable = false)
private BigDecimal inputPrice; // input price per one million tokens
@Column(nullable = false)
private BigDecimal outputPrice; // output price per one million tokens
@Column(nullable = false)
private String performanceTier; // FLAGSHIP, ECONOMY
// Nullable column; presumably the time the price was last refreshed — confirm with the writer.
@Column
private LocalDateTime updatedAt;
}
智能模型路由的核心是根据任务复杂度自动选择最合适的模型:
/**
 * Model router that scores a request's complexity with simple heuristics and picks a model
 * tier (flagship / balanced / economy) from that score.
 */
@Service
@Slf4j
public class ModelRouter {

    /** Scores at or above this value are routed to a flagship model. */
    private static final int FLAGSHIP_THRESHOLD = 70;

    /** Scores at or above this value (and below the flagship threshold) use the balanced tier. */
    private static final int BALANCED_THRESHOLD = 40;

    @Autowired
    private CostCalculator costCalculator;

    /**
     * Computes a complexity score for the request.
     *
     * <p>The score is the sum of four parts, capped at 100: history length (3 points per
     * message, max 30), total text length (1 point per 200 characters, max 25), task type
     * (5, 15 or 25) and a flat 20 points when the last user message asks for code.
     *
     * @param request the chat request to score
     * @return a score between 0 and 100
     */
    public int calculateComplexityScore(ChatRequest request) {
        int score = 0;

        // 1. Message history length (0-30 points).
        score += Math.min(request.getMessages().size() * 3, 30);

        // 2. Total input text length (0-25 points). Uses ChatMessage.text(), like the rest of
        //    this file, and counts a message without text as length 0.
        int totalLength = request.getMessages().stream()
                .mapToInt(msg -> {
                    String text = msg.text();
                    return text == null ? 0 : text.length();
                })
                .sum();
        score += Math.min(totalLength / 200, 25);

        // 3. Task type complexity (0-25 points).
        score += getTaskTypeComplexity(request);

        // 4. Code generation request (0 or 20 points).
        if (containsCodeRequest(request)) {
            score += 20;
        }

        return Math.min(score, 100);
    }

    /**
     * Selects a model name for the request based on its complexity score.
     *
     * @param request the chat request
     * @return the name of the model to use
     */
    public String selectModel(ChatRequest request) {
        int complexity = calculateComplexityScore(request);

        if (complexity >= FLAGSHIP_THRESHOLD) {
            log.info("High complexity ({}), using flagship model", complexity);
            return selectFlagshipModel();
        }
        if (complexity >= BALANCED_THRESHOLD) {
            log.info("Medium complexity ({}), using balanced model", complexity);
            return selectBalancedModel();
        }
        log.info("Low complexity ({}), using economy model", complexity);
        return selectEconomyModel();
    }

    /** Picks the first available flagship model; falls back to {@code gpt-4o}. */
    private String selectFlagshipModel() {
        if (isModelAvailable("gpt-4o")) {
            return "gpt-4o";
        }
        if (isModelAvailable("claude-3-5-sonnet")) {
            return "claude-3-5-sonnet";
        }
        return "gpt-4o"; // default
    }

    /** Picks the first available economy model; falls back to {@code gpt-4o-mini}. */
    private String selectEconomyModel() {
        if (isModelAvailable("gemini-1.5-flash")) {
            return "gemini-1.5-flash";
        }
        if (isModelAvailable("gpt-4o-mini")) {
            return "gpt-4o-mini";
        }
        if (isModelAvailable("claude-3-haiku")) {
            return "claude-3-haiku";
        }
        return "gpt-4o-mini"; // default
    }

    /** The balanced tier currently delegates to the economy selection. */
    private String selectBalancedModel() {
        return selectEconomyModel();
    }

    /** Returns 25, 15 or 5 depending on which keywords the last user message contains. */
    private int getTaskTypeComplexity(ChatRequest request) {
        String lastMessage = lastUserMessageLowerCase(request);

        // High-complexity task keywords.
        if (lastMessage.contains("分析") || lastMessage.contains("评估")
                || lastMessage.contains("比较") || lastMessage.contains("设计")) {
            return 25;
        }
        // Medium-complexity task keywords.
        if (lastMessage.contains("解释") || lastMessage.contains("总结")
                || lastMessage.contains("翻译")) {
            return 15;
        }
        // Everything else is treated as low complexity.
        return 5;
    }

    /** Returns whether the last user message contains one of the code-request keywords. */
    private boolean containsCodeRequest(ChatRequest request) {
        String content = lastUserMessageLowerCase(request);
        return content.contains("代码") || content.contains("code")
                || content.contains("实现") || content.contains("function");
    }

    /**
     * Lower-cases the last user message with {@code Locale.ROOT} so the ASCII keywords
     * ("code", "function") match regardless of the JVM's default locale.
     */
    private String lastUserMessageLowerCase(ChatRequest request) {
        return request.getLastUserMessage().toLowerCase(java.util.Locale.ROOT);
    }

    /** Availability check (quota, rate limits, ...); simplified to always return true. */
    private boolean isModelAvailable(String modelName) {
        return true; // simplified implementation
    }
}
/**
* Model routing configuration, bound from properties under the
* {@code langchain4j.routing} prefix. The field initialisers below are the defaults.
*/
@Configuration
@ConfigurationProperties(prefix = "langchain4j.routing")
@Data
public class ModelRoutingConfig {
/**
* Complexity threshold settings.
*/
private ComplexityThresholds thresholds = new ComplexityThresholds();
/**
* Model preference settings.
*/
private ModelPreferences preferences = new ModelPreferences();
/**
* Cost limit settings.
*/
private CostLimits costLimits = new CostLimits();
/**
* Complexity score thresholds for tier selection.
*/
@Data
public static class ComplexityThresholds {
private int flagship = 70; // minimum complexity for using a flagship model
private int economy = 40; // maximum complexity for using an economy model
}
/**
* Preferred model names per tier plus a list of fallback models.
*/
@Data
public static class ModelPreferences {
private String preferredFlagship = "gpt-4o";
private String preferredEconomy = "gpt-4o-mini";
private List<String> fallbackModels = List.of("claude-3-5-sonnet", "gemini-1.5-pro");
}
/**
* Spending caps, in dollars.
*/
@Data
public static class CostLimits {
private BigDecimal maxCostPerRequest = new BigDecimal("0.50"); // $0.50
private BigDecimal maxCostPerUser = new BigDecimal("10.00"); // $10/day
}
}
配置文件 application.yml:
langchain4j:
  routing:
    thresholds:
      flagship: 70
      economy: 40
    preferences:
      preferred-flagship: gpt-4o
      preferred-economy: gpt-4o-mini
      fallback-models:
        - claude-3-5-sonnet
        - gemini-1.5-pro
    cost-limits:
      max-cost-per-request: 0.50
      max-cost-per-user: 10.00
/**
 * Token budget manager.
 *
 * <p>Tracks spending in Redis hashes and enforces budgets at three levels: per user per day,
 * per user per month, and per organisation per day.
 *
 * <p>Cost is accumulated in the {@code cost_cents} hash field as a (possibly fractional) number
 * of cents. A single LLM call usually costs far less than one cent, so truncating each call to
 * whole cents would record most calls as zero and the budget would never fill up.
 */
@Service
@Slf4j
public class TokenBudgetManager {

    private static final String COST_FIELD = "cost_cents";
    private static final BigDecimal CENTS_PER_DOLLAR = BigDecimal.valueOf(100);

    /** Daily hashes (user and organisation) are kept for 7 days. */
    private static final Duration DAILY_RETENTION = Duration.ofDays(7);

    /** Monthly hashes are kept for 60 days. */
    private static final Duration MONTHLY_RETENTION = Duration.ofDays(60);

    @Autowired
    private RedisTemplate<String, String> redisTemplate;

    @Autowired
    private CostCalculator costCalculator;

    /**
     * Checks whether the user (and, if any, the user's organisation) can afford a call.
     *
     * @param userId          user making the call
     * @param modelName       model that will be called
     * @param estimatedTokens estimated token count; it is priced both as input and as output
     *                        tokens, so passing a combined total over-estimates the cost
     * @return {@code true} if the daily, monthly and organisation budgets all have room
     */
    public boolean checkBudget(String userId, String modelName, int estimatedTokens) {
        // 1. Daily budget.
        if (!checkDailyBudget(userId, modelName, estimatedTokens)) {
            log.warn("Daily budget exceeded for user: {}", userId);
            return false;
        }
        // 2. Monthly budget.
        if (!checkMonthlyBudget(userId, modelName, estimatedTokens)) {
            log.warn("Monthly budget exceeded for user: {}", userId);
            return false;
        }
        // 3. Organisation-level budget.
        String orgId = getUserOrganization(userId);
        if (orgId != null && !checkOrganizationBudget(orgId, modelName, estimatedTokens)) {
            log.warn("Organization budget exceeded for org: {}", orgId);
            return false;
        }
        return true;
    }

    /**
     * Records the token usage and cost of a completed call.
     *
     * @param userId       user who made the call
     * @param modelName    model that was called
     * @param inputTokens  actual input tokens
     * @param outputTokens actual output tokens
     */
    public void recordUsage(String userId, String modelName, int inputTokens, int outputTokens) {
        LocalDate today = LocalDate.now();
        YearMonth currentMonth = YearMonth.now();
        BigDecimal cost = costCalculator.calculateCost(modelName, inputTokens, outputTokens);
        // Keep the fractional part: sub-cent costs must still accumulate.
        double costCents = cost.multiply(CENTS_PER_DOLLAR).doubleValue();

        // Per-user daily usage.
        recordUserUsage(userDailyKey(userId, today), inputTokens, outputTokens, costCents,
                DAILY_RETENTION);

        // Per-user monthly usage.
        recordUserUsage(userMonthlyKey(userId, currentMonth), inputTokens, outputTokens, costCents,
                MONTHLY_RETENTION);

        // Organisation-level daily usage (cost only).
        String orgId = getUserOrganization(userId);
        if (orgId != null) {
            String orgKey = orgDailyKey(orgId, today);
            redisTemplate.opsForHash().increment(orgKey, COST_FIELD, costCents);
            redisTemplate.expire(orgKey, DAILY_RETENTION);
        }

        log.info("Recorded usage for user {}: {} input tokens, {} output tokens, ${}",
                userId, inputTokens, outputTokens, cost);
    }

    /**
     * Returns the user's current daily and monthly spending together with the limits.
     *
     * @param userId user to report on
     * @return the usage snapshot
     */
    public BudgetUsage getUserBudgetUsage(String userId) {
        BudgetLimit limit = getUserBudgetLimit(userId);
        return BudgetUsage.builder()
                .userId(userId)
                .dailyUsed(getUsedCost(userDailyKey(userId, LocalDate.now())))
                .dailyLimit(limit.getDailyBudget())
                .monthlyUsed(getUsedCost(userMonthlyKey(userId, YearMonth.now())))
                .monthlyLimit(limit.getMonthlyBudget())
                .build();
    }

    /** Checks the user's daily budget. */
    private boolean checkDailyBudget(String userId, String modelName, int estimatedTokens) {
        BigDecimal limit = getUserBudgetLimit(userId).getDailyBudget();
        return fitsWithin(userDailyKey(userId, LocalDate.now()), limit, modelName, estimatedTokens);
    }

    /** Checks the user's monthly budget. */
    private boolean checkMonthlyBudget(String userId, String modelName, int estimatedTokens) {
        BigDecimal limit = getUserBudgetLimit(userId).getMonthlyBudget();
        return fitsWithin(userMonthlyKey(userId, YearMonth.now()), limit, modelName, estimatedTokens);
    }

    /** Checks the organisation's daily budget. */
    private boolean checkOrganizationBudget(String orgId, String modelName, int estimatedTokens) {
        BigDecimal limit = getOrganizationBudgetLimit(orgId).getDailyBudget();
        return fitsWithin(orgDailyKey(orgId, LocalDate.now()), limit, modelName, estimatedTokens);
    }

    /** Returns whether already-used cost plus the estimated cost stays within {@code limit}. */
    private boolean fitsWithin(String usageKey, BigDecimal limit, String modelName, int estimatedTokens) {
        BigDecimal usedCost = getUsedCost(usageKey);
        BigDecimal estimatedCost =
                costCalculator.calculateCost(modelName, estimatedTokens, estimatedTokens);
        return usedCost.add(estimatedCost).compareTo(limit) <= 0;
    }

    /** Increments token counters and cost under {@code key}, then refreshes its expiry. */
    private void recordUserUsage(
            String key, int inputTokens, int outputTokens, double costCents, Duration retention) {
        redisTemplate.opsForHash().increment(key, "input_tokens", inputTokens);
        redisTemplate.opsForHash().increment(key, "output_tokens", outputTokens);
        redisTemplate.opsForHash().increment(key, COST_FIELD, costCents);
        redisTemplate.expire(key, retention);
    }

    /** Reads the accumulated cost under {@code key} and converts it from cents to dollars. */
    private BigDecimal getUsedCost(String key) {
        Object stored = redisTemplate.opsForHash().get(key, COST_FIELD);
        if (stored == null) {
            return BigDecimal.ZERO;
        }
        // Parsed as a decimal so both whole-cent and fractional-cent values are accepted.
        return new BigDecimal(stored.toString()).divide(CENTS_PER_DOLLAR);
    }

    private static String userDailyKey(String userId, LocalDate day) {
        return String.format("budget:daily:%s:%s", userId, day);
    }

    private static String userMonthlyKey(String userId, YearMonth month) {
        return String.format("budget:monthly:%s:%s", userId, month);
    }

    private static String orgDailyKey(String orgId, LocalDate day) {
        return String.format("budget:org:daily:%s:%s", orgId, day);
    }

    /** User budget limits; currently hard-coded defaults ($10/day, $200/month). */
    private BudgetLimit getUserBudgetLimit(String userId) {
        return BudgetLimit.builder()
                .dailyBudget(new BigDecimal("10.00"))
                .monthlyBudget(new BigDecimal("200.00"))
                .build();
    }

    /** Organisation budget limits; currently hard-coded defaults ($1000/day, $20000/month). */
    private BudgetLimit getOrganizationBudgetLimit(String orgId) {
        return BudgetLimit.builder()
                .dailyBudget(new BigDecimal("1000.00"))
                .monthlyBudget(new BigDecimal("20000.00"))
                .build();
    }

    /** Organisation lookup; simplified to always return {@code null} (no organisation). */
    private String getUserOrganization(String userId) {
        return null; // simplified implementation
    }
}
/**
* Budget limits: one daily and one monthly amount.
* Accessors and the builder are generated by Lombok.
*/
@Data
@Builder
class BudgetLimit {
// Maximum spend per day (compared against dollar amounts by TokenBudgetManager).
private BigDecimal dailyBudget;
// Maximum spend per month.
private BigDecimal monthlyBudget;
}
/**
 * Snapshot of a user's budget usage: amounts used and limits for the current day and month.
 */
@Data
@Builder
class BudgetUsage {
    private String userId;
    private BigDecimal dailyUsed;
    private BigDecimal dailyLimit;
    private BigDecimal monthlyUsed;
    private BigDecimal monthlyLimit;

    /** @return daily limit minus daily usage (negative when over budget) */
    public BigDecimal getDailyRemaining() {
        return dailyLimit.subtract(dailyUsed);
    }

    /** @return monthly limit minus monthly usage (negative when over budget) */
    public BigDecimal getMonthlyRemaining() {
        return monthlyLimit.subtract(monthlyUsed);
    }

    /**
     * Returns daily usage as a whole-number percentage of the daily limit (fraction truncated).
     *
     * <p>A non-positive limit leaves no budget at all, so it is reported as 100 instead of
     * dividing by zero.
     *
     * @return the usage percentage; may exceed 100 when over budget
     */
    public int getDailyUsagePercentage() {
        if (dailyLimit.signum() <= 0) {
            return 100;
        }
        return dailyUsed.divide(dailyLimit, 4, RoundingMode.HALF_UP)
                .multiply(BigDecimal.valueOf(100))
                .intValue();
    }
}
语义缓存通过向量相似度匹配,复用历史相似问题的答案,大幅减少 API 调用:
/**
 * Semantic cache.
 *
 * <p>Matches a new question against previously answered ones by embedding similarity and
 * reuses the stored answer when the similarity is high enough. Hit and miss counters are kept
 * per day in a Redis hash.
 */
@Service
@Slf4j
public class SemanticCache {

    @Autowired
    private EmbeddingModel embeddingModel;

    @Autowired
    private RedisTemplate<String, String> redisTemplate;

    @Autowired
    private VectorStore vectorStore; // Milvus/Qdrant/Pinecone

    private static final double SIMILARITY_THRESHOLD = 0.92; // minimum similarity for a hit
    private static final Duration CACHE_TTL = Duration.ofHours(24); // entries live 24 hours

    /**
     * Looks for a cached answer to a semantically similar question.
     *
     * <p>Every lookup that does not produce a usable entry is counted as a miss so that
     * {@link #getCacheStats()} reports a meaningful hit rate. Entries older than
     * {@code CACHE_TTL} are ignored even if the nightly cleanup has not removed them yet.
     *
     * @param question  the user's question
     * @param modelName only entries cached for this model are considered
     * @return the cached response, or empty on a miss or on any error
     */
    public Optional<CachedResponse> findCached(String question, String modelName) {
        try {
            // 1. Embed the question.
            Embedding questionEmbedding = embeddingModel.embed(question).content();

            // 2. Similarity search, restricted to the same model.
            List<EmbeddingMatch<CachedResponse>> matches = vectorStore.findRelevant(
                    EmbeddingSearchRequest.builder()
                            .queryEmbedding(questionEmbedding)
                            .maxResults(1)
                            .minScore(SIMILARITY_THRESHOLD)
                            .filter(metadataEntry("model", modelName))
                            .build()
            );

            if (matches.isEmpty()) {
                log.debug("No cache hit for question: {}", question.substring(0, Math.min(50, question.length())));
                recordCacheMiss(modelName);
                return Optional.empty();
            }

            EmbeddingMatch<CachedResponse> bestMatch = matches.get(0);
            CachedResponse cachedResponse = bestMatch.embedded();

            // Enforce the TTL on read; the cleanup job only runs once a day.
            Instant cachedAt = cachedResponse.getCachedAt();
            if (cachedAt != null && cachedAt.isBefore(Instant.now().minus(CACHE_TTL))) {
                log.debug("Ignoring expired cache entry cached at {}", cachedAt);
                recordCacheMiss(modelName);
                return Optional.empty();
            }

            log.info("Cache hit! Similarity: {}, saved cost", bestMatch.score());

            // 3. Count the hit.
            recordCacheHit(modelName);
            return Optional.of(cachedResponse);
        } catch (Exception e) {
            // Best effort: a cache failure must not fail the request.
            log.error("Error searching semantic cache", e);
            return Optional.empty();
        }
    }

    /**
     * Stores an answer in the cache. Failures are logged and otherwise ignored.
     *
     * @param question     the question that was asked
     * @param modelName    model that produced the answer
     * @param answer       the answer text
     * @param inputTokens  input tokens the original call consumed
     * @param outputTokens output tokens the original call consumed
     */
    public void cache(String question, String modelName, String answer, int inputTokens, int outputTokens) {
        try {
            // 1. Embed the question.
            Embedding questionEmbedding = embeddingModel.embed(question).content();

            // 2. Build the cache entry.
            CachedResponse cached = CachedResponse.builder()
                    .question(question)
                    .answer(answer)
                    .modelName(modelName)
                    .inputTokens(inputTokens)
                    .outputTokens(outputTokens)
                    .cachedAt(Instant.now())
                    .build();

            // 3. Store it in the vector database.
            vectorStore.add(
                    EmbeddingWithId.builder()
                            .id(generateCacheId(question, modelName))
                            .embedding(questionEmbedding)
                            .embedded(cached)
                            .build()
            );
            log.debug("Cached response for question: {}", question.substring(0, Math.min(50, question.length())));
        } catch (Exception e) {
            log.error("Error caching response", e);
        }
    }

    /**
     * Pre-populates the cache with a list of common question/answer pairs.
     *
     * @param commonQuestions pairs to cache
     */
    public void warmupCache(List<QAPair> commonQuestions) {
        log.info("Warming up cache with {} common questions", commonQuestions.size());
        commonQuestions.parallelStream().forEach(qa ->
                cache(qa.getQuestion(), qa.getModelName(), qa.getAnswer(),
                        qa.getInputTokens(), qa.getOutputTokens()));
        log.info("Cache warmup completed");
    }

    /**
     * Removes expired entries from the vector database; scheduled daily at 02:00.
     *
     * <p>NOTE(review): the filter uses a {@code cached_at} metadata key, while {@link #cache}
     * as shown stores the timestamp inside the embedded object — confirm the store exposes it
     * as metadata.
     */
    @Scheduled(cron = "0 0 2 * * ?")
    public void cleanupExpiredCache() {
        log.info("Starting cache cleanup");
        Instant expiryTime = Instant.now().minus(CACHE_TTL);
        vectorStore.removeAll(filter(
                metadataKey("cached_at").isLessThan(expiryTime.toEpochMilli())
        ));
        log.info("Cache cleanup completed");
    }

    /**
     * Returns today's hit/miss statistics.
     *
     * @return hits, misses, total and hit rate in percent (0.0 when there were no lookups)
     */
    public CacheStats getCacheStats() {
        String statsKey = dailyStatsKey();
        long hits = getLongValue(statsKey, "hits");
        long misses = getLongValue(statsKey, "misses");
        long total = hits + misses;
        double hitRate = total > 0 ? (hits * 100.0 / total) : 0.0;
        return CacheStats.builder()
                .hits(hits)
                .misses(misses)
                .hitRate(hitRate)
                .totalQueries(total)
                .build();
    }

    private void recordCacheHit(String modelName) {
        incrementDailyStat("hits");
    }

    private void recordCacheMiss(String modelName) {
        incrementDailyStat("misses");
    }

    /** Increments one of today's counters and keeps the hash for 7 days. */
    private void incrementDailyStat(String field) {
        String statsKey = dailyStatsKey();
        redisTemplate.opsForHash().increment(statsKey, field, 1);
        redisTemplate.expire(statsKey, Duration.ofDays(7));
    }

    private static String dailyStatsKey() {
        return "cache:stats:daily:" + LocalDate.now();
    }

    private Long getLongValue(String key, String field) {
        Object value = redisTemplate.opsForHash().get(key, field);
        return value != null ? Long.parseLong(value.toString()) : 0L;
    }

    /** Deterministic entry id: MD5 hex of question followed by model name (not a security use). */
    private String generateCacheId(String question, String modelName) {
        return DigestUtils.md5DigestAsHex((question + modelName).getBytes(StandardCharsets.UTF_8));
    }
}
/**
* A cached response: the question, its answer, the model that produced it,
* the token counts of the original call and the time it was cached.
*/
@Data
@Builder
class CachedResponse {
private String question;
private String answer;
private String modelName;
private int inputTokens;
private int outputTokens;
// Set to Instant.now() by SemanticCache.cache when the entry is created.
private Instant cachedAt;
}
/**
* Cache statistics as produced by SemanticCache.getCacheStats.
*/
@Data
@Builder
class CacheStats {
private Long hits;
private Long misses;
// Hit rate in percent (hits * 100 / total).
private Double hitRate;
// hits + misses.
private Long totalQueries;
}
/**
* Question/answer pair used for cache warm-up.
*/
@Data
class QAPair {
private String question;
private String answer;
private String modelName;
private int inputTokens;
private int outputTokens;
}
langchain4j:
  cache:
    enabled: true
    similarity-threshold: 0.92 # 相似度阈值
    ttl: 24h # 缓存有效期
    max-entries: 100000 # 最大缓存条目数
    warmup:
      enabled: true
      questions-file: classpath:common-questions.json
/**
 * Prompt optimizer.
 *
 * <p>Reduces token consumption by compressing prompts, trimming conversation context,
 * summarising long text, dropping repeated sentences and minifying JSON.
 */
@Service
@Slf4j
public class PromptOptimizer {

    /** Shared mapper; creating an ObjectMapper per call is needlessly expensive. */
    private static final ObjectMapper JSON_MAPPER = new ObjectMapper();

    /** Characters that end a sentence in {@link #removeRedundancy(String)}. */
    private static final String SENTENCE_TERMINATORS = "。!?";

    /**
     * Shortens a system prompt.
     *
     * @param originalPrompt the prompt to shorten
     * @return the prompt with whitespace runs collapsed, "例如:...。" examples removed and a few
     *         phrases abbreviated
     */
    public String optimizeSystemPrompt(String originalPrompt) {
        return originalPrompt
                // Collapse every whitespace run (spaces, tabs, newlines) into one space.
                // The backslash must be doubled: "\s" on its own is a Java escape for a space.
                .replaceAll("\\s+", " ")
                // Remove examples (optional, depends on the scenario).
                .replaceAll("例如:.*?。", "")
                // Use shorter wording.
                .replace("请注意", "注意")
                .replace("你需要", "需")
                .replace("你应该", "应")
                .trim();
    }

    /**
     * Trims the conversation to the most recent messages that fit a token budget.
     *
     * <p>The last message is always kept, even if it alone exceeds {@code maxTokens}. Earlier
     * messages are added from newest to oldest until the next one would exceed the budget.
     *
     * @param messages     the full conversation, oldest first
     * @param maxTokens    token budget for the kept messages
     * @param tokenCounter counter used to measure each message
     * @return {@code messages} itself when it is empty, otherwise a new list holding the kept
     *         suffix in original order
     */
    public List<ChatMessage> trimContext(
            List<ChatMessage> messages,
            int maxTokens,
            TokenCounter tokenCounter) {
        if (messages.isEmpty()) {
            return messages;
        }

        int lastIndex = messages.size() - 1;
        int usedTokens = tokenCounter.countTokens(messages.get(lastIndex).text());
        int firstKept = lastIndex;

        // Walk backwards and only remember where the kept suffix starts; copying the suffix
        // once avoids repeated insertions at the head of a list.
        for (int i = lastIndex - 1; i >= 0; i--) {
            int msgTokens = tokenCounter.countTokens(messages.get(i).text());
            if (usedTokens + msgTokens > maxTokens) {
                log.info("Context trimmed: kept {} of {} messages",
                        messages.size() - firstKept, messages.size());
                break;
            }
            usedTokens += msgTokens;
            firstKept = i;
        }
        return new ArrayList<>(messages.subList(firstKept, messages.size()));
    }

    /**
     * Summarises a long text with the economy model.
     *
     * @param longContext  the text to summarise
     * @param targetTokens token length requested in the summarisation prompt
     * @return the summary returned by the model
     */
    public String summarizeContext(String longContext, int targetTokens) {
        ChatLanguageModel summaryModel = OpenAiChatModel.builder()
                .apiKey(System.getenv("OPENAI_API_KEY"))
                .modelName("gpt-4o-mini") // economy model
                .build();
        // A blank line separates the instruction from the content to be summarised.
        String summaryPrompt = String.format(
                "将以下内容压缩为不超过%d个tokens的摘要,保留关键信息:\n\n%s",
                targetTokens,
                longContext
        );
        return summaryModel.generate(summaryPrompt);
    }

    /**
     * Removes repeated sentences, keeping the first occurrence of each.
     *
     * <p>Sentences are compared ignoring case, surrounding whitespace and the terminator. Each
     * kept sentence retains its own terminator, so questions and exclamations are not rewritten
     * as plain statements; a trailing fragment without a terminator gets "。" appended.
     *
     * @param text the text to de-duplicate
     * @return the text without repeated sentences
     */
    public String removeRedundancy(String text) {
        Set<String> seenSentences = new HashSet<>();
        StringBuilder result = new StringBuilder();
        // The lookbehind splits after each terminator, leaving it attached to its sentence.
        for (String sentence : text.split("(?<=[。!?])")) {
            boolean terminated = !sentence.isEmpty()
                    && SENTENCE_TERMINATORS.indexOf(sentence.charAt(sentence.length() - 1)) >= 0;
            String body = terminated ? sentence.substring(0, sentence.length() - 1) : sentence;
            String normalized = body.trim().toLowerCase();
            if (normalized.isEmpty() || !seenSentences.add(normalized)) {
                continue;
            }
            result.append(sentence);
            if (!terminated) {
                result.append("。");
            }
        }
        return result.toString();
    }

    /**
     * Minifies a JSON document by parsing and re-serialising it without pretty printing.
     *
     * @param jsonString the JSON text
     * @return the compact JSON, or the input unchanged if it cannot be parsed
     */
    public String compressJson(String jsonString) {
        try {
            Object json = JSON_MAPPER.readValue(jsonString, Object.class);
            return JSON_MAPPER.writeValueAsString(json);
        } catch (Exception e) {
            log.error("Failed to compress JSON", e);
            return jsonString;
        }
    }
}
/**
 * Token counter backed by the {@code cl100k_base} encoding.
 */
@Service
public class TokenCounter {

    /** Fixed overhead added for every message in a list. */
    private static final int PER_MESSAGE_OVERHEAD = 4;

    /** Fixed overhead added once per list for the start of the reply. */
    private static final int REPLY_START_OVERHEAD = 3;

    private final Encoding encoding;

    /** Creates a counter using the cl100k_base encoding (GPT-4, GPT-3.5-turbo). */
    public TokenCounter() {
        this.encoding = Encodings.newDefaultEncodingRegistry().getEncoding(EncodingType.CL100K_BASE);
    }

    /**
     * Counts the tokens in a text.
     *
     * @param text the text; may be {@code null}
     * @return the token count, or 0 for {@code null} or empty text
     */
    public int countTokens(String text) {
        boolean nothingToCount = text == null || text.isEmpty();
        return nothingToCount ? 0 : encoding.countTokens(text);
    }

    /**
     * Counts the tokens in a list of messages, including the fixed per-message and
     * reply-start overheads.
     *
     * @param messages the messages to count
     * @return total tokens
     */
    public int countTokens(List<ChatMessage> messages) {
        int total = 0;
        for (ChatMessage message : messages) {
            total += countTokens(message.text()) + PER_MESSAGE_OVERHEAD;
        }
        return total + REPLY_START_OVERHEAD;
    }

    /**
     * Estimates the token usage of a complete request.
     *
     * @param systemPrompt           the system prompt
     * @param messages               the conversation messages
     * @param expectedResponseTokens expected size of the response
     * @return input, output and total token estimates
     */
    public TokenEstimate estimateRequest(
            String systemPrompt,
            List<ChatMessage> messages,
            int expectedResponseTokens) {
        int promptTokens = countTokens(systemPrompt);
        int conversationTokens = countTokens(messages);
        int inputTotal = promptTokens + conversationTokens;
        int grandTotal = inputTotal + expectedResponseTokens;
        return TokenEstimate.builder()
                .inputTokens(inputTotal)
                .outputTokens(expectedResponseTokens)
                .totalTokens(grandTotal)
                .build();
    }
}
// Token estimate for one request, as built by TokenCounter.estimateRequest.
@Data
@Builder
class TokenEstimate {
// System prompt tokens plus message tokens.
private int inputTokens;
// Expected response tokens.
private int outputTokens;
// inputTokens + outputTokens.
private int totalTokens;
}
/**
 * Cost monitoring service.
 *
 * <p>Publishes cost, token and latency metrics to the meter registry, generates scheduled
 * daily/weekly reports and raises alerts when spending thresholds are crossed.
 */
@Service
@Slf4j
public class CostMonitorService {

    /** Supplied through the constructor, which also registers the meters below. */
    private final MeterRegistry meterRegistry;

    @Autowired
    private TokenBudgetManager budgetManager;

    @Autowired
    private CostCalculator costCalculator;

    @Autowired
    private AlertService alertService;

    // Prometheus metrics
    private final Counter totalCostCounter;
    private final Counter totalRequestsCounter;
    private final Gauge currentHourlyCost;
    private final Timer requestDurationTimer;

    /**
     * Registers the service-level meters.
     *
     * @param registry the meter registry to register with
     */
    public CostMonitorService(MeterRegistry registry) {
        this.meterRegistry = registry;
        this.totalCostCounter = Counter.builder("llm.cost.total")
                .description("Total LLM API cost in USD")
                .tag("service", "langchain4j")
                .register(registry);
        this.totalRequestsCounter = Counter.builder("llm.requests.total")
                .description("Total LLM API requests")
                .tag("service", "langchain4j")
                .register(registry);
        this.currentHourlyCost = Gauge.builder("llm.cost.hourly", this, CostMonitorService::getCurrentHourlyCost)
                .description("Current hourly LLM cost")
                .register(registry);
        this.requestDurationTimer = Timer.builder("llm.request.duration")
                .description("LLM request duration")
                .register(registry);
    }

    /**
     * Records the cost, token counts and duration of one API call and checks the alert
     * thresholds.
     *
     * @param metrics metrics of the completed call
     */
    public void recordApiCall(ApiCallMetrics metrics) {
        BigDecimal cost = costCalculator.calculateCost(
                metrics.getModelName(),
                metrics.getInputTokens(),
                metrics.getOutputTokens()
        );

        // Service-wide metrics.
        totalCostCounter.increment(cost.doubleValue());
        totalRequestsCounter.increment();

        // Feed the latency timer; without this the registered timer never gets a sample.
        if (metrics.getDuration() != null) {
            requestDurationTimer.record(metrics.getDuration());
        }

        // Per-model metrics.
        Counter.builder("llm.cost.by_model")
                .tag("model", metrics.getModelName())
                .register(meterRegistry)
                .increment(cost.doubleValue());
        Counter.builder("llm.tokens.input")
                .tag("model", metrics.getModelName())
                .register(meterRegistry)
                .increment(metrics.getInputTokens());
        Counter.builder("llm.tokens.output")
                .tag("model", metrics.getModelName())
                .register(meterRegistry)
                .increment(metrics.getOutputTokens());

        checkCostAlerts(cost);

        log.debug("Recorded API call: model={}, cost=${}, input={}, output={}",
                metrics.getModelName(), cost, metrics.getInputTokens(), metrics.getOutputTokens());
    }

    /** Builds and sends the cost report for yesterday; scheduled daily at 09:00. */
    @Scheduled(cron = "0 0 9 * * ?")
    public void generateDailyReport() {
        LocalDate yesterday = LocalDate.now().minusDays(1);
        DailyCostReport report = DailyCostReport.builder()
                .date(yesterday)
                .totalCost(getDailyCost(yesterday))
                .totalRequests(getDailyRequests(yesterday))
                .costByModel(getCostByModel(yesterday))
                .topUsers(getTopUsersBySpending(yesterday, 10))
                .build();
        alertService.sendDailyReport(report);
        log.info("Daily cost report generated: {}", report);
    }

    /**
     * Builds and sends the weekly cost report; scheduled every Monday at 10:00.
     *
     * <p>NOTE(review): the range runs from 8 days ago to yesterday; whether that is 7 or 8
     * days depends on how the range helpers treat the end date — confirm once they are
     * implemented.
     */
    @Scheduled(cron = "0 0 10 ? * MON")
    public void generateWeeklyReport() {
        LocalDate endDate = LocalDate.now().minusDays(1);
        LocalDate startDate = endDate.minusDays(7);
        WeeklyCostReport report = WeeklyCostReport.builder()
                .startDate(startDate)
                .endDate(endDate)
                .totalCost(getCostBetween(startDate, endDate))
                .averageDailyCost(getAverageDailyCost(startDate, endDate))
                .costTrend(getCostTrend(startDate, endDate))
                .recommendations(generateCostOptimizationRecommendations())
                .build();
        alertService.sendWeeklyReport(report);
        log.info("Weekly cost report generated: {}", report);
    }

    /**
     * Sends a WARNING alert when the current hourly cost exceeds $100 and a CRITICAL alert when
     * today's cost exceeds $1000. The {@code recentCost} argument is currently not used.
     */
    private void checkCostAlerts(BigDecimal recentCost) {
        BigDecimal hourlyCost = getCurrentHourlyCostValue();
        BigDecimal dailyCost = getCurrentDailyCostValue();

        if (hourlyCost.compareTo(new BigDecimal("100.00")) > 0) {
            alertService.sendAlert(Alert.builder()
                    .level(AlertLevel.WARNING)
                    .title("高额小时成本告警")
                    .message(String.format("当前小时成本已达 $%.2f", hourlyCost))
                    .build());
        }

        if (dailyCost.compareTo(new BigDecimal("1000.00")) > 0) {
            alertService.sendAlert(Alert.builder()
                    .level(AlertLevel.CRITICAL)
                    .title("高额日成本告警")
                    .message(String.format("今日成本已达 $%.2f", dailyCost))
                    .build());
        }
    }

    /** Value function for the hourly-cost gauge. */
    private Double getCurrentHourlyCost() {
        return getCurrentHourlyCostValue().doubleValue();
    }

    // The data accessors below are simplified stubs that return empty or zero values.

    private BigDecimal getCurrentHourlyCostValue() {
        return BigDecimal.ZERO; // simplified implementation
    }

    private BigDecimal getCurrentDailyCostValue() {
        return BigDecimal.ZERO; // simplified implementation
    }

    private BigDecimal getDailyCost(LocalDate date) {
        return BigDecimal.ZERO; // simplified implementation
    }

    private Long getDailyRequests(LocalDate date) {
        return 0L; // simplified implementation
    }

    private Map<String, BigDecimal> getCostByModel(LocalDate date) {
        return Map.of(); // simplified implementation
    }

    private List<UserSpending> getTopUsersBySpending(LocalDate date, int limit) {
        return List.of(); // simplified implementation
    }

    private BigDecimal getCostBetween(LocalDate start, LocalDate end) {
        return BigDecimal.ZERO; // simplified implementation
    }

    private BigDecimal getAverageDailyCost(LocalDate start, LocalDate end) {
        return BigDecimal.ZERO; // simplified implementation
    }

    private List<BigDecimal> getCostTrend(LocalDate start, LocalDate end) {
        return List.of(); // simplified implementation
    }

    /** Returns a fixed list of cost-saving recommendations for the weekly report. */
    private List<String> generateCostOptimizationRecommendations() {
        return List.of(
                "考虑将简单任务路由到 GPT-4o-mini 以节省成本",
                "启用语义缓存可减少 30-50% 的重复调用",
                "优化 System Prompt 可减少 15-20% 的输入 tokens"
        );
    }
}
/**
* Metrics of one LLM API call, as passed to CostMonitorService.recordApiCall.
*/
@Data
@Builder
class ApiCallMetrics {
private String modelName;
private int inputTokens;
private int outputTokens;
// Elapsed time of the call.
private Duration duration;
private String userId;
}
/**
* Daily cost report.
*/
@Data
@Builder
class DailyCostReport {
// The day the report covers.
private LocalDate date;
private BigDecimal totalCost;
private Long totalRequests;
// Cost keyed by model name.
private Map<String, BigDecimal> costByModel;
// Highest-spending users for the day.
private List<UserSpending> topUsers;
}
/**
* Weekly cost report.
*/
@Data
@Builder
class WeeklyCostReport {
private LocalDate startDate;
private LocalDate endDate;
private BigDecimal totalCost;
private BigDecimal averageDailyCost;
// Presumably one value per day in the range — confirm once the trend helper is implemented.
private List<BigDecimal> costTrend;
// Cost optimisation suggestions included in the report.
private List<String> recommendations;
}
/**
* Spending record for one user.
*/
@Data
class UserSpending {
private String userId;
// Amount spent.
private BigDecimal amount;
// Number of requests made.
private Long requestCount;
}
/**
* Alert sent through the alert service: a severity level, a title and a message.
*/
@Data
@Builder
class Alert {
private AlertLevel level;
private String title;
private String message;
}
// Severity levels for alerts.
enum AlertLevel {
INFO, WARNING, CRITICAL
}
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,prometheus
  metrics:
    export:
      prometheus:
        enabled: true
    tags:
      application: langchain4j-app
      environment: production
/**
 * Budget manager facade.
 *
 * <p>Combines budget checks, rate limiting, usage recording and cost-centre reporting.
 */
@Service
@Slf4j
public class BudgetManager {

    @Autowired
    private TokenBudgetManager tokenBudgetManager;

    @Autowired
    private CostMonitorService costMonitorService;

    @Autowired
    private RateLimiter rateLimiter;

    /**
     * Decides whether a request may proceed.
     *
     * @param userId          user making the request
     * @param modelName       model that will be called
     * @param estimatedTokens estimated token count of the request
     * @return a rejection when the budget is exhausted or the user is rate limited; otherwise
     *         an approval that carries a warning and a stricter rate-limit tier once more than
     *         70% / 90% of the daily budget is used
     */
    public BudgetCheckResult checkBeforeRequest(
            String userId,
            String modelName,
            int estimatedTokens) {
        // 1. User budget.
        if (!tokenBudgetManager.checkBudget(userId, modelName, estimatedTokens)) {
            return BudgetCheckResult.rejected("预算不足");
        }
        // 2. Rate limit.
        if (!rateLimiter.allowRequest(userId)) {
            return BudgetCheckResult.rejected("请求过于频繁");
        }
        // 3. Current usage.
        BudgetUsage usage = tokenBudgetManager.getUserBudgetUsage(userId);
        int usedPercent = usage.getDailyUsagePercentage();

        // 4. Tiered throttling.
        if (usedPercent > 90) {
            return BudgetCheckResult.allowedWithWarning(
                    "您的今日预算即将用尽,请谨慎使用",
                    RateLimitTier.STRICT
            );
        }
        if (usedPercent > 70) {
            return BudgetCheckResult.allowedWithWarning(
                    "您已使用今日预算的70%以上",
                    RateLimitTier.MODERATE
            );
        }
        return BudgetCheckResult.allowed();
    }

    /**
     * Records the usage of a completed request in the budget store and the monitoring system.
     *
     * @param userId       user who made the request
     * @param modelName    model that was called
     * @param inputTokens  actual input tokens
     * @param outputTokens actual output tokens
     * @param duration     elapsed time of the request
     */
    public void recordAfterRequest(
            String userId,
            String modelName,
            int inputTokens,
            int outputTokens,
            Duration duration) {
        tokenBudgetManager.recordUsage(userId, modelName, inputTokens, outputTokens);
        costMonitorService.recordApiCall(ApiCallMetrics.builder()
                .userId(userId)
                .modelName(modelName)
                .inputTokens(inputTokens)
                .outputTokens(outputTokens)
                .duration(duration)
                .build());
    }

    /**
     * Tops up a user's budget. Currently only logs the request; persisting the new limit is
     * left to a real implementation.
     *
     * @param userId user to top up
     * @param amount amount to add
     * @param reason reason for the top-up
     */
    public void topUpBudget(String userId, BigDecimal amount, String reason) {
        log.info("Top up budget for user {}: ${}, reason: {}", userId, amount, reason);
    }

    /**
     * Builds a report for a cost centre.
     *
     * <p>The total is the sum of each user's monthly usage as returned by
     * {@code getUserBudgetUsage}, which reports the current month; {@code month} is only copied
     * into the report. A cost centre without users yields an average of zero instead of a
     * division by zero.
     *
     * @param costCenter the cost centre
     * @param month      the month to label the report with
     * @return the report
     */
    public CostCenterReport getCostCenterReport(String costCenter, YearMonth month) {
        List<String> users = getUsersByCostCenter(costCenter);
        BigDecimal totalCost = users.stream()
                .map(userId -> tokenBudgetManager.getUserBudgetUsage(userId).getMonthlyUsed())
                .reduce(BigDecimal.ZERO, BigDecimal::add);
        BigDecimal averageCostPerUser = users.isEmpty()
                ? BigDecimal.ZERO
                : totalCost.divide(BigDecimal.valueOf(users.size()), 2, RoundingMode.HALF_UP);
        return CostCenterReport.builder()
                .costCenter(costCenter)
                .month(month)
                .totalCost(totalCost)
                .userCount(users.size())
                .averageCostPerUser(averageCostPerUser)
                .build();
    }

    /** Users of a cost centre; simplified to an empty list. */
    private List<String> getUsersByCostCenter(String costCenter) {
        return List.of(); // simplified implementation
    }
}
/**
* Result of a pre-request budget check: whether the request is allowed,
* an optional message and the rate-limit tier to apply.
*/
@Data
@Builder
class BudgetCheckResult {
private boolean allowed;
private String message;
private RateLimitTier rateLimitTier;
// Allowed, no message, NORMAL rate-limit tier.
public static BudgetCheckResult allowed() {
return BudgetCheckResult.builder()
.allowed(true)
.rateLimitTier(RateLimitTier.NORMAL)
.build();
}
// Allowed, but with a warning message and the given (stricter) tier.
public static BudgetCheckResult allowedWithWarning(String message, RateLimitTier tier) {
return BudgetCheckResult.builder()
.allowed(true)
.message(message)
.rateLimitTier(tier)
.build();
}
// Rejected with the given reason; no rate-limit tier is set.
public static BudgetCheckResult rejected(String message) {
return BudgetCheckResult.builder()
.allowed(false)
.message(message)
.build();
}
}
/**
* Rate-limit tiers.
*/
enum RateLimitTier {
NORMAL, // normal: 10 req/min
MODERATE, // moderate: 5 req/min
STRICT // strict: 2 req/min
}
/**
* Cost-centre report, as built by BudgetManager.getCostCenterReport.
*/
@Data
@Builder
class CostCenterReport {
private String costCenter;
private YearMonth month;
// Sum of the monthly usage of all users in the cost centre.
private BigDecimal totalCost;
private Integer userCount;
// totalCost divided by userCount, rounded half-up to 2 decimal places.
private BigDecimal averageCostPerUser;
}
/**
 * Cost-optimised chat service.
 *
 * <p>Ties the cost-saving pieces together: model routing, semantic caching, budget checks,
 * prompt optimisation and usage recording.
 */
@Service
@Slf4j
public class CostOptimizedChatService {

    /** Token budget for the context sent to the model. */
    private static final int MAX_CONTEXT_TOKENS = 4000;

    /** Tokens reserved for the response when estimating a request. */
    private static final int RESPONSE_TOKEN_RESERVE = 500;

    @Autowired
    private ModelRouter modelRouter;

    @Autowired
    private SemanticCache semanticCache;

    @Autowired
    private BudgetManager budgetManager;

    @Autowired
    private PromptOptimizer promptOptimizer;

    @Autowired
    private TokenCounter tokenCounter;

    @Autowired
    private Map<String, ChatLanguageModel> modelRegistry;

    /**
     * Injected rather than created with {@code new}: a hand-built CostCalculator would not
     * receive its own injected dependencies.
     */
    @Autowired
    private CostCalculator costCalculator;

    /**
     * Handles a chat request with all cost optimisations applied.
     *
     * @param request the chat request
     * @return a cached answer, a budget-exceeded response, or the model's answer with cost
     *         details
     * @throws ChatException if any step fails
     */
    public ChatResponse chat(ChatRequest request) {
        String userId = request.getUserId();
        Instant startTime = Instant.now();
        try {
            // 1. Semantic cache lookup.
            String modelName = modelRouter.selectModel(request);
            Optional<CachedResponse> cached = semanticCache.findCached(
                    request.getLastUserMessage(),
                    modelName
            );
            if (cached.isPresent()) {
                log.info("Cache hit for user {}", userId);
                return ChatResponse.fromCache(cached.get());
            }

            // 2. Budget check.
            int estimatedTokens = estimateTokens(request);
            BudgetCheckResult budgetCheck = budgetManager.checkBeforeRequest(
                    userId,
                    modelName,
                    estimatedTokens
            );
            if (!budgetCheck.isAllowed()) {
                return ChatResponse.budgetExceeded(budgetCheck.getMessage());
            }

            // 3. Prompt optimisation.
            List<ChatMessage> optimizedMessages = optimizeMessages(request.getMessages());

            // 4. LLM call. Fail with a descriptive error when the router picked a model that
            //    has no registered client.
            ChatLanguageModel model = modelRegistry.get(modelName);
            if (model == null) {
                throw new IllegalStateException("No ChatLanguageModel registered for model: " + modelName);
            }
            Response<AiMessage> response = model.generate(optimizedMessages);

            // 5. Record the usage.
            TokenUsage usage = response.tokenUsage();
            budgetManager.recordAfterRequest(
                    userId,
                    modelName,
                    usage.inputTokenCount(),
                    usage.outputTokenCount(),
                    Duration.between(startTime, Instant.now())
            );

            // 6. Cache the answer.
            semanticCache.cache(
                    request.getLastUserMessage(),
                    modelName,
                    response.content().text(),
                    usage.inputTokenCount(),
                    usage.outputTokenCount()
            );

            // 7. Build the response.
            return ChatResponse.builder()
                    .content(response.content().text())
                    .modelName(modelName)
                    .inputTokens(usage.inputTokenCount())
                    .outputTokens(usage.outputTokenCount())
                    .cost(calculateCost(modelName, usage))
                    .cached(false)
                    .warningMessage(budgetCheck.getMessage())
                    .build();
        } catch (Exception e) {
            log.error("Error in cost-optimized chat", e);
            throw new ChatException("聊天服务异常", e);
        }
    }

    /** Trims the context to the token budget, then compresses any system messages. */
    private List<ChatMessage> optimizeMessages(List<ChatMessage> messages) {
        List<ChatMessage> trimmed = promptOptimizer.trimContext(
                messages,
                MAX_CONTEXT_TOKENS,
                tokenCounter
        );
        return trimmed.stream()
                .map(msg -> {
                    if (msg instanceof SystemMessage) {
                        String optimized = promptOptimizer.optimizeSystemPrompt(msg.text());
                        return new SystemMessage(optimized);
                    }
                    return msg;
                })
                .collect(Collectors.toList());
    }

    /** Message tokens plus a fixed reserve for the response. */
    private int estimateTokens(ChatRequest request) {
        return tokenCounter.countTokens(request.getMessages()) + RESPONSE_TOKEN_RESERVE;
    }

    /** Cost of the call according to the reported token usage. */
    private BigDecimal calculateCost(String modelName, TokenUsage usage) {
        return costCalculator.calculateCost(
                modelName,
                usage.inputTokenCount(),
                usage.outputTokenCount()
        );
    }
}
/**
 * Chat request: the requesting user's id and the conversation so far.
 */
@Data
@Builder
class ChatRequest {
    private String userId;
    private List<ChatMessage> messages;

    /**
     * Returns the text of the last {@code UserMessage} in the conversation.
     *
     * @return that message's text, or an empty string when the message list is unset,
     *         contains no user message, or the message has no text
     */
    public String getLastUserMessage() {
        // The builder leaves `messages` null when it was never set; treat that like an
        // empty conversation instead of failing with a NullPointerException.
        if (messages == null) {
            return "";
        }
        return messages.stream()
                .filter(msg -> msg instanceof UserMessage)
                .reduce((earlier, later) -> later) // keep only the last match
                .map(ChatMessage::text)
                .orElse("");
    }
}
/**
 * Chat response: the answer text plus the model, token usage, cost and cache flag that
 * describe how it was produced.
 */
@Data
@Builder
class ChatResponse {
    private String content;
    private String modelName;
    private Integer inputTokens;
    private Integer outputTokens;
    private BigDecimal cost;
    private Boolean cached;
    private String warningMessage;

    /**
     * Builds a response from a semantic-cache hit.
     *
     * @param cached the cached answer and the model it was stored under
     * @return a response flagged as cached, reporting zero tokens and zero cost
     */
    public static ChatResponse fromCache(CachedResponse cached) {
        return ChatResponse.builder()
                .content(cached.getAnswer())
                .modelName(cached.getModelName())
                .inputTokens(0)
                .outputTokens(0)
                .cost(BigDecimal.ZERO)
                .cached(true)
                .build();
    }

    /**
     * Builds the response returned when the budget check refuses a request.
     *
     * @param message detail from the budget check; may be {@code null}
     * @return a non-cached response carrying the refusal text, reporting zero tokens and
     *         zero cost because no model call was made
     */
    public static ChatResponse budgetExceeded(String message) {
        // Avoid appending the literal "null" when the budget check supplies no detail.
        String detail = message != null ? message : "";
        return ChatResponse.builder()
                .content("抱歉,您的预算已用尽。" + detail)
                // Same zero-usage convention as fromCache, so readers never see null here.
                .inputTokens(0)
                .outputTokens(0)
                .cost(BigDecimal.ZERO)
                .cached(false)
                .build();
    }
}
/**
 * Unchecked exception raised when handling a chat request fails; always carries the
 * underlying cause.
 */
class ChatException extends RuntimeException {
    // Exceptions are Serializable; pin the version id instead of relying on the
    // compiler-derived default.
    private static final long serialVersionUID = 1L;

    /**
     * @param message description of the failure
     * @param cause   the exception that triggered it
     */
    public ChatException(String message, Throwable cause) {
        super(message, cause);
    }
}
/**
 * REST controller for the chat API, mounted at {@code /api/v1/chat}.
 */
@RestController
@RequestMapping("/api/v1/chat")
@Slf4j
public class ChatController {
    @Autowired
    private CostOptimizedChatService chatService;
    @Autowired
    private BudgetManager budgetManager;

    /**
     * Sends a chat message.
     *
     * @param userId  caller id taken from the {@code X-User-Id} header
     * @param request conversation to send
     * @return 200 with the chat response, or 400 with no body when the request carries no
     *         messages
     */
    @PostMapping
    public ResponseEntity<ChatResponse> chat(
            @RequestHeader("X-User-Id") String userId,
            @RequestBody ChatRequestDto request) {
        // A body without "messages" deserializes to null; reject it (and an empty list) as a
        // client error instead of failing later with a NullPointerException.
        List<MessageDto> messages = request.getMessages();
        if (messages == null || messages.isEmpty()) {
            return ResponseEntity.badRequest().build();
        }
        ChatRequest chatRequest = ChatRequest.builder()
                .userId(userId)
                .messages(convertToMessages(messages))
                .build();
        ChatResponse response = chatService.chat(chatRequest);
        return ResponseEntity.ok(response);
    }

    /**
     * Returns the budget usage of the calling user.
     *
     * @param userId caller id taken from the {@code X-User-Id} header
     * @return 200 with the usage reported by the budget manager
     */
    @GetMapping("/budget")
    public ResponseEntity<BudgetUsage> getBudget(
            @RequestHeader("X-User-Id") String userId) {
        BudgetUsage usage = budgetManager.getUserBudgetUsage(userId);
        return ResponseEntity.ok(usage);
    }

    /** Converts the transport DTOs into chat messages, preserving their order. */
    private List<ChatMessage> convertToMessages(List<MessageDto> dtos) {
        return dtos.stream()
                .map(this::toChatMessage)
                .collect(Collectors.toList());
    }

    /**
     * Maps one DTO by role: {@code "user"} and {@code "assistant"} map to their message
     * types; every other role, including a missing one, becomes a system message.
     */
    private ChatMessage toChatMessage(MessageDto dto) {
        String role = dto.getRole();
        if ("user".equals(role)) {
            return new UserMessage(dto.getContent());
        }
        if ("assistant".equals(role)) {
            return new AiMessage(dto.getContent());
        }
        return new SystemMessage(dto.getContent());
    }
}
/**
 * Request body of the chat endpoint: the conversation as a list of role/content pairs.
 */
@Data
class ChatRequestDto {
private List<MessageDto> messages;
}
/**
 * One conversation entry as sent by the client.
 */
@Data
class MessageDto {
// The controller maps "user" and "assistant" to their message types and treats any other
// value as a system message.
private String role;
// Text of the message.
private String content;
}
| 指标 | 优化前 | 优化后 | 改善幅度 |
|---|---|---|---|
| 平均单次成本 | $0.0450 | $0.0089 | ↓ 80% |
| 每日总成本 | $450 | $95 | ↓ 79% |
| 缓存命中率 | 0% | 45% | ↑ 45 个百分点 |
| 平均响应时间 | 2.3s | 0.8s (缓存) / 2.1s (新请求) | ↓ 65% (缓存命中) / ↓ 约 34% (整体,按 45% 命中率加权约 1.5s) |
| Token 消耗 | 15,000/请求 | 8,500/请求 | ↓ 43% |
| 预算超支事件 | 每周3次 | 0次 | ↓ 100% |
总成本降低 80%,其中:
1. 智能模型路由: 40% 节省
- 70% 简单任务使用经济模型
- 仅 15% 任务需要旗舰模型
2. 语义缓存: 25% 节省
- 45% 缓存命中率
- 零成本响应重复问题
3. Prompt 优化: 10% 节省
- System Prompt 精简 30%
- 动态上下文裁剪减少 20% tokens
4. 预算管理: 5% 节省
- 避免预算超支
- 分级限流控制成本
陷阱 1: 过度依赖经济模型
// 错误: 所有任务都用最便宜的模型
String model = "gpt-4o-mini"; // 总是使用最便宜的
// 正确: 根据任务复杂度智能选择
String model = modelRouter.selectModel(request);
陷阱 2: 缓存过期策略不当
// 错误: 缓存过期时间过长(一年,几乎等于永不过期)
Duration cacheTTL = Duration.ofDays(365); // 一年!
// 正确: 合理的过期时间
Duration cacheTTL = Duration.ofHours(24); // 24小时
陷阱 3: 忽略输出 Token 成本
// 错误: 只限制输入
maxTokens = 4000; // 只控制输入
// 正确: 同时限制输入和输出
maxInputTokens = 3000;
maxOutputTokens = 1000; // 输出通常更贵
在本章中,我们深入学习了 LangChain4J 应用的成本优化和预算管理:
成本分析: 掌握了主流 LLM 模型的价格结构,理解成本差异和优化方向
智能路由: 实现了基于任务复杂度的智能模型路由器,自动选择最经济高效的模型
预算管理: 构建了多维度的 Token 预算管理系统,支持用户级、时间维度和组织级控制
语义缓存: 实现了基于向量相似度的语义缓存,大幅减少重复调用
Prompt 优化: 学习了多种 Prompt 压缩和优化技术,降低 Token 消耗
成本监控: 构建了完整的成本监控和告警体系,实现成本可观测性
综合实战: 整合所有技术,实现了生产级的成本优化聊天服务
通过本章的学习,你可以将 LLM 应用的成本降低 70-85%,同时保持良好的用户体验和服务质量。
成本优化不是一次性工作,而是需要持续监控、分析和改进的过程。
最后更新: 2026-03-09