plugins/agent-architect/skills/patterns/agent-cost-budgeting/SKILL.md
Use this skill when managing AI agent costs. Activate when the user needs to control token usage, implement cost limits for agents, optimize LLM spending, track agent costs, or prevent runaway API bills in agent systems.
npx skillsauth add latestaiagents/agent-skills agent-cost-budgeting
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Control and optimize token usage across AI agent systems.
/**
 * Pricing entry for a single LLM model.
 * All rates are expressed in US dollars per 1000 tokens.
 */
interface CostModel {
  /** Provider name for this entry. */
  provider: string;
  /** Model identifier string for this entry. */
  model: string;
  /** $ per 1000 input tokens */
  inputCostPer1K: number;
  /** $ per 1000 output tokens */
  outputCostPer1K: number;
  /** $ per 1000 cached tokens (if supported) */
  cacheCostPer1K?: number;
}
/**
 * Pricing table keyed by a short model alias.
 * Each entry records the provider, the model identifier, and the
 * per-1000-token input and output rates in USD.
 * NOTE(review): these rates are hard-coded; confirm they match current provider pricing.
 */
const COST_MODELS: Record<string, CostModel> = {
  // Anthropic
  'claude-3-opus': {
    provider: 'anthropic',
    model: 'claude-3-opus-20240229',
    inputCostPer1K: 0.015,
    outputCostPer1K: 0.075,
  },
  'claude-3-sonnet': {
    provider: 'anthropic',
    model: 'claude-3-sonnet-20240229',
    inputCostPer1K: 0.003,
    outputCostPer1K: 0.015,
  },
  'claude-3-haiku': {
    provider: 'anthropic',
    model: 'claude-3-haiku-20240307',
    inputCostPer1K: 0.00025,
    outputCostPer1K: 0.00125,
  },

  // OpenAI
  'gpt-4-turbo': {
    provider: 'openai',
    model: 'gpt-4-turbo',
    inputCostPer1K: 0.01,
    outputCostPer1K: 0.03,
  },
  'gpt-4o': {
    provider: 'openai',
    model: 'gpt-4o',
    inputCostPer1K: 0.005,
    outputCostPer1K: 0.015,
  },
  'gpt-4o-mini': {
    provider: 'openai',
    model: 'gpt-4o-mini',
    inputCostPer1K: 0.00015,
    outputCostPer1K: 0.0006,
  },
};
/**
 * Converts token counts into a dollar cost using the COST_MODELS pricing table.
 *
 * @param model Key into COST_MODELS (e.g. 'gpt-4o').
 * @param inputTokens Number of input tokens.
 * @param outputTokens Number of output tokens.
 * @returns Cost in USD: input and output tokens are each priced per 1000 tokens and summed.
 * @throws Error if `model` is not an own key of COST_MODELS.
 */
function calculateCost(
  model: string,
  inputTokens: number,
  outputTokens: number
): number {
  // Own-property check: a plain `COST_MODELS[model]` lookup would treat inherited
  // keys such as 'constructor' or 'toString' as known models and yield NaN.
  if (!Object.prototype.hasOwnProperty.call(COST_MODELS, model)) {
    throw new Error(`Unknown model: ${model}`);
  }
  const costModel = COST_MODELS[model];
  return (
    (inputTokens / 1000) * costModel.inputCostPer1K +
    (outputTokens / 1000) * costModel.outputCostPer1K
  );
}
/**
 * A spending limit tracked in USD over a given period.
 */
interface Budget {
  id: string;
  name: string;
  /** Maximum spend in USD. */
  limitUSD: number;
  /** Amount spent so far in USD. */
  spentUSD: number;
  period: 'task' | 'hourly' | 'daily' | 'monthly';
  /** When set, the moment at which the current period ends. */
  resetAt?: Date;
  /** Utilization fractions that trigger alerts, e.g., [0.5, 0.8, 0.95] */
  alertThresholds: number[];
  /** Stop vs warn at limit */
  hardLimit: boolean;
}
/**
 * Tracks spend against named budgets and answers "may I spend this much?".
 *
 * The period reset is applied in BOTH checkBudget and recordSpend. If it were
 * applied only when spend is recorded, an exhausted hard-limit budget would stay
 * blocked forever once its period expired: every call is denied, so no spend is
 * ever recorded and the reset never runs.
 */
class BudgetManager {
  private budgets = new Map<string, Budget>();

  /**
   * Checks whether `estimatedCost` fits in the budget and sends alerts for any
   * thresholds the spend would cross.
   *
   * @param budgetId Id of the budget to check. Unknown ids are allowed through.
   * @param estimatedCost Estimated cost of the upcoming call, in USD.
   * @returns `allowed: false` with a reason when a hard limit would be exceeded;
   *          `allowed: true` with a warning when a soft limit would be exceeded;
   *          otherwise `allowed: true` with the remaining amount.
   */
  async checkBudget(budgetId: string, estimatedCost: number): Promise<BudgetCheck> {
    const budget = this.budgets.get(budgetId);
    if (!budget) return { allowed: true };

    // Start a fresh period first so stale spend cannot block this call.
    this.rollPeriodIfExpired(budget);

    const remaining = budget.limitUSD - budget.spentUSD;
    const utilizationBefore = budget.spentUSD / budget.limitUSD;
    const utilizationAfter = (budget.spentUSD + estimatedCost) / budget.limitUSD;

    // A threshold is "crossed" when we are below it now and at/above it afterwards.
    const crossedThresholds = budget.alertThresholds.filter(
      (t) => utilizationBefore < t && utilizationAfter >= t
    );
    if (crossedThresholds.length > 0) {
      await this.sendAlerts(budget, crossedThresholds);
    }

    if (estimatedCost > remaining) {
      if (budget.hardLimit) {
        return {
          allowed: false,
          reason: `Budget exceeded: ${remaining.toFixed(4)} USD remaining`,
          remaining
        };
      }
      return {
        allowed: true,
        warning: `Budget will be exceeded`,
        remaining
      };
    }

    return { allowed: true, remaining };
  }

  /**
   * Adds `cost` to the budget's spend, starting a new period first if the
   * current one has expired. Unknown budget ids are ignored.
   *
   * @param budgetId Id of the budget to charge.
   * @param cost Actual cost in USD.
   */
  async recordSpend(budgetId: string, cost: number): Promise<void> {
    const budget = this.budgets.get(budgetId);
    if (!budget) return;

    this.rollPeriodIfExpired(budget);
    budget.spentUSD += cost;
  }

  /** Zeroes the spend and schedules the next reset once `resetAt` has passed. */
  private rollPeriodIfExpired(budget: Budget): void {
    if (budget.resetAt && new Date() >= budget.resetAt) {
      budget.spentUSD = 0;
      budget.resetAt = this.calculateNextReset(budget.period);
    }
  }
}
// Rough estimation before API call
/**
 * Cheap token-count approximation based purely on string length.
 *
 * @param text Text to measure.
 * @returns `text.length / 4`, rounded up (~4 characters per token for English).
 */
function estimateTokens(text: string): number {
  const CHARS_PER_TOKEN = 4;
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
// More accurate estimation using tiktoken (for OpenAI)
import { encoding_for_model } from 'tiktoken';
/**
 * Counts tokens by encoding `text` with the tiktoken encoder for `model`.
 *
 * @param text Text to tokenize.
 * @param model Model name passed to `encoding_for_model`.
 * @returns Number of tokens produced by the encoder.
 */
function countTokensAccurate(text: string, model: string): number {
  const enc = encoding_for_model(model);
  try {
    return enc.encode(text).length;
  } finally {
    // Release the encoder even when encode() throws, so it is never leaked.
    enc.free();
  }
}
// Estimate cost before execution
/**
 * Predicts the dollar cost of one LLM call before it is made.
 *
 * @param model Model key passed through to `calculateCost`.
 * @param systemPrompt System prompt text.
 * @param userMessage User message text.
 * @param expectedOutputTokens Anticipated number of output tokens.
 * @returns Result of `calculateCost` for the estimated input tokens and the expected output tokens.
 */
function estimateCallCost(
  model: string,
  systemPrompt: string,
  userMessage: string,
  expectedOutputTokens: number
): number {
  // The two prompts are concatenated and measured with the rough length heuristic.
  const promptText = systemPrompt + userMessage;
  const promptTokens = estimateTokens(promptText);
  return calculateCost(model, promptTokens, expectedOutputTokens);
}
/**
 * Agent wrapper that estimates cost before each call and picks a model
 * according to how much of its per-task budget is left.
 *
 * NOTE(review): `llm`, `estimateTaskCost`, `assessComplexity`, `recordUsage` and
 * `findCheapestViableModel` are used here but not defined in this snippet.
 */
class CostAwareAgent {
  private budget: Budget;
  private spent = 0;

  /**
   * @param budgetUSD Hard spending limit for this agent's task, in USD.
   */
  constructor(budgetUSD: number) {
    this.budget = {
      id: 'agent-budget',
      name: 'Agent Task Budget',
      limitUSD: budgetUSD,
      spentUSD: 0,
      period: 'task',
      alertThresholds: [0.5, 0.8],
      hardLimit: true
    };
  }

  /**
   * Runs `task` if its estimated cost fits in the remaining budget; otherwise
   * delegates to the over-budget handler.
   *
   * @param task Task text sent to the model as the user message.
   * @returns The completion result, or a failure result when the budget is exceeded.
   */
  async execute(task: string): Promise<Result> {
    const estimate = this.estimateTaskCost(task);
    if (estimate > this.remaining) {
      return this.handleBudgetExceeded(task, estimate);
    }

    // Choose model based on budget
    return this.runWithModel(task, this.selectModelForBudget(task));
  }

  /** Sends `task` to the given model, recording usage as it is reported. */
  private async runWithModel(task: string, model: string): Promise<Result> {
    const result = await this.llm.complete({
      model,
      messages: [{ role: 'user', content: task }],
      onUsage: (usage) => this.recordUsage(usage, model)
    });
    return result;
  }

  /** Picks a model tier from the remaining budget and the assessed task complexity. */
  private selectModelForBudget(task: string): string {
    const complexity = this.assessComplexity(task);
    const remaining = this.remaining;

    // Use cheaper models when budget is tight
    if (remaining < 0.10) {
      return 'gpt-4o-mini'; // Cheapest
    }
    if (remaining < 0.50 || complexity === 'low') {
      return 'claude-3-haiku';
    }
    if (remaining < 2.00 || complexity === 'medium') {
      return 'claude-3-sonnet';
    }
    return 'claude-3-opus'; // Full power when budget allows
  }

  /**
   * Handles a task whose estimate exceeds the remaining budget.
   *
   * Runs the task directly on the cheaper model when one is available. It must
   * not go back through `execute`: that would recompute the same over-budget
   * estimate and recurse without end.
   *
   * @returns The cheaper model's result, or a failure result carrying budget info.
   */
  private async handleBudgetExceeded(task: string, estimate: number): Promise<Result> {
    const cheaperModel = this.findCheapestViableModel(task);
    if (cheaperModel) {
      return this.runWithModel(task, cheaperModel);
    }

    return {
      success: false,
      error: 'Budget exceeded',
      partialResult: null,
      budgetInfo: {
        remaining: this.remaining,
        estimated: estimate
      }
    };
  }

  /** Budget limit minus the amount tracked in `spent`, in USD. */
  get remaining(): number {
    return this.budget.limitUSD - this.spent;
  }
}
/**
 * One agent's slice of a shared budget.
 */
interface AgentCostAllocation {
  agentId: string;
  /** Budget assigned to this agent, in USD. */
  allocatedUSD: number;
  /** Amount this agent has spent, in USD. */
  spentUSD: number;
  priority: 'low' | 'medium' | 'high';
}
/**
 * Splits one total budget across agents by priority and can shift
 * allocation from under-spending agents to over-spending ones.
 */
class MultiAgentBudgetManager {
  private totalBudget: number;
  private allocations = new Map<string, AgentCostAllocation>();

  /**
   * @param totalBudget Total USD to divide between agents. Defaults to 0 so
   *        construction without arguments keeps working.
   */
  constructor(totalBudget = 0) {
    this.totalBudget = totalBudget;
  }

  /**
   * Assigns each agent a share of the total budget proportional to its
   * priority weight (high = 3, medium = 2, low = 1).
   *
   * @param agents Agents to allocate for; `priority` must be 'low', 'medium' or 'high'.
   * @throws Error if an agent has an unknown priority. Nothing is allocated in that case.
   */
  allocateBudget(agents: { id: string; priority: string }[]): void {
    type Priority = 'low' | 'medium' | 'high';
    const weights: Record<Priority, number> = { high: 3, medium: 2, low: 1 };
    const isPriority = (p: string): p is Priority =>
      Object.prototype.hasOwnProperty.call(weights, p);

    // Validate every agent up front so a bad entry cannot yield NaN shares
    // or leave the allocation map half-updated.
    const weighted = agents.map(({ id, priority }) => {
      if (!isPriority(priority)) {
        throw new Error(`Unknown priority "${priority}" for agent ${id}`);
      }
      return { id, priority, weight: weights[priority] };
    });

    const totalWeight = weighted.reduce((sum, a) => sum + a.weight, 0);

    for (const { id, priority, weight } of weighted) {
      this.allocations.set(id, {
        agentId: id,
        allocatedUSD: (weight / totalWeight) * this.totalBudget,
        spentUSD: 0,
        priority
      });
    }
  }

  /**
   * Reallocates from under-spending agents (spent < 50% of allocation) to
   * over-spending agents (spent > 80% of allocation).
   *
   * For each over-spender the amount needed is `spent - 0.8 * allocated`. Donors
   * give up only allocation above twice their own spend, and transfers stop once
   * the need is covered, so the sum of all allocations is unchanged.
   */
  rebalance(): void {
    const all = Array.from(this.allocations.values());
    const underSpenders = all.filter(a => a.spentUSD < a.allocatedUSD * 0.5);
    const overSpenders = all.filter(a => a.spentUSD > a.allocatedUSD * 0.8);

    for (const over of overSpenders) {
      let needed = over.spentUSD - over.allocatedUSD * 0.8;

      for (const under of underSpenders) {
        if (needed <= 0) break;

        const available = under.allocatedUSD * 0.5 - under.spentUSD;
        const transfer = Math.min(needed, available);

        if (transfer > 0) {
          under.allocatedUSD -= transfer;
          over.allocatedUSD += transfer;
          // Track what is still owed so later donors are not drained needlessly.
          needed -= transfer;
        }
      }
    }
  }
}
// Anthropic prompt caching
/**
 * Calls the Anthropic Messages API with the system prompt marked for prompt caching.
 *
 * The Messages API does not accept a 'system' role inside `messages` and requires
 * `max_tokens`. A leading system message is therefore moved into the top-level
 * `system` parameter as a text block carrying `cache_control`, and the remaining
 * messages are sent as the conversation.
 *
 * @param messages Conversation; a first entry with role 'system' is treated as the system prompt.
 * @param maxTokens Upper bound on generated tokens (required by the API). Defaults to 1024.
 * @returns The API response.
 */
async function callWithCaching(messages: Message[], maxTokens = 1024): Promise<Response> {
  // NOTE(review): confirm the model below supports prompt caching.
  const model = 'claude-3-sonnet-20240229';
  const [first, ...rest] = messages;

  if (first && first.role === 'system') {
    return anthropic.messages.create({
      model,
      max_tokens: maxTokens,
      // assumes first.content is a plain string — TODO confirm against the Message type
      system: [
        {
          type: 'text',
          text: first.content,
          cache_control: { type: 'ephemeral' }
        }
      ],
      messages: rest
    });
  }

  // No system prompt to cache: send the conversation unchanged.
  return anthropic.messages.create({
    model,
    max_tokens: maxTokens,
    messages
  });
}
/**
 * Picks a model for `task` from the tier matching its assessed complexity.
 *
 * Candidates are tried in listed order and the first whose estimated cost fits
 * `budget` wins. If none fits, the candidate with the lowest estimated cost is
 * returned.
 *
 * @param task Task text; used for the complexity assessment and as the user message when estimating cost.
 * @param budget Maximum acceptable estimated cost in USD.
 * @returns A key of the pricing table (e.g. 'gpt-4o-mini').
 * @throws Error if the assessed complexity has no entry in the matrix.
 */
function routeToOptimalModel(task: string, budget: number): string {
  const complexity = assessComplexity(task);

  // Complexity vs cost matrix
  const modelMatrix: Record<string, string[]> = {
    simple: ['gpt-4o-mini', 'claude-3-haiku'],
    medium: ['claude-3-sonnet', 'gpt-4o'],
    complex: ['claude-3-opus', 'gpt-4-turbo']
  };

  // NOTE(review): assumes assessComplexity returns 'simple' | 'medium' | 'complex' — confirm.
  const candidates = modelMatrix[complexity];
  if (!candidates) {
    throw new Error(`Unknown complexity: ${complexity}`);
  }

  // estimateCallCost takes (model, systemPrompt, userMessage, expectedOutputTokens).
  // Pass an empty system prompt so 500 lands in the output-token slot; with the
  // argument omitted the estimate is NaN and no candidate ever fits.
  const priced = candidates.map(model => ({
    model,
    cost: estimateCallCost(model, '', task, 500)
  }));

  // First listed candidate that fits the budget
  const affordable = priced.find(p => p.cost <= budget);
  if (affordable) {
    return affordable.model;
  }

  // Fallback to cheapest by estimated cost, not by list position.
  return priced.reduce((best, p) => (p.cost < best.cost ? p : best)).model;
}
/**
 * Shrinks `context` to roughly `targetTokens` by asking a cheap model to summarize it.
 *
 * @param context Text to compress.
 * @param targetTokens Token budget, measured with `estimateTokens`.
 * @returns `context` unchanged when it is already within the target; otherwise
 *          the value returned by `llm.complete` for the summarization prompt.
 */
async function compressContext(
  context: string,
  targetTokens: number
): Promise<string> {
  const alreadyFits = estimateTokens(context) <= targetTokens;
  if (alreadyFits) {
    return context;
  }

  // Use cheap model to summarize
  const prompt = `Summarize this context in under ${targetTokens} tokens, preserving key information:\n\n${context}`;
  const summary = await llm.complete({
    model: 'gpt-4o-mini',
    messages: [{ role: 'user', content: prompt }]
  });
  return summary;
}
/**
 * Aggregated spending summary for a time window.
 */
interface CostReport {
  /** Time window the report covers. */
  period: { start: Date; end: Date };
  totalSpent: number;
  /** Per-model call count, token count and cost. */
  byModel: Record<string, { calls: number; tokens: number; cost: number }>;
  /** Per-agent call count and cost. */
  byAgent: Record<string, { calls: number; cost: number }>;
  /** Per-task cost and success flag. */
  byTask: Record<string, { cost: number; success: boolean }>;
  trends: {
    dailyAverage: number;
    projectedMonthly: number;
    topCostDrivers: string[];
  };
}
/**
 * Builds a CostReport from raw usage records.
 *
 * NOTE(review): the body is an elided placeholder in this snippet, so as written
 * the function returns nothing. The aggregation still has to be implemented,
 * and the shape of UsageRecord is not shown here.
 *
 * @param usageData Usage records to aggregate.
 * @returns The aggregated cost report.
 */
function generateCostReport(usageData: UsageRecord[]): CostReport {
// ... aggregation logic
}
development
Test skills for correct activation, content quality, and regression — both automated checks (frontmatter validity, lint) and manual verification (query-suite activation testing). Covers CI integration and how to catch skill regressions before users do. Use this skill when adding skills to a repo, setting up CI for a skill library, or debugging "the skill exists but doesn't work". Activate when: test skills, validate skills, skill CI, skill linting, skill activation test, skill regression.
documentation
Write the YAML frontmatter for a SKILL.md file so it activates reliably — name, description, and activation keywords that the model matches against. Covers length, tone, and the most common frontmatter mistakes. Use this skill when authoring a new skill, fixing a skill that isn't auto-activating, or reviewing skills for publication. Activate when: SKILL.md frontmatter, skill description, skill activation, skill YAML, write a skill, author a skill.
development
Design skills that fire at the right moment — neither over-eager (noise) nor under-eager (silent). Covers activation specificity, trigger phrases, disambiguation between overlapping skills, and debugging activation. Use this skill when multiple skills could fire on the same query, a skill never fires, or a skill fires too often. Activate when: skill won't activate, skill over-activates, overlapping skills, skill triggers, skill selection, skill disambiguation.
development
Structure SKILL.md content so the model reads just enough — concise summary up front, progressively deeper detail, examples on demand. Covers section ordering, length budgets, when to split into multiple skills. Use this skill when writing or refactoring a skill body, one skill has grown too long, or a skill is wordy but not useful. Activate when: SKILL.md structure, skill content, skill too long, split skill, progressive disclosure, skill body.