distributions/claude/skills/agent-testing-patterns/SKILL.md
Test AI agent systems including tool use, multi-turn conversations, error recovery, and non-deterministic outputs. Covers mock strategies, evaluation metrics, and regression testing for agent workflows. Triggers on AI agent testing, LLM evaluation, or agent quality assurance requests.
npx skillsauth add a-organvm/a-i--skills agent-testing-patterns
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Test AI agent systems that use tools, make decisions, and produce non-deterministic outputs.
| Challenge | Cause | Strategy |
|-----------|-------|----------|
| Non-deterministic output | LLM randomness | Assert on structure, not exact text |
| Tool use sequences | Agent autonomy | Verify tool calls, not call order |
| Multi-turn state | Conversation context | Snapshot-based assertions |
| Cost | API calls | Mock LLM in unit tests |
| Latency | API round-trips | Parallel test execution |
| Flakiness | Model updates | Semantic assertions, not string matches |
        ╱╲
       ╱  ╲          E2E Agent Tests (few, expensive)
      ╱────╲         Full agent loop with real LLM
     ╱      ╲
    ╱────────╲       Integration Tests (moderate)
   ╱          ╲      Tool execution, state management
  ╱────────────╲
 ╱  Unit Tests  ╲    Tool implementations, parsers, validators
╱────────────────╲
import pytest
def test_file_read_tool():
    """Reading ``test.txt`` returns the expected content and a success flag."""
    outcome = FileReadTool().execute({"path": "test.txt"})

    assert outcome["content"] == "expected content"
    assert outcome["success"] is True
def test_file_read_tool_missing_file():
    """A nonexistent path yields a failure result whose error says "not found"."""
    request = {"path": "nonexistent.txt"}
    outcome = FileReadTool().execute(request)

    assert outcome["success"] is False
    # Case-insensitive check so the exact capitalisation of the message is free.
    assert "not found" in outcome["error"].lower()
def test_tool_input_validation():
    """Executing with no arguments raises a ValueError matching "path is required"."""
    reader = FileReadTool()
    no_args = {}
    with pytest.raises(ValueError, match="path is required"):
        reader.execute(no_args)
def test_parse_tool_call():
    """A well-formed JSON tool call parses into its tool name and arguments."""
    parsed = parse_tool_call('{"tool": "search", "args": {"query": "python"}}')

    assert parsed.tool == "search"
    assert parsed.args == {"query": "python"}
def test_parse_malformed_tool_call():
    """Input that is not JSON parses to ``None``."""
    garbage = "not json at all"
    assert parse_tool_call(garbage) is None
class MockLLMClient:
    """Scripted stand-in for an LLM client.

    Replays a fixed sequence of responses, one per ``generate`` call, and
    records the arguments of every call in ``calls`` so tests can assert on
    what the agent sent.
    """

    def __init__(self, responses: list[dict]):
        """Store the scripted responses.

        Args:
            responses: Response dicts handed back in order, one per call.
        """
        self.responses = iter(responses)
        self.calls: list[dict] = []

    async def generate(
        self,
        messages: list[dict],
        tools: "list[dict] | None" = None,
    ) -> dict:
        """Record the call and return the next scripted response.

        Args:
            messages: Conversation messages passed by the caller.
            tools: Tool definitions passed by the caller, if any.

        Returns:
            The next response dict from the script.

        Raises:
            RuntimeError: If every scripted response has already been used.
        """
        # Store shallow copies: callers commonly keep appending to the same
        # message list, which would otherwise rewrite earlier recorded calls.
        self.calls.append({
            "messages": list(messages),
            "tools": None if tools is None else list(tools),
        })
        try:
            return next(self.responses)
        except StopIteration:
            # A StopIteration escaping a coroutine is re-raised by Python as
            # an opaque "coroutine raised StopIteration" RuntimeError. Raise
            # the same exception type with a message that names the cause.
            raise RuntimeError(
                "MockLLMClient has no scripted response left "
                f"for call #{len(self.calls)}"
            ) from None
@pytest.fixture
def mock_agent():
    """Agent backed by a scripted LLM: one ``search`` tool call, then a text answer."""
    scripted_tool_call = {
        "content": None,
        "tool_calls": [{"name": "search", "args": {"query": "python packaging"}}],
    }
    scripted_answer = {
        "content": "Based on the search results, here's how to package Python...",
    }
    llm = MockLLMClient(responses=[scripted_tool_call, scripted_answer])
    return Agent(llm=llm, tools=[SearchTool(), FileReadTool()])
@pytest.mark.asyncio
async def test_agent_uses_search_then_responds(mock_agent):
    """The agent makes exactly one ``search`` call and then returns non-empty text."""
    reply = await mock_agent.run("How do I package a Python project?")

    # Exactly one tool call, and it is a search about python.
    assert len(mock_agent.tool_history) == 1
    only_call = mock_agent.tool_history[0]
    assert only_call.tool_name == "search"
    assert "python" in only_call.args["query"].lower()

    # A final answer came back and is not empty.
    assert reply.content is not None
    assert len(reply.content) > 0
@pytest.mark.asyncio
async def test_session_preserves_context():
    """Two plain turns on one agent leave four messages in its history."""
    # Script exactly one text reply per turn. The shared ``mock_agent``
    # fixture is scripted for a single tool-using turn (tool call + answer),
    # so a second turn on it would have no response left to replay, and its
    # tool-call turn would not match the "2 user + 2 assistant" count below.
    llm = MockLLMClient(responses=[
        {"content": "Nice to meet you, Alice."},
        {"content": "Your name is Alice."},
    ])
    agent = Agent(llm=llm, tools=[])

    await agent.run("My name is Alice")
    await agent.run("What's my name?")

    # Verify conversation history is maintained across turns.
    assert len(agent.messages) == 4  # 2 user + 2 assistant
@pytest.mark.e2e
@pytest.mark.asyncio
async def test_agent_creates_file(real_agent, tmp_path):
    """The agent writes a syntactically valid hello-world script to ``tmp_path``."""
    await real_agent.run(f"Create a Python hello world script at {tmp_path}/hello.py")

    # Judge the outcome on disk rather than the exact text produced.
    target = tmp_path / "hello.py"
    assert target.exists()

    source = target.read_text()
    assert "print" in source  # Must use print
    assert source.strip()  # Non-empty

    # compile() raises SyntaxError if the script is not valid Python.
    compile(source, "hello.py", "exec")
@pytest.mark.e2e
@pytest.mark.asyncio
async def test_agent_explains_concept(real_agent):
    """The explanation has a plausible length and mentions a related term."""
    reply = await real_agent.run("Explain what a circuit breaker pattern is in 2-3 sentences")

    # Semantic checks only: bounded length plus at least one expected term.
    answer = reply.content
    assert len(answer) > 50
    assert len(answer) < 1000

    lowered = answer.lower()
    related_terms = ["fault", "failure", "threshold", "open", "closed"]
    assert any(term in lowered for term in related_terms)
@dataclass
class AgentEvalResult:
    """Metrics recorded for a single agent evaluation case."""

    # Whether the case counted as completed.
    task_completed: bool
    # Number of tool calls recorded.
    tool_calls_count: int
    # Token total recorded.
    tokens_used: int
    # Elapsed time for the case, in milliseconds.
    latency_ms: float
    # Number of error recoveries recorded.
    error_recovery_count: int
async def evaluate_agent(agent, test_cases: list[dict]) -> list[AgentEvalResult]:
    """Run each test case through the agent and collect one result per case.

    Args:
        agent: Object exposing an awaitable ``run(prompt)`` plus the
            ``tool_history``, ``total_tokens`` and ``error_count`` attributes.
        test_cases: Dicts with a ``"prompt"`` string and a ``"validator"``
            callable that receives the agent's result.

    Returns:
        One ``AgentEvalResult`` per test case, in input order.

    Note:
        ``tool_history``, ``total_tokens`` and ``error_count`` are read from
        the agent as-is after each case.
        NOTE(review): if the agent accumulates these across runs, the recorded
        values are cumulative rather than per-case; confirm against Agent.
    """
    import logging  # local import so this function does not rely on a file-level one

    logger = logging.getLogger(__name__)
    results = []
    for case in test_cases:
        start = time.perf_counter()
        try:
            result = await agent.run(case["prompt"])
            # Coerce so a truthy non-bool validator result still matches the
            # declared ``task_completed: bool`` field.
            completed = bool(case["validator"](result))
        except Exception:
            # Evaluation is best-effort: a crashing case counts as failed, but
            # the traceback is logged so the cause is not silently lost.
            logger.exception("Agent evaluation case failed: %r", case.get("prompt"))
            completed = False
        # Latency covers the agent run and validation, including failed cases.
        latency = (time.perf_counter() - start) * 1000
        results.append(AgentEvalResult(
            task_completed=completed,
            tool_calls_count=len(agent.tool_history),
            tokens_used=agent.total_tokens,
            latency_ms=latency,
            error_recovery_count=agent.error_count,
        ))
    return results
def test_tool_call_format_regression():
    """Ensure tool call format hasn't changed."""
    # NOTE(review): ``agent`` is neither a parameter nor defined in this
    # function; presumably a module-level object. Confirm it exists.
    formatted = agent.format_tool_call("search", {"query": "test"})
    golden = load_golden("tool_call_format.json")
    assert formatted == golden
# Benchmark scenarios: the prompt to run, the tools the agent may use for it,
# and the token ceiling checked after the run.
BENCHMARK_CASES = [
    {
        "prompt": "List all Python files in the project",
        "expected_tools": ["glob"],
        "max_tokens": 500,
    },
    {
        "prompt": "Fix the syntax error in app.py",
        "expected_tools": ["read", "edit"],
        "max_tokens": 2000,
    },
]
async def run_benchmark(agent):
    """Run every benchmark case and assert on tools used and token total.

    For each case the agent may only have used the case's expected tools
    (plus ``"think"``), and its token total must not exceed ``max_tokens``.

    NOTE(review): ``tool_history`` and ``total_tokens`` are read straight
    from the agent after each run; if they accumulate across runs, later
    cases are checked against cumulative values. Confirm against Agent.
    """
    for spec in BENCHMARK_CASES:
        await agent.run(spec["prompt"])

        used = {call.tool_name for call in agent.tool_history}
        allowed = set(spec["expected_tools"] + ["think"])
        assert used <= allowed

        assert agent.total_tokens <= spec["max_tokens"]
development
Create algorithmic and generative art using mathematical patterns, noise functions, particle systems, and procedural generation. Covers flow fields, L-systems, fractals, and creative coding foundations. Triggers on generative art, algorithmic art, creative coding, procedural generation, or mathematical visualization requests.
development
Audits web applications and architectures for compliance with GDPR, CCPA, and other privacy regulations, focusing on consent, data minimization, and user rights.
development
Optimize Google Cloud Platform resource allocation and manage cloud credits efficiently. Use when planning GCP deployments, analyzing cloud spend, maximizing value from expiring credits, right-sizing instances, or designing cost-effective architectures. Triggers on GCP cost optimization, credit management, resource allocation planning, or cloud budget concerns.
testing
Designs engaging gameplay loops, economies, and progression systems, balancing challenge and reward for interactive experiences.