Production Test Harness Implementation
From Guardrails to a Complete Harness
The five guardrails define the boundaries. The production harness is the complete implementation that enforces them while providing a clean interface for agent execution.
The ConstrainedTestHarness Class
from dataclasses import dataclass
from typing import Optional
import time
@dataclass
class HarnessConfig:
    """Guardrail configuration shared by every test run in a harness.

    ``None`` for ``allowed_domains`` / ``allowed_actions`` means "no
    restriction": any domain may be navigated to, any action type taken.
    """

    max_steps: int = 30                 # hard cap on agent actions per test
    timeout_seconds: int = 300          # wall-clock budget per test
    max_tokens: int = 50_000            # LLM token budget per test
    # Fix: the original annotated these as plain ``list[str]`` with a
    # ``None`` default, which is not a valid value of that type.
    allowed_domains: Optional[list[str]] = None   # None = any domain
    allowed_actions: Optional[list[str]] = None   # None = any action type
    require_assertion: bool = True      # reject tests that never assert
    screenshot_on_failure: bool = True  # capture evidence on failure
class ConstrainedTestHarness:
    """Drive an agent toward a test objective while enforcing guardrails.

    Before every step the harness checks the step/time/token budgets
    (:meth:`check_guardrails`), and every proposed action is validated
    against the configured allow-lists (:meth:`is_action_allowed`) before
    execution. Any violation terminates the run with a terminal TestResult.
    """

    def __init__(self, agent, config: "HarnessConfig"):
        self.agent = agent
        self.config = config
        self.step_count = 0           # actions executed in the current run
        self.token_count = 0          # LLM tokens consumed in the current run
        self.start_time: Optional[float] = None   # set when execute() begins
        self.violations: list[str] = []           # blocked-action log

    def execute(self, test_objective: str) -> "TestResult":
        """Run the agent step-by-step until a terminal result or a tripped guardrail.

        NOTE(review): ``test_objective`` is never handed to the agent here;
        presumably the agent was primed with it at construction — confirm.
        """
        # Fix: reset per-run state so a reused harness instance (e.g. via a
        # retry wrapper) does not inherit stale counters that would trip the
        # guardrails spuriously.
        self.step_count = 0
        self.token_count = 0
        self.violations = []
        self.start_time = time.time()

        while True:
            # Check all guardrails before each step.
            violation = self.check_guardrails()
            if violation:
                return TestResult(
                    status="ABORTED",
                    reason=f"Guardrail violation: {violation}",
                    steps_taken=self.step_count,
                    violations=self.violations,
                )

            # Let the agent propose one step.
            action = self.agent.next_action()

            # Validate the action before execution.
            if not self.is_action_allowed(action):
                self.violations.append(f"Blocked action: {action}")
                return TestResult(
                    status="BLOCKED",
                    reason=f"Action not allowed: {action}",
                    steps_taken=self.step_count,
                    # Fix: include the violation log, as the ABORTED path does.
                    violations=self.violations,
                )

            # Execute and record.
            result = self.agent.execute_action(action)
            self.step_count += 1
            self.token_count += result.tokens_used

            if result.is_terminal:
                # A "completed" test that never asserted proves nothing.
                if self.config.require_assertion and not result.has_assertion:
                    return TestResult(
                        status="INVALID",
                        reason="Test completed without any assertion",
                        steps_taken=self.step_count,
                    )
                return result

    def check_guardrails(self) -> Optional[str]:
        """Return a human-readable violation message, or None if all budgets hold."""
        if self.step_count >= self.config.max_steps:
            return f"Max steps ({self.config.max_steps}) exceeded"
        elapsed = time.time() - self.start_time
        if elapsed > self.config.timeout_seconds:
            return f"Timeout ({self.config.timeout_seconds}s) exceeded"
        if self.token_count >= self.config.max_tokens:
            return f"Token budget ({self.config.max_tokens}) exhausted"
        return None

    def is_action_allowed(self, action) -> bool:
        """Check *action* against the allow-lists; ``None`` lists allow everything."""
        if self.config.allowed_actions:
            if action.type not in self.config.allowed_actions:
                return False
        # Domain allow-list only constrains navigation actions.
        if self.config.allowed_domains and action.type == "NAVIGATE":
            from urllib.parse import urlparse
            domain = urlparse(action.url).netloc
            if domain not in self.config.allowed_domains:
                return False
        return True
Running Multiple Tests with the Harness
In practice, you run many tests through the same harness. Each test gets its own step/token counters but shares the harness configuration.
class TestSuiteRunner:
    """Run a list of test objectives through fresh harness instances.

    Each objective gets a brand-new agent (isolation between tests);
    every test shares the same :class:`HarnessConfig`.
    """

    def __init__(self, agent_factory, config: HarnessConfig):
        self.agent_factory = agent_factory   # zero-arg callable -> fresh agent
        self.config = config

    def run_suite(self, objectives: list[str]) -> SuiteResult:
        """Execute each objective in order, stopping early on critical failures."""
        results = []
        total_tokens = 0
        suite_start = time.time()
        for objective in objectives:
            # Create a fresh agent for each test (isolation).
            agent = self.agent_factory()
            harness = ConstrainedTestHarness(agent, self.config)
            result = harness.execute(objective)
            results.append(result)
            total_tokens += harness.token_count
            # Early exit on critical failures: a tripped guardrail (ABORTED)
            # or a disallowed action (BLOCKED).
            # Fix: the original broke only on BLOCKED even though its comment
            # claimed guardrail violations stop the suite — ABORTED is the
            # guardrail-violation status.
            if result.status in ("BLOCKED", "ABORTED"):
                break
        return SuiteResult(
            tests=results,
            total_time=time.time() - suite_start,
            total_tokens=total_tokens,
            # NOTE(review): terminal agent results appear to use lowercase
            # "pass"/"fail" while harness-generated statuses are uppercase —
            # confirm the agent's status casing against its implementation.
            pass_count=sum(1 for r in results if r.status == "pass"),
            fail_count=sum(1 for r in results if r.status == "fail"),
            abort_count=sum(1 for r in results if r.status == "ABORTED"),
        )
Harness Observability
A production harness must be observable. Every action, decision, and guardrail check should be logged.
import logging
import json
class ObservableHarness(ConstrainedTestHarness):
    """A ConstrainedTestHarness that logs every run and records telemetry."""

    def __init__(self, agent, config, logger=None):
        super().__init__(agent, config)
        self.logger = logger or logging.getLogger("test_harness")
        self.telemetry = []   # one dict per executed test, in run order

    def execute(self, test_objective: str) -> TestResult:
        """Run the test via the parent harness, logging start/end and saving telemetry."""
        # Lazy %-style args avoid string formatting when the level is disabled.
        self.logger.info("Starting test: %s", test_objective)
        self.logger.info(
            "Config: max_steps=%s, timeout=%ss, token_budget=%s",
            self.config.max_steps,
            self.config.timeout_seconds,
            self.config.max_tokens,
        )
        result = super().execute(test_objective)
        # Fix: compute the duration once so the log line and the telemetry
        # record agree (the original called time.time() twice, which could
        # report slightly different durations).
        duration = time.time() - self.start_time
        self.logger.info(
            "Test completed: status=%s, steps=%s, tokens=%s, time=%.1fs",
            result.status,
            self.step_count,
            self.token_count,
            duration,
        )
        # Save telemetry for post-run analysis.
        self.telemetry.append({
            "objective": test_objective,
            "status": result.status,
            "steps": self.step_count,
            "tokens": self.token_count,
            "duration": duration,
            "violations": self.violations,
        })
        return result

    def save_telemetry(self, path: str):
        """Dump the accumulated telemetry records to *path* as pretty-printed JSON."""
        with open(path, "w") as f:
            json.dump(self.telemetry, f, indent=2)
Harness Patterns for Different Test Types
Exploratory Test Harness (loose guardrails)
# Exploratory profile: generous budgets and no restrictions — the agent may
# roam freely, and a run is valid even if it never makes an assertion.
exploratory_config = HarnessConfig(
    max_steps=50,
    timeout_seconds=600,      # 10-minute wall clock
    max_tokens=100_000,
    allowed_domains=None,     # no domain restriction
    allowed_actions=None,     # no action restriction
    require_assertion=False,  # pure exploration may not assert
    screenshot_on_failure=True,
)
CI Gate Harness (strict guardrails)
# CI gate profile: tight budgets, staging-only navigation, a minimal action
# vocabulary, and a mandatory assertion for every test.
ci_config = HarnessConfig(
    max_steps=20,
    timeout_seconds=120,  # 2-minute wall clock
    max_tokens=30_000,
    allowed_domains=["staging.myapp.com"],
    allowed_actions=["NAVIGATE", "CLICK", "TYPE", "ASSERT", "SCREENSHOT"],
    require_assertion=True,
    screenshot_on_failure=True,
)
Production Monitoring Harness (read-only)
# Production-monitoring profile: strictly read-only. Only navigation,
# assertions, and screenshots are permitted, so the agent can never mutate
# live state.
monitoring_config = HarnessConfig(
    max_steps=10,
    timeout_seconds=60,  # 1-minute wall clock
    max_tokens=10_000,
    allowed_domains=["app.mycompany.com"],
    allowed_actions=["NAVIGATE", "ASSERT", "SCREENSHOT"],  # no CLICK or TYPE
    require_assertion=True,
    screenshot_on_failure=True,
)
Error Recovery in the Harness
The harness should handle agent failures gracefully:
def execute_with_recovery(self, test_objective: str) -> TestResult:
    """Execute with automatic recovery from transient errors.

    Retries up to ``max_retries`` times on browser crashes and LLM
    timeouts; any other exception propagates to the caller.
    """
    max_retries = 2
    for attempt in range(max_retries + 1):
        try:
            return self.execute(test_objective)
        except BrowserCrashError:
            if attempt >= max_retries:
                return TestResult(
                    status="ERROR",
                    reason="Browser crashed after max retries",
                    # Fix: report steps_taken, as every other TestResult does.
                    steps_taken=self.step_count,
                )
            self.logger.warning(f"Browser crashed, retrying (attempt {attempt + 1})")
            self.agent.reset_browser()
            self._reset_run_state()
        except LLMTimeoutError:
            if attempt >= max_retries:
                return TestResult(
                    status="ERROR",
                    reason="LLM timeout after max retries",
                    # Fix: report steps_taken, as every other TestResult does.
                    steps_taken=self.step_count,
                )
            self.logger.warning(f"LLM timeout, retrying (attempt {attempt + 1})")
            # Fix: the original did not reset the counters on this path, so a
            # retry inherited the failed run's step/token counts and could
            # trip the guardrails spuriously.
            self._reset_run_state()

def _reset_run_state(self) -> None:
    """Zero the per-run budget counters so a retry starts with a clean slate.

    NOTE(review): ``self.violations`` is deliberately left intact so the
    final result still reports blocked actions from earlier attempts —
    confirm this is the desired behavior.
    """
    self.step_count = 0
    self.token_count = 0
Key Takeaway
The production harness is the bridge between the abstract guardrail concepts and a working system. It enforces constraints, provides observability, handles errors gracefully, and supports different profiles for different environments. Every team running agentic tests in CI should have a harness like this -- it turns unpredictable agents into reliable pipeline components.