Production Test Harness Implementation
From Guardrails to a Complete Harness
The five guardrails define the boundaries. The production harness is the complete implementation that enforces them while providing a clean interface for agent execution.
The ConstrainedTestHarness Class
from dataclasses import dataclass
from typing import Optional
import time
@dataclass
class HarnessConfig:
    """Guardrail configuration shared by every test run in a harness.

    ``None`` for ``allowed_domains`` / ``allowed_actions`` means "no
    restriction": any domain may be navigated to, any action type taken.
    """

    max_steps: int = 30                 # hard cap on agent actions per test
    timeout_seconds: int = 300          # wall-clock budget per test
    max_tokens: int = 50_000            # LLM token budget per test
    # Fix: the original annotated these as plain ``list[str]`` with a
    # ``None`` default, which is not a valid value of that type.
    allowed_domains: Optional[list[str]] = None   # None = any domain
    allowed_actions: Optional[list[str]] = None   # None = any action type
    require_assertion: bool = True      # reject tests that never assert
    screenshot_on_failure: bool = True  # capture evidence on failure
class ConstrainedTestHarness:
    """Drive an agent toward a test objective while enforcing guardrails.

    Before every step the harness checks the step/time/token budgets
    (:meth:`check_guardrails`), and every proposed action is validated
    against the configured allow-lists (:meth:`is_action_allowed`) before
    execution. Any violation terminates the run with a terminal TestResult.
    """

    def __init__(self, agent, config: "HarnessConfig"):
        self.agent = agent
        self.config = config
        self.step_count = 0           # actions executed in the current run
        self.token_count = 0          # LLM tokens consumed in the current run
        self.start_time: Optional[float] = None   # set when execute() begins
        self.violations: list[str] = []           # blocked-action log

    def execute(self, test_objective: str) -> "TestResult":
        """Run the agent step-by-step until a terminal result or a tripped guardrail.

        NOTE(review): ``test_objective`` is never handed to the agent here;
        presumably the agent was primed with it at construction — confirm.
        """
        # Fix: reset per-run state so a reused harness instance (e.g. via a
        # retry wrapper) does not inherit stale counters that would trip the
        # guardrails spuriously.
        self.step_count = 0
        self.token_count = 0
        self.violations = []
        self.start_time = time.time()

        while True:
            # Check all guardrails before each step.
            violation = self.check_guardrails()
            if violation:
                return TestResult(
                    status="ABORTED",
                    reason=f"Guardrail violation: {violation}",
                    steps_taken=self.step_count,
                    violations=self.violations,
                )

            # Let the agent propose one step.
            action = self.agent.next_action()

            # Validate the action before execution.
            if not self.is_action_allowed(action):
                self.violations.append(f"Blocked action: {action}")
                return TestResult(
                    status="BLOCKED",
                    reason=f"Action not allowed: {action}",
                    steps_taken=self.step_count,
                    # Fix: include the violation log, as the ABORTED path does.
                    violations=self.violations,
                )

            # Execute and record.
            result = self.agent.execute_action(action)
            self.step_count += 1
            self.token_count += result.tokens_used

            if result.is_terminal:
                # A "completed" test that never asserted proves nothing.
                if self.config.require_assertion and not result.has_assertion:
                    return TestResult(
                        status="INVALID",
                        reason="Test completed without any assertion",
                        steps_taken=self.step_count,
                    )
                return result

    def check_guardrails(self) -> Optional[str]:
        """Return a human-readable violation message, or None if all budgets hold."""
        if self.step_count >= self.config.max_steps:
            return f"Max steps ({self.config.max_steps}) exceeded"
        elapsed = time.time() - self.start_time
        if elapsed > self.config.timeout_seconds:
            return f"Timeout ({self.config.timeout_seconds}s) exceeded"
        if self.token_count >= self.config.max_tokens:
            return f"Token budget ({self.config.max_tokens}) exhausted"
        return None

    def is_action_allowed(self, action) -> bool:
        """Check *action* against the allow-lists; ``None`` lists allow everything."""
        if self.config.allowed_actions:
            if action.type not in self.config.allowed_actions:
                return False
        # Domain allow-list only constrains navigation actions.
        if self.config.allowed_domains and action.type == "NAVIGATE":
            from urllib.parse import urlparse
            domain = urlparse(action.url).netloc
            if domain not in self.config.allowed_domains:
                return False
        return True
Running Multiple Tests with the Harness
In practice, you run many tests through the same harness. Each test gets its own step/token counters but shares the harness configuration.
class TestSuiteRunner:
    """Run a list of test objectives through fresh harness instances.

    Each objective gets a brand-new agent (isolation between tests);
    every test shares the same :class:`HarnessConfig`.
    """

    def __init__(self, agent_factory, config: HarnessConfig):
        self.agent_factory = agent_factory   # zero-arg callable -> fresh agent
        self.config = config

    def run_suite(self, objectives: list[str]) -> SuiteResult:
        """Execute each objective in order, stopping early on critical failures."""
        results = []
        total_tokens = 0
        suite_start = time.time()
        for objective in objectives:
            # Create a fresh agent for each test (isolation).
            agent = self.agent_factory()
            harness = ConstrainedTestHarness(agent, self.config)
            result = harness.execute(objective)
            results.append(result)
            total_tokens += harness.token_count
            # Early exit on critical failures: a tripped guardrail (ABORTED)
            # or a disallowed action (BLOCKED).
            # Fix: the original broke only on BLOCKED even though its comment
            # claimed guardrail violations stop the suite — ABORTED is the
            # guardrail-violation status.
            if result.status in ("BLOCKED", "ABORTED"):
                break
        return SuiteResult(
            tests=results,
            total_time=time.time() - suite_start,
            total_tokens=total_tokens,
            # NOTE(review): terminal agent results appear to use lowercase
            # "pass"/"fail" while harness-generated statuses are uppercase —
            # confirm the agent's status casing against its implementation.
            pass_count=sum(1 for r in results if r.status == "pass"),
            fail_count=sum(1 for r in results if r.status == "fail"),
            abort_count=sum(1 for r in results if r.status == "ABORTED"),
        )
Harness Observability
A production harness must be observable. Every action, decision, and guardrail check should be logged.
import logging
import json
class ObservableHarness(ConstrainedTestHarness):
    """A ConstrainedTestHarness that logs every run and records telemetry."""

    def __init__(self, agent, config, logger=None):
        super().__init__(agent, config)
        self.logger = logger or logging.getLogger("test_harness")
        self.telemetry = []   # one dict per executed test, in run order

    def execute(self, test_objective: str) -> TestResult:
        """Run the test via the parent harness, logging start/end and saving telemetry."""
        # Lazy %-style args avoid string formatting when the level is disabled.
        self.logger.info("Starting test: %s", test_objective)
        self.logger.info(
            "Config: max_steps=%s, timeout=%ss, token_budget=%s",
            self.config.max_steps,
            self.config.timeout_seconds,
            self.config.max_tokens,
        )
        result = super().execute(test_objective)
        # Fix: compute the duration once so the log line and the telemetry
        # record agree (the original called time.time() twice, which could
        # report slightly different durations).
        duration = time.time() - self.start_time
        self.logger.info(
            "Test completed: status=%s, steps=%s, tokens=%s, time=%.1fs",
            result.status,
            self.step_count,
            self.token_count,
            duration,
        )
        # Save telemetry for post-run analysis.
        self.telemetry.append({
            "objective": test_objective,
            "status": result.status,
            "steps": self.step_count,
            "tokens": self.token_count,
            "duration": duration,
            "violations": self.violations,
        })
        return result

    def save_telemetry(self, path: str):
        """Dump the accumulated telemetry records to *path* as pretty-printed JSON."""
        with open(path, "w") as f:
            json.dump(self.telemetry, f, indent=2)
Harness Patterns for Different Test Types
Exploratory Test Harness (loose guardrails)
# Exploratory profile: generous budgets and no restrictions — the agent may
# roam freely, and a run is valid even if it never makes an assertion.
exploratory_config = HarnessConfig(
    max_steps=50,
    timeout_seconds=600,      # 10-minute wall clock
    max_tokens=100_000,
    allowed_domains=None,     # no domain restriction
    allowed_actions=None,     # no action restriction
    require_assertion=False,  # pure exploration may not assert
    screenshot_on_failure=True,
)
CI Gate Harness (strict guardrails)
# CI gate profile: tight budgets, staging-only navigation, a minimal action
# vocabulary, and a mandatory assertion for every test.
ci_config = HarnessConfig(
    max_steps=20,
    timeout_seconds=120,  # 2-minute wall clock
    max_tokens=30_000,
    allowed_domains=["staging.myapp.com"],
    allowed_actions=["NAVIGATE", "CLICK", "TYPE", "ASSERT", "SCREENSHOT"],
    require_assertion=True,
    screenshot_on_failure=True,
)
Production Monitoring Harness (read-only)
# Production-monitoring profile: strictly read-only. Only navigation,
# assertions, and screenshots are permitted, so the agent can never mutate
# live state.
monitoring_config = HarnessConfig(
    max_steps=10,
    timeout_seconds=60,  # 1-minute wall clock
    max_tokens=10_000,
    allowed_domains=["app.mycompany.com"],
    allowed_actions=["NAVIGATE", "ASSERT", "SCREENSHOT"],  # no CLICK or TYPE
    require_assertion=True,
    screenshot_on_failure=True,
)
Error Recovery in the Harness
The harness should handle agent failures gracefully:
def execute_with_recovery(self, test_objective: str) -> TestResult:
    """Execute with automatic recovery from transient errors.

    Retries up to ``max_retries`` times on browser crashes and LLM
    timeouts; any other exception propagates to the caller.
    """
    max_retries = 2
    for attempt in range(max_retries + 1):
        try:
            return self.execute(test_objective)
        except BrowserCrashError:
            if attempt >= max_retries:
                return TestResult(
                    status="ERROR",
                    reason="Browser crashed after max retries",
                    # Fix: report steps_taken, as every other TestResult does.
                    steps_taken=self.step_count,
                )
            self.logger.warning(f"Browser crashed, retrying (attempt {attempt + 1})")
            self.agent.reset_browser()
            self._reset_run_state()
        except LLMTimeoutError:
            if attempt >= max_retries:
                return TestResult(
                    status="ERROR",
                    reason="LLM timeout after max retries",
                    # Fix: report steps_taken, as every other TestResult does.
                    steps_taken=self.step_count,
                )
            self.logger.warning(f"LLM timeout, retrying (attempt {attempt + 1})")
            # Fix: the original did not reset the counters on this path, so a
            # retry inherited the failed run's step/token counts and could
            # trip the guardrails spuriously.
            self._reset_run_state()

def _reset_run_state(self) -> None:
    """Zero the per-run budget counters so a retry starts with a clean slate.

    NOTE(review): ``self.violations`` is deliberately left intact so the
    final result still reports blocked actions from earlier attempts —
    confirm this is the desired behavior.
    """
    self.step_count = 0
    self.token_count = 0
Key Takeaway
The production harness is the bridge between the abstract guardrail concepts and a working system. It enforces constraints, provides observability, handles errors gracefully, and supports different profiles for different environments. Every team running agentic tests in CI should have a harness like this -- it turns unpredictable agents into reliable pipeline components.