Regex and Parsing
Regular expressions and data parsing are everyday tools for QA engineers. You parse log files to find error patterns, validate response formats, extract transaction IDs from text, and process configuration files. These skills turn raw text into actionable test data.
Regular Expressions for Log Parsing
import re

# Sample application log line: "<timestamp> <level> [<service>] <message>".
log_line = '2024-03-15 14:23:01 ERROR [PaymentService] Transaction TX-98712 failed: insufficient_funds'

# Extract transaction ID
match = re.search(r'TX-\d+', log_line)
tx_id = match.group()  # "TX-98712"

# Extract log level: whole-word match against the known severity keywords.
level = re.search(r'\b(DEBUG|INFO|WARN|ERROR|FATAL)\b', log_line).group()  # "ERROR"

# Extract timestamp ("YYYY-MM-DD HH:MM:SS")
timestamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', log_line).group()

# Parse multiple log lines: keep every line containing the whole word ERROR.
# NOTE: the comprehension must be indented inside the `with` block so the
# file handle is still open while it is being iterated.
with open("app.log") as f:
    errors = [line for line in f if re.search(r'\bERROR\b', line)]
print(f"Found {len(errors)} error lines")
Essential Regex Patterns for QA
| Pattern | Matches | Use Case |
|---|---|---|
| `\d{3}-\d{4}` | 123-4567 | Phone number fragments |
| `\b[A-Z]{2,}\b` | ERROR, WARN | Log level extraction |
| `"[^"]*"` | Any quoted string | Extract values from raw text |
| `\d{4}-\d{2}-\d{2}` | 2024-03-15 | ISO date extraction |
| `\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b` | email@example.com | Email validation |
| `\b\d{1,3}(\.\d{1,3}){3}\b` | 192.168.1.1 | IP address extraction |
| `TX-\d+` | TX-98712 | Transaction/order ID extraction |
| `v\d+\.\d+\.\d+` | v2.4.1 | Version number extraction |
| `[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}` | UUID format | UUID extraction |
Regex in Test Assertions
# Validate response format
import re
def test_user_id_format(api_response):
"""User IDs should be UUIDs."""
user_id = api_response.json()["id"]
uuid_pattern = r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'
assert re.match(uuid_pattern, user_id), f"Invalid UUID format: {user_id}"
def test_email_format(api_response):
"""Email should be valid format."""
email = api_response.json()["email"]
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
assert re.match(email_pattern, email), f"Invalid email format: {email}"
def test_error_message_no_stack_trace(error_response):
"""Error messages should not contain internal stack traces."""
body = error_response.text
assert not re.search(r'at .*\(.*:\d+:\d+\)', body), "Stack trace exposed in error"
assert not re.search(r'Traceback \(most recent call last\)', body), "Python traceback exposed"
assert not re.search(r'/usr/src/app', body), "Internal file paths exposed"
TypeScript Regex
test("API response contains valid ISO date", () => {
  const timestampValue = response.data.created_at;
  // ISO-8601 prefix: YYYY-MM-DDTHH:MM:SS (fractional seconds / zone may follow).
  expect(timestampValue).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}/);
});
test("Error response does not leak internals", () => {
  const serialized = JSON.stringify(errorResponse.data);
  // Neither stack-frame locations nor dependency paths should appear.
  expect(serialized).not.toMatch(/at .*\(.*:\d+:\d+\)/);
  expect(serialized).not.toMatch(/node_modules/);
});
Regex Building Blocks
| Symbol | Meaning | Example |
|---|---|---|
| `.` | Any character | `a.c` matches "abc", "a1c" |
| `\d` | Digit | `\d{3}` matches "123" |
| `\w` | Word character (letter, digit, underscore) | `\w+` matches "hello_world" |
| `\s` | Whitespace | `\s+` matches spaces, tabs, newlines |
| `\b` | Word boundary | `\berror\b` matches "error" but not "errors" |
| `^` | Start of string | `^Hello` matches "Hello world" |
| `$` | End of string | `world$` matches "Hello world" |
| `*` | Zero or more | `ab*c` matches "ac", "abc", "abbc" |
| `+` | One or more | `ab+c` matches "abc", "abbc" (not "ac") |
| `?` | Zero or one | `colou?r` matches "color" and "colour" |
| `{n,m}` | Between n and m | `\d{2,4}` matches "12", "123", "1234" |
| `[abc]` | Character class | `[aeiou]` matches any vowel |
| `[^abc]` | Negated class | `[^0-9]` matches non-digits |
| `(...)` | Capture group | `(\d+)-(\d+)` captures "123" and "456" from "123-456" |
| `(?:...)` | Non-capture group | Groups without capturing |
| `\1` | Back reference | Matches the same text as the first captured group |
JSON Parsing
JSON is the universal data format for APIs. Fluent JSON handling is essential.
import json

# Parse API response body into Python objects.
data = json.loads(response.text)
assert data["users"][0]["name"] == "Alice"

# Pretty print for debugging
print(json.dumps(data, indent=2))


def validate_user_structure(user: dict) -> None:
    """Assert that a user object carries the required keys with sane values.

    Raises:
        AssertionError: if a required key is missing or a value has the
            wrong type / is empty.
    """
    required = {"id", "name", "email", "created_at"}
    # set - dict_keys works directly, so the message lists exactly what is missing.
    assert required.issubset(user.keys()), f"Missing: {required - user.keys()}"
    assert isinstance(user["id"], int)
    assert isinstance(user["name"], str) and len(user["name"]) > 0
    assert "@" in user["email"]


for user in data["users"]:
    validate_user_structure(user)
Handling JSON Edge Cases
# Empty response body
if response.text:
data = response.json()
else:
data = None
# Non-JSON response (HTML error page)
try:
data = response.json()
except json.JSONDecodeError:
pytest.fail(f"Response is not JSON: {response.text[:200]}")
# Large numbers (precision loss)
# JSON spec does not define number precision. Some APIs return IDs as strings.
user_id = data["id"] # Could be int or string depending on API
YAML Parsing
YAML is common in configuration files, CI/CD pipelines, and test data.
import yaml  # PyYAML (third-party)

# Load test configuration
with open("config.yaml") as f:
    config = yaml.safe_load(f)

base_url = config["environments"]["staging"]["url"]
timeout = config["environments"]["staging"]["timeout"]

# Load test data from YAML
with open("test_users.yaml") as f:
    test_data = yaml.safe_load(f)

# test_users.yaml:
# users:
#   - email: admin@test.com
#     role: admin
#   - email: viewer@test.com
#     role: viewer
Always use yaml.safe_load(), not yaml.load(). The latter can execute arbitrary code.
CSV Parsing for Test Data
import csv

# Read test data from CSV; DictReader maps each row to {header: value}.
# Materialize the rows while the file is still open.
with open("test_cases.csv") as f:
    reader = csv.DictReader(f)
    test_data = list(reader)
# [{"email": "test1@test.com", "expected_status": "200"}, ...]


# Use in parametrized tests: one generated test per CSV row.
@pytest.mark.parametrize("row", test_data)
def test_from_csv(api_client, row):
    response = api_client.post("/auth/login", json={
        "email": row["email"], "password": row["password"]
    })
    # CSV values are always strings; convert before comparing to the int status.
    assert response.status_code == int(row["expected_status"])
Practical Exercise
- Write a regex that extracts all error-level log lines from a log file and groups them by service name (text in square brackets like `[PaymentService]`)
- Parse a JSON API response and validate that all items have required fields, correct types, and no sensitive data
- Write a function that converts a HAR file (JSON format) into a summary: total requests, average response time, and error count
- Create a YAML-based test data file and write a pytest parametrize test that reads from it
Key Takeaways
- Regex is essential for log parsing, format validation, and data extraction
- Know the core patterns: `\d`, `\w`, `\b`, `+`, `*`, `?`, `()`, `[]`
- Always use `yaml.safe_load()` to prevent code execution
- JSON parsing: handle edge cases (empty body, non-JSON response, large numbers)
- Combine regex with parsing for powerful test data processing