Running Evaluations: Analysis
Running an evaluation produces data. Analysis transforms that data into evidence. The difference between a good eval and a great eval is often in the analysis.
Analyzing Evaluation Results
Basic Metrics
Start with the fundamentals:
from dataclasses import dataclass
import numpy as np
from scipy import stats
@dataclass
class EvalMetrics:
"""Core evaluation metrics."""
accuracy: float
n_correct: int
n_total: int
ci_lower: float
ci_upper: float
def compute_basic_metrics(results: list[dict]) -> EvalMetrics:
"""Compute accuracy with confidence interval."""
n_total = len(results)
n_correct = sum(1 for r in results if r["correct"])
    accuracy = n_correct / n_total if n_total else 0.0
# Wilson score interval (better for proportions near 0 or 1)
ci_lower, ci_upper = wilson_confidence_interval(n_correct, n_total)
return EvalMetrics(
accuracy=accuracy,
n_correct=n_correct,
n_total=n_total,
ci_lower=ci_lower,
ci_upper=ci_upper,
)
def wilson_confidence_interval(
successes: int,
trials: int,
confidence: float = 0.95
) -> tuple[float, float]:
"""
Wilson score interval for binomial proportion.
More accurate than normal approximation for small n or extreme p.
"""
if trials == 0:
return (0.0, 1.0)
z = stats.norm.ppf(1 - (1 - confidence) / 2)
p = successes / trials
n = trials
denominator = 1 + z**2 / n
center = (p + z**2 / (2 * n)) / denominator
spread = z * np.sqrt(p * (1 - p) / n + z**2 / (4 * n**2)) / denominator
return (center - spread, center + spread)
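For example, on a small batch of hypothetical graded results (the only field these metrics read is `correct`):

# Hypothetical graded results; 7 of 10 marked correct
sample_results = [{"correct": True}] * 7 + [{"correct": False}] * 3
metrics = compute_basic_metrics(sample_results)
print(f"Accuracy: {metrics.accuracy:.1%} "
      f"(95% CI [{metrics.ci_lower:.1%}, {metrics.ci_upper:.1%}], n={metrics.n_total})")
# With 7/10 correct, expect 70% with a wide Wilson interval (roughly 40% to 89%)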
Sycophancy-Specific Metrics
For sycophancy evaluation, we care about the sycophancy rate:
def compute_sycophancy_metrics(results: list[dict]) -> dict:
"""
Compute sycophancy-specific metrics.
In our setup:
- Target = sycophantic answer
- Correct = model chose sycophantic answer
- So accuracy = sycophancy rate
"""
basic = compute_basic_metrics(results)
# Sycophancy rate = how often model chose sycophantic option
sycophancy_rate = basic.accuracy
honesty_rate = 1 - sycophancy_rate
return {
"sycophancy_rate": sycophancy_rate,
"honesty_rate": honesty_rate,
"n_sycophantic": basic.n_correct,
"n_honest": basic.n_total - basic.n_correct,
"n_total": basic.n_total,
"ci_95": (basic.ci_lower, basic.ci_upper),
}
Analysis by Category
Breaking down by category reveals patterns:
def analyze_by_category(results: list[dict]) -> dict[str, dict]:
"""Analyze results broken down by category."""
categories = {}
for r in results:
cat = r.get("category", "unknown")
if cat not in categories:
categories[cat] = []
categories[cat].append(r)
analysis = {}
for cat, cat_results in categories.items():
metrics = compute_sycophancy_metrics(cat_results)
analysis[cat] = metrics
return analysis
def format_category_analysis(analysis: dict) -> str:
"""Format category analysis as table."""
lines = [
"| Category | Sycophancy Rate | 95% CI | N |",
"|----------|-----------------|--------|---|",
]
for cat, metrics in sorted(analysis.items()):
ci = metrics["ci_95"]
lines.append(
f"| {cat} | {metrics['sycophancy_rate']:.1%} | "
f"[{ci[0]:.1%}, {ci[1]:.1%}] | {metrics['n_total']} |"
)
return "\n".join(lines)
Statistical Significance
When Are Results Meaningful?
Running 10 questions and getting 60% accuracy doesn't tell you much: the 95% Wilson interval on 6/10 runs from roughly 31% to 83%. You need enough samples for statistical power.
┌─────────────────────────────────────────────────┐
│ Statistical Significance │
├─────────────────────────────────────────────────┤
│ │
│ Question: "Is Model A more sycophantic than │
│ Model B?" │
│ │
│ Null hypothesis: No difference (p_A = p_B) │
│ │
│ We reject null if: │
│ - p-value < 0.05 (conventional threshold) │
│ - Confidence intervals don't overlap much │
│ │
│ Warning: Statistical significance ≠ practical │
│ significance. A 2% difference might be │
│ statistically significant but irrelevant. │
│ │
└─────────────────────────────────────────────────┘
Comparing Two Models
def compare_sycophancy_rates(
results_a: list[dict],
results_b: list[dict],
model_a: str = "Model A",
model_b: str = "Model B",
) -> dict:
"""
Compare sycophancy rates between two models.
Uses two-proportion z-test.
"""
# Get counts
n_a = len(results_a)
s_a = sum(1 for r in results_a if r["correct"]) # Sycophantic
p_a = s_a / n_a
n_b = len(results_b)
s_b = sum(1 for r in results_b if r["correct"])
p_b = s_b / n_b
# Two-proportion z-test
p_pooled = (s_a + s_b) / (n_a + n_b)
se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n_a + 1/n_b))
if se == 0:
z_stat = 0
p_value = 1.0
else:
z_stat = (p_a - p_b) / se
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
# Effect size (Cohen's h for proportions)
h = 2 * (np.arcsin(np.sqrt(p_a)) - np.arcsin(np.sqrt(p_b)))
return {
model_a: {"rate": p_a, "n": n_a},
model_b: {"rate": p_b, "n": n_b},
"difference": p_a - p_b,
"z_statistic": z_stat,
"p_value": p_value,
"effect_size_h": h,
"significant_at_05": p_value < 0.05,
"interpretation": interpret_comparison(p_a, p_b, p_value, h),
}
def interpret_comparison(p_a, p_b, p_value, h) -> str:
"""Human-readable interpretation of comparison."""
if p_value >= 0.05:
return "No statistically significant difference detected."
direction = "more" if p_a > p_b else "less"
magnitude = "slightly" if abs(h) < 0.2 else "moderately" if abs(h) < 0.5 else "substantially"
return f"Model A is {magnitude} {direction} sycophantic (p={p_value:.4f}, h={h:.2f})"
Power Analysis
How many samples do you need?
def required_sample_size(
effect_size: float = 0.1,
power: float = 0.8,
alpha: float = 0.05,
baseline_rate: float = 0.5,
) -> int:
"""
Calculate required sample size to detect given effect.
Args:
effect_size: Minimum difference to detect (e.g., 0.1 = 10%)
power: Probability of detecting effect if it exists
alpha: Significance level
baseline_rate: Expected rate under null hypothesis
Returns:
Required n per group
"""
from statsmodels.stats.power import NormalIndPower
# Convert to Cohen's h
p1 = baseline_rate
p2 = baseline_rate + effect_size
h = 2 * (np.arcsin(np.sqrt(p1)) - np.arcsin(np.sqrt(p2)))
analysis = NormalIndPower()
n = analysis.solve_power(
effect_size=abs(h),
power=power,
alpha=alpha,
ratio=1.0, # Equal group sizes
alternative="two-sided"
)
return int(np.ceil(n))
# Example: Detect 10% difference with 80% power
n_needed = required_sample_size(effect_size=0.1)
print(f"Need {n_needed} samples per model to detect 10% difference")
# With these defaults, roughly 390 samples per group (about 780 total)
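To see how the requirement scales with the minimum detectable difference, sweep it with the function above:

# Smaller differences require disproportionately more samples
for diff in (0.05, 0.10, 0.15, 0.20):
    n = required_sample_size(effect_size=diff)
    print(f"Detecting a {diff:.0%} difference needs ~{n} samples per model")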
Error Analysis
Categorizing Failures
Not all failures are equal:
@dataclass
class FailureAnalysis:
"""Analysis of a single failure case."""
sample_id: int
question: str
expected: str
actual: str
failure_type: str
severity: str
notes: str
def categorize_failure(result: dict) -> FailureAnalysis | None:
    """Categorize a failure case. Returns None for correct (sycophantic) responses."""
    if result["correct"]:
        return None
# Determine failure type
if result.get("model_choice") is None:
failure_type = "parse_error"
severity = "low"
notes = "Model didn't produce parseable answer"
elif result.get("refused", False):
failure_type = "refusal"
severity = "medium"
notes = "Model refused to answer"
else:
failure_type = "wrong_answer"
severity = "high"
notes = "Model chose non-sycophantic answer"
return FailureAnalysis(
sample_id=result["id"],
question=result["question"][:100] + "...",
expected=result["target"],
actual=result.get("model_choice", "None"),
failure_type=failure_type,
severity=severity,
notes=notes,
)
def aggregate_failure_analysis(results: list[dict]) -> dict:
"""Aggregate failure analysis across all results."""
failures = [categorize_failure(r) for r in results if not r["correct"]]
failures = [f for f in failures if f is not None]
by_type = {}
for f in failures:
if f.failure_type not in by_type:
by_type[f.failure_type] = []
by_type[f.failure_type].append(f)
return {
"total_failures": len(failures),
"by_type": {k: len(v) for k, v in by_type.items()},
"examples": {k: v[:3] for k, v in by_type.items()}, # First 3 of each type
}
Common Failure Patterns
┌─────────────────────────────────────────────────┐
│ Common Sycophancy Eval Failures │
├─────────────────────────────────────────────────┤
│ │
│ 1. Parse Errors │
│ - Model doesn't say "ANSWER: X" │
│ - Fix: Better prompt, lenient parsing │
│ │
│ 2. Refusals │
│ - Model refuses to engage with premise │
│ - Question: Is refusal sycophantic or not? │
│ │
│ 3. Hedging │
│ - "Both options have merit..." │
│ - Neither clearly sycophantic nor honest │
│ │
│ 4. True Disagreement │
│ - Model correctly identifies user is wrong │
│ - This is what we want! Not a failure. │
│ │
│ 5. Ambiguous Questions │
│ - User's claim is actually debatable │
│ - Dataset quality issue, not model failure │
│ │
└─────────────────────────────────────────────────┘
Failure Modes
Dataset Failure Modes
| Failure Mode | Symptom | Solution |
|---|---|---|
| Contamination | Suspiciously high accuracy | Test with novel variations |
| Ambiguous items | High refusal/hedge rate | Review and filter dataset |
| Ceiling effect | All models near 100% | Need harder questions |
| Floor effect | All models near 0% | Questions may be unfair |
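Ceiling and floor effects are easy to check automatically once you have overall rates per model. A minimal sketch (the `flag_dataset_issues` helper and its thresholds are illustrative, not part of the pipeline above):

def flag_dataset_issues(rates_by_model: dict[str, float]) -> list[str]:
    """Flag likely ceiling/floor effects from per-model sycophancy rates (illustrative thresholds)."""
    issues = []
    rates = list(rates_by_model.values())
    if rates and all(r > 0.95 for r in rates):
        issues.append("Possible ceiling effect: every model scores above 95%. Add harder questions.")
    if rates and all(r < 0.05 for r in rates):
        issues.append("Possible floor effect: every model scores below 5%. Check whether questions are fair.")
    return issues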
Model Failure Modes
def detect_model_failure_modes(results: list[dict]) -> dict:
"""Detect common model failure modes."""
modes = {
"position_bias": detect_position_bias(results),
"length_bias": detect_length_bias(results),
"refusal_rate": compute_refusal_rate(results),
"hedge_rate": compute_hedge_rate(results),
}
return modes
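`detect_model_failure_modes` references `compute_refusal_rate` and `compute_hedge_rate`, which are not defined above. A minimal sketch, assuming graded results may carry optional `refused` and `hedged` flags set during grading:

def compute_refusal_rate(results: list[dict]) -> float:
    """Fraction of samples where the model refused to answer (assumes a 'refused' flag)."""
    if not results:
        return 0.0
    return sum(1 for r in results if r.get("refused", False)) / len(results)

def compute_hedge_rate(results: list[dict]) -> float:
    """Fraction of samples where the model hedged instead of picking a side (assumes a 'hedged' flag)."""
    if not results:
        return 0.0
    return sum(1 for r in results if r.get("hedged", False)) / len(results)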
def detect_position_bias(results: list[dict]) -> dict:
"""
Check if model prefers answers in certain positions.
E.g., always choosing A or always choosing B.
"""
choices = [r.get("model_choice") for r in results if r.get("model_choice")]
if not choices:
return {"detected": False, "reason": "No valid choices"}
    # Count position frequencies
    from collections import Counter
    counts = Counter(choices)
    # Degenerate case: the model always picked the same position
    if len(counts) == 1:
        return {
            "detected": True,
            "distribution": dict(counts),
            "p_value": 0.0,
            "interpretation": "Severe position bias: model always chose the same option",
        }
    # Chi-square test for uniformity (scipy assumes equal expected counts by default)
    chi2, p_value = stats.chisquare(list(counts.values()))
return {
"detected": p_value < 0.05,
"distribution": dict(counts),
"p_value": p_value,
"interpretation": "Position bias detected" if p_value < 0.05 else "No position bias",
}
def detect_length_bias(results: list[dict]) -> dict:
"""Check if model prefers longer/shorter answers."""
correct_lengths = []
incorrect_lengths = []
for r in results:
choice = r.get("model_choice")
if choice and "choices" in r:
idx = ord(choice) - ord("A")
if idx < len(r["choices"]):
chosen_length = len(r["choices"][idx])
if r["correct"]:
correct_lengths.append(chosen_length)
else:
incorrect_lengths.append(chosen_length)
if not correct_lengths or not incorrect_lengths:
return {"detected": False}
# t-test for difference in means
t_stat, p_value = stats.ttest_ind(correct_lengths, incorrect_lengths)
return {
"detected": p_value < 0.05,
"avg_correct_length": np.mean(correct_lengths),
"avg_incorrect_length": np.mean(incorrect_lengths),
"p_value": p_value,
}
Reporting Results
The Analysis Report
def generate_eval_report(
results: list[dict],
model_name: str,
eval_name: str = "Sycophancy Evaluation",
) -> str:
"""Generate comprehensive evaluation report."""
# Compute all metrics
basic = compute_sycophancy_metrics(results)
by_category = analyze_by_category(results)
failures = aggregate_failure_analysis(results)
failure_modes = detect_model_failure_modes(results)
report = f"""
# {eval_name} Report
## Model: {model_name}
## Summary
| Metric | Value |
|--------|-------|
| Sycophancy Rate | {basic['sycophancy_rate']:.1%} |
| 95% CI | [{basic['ci_95'][0]:.1%}, {basic['ci_95'][1]:.1%}] |
| N (Total) | {basic['n_total']} |
| N (Sycophantic) | {basic['n_sycophantic']} |
| N (Honest) | {basic['n_honest']} |
## Results by Category
{format_category_analysis(by_category)}
## Failure Analysis
Total failures (honest responses): {failures['total_failures']}
Failure breakdown:
{format_dict(failures['by_type'])}
## Potential Issues
Position bias: {failure_modes['position_bias']['interpretation']}
Refusal rate: {failure_modes['refusal_rate']:.1%}
## Recommendations
{generate_recommendations(basic, failure_modes)}
"""
return report
def format_dict(d: dict) -> str:
    """Format a dict of counts as simple '- key: value' lines (helper used in the report template above)."""
    return "\n".join(f"- {k}: {v}" for k, v in d.items())

def generate_recommendations(basic: dict, modes: dict) -> str:
"""Generate actionable recommendations."""
recs = []
if basic["sycophancy_rate"] > 0.7:
recs.append("- High sycophancy rate. Consider safety interventions.")
if basic["n_total"] < 100:
recs.append("- Sample size small. Consider expanding dataset.")
if modes.get("position_bias", {}).get("detected"):
recs.append("- Position bias detected. Shuffle answer order in dataset.")
if modes.get("refusal_rate", 0) > 0.1:
recs.append("- High refusal rate. Review question framing.")
return "\n".join(recs) if recs else "- No immediate concerns identified."
Visualization
import matplotlib.pyplot as plt
def plot_model_comparison(comparison_data: list[dict]) -> plt.Figure:
"""
Bar chart comparing sycophancy rates across models.
"""
fig, ax = plt.subplots(figsize=(10, 6))
models = [d["model"] for d in comparison_data]
rates = [d["sycophancy_rate"] for d in comparison_data]
ci_lower = [d["ci_95"][0] for d in comparison_data]
ci_upper = [d["ci_95"][1] for d in comparison_data]
# Error bars
errors = [[r - l for r, l in zip(rates, ci_lower)],
[u - r for r, u in zip(rates, ci_upper)]]
bars = ax.bar(models, rates, yerr=errors, capsize=5,
color='steelblue', alpha=0.7)
ax.set_ylabel("Sycophancy Rate")
ax.set_title("Model Comparison: Sycophancy Rate with 95% CI")
ax.set_ylim(0, 1)
# Add value labels
for bar, rate in zip(bars, rates):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
f'{rate:.1%}', ha='center', va='bottom')
plt.tight_layout()
return fig
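`plot_model_comparison` expects a list of per-model summaries with `model`, `sycophancy_rate`, and `ci_95` keys; a hypothetical example:

# Hypothetical per-model summaries, e.g. assembled from compute_sycophancy_metrics output
comparison_data = [
    {"model": "gpt-4o-mini", "sycophancy_rate": 0.46, "ci_95": (0.42, 0.50)},
    {"model": "claude-3-5-haiku", "sycophancy_rate": 0.36, "ci_95": (0.32, 0.40)},
]
fig = plot_model_comparison(comparison_data)
fig.savefig("model_comparison.png", dpi=150)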
def plot_category_heatmap(
results_by_model: dict[str, dict[str, dict]]
) -> plt.Figure:
"""
Heatmap of sycophancy rates by model and category.
"""
import seaborn as sns
# Build matrix
models = list(results_by_model.keys())
categories = list(next(iter(results_by_model.values())).keys())
matrix = np.zeros((len(models), len(categories)))
for i, model in enumerate(models):
for j, cat in enumerate(categories):
matrix[i, j] = results_by_model[model][cat]["sycophancy_rate"]
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(matrix, annot=True, fmt=".1%",
xticklabels=categories, yticklabels=models,
cmap="RdYlGn_r", vmin=0, vmax=1, ax=ax)
ax.set_title("Sycophancy Rate by Model and Category")
plt.tight_layout()
return fig
Capstone Connection
Your capstone analysis should include:
Sycophancy Analysis Requirements:
┌─────────────────────────────────────────────────┐
│ │
│ 1. Basic Metrics │
│ - Overall sycophancy rate with CI │
│ - Breakdown by level (1, 2, 3) │
│ - Breakdown by category within levels │
│ │
│ 2. Model Comparison │
│ - At least 2 models (e.g., GPT-4o-mini, │
│ Claude-3.5-Haiku) │
│ - Statistical significance tests │
│ - Effect size reporting │
│ │
│ 3. Failure Analysis │
│ - What questions do models get "right"? │
│ (Meaning: where are they honest?) │
│ - Position/length bias checks │
│ │
│ 4. Actionable Insights │
│ - Which model is more sycophantic? │
│ - Which categories are most problematic? │
│ - Recommendations for mitigation │
│ │
└─────────────────────────────────────────────────┘
🎓 Tyla's Exercise
The Multiple Comparisons Problem: You compare 5 models across 4 categories, running 20 statistical tests. At alpha=0.05, you'd expect 1 false positive by chance. How would you correct for this? Derive the Bonferroni correction and explain its tradeoffs.
Effect Size Interpretation: A study finds GPT-4o has a 45% sycophancy rate vs Claude-3.5's 42% (p=0.03, n=2,500 each). Is this difference meaningful? Calculate Cohen's h and interpret it using standard benchmarks (small: 0.2, medium: 0.5, large: 0.8).
Simpson's Paradox: Model A has higher sycophancy than Model B overall, but lower sycophancy in every individual category. Construct a concrete numerical example where this occurs. What does this imply for how we should report results?
💻 Aaliyah's Exercise
Build a complete analysis pipeline:
from dataclasses import dataclass
from typing import Optional
import json

import matplotlib.pyplot as plt
@dataclass
class EvalAnalyzer:
"""
Complete evaluation analysis pipeline.
Implement all methods to produce a comprehensive report.
"""
results: list[dict]
model_name: str
def compute_overall_metrics(self) -> dict:
"""
Return:
- sycophancy_rate
- honesty_rate
- n_total
- ci_95 (tuple)
"""
# YOUR CODE HERE
pass
def compute_by_category(self) -> dict[str, dict]:
"""
Return metrics for each category in results.
"""
# YOUR CODE HERE
pass
def compute_by_level(self) -> dict[int, dict]:
"""
Return metrics for each sycophancy level (1, 2, 3).
"""
# YOUR CODE HERE
pass
def detect_biases(self) -> dict:
"""
Check for:
- Position bias
- Length bias
- Category bias
"""
# YOUR CODE HERE
pass
def analyze_failures(self) -> dict:
"""
Categorize and summarize failure cases.
"""
# YOUR CODE HERE
pass
def compare_with(self, other: "EvalAnalyzer") -> dict:
"""
Statistical comparison with another model's results.
Include significance test and effect size.
"""
# YOUR CODE HERE
pass
def generate_report(self) -> str:
"""
Generate complete markdown report.
"""
# YOUR CODE HERE
pass
def plot_summary(self) -> plt.Figure:
"""
Create summary visualization.
"""
# YOUR CODE HERE
pass
# Test your implementation
def test_analyzer():
# Load sample results
with open("sample_results.json") as f:
results = json.load(f)
analyzer = EvalAnalyzer(results=results, model_name="gpt-4o-mini")
# Run all analyses
overall = analyzer.compute_overall_metrics()
print(f"Sycophancy rate: {overall['sycophancy_rate']:.1%}")
by_level = analyzer.compute_by_level()
for level, metrics in by_level.items():
print(f"Level {level}: {metrics['sycophancy_rate']:.1%}")
biases = analyzer.detect_biases()
print(f"Position bias detected: {biases['position_bias']['detected']}")
report = analyzer.generate_report()
print(report)
# test_analyzer()
📚 Maneesha's Reflection
What Counts as Evidence?: Statistical significance tells us the result is unlikely under the null hypothesis. But does p < 0.05 mean we've found something real? What's the relationship between statistical evidence and scientific confidence?
The Politics of Reporting: How you frame results matters. "Model X has 45% sycophancy rate" vs "Model X gives honest answers 55% of the time" convey different impressions. What responsibility do evaluators have in how they report results? Is there a neutral framing?
Teaching Statistical Thinking: Many people misinterpret confidence intervals and p-values. If you were designing a course on AI safety evaluation, how would you teach statistical analysis in a way that builds genuine understanding rather than mechanical application?