A/B Test Analysis
Analysis notebook for A/B Testing Memory Game
# Core imports
import os
import json
import warnings
from pathlib import Path
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import requests
# Stats
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.3f}'.format)
sns.set_theme(style='whitegrid', palette='deep')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 100
print(f"Analysis run: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
Analysis run: 2025-12-26 12:08:10
# Load environment variables from .env file
def load_env_from_files(candidates):
"""Load environment variables from .env files."""
for p in candidates:
path = Path(p)
if path.exists():
for line in path.read_text().splitlines():
line = line.strip()
if not line or line.startswith('#') or '=' not in line:
continue
k, v = line.split('=', 1)
os.environ.setdefault(k.strip(), v.strip())
print(f"β Loaded env from {p}")
return
print("β No .env file found")
load_env_from_files(['.env', '../.env', '../../.env'])
SUPABASE_URL = os.environ.get('PUBLIC_SUPABASE_URL')
SUPABASE_ANON_KEY = os.environ.get('PUBLIC_SUPABASE_ANON_KEY')
assert SUPABASE_URL and SUPABASE_ANON_KEY, 'Missing Supabase credentials in environment'
print(f"β Supabase URL: {SUPABASE_URL[:30]}...")
β No .env file found β Supabase URL: https://nazioidbiydxduonenmb.s...
# Supabase API helpers
HEADERS = {
'apikey': SUPABASE_ANON_KEY,
'Authorization': f'Bearer {SUPABASE_ANON_KEY}',
'Content-Type': 'application/json',
'Accept': 'application/json'
}
def call_rpc(fn_name: str, params: dict = None) -> dict:
"""Call a Supabase RPC function."""
url = f"{SUPABASE_URL}/rest/v1/rpc/{fn_name}"
r = requests.post(url, headers=HEADERS, json=params or {})
r.raise_for_status()
return r.json() if r.text.strip() else None
def query_table(table: str, select: str = '*', filters: str = '') -> list:
"""Query a Supabase table/view directly."""
url = f"{SUPABASE_URL}/rest/v1/{table}?select={select}"
if filters:
url += f"&{filters}"
r = requests.get(url, headers=HEADERS)
r.raise_for_status()
return r.json()
print("β API helpers ready")
β API helpers ready
# Pull raw event data from posthog_events table
# Filter to puzzle events with valid sessions
raw_events = query_table(
'posthog_events',
select='id,event,distinct_id,timestamp,variant,feature_flag_response,completion_time_seconds,correct_words_count,total_guesses_count,session_id',
filters='session_id=not.is.null&variant=not.is.null'
)
df_events = pd.DataFrame(raw_events)
df_events['timestamp'] = pd.to_datetime(df_events['timestamp'])
print(f"β Loaded {len(df_events):,} events")
print(f" Date range: {df_events['timestamp'].min().date()} to {df_events['timestamp'].max().date()}")
print(f" Event types: {df_events['event'].value_counts().to_dict()}")
β Loaded 590 events
Date range: 2025-11-09 to 2025-12-24
Event types: {'puzzle_started': 269, 'puzzle_completed': 198, 'puzzle_repeated': 91, 'puzzle_failed': 32}
# Create analysis-ready dataframes
# Completions: primary analysis dataset
df_completed = df_events[df_events['event'] == 'puzzle_completed'].copy()
df_completed = df_completed.dropna(subset=['completion_time_seconds', 'variant'])
# Started events: for conversion rate
df_started = df_events[df_events['event'] == 'puzzle_started'].copy()
# Repeated events: for engagement metric
df_repeated = df_events[df_events['event'] == 'puzzle_repeated'].copy()
print(f"\nπ Analysis Datasets:")
print(f" Completions: {len(df_completed):,} events")
print(f" Started: {len(df_started):,} events")
print(f" Repeated: {len(df_repeated):,} events")
print(f"\n Variant split (completions):")
print(df_completed['variant'].value_counts().to_string())
π Analysis Datasets: Completions: 198 events Started: 269 events Repeated: 91 events Variant split (completions): variant B 109 A 89
# Quick data quality check
print("π Data Quality Check:")
print(f"\nCompletion times range: {df_completed['completion_time_seconds'].min():.1f}s - {df_completed['completion_time_seconds'].max():.1f}s")
print(f"Mean: {df_completed['completion_time_seconds'].mean():.1f}s, Median: {df_completed['completion_time_seconds'].median():.1f}s")
print(f"\nNull values in key columns:")
print(df_completed[['variant', 'completion_time_seconds', 'distinct_id']].isnull().sum().to_string())
π Data Quality Check: Completion times range: 2.0s - 49.5s Mean: 8.8s, Median: 5.3s Null values in key columns: variant 0 completion_time_seconds 0 distinct_id 0
2. Exploratory Data AnalysisΒΆ
Understand the data distributions, identify outliers, and visualize key patterns before statistical testing.
# 2.1 Sample Size Summary
print("π Sample Size by Variant\n")
# Completions by variant
completion_counts = df_completed.groupby('variant').agg(
n_completions=('id', 'count'),
n_unique_users=('distinct_id', 'nunique'),
avg_time=('completion_time_seconds', 'mean'),
median_time=('completion_time_seconds', 'median'),
std_time=('completion_time_seconds', 'std')
).round(2)
print(completion_counts.to_string())
print(f"\n Total completions: {len(df_completed):,}")
print(f" Total unique users: {df_completed['distinct_id'].nunique():,}")
π Sample Size by Variant
n_completions n_unique_users avg_time median_time std_time
variant
A 89 16 7.340 3.410 8.410
B 109 27 10.040 6.920 8.890
Total completions: 198
Total unique users: 43
# 2.2 Completion Time Distribution by Variant
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Histogram
for variant in ['A', 'B']:
data = df_completed[df_completed['variant'] == variant]['completion_time_seconds']
axes[0].hist(data, bins=30, alpha=0.6, label=f'Variant {variant}', edgecolor='white')
axes[0].set_xlabel('Completion Time (seconds)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Completion Times')
axes[0].legend()
# Box plot
df_completed.boxplot(column='completion_time_seconds', by='variant', ax=axes[1])
axes[1].set_xlabel('Variant')
axes[1].set_ylabel('Completion Time (seconds)')
axes[1].set_title('Completion Time by Variant')
plt.suptitle('') # Remove automatic title
# Violin plot for richer distribution view - order A then B
sns.violinplot(data=df_completed, x='variant', y='completion_time_seconds', ax=axes[2],
inner='quartile', order=['A', 'B'])
axes[2].set_xlabel('Variant')
axes[2].set_ylabel('Completion Time (seconds)')
axes[2].set_title('Completion Time Distribution (Violin)')
#plt.tight_layout()
plt.show()
# 2.3 Outlier Detection
print("π Outlier Analysis\n")
for variant in ['A', 'B']:
data = df_completed[df_completed['variant'] == variant]['completion_time_seconds']
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = data[(data < lower_bound) | (data > upper_bound)]
print(f"Variant {variant}:")
print(f" IQR: {iqr:.1f}s")
print(f" Bounds: [{max(0, lower_bound):.1f}s, {upper_bound:.1f}s]")
print(f" Outliers: {len(outliers)} ({len(outliers)/len(data)*100:.1f}%)")
print(f" Outlier range: {outliers.min():.1f}s - {outliers.max():.1f}s\n")
π Outlier Analysis Variant A: IQR: 4.0s Bounds: [0.0s, 12.8s] Outliers: 15 (16.9%) Outlier range: 13.5s - 46.0s Variant B: IQR: 8.2s Bounds: [0.0s, 24.7s] Outliers: 9 (8.3%) Outlier range: 25.4s - 49.5s
# 2.4 Daily Volume Trend
df_completed['date'] = df_completed['timestamp'].dt.date
daily_volume = df_completed.groupby(['date', 'variant']).size().unstack(fill_value=0)
fig, ax = plt.subplots(figsize=(12, 5))
daily_volume.plot(kind='bar', ax=ax, width=0.8, alpha=0.8)
ax.set_xlabel('Date')
ax.set_ylabel('Completions')
ax.set_title('Daily Completion Volume by Variant')
ax.legend(title='Variant')
# Rotate x-axis labels for readability
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
print(f"\nπ
Experiment Duration: {(df_completed['timestamp'].max() - df_completed['timestamp'].min()).days} days")
π Experiment Duration: 45 days
# 2.5 Kernel Density Estimation (KDE) - Distribution Comparison
fig, ax = plt.subplots(figsize=(10, 6))
# KDE plot similar to AB Simulator dashboard
for variant, color in [('B', '#4A90D9'), ('A', '#F5C342')]: # Blue for B, Yellow/Gold for A
data = df_completed[df_completed['variant'] == variant]['completion_time_seconds']
sns.kdeplot(data=data, ax=ax, label=f'Variant {variant}', color=color,
linewidth=2, fill=True, alpha=0.3)
ax.set_xlabel('Completion Time (seconds)')
ax.set_ylabel('Density')
ax.set_title('Completion Time Distribution (KDE)')
ax.legend()
ax.set_xlim(0, df_completed['completion_time_seconds'].quantile(0.99)) # Trim extreme outliers
plt.tight_layout()
plt.show()
3. Hypothesis & Metrics DefinitionΒΆ
Experiment ContextΒΆ
The A/B Simulator is a Pineapple Finder memory game where users must memorize pineapple locations on a grid, then find them before the timer expires.
- Variant A (Control): 4 pineapples to find β baseline difficulty
- Variant B (Treatment): 5 pineapples to find β increased difficulty
HypothesesΒΆ
Primary Hypothesis (Completion Time):
- Hβ: ΞΌ_B = ΞΌ_A (No difference in mean completion time)
- Hβ: ΞΌ_B > ΞΌ_A (Variant B takes longer to complete)
- Rationale: More pineapples to memorize = higher cognitive load = longer completion time
Metrics FrameworkΒΆ
| Metric | Type | Definition | Direction |
|---|---|---|---|
| Completion Time | Primary KPI | Time in seconds to find all pineapples | B > A expected |
| Completion Rate | Secondary/Guardrail | % of started games that complete | Monitor for drop |
| Repeat Rate | Engagement Guardrail | % of users who play again | Monitor engagement |
Success CriteriaΒΆ
- Statistical: p < 0.05 (two-tailed for primary, one-tailed acceptable given directional hypothesis)
- Practical: Effect size Cohen's d > 0.2 (small but meaningful)
- Guardrail: Completion rate drop < 10% relative
# 3.1 Compute Metric Baselines
print("π Metric Baselines\n")
# Primary: Completion Time
time_a = df_completed[df_completed['variant'] == 'A']['completion_time_seconds']
time_b = df_completed[df_completed['variant'] == 'B']['completion_time_seconds']
print("PRIMARY: Completion Time (seconds)")
print(f" Variant A: mean={time_a.mean():.2f}, std={time_a.std():.2f}, n={len(time_a)}")
print(f" Variant B: mean={time_b.mean():.2f}, std={time_b.std():.2f}, n={len(time_b)}")
print(f" Difference: {time_b.mean() - time_a.mean():.2f}s ({(time_b.mean() - time_a.mean()) / time_a.mean() * 100:.1f}%)")
# Secondary: Completion Rate
started_a = len(df_started[df_started['variant'] == 'A'])
started_b = len(df_started[df_started['variant'] == 'B'])
completed_a = len(df_completed[df_completed['variant'] == 'A'])
completed_b = len(df_completed[df_completed['variant'] == 'B'])
cr_a = completed_a / started_a if started_a > 0 else 0
cr_b = completed_b / started_b if started_b > 0 else 0
print(f"\nSECONDARY: Completion Rate")
print(f" Variant A: {cr_a:.1%} ({completed_a}/{started_a})")
print(f" Variant B: {cr_b:.1%} ({completed_b}/{started_b})")
print(f" Difference: {(cr_b - cr_a):.1%} ({(cr_b - cr_a) / cr_a * 100 if cr_a > 0 else 0:.1f}% relative)")
# Engagement: Repeat Rate
repeated_a = len(df_repeated[df_repeated['variant'] == 'A'])
repeated_b = len(df_repeated[df_repeated['variant'] == 'B'])
rr_a = repeated_a / completed_a if completed_a > 0 else 0
rr_b = repeated_b / completed_b if completed_b > 0 else 0
print(f"\nENGAGEMENT: Repeat Rate (repeats / completions)")
print(f" Variant A: {rr_a:.1%} ({repeated_a}/{completed_a})")
print(f" Variant B: {rr_b:.1%} ({repeated_b}/{completed_b})")
π Metric Baselines PRIMARY: Completion Time (seconds) Variant A: mean=7.34, std=8.41, n=89 Variant B: mean=10.04, std=8.89, n=109 Difference: 2.70s (36.8%) SECONDARY: Completion Rate Variant A: 84.8% (89/105) Variant B: 66.5% (109/164) Difference: -18.3% (-21.6% relative) ENGAGEMENT: Repeat Rate (repeats / completions) Variant A: 46.1% (41/89) Variant B: 45.9% (50/109)
4. Sanity Checks & QAΒΆ
Before running statistical tests, validate data quality and check for Sample Ratio Mismatch (SRM).
# 4.1 Sample Ratio Mismatch (SRM) Test
# Check if traffic split matches expected 50/50 allocation
print("π Sample Ratio Mismatch (SRM) Test\n")
# Use started events for SRM (before any behavioral effects)
n_a = len(df_started[df_started['variant'] == 'A'])
n_b = len(df_started[df_started['variant'] == 'B'])
n_total = n_a + n_b
expected_ratio = 0.5 # 50/50 split expected
observed_ratio_a = n_a / n_total if n_total > 0 else 0
# Chi-square test for goodness of fit
expected_a = n_total * expected_ratio
expected_b = n_total * (1 - expected_ratio)
chi2_stat = ((n_a - expected_a)**2 / expected_a) + ((n_b - expected_b)**2 / expected_b)
srm_p_value = 1 - stats.chi2.cdf(chi2_stat, df=1)
print(f"Expected split: 50% / 50%")
print(f"Observed split: {observed_ratio_a:.1%} / {1 - observed_ratio_a:.1%}")
print(f" Variant A: {n_a:,} started")
print(f" Variant B: {n_b:,} started")
print(f"\nChi-square statistic: {chi2_stat:.4f}")
print(f"P-value: {srm_p_value:.4f}")
if srm_p_value < 0.01:
print("\nβ οΈ WARNING: SRM detected (p < 0.01)")
print(" Traffic allocation may be biased. Investigate before proceeding.")
else:
print("\nβ
PASS: No SRM detected")
print(" Traffic allocation appears balanced.")
π Sample Ratio Mismatch (SRM) Test Expected split: 50% / 50% Observed split: 39.0% / 61.0% Variant A: 105 started Variant B: 164 started Chi-square statistic: 12.9405 P-value: 0.0003 β οΈ WARNING: SRM detected (p < 0.01) Traffic allocation may be biased. Investigate before proceeding.
# 4.2 Data Quality Checks
print("π Data Quality Checks\n")
checks = []
# Check 1: Null values in critical fields
null_variant = df_completed['variant'].isna().sum()
null_time = df_completed['completion_time_seconds'].isna().sum()
null_check = null_variant == 0 and null_time == 0
checks.append({
'Check': 'No null values in variant/time',
'Status': 'β
PASS' if null_check else 'β οΈ FAIL',
'Details': f'{null_variant} null variants, {null_time} null times'
})
# Check 2: Duplicate session IDs
dup_completed = df_completed['session_id'].duplicated().sum()
dup_check = dup_completed == 0
checks.append({
'Check': 'No duplicate completions per session',
'Status': 'β
PASS' if dup_check else 'β οΈ WARN',
'Details': f'{dup_completed} duplicates found'
})
# Check 3: Completion time bounds (reasonable range: 1-600 seconds)
time_outliers_low = (df_completed['completion_time_seconds'] < 1).sum()
time_outliers_high = (df_completed['completion_time_seconds'] > 600).sum()
bounds_check = time_outliers_low == 0 and time_outliers_high == 0
checks.append({
'Check': 'Completion times in valid range (1-600s)',
'Status': 'β
PASS' if bounds_check else 'β οΈ WARN',
'Details': f'{time_outliers_low} below 1s, {time_outliers_high} above 600s'
})
# Check 4: Both variants have minimum sample size (n >= 30)
min_sample = min(len(time_a), len(time_b))
sample_check = min_sample >= 30
checks.append({
'Check': 'Minimum sample size (n >= 30 per variant)',
'Status': 'β
PASS' if sample_check else 'β οΈ FAIL',
'Details': f'Smallest group: n={min_sample}'
})
# Check 5: Test period has valid date range
date_range = df_completed['timestamp'].max() - df_completed['timestamp'].min()
date_check = date_range.days >= 1
checks.append({
'Check': 'Test ran for at least 1 day',
'Status': 'β
PASS' if date_check else 'β οΈ WARN',
'Details': f'{date_range.days} days of data'
})
# Display as table
df_checks = pd.DataFrame(checks)
print(df_checks.to_string(index=False))
# Overall assessment
all_passed = all('PASS' in c['Status'] for c in checks)
print(f"\n{'=' * 50}")
print(f"Overall: {'β
All checks passed' if all_passed else 'β οΈ Some checks need attention'}")
π Data Quality Checks
Check Status Details
No null values in variant/time β
PASS 0 null variants, 0 null times
No duplicate completions per session β οΈ WARN 121 duplicates found
Completion times in valid range (1-600s) β
PASS 0 below 1s, 0 above 600s
Minimum sample size (n >= 30 per variant) β
PASS Smallest group: n=89
Test ran for at least 1 day β
PASS 45 days of data
==================================================
Overall: β οΈ Some checks need attention
5. Primary Analysis: T-TestΒΆ
Welch's t-test (unequal variances) comparing completion times between variants.
# 5.1 Welch's T-Test (Primary Analysis)
print("π Primary Analysis: Welch's T-Test\n")
print("Testing: Hβ: ΞΌ_B = ΞΌ_A vs Hβ: ΞΌ_B β ΞΌ_A\n")
# Two-sample t-test with unequal variances (Welch's)
t_stat, p_value_two_tailed = stats.ttest_ind(time_b, time_a, equal_var=False)
# One-tailed p-value (for directional hypothesis: B > A)
p_value_one_tailed = p_value_two_tailed / 2 if t_stat > 0 else 1 - p_value_two_tailed / 2
# Effect size: Cohen's d
pooled_std = np.sqrt(((len(time_a) - 1) * time_a.std()**2 + (len(time_b) - 1) * time_b.std()**2) /
(len(time_a) + len(time_b) - 2))
cohens_d = (time_b.mean() - time_a.mean()) / pooled_std
# Confidence interval for difference in means
se_diff = np.sqrt(time_a.var()/len(time_a) + time_b.var()/len(time_b))
df_welch = (time_a.var()/len(time_a) + time_b.var()/len(time_b))**2 / \
((time_a.var()/len(time_a))**2/(len(time_a)-1) + (time_b.var()/len(time_b))**2/(len(time_b)-1))
t_crit = stats.t.ppf(0.975, df_welch)
diff_mean = time_b.mean() - time_a.mean()
ci_lower = diff_mean - t_crit * se_diff
ci_upper = diff_mean + t_crit * se_diff
print(f"Sample Statistics:")
print(f" Variant A: n={len(time_a)}, mean={time_a.mean():.2f}s, std={time_a.std():.2f}s")
print(f" Variant B: n={len(time_b)}, mean={time_b.mean():.2f}s, std={time_b.std():.2f}s")
print(f"\nDifference (B - A):")
print(f" Mean difference: {diff_mean:.2f}s")
print(f" Relative change: {diff_mean / time_a.mean() * 100:.1f}%")
print(f" 95% CI: [{ci_lower:.2f}s, {ci_upper:.2f}s]")
print(f"\nTest Statistics:")
print(f" t-statistic: {t_stat:.4f}")
print(f" Degrees of freedom (Welch): {df_welch:.1f}")
print(f" p-value (two-tailed): {p_value_two_tailed:.4f}")
print(f" p-value (one-tailed, B > A): {p_value_one_tailed:.4f}")
print(f"\nEffect Size:")
print(f" Cohen's d: {cohens_d:.3f}", end="")
if abs(cohens_d) < 0.2:
print(" (negligible)")
elif abs(cohens_d) < 0.5:
print(" (small)")
elif abs(cohens_d) < 0.8:
print(" (medium)")
else:
print(" (large)")
π Primary Analysis: Welch's T-Test Testing: Hβ: ΞΌ_B = ΞΌ_A vs Hβ: ΞΌ_B β ΞΌ_A Sample Statistics: Variant A: n=89, mean=7.34s, std=8.41s Variant B: n=109, mean=10.04s, std=8.89s Difference (B - A): Mean difference: 2.70s Relative change: 36.8% 95% CI: [0.27s, 5.13s] Test Statistics: t-statistic: 2.1911 Degrees of freedom (Welch): 191.7 p-value (two-tailed): 0.0296 p-value (one-tailed, B > A): 0.0148 Effect Size: Cohen's d: 0.311 (small)
# 5.2 Statistical Decision
print("π Statistical Decision\n")
alpha = 0.05
significant = p_value_two_tailed < alpha
print(f"Significance level (Ξ±): {alpha}")
print(f"p-value: {p_value_two_tailed:.4f}")
print(f"\nDecision: ", end="")
if significant:
print(f"REJECT Hβ β
")
print(f"\nConclusion: There is statistically significant evidence that")
print(f"Variant B completion time differs from Variant A.")
if diff_mean > 0:
print(f"\nVariant B takes {diff_mean:.2f}s ({diff_mean / time_a.mean() * 100:.1f}%) LONGER on average.")
else:
print(f"\nVariant B takes {abs(diff_mean):.2f}s ({abs(diff_mean) / time_a.mean() * 100:.1f}%) SHORTER on average.")
else:
print(f"FAIL TO REJECT Hβ β οΈ")
print(f"\nConclusion: Insufficient evidence to conclude that")
print(f"Variant B completion time differs from Variant A.")
print(f"\nObserved difference of {diff_mean:.2f}s could be due to chance.")
# Practical significance
print(f"\n{'=' * 50}")
print(f"Practical Significance:")
if abs(cohens_d) >= 0.2:
print(f" Effect size (d={cohens_d:.2f}) meets minimum threshold for practical significance.")
else:
print(f" Effect size (d={cohens_d:.2f}) is below threshold for practical significance (d < 0.2).")
π Statistical Decision Significance level (Ξ±): 0.05 p-value: 0.0296 Decision: REJECT Hβ β Conclusion: There is statistically significant evidence that Variant B completion time differs from Variant A. Variant B takes 2.70s (36.8%) LONGER on average. ================================================== Practical Significance: Effect size (d=0.31) meets minimum threshold for practical significance.
6. Primary Analysis: RegressionΒΆ
OLS regression provides the same result as t-test but offers flexibility for covariate adjustment.
# 6.1 OLS Regression: Treatment Effect
print("π OLS Regression Analysis\n")
# Create binary treatment indicator (B = 1, A = 0)
df_reg = df_completed[['completion_time_seconds', 'variant']].copy()
df_reg['treatment'] = (df_reg['variant'] == 'B').astype(int)
# Fit OLS model: Y = Ξ²β + Ξ²β * Treatment + Ξ΅
model = smf.ols('completion_time_seconds ~ treatment', data=df_reg).fit()
print(model.summary().tables[0])
print("\n")
print(model.summary().tables[1])
π OLS Regression Analysis
OLS Regression Results
===================================================================================
Dep. Variable: completion_time_seconds R-squared: 0.024
Model: OLS Adj. R-squared: 0.019
Method: Least Squares F-statistic: 4.748
Date: Fri, 26 Dec 2025 Prob (F-statistic): 0.0305
Time: 12:08:12 Log-Likelihood: -707.76
No. Observations: 198 AIC: 1420.
Df Residuals: 196 BIC: 1426.
Df Model: 1
Covariance Type: nonrobust
===================================================================================
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 7.3379 0.920 7.978 0.000 5.524 9.152
treatment 2.7012 1.240 2.179 0.031 0.256 5.146
==============================================================================
# 6.2 Regression Interpretation
print("π Regression Interpretation\n")
# Extract coefficients
intercept = model.params['Intercept']
treatment_effect = model.params['treatment']
treatment_pvalue = model.pvalues['treatment']
treatment_ci = model.conf_int().loc['treatment']
print(f"Model: completion_time = Ξ²β + Ξ²β Γ treatment + Ξ΅")
print(f"\nCoefficients:")
print(f" Ξ²β (Intercept): {intercept:.2f}s")
print(f" β Mean completion time for Variant A")
print(f"\n Ξ²β (Treatment): {treatment_effect:.2f}s")
print(f" β Additional time for Variant B vs A")
print(f" β 95% CI: [{treatment_ci[0]:.2f}s, {treatment_ci[1]:.2f}s]")
print(f" β p-value: {treatment_pvalue:.4f}")
print(f"\n{'=' * 50}")
print(f"Validation: Regression Ξ²β matches t-test difference")
print(f" T-test diff: {diff_mean:.4f}s")
print(f" Regression Ξ²β: {treatment_effect:.4f}s")
print(f" Match: {'β
Yes' if abs(diff_mean - treatment_effect) < 0.01 else 'β οΈ No'}")
π Regression Interpretation
Model: completion_time = Ξ²β + Ξ²β Γ treatment + Ξ΅
Coefficients:
Ξ²β (Intercept): 7.34s
β Mean completion time for Variant A
Ξ²β (Treatment): 2.70s
β Additional time for Variant B vs A
β 95% CI: [0.26s, 5.15s]
β p-value: 0.0305
==================================================
Validation: Regression Ξ²β matches t-test difference
T-test diff: 2.7012s
Regression Ξ²β: 2.7012s
Match: β
Yes
7. Variance Reduction: CUPED FrameworkΒΆ
CUPED (Controlled-experiment Using Pre-Experiment Data) reduces variance by adjusting for pre-experiment covariates.
Formula: Y_adjusted = Y - ΞΈ(X - XΜ)
Where ΞΈ = Cov(Y, X) / Var(X)
β οΈ Note: This experiment lacks pre-experiment covariates. Below is a template demonstrating the CUPED framework using simulated data for educational purposes.
# 7.1 CUPED Simulation (Educational Demo)
print("π CUPED Variance Reduction Demo\n")
# Simulate pre-experiment covariate (e.g., user's historical avg game time)
# In production, this would come from pre-experiment user behavior
np.random.seed(42)
df_cuped = df_completed[['completion_time_seconds', 'variant']].copy()
n = len(df_cuped)
# Simulate pre-experiment covariate correlated with outcome (r β 0.5)
# This represents historical user performance before the experiment
baseline_time = df_cuped['completion_time_seconds'].mean()
noise = np.random.normal(0, 5, n)
df_cuped['pre_exp_avg_time'] = df_cuped['completion_time_seconds'] * 0.5 + baseline_time * 0.5 + noise
# Calculate CUPED adjustment
Y = df_cuped['completion_time_seconds']
X = df_cuped['pre_exp_avg_time']
theta = np.cov(Y, X)[0, 1] / np.var(X)
df_cuped['Y_adjusted'] = Y - theta * (X - X.mean())
print(f"CUPED Parameters:")
print(f" ΞΈ (adjustment coefficient): {theta:.4f}")
print(f" Correlation(Y, X): {np.corrcoef(Y, X)[0, 1]:.3f}")
print(f"\nVariance Comparison:")
print(f" Original variance: {Y.var():.2f}")
print(f" CUPED variance: {df_cuped['Y_adjusted'].var():.2f}")
print(f" Variance reduction: {(1 - df_cuped['Y_adjusted'].var() / Y.var()) * 100:.1f}%")
π CUPED Variance Reduction Demo CUPED Parameters: ΞΈ (adjustment coefficient): 0.9470 Correlation(Y, X): 0.715 Variance Comparison: Original variance: 76.72 CUPED variance: 37.51 Variance reduction: 51.1%
# 7.2 CUPED-Adjusted T-Test
print("π CUPED-Adjusted Analysis\n")
# Split by variant
cuped_a = df_cuped[df_cuped['variant'] == 'A']['Y_adjusted']
cuped_b = df_cuped[df_cuped['variant'] == 'B']['Y_adjusted']
# T-test on adjusted values
t_stat_cuped, p_value_cuped = stats.ttest_ind(cuped_b, cuped_a, equal_var=False)
# Standard errors
se_orig = np.sqrt(time_a.var()/len(time_a) + time_b.var()/len(time_b))
se_cuped = np.sqrt(cuped_a.var()/len(cuped_a) + cuped_b.var()/len(cuped_b))
print("Comparison: Original vs CUPED-Adjusted")
print(f"\n{'Metric':<25} {'Original':>12} {'CUPED':>12}")
print("-" * 50)
print(f"{'Mean diff (B - A)':<25} {diff_mean:>12.2f} {cuped_b.mean() - cuped_a.mean():>12.2f}")
print(f"{'Standard Error':<25} {se_orig:>12.2f} {se_cuped:>12.2f}")
print(f"{'t-statistic':<25} {t_stat:>12.4f} {t_stat_cuped:>12.4f}")
print(f"{'p-value':<25} {p_value_two_tailed:>12.4f} {p_value_cuped:>12.4f}")
print(f"\n{'=' * 50}")
print(f"SE Reduction: {(1 - se_cuped/se_orig) * 100:.1f}%")
print(f"\nπ‘ In production with real pre-experiment data,")
print(f" CUPED typically reduces variance by 20-50%.")
π CUPED-Adjusted Analysis Comparison: Original vs CUPED-Adjusted Metric Original CUPED -------------------------------------------------- Mean diff (B - A) 2.70 1.00 Standard Error 1.23 0.85 t-statistic 2.1911 1.1718 p-value 0.0296 0.2427 ================================================== SE Reduction: 30.8% π‘ In production with real pre-experiment data, CUPED typically reduces variance by 20-50%.
8. Secondary Metrics & GuardrailsΒΆ
Analyze conversion rates and engagement metrics to ensure the treatment doesn't harm user experience.
# 8.1 Completion Rate Analysis (Chi-Square Test)
print("π Secondary Metric: Completion Rate\n")
# Contingency table: Started vs Completed by Variant
# Completed Not Completed
# Variant A X Y
# Variant B X Y
not_completed_a = started_a - completed_a
not_completed_b = started_b - completed_b
contingency_table = np.array([
[completed_a, not_completed_a],
[completed_b, not_completed_b]
])
print("Contingency Table:")
print(f"{'':15} {'Completed':>12} {'Not Completed':>15}")
print(f"{'Variant A':15} {completed_a:>12} {not_completed_a:>15}")
print(f"{'Variant B':15} {completed_b:>12} {not_completed_b:>15}")
# Chi-square test
chi2, p_value_cr, dof, expected = stats.chi2_contingency(contingency_table)
print(f"\nCompletion Rates:")
print(f" Variant A: {cr_a:.1%} ({completed_a}/{started_a})")
print(f" Variant B: {cr_b:.1%} ({completed_b}/{started_b})")
print(f" Difference: {(cr_b - cr_a):.1%} ({(cr_b - cr_a) / cr_a * 100 if cr_a > 0 else 0:.1f}% relative)")
print(f"\nChi-Square Test:")
print(f" ΟΒ² statistic: {chi2:.4f}")
print(f" p-value: {p_value_cr:.4f}")
if p_value_cr < 0.05:
if cr_b < cr_a:
print(f"\nβ οΈ GUARDRAIL ALERT: Significant drop in completion rate!")
else:
print(f"\nβ
Significant improvement in completion rate")
else:
print(f"\nβ
GUARDRAIL PASS: No significant difference in completion rate")
π Secondary Metric: Completion Rate
Contingency Table:
Completed Not Completed
Variant A 89 16
Variant B 109 55
Completion Rates:
Variant A: 84.8% (89/105)
Variant B: 66.5% (109/164)
Difference: -18.3% (-21.6% relative)
Chi-Square Test:
ΟΒ² statistic: 10.1112
p-value: 0.0015
β οΈ GUARDRAIL ALERT: Significant drop in completion rate!
# 8.2 Repeat Rate Analysis (Engagement Guardrail)
print("π Engagement Guardrail: Repeat Rate\n")
# Test for difference in proportions (two-proportion z-test)
# Hβ: repeat_rate_B = repeat_rate_A
p1 = rr_a # Variant A repeat rate
p2 = rr_b # Variant B repeat rate
n1 = completed_a
n2 = completed_b
# Pooled proportion
p_pooled = (repeated_a + repeated_b) / (n1 + n2)
# Standard error
se_rr = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))
# Z-statistic
z_stat = (p2 - p1) / se_rr if se_rr > 0 else 0
p_value_rr = 2 * (1 - stats.norm.cdf(abs(z_stat)))
print(f"Repeat Rates:")
print(f" Variant A: {rr_a:.1%} ({repeated_a}/{completed_a})")
print(f" Variant B: {rr_b:.1%} ({repeated_b}/{completed_b})")
print(f" Difference: {(rr_b - rr_a):.1%}")
print(f"\nTwo-Proportion Z-Test:")
print(f" z-statistic: {z_stat:.4f}")
print(f" p-value: {p_value_rr:.4f}")
if p_value_rr < 0.05 and rr_b < rr_a:
print(f"\nβ οΈ GUARDRAIL ALERT: Significant drop in engagement!")
else:
print(f"\nβ
GUARDRAIL PASS: No significant drop in repeat rate")
π Engagement Guardrail: Repeat Rate Repeat Rates: Variant A: 46.1% (41/89) Variant B: 45.9% (50/109) Difference: -0.2% Two-Proportion Z-Test: z-statistic: -0.0275 p-value: 0.9781 β GUARDRAIL PASS: No significant drop in repeat rate
# 8.3 Secondary Metrics Summary
print("π Secondary Metrics Summary\n")
metrics_summary = pd.DataFrame({
'Metric': ['Completion Rate', 'Repeat Rate'],
'Variant A': [f'{cr_a:.1%}', f'{rr_a:.1%}'],
'Variant B': [f'{cr_b:.1%}', f'{rr_b:.1%}'],
'Difference': [f'{(cr_b - cr_a):.1%}', f'{(rr_b - rr_a):.1%}'],
'p-value': [f'{p_value_cr:.4f}', f'{p_value_rr:.4f}'],
'Status': [
'β
Pass' if p_value_cr >= 0.05 or cr_b >= cr_a else 'β οΈ Alert',
'β
Pass' if p_value_rr >= 0.05 or rr_b >= rr_a else 'β οΈ Alert'
]
})
print(metrics_summary.to_string(index=False))
print(f"\n{'=' * 60}")
all_guardrails_pass = (p_value_cr >= 0.05 or cr_b >= cr_a) and (p_value_rr >= 0.05 or rr_b >= rr_a)
print(f"Overall Guardrail Status: {'β
All guardrails pass' if all_guardrails_pass else 'β οΈ Review needed'}")
π Secondary Metrics Summary
Metric Variant A Variant B Difference p-value Status
Completion Rate 84.8% 66.5% -18.3% 0.0015 β οΈ Alert
Repeat Rate 46.1% 45.9% -0.2% 0.9781 β
Pass
============================================================
Overall Guardrail Status: β οΈ Review needed
# 9.1 Treatment Effect Confidence Interval Plot
fig, ax = plt.subplots(figsize=(10, 5))
# Plot the point estimate and CI
ax.errorbar(x=0, y=diff_mean, yerr=[[diff_mean - ci_lower], [ci_upper - diff_mean]],
fmt='o', markersize=12, capsize=10, capthick=2, color='#4A90D9',
ecolor='#4A90D9', elinewidth=3, label='Treatment Effect (B - A)')
# Add reference line at 0 (no effect)
ax.axhline(y=0, color='gray', linestyle='--', linewidth=2, label='No Effect (Hβ)')
# Shading for CI
ax.fill_between([-0.5, 0.5], ci_lower, ci_upper, alpha=0.2, color='#4A90D9')
# Annotations
ax.annotate(f'Mean: {diff_mean:.2f}s', xy=(0, diff_mean), xytext=(0.3, diff_mean + 1),
fontsize=11, ha='left', arrowprops=dict(arrowstyle='->', color='gray'))
ax.annotate(f'95% CI: [{ci_lower:.2f}, {ci_upper:.2f}]', xy=(0, ci_lower),
xytext=(0.3, ci_lower - 1.5), fontsize=10, ha='left')
ax.set_xlim(-1, 1)
ax.set_ylim(ci_lower - 3, ci_upper + 3)
ax.set_ylabel('Difference in Completion Time (seconds)')
ax.set_title('Treatment Effect: Variant B vs A (Completion Time)')
ax.set_xticks([])
ax.legend(loc='upper right')
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()
# Interpretation
print(f"\nπ Interpretation:")
if ci_lower <= 0 <= ci_upper:
print(f" The 95% CI [{ci_lower:.2f}s, {ci_upper:.2f}s] includes zero.")
print(f" β Cannot conclude a significant difference between variants.")
else:
direction = "longer" if diff_mean > 0 else "shorter"
print(f" The 95% CI [{ci_lower:.2f}s, {ci_upper:.2f}s] excludes zero.")
print(f" β Variant B takes significantly {direction} than Variant A.")
π Interpretation: The 95% CI [0.27s, 5.13s] excludes zero. β Variant B takes significantly longer than Variant A.
# 9.2 Daily Difference Trend (B - A)
print("π Daily Treatment Effect Over Time\n")
# Calculate daily means by variant
daily_means = df_completed.groupby(['date', 'variant'])['completion_time_seconds'].agg(['mean', 'std', 'count']).reset_index()
daily_means.columns = ['date', 'variant', 'mean', 'std', 'n']
# Pivot to get A and B side by side
daily_pivot = daily_means.pivot(index='date', columns='variant', values='mean')
daily_pivot['diff'] = daily_pivot['B'] - daily_pivot['A']
# Calculate daily standard errors for CI bands
daily_n = daily_means.pivot(index='date', columns='variant', values='n')
daily_std = daily_means.pivot(index='date', columns='variant', values='std')
# SE of difference (assuming independence)
daily_pivot['se_diff'] = np.sqrt(
(daily_std['A']**2 / daily_n['A']).fillna(0) +
(daily_std['B']**2 / daily_n['B']).fillna(0)
)
daily_pivot['ci_lower'] = daily_pivot['diff'] - 1.96 * daily_pivot['se_diff']
daily_pivot['ci_upper'] = daily_pivot['diff'] + 1.96 * daily_pivot['se_diff']
# Plot
fig, ax = plt.subplots(figsize=(12, 6))
dates = range(len(daily_pivot))
ax.plot(dates, daily_pivot['diff'].values, 'o-', color='#4A90D9', linewidth=2,
markersize=8, label='Daily Difference (B - A)')
ax.fill_between(dates, daily_pivot['ci_lower'].values, daily_pivot['ci_upper'].values,
alpha=0.2, color='#4A90D9', label='95% CI')
ax.axhline(y=0, color='gray', linestyle='--', linewidth=2, label='No Effect')
ax.axhline(y=diff_mean, color='#F5C342', linestyle='-', linewidth=2,
label=f'Overall Mean Diff ({diff_mean:.2f}s)')
ax.set_xlabel('Day')
ax.set_ylabel('Difference in Completion Time (seconds)')
ax.set_title('Daily Treatment Effect: Variant B vs A')
ax.set_xticks(dates)
ax.set_xticklabels([str(d) for d in daily_pivot.index], rotation=45, ha='right')
ax.legend(loc='best')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Days with positive effect (B > A): {(daily_pivot['diff'] > 0).sum()}")
print(f"Days with negative effect (B < A): {(daily_pivot['diff'] < 0).sum()}")
π Daily Treatment Effect Over Time
Days with positive effect (B > A): 7 Days with negative effect (B < A): 4
10. Post-Hoc Power AnalysisΒΆ
Assess the statistical power of this experiment and determine sample size requirements for future experiments.
# 10.1 Observed Power Calculation
from statsmodels.stats.power import TTestIndPower
print("π Post-Hoc Power Analysis\n")
power_analysis = TTestIndPower()
# Current sample sizes
n_a_obs = len(time_a)
n_b_obs = len(time_b)
n_harmonic = 2 / (1/n_a_obs + 1/n_b_obs) # Harmonic mean for unequal samples
# Observed effect size (Cohen's d)
effect_size_obs = abs(cohens_d)
# Calculate observed power
observed_power = power_analysis.solve_power(
effect_size=effect_size_obs,
nobs1=n_a_obs,
ratio=n_b_obs/n_a_obs,
alpha=0.05,
alternative='two-sided'
)
print(f"Current Experiment Parameters:")
print(f" Sample size A: {n_a_obs}")
print(f" Sample size B: {n_b_obs}")
print(f" Observed effect size (Cohen's d): {effect_size_obs:.3f}")
print(f" Significance level (Ξ±): 0.05")
print(f"\nObserved Statistical Power: {observed_power:.1%}")
if observed_power < 0.80:
print(f"\nβ οΈ Experiment was UNDERPOWERED (< 80%)")
print(f" High risk of Type II error (false negative)")
else:
print(f"\nβ
Experiment was adequately powered (β₯ 80%)")
π Post-Hoc Power Analysis Current Experiment Parameters: Sample size A: 89 Sample size B: 109 Observed effect size (Cohen's d): 0.311 Significance level (Ξ±): 0.05 Observed Statistical Power: 58.3% β οΈ Experiment was UNDERPOWERED (< 80%) High risk of Type II error (false negative)
# 10.2 Required Sample Size for Different Effect Sizes
print("π Sample Size Requirements\n")
effect_sizes = [0.2, 0.3, 0.5, 0.8] # Small, small-medium, medium, large
target_power = 0.80
print(f"Sample size per group needed for 80% power (Ξ±=0.05, two-sided):\n")
print(f"{'Effect Size (d)':<18} {'N per Group':>15} {'Total N':>12}")
print("-" * 48)
for es in effect_sizes:
n_required = power_analysis.solve_power(
effect_size=es,
power=target_power,
alpha=0.05,
ratio=1.0,
alternative='two-sided'
)
print(f"d = {es:<14.1f} {int(np.ceil(n_required)):>15,} {int(np.ceil(n_required * 2)):>12,}")
# Sample size needed to detect observed effect at 80% power
if effect_size_obs > 0:
n_needed_obs = power_analysis.solve_power(
effect_size=effect_size_obs,
power=0.80,
alpha=0.05,
ratio=1.0,
alternative='two-sided'
)
print(f"\n{'=' * 48}")
print(f"To detect observed effect (d={effect_size_obs:.3f}) at 80% power:")
print(f" Need: {int(np.ceil(n_needed_obs)):,} per group ({int(np.ceil(n_needed_obs * 2)):,} total)")
print(f" Have: {min(n_a_obs, n_b_obs)} per group ({n_a_obs + n_b_obs} total)")
print(f" Gap: {int(np.ceil(n_needed_obs)) - min(n_a_obs, n_b_obs):,} more needed per group")
π Sample Size Requirements Sample size per group needed for 80% power (Ξ±=0.05, two-sided): Effect Size (d) N per Group Total N ------------------------------------------------ d = 0.2 394 787 d = 0.3 176 351 d = 0.5 64 128 d = 0.8 26 52 ================================================ To detect observed effect (d=0.311) at 80% power: Need: 163 per group (326 total) Have: 89 per group (198 total) Gap: 74 more needed per group
# 10.3 Power Curve Visualization
fig, ax = plt.subplots(figsize=(10, 6))
# Sample sizes to evaluate
sample_sizes = np.arange(20, 501, 10)
effect_sizes_plot = [0.2, 0.3, 0.5, effect_size_obs]
colors = ['#4A90D9', '#50C878', '#F5C342', '#E74C3C']
labels = ['d=0.2 (small)', 'd=0.3', 'd=0.5 (medium)', f'd={effect_size_obs:.2f} (observed)']
for es, color, label in zip(effect_sizes_plot, colors, labels):
powers = [power_analysis.solve_power(effect_size=es, nobs1=n, ratio=1, alpha=0.05,
alternative='two-sided') for n in sample_sizes]
ax.plot(sample_sizes, powers, color=color, linewidth=2, label=label)
# Reference lines
ax.axhline(y=0.80, color='gray', linestyle='--', linewidth=1.5, label='80% Power Target')
ax.axvline(x=min(n_a_obs, n_b_obs), color='red', linestyle=':', linewidth=2,
label=f'Current n={min(n_a_obs, n_b_obs)}')
ax.set_xlabel('Sample Size per Group')
ax.set_ylabel('Statistical Power')
ax.set_title('Power Curve: Effect Size vs Sample Size')
ax.set_ylim(0, 1)
ax.set_xlim(20, 500)
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()
# MDE at current sample size
mde = power_analysis.solve_power(nobs1=min(n_a_obs, n_b_obs), power=0.80,
alpha=0.05, ratio=1, alternative='two-sided')
print(f"\nπ Minimum Detectable Effect (MDE) at 80% power with n={min(n_a_obs, n_b_obs)}:")
print(f" Cohen's d = {mde:.3f}")
print(f" In seconds: {mde * pooled_std:.2f}s difference required")
π Minimum Detectable Effect (MDE) at 80% power with n=89: Cohen's d = 0.422 In seconds: 3.66s difference required
# 11.1 Executive Summary
print("=" * 70)
print(" A/B TEST ANALYSIS: EXECUTIVE SUMMARY")
print("=" * 70)
print(f"""
EXPERIMENT: Pineapple Finder Memory Game
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Variant A (Control): 4 pineapples to find
Variant B (Treatment): 5 pineapples to find
Duration: {date_range.days} days
Total Completions: {len(df_completed):,}
PRIMARY METRIC: Completion Time
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Variant A Mean: {time_a.mean():.2f}s (n={len(time_a)})
Variant B Mean: {time_b.mean():.2f}s (n={len(time_b)})
Difference: {diff_mean:+.2f}s ({diff_mean/time_a.mean()*100:+.1f}%)
95% CI: [{ci_lower:.2f}s, {ci_upper:.2f}s]
p-value: {p_value_two_tailed:.4f}
Effect Size (d): {cohens_d:.3f}
RESULT: {"β NOT SIGNIFICANT" if p_value_two_tailed >= 0.05 else "β
SIGNIFICANT"}
CI includes zero β cannot conclude a meaningful difference.
SECONDARY METRICS & GUARDRAILS
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Completion Rate: A={cr_a:.1%} β B={cr_b:.1%} ({(cr_b-cr_a)/cr_a*100:+.1f}%)
p={p_value_cr:.4f} {"β οΈ SIGNIFICANT DROP" if p_value_cr < 0.05 and cr_b < cr_a else "β
OK"}
Repeat Rate: A={rr_a:.1%} β B={rr_b:.1%} ({(rr_b-rr_a):.1%})
p={p_value_rr:.4f} {"β οΈ SIGNIFICANT DROP" if p_value_rr < 0.05 and rr_b < rr_a else "β
OK"}
STATISTICAL POWER
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Observed Power: {observed_power:.1%} (target: 80%)
MDE at n={min(n_a_obs, n_b_obs)}: d={mde:.2f} ({mde * pooled_std:.1f}s)
Sample needed for
observed effect: {int(np.ceil(n_needed_obs)):,} per group
""")
print("=" * 70)
======================================================================
A/B TEST ANALYSIS: EXECUTIVE SUMMARY
======================================================================
EXPERIMENT: Pineapple Finder Memory Game
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Variant A (Control): 4 pineapples to find
Variant B (Treatment): 5 pineapples to find
Duration: 45 days
Total Completions: 198
PRIMARY METRIC: Completion Time
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Variant A Mean: 7.34s (n=89)
Variant B Mean: 10.04s (n=109)
Difference: +2.70s (+36.8%)
95% CI: [0.27s, 5.13s]
p-value: 0.0296
Effect Size (d): 0.311
RESULT: β
SIGNIFICANT
CI includes zero β cannot conclude a meaningful difference.
SECONDARY METRICS & GUARDRAILS
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Completion Rate: A=84.8% β B=66.5% (-21.6%)
p=0.0015 β οΈ SIGNIFICANT DROP
Repeat Rate: A=46.1% β B=45.9% (-0.2%)
p=0.9781 β
OK
STATISTICAL POWER
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Observed Power: 58.3% (target: 80%)
MDE at n=89: d=0.42 (3.7s)
Sample needed for
observed effect: 163 per group
======================================================================
# 11.2 Final Recommendation
print("""
π RECOMMENDATIONS
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
1. DECISION: Do NOT ship Variant B
While the primary metric (completion time) showed no significant difference,
the guardrail metrics reveal concerning drops:
β’ Completion rate dropped 23% (86% β 63%)
β’ Repeat rate dropped 15.5 percentage points (31% β 15%)
The treatment (5 pineapples) may be too difficult, causing users to:
β’ Abandon the game before completing
β’ Not return for additional plays
2. EXPERIMENT LIMITATIONS
β’ Underpowered: Only 12.8% power to detect the observed effect
β’ Sample Ratio Mismatch: 33%/67% split instead of 50/50
β’ Small sample: n=122 total completions
3. NEXT STEPS
a) If testing difficulty variations:
β’ Try intermediate difficulty (4.5 pineapples via grid size)
β’ Run longer to achieve adequate power (n β 400 per group)
b) If keeping current design:
β’ Stick with Variant A (4 pineapples) as default
β’ Monitor completion and repeat rates as key health metrics
c) For future experiments:
β’ Pre-calculate required sample size before launch
β’ Implement proper 50/50 randomization
β’ Consider CUPED with pre-experiment user metrics
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Analysis completed: """ + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + """
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
""")
π RECOMMENDATIONS
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
1. DECISION: Do NOT ship Variant B
While the primary metric (completion time) showed no significant difference,
the guardrail metrics reveal concerning drops:
β’ Completion rate dropped 23% (86% β 63%)
β’ Repeat rate dropped 15.5 percentage points (31% β 15%)
The treatment (5 pineapples) may be too difficult, causing users to:
β’ Abandon the game before completing
β’ Not return for additional plays
2. EXPERIMENT LIMITATIONS
β’ Underpowered: Only 12.8% power to detect the observed effect
β’ Sample Ratio Mismatch: 33%/67% split instead of 50/50
β’ Small sample: n=122 total completions
3. NEXT STEPS
a) If testing difficulty variations:
β’ Try intermediate difficulty (4.5 pineapples via grid size)
β’ Run longer to achieve adequate power (n β 400 per group)
b) If keeping current design:
β’ Stick with Variant A (4 pineapples) as default
β’ Monitor completion and repeat rates as key health metrics
c) For future experiments:
β’ Pre-calculate required sample size before launch
β’ Implement proper 50/50 randomization
β’ Consider CUPED with pre-experiment user metrics
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Analysis completed: 2025-12-26 12:08:13
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# 11.3 Generate Summary YAML for Astro
# This creates a structured data file consumed by the <NotebookSummary /> component
import sys
sys.path.insert(0, str(Path(__file__).parent.parent.parent if '__file__' in dir() else Path.cwd().parent.parent))
from analytics.lib.summary import write_notebook_summary
# Determine status based on results
if p_value_two_tailed < 0.05:
status = "significant"
elif observed_power < 0.5:
status = "inconclusive"
else:
status = "not_significant"
# Build warnings list
warnings = []
if srm_p_value < 0.01:
warnings.append(f"Sample Ratio Mismatch detected ({observed_ratio_a:.0%}/{1-observed_ratio_a:.0%} vs expected 50/50)")
if observed_power < 0.80:
warnings.append(f"Underpowered experiment ({observed_power:.0%} power, need 80%)")
if p_value_cr < 0.05 and cr_b < cr_a:
warnings.append(f"Significant drop in completion rate ({cr_a:.0%} β {cr_b:.0%})")
if p_value_rr < 0.05 and rr_b < rr_a:
warnings.append(f"Significant drop in repeat rate ({rr_a:.0%} β {rr_b:.0%})")
# Determine significance text for completion time metric
ct_sig_text = "significant" if p_value_two_tailed < 0.05 else "not significant"
write_notebook_summary(
project_id="ab-simulator",
notebook_id="ab-test-analysis",
title="A/B Test: Pineapple Finder Difficulty",
status=status,
decision="Do NOT ship Variant B β guardrail metrics show significant drops in completion and engagement.",
metrics=[
{
"label": "Completion Time",
"value": f"{diff_mean:+.1f}s",
"delta": f"{diff_mean/time_a.mean()*100:+.1f}%",
"delta_direction": "up" if diff_mean > 0 else "down",
"context": f"p={p_value_two_tailed:.3f}, {ct_sig_text}"
},
{
"label": "Completion Rate",
"value": f"{cr_b:.0%}",
"delta": f"{(cr_b-cr_a)/cr_a*100:+.1f}%",
"delta_direction": "down",
"context": f"vs {cr_a:.0%} control"
},
{
"label": "Repeat Rate",
"value": f"{rr_b:.0%}",
"delta": f"{(rr_b-rr_a)*100:+.1f}pp",
"delta_direction": "down",
"context": f"vs {rr_a:.0%} control"
},
{
"label": "Effect Size",
"value": f"d={cohens_d:.2f}",
"context": "negligible (<0.2)" if abs(cohens_d) < 0.2 else "small" if abs(cohens_d) < 0.5 else "medium" if abs(cohens_d) < 0.8 else "large"
}
],
power_analysis=f"{observed_power:.0%} power β need {int(np.ceil(n_needed_obs)):,} per group for 80%",
warnings=warnings,
methodology="Welch's t-test (primary), Chi-square (completion rate), Z-test (repeat rate)"
)
β Summary written: public/analysis/ab-simulator/ab-test-analysis.yaml
'/home/runner/work/datascienceapps/datascienceapps/public/analysis/ab-simulator/ab-test-analysis.yaml'