"""
MLB Statcast enrichment using pybaseball.

Provides:
- Pitcher pitch-mix + zone heatmap data for K-prop grading
- Batter vs Pitcher historical matchup data

We avoid wide-net `statcast()` calls that pull every pitch league-wide —
those routinely time out. Every fetch is scoped to a single pitcher instead;
pitcher-profile calls default to a 30-day trailing window, which keeps
payloads under a few hundred KB.
"""
|
|
from __future__ import annotations
|
|
|
|
import time
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional
|
|
|
|
import pandas as pd
|
|
from pybaseball import statcast_pitcher
|
|
|
|
from app.utils.cache import cache_get, cache_set
|
|
from app.config import SPLITS_TTL
|
|
|
|
|
|
def _today_iso() -> str:
    """Return the current UTC calendar date formatted as ``YYYY-MM-DD``."""
    # The format spec is handed to strftime, so the output matches
    # datetime.strftime("%Y-%m-%d") exactly.
    return f"{datetime.utcnow():%Y-%m-%d}"
|
|
|
|
|
|
def _date_n_days_ago(n: int) -> str:
    """Return the UTC date ``n`` days before now, formatted as ``YYYY-MM-DD``."""
    moment = datetime.utcnow() - timedelta(days=n)
    # The format spec is handed to strftime, so the output matches
    # datetime.strftime("%Y-%m-%d") exactly.
    return f"{moment:%Y-%m-%d}"
|
|
|
|
|
|
# Statcast `description` values treated as a swing, and the subset of those
# that are swings-and-misses. A blocked swinging strike is still a whiff.
_SWING_DESCRIPTIONS = (
    "swinging_strike", "foul", "foul_tip", "hit_into_play",
    "swinging_strike_blocked",
)
_WHIFF_DESCRIPTIONS = ("swinging_strike", "swinging_strike_blocked")
# Statcast `events` values that end a plate appearance with a strikeout.
_STRIKEOUT_EVENTS = ("strikeout", "strikeout_double_play")


def _mean_or_none(frame: pd.DataFrame, column: str) -> Optional[float]:
    """Return the mean of ``frame[column]`` as a float, or None if unavailable.

    None is returned both when the column is missing and when the mean is NaN
    (every value missing), so NaN never leaks into the result payload.
    """
    if column not in frame.columns:
        return None
    value = frame[column].mean()
    return None if pd.isna(value) else float(value)


def _whiff_rate(frame: pd.DataFrame) -> float:
    """Return whiffs divided by swings for ``frame`` (0.0 when there are no swings)."""
    if "description" not in frame.columns:
        return 0.0
    descriptions = frame["description"]
    swings = int(descriptions.isin(_SWING_DESCRIPTIONS).sum())
    if not swings:
        return 0.0
    return float(descriptions.isin(_WHIFF_DESCRIPTIONS).sum() / swings)


def get_pitcher_profile(pitcher_id: int, days_back: int = 30) -> dict:
    """
    Aggregate a pitcher's recent pitch-level data into pitch mix,
    velocity, whiff rate, and zone heatmap counts.

    Args:
        pitcher_id: Positive integer id passed to ``statcast_pitcher``.
        days_back: Size of the trailing window, in days, ending today (UTC).

    Returns:
        One of:
        - ``{"error": ...}`` for an invalid id or a failed fetch;
        - a stub with empty ``pitch_mix``/``zone`` and ``"note": "no data"``
          when the fetch returns nothing (this stub is not cached);
        - the full profile dict, with ``source`` set to ``"statcast"`` on a
          fresh fetch or ``"cache"`` when served from the cache.

        Both ``whiff_rate`` fields are whiffs / swings. ``k_rate_estimate`` is
        strikeouts / plate appearances, where a plate appearance is any pitch
        row with a non-null ``events`` value. Averages are None when the
        column is missing or holds no values.
    """
    if not isinstance(pitcher_id, int) or pitcher_id <= 0:
        return {"error": "invalid pitcher_id"}

    cache_key = f"mlb:pitcher:{pitcher_id}:d{days_back}"
    cached = cache_get(cache_key)
    if cached is not None:
        cached["source"] = "cache"
        return cached

    end = _today_iso()
    start = _date_n_days_ago(days_back)

    # Best-effort boundary around the third-party fetch: any failure is
    # reported to the caller as an error payload instead of propagating.
    try:
        data = statcast_pitcher(start, end, pitcher_id)
    except Exception as exc:
        return {"error": f"statcast fetch failed: {exc!s}"}

    if data is None or data.empty:
        return {"pitcher_id": pitcher_id, "pitch_mix": [], "zone": [], "note": "no data"}

    total_pitches = len(data)  # > 0: the empty case returned above

    # Pitch mix
    pitch_mix: list[dict] = []
    if "pitch_type" in data.columns:
        for ptype, group in data.groupby("pitch_type"):
            pitch_mix.append({
                "pitch_type": str(ptype),
                "count": int(len(group)),
                "share": float(len(group) / total_pitches),
                "avg_velocity": _mean_or_none(group, "release_speed"),
                "whiff_rate": _whiff_rate(group),
            })

    # Zone heatmap (the existing pybaseball 'zone' column is the 13-zone scheme)
    zone_data: list[dict] = []
    if "zone" in data.columns:
        for zone, group in data.groupby("zone"):
            zone_data.append({
                "zone": int(zone) if pd.notna(zone) else None,
                "pitches": int(len(group)),
                "whiff_rate": _whiff_rate(group),
            })

    # K rate is per plate appearance, not per pitch: `events` is only
    # populated on the pitch that ends a plate appearance.
    k_rate: Optional[float] = None
    if "events" in data.columns:
        events = data["events"]
        plate_appearances = int(events.notna().sum())
        strikeouts = int(events.isin(_STRIKEOUT_EVENTS).sum())
        k_rate = float(strikeouts / plate_appearances) if plate_appearances else 0.0

    result = {
        "pitcher_id": pitcher_id,
        "window_days": days_back,
        "total_pitches": int(total_pitches),
        "avg_velocity": _mean_or_none(data, "release_speed"),
        "k_rate_estimate": k_rate,
        "pitch_mix": pitch_mix,
        "zone": zone_data,
        "source": "statcast",
    }
    cache_set(cache_key, result, SPLITS_TTL)
    return result
|
|
|
|
|
|
def get_batter_vs_pitcher(batter_id: int, pitcher_id: int, years_back: int = 3) -> dict:
    """
    Historical matchup. We scope to the pitcher because their pitch stream
    is small enough to fetch quickly; then filter to plate appearances by
    the batter.

    Args:
        batter_id: Positive integer id matched against the ``batter`` column.
        pitcher_id: Positive integer id passed to ``statcast_pitcher``.
        years_back: Look-back window in years (365 days each), ending today (UTC).

    Returns:
        One of:
        - ``{"error": ...}`` for invalid ids or a failed fetch;
        - a stub with ``"matchup": "no data"`` (empty fetch or no ``batter``
          column) or ``"matchup": "no history"`` (the batter never faced this
          pitcher in the window) -- neither stub is cached;
        - the full matchup summary, with ``source`` set to ``"statcast"`` on
          a fresh fetch or ``"cache"`` when served from the cache.

        ``avg_exit_velocity`` is None when the ``launch_speed`` column is
        missing or holds no values.
    """
    # Reject non-integer and non-positive ids up front so an obviously bad
    # request never triggers a multi-year fetch.
    if not all(isinstance(i, int) and i > 0 for i in (batter_id, pitcher_id)):
        return {"error": "invalid ids"}

    cache_key = f"mlb:bvp:{batter_id}:{pitcher_id}:y{years_back}"
    cached = cache_get(cache_key)
    if cached is not None:
        cached["source"] = "cache"
        return cached

    end = _today_iso()
    start = _date_n_days_ago(365 * years_back)

    # Best-effort boundary around the third-party fetch: any failure is
    # reported to the caller as an error payload instead of propagating.
    try:
        pitcher_data = statcast_pitcher(start, end, pitcher_id)
    except Exception as exc:
        return {"error": f"statcast fetch failed: {exc!s}"}

    if pitcher_data is None or pitcher_data.empty or "batter" not in pitcher_data.columns:
        return {"batter_id": batter_id, "pitcher_id": pitcher_id, "matchup": "no data"}

    matchup = pitcher_data[pitcher_data["batter"] == batter_id]
    if matchup.empty:
        return {"batter_id": batter_id, "pitcher_id": pitcher_id, "matchup": "no history"}

    events = matchup["events"] if "events" in matchup.columns else pd.Series(dtype=str)

    # With no ball in play every launch_speed is missing and the mean is NaN;
    # report None instead so the payload stays JSON-safe.
    avg_exit_velocity: Optional[float] = None
    if "launch_speed" in matchup.columns:
        mean_speed = matchup["launch_speed"].mean()
        if pd.notna(mean_speed):
            avg_exit_velocity = float(mean_speed)

    pitch_types_faced: dict = {}
    if "pitch_type" in matchup.columns:
        pitch_types_faced = {
            str(ptype): int(count)
            for ptype, count in matchup["pitch_type"].value_counts().items()
        }

    result = {
        "batter_id": batter_id,
        "pitcher_id": pitcher_id,
        # `events` is only populated on the pitch that ends a plate appearance.
        "plate_appearances": int(events.notna().sum()),
        "hits": int(events.isin(["single", "double", "triple", "home_run"]).sum()),
        "strikeouts": int(events.isin(["strikeout", "strikeout_double_play"]).sum()),
        "home_runs": int((events == "home_run").sum()),
        "walks": int((events == "walk").sum()),
        "avg_exit_velocity": avg_exit_velocity,
        "pitches_seen": int(len(matchup)),
        "pitch_types_faced": pitch_types_faced,
        "source": "statcast",
    }
    # Cache aggressively — historical matchup data is stable.
    cache_set(cache_key, result, SPLITS_TTL * 2)
    return result
|