# Source code for apem.unit_based_model.evaluation.welfare_analysis

"""Utilities for loading and validating welfare tables from structured files or allocation stats text files."""

from __future__ import annotations

from pathlib import Path

import pandas as pd

# Canonical column names that a welfare table must contain to pass validation.
REQUIRED_COLUMNS = ("power_flow_model", "welfare_scope", "period", "welfare")
# Accepted ``welfare_scope`` labels; validation compares against these after
# stripping whitespace and lowercasing the column values.
SUPPORTED_WELFARE_SCOPES = {"period", "total"}



# [docs]
def load_welfare_table(
    path: str | Path,
    *,
    power_flow_model_name: str | None = None,
    welfare_scope_column: str = "welfare_scope",
    period_column: str = "period",
    welfare_column: str = "welfare",
    sheet_name: str = "Sheet1",
) -> pd.DataFrame:
    """
    Load a welfare table from disk and normalize core columns.

    Supported file types are ``.txt``, ``.csv``, ``.parquet``, ``.xlsx``,
    and ``.xls``. ``.txt`` files are parsed as allocation stats files; all
    other types are read as tabular data with pandas.

    Column headers are stripped of surrounding whitespace before the custom
    column names are mapped onto the canonical names, and the custom names
    are stripped the same way so both sides compare consistently.

    :param path: file path to load
    :param power_flow_model_name: model name override used when the loaded file
                                  does not include ``power_flow_model``; when
                                  omitted (or empty) the name is inferred from
                                  the file name
    :param welfare_scope_column: source column name mapped to ``welfare_scope``
    :param period_column: source column name mapped to ``period``
    :param welfare_column: source column name mapped to ``welfare``
    :param sheet_name: Excel sheet name when loading ``.xlsx``/``.xls``
    :return: validated normalized welfare table
    :raises ValueError: if file type is unsupported, a custom column would be
                        renamed onto a canonical column that already exists,
                        or parsed data fails validation
    """
    file_path = Path(path)
    suffix = file_path.suffix.lower()
    supported_suffixes = {".txt", ".csv", ".parquet", ".xlsx", ".xls"}

    if suffix not in supported_suffixes:
        supported = ", ".join(sorted(supported_suffixes))
        raise ValueError(f"Unsupported file type '{suffix}'. Supported types: {supported}.")

    if suffix == ".txt":
        df = _load_welfare_from_stats_file(file_path)
    elif suffix == ".csv":
        df = pd.read_csv(file_path)
    elif suffix == ".parquet":
        df = pd.read_parquet(file_path)
    else:
        df = pd.read_excel(file_path, sheet_name=sheet_name)

    df = df.rename(columns=lambda value: str(value).strip())

    # Headers were stripped above, so strip the caller-supplied names as well;
    # otherwise a padded name could never match a column.
    column_sources = {
        "welfare_scope": welfare_scope_column.strip(),
        "period": period_column.strip(),
        "welfare": welfare_column.strip(),
    }
    rename_map: dict[str, str] = {
        source: canonical
        for canonical, source in column_sources.items()
        if source != canonical and source in df.columns
    }
    if rename_map:
        df = df.rename(columns=rename_map)
        # Renaming onto a name that is still present yields duplicate labels,
        # which would later fail with an opaque pandas error. Checking after
        # the rename keeps simultaneous swaps (a -> b, b -> a) working.
        labels = list(df.columns)
        clashes = sorted(
            canonical for canonical in set(rename_map.values()) if labels.count(canonical) > 1
        )
        if clashes:
            raise ValueError(
                f"Renaming source columns would create duplicate columns: {clashes}. "
                "The file already contains columns with these names."
            )

    if "power_flow_model" not in df.columns:
        df["power_flow_model"] = power_flow_model_name or _infer_power_flow_model_name(file_path)

    return validate_welfare_table(df)




# [docs]
def validate_welfare_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Validate and normalize a generic welfare-analysis input table.

    The input is not modified; all normalization happens on a copy.

    :param df: input table expected to contain ``power_flow_model``,
               ``welfare_scope``, ``period``, and ``welfare``
    :return: normalized copy with stripped model labels, lowercase scope
             labels, nullable-integer periods, and numeric welfare values
             (non-numeric welfare entries become missing values)
    :raises ValueError: if required columns are missing, scope values are
                        unsupported, model labels are empty or missing,
                        periods are not integer-valued, no welfare value is
                        numeric, or period/scope combinations are
                        inconsistent
    """
    normalized = df.copy()
    normalized.columns = [str(column).strip() for column in normalized.columns]

    missing = [column for column in REQUIRED_COLUMNS if column not in normalized.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}. Required columns: {list(REQUIRED_COLUMNS)}.")

    # Record missing labels before the string cast: astype(str) turns
    # NaN/None/NA into "nan"/"None"/"<NA>", which would slip past the
    # emptiness check below.
    missing_model = normalized["power_flow_model"].isna()
    normalized["power_flow_model"] = normalized["power_flow_model"].astype(str).str.strip()
    normalized["welfare_scope"] = normalized["welfare_scope"].astype(str).str.strip().str.lower()

    periods = pd.to_numeric(normalized["period"], errors="coerce")
    try:
        normalized["period"] = periods.astype("Int64")
    except (TypeError, ValueError) as error:
        # pandas refuses to cast fractional or infinite values to Int64;
        # surface that as the documented ValueError instead.
        raise ValueError("Column 'period' must contain integer values.") from error

    normalized["welfare"] = pd.to_numeric(normalized["welfare"], errors="coerce")

    if (missing_model | normalized["power_flow_model"].eq("")).any():
        raise ValueError("Column 'power_flow_model' contains empty values.")

    invalid_scopes = sorted(set(normalized["welfare_scope"]) - SUPPORTED_WELFARE_SCOPES)
    if invalid_scopes:
        raise ValueError(
            f"Unsupported welfare_scope values: {invalid_scopes}. "
            f"Supported welfare_scope values: {sorted(SUPPORTED_WELFARE_SCOPES)}."
        )

    if normalized["welfare"].notna().sum() == 0:
        raise ValueError("Column 'welfare' does not contain any numeric values.")

    if ((normalized["welfare_scope"] == "period") & normalized["period"].isna()).any():
        raise ValueError("Rows with welfare_scope='period' must have a numeric period.")

    if ((normalized["welfare_scope"] == "total") & normalized["period"].notna()).any():
        raise ValueError("Rows with welfare_scope='total' must not have a period value.")

    return normalized



def _infer_power_flow_model_name(file_path: Path) -> str:
    """Derive a model label from the file name, dropping a trailing ``_stats``."""
    # str.removesuffix leaves the stem untouched when the suffix is absent,
    # so no explicit endswith guard is required.
    return file_path.stem.removesuffix("_stats")


def _load_welfare_from_stats_file(file_path: Path) -> pd.DataFrame:
    """
    Parse welfare entries from an allocation stats text file.

    Only lines of the form ``Welfare period <N>: <value>`` and
    ``Total welfare: <value>`` are used; blank lines, lines without a colon,
    and lines with any other label are ignored. Values are returned as the
    raw strings found in the file.

    :param file_path: path to the UTF-8 encoded stats file
    :return: table with columns ``welfare_scope``, ``period``, and
             ``welfare``; the columns are present even when no welfare line
             was found, so callers always receive the same schema
    """
    period_prefix = "Welfare period "
    records: list[dict[str, object]] = []
    for line in file_path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        if not stripped or ":" not in stripped:
            continue

        # Split on the first colon only so the value part is kept intact.
        label, raw_value = stripped.split(":", maxsplit=1)
        label = label.strip()
        raw_value = raw_value.strip()

        if label.startswith(period_prefix):
            records.append(
                {
                    "welfare_scope": "period",
                    "period": label.removeprefix(period_prefix).strip(),
                    "welfare": raw_value,
                }
            )
        elif label == "Total welfare":
            records.append(
                {
                    "welfare_scope": "total",
                    "period": pd.NA,
                    "welfare": raw_value,
                }
            )

    # Explicit columns keep the schema stable for files without welfare lines.
    return pd.DataFrame(records, columns=["welfare_scope", "period", "welfare"])