"""Utilities for loading and validating lost opportunity cost tables."""
from __future__ import annotations
from pathlib import Path
import pandas as pd
# Canonical column set every normalized table must expose.
REQUIRED_COLUMNS = ("algorithm", "lost_opp_cost", "component", "value")
# Recognized lost-opportunity-cost kinds and table components (lowercase).
SUPPORTED_LOST_OPP_COSTS = {"glocs", "llocs", "mwps"}
SUPPORTED_COMPONENTS = {"buyers", "sellers", "network", "total"}
# Maps a stats-file line label (e.g. "GLOCs buyers", "Total MWPs") to its
# normalized (lost_opp_cost, component) pair.  Built programmatically so the
# three kinds stay structurally identical.
LOST_OPP_COST_LINE_LABELS = {
    label: (kind, component)
    for kind, display in (("glocs", "GLOCs"), ("llocs", "LLOCs"), ("mwps", "MWPs"))
    for label, component in (
        (f"{display} buyers", "buyers"),
        (f"{display} sellers", "sellers"),
        (f"{display} network", "network"),
        (f"Total {display}", "total"),
    )
}
def load_lost_opp_cost_table(
    path: str | Path,
    *,
    algorithm_column: str = "algorithm",
    lost_opp_cost_column: str = "lost_opp_cost",
    component_column: str = "component",
    value_column: str = "value",
    sheet_name: str = "Sheet1",
) -> pd.DataFrame:
    """
    Load a lost-opportunity-cost table from disk and normalize core columns.

    Supported file types are ``.csv``, ``.parquet``, ``.txt``, ``.xlsx``,
    and ``.xls``.

    :param path: file path to load
    :param algorithm_column: source column name mapped to ``algorithm``
    :param lost_opp_cost_column: source column name mapped to ``lost_opp_cost``
    :param component_column: source column name mapped to ``component``
    :param value_column: source column name mapped to ``value``
    :param sheet_name: Excel sheet name when loading ``.xlsx``/``.xls``
    :return: validated normalized table with columns
             ``algorithm``, ``lost_opp_cost``, ``component``, ``value``
    :raises ValueError: if the file type is unsupported or parsed data fails
                        validation
    """
    file_path = Path(path)
    suffix = file_path.suffix.lower()
    supported_suffixes = {".csv", ".parquet", ".txt", ".xlsx", ".xls"}
    if suffix not in supported_suffixes:
        supported = ", ".join(sorted(supported_suffixes))
        raise ValueError(f"Unsupported file type '{suffix}'. Supported types: {supported}.")
    if suffix == ".csv":
        df = pd.read_csv(file_path)
    elif suffix == ".parquet":
        df = pd.read_parquet(file_path)
    elif suffix == ".txt":
        # ``.txt`` files are line-oriented stats dumps, not tabular data.
        df = _load_lost_opp_costs_from_stats_file(file_path)
    else:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
    # Strip header whitespace so hand-edited files still match column names.
    df = df.rename(columns=lambda value: str(value).strip())
    # Map caller-specified source columns onto the canonical names; skip
    # identity mappings and columns absent from the parsed frame.
    rename_map = {
        source: target
        for source, target in (
            (algorithm_column, "algorithm"),
            (lost_opp_cost_column, "lost_opp_cost"),
            (component_column, "component"),
            (value_column, "value"),
        )
        if source != target and source in df.columns
    }
    if rename_map:
        df = df.rename(columns=rename_map)
    if "algorithm" not in df.columns:
        # Stats-file dumps carry no algorithm column; derive it from the path.
        df["algorithm"] = _infer_algorithm_name(file_path)
    return validate_lost_opp_cost_table(df)
def validate_lost_opp_cost_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Validate and normalize a generic lost-opportunity-cost input table.

    :param df: input table expected to contain ``algorithm``,
               ``lost_opp_cost``, ``component``, and ``value``
    :return: normalized copy with lowercase categorical values and numeric
             ``value``
    :raises ValueError: if required columns are missing, unsupported categories
                        are present, labels are empty, or no numeric values are
                        available
    """
    normalized = df.copy()
    # Header whitespace is noise from hand-edited sources; normalize first so
    # the required-column check below is not defeated by padding.
    normalized.columns = [str(column).strip() for column in normalized.columns]
    missing = [column for column in REQUIRED_COLUMNS if column not in normalized.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}. Required columns: {list(REQUIRED_COLUMNS)}.")
    # Categorical columns are compared case-insensitively; ``algorithm`` keeps
    # its original casing as a display label.
    normalized["algorithm"] = normalized["algorithm"].astype(str).str.strip()
    normalized["lost_opp_cost"] = normalized["lost_opp_cost"].astype(str).str.strip().str.lower()
    normalized["component"] = normalized["component"].astype(str).str.strip().str.lower()
    # Non-numeric entries become NaN here and are checked collectively below.
    normalized["value"] = pd.to_numeric(normalized["value"], errors="coerce")
    for column in ("algorithm", "lost_opp_cost", "component"):
        if normalized[column].eq("").any():
            raise ValueError(f"Column '{column}' contains empty values.")
    invalid_lost_opp_costs = sorted(set(normalized["lost_opp_cost"]) - SUPPORTED_LOST_OPP_COSTS)
    if invalid_lost_opp_costs:
        raise ValueError(
            f"Unsupported lost_opp_cost values: {invalid_lost_opp_costs}. "
            f"Supported lost_opp_cost values: {sorted(SUPPORTED_LOST_OPP_COSTS)}."
        )
    invalid_components = sorted(set(normalized["component"]) - SUPPORTED_COMPONENTS)
    if invalid_components:
        raise ValueError(
            f"Unsupported component values: {invalid_components}. "
            f"Supported components: {sorted(SUPPORTED_COMPONENTS)}."
        )
    # At least one row must carry a usable numeric value.
    if normalized["value"].notna().sum() == 0:
        raise ValueError("Column 'value' does not contain any numeric values.")
    return normalized
def _infer_algorithm_name(file_path: Path) -> str:
parent_name = file_path.parent.name
if parent_name.endswith("_results"):
return parent_name.removesuffix("_results")
stem = file_path.stem
if stem.endswith("_stats"):
return stem.removesuffix("_stats")
return stem
def _load_lost_opp_costs_from_stats_file(file_path: Path) -> pd.DataFrame:
    """Parse a line-oriented ``<label>: <value>`` stats dump into a DataFrame.

    Only lines whose label appears in :data:`LOST_OPP_COST_LINE_LABELS` are
    kept; everything else is silently skipped.  Values are kept as raw strings
    and coerced to numbers later by :func:`validate_lost_opp_cost_table`.

    :param file_path: path to a UTF-8 text stats file
    :return: frame with ``lost_opp_cost``, ``component``, and ``value``
             columns (empty, column-less frame when nothing matched)
    """
    records: list[dict[str, object]] = []
    for line in file_path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        if not stripped or ":" not in stripped:
            continue
        label, raw_value = stripped.split(":", maxsplit=1)
        # Strip the label too: previously "GLOCs buyers : 1.2" (space before
        # the colon) silently failed to match even though the value side was
        # stripped.  Stripping is strictly more permissive, so existing files
        # still parse identically.
        lookup = LOST_OPP_COST_LINE_LABELS.get(label.strip())
        if lookup is None:
            continue
        lost_opp_cost, component = lookup
        records.append(
            {
                "lost_opp_cost": lost_opp_cost,
                "component": component,
                "value": raw_value.strip(),
            }
        )
    return pd.DataFrame(records)