Source code for op3.uq.encoder_bridge

"""
Chapter 8 encoder <-> Op^3 UQ bridge (Task 20).

Exposes the 1,794-row OptumGX Monte Carlo database used by the
dissertation Chapter 8 digital-twin encoder as a first-class Op^3
UQ data source. Previously the MC database was consumed via a
side-channel CSV loaded directly by the encoder training script;
this module turns it into a propagator that any downstream Op^3
stage (Bayesian calibration, PCE surrogate, DLC sensitivity) can
consume.

Real CSV schema (SiteA integrated_database_1794.csv):

    run, S_D, scour_m, su0, k_su, Hmax_kN, H_ratio, V_ratio,
    f1_Hz, f1_f0, fixity_proxy

1794 Monte Carlo runs sampled over:
    scour_m  in [0.0, 4.0] m
    su0      in [7.5, 27.8] kPa  (surface undrained shear strength)
    k_su     in [12, 35] kPa/m   (depth gradient)

outputs:
    f1_Hz    first fore-aft bending frequency
    f1_f0    f1 normalised by the pristine (s=0) case
    Hmax_kN  ultimate horizontal capacity at collapse
    H_ratio  Hmax relative to pristine
    fixity_proxy (base rotational compliance indicator)

Use
---
    from op3.uq.encoder_bridge import load_site_a_mc
    df = load_site_a_mc()   # reads PHD/data/integrated_database_1794.csv
    # Sample the real joint distribution for Bayesian updates
    # or encoder training.

``prior`` is a list of ``SoilPrior`` objects whose mean and COV are
statistically consistent with the database, so
``propagate_pisa_mc(soil_priors=prior, ...)`` produces the same joint
distribution the encoder was trained on. This decouples the encoder
from the raw OptumGX runs and makes every Ch8 derivation traceable
through the committed Op^3 pipeline.
"""
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd

from op3.uq.propagation import SoilPrior



[docs]
def load_encoder_mc(csv_path: str | Path) -> pd.DataFrame:
    """
    Load a generic Chapter 8 MC database. For the real SiteA case
    use ``load_site_a_mc()`` which auto-resolves the PHD path.

    Returns a pandas DataFrame. The CSV is expected to have one row
    per MC realisation; any columns beyond the minimum set are
    preserved for downstream feature engineering.
    """
    p = Path(csv_path)
    if not p.exists():
        raise FileNotFoundError(f"encoder MC database not found: {p}")
    df = pd.read_csv(p)
    if "run_id" not in df.columns and "run" not in df.columns:
        df = df.reset_index().rename(columns={"index": "run_id"})
    return df




[docs]
def load_site_a_mc() -> pd.DataFrame:
    """
    Load the real 1794-sample SiteA integrated MC database from the
    PHD SSOT (``F:/TREE_OF_THOUGHT/PHD/data/integrated_database_1794.csv``).

    This is the authoritative training set for the Chapter 8 digital
    twin encoder. The database spans scour [0, 4] m, su0 [7.5, 27.8]
    kPa, and k_su [12, 35] kPa/m, and records f1_Hz, Hmax_kN, and
    fixity_proxy outputs from 1794 real OptumGX + OpenSeesPy runs.
    """
    from op3.data_sources import site_a_mc_database
    p = site_a_mc_database()
    return pd.read_csv(p)




[docs]
def encoder_as_prior(
    df: pd.DataFrame,
    columns: list[str],
    *,
    default_soil_type: str = "sand",
    su_or_phi_default: float = 35.0,
) -> list[SoilPrior]:
    """
    Turn per-column statistics from the MC database into a list of
    ``SoilPrior`` objects, one per column. Each prior carries the
    empirical mean and COV computed from the column; the soil_type
    and strength default are user-selectable.

    The resulting prior list is suitable for feeding into
    ``op3.uq.propagation.propagate_pisa_mc`` to reproduce the encoder
    training distribution through the deterministic PISA pipeline.
    """
    priors: list[SoilPrior] = []
    for i, col in enumerate(columns):
        if col not in df.columns:
            raise KeyError(f"column {col} missing from MC database")
        values = df[col].values.astype(float)
        mean = float(np.mean(values))
        std = float(np.std(values))
        cov = float(std / mean) if mean > 0 else 0.30
        priors.append(SoilPrior(
            depth_m=float(i * 15.0),   # 15 m spacing by convention
            G_mean_Pa=mean,
            G_cov=cov,
            soil_type=default_soil_type,
            su_or_phi_mean=su_or_phi_default,
            su_or_phi_cov=0.10,
        ))
    return priors




[docs]
def bayesian_from_encoder(
    df: pd.DataFrame,
    *,
    forward_model,
    observation_col: str = "f1_Hz",
    parameter_col: str = "G0_top_Pa",
    sigma: float = 0.005,
    n_grid: int = 101,
):
    """
    Treat one MC row as the "truth" observation and run an Op^3
    Bayesian calibration over another parameter column. Useful for
    synthetic-truth verification of the encoder: if the encoder is
    consistent, the posterior mean should recover the row's true
    parameter value.

    Returns an ``op3.uq.bayesian.BayesianPosterior``.
    """
    import numpy as np
    from op3.uq.bayesian import grid_bayesian_calibration, normal_likelihood

    truth_param = float(df[parameter_col].iloc[0])
    measured = float(df[observation_col].iloc[0])

    lo, hi = 0.5 * truth_param, 1.5 * truth_param
    grid = np.linspace(lo, hi, n_grid)

    return grid_bayesian_calibration(
        forward_model=forward_model,
        likelihood_fn=normal_likelihood(measured, sigma),
        grid=grid,
    )