Source code for chemparseplot.parse.chemgp_hdf5
# SPDX-FileCopyrightText: 2023-present Rohit Goswami <rog32@hi.is>
#
# SPDX-License-Identifier: MIT
"""HDF5 file I/O utilities for ChemGP data.
This module provides functions for reading structured data from ChemGP HDF5
output files. The HDF5 layout mirrors the Julia common_plot.jl helpers.

HDF5 Layout
-----------
- ``grids/<name>``: 2D arrays with attrs x_range, y_range, x_length, y_length
- ``table/<name>``: group of same-length 1D arrays
- ``paths/<name>``: point sequences (x, y or rAB, rBC)
- ``points/<name>``: point sets (x, y or pc1, pc2)
- Root attrs: metadata scalars

.. versionadded:: 1.7.0
   Extracted from chemgp.plt_gp to standalone module.
"""
from typing import Any
import numpy as np
[docs]
def _h5_text(item: Any) -> Any:
    """Convert one ``tolist()`` result of a string-like dataset to ``str``.

    Lists are walked recursively so multi-dimensional datasets keep their
    nesting, ``bytes`` are decoded as UTF-8, and anything else goes through
    ``str()``.
    """
    if isinstance(item, list):
        return [_h5_text(entry) for entry in item]
    if isinstance(item, bytes):
        return item.decode("utf-8")
    return str(item)


def read_h5_table(f: Any, name: str = "table") -> Any:
    """Read a group of same-length vectors as a DataFrame.

    Parameters
    ----------
    f
        Open HDF5 file object (anything supporting ``f[name]`` that yields a
        mapping of column name to dataset)
    name
        Name of the table group (default: "table")

    Returns
    -------
    DataFrame
        DataFrame with one column per entry of the group. Columns whose dtype
        kind is ``"S"`` (byte strings) or ``"O"`` (objects) are converted to
        Python ``str``; all other columns are converted with ``tolist()``.

    Raises
    ------
    UnicodeDecodeError
        If a byte string in a text column is not valid UTF-8.
    """
    # Imported lazily so pandas is only needed when a table is actually read.
    import pandas as pd

    group = f[name]
    columns = {}
    for key in group:
        # np.asarray is a no-op for arrays and lets scalar reads that lack a
        # ``.dtype`` attribute go through the same code path.
        values = np.asarray(group[key][()])
        if values.dtype.kind in {"S", "O"}:
            # ``astype(str)`` decodes bytes as ASCII and fails on any
            # non-ASCII character, so decode explicitly as UTF-8 instead.
            columns[key] = _h5_text(values.tolist())
        else:
            columns[key] = values.tolist()
    return pd.DataFrame(columns)
[docs]
def read_h5_grid(
f: Any, name: str
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray | None]:
"""Read a 2D grid with optional axis ranges.
Parameters
----------
f
Open HDF5 file object
name
Name of the grid dataset
Returns
-------
tuple
(data, x_coords, y_coords) where x_coords and y_coords may be None
if axis range attributes are not present
"""
ds = f[f"grids/{name}"]
data = ds[()]
x_coords = None
y_coords = None
if "x_range" in ds.attrs and "x_length" in ds.attrs:
lo, hi = ds.attrs["x_range"]
n = int(ds.attrs["x_length"])
x_coords = np.linspace(lo, hi, n)
if "y_range" in ds.attrs and "y_length" in ds.attrs:
lo, hi = ds.attrs["y_range"]
n = int(ds.attrs["y_length"])
y_coords = np.linspace(lo, hi, n)
return data, x_coords, y_coords
[docs]
def read_h5_path(f: Any, name: str) -> dict[str, np.ndarray]:
    """Load every dataset stored under ``paths/<name>``.

    Parameters
    ----------
    f
        Open HDF5 file object
    name
        Name of the path group (looked up under ``paths/``)

    Returns
    -------
    dict
        Dictionary mapping each key of the group to the fully read value of
        that entry
    """
    group = f[f"paths/{name}"]
    coords: dict[str, np.ndarray] = {}
    for key in group.keys():
        # ``[()]`` reads the whole dataset rather than keeping a handle.
        coords[key] = group[key][()]
    return coords
[docs]
def read_h5_points(f: Any, name: str) -> dict[str, np.ndarray]:
    """Load every dataset stored under ``points/<name>``.

    Parameters
    ----------
    f
        Open HDF5 file object
    name
        Name of the points group (looked up under ``points/``)

    Returns
    -------
    dict
        Dictionary mapping each key of the group to the fully read value of
        that entry
    """
    group = f[f"points/{name}"]
    coords: dict[str, np.ndarray] = {}
    for key in group.keys():
        # ``[()]`` reads the whole dataset rather than keeping a handle.
        coords[key] = group[key][()]
    return coords
[docs]
def validate_hdf5_structure(
f: Any, required_groups: list[str] | None = None
) -> list[str]:
"""Validate HDF5 file has expected structure.
Parameters
----------
f
Open HDF5 file object
required_groups
List of required group names (default: ["grids", "table"])
Returns
-------
list[str]
List of missing groups (empty if all present)
Raises
------
ValueError
If required groups are missing
"""
if required_groups is None:
required_groups = ["grids", "table"]
missing = [g for g in required_groups if g not in f]
if missing:
msg = f"Invalid HDF5 structure. Missing groups: {missing}"
raise ValueError(msg)
return missing