# Source code for xtal2png.utils.data

from importlib.resources import read_text
from typing import Optional, Sequence
from warnings import warn

import numpy as np
from numpy.testing import assert_allclose, assert_equal
from numpy.typing import ArrayLike
from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
from pymatgen.core.lattice import Lattice
from pymatgen.core.structure import Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

# Two-site dummy structures (all-Si and all-Ni) that share one lattice and the same
# two site coordinates.
coords = [[0, 0, 0], [0.75, 0.5, 0.75]]
lattice = Lattice.from_parameters(a=3.84, b=3.84, c=3.84, alpha=120, beta=90, gamma=60)
dummy_structures = [
    Structure(lattice, ["Si", "Si"], coords),
    Structure(lattice, ["Ni", "Ni"], coords),
]

# Example structures, built at import time: each CIF file is read as a text resource
# of the ``xtal2png.utils`` package and parsed into a pymatgen ``Structure``.
EXAMPLE_CIFS = ["Zn2B2PbO6.cif", "V2NiSe4.cif"]
example_structures = []
# NOTE: ``cif`` and ``cif_str`` remain defined at module level after this loop.
for cif in EXAMPLE_CIFS:
    cif_str = read_text("xtal2png.utils", cif)
    example_structures.append(Structure.from_str(cif_str, "cif"))


# ToDo: potentially expose tolerance options
def _get_space_group(s: Structure) -> int:
    """Return the space group number of ``s`` as an ``int``, or 0 if unavailable.

    The number is taken from the second entry of ``s.get_space_group_info()`` and
    rounded to the nearest integer. A ``TypeError`` raised anywhere along the way is
    treated as "no space group found".

    See issue https://github.com/sparks-baird/xtal2png/issues/184
    """
    # 0 is a safe placeholder since it is not a valid space group number
    fallback = 0
    try:
        info = s.get_space_group_info()
        number = int(np.round(info[1]))
    except TypeError:
        number = fallback
    return number


def element_wise_scaler(
    X: ArrayLike,
    feature_range: Optional[Sequence] = None,
    data_range: Optional[Sequence] = None,
):
    """Scale parameters according to a prespecified min and max (``data_range``).

    Every element is mapped linearly so that ``data_range`` is sent onto
    ``feature_range``. ``feature_range`` is preserved from MinMaxScaler.

    See Also
    --------
    sklearn.preprocessing.MinMaxScaler : Scale each feature to a given range.

    Parameters
    ----------
    X : ArrayLike
        Features to be scaled element-wise.
    feature_range : Optional[Sequence]
        ``(min, max)`` pair that the scaled values will span. If None, the min and
        max of ``X`` are used.
    data_range : Optional[Sequence]
        ``(min, max)`` pair with the expected bounds for the data, e.g. 0 to 117 for
        periodic elements. If None, the min and max of ``X`` are used.

    Returns
    -------
    X_scaled
        Element-wise scaled values.

    Notes
    -----
    No check is made that ``data_range`` has non-zero width; if both bounds are equal
    the scaling divides by zero.

    Examples
    --------
    >>> element_wise_scaler([[1, 2], [3, 4]], feature_range=[1, 4], data_range=[0, 8])
    array([[1.375, 1.75 ],
           [2.125, 2.5  ]])
    """
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    # Missing ranges fall back to the observed extent of the data.
    if data_range is None:
        data_range = [np.min(X), np.max(X)]
    if feature_range is None:
        feature_range = [np.min(X), np.max(X)]

    data_min, data_max = data_range
    feature_min, feature_max = feature_range
    # following modified from:
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
    X_std = (X - data_min) / (data_max - data_min)
    X_scaled = X_std * (feature_max - feature_min) + feature_min
    return X_scaled


def element_wise_unscaler(
    X_scaled: ArrayLike,
    feature_range: Sequence,
    data_range: Sequence,
):
    """Unscale parameters back to a prespecified min and max (``data_range``).

    Inverse of the element-wise scaling: values lying in ``feature_range`` are mapped
    linearly back onto ``data_range``. ``feature_range`` is preserved from
    MinMaxScaler.

    See Also
    --------
    sklearn.preprocessing.MinMaxScaler : Scale each feature to a given range.

    Parameters
    ----------
    X_scaled : ArrayLike
        Element-wise scaled values.
    feature_range : Sequence
        ``(min, max)`` pair that the scaled values span.
    data_range : Sequence
        ``(min, max)`` pair with the expected bounds for the unscaled data, e.g. 0 to
        117 for periodic elements.

    Returns
    -------
    X
        Element-wise unscaled values.

    Examples
    --------
    >>> element_wise_unscaler(
    ...     [[1.375, 1.75], [2.125, 2.5]], feature_range=[1, 4], data_range=[0, 8]
    ... )
    array([[1., 2.],
           [3., 4.]])

    """
    if not isinstance(X_scaled, np.ndarray):
        X_scaled = np.array(X_scaled)

    data_min, data_max = data_range
    feature_min, feature_max = feature_range
    # following modified from:
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

    # inverse transform, checked against Mathematica
    X_std = (X_scaled - feature_min) / (feature_max - feature_min)
    X = data_min + (data_max - data_min) * X_std
    return X


def rgb_scaler(
    X: ArrayLike,
    data_range: Optional[Sequence] = None,
):
    """Scale parameters according to RGB scale (0 to 255).

    ``feature_range`` is fixed to [0, 255], ``data_range`` is either specified or
    calculated based on min and max. The scaled values are rounded to the nearest
    integer and returned with an integer dtype.

    See Also
    --------
    sklearn.preprocessing.MinMaxScaler : Scale each feature to a given range.

    Parameters
    ----------
    X : ArrayLike
        Features to be scaled element-wise.
    data_range : Optional[Sequence]
        Range to use in place of np.min(X) and np.max(X) as in ``MinMaxScaler``.

    Returns
    -------
    X_scaled
        Element-wise scaled values, rounded and cast to ``int``. Values are not
        clipped, so inputs outside ``data_range`` map outside 0 to 255.

    Examples
    --------
    >>> rgb_scaler([[1, 2], [3, 4]], data_range=[0, 8])
    array([[ 32,  64],
           [ 96, 128]])
    """
    rgb_range = [0, 255]
    X_scaled = element_wise_scaler(X, data_range=data_range, feature_range=rgb_range)
    # round before casting so that e.g. 31.875 becomes 32 rather than truncating to 31
    X_scaled = np.round(X_scaled).astype(int)
    return X_scaled


def rgb_unscaler(
    X: ArrayLike,
    data_range: Sequence,
):
    """Unscale parameters from their RGB scale (0 to 255).

    ``feature_range`` is fixed to [0, 255] and ``data_range`` must be specified; the
    values are mapped linearly from [0, 255] back onto ``data_range``.

    See Also
    --------
    sklearn.preprocessing.MinMaxScaler : Scale each feature to a given range.

    Parameters
    ----------
    X : ArrayLike
        Element-wise scaled values.
    data_range : Sequence
        ``(min, max)`` pair of the unscaled data that 0 and 255 correspond to.

    Returns
    -------
    X
        Unscaled features (floating point; no rounding is applied).

    Examples
    --------
    >>> np.round(rgb_unscaler([[32, 64], [96, 128]], data_range=[0, 8]), 2)
    array([[1.  , 2.01],
           [3.01, 4.02]])
    """
    rgb_range = [0, 255]
    X_unscaled = element_wise_unscaler(
        X, data_range=data_range, feature_range=rgb_range
    )
    return X_unscaled


def get_image_mode(d: np.ndarray) -> str:
    """Get the image mode (i.e. "RGB" vs. grayscale ("L")) for an image array.

    Parameters
    ----------
    d : np.ndarray
        A NumPy array with 3 dimensions, where the first dimension corresponds to the
        number of image channels and the second and third dimensions correspond to the
        height and width of the image.

    Returns
    -------
    mode : str
        "RGB" for 3-channel images and "L" for grayscale images.

    Raises
    ------
    ValueError
        "expected an array with 3 dimensions, received {d.ndim} dims"
    ValueError
        "Expected a single-channel or 3-channel array, but received a
        {d.shape[0]}-channel array."

    Examples
    --------
    >>> d = np.zeros((1, 64, 64), dtype=np.uint8)  # grayscale image
    >>> get_image_mode(d)
    'L'
    """
    if d.ndim != 3:
        raise ValueError(
            f"expected an array with 3 dimensions, received {d.ndim} dims"
        )

    # the leading axis holds the channels
    n_channels = d.shape[0]
    if n_channels == 3:
        return "RGB"
    if n_channels == 1:
        return "L"
    raise ValueError(
        f"Expected a single-channel or 3-channel array, but received a {n_channels}-channel array."  # noqa: E501
    )


def unit_cell_converter(
    s: Structure, cell_type: Optional[str] = None, symprec=0.1, angle_tolerance=5.0
):
    """Convert from the original unit cell type to another unit cell via pymatgen.

    Parameters
    ----------
    s : Structure
        a pymatgen Structure.
    cell_type : Optional[str], optional
        The cell type as a str or None if leaving the structure as-is. Possible options
        are "primitive_standard", "conventional_standard", "refined", "reduced", and
        None. By default None
    symprec : float, optional
        Passed to ``SpacegroupAnalyzer`` as ``symprec``. Only used for the
        "primitive_standard", "conventional_standard" and "refined" cell types. By
        default 0.1
    angle_tolerance : float, optional
        Passed to ``SpacegroupAnalyzer`` as ``angle_tolerance``. Only used for the
        "primitive_standard", "conventional_standard" and "refined" cell types. By
        default 5.0

    Returns
    -------
    s : Structure
        The converted Structure (the input itself when ``cell_type`` is None).

    Raises
    ------
    ValueError
        "Expected one of 'primitive_standard', 'conventional_standard', 'refined',
        'reduced' or None, got {cell_type}"

    Examples
    --------
    >>> s = unit_cell_converter(s, cell_type="reduced")
    """
    # These cases need no symmetry analysis, so return before building the analyzer.
    if cell_type is None:
        return s
    if cell_type == "reduced":
        return s.get_reduced_structure()

    # Reject unknown options before paying for the symmetry analysis.
    if cell_type not in ("primitive_standard", "conventional_standard", "refined"):
        raise ValueError(
            f"Expected one of 'primitive_standard', 'conventional_standard', 'refined', 'reduced' or None, got {cell_type}"  # noqa: E501
        )

    spa = SpacegroupAnalyzer(
        s,
        symprec=symprec,
        angle_tolerance=angle_tolerance,
    )
    if cell_type == "primitive_standard":
        return spa.get_primitive_standard_structure()
    if cell_type == "conventional_standard":
        return spa.get_conventional_standard_structure()
    return spa.get_refined_structure()


# Tolerances used by ``assert_structures_approximate_match``, expressed as a fraction
# of the 0 to 255 RGB range (presumably one quantization step — confirm).
RGB_TOL = 1 / 255  # should this be 256?
# Looser variant: 1.5 times ``RGB_TOL``.
RGB_LOOSE_TOL = 1.5 / 255


def assert_structures_approximate_match(
    example_structures, structures, tol_multiplier=1.0
):
    """Assert that decoded structures approximately match the original structures.

    The two inputs are compared pairwise. For each pair, a warning is issued when
    ``StructureMatcher`` (with tolerances scaled by ``tol_multiplier``) does not
    consider them a match. Both structures are then refined with
    ``SpacegroupAnalyzer``, the decoded one is mapped onto the original via
    ``StructureMatcher.get_s2_like_s1``, and lattice lengths, lattice angles, atomic
    numbers, fractional coordinates and space group numbers are compared.

    Parameters
    ----------
    example_structures : iterable of Structure
        The original (reference) structures.
    structures : iterable of Structure
        The decoded structures, in the same order as ``example_structures``.
    tol_multiplier : float, optional
        Factor applied to the default ``StructureMatcher`` tolerances and to the
        RGB-based numeric tolerances. By default 1.0

    Raises
    ------
    AssertionError
        If any compared quantity differs by more than its tolerance, or if a decoded
        structure cannot be mapped onto its original.

    Notes
    -----
    The inputs are paired with ``zip``, so iteration stops at the shorter input.
    """
    # The matchers depend only on ``tol_multiplier``, so build them once.
    default_matcher = StructureMatcher()
    loose_matcher = StructureMatcher(
        ltol=default_matcher.ltol * tol_multiplier,
        stol=default_matcher.stol * tol_multiplier,
        angle_tol=default_matcher.angle_tol * tol_multiplier,
        comparator=ElementComparator(),
    )
    alignment_matcher = StructureMatcher(
        primitive_cell=False, comparator=ElementComparator()
    )

    for i, (s, structure) in enumerate(zip(example_structures, structures)):
        if not loose_matcher.fit(s, structure):
            warn(
                f"{i}-th original and decoded structures do not match according to StructureMatcher(comparator=ElementComparator()).fit(s, structure).\n\nOriginal (s): {s}\n\nDecoded (structure): {structure}"  # noqa: E501
            )

        # Put both structures in the same (refined) setting before comparing them.
        s = SpacegroupAnalyzer(
            s, symprec=0.1, angle_tolerance=5.0
        ).get_refined_structure()
        structure = SpacegroupAnalyzer(
            structure, symprec=0.1, angle_tolerance=5.0
        ).get_refined_structure()

        # Re-express the decoded structure so its sites line up with the original.
        s2 = alignment_matcher.get_s2_like_s1(s, structure)
        if s2 is None:
            # No mapping was found; fail with a clear message rather than an
            # AttributeError on None further down.
            raise AssertionError(
                f"{i}-th decoded structure could not be mapped onto the original structure.\n\nOriginal (s): {s}\n\nDecoded (structure): {structure}"  # noqa: E501
            )

        a_check = s.lattice.a
        b_check = s.lattice.b
        c_check = s.lattice.c
        angles_check = s.lattice.angles
        atomic_numbers_check = s.atomic_numbers
        frac_coords_check = s.frac_coords
        space_group_check = _get_space_group(s)

        latt_a = s2.lattice.a
        latt_b = s2.lattice.b
        latt_c = s2.lattice.c
        angles = s2.lattice.angles
        atomic_numbers = s2.atomic_numbers
        frac_coords = s2.frac_coords
        # Space group of the *decoded* structure, so the final check can fail.
        space_group = _get_space_group(s2)

        assert_allclose(
            a_check,
            latt_a,
            rtol=RGB_LOOSE_TOL * tol_multiplier,
            err_msg="lattice parameter length `a` not all close",
        )

        assert_allclose(
            b_check,
            latt_b,
            rtol=RGB_LOOSE_TOL * tol_multiplier,
            err_msg="lattice parameter length `b` not all close",
        )

        # `c` is checked with twice the tolerance used for `a` and `b`
        assert_allclose(
            c_check,
            latt_c,
            rtol=RGB_LOOSE_TOL * 2 * tol_multiplier,
            err_msg="lattice parameter length `c` not all close",
        )

        assert_allclose(
            angles_check,
            angles,
            rtol=RGB_LOOSE_TOL * tol_multiplier,
            err_msg="lattice parameter angles not all close",
        )

        assert_allclose(
            atomic_numbers_check,
            atomic_numbers,
            rtol=RGB_LOOSE_TOL * tol_multiplier,
            err_msg="atomic numbers not all close",
        )

        # use atol since frac_coords values are between 0 and 1
        assert_allclose(
            frac_coords_check,
            frac_coords,
            atol=RGB_TOL * tol_multiplier,
            err_msg="fractional coordinates not all close",
        )

        assert_equal(
            space_group_check,
            space_group,
            err_msg=f"space groups do not match. Original: {space_group_check}. Decoded: {space_group}.",  # noqa: E501
        )