Source code for ingredient_parser.dataclasses

#!/usr/bin/env python3

import copy
import enum
import operator
from dataclasses import dataclass, field
from fractions import Fraction
from functools import reduce
from statistics import mean
from typing import Any

import pint

from ._common import UREG
from .inference import NumpyCRFInference



[docs]
class UnitSystem(enum.StrEnum):
    """Enum defining unit systems"""

    METRIC = enum.auto()
    US_CUSTOMARY = enum.auto()
    IMPERIAL = enum.auto()
    AUSTRALIAN = enum.auto()
    JAPANESE = enum.auto()
    OTHER = enum.auto()
    NONE = enum.auto()




[docs]
@dataclass
class TokenFeatures:
    """Dataclass for common token features.

    Attributes
    ----------
    stem : str
        Stem of the token.
    shape : str
        Shape of the token, represented by X, x, d characters.
    is_capitalised : bool
        True if the token starts with a capital letter, else False.
    is_unit : bool
        True if the token is in the list of units, else False.
    is_punc : bool
        True if the token is a punctuation character, else False.
    is_ambiguous_unit : bool
        True if the token is in the list of ambiguous units, else False.
    """

    stem: str
    shape: str
    is_capitalised: bool
    is_unit: bool
    is_punc: bool
    is_ambiguous_unit: bool




[docs]
@dataclass
class Token:
    """Dataclass representing a token from an ingredient sentence.

    Attributes
    ----------
    index : int
        Index of the token in the sentence.
    text : str
        Token text.
    feat_text : str
        Token text used for feature generation.
    pos_tag : str
        Part of speech tag for the token.
    features : TokenFeatures
        Common features for the token.
    """

    index: int
    text: str
    feat_text: str
    pos_tag: str
    features: TokenFeatures




[docs]
@dataclass
class LabelledToken:
    """Dataclass representing a labelled token from an ingredient sentence.

    Attributes
    ----------
    index : int
        Index of the token in the sentence.
    text : str
        Token text.
    pos_tag : str
        Part of speech tag for the token.
    label : str
        Label assigned to the token.
    score : float
        Confidence of the assigned label, between 0 and 1.
    plural : bool
        True if the token is plural.
    """

    index: int
    text: str
    pos_tag: str
    label: str
    score: float
    plural: bool




[docs]
@dataclass
class IngredientAmount:
    """Dataclass for holding a parsed ingredient amount.

    On instantiation, the unit system is determined from the unit and stored in
    the unit_system field.

    Attributes
    ----------
    quantity : Fraction | str
        Parsed ingredient quantity, as a Fraction where possible, otherwise a string.
        If the amount is a range, this is the lower limit of the range.
    quantity_max : Fraction | str
        If the amount is a range, this is the upper limit of the range.
        Otherwise, this is the same as the quantity field.
        This must be supplied on instantiation; it is not derived from quantity
        by this class.
    unit : str | pint.Unit
        Unit of parsed ingredient quantity.
        If the quantity is recognised in the pint unit registry, a pint.Unit
        object is used.
    text : str
        String describing the amount e.g. "1 cup", "8 oz"
    confidence : float
        Confidence of parsed ingredient amount, between 0 and 1.
        This is the average confidence of all tokens that contribute to this object.
    starting_index : int
        Index of token in sentence that starts this amount
    unit_system : UnitSystem
        Unit system (e.g. metric) that the unit of the amount belongs to.
        This is not an init argument; it is set automatically from the unit.
    APPROXIMATE : bool, optional
        When True, indicates that the amount is approximate.
        Default is False.
    SINGULAR : bool, optional
        When True, indicates if the amount refers to a singular item of the ingredient.
        Default is False.
    RANGE : bool, optional
        When True, indicates the amount is a range e.g. 1-2.
        Default is False.
    MULTIPLIER : bool, optional
        When True, indicates the amount is a multiplier e.g. 1x, 2x.
        Default is False.
    PREPARED_INGREDIENT : bool, optional
        When True, indicates the amount applies to the prepared ingredient.
        When False, indicates the amount applies to the ingredient before preparation.
        Default is False.
    """

    quantity: Fraction | str
    quantity_max: Fraction | str
    unit: str | pint.Unit
    text: str
    confidence: float
    starting_index: int
    unit_system: UnitSystem = field(init=False)
    APPROXIMATE: bool = False
    SINGULAR: bool = False
    RANGE: bool = False
    MULTIPLIER: bool = False
    PREPARED_INGREDIENT: bool = False

    def __post_init__(self):
        """On dataclass instantiation, set the unit_system field from the unit."""
        self.unit_system = self._determine_unit_system()

    def _copy(self):
        """Return deepcopy of current object.

        Returns
        -------
        Self
            Deep copy of self.
        """
        return copy.deepcopy(self)

    def convert_to(self, unit: str, density: pint.Quantity = 1000 * UREG("kg/m^3")):
        """Convert units of IngredientAmount object to given unit.

        Conversion is only possible if none of the quantity, quantity_max and unit are
        strings.

        Conversion between mass and volume is supported using the density parameter, but
        otherwise a DimensionalityError is raised if attempting to convert units of
        different dimensionality.

        The text field of the returned object is regenerated from the converted
        values. If quantity and quantity_max differ (i.e. the amount is a range),
        both limits are included in the text, e.g. "100-200 g".

        .. warning::

            When a conversion between mass <-> volume is performed, the quantities will
            be converted to floats.

        Parameters
        ----------
        unit : str
            Unit to convert to.
        density : pint.Quantity, optional
            Density used for conversion between volume and mass.
            Default is the density of water.

        Returns
        -------
        Self
            Copy of IngredientAmount object with units converted to given unit.

        Raises
        ------
        TypeError
            Raised if unit, quantity or quantity_max are str
        """
        if (
            isinstance(self.unit, str)
            or isinstance(self.quantity, str)
            or isinstance(self.quantity_max, str)
        ):
            raise TypeError("Cannot convert where quantity or unit is a string.")

        q: pint.Quantity = self.quantity * self.unit  # type: ignore
        q_max: pint.Quantity = self.quantity_max * self.unit  # type: ignore

        # Apply density context for conversion.
        # This is only relevant if converting between mass <-> volume.
        with UREG.context("density", p=density):
            q_converted = q.to(unit)  # type: ignore
            q_max_converted = q_max.to(unit)  # type: ignore

        # Work on a copy so the original amount is left untouched.
        converted_amount = self._copy()
        converted_amount.quantity = q_converted.magnitude
        converted_amount.quantity_max = q_max_converted.magnitude
        converted_amount.unit = q_converted.units  # type: ignore
        converted_amount.unit_system = converted_amount._determine_unit_system()

        # Fraction objects don't support float-style formatting until Python 3.12, so
        # the magnitudes are formatted as floats instead of using f"{q_converted:P}".
        magnitude_text = f"{float(q_converted.magnitude):g}"
        if self.quantity != self.quantity_max:
            # The amount is a range, so keep the upper limit in the text rather than
            # silently describing only the lower limit.
            magnitude_text += f"-{float(q_max_converted.magnitude):g}"
        converted_amount.text = f"{magnitude_text} {q_converted.units:P}"

        return converted_amount

    def _determine_unit_system(self) -> UnitSystem:
        """Determine the unit system (e.g. metric, imperial) for the amount.

        An empty unit gives UnitSystem.NONE. Otherwise the unit is split on
        whitespace and the first part that is a recognised unit name decides the
        result. Metric mass/volume names always give UnitSystem.METRIC. Names shared
        between several systems (cup, pint, tablespoon, ...) are disambiguated by an
        "imperial_", "metric_", "aus_" or "jp_" marker in the unit, defaulting to
        UnitSystem.US_CUSTOMARY when no marker is present. If no part is recognised,
        UnitSystem.OTHER is returned.

        Returns
        -------
        UnitSystem
            Enum specifying the unit system in use.
        """
        if self.unit == "":
            return UnitSystem.NONE

        # If unit is a pint.Unit, convert to string
        str_unit = str(self.unit) if isinstance(self.unit, pint.Unit) else self.unit

        # Markers that identify which system a shared unit name belongs to, in
        # priority order.
        prefixed_systems = (
            ("imperial_", UnitSystem.IMPERIAL),
            ("metric_", UnitSystem.METRIC),
            ("aus_", UnitSystem.AUSTRALIAN),
            ("jp_", UnitSystem.JAPANESE),
        )
        # Detect the marker on the unmodified unit string; without a marker the
        # shared unit names are treated as US customary.
        shared_name_system = next(
            (system for prefix, system in prefixed_systems if prefix in str_unit),
            UnitSystem.US_CUSTOMARY,
        )
        # Remove the identifying text from the unit to make the check below simpler.
        for prefix, _ in prefixed_systems:
            str_unit = str_unit.replace(prefix, "")

        metric_names = {
            "g",
            "gram",
            "kg",
            "kilogram",
            "l",
            "liter",
            "litre",
            "ml",
            "milliliter",
            "millilitre",
        }
        shared_names = {
            "lb",
            "pound",
            "oz",
            "ounce",
            "fluid_ounce",
            "st",
            "stone",
            "c",
            "cup",
            "tsp",
            "teaspoon",
            "tbsp",
            "tablespoon",
            "pt",
            "pint",
            "in",
            "inch",
        }

        # Split unit on spaces for cases like "large clove", "oz can"
        for part in str_unit.split():
            name = part.lower()
            if name in metric_names:
                return UnitSystem.METRIC
            if name in shared_names:
                return shared_name_system

        return UnitSystem.OTHER




[docs]
@dataclass
class CompositeIngredientAmount:
    """Dataclass for an amount built from several IngredientAmount objects.

    Examples of composite amounts are "1 lb 2 oz" or "1 cup plus 1 tablespoon".

    Attributes
    ----------
    amounts : list[IngredientAmount]
        IngredientAmount objects that make up the composite amount, in the order
        they appear in the sentence.
    join : str
        Text that joins the amounts, e.g. "plus".
    subtractive : bool
        If True, the amounts combine subtractively. If False, the amounts combine
        additively.
    text : str
        Composite amount as a string, generated automatically from the amounts and
        join attributes.
    confidence : float
        Confidence of the composite amount, between 0 and 1.
        This is the mean of the confidence values of the amounts.
    starting_index : int
        Index of token in sentence that starts this amount.
        This is the smallest starting_index of the amounts.
    unit_system : UnitSystem
        Unit system (e.g. metric) that the units of the amounts belong to.
    """

    amounts: list[IngredientAmount]
    join: str
    subtractive: bool
    text: str = field(init=False)
    confidence: float = field(init=False)
    starting_index: int = field(init=False)
    unit_system: UnitSystem = field(init=False)

    def __post_init__(self):
        """On dataclass instantiation, generate the fields not set by the caller."""
        # An empty join means the amounts are simply separated by a space.
        separator = f"{self.join}" if self.join != "" else " "
        self.text = separator.join(part.text for part in self.amounts)

        # The composite amount starts where its earliest amount starts.
        self.starting_index = min(part.starting_index for part in self.amounts)

        # The composite confidence is the average over its amounts.
        self.confidence = mean(part.confidence for part in self.amounts)

        # Determine unit system from amounts.
        systems = {part.unit_system for part in self.amounts}
        if len(systems) <= 1:
            # All amounts agree, so use their common unit system.
            self.unit_system = systems.pop()
        elif UnitSystem.AUSTRALIAN in systems:
            # Australian units system has different pints and tbsp.
            # In a CompositeIngredientAmount these can be paired with another
            # metric amount.
            self.unit_system = UnitSystem.AUSTRALIAN
        elif UnitSystem.JAPANESE in systems:
            # Japanese units system has different cups.
            # In a CompositeIngredientAmount these can be paired with another
            # metric amount.
            self.unit_system = UnitSystem.JAPANESE
        else:
            self.unit_system = UnitSystem.OTHER

    def combined(self) -> pint.Quantity:
        """Return the combined amount in a single unit for the composite amount.

        The amounts are folded together left to right, using subtraction if the
        composite amount is subtractive and addition otherwise.

        Returns
        -------
        pint.Quantity
            Combined amount.

        Raises
        ------
        TypeError
            Raised if any of the amounts does not comprise a Fraction quantity
            and a pint.Unit unit. In these cases, the amounts cannot be combined.
        """
        # Check amounts are compatible for combination
        for part in self.amounts:
            if isinstance(part.quantity, Fraction) and isinstance(
                part.unit, pint.Unit
            ):
                continue

            q_type = type(part.quantity).__name__
            u_type = type(part.unit).__name__
            raise TypeError(
                f"Incompatible quantity <{q_type}> "
                f"and unit <{u_type}> for combining."
            )

        combine = operator.sub if self.subtractive else operator.add
        quantities = (
            part.quantity * part.unit for part in self.amounts  # type: ignore
        )
        return reduce(combine, quantities)

    def convert_to(
        self, unit: str, density: pint.Quantity = 1000 * UREG("kg/m^3")
    ) -> pint.Quantity:
        """Convert the combined CompositeIngredientAmount to the given unit.

        The amounts are first combined, which is only possible if every amount has
        a Fraction quantity and a pint.Unit unit.

        Conversion between mass and volume is supported using the density parameter, but
        otherwise a DimensionalityError is raised if attempting to convert units of
        different dimensionality.

        .. warning::

            When a conversion between mass <-> volume is performed, the quantities will
            be converted to floats.

        Parameters
        ----------
        unit : str
            Unit to convert to.
        density : pint.Quantity, optional
            Density used for conversion between volume and mass.
            Default is the density of water.

        Returns
        -------
        pint.Quantity
            Combined amount converted to given units.

        Raises
        ------
        TypeError
            Raised if the amounts cannot be combined.
        """
        # Apply density context for conversion.
        # This is only relevant if converting between mass <-> volume.
        with UREG.context("density", p=density):
            total = self.combined()
            return total.to(unit)





[docs]
@dataclass
class IngredientText:
    """Dataclass for holding a parsed ingredient string.

    Attributes
    ----------
    text : str
        Parsed text from ingredient.
        This comprises all tokens with the same label.
    confidence : float
        Confidence of parsed ingredient text, between 0 and 1.
        This is the average confidence of all tokens that contribute to this object.
    starting_index : int
        Index of the token in the sentence that starts this text.
    """

    text: str
    confidence: float
    starting_index: int




[docs]
@dataclass
class FoundationFood:
    """Dataclass for the attributes of an entry in the Food Data Central database.

    Two FoundationFood objects are equal, and hash the same, when they have the
    same fdc_id; the other fields are not taken into account.

    Attributes
    ----------
    text : str
        Description of the FDC database entry.
    confidence : float
        Confidence of the match, between 0 and 1.
    fdc_id : int
        ID of the FDC database entry.
    category : str
        Category of the FDC database entry.
    data_type : str
        Food Data Central data set the entry belongs to.
    url : str
        URL for the FDC database entry.
        This is not an init argument; it is generated from fdc_id.
    name_index : int
        Index of associated name in ParsedIngredient.name list.
    """

    text: str
    confidence: float
    fdc_id: int
    category: str
    data_type: str
    url: str = field(init=False)
    name_index: int

    def __post_init__(self):
        """On dataclass instantiation, generate the url field from fdc_id."""
        self.url = f"https://fdc.nal.usda.gov/food-details/{self.fdc_id}/nutrients"

    def __eq__(self, other):
        """Compare by fdc_id; anything that is not a FoundationFood is unequal."""
        if not isinstance(other, FoundationFood):
            return False

        return self.fdc_id == other.fdc_id

    def __hash__(self):
        """Hash on fdc_id only, consistent with __eq__."""
        return hash(self.fdc_id)




[docs]
@dataclass
class ParsedIngredient:
    """Dataclass for holding the parsed values for an input sentence.

    Attributes
    ----------
    name : list[IngredientText]
        List of IngredientText objects, each representing an ingredient name parsed
        from the input sentence.
        If no ingredient names are found, this is an empty list.
    size : IngredientText | None
        Size modifier of ingredients, such as small or large.
        If no size modifier, this is None.
    amount : list[IngredientAmount | CompositeIngredientAmount]
        List of IngredientAmount or CompositeIngredientAmount objects, each
        representing an amount parsed from the sentence.
        If no ingredient amounts are found, this is an empty list.
    preparation : IngredientText | None
        Ingredient preparation instructions parsed from sentence.
        If no ingredient preparation instruction was found, this is None.
    comment : IngredientText | None
        Ingredient comment parsed from input sentence.
        If no ingredient comment was found, this is None.
    purpose : IngredientText | None
        The purpose of the ingredient parsed from the sentence.
        If no purpose was found, this is None.
    foundation_foods : list[FoundationFood]
        List of foundation foods from the parsed sentence.
    sentence : str
        Normalised input sentence
    """

    name: list[IngredientText]
    size: IngredientText | None
    amount: list[IngredientAmount | CompositeIngredientAmount]
    preparation: IngredientText | None
    comment: IngredientText | None
    purpose: IngredientText | None
    foundation_foods: list[FoundationFood]
    sentence: str

    def __post_init__(self):
        """Set PREPARED_INGREDIENT flag for amounts.

        Nothing is done if there are no names or no preparation instructions.

        The flag is set if:
         * the amount is before the preparation instructions AND
         * the preparation instructions are before the name(s)
        e.g. 100 g sifted flour

        OR
         * the preparation instruction is after the name(s) AND
         * the amount is after the preparation instruction
        e.g. Onion, thinly sliced (about 1 cup)

        For a CompositeIngredientAmount, the flag is set on each of the amounts it
        is made up of.

        Assumes that any preparation text appear entirely before or entirely after all
        names.
        """
        if not self.name or not self.preparation:
            return

        name_indices = [name.starting_index for name in self.name]
        first_name_index = min(name_indices)
        last_name_index = max(name_indices)
        preparation_index = self.preparation.starting_index

        for amount in self.amount:
            # e.g. 100 g sifted flour
            amount_then_prep_then_name = (
                amount.starting_index < preparation_index < first_name_index
            )
            # e.g. Onion, thinly sliced (about 1 cup)
            name_then_prep_then_amount = (
                last_name_index < preparation_index < amount.starting_index
            )
            if not (amount_then_prep_then_name or name_then_prep_then_amount):
                continue

            # A composite amount is flagged through its constituent amounts.
            if isinstance(amount, CompositeIngredientAmount):
                flagged = amount.amounts
            else:
                flagged = [amount]

            for item in flagged:
                item.PREPARED_INGREDIENT = True




[docs]
@dataclass
class ParserDebugInfo:
    """Dataclass for holding intermediate objects generated during parsing.

    Attributes
    ----------
    sentence : str
        Input ingredient sentence.
    PreProcessor : PreProcessor
        PreProcessor object created using input sentence.
        Annotated as Any in this module.
    PostProcessor : PostProcessor
        PostProcessor object created using tokens, labels and scores from
        input sentence.
        Annotated as Any in this module.
    tagger : NumpyCRFInference
        CRF model tagger object.
    """

    sentence: str
    PreProcessor: Any
    PostProcessor: Any
    tagger: NumpyCRFInference