#!/usr/bin/env python3
import copy
import enum
import operator
from dataclasses import dataclass, field
from fractions import Fraction
from functools import reduce
from statistics import mean
from typing import Any
import pint
from ._common import UREG
from .inference import NumpyCRFInference
[docs]
class UnitSystem(enum.StrEnum):
"""Enum defining unit systems"""
METRIC = enum.auto()
US_CUSTOMARY = enum.auto()
IMPERIAL = enum.auto()
AUSTRALIAN = enum.auto()
JAPANESE = enum.auto()
OTHER = enum.auto()
NONE = enum.auto()
[docs]
@dataclass
class TokenFeatures:
    """Dataclass holding the common features of a token.

    Attributes
    ----------
    stem : str
        Stem of the token.
    shape : str
        Shape of the token, represented by X, x, d characters.
    is_capitalised : bool
        True if the token starts with a capital letter, else False.
    is_unit : bool
        True if the token is in the list of units, else False.
    is_punc : bool
        True if the token is a punctuation character, else False.
    is_ambiguous_unit : bool
        True if the token is in the list of ambiguous units, else False.
    """

    stem: str
    shape: str
    is_capitalised: bool
    is_unit: bool
    is_punc: bool
    is_ambiguous_unit: bool
[docs]
@dataclass
class Token:
    """Dataclass representing a single token of an ingredient sentence.

    Attributes
    ----------
    index : int
        Index of the token in the sentence.
    text : str
        Token text.
    feat_text : str
        Token text used for feature generation.
    pos_tag : str
        Part of speech tag for the token.
    features : TokenFeatures
        Common features for the token.
    """

    index: int
    text: str
    feat_text: str
    pos_tag: str
    features: TokenFeatures
[docs]
@dataclass
class LabelledToken:
    """Dataclass representing a labelled token of an ingredient sentence.

    Attributes
    ----------
    index : int
        Index of the token in the sentence.
    text : str
        Token text.
    pos_tag : str
        Part of speech tag for the token.
    label : str
        Label assigned to the token.
    score : float
        Confidence of the assigned label, between 0 and 1.
    plural : bool
        True if the token is plural.
    """

    index: int
    text: str
    pos_tag: str
    label: str
    score: float
    plural: bool
[docs]
@dataclass
class IngredientAmount:
    """Dataclass for holding a parsed ingredient amount.

    On instantiation, the ``unit_system`` field is derived from ``unit``.

    Attributes
    ----------
    quantity : Fraction | str
        Parsed ingredient quantity, as a Fraction where possible, otherwise a
        string. If the amount is a range, this is the lower limit of the range.
    quantity_max : Fraction | str
        If the amount is a range, this is the upper limit of the range.
        Otherwise, this is the same as the quantity field.
    unit : str | pint.Unit
        Unit of the parsed ingredient quantity.
        If the unit is recognised in the pint unit registry, a pint.Unit
        object is used.
    text : str
        String describing the amount e.g. "1 cup", "8 oz".
    confidence : float
        Confidence of the parsed ingredient amount, between 0 and 1.
        This is the average confidence of all tokens that contribute to this
        object.
    starting_index : int
        Index of the token in the sentence that starts this amount.
    unit_system : UnitSystem
        Unit system (e.g. metric) that the unit of the amount belongs to.
        Not an ``__init__`` argument: it is computed in ``__post_init__``.
    APPROXIMATE : bool, optional
        When True, indicates that the amount is approximate.
        Default is False.
    SINGULAR : bool, optional
        When True, indicates the amount refers to a singular item of the
        ingredient.
        Default is False.
    RANGE : bool, optional
        When True, indicates the amount is a range e.g. 1-2.
        Default is False.
    MULTIPLIER : bool, optional
        When True, indicates the amount is a multiplier e.g. 1x, 2x.
        Default is False.
    PREPARED_INGREDIENT : bool, optional
        When True, indicates the amount applies to the prepared ingredient.
        When False, indicates the amount applies to the ingredient before
        preparation.
        Default is False.
    """

    quantity: Fraction | str
    quantity_max: Fraction | str
    unit: str | pint.Unit
    text: str
    confidence: float
    starting_index: int
    unit_system: UnitSystem = field(init=False)
    APPROXIMATE: bool = False
    SINGULAR: bool = False
    RANGE: bool = False
    MULTIPLIER: bool = False
    PREPARED_INGREDIENT: bool = False

    def __post_init__(self):
        """Derive the ``unit_system`` field from the unit."""
        self.unit_system = self._determine_unit_system()

    def _copy(self):
        """Return an independent deep copy of this object.

        Returns
        -------
        Self
            Deep copy of self.
        """
        return copy.deepcopy(self)

    def convert_to(self, unit: str, density: pint.Quantity = 1000 * UREG("kg/m^3")):
        """Return a copy of this amount expressed in the given unit.

        Conversion is only possible if none of the quantity, quantity_max and
        unit are strings.

        Conversion between mass and volume is supported using the density
        parameter, but otherwise a DimensionalityError is raised if attempting
        to convert units of different dimensionality.

        The ``unit_system`` and ``text`` fields of the returned copy are
        regenerated from the converted values; ``text`` is built from the
        converted ``quantity`` and unit only.

        .. warning::

            When a conversion between mass <-> volume is performed, the
            quantities will be converted to floats.

        Parameters
        ----------
        unit : str
            Unit to convert to.
        density : pint.Quantity, optional
            Density used for conversion between volume and mass.
            Default is the density of water.

        Returns
        -------
        Self
            Copy of IngredientAmount object with units converted to given unit.

        Raises
        ------
        TypeError
            Raised if unit, quantity or quantity_max are str
        """
        if any(
            isinstance(value, str)
            for value in (self.unit, self.quantity, self.quantity_max)
        ):
            raise TypeError("Cannot convert where quantity or unit is a string.")

        lower: pint.Quantity = self.quantity * self.unit  # type: ignore
        upper: pint.Quantity = self.quantity_max * self.unit  # type: ignore

        # The density context only matters for mass <-> volume conversions.
        with UREG.context("density", p=density):
            lower_converted = lower.to(unit)  # type: ignore
            upper_converted = upper.to(unit)  # type: ignore

        result = self._copy()
        result.quantity = lower_converted.magnitude
        result.quantity_max = upper_converted.magnitude
        result.unit = lower_converted.units  # type: ignore
        result.unit_system = result._determine_unit_system()
        # Fraction objects don't support float-style formatting until Python
        # 3.12, so the magnitude is cast to float instead of formatting the
        # quantity directly with f"{lower_converted:P}".
        magnitude_text = f"{float(lower_converted.magnitude):g} "
        result.text = magnitude_text + f"{lower_converted.units:P}"
        return result

    def _determine_unit_system(self) -> UnitSystem:
        """Determine the unit system (e.g. metric, imperial) for the amount.

        Returns
        -------
        UnitSystem
            Enum specifying the unit system in use.
            ``UnitSystem.NONE`` if the unit compares equal to the empty string
            and ``UnitSystem.OTHER`` if no word of the unit is a known unit
            name.
        """
        if self.unit == "":
            return UnitSystem.NONE

        # Work on the string form of the unit, whether or not it is a pint.Unit.
        unit_text = str(self.unit) if isinstance(self.unit, pint.Unit) else self.unit

        # Markers identifying a particular volumetric unit system, listed in the
        # order of precedence used when more than one is present.
        system_markers = (
            ("imperial_", UnitSystem.IMPERIAL),
            ("metric_", UnitSystem.METRIC),
            ("aus_", UnitSystem.AUSTRALIAN),
            ("jp_", UnitSystem.JAPANESE),
        )
        # Detect every marker against the unmodified text before removing any
        # of them, then strip the markers to simplify the name lookup below.
        flagged_systems = [
            system for marker, system in system_markers if marker in unit_text
        ]
        for marker, _ in system_markers:
            unit_text = unit_text.replace(marker, "")

        metric_names = {
            "g",
            "gram",
            "kg",
            "kilogram",
            "l",
            "liter",
            "litre",
            "ml",
            "milliliter",
            "millilitre",
        }
        # Names shared by several systems; the marker (if any) disambiguates.
        regional_names = {
            "lb",
            "pound",
            "oz",
            "ounce",
            "fluid_ounce",
            "st",
            "stone",
            "c",
            "cup",
            "tsp",
            "teaspoon",
            "tbsp",
            "tablespoon",
            "pt",
            "pint",
            "in",
            "inch",
        }

        # Split unit on spaces for cases like "large clove", "oz can".
        for word in unit_text.split():
            name = word.lower()
            if name in metric_names:
                return UnitSystem.METRIC
            if name in regional_names:
                # Without a marker the unit is treated as US customary.
                return flagged_systems[0] if flagged_systems else UnitSystem.US_CUSTOMARY

        return UnitSystem.OTHER
[docs]
@dataclass
class CompositeIngredientAmount:
    """Dataclass for a composite ingredient amount.

    This is an amount comprising more than one IngredientAmount object
    e.g. "1 lb 2 oz" or "1 cup plus 1 tablespoon".

    Attributes
    ----------
    amounts : list[IngredientAmount]
        List of IngredientAmount objects that make up the composite amount. The
        order in this list is the order they appear in the sentence.
    join : str
        String of text that joins the amounts, e.g. "plus".
    subtractive : bool
        If True, the amounts combine subtractively. If False, the amounts
        combine additively.
    text : str
        Composite amount as a string, automatically generated from the amounts
        and join attributes.
    confidence : float
        Confidence of parsed ingredient amount, between 0 and 1.
        This is the mean of the confidence values of the amounts that make up
        this object.
    starting_index : int
        Index of token in sentence that starts this amount.
    unit_system : UnitSystem
        Unit system (e.g. metric) that the unit of the amount belongs to.
    """

    amounts: list[IngredientAmount]
    join: str
    subtractive: bool
    text: str = field(init=False)
    confidence: float = field(init=False)
    starting_index: int = field(init=False)
    unit_system: UnitSystem = field(init=False)

    def __post_init__(self):
        """Generate the text, starting_index, confidence and unit_system fields."""
        # An empty join string means the amount texts are separated by a space.
        separator = " " if self.join == "" else f"{self.join}"
        self.text = separator.join([part.text for part in self.amounts])

        # The composite amount starts where its earliest amount starts.
        self.starting_index = min(part.starting_index for part in self.amounts)

        # The confidence is the average over the constituent amounts.
        self.confidence = mean(part.confidence for part in self.amounts)

        # Determine the unit system from the constituent amounts.
        systems = {part.unit_system for part in self.amounts}
        if len(systems) <= 1:
            self.unit_system = systems.pop()
        elif UnitSystem.AUSTRALIAN in systems:
            # Australian units system has different pints and tbsp.
            # In a CompositeIngredientAmount these can be paired with another
            # metric amount.
            self.unit_system = UnitSystem.AUSTRALIAN
        elif UnitSystem.JAPANESE in systems:
            # Japanese units system has different cups.
            # In a CompositeIngredientAmount these can be paired with another
            # metric amount.
            self.unit_system = UnitSystem.JAPANESE
        else:
            self.unit_system = UnitSystem.OTHER

    def combined(self) -> pint.Quantity:
        """Return the combined amount in a single unit for the composite amount.

        The amounts that comprise the composite amount are folded left to right
        with subtraction if the composite amount is subtractive, otherwise with
        addition.

        Returns
        -------
        pint.Quantity
            Combined amount.

        Raises
        ------
        TypeError
            Raised if any of the amounts in the object do not comprise a
            Fraction quantity and a pint.Unit unit. In these cases, the amounts
            cannot be combined.
        """
        # Check every amount is compatible before attempting to combine any.
        for part in self.amounts:
            if isinstance(part.quantity, Fraction) and isinstance(
                part.unit, pint.Unit
            ):
                continue

            q_type = type(part.quantity).__name__
            u_type = type(part.unit).__name__
            raise TypeError(
                f"Incompatible quantity <{q_type}> "
                f"and unit <{u_type}> for combining."
            )

        combine = operator.sub if self.subtractive else operator.add
        return reduce(
            combine,
            (part.quantity * part.unit for part in self.amounts),  # type: ignore
        )

    def convert_to(
        self, unit: str, density: pint.Quantity = 1000 * UREG("kg/m^3")
    ) -> pint.Quantity:
        """Convert the combined CompositeIngredientAmount to the given unit.

        The amounts are first combined using ``combined()``, so conversion is
        only possible if that method does not raise.

        Conversion between mass and volume is supported using the density
        parameter, but otherwise a DimensionalityError is raised if attempting
        to convert units of different dimensionality.

        .. warning::

            When a conversion between mass <-> volume is performed, the
            quantities will be converted to floats.

        Parameters
        ----------
        unit : str
            Unit to convert to.
        density : pint.Quantity, optional
            Density used for conversion between volume and mass.
            Default is the density of water.

        Returns
        -------
        pint.Quantity
            Combined amount converted to given units.
        """
        # The density context only matters for mass <-> volume conversions.
        with UREG.context("density", p=density):
            total = self.combined()
            return total.to(unit)
[docs]
@dataclass
class IngredientText:
    """Dataclass for holding a parsed ingredient string.

    Attributes
    ----------
    text : str
        Parsed text from the ingredient sentence.
        This is comprised of all tokens with the same label.
    confidence : float
        Confidence of the parsed ingredient text, between 0 and 1.
        This is the average confidence of all tokens that contribute to this
        object.
    starting_index : int
        Index of the token in the sentence that starts this text.
    """

    text: str
    confidence: float
    starting_index: int
[docs]
@dataclass
class FoundationFood:
    """Dataclass for the attributes of an entry in the Food Data Central database.

    Two FoundationFood objects compare equal, and hash identically, when they
    have the same ``fdc_id``; the other fields are not considered.

    Attributes
    ----------
    text : str
        Description of the FDC database entry.
    confidence : float
        Confidence of the match, between 0 and 1.
    fdc_id : int
        ID of the FDC database entry.
    category : str
        Category of the FDC database entry.
    data_type : str
        Food Data Central data set the entry belongs to.
    url : str
        URL for the FDC database entry.
        Not an ``__init__`` argument: it is generated from ``fdc_id``.
    name_index : int
        Index of associated name in ParsedIngredient.name list.
    """

    text: str
    confidence: float
    fdc_id: int
    category: str
    data_type: str
    url: str = field(init=False)
    name_index: int

    def __post_init__(self):
        """Generate the url field from the FDC ID."""
        self.url = f"https://fdc.nal.usda.gov/food-details/{self.fdc_id}/nutrients"

    def __eq__(self, other):
        """Compare by ``fdc_id`` with another FoundationFood.

        ``NotImplemented`` is returned for any other type so that Python can
        try the reflected comparison before falling back to "not equal".
        """
        if not isinstance(other, FoundationFood):
            return NotImplemented
        return self.fdc_id == other.fdc_id

    def __hash__(self):
        """Hash on ``fdc_id`` only, consistent with ``__eq__``."""
        return hash(self.fdc_id)
[docs]
@dataclass
class ParsedIngredient:
    """Dataclass for holding the parsed values for an input sentence.

    Attributes
    ----------
    name : list[IngredientText]
        List of IngredientText objects, each representing an ingredient name
        parsed from the input sentence.
        If no ingredient names are found, this is an empty list.
    size : IngredientText | None
        Size modifier of ingredients, such as small or large.
        If no size modifier, this is None.
    amount : list[IngredientAmount | CompositeIngredientAmount]
        List of IngredientAmount objects, each representing a matching quantity
        and unit pair parsed from the sentence.
        If no ingredient amounts are found, this is an empty list.
    preparation : IngredientText | None
        Ingredient preparation instructions parsed from the sentence.
        If no ingredient preparation instruction was found, this is None.
    comment : IngredientText | None
        Ingredient comment parsed from the input sentence.
        If no ingredient comment was found, this is None.
    purpose : IngredientText | None
        The purpose of the ingredient parsed from the sentence.
        If no purpose was found, this is None.
    foundation_foods : list[FoundationFood]
        List of foundation foods from the parsed sentence.
    sentence : str
        Normalised input sentence.
    """

    name: list[IngredientText]
    size: IngredientText | None
    amount: list[IngredientAmount | CompositeIngredientAmount]
    preparation: IngredientText | None
    comment: IngredientText | None
    purpose: IngredientText | None
    foundation_foods: list[FoundationFood]
    sentence: str

    def __post_init__(self):
        """Set PREPARED_INGREDIENT flag for amounts.

        The flag is set if:

        * the amount is before the preparation instructions AND
        * the preparation instructions are before the name(s)

        e.g. 100 g sifted flour

        OR

        * the preparation instruction is after the name(s) AND
        * the amount is after the preparation instruction

        e.g. Onion, thinly sliced (about 1 cup)

        For a CompositeIngredientAmount the flag is set on each of the amounts
        it contains. Nothing is done if there are no names or no preparation.

        Assumes that any preparation text appears entirely before or entirely
        after all names.
        """
        if not self.name or not self.preparation:
            return

        prep_start = self.preparation.starting_index
        name_starts = [name.starting_index for name in self.name]
        first_name_start = min(name_starts)
        last_name_start = max(name_starts)

        for amount in self.amount:
            # e.g. "100 g sifted flour": amount < preparation < first name
            amount_then_prep_then_name = (
                amount.starting_index < prep_start < first_name_start
            )
            # e.g. "Onion, thinly sliced (about 1 cup)":
            # last name < preparation < amount
            name_then_prep_then_amount = (
                last_name_start < prep_start < amount.starting_index
            )
            if not (amount_then_prep_then_name or name_then_prep_then_amount):
                continue

            # A composite amount carries the flag on its constituent amounts.
            if isinstance(amount, CompositeIngredientAmount):
                flagged = amount.amounts
            else:
                flagged = [amount]
            for item in flagged:
                item.PREPARED_INGREDIENT = True
[docs]
@dataclass
class ParserDebugInfo:
    """Dataclass for holding intermediate objects generated during parsing.

    Attributes
    ----------
    sentence : str
        Input ingredient sentence.
    PreProcessor : Any
        PreProcessor object created using the input sentence.
    PostProcessor : Any
        PostProcessor object created using tokens, labels and scores from the
        input sentence.
    tagger : NumpyCRFInference
        CRF model tagger object.
    """

    sentence: str
    PreProcessor: Any
    PostProcessor: Any
    tagger: NumpyCRFInference