#!/usr/bin/env python3
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from functools import cached_property
from itertools import chain, pairwise
from statistics import mean
from ingredient_parser.en.foundationfoods import match_foundation_foods
from .._common import consume, group_consecutive_idx
from ..dataclasses import (
CompositeIngredientAmount,
FoundationFood,
IngredientAmount,
IngredientText,
LabelledToken,
ParsedIngredient,
)
from ._constants import (
APPROXIMATE_PREFIXES,
APPROXIMATE_SUFFIXES,
INDEFINITE_QUANTIFIERS,
PREPARED_INGREDIENT_TOKENS,
SINGULAR_TOKENS,
STOP_WORDS,
STRING_NUMBERS_REGEXES,
)
from ._regex import FRACTION_TOKEN_PATTERN
from ._utils import (
combine_quantities_split_by_and,
ingredient_amount_factory,
pluralise_units,
replace_string_range,
)
# Module-level logger used for debug messages emitted during post-processing.
logger = logging.getLogger("ingredient-parser.postprocess")
# Pre-compiled pattern that matches a single word character (\w).
WORD_CHAR = re.compile(r"\w")
@dataclass
class _PartialIngredientAmount:
"""Dataclass for incrementally building ingredient amount information.
Attributes
----------
quantity : str
Parsed ingredient quantity
unit : list[str]
Unit or unit tokens of parsed ingredient quantity
confidence : list[float]
Average confidence of all tokens or list of confidences for each token of parsed
ingredient amount, between 0 and 1.
starting_index : int
Index of token in sentence that starts this amount
related_to_previous : bool, optional
If True, indicates it is related to the previous IngredientAmount object. All
related objects should have the same APPROXIMATE and SINGULAR flags
APPROXIMATE : bool, optional
When True, indicates that the amount is approximate.
Default is False.
SINGULAR : bool, optional
When True, indicates if the amount refers to a singular item of the ingredient.
Default is False.
PREPARED_INGREDIENT : bool, optional
When True, indicates the amount applies to the prepared ingredient.
When False, indicates the amount applies to the ingredient before preparation.
Default is False.
implicit_quantity : bool, optional
When True, indicates that the quantity is implicit rather than explicit. This
is used to keep track of implicit quantities so that they can be reverted if
we later encounter a plural unit when constructing the amount.
Default is False.
"""
quantity: str
unit: list[str]
confidence: list[float]
starting_index: int
related_to_previous: bool = False
APPROXIMATE: bool = False
SINGULAR: bool = False
PREPARED_INGREDIENT = False
implicit_quantity: bool = False
[docs]
class PostProcessor:
"""Recipe ingredient sentence PostProcessor class.
Performs the necessary postprocessing on the sentence tokens and labels and scores
for the tokens after tagging with the CRF model in order to return a coherent
structure of parsed information.
Attributes
----------
sentence : str
Original ingredient sentence.
labelled_tokens : list[LabelledToken],
List of labelled tokens for original ingredient sentence.
custom_units : dict[str, str]
Dict of custom units as plural: singular pairs.
separate_names : bool, optional
If True and the sentence contains multiple alternative ingredients, return an
IngredientText object for each ingredient name, otherwise return a single
IngredientText object.
Default is True.
discard_isolated_stop_words : bool, optional
If True, isolated stop words are discarded from the name, preparation or
comment fields. Default value is True.
string_units : bool, optional
If True, return all IngredientAmount units as strings.
If False, convert IngredientAmount units to pint.Unit objects where possible.
Default is False.
volumetric_units_system : str, optional
Name of the unit system applied to pint.Unit objects for the following
volumetric units: fluid ounce, cup, pint, quart, gallon.
Default is "us_customary".
This has no effect if string_units=True.
foundation_foods : bool, optional
If True, populate the foundation_foods field of ParsedIngredient.
Default is False, in which case the foundation_foods field is an empty list.
consumed : list[int]
List of indices of tokens consumed as part of postprocessing the tokens and
labels.
"""
def __init__(
self,
sentence: str,
labelled_tokens: list[LabelledToken],
custom_units: dict[str, str],
separate_names: bool = True,
discard_isolated_stop_words: bool = True,
string_units: bool = False,
volumetric_units_system: str = "us_customary",
foundation_foods: bool = False,
):
self.sentence = sentence
self.tokens = labelled_tokens
self.custom_units = custom_units
self.separate_names = separate_names
self.discard_isolated_stop_words = discard_isolated_stop_words
self.string_units = string_units
self.volumetric_units_system = volumetric_units_system
self.foundation_foods = foundation_foods
self.consumed = []
def __repr__(self) -> str:
"""__repr__ method.
Returns
-------
str
String representation of initialised object.
"""
return f'PostProcessor("{self.sentence}")'
def __str__(self) -> str:
"""__str__ method.
Returns
-------
str
Human readable string representation of object.
"""
tokens_labels = [(t.text, t.label) for t in self.tokens]
_str = [
"Post-processed recipe ingredient sentence",
f"\t{tokens_labels}",
]
return "\n".join(_str)
[docs]
@cached_property
def parsed(self) -> ParsedIngredient:
    """Return parsed ingredient data.

    Amounts are processed first because doing so records the indices of the
    tokens it uses in ``self.consumed``; the text fields processed afterwards
    skip consumed tokens.

    Returns
    -------
    ParsedIngredient
        Object containing structured data from sentence.
    """
    amounts = self._postprocess_amounts()

    foundationfoods = []
    if self.separate_names:
        name, foundationfoods = self._postprocess_names()
    else:
        # Collapse every label containing NAME (e.g. B_NAME_TOK, NAME_MOD) to
        # the single label "NAME" so the name is processed like any other field.
        for t in self.tokens:
            if "NAME" in t.label:
                t.label = "NAME"

        # Record the labels after relabelling. Previously this list was never
        # populated, so both self.labels and the debug message were always empty.
        name_replaced_labels = [t.label for t in self.tokens]
        self.labels = name_replaced_labels
        # The message must be a single string; passing a tuple of strings made
        # the logger print the tuple's repr.
        logger.debug(
            "Relabelled tokens to %s because separate_names=False.",
            name_replaced_labels,
        )

        # Process NAME labels as any other label, but return as a list
        name = []
        if processed_name := self._postprocess("NAME"):
            name = [processed_name]

            if self.foundation_foods:
                # Extract name tokens. We can only return a single foundation
                # food, but we still need to return a list.
                name_tokens = [t for t in self.tokens if t.label == "NAME"]
                texts = [t.text for t in name_tokens]
                pos_tags = [t.pos_tag for t in name_tokens]
                if ff := match_foundation_foods(texts, pos_tags, 0):
                    foundationfoods = [ff]

    size = self._postprocess("SIZE")
    preparation = self._postprocess("PREP")
    comment = self._postprocess("COMMENT")
    purpose = self._postprocess("PURPOSE")

    return ParsedIngredient(
        name=name,
        size=size,
        amount=amounts,
        preparation=preparation,
        comment=comment,
        purpose=purpose,
        foundation_foods=foundationfoods,
        sentence=self.sentence,
    )
def _postprocess(self, selected_label: str) -> IngredientText | None:
"""Process tokens, labels and scores with selected label into IngredientText.
Parameters
----------
selected_label : str
Label of tokens to postprocess.
Returns
-------
IngredientText
Object containing ingredient comment text and confidence.
"""
# Select indices of tokens, labels and scores for selected_label
# Do not include tokens, labels and scores in self.consumed
label_idx = [
i
for i, t in enumerate(self.tokens)
if t.label in [selected_label, "PUNC"] and i not in self.consumed
]
# If idx is empty or all the selected idx are PUNC, return None
if not label_idx or all(self.tokens[i].label == "PUNC" for i in label_idx):
return None
return self._postprocess_indices(label_idx, selected_label)
def _postprocess_names(self) -> tuple[list[IngredientText], list[FoundationFood]]:
"""Process tokens, labels and scores for the ingredient name(s).
This function handles multiple ingredient names e.g. "butter or olive oil",
determined by the labels provided for each token.
Where multiple alternative ingredients names are identified, each one is
returned in a separate IngredientText object.
Returns
-------
list[IngredientText], list[FoundationFoods]
List of IngredientText objects for names.
List of matching FoundationFood objects for names.
"""
name_idx = [
i
for i, t in enumerate(self.tokens)
if ("NAME" in t.label or t.label == "PUNC") and i not in self.consumed
]
# If idx is empty or all the selected idx are PUNC, return None
if not name_idx or all(self.tokens[i].label == "PUNC" for i in name_idx):
return [], []
name_labels = [self.tokens[i].label for i in name_idx]
bio_groups = self._group_name_labels(name_labels)
constructed_names = self._construct_names_from_bio_groups(bio_groups)
names, foundation_foods = self._convert_name_indices_to_object(
name_idx, constructed_names
)
return names, foundation_foods
def _merge(self, objs: list[IngredientText]) -> IngredientText:
    """Combine several IngredientText objects into one.

    The objects are ordered by starting index. Their texts are joined by a
    space, unless every text is identical, in which case that text is used once.
    The confidence is the mean of the individual confidences, rounded to 6
    decimal places, and the starting index is the lowest starting index.

    Parameters
    ----------
    objs : list[IngredientText]
        List of objects to merge.

    Returns
    -------
    IngredientText
        Merged IngredientText object.
    """
    ordered = sorted(objs, key=lambda obj: obj.starting_index)
    texts = [obj.text for obj in ordered]

    all_identical = len(set(texts)) == 1
    merged_text = texts[0] if all_identical else " ".join(texts)

    return IngredientText(
        text=merged_text,
        confidence=round(mean(obj.confidence for obj in ordered), 6),
        # The list is sorted by starting index, so the first entry is the lowest.
        starting_index=ordered[0].starting_index,
    )
def _group_name_labels(self, name_labels: list[str]) -> list[list[tuple[int, str]]]:
"""Group name labels according to name label type.
B_NAME_TOK and all following I_NAME_TOK up to the next label that is not
I_NAME_TOK or PUNC are grouped.
All consecutive NAME_MOD labels are grouped.
All consecutive NAME_VAR labels are grouped.
A NAME_SEP label starts a new group.
Parameters
----------
name_labels : list[str]
List of name labels.
Returns
-------
list[list[tuple[int, str]]]
List of BIO groups.
Each group is a list of tuples, where each tuple is the (index, label) of
the original name_labels list element.
"""
name_groups = []
current_group = []
prev_label = None
for idx, label in enumerate(name_labels):
# Start new group on NAME_SEP name label
if label == "NAME_SEP":
if current_group:
name_groups.append(current_group)
current_group = []
# Start new group for new "B_*" name label
elif label.startswith("B_"):
if current_group:
name_groups.append(current_group)
current_group = [(idx, label)]
# Start new group if encountering new NAME_MOD or NAME_VAR, or append to
# current group if previous label was the same as current label.
elif label in ["NAME_MOD", "NAME_VAR"]:
if prev_label == label:
current_group.append((idx, label))
else:
if current_group:
name_groups.append(current_group)
current_group = [(idx, label)]
# Must be an I_NAME_TOK or PUNC label, so append to current group
else:
current_group.append((idx, label))
prev_label = label
# Add last group to list if not empty
if current_group:
name_groups.append(current_group)
return name_groups
def _construct_names_from_bio_groups(
self, name_groups: list[list[tuple[int, str]]]
) -> list[list[int]]:
"""Construct names from BIO groups.
All VAR groups are prepended to the next TOK group.
MOD groups are prepended to all subsequent TOK groups or VAR+TOK groups.
To make this easier, iterate through the BIO groups from last to first. This
means we can easily keep track of which TOK group to prepend VAR and MOD
groups.
Parameters
----------
name_groups : list[list[tuple[int, str]]]
List of BIO groups.
Each group is a list of tuples, where each tuple if the (index, label) of
the original list element.
Returns
-------
list[list[int]]
List of name_label indices for each name.
"""
constructed_names = []
# Keep track the last TOK group we come across (moving from last to first).
# Also keep track of whether we have used it by prepending a VAR or MOD
# group.
last_encountered_name = None
last_encountered_name_used = False
# Iterate from last to first BIO group
for group in reversed(name_groups):
current_group_idx, labels = zip(*group)
current_label = self._get_name_group_label(labels)
if current_label == "TOK":
# If we've previously come across a TOK group and haven't used it,
# then store it.
if last_encountered_name and not last_encountered_name_used:
constructed_names.append(last_encountered_name)
# Set current group to last_encountered_name group.
last_encountered_name = current_group_idx
last_encountered_name_used = False
elif current_label == "VAR":
# Prepend this group to last encountered NAME group
if last_encountered_name:
constructed_names.append(current_group_idx + last_encountered_name)
last_encountered_name_used = True
else:
# If we are here, then we've come across a VAR group that does not
# precede a TOK group, so the model has made an error in it's
# labelling. Add this VAR group anyway.
constructed_names.append(current_group_idx)
elif current_label == "MOD":
# If we've previously come across a NAME group and haven't used it,
# then store it.
if last_encountered_name and not last_encountered_name_used:
constructed_names.append(last_encountered_name)
last_encountered_name_used = True
# Prepend this group to all constructed names so far
constructed_names = [
current_group_idx + name for name in constructed_names
]
# If we've iterated through all BIO groups and haven't used
# last_encountered_name, add it to constructed_names now.
if last_encountered_name and not last_encountered_name_used:
constructed_names.append(last_encountered_name)
# Return reversed list, so names are in the order they appear in sentence.
return list(reversed(constructed_names))
def _get_name_group_label(self, labels: tuple[str]) -> str:
"""Get the NAME label type for the labels in a name group.
One of TOK, VAR, MOD.
Parameters
----------
labels : tuple[str]
Tuple of labels for name group elements.
Returns
-------
str
Group label.
"""
for label in labels:
if label != "PUNC":
return label.split("_")[-1]
return ""
def _convert_name_indices_to_object(
self, name_idx: list[int], name_index_groups: list[list[int]]
) -> tuple[list[IngredientText], list[FoundationFood]]:
"""Convert grouped indices for name tokens into IngredientText objects.
If foundation foods are enabled, determine matching foundation food for each
name.
If an ingredient name ends with a token with POS tag of DT, IN or JJ, merge it
with the next name group, if there is one. This is to avoid cases in a sentence
like "5 fresh large basil leaves" where "large" is given the SIZE label,
resulting in two separate names: "fresh" and "basil leaves". Instead, we want to
return a single name: "fresh basil leaves".
Parameters
----------
name_idx : list[int]
List of indices of NAME tokens.
name_index_groups : list[list[int]]
List of groups of indices corresponding to ingredient names.
These indices refer to the name_idx list.
Returns
-------
tuple[list[IngredientText], list[FoundationFood]]
List of deduplicated IngredientText objects and FoundationFoods objects.
"""
# Keep track of IngredientText objects and indices to merge with next.
# We do the merge if the name ends with DT, IN, JJ part of speech tag.
merge_with_next = False
merge_with_next_idx: list[int] = []
# Merge name_idx group with next if it ends with DT, IN or JJ part of speech
# tag.
merged_name_idx = []
for group in name_index_groups:
# Convert from name_label indices to token indices
token_idx = [name_idx[idx] for idx in group]
if merge_with_next and merge_with_next_idx:
token_idx = [*merge_with_next_idx, *token_idx]
if self._last_non_punc_token_pos(token_idx) in {"DT", "IN", "JJ"}:
# Mark name for merging with next name.
merge_with_next = True
merge_with_next_idx = token_idx
# Skip to next iteration
continue
else:
merged_name_idx.append(token_idx)
merge_with_next = False
merge_with_next_idx = []
if merge_with_next and merge_with_next_idx:
# Catch any remaining name indices marked as needing to be merged
# but haven't been.
merged_name_idx.append(merge_with_next_idx)
# Build IngredientText objects, merging duplicate names where found.
names = []
foundation_foods = []
for token_idx in merged_name_idx:
ing_text = self._postprocess_indices(token_idx, "NAME")
if not ing_text:
continue
if ing_text.text in [n.text for n in names]:
dupe_idx = [i for i, n in enumerate(names) if n.text == ing_text.text]
merged = self._merge([*[names[i] for i in dupe_idx], ing_text])
names[dupe_idx[0]] = merged
else:
names.append(ing_text)
if self.foundation_foods:
# We don't match foundation foods for duplicate names because we
# will have already found any match for the first instance of the
# name.
tokens = [self.tokens[i].text for i in token_idx]
pos_tags = [self.tokens[i].pos_tag for i in token_idx]
if ff := match_foundation_foods(tokens, pos_tags, len(names) - 1):
foundation_foods.append(ff)
return names, foundation_foods
def _last_non_punc_token_pos(self, token_idx: list[int]) -> str:
"""Return the POS tag at the last index in token_idx.
Indices corresponding to punctuation are ignored.
Parameters
----------
token_idx : list[int]
List of token indices to find last non-punctuation POS tag.
Returns
-------
str
POS tag for last non-punctuation index.
"""
for idx in reversed(token_idx):
if self.tokens[idx].label == "PUNC":
continue
return self.tokens[idx].pos_tag
# Return empty string so we don't try to merge this with the next name.
# This should never occur because self._postprocess_indices is called before
# this and that function skips over any potential name that is all punctuation.
return ""
def _postprocess_indices(
    self, label_idx: list[int], selected_label: str
) -> IngredientText | None:
    """Turn a list of token indices into a single IngredientText object.

    The indices are split into runs of consecutive indices. Within each run,
    invalid punctuation is removed and the remaining tokens are joined with
    spaces. Runs consisting only of punctuation are skipped, as are runs that
    are a single stop word when discard_isolated_stop_words is set. The indices
    of every run that is kept are added to self.consumed.

    The kept runs are then joined with ", " (or with a space if selected_label
    is NAME) after dropping adjacent duplicates.

    Parameters
    ----------
    label_idx : list[int]
        List of indices of tokens to postprocess into IngredientText.
    selected_label : str
        Label of tokens being post processed.

    Returns
    -------
    IngredientText | None
        IngredientText object for selected tokens.
        If the post processing results in all tokens being ignored, return None.
    """

    def render(raw: str) -> str:
        # Fractions arrive in an intermediate form (i.e. #1$2); turn them back
        # into readable text. For fraction ranges, drop the space after the
        # hyphen that results from replacing # with a space.
        if not FRACTION_TOKEN_PATTERN.match(raw):
            return raw
        return raw.replace("#", " ").replace("$", "/").strip().replace("- ", "-")

    phrases = []
    scores = []
    first_index = label_idx[-1]

    for run in group_consecutive_idx(label_idx):
        run_idx = self._remove_invalid_indices(list(run))
        if all(self.tokens[i].label == "PUNC" for i in run_idx):
            # Nothing but punctuation left in this run.
            continue

        phrase = " ".join([render(self.tokens[i].text) for i in run_idx])
        score = mean([self.tokens[i].score for i in run_idx])

        if self.discard_isolated_stop_words and phrase.lower() in STOP_WORDS:
            # An isolated stop word carries no information.
            continue

        self.consumed.extend(run_idx)
        phrases.append(phrase)
        scores.append(score)
        first_index = min(first_index, run_idx[0])

    # Drop any phrase that is identical to the phrase that follows it.
    kept = self._remove_adjacent_duplicates(phrases)
    phrases = [phrases[i] for i in kept]
    scores = [scores[i] for i in kept]

    # Names read as a single phrase; other fields read as a list of phrases.
    separator = " " if selected_label == "NAME" else ", "
    text = self._fix_punctuation(separator.join(phrases))
    text = pluralise_units(text, self.custom_units)

    if not phrases:
        return None

    return IngredientText(
        text=text,
        confidence=round(mean(scores), 6),
        starting_index=first_index,
    )
def _postprocess_amounts(
    self,
) -> list[IngredientAmount | CompositeIngredientAmount]:
    """Build the amounts for the sentence from the labelled tokens.

    String-number quantities are converted first. The tokens are then run
    through a sequence of pattern matchers, in order:

    1. "sizeable unit" pattern
    2. "composite amounts" pattern
    3. fallback pattern

    Each matcher only receives the tokens not consumed by an earlier one.

    Returns
    -------
    list[IngredientAmount | CompositeIngredientAmount]
        List of IngredientAmount and CompositeIngredientAmount objects, ordered
        by starting index.
    """
    self._convert_string_number_qty()

    matchers = (
        self._sizeable_unit_pattern,
        self._composite_amounts_pattern,
        self._fallback_pattern,
    )

    collected = []
    for matcher in matchers:
        remaining_tokens = self._unconsumed(self.tokens)
        collected.extend(matcher(remaining_tokens))

    collected.sort(key=lambda amount: amount.starting_index)
    return collected
def _unconsumed(self, list_: list[LabelledToken]) -> list[LabelledToken]:
"""Return elements from list whose index is not in the list of consumed indices.
Parameters
----------
list_ : list[LabelledToken]
List of items to remove consumed elements from.
Returns
-------
list[LabelledToken]
List of items without consumed elements.
"""
return [el for el in list_ if el.index not in self.consumed]
def _remove_invalid_indices(self, idx: list[int]) -> list[int]:
"""Remove indices of tokens that aren't valid in the group.
The invalid indices correspond to punctuation that cannot start or end a phrase,
or brackets that aren't part of a matched pair.
Parameters
----------
idx : list[int]
List of indices for group of consecutive tokens
with same label or PUNC label.
Returns
-------
list[int]
List of indices with invalid punctuation removed.
"""
# For groups with more than 1 element, remove invalid leading and trailing
# punctuation so they don't get incorrectly consumed.
while len(idx) > 1 and self.tokens[idx[0]].text in [
")",
"]",
"}",
",",
":",
";",
"-",
".",
"!",
"?",
"*",
"&",
"/",
"--",
]:
idx = idx[1:]
while len(idx) > 1 and self.tokens[idx[-1]].text in [
"[",
"(",
"{",
",",
":",
";",
"-",
"&",
"/",
"*",
"--",
"+",
]:
idx = idx[:-1]
# Remove brackets that aren't part of a matching pair
idx_to_remove = []
tok_name = None # Unnecessary, but prevents typing errors
stack = defaultdict(list) # Separate stack for each bracket type
for i, tok in enumerate([self.tokens[i].text for i in idx]):
if tok in ["(", ")"]:
tok_name = "PAREN"
elif tok in ["[", "]"]:
tok_name = "SQAURE"
if tok in ["(", "["]:
# Add index to stack when we find an opening parens
stack[tok_name].append(i)
elif tok in [")", "]"]:
if len(stack[tok_name]) == 0:
# If the stack is empty, we've found a dangling closing parens
idx_to_remove.append(i)
else:
# Remove last added index from stack when we find a closing parens
stack[tok_name].pop()
# Insert anything left in stack into idx_to_remove and remove
for stack_idx in stack.values():
idx_to_remove.extend(stack_idx)
idx = [idx[i] for i, _ in enumerate(idx) if i not in idx_to_remove]
return idx
def _fix_punctuation(self, text: str) -> str:
"""Fix some common punctuation errors that result when combining tokens.
Parameters
----------
text : str
Text resulting from combining tokens with same label.
Returns
-------
str
Text, with punctuation errors fixed.
Examples
--------
>>> p = PostProcessor("", [], [], [])
>>> p._fix_punctuation(", some words ( inside ),")
"some words (inside)"
"""
if text == "":
return text
# Correct space following open parens or before close parens
text = text.replace("( ", "(").replace(" )", ")")
# Remove space around forward slash
text = text.replace(" / ", "/")
# Correct space preceding various punctuation
for punc in [",", ":", ";", ".", "!", "?", "*"]:
text = text.replace(f" {punc}", punc)
return text.strip()
def _remove_adjacent_duplicates(self, parts: list[str]) -> list[int]:
"""Find indices of adjacent duplicate strings.
Parameters
----------
parts : list[str]
List of strings with single label.
Returns
-------
list[int]
Indices of elements in parts to keep.
Examples
--------
>>> p = PostProcessor("", [], [], [])
>>> p._remove_isolated_punctuation_and_duplicate_indices(
["word", "word", "another"],
)
[1, 2]
"""
idx_to_keep = []
for i, (first, second) in enumerate(pairwise([*parts, ""])):
if first != second:
idx_to_keep.append(i)
return idx_to_keep
def _replace_string_numbers(self, text: str) -> str:
    """Replace string numbers (e.g. one, two) with numeric values (e.g. 1, 2).

    Every (pattern, substitution) pair in STRING_NUMBERS_REGEXES is applied to
    the text in turn.

    Parameters
    ----------
    text : str
        Text in which to replace string numbers.

    Returns
    -------
    str
        Text with string numbers replaced with numeric values.

    Examples
    --------
    >>> p = PostProcessor("", [], {})
    >>> p._replace_string_numbers("three large onions")
    "3 large onions"

    >>> p._replace_string_numbers("twelve bonbons")
    "12 bonbons"
    """
    converted = text
    # Each value is a tuple of the compiled regular expression that matches a
    # string number and the numeric value to substitute for it.
    for pattern, number in STRING_NUMBERS_REGEXES.values():
        converted = pattern.sub(rf"{number}", converted)
    return converted
def _convert_string_number_qty(self) -> None:
    """Convert QTY tokens that are string numbers to numeric values.

    This function modifies self.tokens in place to replace any string numbers
    with QTY label with their numeric value.

    This function also collapses any quantities split by 'and' into a single
    number e.g.
    one and one-half -> 1 and 1/2 -> 1.5

    This function also collapses any string ranges into a single range e.g.
    one or two -> 1 or 2 -> 1-2

    When a group of consecutive QTY tokens is collapsed, the first token of the
    group receives the combined text and the mean score of the group, and the
    other tokens of the group are removed from self.tokens. The remaining
    tokens are then re-indexed so that each token's ``index`` equals its
    position in self.tokens. This matters because self.consumed is shared
    between code that looks tokens up by list position (``_postprocess``) and
    code that compares against ``token.index`` (``_unconsumed``); without
    re-indexing the two disagree once a token has been removed.
    """
    for token in self.tokens:
        if token.label == "QTY":
            token.text = self._replace_string_numbers(token.text)

    qty_positions = [
        position for position, token in enumerate(self.tokens) if token.label == "QTY"
    ]

    # Find any cases where a group of consecutive QTY tokens can be collapsed
    # into a single token. Modify the first token in the group and mark all
    # others in the group for removal.
    positions_to_remove = set()
    for group in group_consecutive_idx(qty_positions):
        group = list(group)
        if len(group) == 1:
            continue

        fragment = " ".join(self.tokens[i].text for i in group)
        # Try the "split by and" collapse first; only if that leaves the
        # fragment unchanged, try collapsing it as a string range.
        for collapse in (combine_quantities_split_by_and, replace_string_range):
            replacement = collapse(fragment)
            if replacement == fragment:
                continue
            head = self.tokens[group[0]]
            head.score = mean([self.tokens[i].score for i in group])
            head.text = replacement
            positions_to_remove.update(group[1:])
            break

    if positions_to_remove:
        self.tokens = [
            token
            for position, token in enumerate(self.tokens)
            if position not in positions_to_remove
        ]
        # Keep token.index in step with list position after the removal.
        for position, token in enumerate(self.tokens):
            token.index = position
def _sizeable_unit_pattern(
    self, tokens: list[LabelledToken]
) -> list[IngredientAmount]:
    """Identify sentences which match the sizeable unit pattern.

    This pattern is where there is a quantity-unit pair split by one or more
    quantity-unit pairs e.g.

    * 1 28 ounce can
    * 2 17.3 oz (484g) package

    This also handles the case where there is no leading count, e.g.

    * 15 ounce can
    * 12-ounce jar

    In this case, the container unit gets an implied quantity of 1 and the
    weight quantity-unit pair is returned as a secondary amount.

    Return the correct sets of quantities and units, or an empty list.

    For example, for the sentence: 1 28 ounce can; the correct amounts are:
    [
        IngredientAmount(quantity=Fraction(1, 1), unit="can", score=0.x...),
        IngredientAmount(quantity=Fraction(28, 1), unit="ounce", score=0.x...),
    ]

    For the sentence: 15 ounce can; the correct amounts are:
    [
        IngredientAmount(quantity=Fraction(1, 1), unit="can", score=0.x...),
        IngredientAmount(quantity=Fraction(15, 1), unit="ounce", score=0.x...),
    ]

    The ``index`` of every token that is part of a match is added to
    self.consumed so that it is not used again elsewhere.

    Parameters
    ----------
    tokens : list[LabelledToken]
        Labelled tokens for input sentence.

    Returns
    -------
    list[IngredientAmount]
        List of IngredientAmount objects.
    """
    # Pattern with no explicit count, e.g. "15 ounce can".
    implicit_count_pattern = ["QTY", "UNIT", "UNIT"]
    # We assume that the pattern will not be longer than the longest list
    # defined here. Longer patterns are tried first.
    patterns = [
        ["QTY", "QTY", "UNIT", "QTY", "UNIT", "QTY", "UNIT", "UNIT"],
        ["QTY", "QTY", "UNIT", "QTY", "UNIT", "UNIT"],
        ["QTY", "QTY", "UNIT", "UNIT"],
        implicit_count_pattern,
    ]

    # Possible units at end of pattern that constitute a match
    end_units = {
        "bag",
        "block",
        "bottle",
        "box",
        "bucket",
        "can",
        "carton",
        "container",
        "envelope",
        "jar",
        "loaf",
        "package",
        "packet",
        "piece",
        "sachet",
        "slice",
        "tin",
    }

    amounts = []
    for pattern in patterns:
        for matched in self._match_pattern(
            tokens, pattern, ignore_other_labels=True
        ):
            # The [QTY, UNIT, UNIT] pattern can match the tail end of a
            # longer pattern like [QTY, QTY, UNIT, UNIT]. Skip matches
            # whose indices were already consumed by a longer pattern.
            if any(tokens[i].index in self.consumed for i in matched):
                continue

            # Only a match that ends with one of end_units counts.
            if tokens[matched[-1]].text not in end_units:
                continue

            # Get tokens and scores that are part of match
            matching_tokens = [tokens[i].text for i in matched]
            matching_scores = [tokens[i].score for i in matched]

            # Keep track of indices of matching elements so we don't use them
            # again elsewhere
            self.consumed.extend([tokens[i].index for i in matched])

            if pattern == implicit_count_pattern:
                # No explicit count in pattern.
                # E.g., "15 ounce can" -> first amount: 1 can
                unit = matching_tokens.pop(-1)
                first = ingredient_amount_factory(
                    quantity="1",
                    unit=unit,
                    text="1 " + unit,
                    confidence=matching_scores.pop(-1),
                    starting_index=tokens[matched[0]].index,
                    APPROXIMATE=self._is_approximate(matched[0], tokens),
                    string_units=self.string_units,
                    volumetric_units_system=self.volumetric_units_system,
                    custom_units=self.custom_units,
                )
                amounts.append(first)
                # Positions of the remaining quantity-unit pairs: everything
                # except the trailing container unit.
                pair_positions = matched[:-1]
                logger.debug(f"Implicit quantity of '1' applied to '1 {unit}'.")
            else:
                # The first amount is made up of the first and last items
                # Note that this cannot be singular, but may be approximate
                quantity = matching_tokens.pop(0)
                unit = matching_tokens.pop(-1)
                text = " ".join((quantity, unit)).strip()
                first = ingredient_amount_factory(
                    quantity=quantity,
                    unit=unit,
                    text=text,
                    confidence=mean(
                        [matching_scores.pop(0), matching_scores.pop(-1)]
                    ),
                    starting_index=tokens[matched[0]].index,
                    APPROXIMATE=self._is_approximate(matched[0], tokens),
                    string_units=self.string_units,
                    volumetric_units_system=self.volumetric_units_system,
                    custom_units=self.custom_units,
                )
                amounts.append(first)
                # Positions of the remaining quantity-unit pairs: everything
                # except the leading count and the trailing container unit.
                pair_positions = matched[1:-1]

            # Create IngredientAmount objects for the remaining
            # quantity-unit pairs
            for i in range(0, len(matching_tokens), 2):
                quantity = matching_tokens[i]
                unit = matching_tokens[i + 1]
                text = " ".join((quantity, unit)).strip()
                # Average the scores of both the quantity and the unit token.
                # The slice must span two elements; [i : i + 1] only ever
                # contained the quantity score.
                confidence = mean(matching_scores[i : i + 2])
                # If the first amount (e.g. 1 can) is approximate,
                # so are all the pairs in between
                amount = ingredient_amount_factory(
                    quantity=quantity,
                    unit=unit,
                    text=text,
                    confidence=confidence,
                    starting_index=tokens[pair_positions[i]].index,
                    SINGULAR=True,
                    APPROXIMATE=first.APPROXIMATE,
                    string_units=self.string_units,
                    volumetric_units_system=self.volumetric_units_system,
                    custom_units=self.custom_units,
                )
                amounts.append(amount)

    return amounts
def _composite_amounts_pattern(
    self, tokens: list[LabelledToken]
) -> list[CompositeIngredientAmount]:
    """Identify sentences which match the pattern where there are composite amounts.

    This pattern is where there are adjacent amounts that need to be considered
    together, e.g.

    * 1 lb 2 oz
    * 1 pint 2 fl oz
    * 2 cups plus 1 tablespoon

    Return a composite amount object made from the adjacent amounts.
    For example, for the sentence: 1 lb 2 oz ...; the composite amount is:

    CompositeIngredientAmount(
        amounts=[
            IngredientAmount(quantity=Fraction(1, 1), unit="lb", score=0.x...),
            IngredientAmount(quantity=Fraction(2, 1), unit="oz", score=0.x...),
        ],
        join="",
        subtractive=False,
    )

    The ``index`` of every token that forms part of a match is appended to
    ``self.consumed``. The flag helpers called here may append further indices
    for the modifier tokens they recognise.

    Parameters
    ----------
    tokens : list[LabelledToken]
        Labelled tokens for input sentence.

    Returns
    -------
    list[CompositeIngredientAmount]
        List of CompositeIngredientAmount objects, one per accepted match.
    """
    # Define patterns for composite amounts based on a sequence of labels.
    # Also set the indices of the pattern sequence where the first and
    # second amounts start, set the string used to join the two amounts
    # together in text, and set whether the amounts combine subtractively or not.
    # Several entries share the same label sequence; they are told apart by the
    # text of the token at "conj_index".
    patterns = {
        "ptfloz": {
            "pattern": ["QTY", "UNIT", "QTY", "UNIT", "UNIT"],
            "conjunction": None,
            "conj_index": None,
            "start1": 0,
            "start2": 2,
            "join": "",
            "subtractive": False,
        },
        "lboz": {
            "pattern": ["QTY", "UNIT", "QTY", "UNIT"],
            "conjunction": None,
            "conj_index": None,
            "start1": 0,
            "start2": 2,
            "join": "",
            "subtractive": False,
        },
        "plus": {
            "pattern": ["QTY", "UNIT", "COMMENT", "QTY", "UNIT"],
            "conjunction": "plus",
            "conj_index": 2,
            "start1": 0,
            "start2": 3,
            "join": " plus ",
            "subtractive": False,
        },
        "plus_punc": {
            "pattern": ["QTY", "UNIT", "PUNC", "QTY", "UNIT"],
            "conjunction": "+",
            "conj_index": 2,
            "start1": 0,
            "start2": 3,
            "join": " + ",
            "subtractive": False,
        },
        "plus_punc_comment": {
            "pattern": ["QTY", "UNIT", "PUNC", "COMMENT", "QTY", "UNIT"],
            "conjunction": "plus",
            "conj_index": 3,
            "start1": 0,
            "start2": 4,
            "join": " plus ",
            "subtractive": False,
        },
        "and": {
            "pattern": ["QTY", "UNIT", "COMMENT", "QTY", "UNIT"],
            "conjunction": "and",
            "conj_index": 2,
            "start1": 0,
            "start2": 3,
            "join": " and ",
            "subtractive": False,
        },
        "minus": {
            "pattern": ["QTY", "UNIT", "COMMENT", "QTY", "UNIT"],
            "conjunction": "minus",
            "conj_index": 2,
            "start1": 0,
            "start2": 3,
            "join": " minus ",
            "subtractive": True,
        },
        "less": {
            "pattern": ["QTY", "UNIT", "COMMENT", "QTY", "UNIT"],
            "conjunction": "less",
            "conj_index": 2,
            "start1": 0,
            "start2": 3,
            "join": " minus ",
            "subtractive": True,
        },
    }

    # Sets of possible units for the first and last unit matched for the
    # ptfloz and lboz patterns.
    valid_first_units = {"lb", "pound", "pt", "pint"}
    valid_last_units = {"oz", "ounce"}

    composite_amounts = []
    for pattern_name, pattern_info in patterns.items():
        pattern = pattern_info["pattern"]
        start1 = pattern_info["start1"]
        start2 = pattern_info["start2"]
        join = pattern_info["join"]
        conj_index = pattern_info["conj_index"]
        subtractive = pattern_info["subtractive"]

        for match in self._match_pattern(
            tokens, pattern, ignore_other_labels=False
        ):
            # Positions (within ``tokens``) of the parts of the match.
            mstart1 = match[start1]  # Quantity of the 1st amount.
            munit1 = match[start1 + 1]  # Unit of the 1st amount.
            mstart2 = match[start2]  # Quantity of the 2nd amount.
            mlast = match[-1]  # Last unit of the 2nd amount.

            # Check if match fits with "ptfloz" or "lboz" pattern constraints
            if pattern_name in ("ptfloz", "lboz"):
                first_unit = tokens[munit1].text
                last_unit = tokens[mlast].text
                if (
                    first_unit not in valid_first_units
                    or last_unit not in valid_last_units
                ):
                    # Units of match do not align with expectations for
                    # ptfloz or lboz patterns, so skip
                    continue
            # For other patterns, check if token at the conj_index in match matches
            # conjunction and skip if not.
            elif (
                tokens[match[conj_index]].text.lower()
                != pattern_info["conjunction"]
            ):
                continue

            # First amount
            quantity_1 = tokens[mstart1].text
            unit_1 = tokens[munit1].text
            score_1 = mean(tokens[i].score for i in match[start1 : start1 + 2])
            text_1 = " ".join((quantity_1, unit_1)).strip()
            first_amount = ingredient_amount_factory(
                quantity=quantity_1,
                unit=unit_1,
                text=text_1,
                confidence=score_1,
                starting_index=tokens[mstart1].index,
                string_units=self.string_units,
                volumetric_units_system=self.volumetric_units_system,
                custom_units=self.custom_units,
            )

            # Second amount. The unit may span several tokens (e.g. "fl oz").
            quantity_2 = tokens[mstart2].text
            unit_2 = " ".join([tokens[i].text for i in match[start2 + 1 :]])
            score_2 = mean(tokens[i].score for i in match[start2:])
            text_2 = " ".join((quantity_2, unit_2)).strip()
            second_amount = ingredient_amount_factory(
                quantity=quantity_2,
                unit=unit_2,
                text=text_2,
                confidence=score_2,
                starting_index=tokens[mstart2].index,
                string_units=self.string_units,
                volumetric_units_system=self.volumetric_units_system,
                custom_units=self.custom_units,
            )

            # Check if flags should be set and make sure both IngredientAmounts get
            # the same flags.
            # The flag helpers index directly into ``tokens``, so they are given
            # positions within ``tokens`` (the values held in ``match``) rather
            # than the tokens' ``index`` attribute.
            prepared = self._is_prepared(mstart1, tokens) or self._is_prepared(
                mstart2, tokens
            )
            # Both halves must be tested with _is_approximate; testing the second
            # amount with _is_prepared would miss e.g. "1 lb plus about 2 oz".
            approximate = self._is_approximate(
                mstart1, tokens
            ) or self._is_approximate(mstart2, tokens)
            # The _is_singular check only works if the position provided is for a
            # token labelled with UNIT.
            # Therefore, use the unit of the first amount and the last unit of the
            # second amount.
            singular = self._is_singular(munit1, tokens) or self._is_singular(
                mlast, tokens
            )
            if self._is_singular_and_approximate(
                mstart1, tokens
            ) or self._is_singular_and_approximate(mstart2, tokens):
                approximate = True
                singular = True

            if approximate:
                first_amount.APPROXIMATE = True
                second_amount.APPROXIMATE = True
            if singular:
                first_amount.SINGULAR = True
                second_amount.SINGULAR = True
            if prepared:
                first_amount.PREPARED_INGREDIENT = True
                second_amount.PREPARED_INGREDIENT = True

            composite_amounts.append(
                CompositeIngredientAmount(
                    amounts=[first_amount, second_amount],
                    join=join,
                    subtractive=subtractive,
                )
            )

            # Keep track of indices of matching elements so we don't use them
            # again elsewhere
            self.consumed.extend([tokens[i].index for i in match])

    return composite_amounts
def _match_pattern(
    self,
    tokens: list[LabelledToken],
    pattern: list[str],
    ignore_other_labels: bool = True,
) -> list[list[int]]:
    """Find a pattern of labels, returning the indices of the matching labels.

    For example, consider the sentence:
    One 15-ounce can diced tomatoes, with liquid

    It has the tokens and labels:
    ['1', '15', 'ounce', 'can', 'diced', 'tomatoes', ',', 'with', 'liquid']
    ['QTY', 'QTY', 'UNIT', 'UNIT', 'COMMENT', 'NAME', 'COMMA', 'COMMENT', 'COMMENT']

    If we search for the pattern:
    ["QTY", "QTY", "UNIT", "UNIT"]

    Then we get:
    [[0, 1, 2, 3]]

    Matches never overlap: after a match, the search resumes at the first label
    following it.

    Parameters
    ----------
    tokens : list[LabelledToken]
        List of tokens to find label pattern within.
    pattern : list[str]
        Pattern to match inside labels.
        An empty pattern never matches.
    ignore_other_labels : bool
        If True, the pattern matching will ignore any labels not found in pattern
        meaning the indices of the match may not be consecutive.
        If False, the pattern must be found without any interruptions in the
        labels list.

    Returns
    -------
    list[list[int]]
        List of label index lists that match the pattern. Each index is a
        position within ``tokens``.
    """
    # Normalise to a list so the slice comparison below works for any sequence
    # type (a list slice never compares equal to a tuple).
    pattern = list(pattern)
    plen = len(pattern)
    if plen == 0:
        # Nothing to look for; also avoids indexing into an empty pattern.
        return []

    if ignore_other_labels:
        # Select just the labels and positions of labels that are in the pattern.
        plabels = set(pattern)
        candidates = [
            (pos, t.label) for pos, t in enumerate(tokens) if t.label in plabels
        ]
    else:
        # Consider all labels
        candidates = [(pos, t.label) for pos, t in enumerate(tokens)]

    idx = [pos for pos, _ in candidates]
    lbls = [label for _, label in candidates]

    matches = []
    cursor = 0
    # A match cannot start later than this; negative if the pattern is longer
    # than the candidate labels, in which case the loop never runs.
    last_start = len(lbls) - plen
    while cursor <= last_start:
        if lbls[cursor : cursor + plen] == pattern:
            matches.append(idx[cursor : cursor + plen])
            # Jump past the whole match to prevent overlapping matches
            cursor += plen
        else:
            cursor += 1

    return matches
def _fallback_pattern(
    self,
    tokens: list[LabelledToken],
) -> list[IngredientAmount]:
    """Fallback pattern for grouping quantities and units into amounts.

    This is done simply by grouping a QTY with all following UNIT until
    the next QTY.

    A special case is when the token "dozen" is labelled as QTY and
    it follows a QTY. In this case, the quantity of the previous amount is
    modified to include "dozen".

    A UNIT that appears before any QTY starts an amount on its own. It is given
    an implicit quantity of "1" unless the unit token is plural or an indefinite
    quantifier precedes it.

    Parameters
    ----------
    tokens : list[LabelledToken]
        Labelled tokens for input sentence.

    Returns
    -------
    list[IngredientAmount]
        List of IngredientAmount objects.
    """
    amounts = []

    # If a new amount starts with the token after a (, / or [ then we assume it
    # is related to the previous amount.
    # We store position + 1 here so we can check the position in the iteration a
    # new amount is created and avoid needing to check things like i >= 0.
    # Positions within ``tokens`` are used on both sides of the comparison so the
    # check stays correct even if a token's ``index`` differs from its position.
    related_positions = {
        pos + 1 for pos, t in enumerate(tokens) if t.text in ("(", "/", "[")
    }

    for i, token in enumerate(tokens):
        if token.label == "QTY":
            # Guard with i > 0: tokens[-1] would silently wrap around to the last
            # token of the sentence.
            follows_qty = i > 0 and tokens[i - 1].label == "QTY"

            # Whenever we come across a new QTY, create new IngredientAmount with
            # some exceptions.
            if token.text == "dozen" and follows_qty:
                # If the token is "dozen" and the previous label was QTY, we
                # modify the quantity of the previous amount instead.
                amounts[-1].quantity = amounts[-1].quantity + " dozen"
                amounts[-1].confidence.append(token.score)
            else:
                # A multiplier followed by another amount, e.g. "1x 15 ml tbsp",
                # marks this amount as related to the previous one.
                follows_multiplier = follows_qty and tokens[i - 1].text.endswith(
                    "x"
                )
                amounts.append(
                    _PartialIngredientAmount(
                        quantity=token.text,
                        unit=[],
                        confidence=[token.score],
                        starting_index=token.index,
                        related_to_previous=(
                            follows_multiplier or i in related_positions
                        ),
                    )
                )

        if token.label == "UNIT":
            if not amounts:
                # Not come across a QTY yet, so create IngredientAmount
                implicit_quantity = False
                quantity = ""
                if not token.plural and not (
                    INDEFINITE_QUANTIFIERS & {t.text.lower() for t in tokens[:i]}
                ):
                    # If the token is not plural and the sentence does not contain
                    # an indefinite quantifier prior to this token, assume a
                    # quantity of 1.
                    quantity = "1"
                    implicit_quantity = True

                amounts.append(
                    _PartialIngredientAmount(
                        quantity=quantity,
                        unit=[],
                        confidence=[token.score],
                        starting_index=token.index,
                        implicit_quantity=implicit_quantity,
                    )
                )

            # Append token and score for unit to last IngredientAmount
            text = token.text
            if token.plural and amounts[-1].implicit_quantity:
                # If this token is plural and the current amount has an implicit
                # quantity, revert the implicit quantity and re-pluralize the unit.
                amounts[-1].quantity = ""
                amounts[-1].implicit_quantity = False
                text = pluralise_units(token.text, self.custom_units)
            elif token.plural and amounts[-1].quantity == "":
                # If this token is plural and there is no quantity,
                # re-pluralize it.
                # Note that if there was a quantity, the unit would be
                # pluralized within ingredient_amount_factory() as appropriate.
                text = pluralise_units(token.text, self.custom_units)

            amounts[-1].unit.append(text)
            amounts[-1].confidence.append(token.score)

        # Check if any flags should be set. These helpers inspect the label of
        # tokens[i] themselves and return False for labels they do not handle.
        if self._is_approximate(i, tokens):
            amounts[-1].APPROXIMATE = True
        if self._is_singular(i, tokens):
            amounts[-1].SINGULAR = True
        if self._is_singular_and_approximate(i, tokens):
            amounts[-1].APPROXIMATE = True
            amounts[-1].SINGULAR = True
        if self._is_prepared(i, tokens):
            amounts[-1].PREPARED_INGREDIENT = True

    # Set APPROXIMATE, SINGULAR and PREPARED_INGREDIENT flags to be the same for all
    # related amounts.
    amounts = self._distribute_related_flags(amounts)

    # Loop through amounts list to fix unit and confidence
    # Unit needs converting to a string
    # Confidence needs averaging
    # Then convert to IngredientAmount object
    processed_amounts = []
    for amount in amounts:
        unit = " ".join(amount.unit)
        text = " ".join((amount.quantity, unit)).strip()

        # Convert to an IngredientAmount object for returning
        processed_amounts.append(
            ingredient_amount_factory(
                quantity=amount.quantity,
                unit=unit,
                text=text,
                confidence=mean(amount.confidence),
                starting_index=amount.starting_index,
                APPROXIMATE=amount.APPROXIMATE,
                SINGULAR=amount.SINGULAR,
                PREPARED_INGREDIENT=amount.PREPARED_INGREDIENT,
                string_units=self.string_units,
                volumetric_units_system=self.volumetric_units_system,
                custom_units=self.custom_units,
            )
        )
        if amount.implicit_quantity:
            logger.debug(
                f"Implicit quantity of '{amount.quantity}' applied to '{text}'."
            )

    return processed_amounts
def _is_approximate(self, i: int, tokens: list[LabelledToken]) -> bool:
    """Return True if token at current index is approximate.

    The token at ``i`` is considered approximate in any of these cases, checked
    in this order:

    1. It is labelled QTY and the previous token is in APPROXIMATE_PREFIXES.
    2. It is labelled QTY, the previous token is "." and the token before that
       is in APPROXIMATE_PREFIXES (the "approx." special case).
    3. It is labelled UNIT and the previous token is in APPROXIMATE_PREFIXES,
       for cases like "2 generous cups".
    4. It is labelled UNIT or QTY and the following two tokens form one of the
       APPROXIMATE_SUFFIXES, for cases like "2/3 cup or so" or "12 or so".

    If returning True, the ``index`` of each prefix/suffix token that triggered
    the result is appended to ``self.consumed``.

    Parameters
    ----------
    i : int
        Position of current token within ``tokens``.
    tokens : list[LabelledToken]
        List of all tokens.

    Returns
    -------
    bool
        True if current token is approximate.

    Examples
    --------
    The results below assume "about" and "approx" are in APPROXIMATE_PREFIXES.

    * Tokens "about", "3", "cups" labelled COMMENT, QTY, UNIT with ``i=1``
      gives True.
    * Tokens "approx", ".", "250", "g" with "250" labelled QTY and ``i=2``
      gives True.
    """
    if (
        tokens[i].label == "QTY"
        and i > 0
        and tokens[i - 1].text.lower() in APPROXIMATE_PREFIXES
    ):
        # Mark i - 1 element as consumed.
        self.consumed.append(tokens[i - 1].index)
        return True
    elif (
        tokens[i].label == "QTY"
        and i > 1
        and tokens[i - 1].text == "."
        and tokens[i - 2].text.lower() in APPROXIMATE_PREFIXES
    ):
        # Special case for "approx."
        # Mark i - 1 and i - 2 elements as consumed.
        self.consumed.append(tokens[i - 1].index)
        self.consumed.append(tokens[i - 2].index)
        return True
    elif (
        tokens[i].label == "UNIT"
        and i > 0
        and tokens[i - 1].text.lower() in APPROXIMATE_PREFIXES
    ):
        # For cases like "2 generous cups"
        # Mark i - 1 element as consumed.
        self.consumed.append(tokens[i - 1].index)
        return True
    elif (
        tokens[i].label in ["UNIT", "QTY"]
        # Bound against the list that is actually indexed below, so that both
        # i + 1 and i + 2 are guaranteed to be valid positions in ``tokens``.
        and i < len(tokens) - 2
        and [t.text.lower() for t in tokens[i + 1 : i + 3]] in APPROXIMATE_SUFFIXES
    ):
        # For cases like "2/3 cup or so", "12 or so" etc.
        # Mark i + 1 and i + 2 elements as consumed.
        self.consumed.append(tokens[i + 1].index)
        self.consumed.append(tokens[i + 2].index)
        return True

    return False
def _is_singular(self, i: int, tokens: list[LabelledToken]) -> bool:
    """Return True if token at current index is singular.

    A token is singular when it is labelled UNIT and is followed by a token in
    SINGULAR_TOKENS, either directly or after a single closing bracket
    (")" or "]").

    If returning True, the ``index`` of the matching singular token is appended
    to ``self.consumed``.

    Parameters
    ----------
    i : int
        Position of current token within ``tokens``.
    tokens : list[LabelledToken]
        List of all tokens.

    Returns
    -------
    bool
        True if current token is singular.

    Examples
    --------
    Assuming "each" is in SINGULAR_TOKENS: for tokens "3", "oz", "each"
    labelled QTY, UNIT, COMMENT, calling with ``i=1`` gives True.
    """
    final_position = len(tokens) - 1

    # Nothing follows the last token, so it cannot be singular.
    if i == final_position:
        return False

    # Only UNIT tokens can be singular.
    if tokens[i].label != "UNIT":
        return False

    following = tokens[i + 1]
    if following.text.lower() in SINGULAR_TOKENS:
        # Mark i + 1 element as consumed
        self.consumed.append(following.index)
        return True

    # The bracketed case needs two tokens after the unit.
    if i == final_position - 1:
        return False

    # Case where the amount is in brackets
    after_bracket = tokens[i + 2]
    if (
        following.text in (")", "]")
        and after_bracket.text.lower() in SINGULAR_TOKENS
    ):
        # Mark i + 2 element as consumed
        self.consumed.append(after_bracket.index)
        return True

    return False
def _is_singular_and_approximate(self, i: int, tokens: list[LabelledToken]) -> bool:
    """Return True if the current token is approximate and singular.

    There are two cases:

    1. The token label at the given index is QTY and is preceded by a token in
       a list of singular tokens, then token in a list of approximate prefixes.
       e.g. "each nearly 200 g"
    2. The token label at the given index is UNIT and is followed by a sequence
       of tokens in the approximate suffixes then a token in the list of
       singular tokens.
       e.g. "5 lbs or so each"

    If returning True, also mark the indices of the singular and approximate tokens
    as consumed.

    Note: This doesn't handle the case of "each 1 lb or so" but I've not seen that
    in the wild.

    Parameters
    ----------
    i : int
        Position of current token within ``tokens``.
    tokens : list[LabelledToken]
        List of all tokens.

    Returns
    -------
    bool
        True if current token is singular and approximate.

    Examples
    --------
    The results below assume "nearly" is in APPROXIMATE_PREFIXES, ["or", "so"]
    is in APPROXIMATE_SUFFIXES and "each" is in SINGULAR_TOKENS.

    * Tokens "each", "nearly", "3", "oz" labelled COMMENT, COMMENT, QTY, UNIT
      with ``i=2`` gives True.
    * Tokens "2", "lbs", "or", "so", "each" labelled QTY, UNIT, COMMENT,
      COMMENT, COMMENT with ``i=1`` gives True.
    """
    if (
        tokens[i].label == "QTY"
        and i > 1
        and tokens[i - 1].text.lower() in APPROXIMATE_PREFIXES
        and tokens[i - 2].text.lower() in SINGULAR_TOKENS
    ):
        # Mark i - 1 and i - 2 elements as consumed
        self.consumed.append(tokens[i - 1].index)
        self.consumed.append(tokens[i - 2].index)
        return True
    elif (
        tokens[i].label == "UNIT"
        # Bound against the list that is actually indexed, so that tokens[i + 3]
        # below is always a valid position.
        and i < len(tokens) - 3
        and [t.text.lower() for t in tokens[i + 1 : i + 3]] in APPROXIMATE_SUFFIXES
        and tokens[i + 3].text.lower() in SINGULAR_TOKENS
    ):
        # e.g. "2 pounds or so each"
        # Mark i + 1, i + 2 and i + 3 elements as consumed
        self.consumed.append(tokens[i + 1].index)
        self.consumed.append(tokens[i + 2].index)
        self.consumed.append(tokens[i + 3].index)
        return True

    return False
def _is_prepared(self, i: int, tokens: list[LabelledToken]) -> bool:
    """Return True if token at current index refers to the prepared ingredient.

    The token at ``i`` must be labelled QTY. It refers to the prepared
    ingredient when the two tokens immediately before it equal one of the
    PREPARED_INGREDIENT_TOKENS patterns.

    If the QTY is preceded by a token in APPROXIMATE_PREFIXES, then the two
    tokens prior to that are checked against the patterns instead.

    If returning True, the ``index`` of each of the two tokens that matched the
    pattern is appended to ``self.consumed``. The approximate prefix token, if
    any, is not marked as consumed by this method.

    Parameters
    ----------
    i : int
        Position of current token within ``tokens``.
    tokens : list[LabelledToken]
        List of all tokens.

    Returns
    -------
    bool
        True if current token is prepared.

    Examples
    --------
    The results below assume ["to", "yield"] and ["to", "make"] are in
    PREPARED_INGREDIENT_TOKENS and "about" is in APPROXIMATE_PREFIXES.

    * Tokens "to", "yield", "2", "cups" labelled COMMENT, COMMENT, QTY, UNIT
      with ``i=2`` gives True.
    * Tokens "to", "make", "about", "250", "g" labelled COMMENT, COMMENT,
      COMMENT, QTY, UNIT with ``i=3`` gives True.
    """
    # All PREPARED_INGREDIENT_TOKENS have length 2, so cannot be prepared if i < 2.
    # Only QTY tokens are considered.
    if i < 2 or tokens[i].label != "QTY":
        return False

    # Lower-cased text of the two tokens directly before the quantity.
    direct_window = [t.text.lower() for t in tokens[i - 2 : i]]

    # When an approximate prefix sits between the pattern and the quantity, the
    # pattern is looked for one position further back. This needs three tokens
    # before the quantity.
    shifted_window = None
    if i > 2 and tokens[i - 1].text.lower() in APPROXIMATE_PREFIXES:
        shifted_window = [t.text.lower() for t in tokens[i - 3 : i - 1]]

    for candidate in PREPARED_INGREDIENT_TOKENS:
        if direct_window == candidate:
            # Mark i - 1 and i - 2 elements as consumed
            self.consumed.append(tokens[i - 1].index)
            self.consumed.append(tokens[i - 2].index)
            return True

        if shifted_window is not None and shifted_window == candidate:
            # Mark i - 2 and i - 3 elements as consumed
            self.consumed.append(tokens[i - 2].index)
            self.consumed.append(tokens[i - 3].index)
            return True

    return False
def _distribute_related_flags(
    self, amounts: list[_PartialIngredientAmount]
) -> list[_PartialIngredientAmount]:
    """Share flags between amounts that are related to each other.

    Consecutive amounts are collected into groups: an amount with
    ``related_to_previous`` set joins the group of the amount before it, any
    other amount starts a new group. Within a group, each of the APPROXIMATE,
    SINGULAR and PREPARED_INGREDIENT flags is set on every member if it is set
    on at least one member.

    In addition, once a member of a group has a quantity ending in "x"
    (a multiplier, e.g. "1x"), every later member of that group is marked
    SINGULAR.

    The amounts are modified in place.

    Parameters
    ----------
    amounts : list[_PartialIngredientAmount]
        List of amounts.

    Returns
    -------
    list[_PartialIngredientAmount]
        The same amounts, in their original order, with flags shared across
        each group of related amounts.
    """
    # Partition the amounts into runs of related amounts.
    clusters = []
    for candidate in amounts:
        if not clusters or not candidate.related_to_previous:
            clusters.append([candidate])
        else:
            clusters[-1].append(candidate)

    shared_flags = ("APPROXIMATE", "SINGULAR", "PREPARED_INGREDIENT")
    for cluster in clusters:
        # A flag set on any member is copied to every member.
        for flag in shared_flags:
            if any(getattr(member, flag) for member in cluster):
                for member in cluster:
                    setattr(member, flag, True)

        # Members that come after a multiplier (e.g. 1x) are singular.
        multiplier_seen = False
        for member in cluster:
            if multiplier_seen:
                member.SINGULAR = True
            elif member.quantity.endswith("x"):
                multiplier_seen = True

    # Flatten the groups back into a single list for return
    return [member for cluster in clusters for member in cluster]