Source code for ingredient_parser.en.postprocess

#!/usr/bin/env python3

import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from functools import cached_property
from itertools import chain, pairwise
from statistics import mean

from ingredient_parser.en.foundationfoods import match_foundation_foods

from .._common import consume, group_consecutive_idx
from ..dataclasses import (
    CompositeIngredientAmount,
    FoundationFood,
    IngredientAmount,
    IngredientText,
    LabelledToken,
    ParsedIngredient,
)
from ._constants import (
    APPROXIMATE_PREFIXES,
    APPROXIMATE_SUFFIXES,
    INDEFINITE_QUANTIFIERS,
    PREPARED_INGREDIENT_TOKENS,
    SINGULAR_TOKENS,
    STOP_WORDS,
    STRING_NUMBERS_REGEXES,
)
from ._regex import FRACTION_TOKEN_PATTERN
from ._utils import (
    combine_quantities_split_by_and,
    ingredient_amount_factory,
    pluralise_units,
    replace_string_range,
)

logger = logging.getLogger("ingredient-parser.postprocess")

WORD_CHAR = re.compile(r"\w")


@dataclass
class _PartialIngredientAmount:
    """Dataclass for incrementally building ingredient amount information.

    Attributes
    ----------
    quantity : str
        Parsed ingredient quantity
    unit : list[str]
        Unit or unit tokens of parsed ingredient quantity
    confidence : list[float]
        Average confidence of all tokens or list of confidences for each token of parsed
        ingredient amount, between 0 and 1.
    starting_index : int
        Index of token in sentence that starts this amount
    related_to_previous : bool, optional
        If True, indicates it is related to the previous IngredientAmount object. All
        related objects should have the same APPROXIMATE and SINGULAR flags
    APPROXIMATE : bool, optional
        When True, indicates that the amount is approximate.
        Default is False.
    SINGULAR : bool, optional
        When True, indicates if the amount refers to a singular item of the ingredient.
        Default is False.
    PREPARED_INGREDIENT : bool, optional
        When True, indicates the amount applies to the prepared ingredient.
        When False, indicates the amount applies to the ingredient before preparation.
        Default is False.
    implicit_quantity : bool, optional
        When True, indicates that the quantity is implicit rather than explicit. This
        is used to keep track of implicit quantities so that they can be reverted if
        we later encounter a plural unit when constructing the amount.
        Default is False.
    """

    quantity: str
    unit: list[str]
    confidence: list[float]
    starting_index: int
    related_to_previous: bool = False
    APPROXIMATE: bool = False
    SINGULAR: bool = False
    PREPARED_INGREDIENT = False
    implicit_quantity: bool = False



[docs]
class PostProcessor:
    """Recipe ingredient sentence PostProcessor class.

    Performs the necessary postprocessing on the sentence tokens and labels and scores
    for the tokens after tagging with the CRF model in order to return a coherent
    structure of parsed information.

    Attributes
    ----------
    sentence : str
        Original ingredient sentence.
    labelled_tokens : list[LabelledToken],
        List of labelled tokens for original ingredient sentence.
    custom_units : dict[str, str]
        Dict of custom units as plural: singular pairs.
    separate_names : bool, optional
        If True and the sentence contains multiple alternative ingredients, return an
        IngredientText object for each ingredient name, otherwise return a single
        IngredientText object.
        Default is True.
    discard_isolated_stop_words : bool, optional
        If True, isolated stop words are discarded from the name, preparation or
        comment fields. Default value is True.
    string_units : bool, optional
        If True, return all IngredientAmount units as strings.
        If False, convert IngredientAmount units to pint.Unit objects where possible.
        Default is False.
    imperial_units : bool, optional
        If True, use imperial units instead of US customary units for pint.Unit objects
        for the the following units: fluid ounce, cup, pint, quart, gallon.
        Default is False, which results in US customary units being used.
        This has no effect if string_units=True.
    foundation_foods : bool, optional
        If True, populate the foundation_foods field of ParsedIngredient.
        Default is False, in which case the foundation_foods field is an empty list.
    consumed : list[int]
        List of indices of tokens consumed as part of postprocesing the tokens and
        labels.
    """

    def __init__(
        self,
        sentence: str,
        labelled_tokens: list[LabelledToken],
        custom_units: dict[str, str],
        separate_names: bool = True,
        discard_isolated_stop_words: bool = True,
        string_units: bool = False,
        volumetric_units_system: str = "us_customary",
        foundation_foods: bool = False,
    ):
        self.sentence = sentence
        self.tokens = labelled_tokens
        self.custom_units = custom_units
        self.separate_names = separate_names
        self.discard_isolated_stop_words = discard_isolated_stop_words
        self.string_units = string_units
        self.volumetric_units_system = volumetric_units_system
        self.foundation_foods = foundation_foods
        self.consumed = []

    def __repr__(self) -> str:
        """__repr__ method.

        Returns
        -------
        str
            String representation of initialised object.
        """
        return f'PostProcessor("{self.sentence}")'

    def __str__(self) -> str:
        """__str__ method.

        Returns
        -------
        str
            Human readable string representation of object.
        """
        tokens_labels = [(t.text, t.label) for t in self.tokens]
        _str = [
            "Post-processed recipe ingredient sentence",
            f"\t{tokens_labels}",
        ]
        return "\n".join(_str)


[docs]
    @cached_property
    def parsed(self) -> ParsedIngredient:
        """Return parsed ingredient data.

        Returns
        -------
        ParsedIngredient
            Object containing structured data from sentence.
        """
        amounts = self._postprocess_amounts()

        foundationfoods = []
        if self.separate_names:
            name, foundationfoods = self._postprocess_names()
        else:
            # Replace all labels containing NAME with "NAME"
            name_replaced_labels = []
            for t in self.tokens:
                if "NAME" in t.label:
                    t.label = "NAME"
            self.labels = name_replaced_labels
            logger.debug(
                (
                    f"Relabelled tokens to {name_replaced_labels} ",
                    "because seperate_name=False.",
                )
            )

            # Process NAME labels as any other label, but return as a list
            if processed_name := self._postprocess("NAME"):
                name = [processed_name]
                if self.foundation_foods:
                    # Extract name tokens. We can only return a single foundation food,
                    # but we still need to return a list.
                    name_pos = [
                        (t.text, t.pos_tag) for t in self.tokens if t.label == "NAME"
                    ]
                    name_tokens, pos_tags = zip(*name_pos)
                    if ff := match_foundation_foods(
                        list(name_tokens), list(pos_tags), 0
                    ):
                        foundationfoods = [ff]
            else:
                name = []

        size = self._postprocess("SIZE")
        preparation = self._postprocess("PREP")
        comment = self._postprocess("COMMENT")
        purpose = self._postprocess("PURPOSE")

        return ParsedIngredient(
            name=name,
            size=size,
            amount=amounts,
            preparation=preparation,
            comment=comment,
            purpose=purpose,
            foundation_foods=foundationfoods,
            sentence=self.sentence,
        )


    def _postprocess(self, selected_label: str) -> IngredientText | None:
        """Process tokens, labels and scores with selected label into IngredientText.

        Parameters
        ----------
        selected_label : str
            Label of tokens to postprocess.

        Returns
        -------
        IngredientText
            Object containing ingredient comment text and confidence.
        """
        # Select indices of tokens, labels and scores for selected_label
        # Do not include tokens, labels and scores in self.consumed
        label_idx = [
            i
            for i, t in enumerate(self.tokens)
            if t.label in [selected_label, "PUNC"] and i not in self.consumed
        ]

        # If idx is empty or all the selected idx are PUNC, return None
        if not label_idx or all(self.tokens[i].label == "PUNC" for i in label_idx):
            return None

        return self._postprocess_indices(label_idx, selected_label)

    def _postprocess_names(self) -> tuple[list[IngredientText], list[FoundationFood]]:
        """Process tokens, labels and scores for the ingredient name(s).

        This function handles multiple ingredient names e.g. "butter or olive oil",
        determined by the labels provided for each token.
        Where multiple alternative ingredients names are identified, each one is
        returned in a separate IngredientText object.

        Returns
        -------
        list[IngredientText], list[FoundationFoods]
            List of IngredientText objects for names.
            List of matching FoundationFood objects for names.
        """
        name_idx = [
            i
            for i, t in enumerate(self.tokens)
            if ("NAME" in t.label or t.label == "PUNC") and i not in self.consumed
        ]

        # If idx is empty or all the selected idx are PUNC, return None
        if not name_idx or all(self.tokens[i].label == "PUNC" for i in name_idx):
            return [], []

        name_labels = [self.tokens[i].label for i in name_idx]
        bio_groups = self._group_name_labels(name_labels)
        constructed_names = self._construct_names_from_bio_groups(bio_groups)
        names, foundation_foods = self._convert_name_indices_to_object(
            name_idx, constructed_names
        )
        return names, foundation_foods

    def _merge(self, objs: list[IngredientText]) -> IngredientText:
        """Merge list of IngredientText objects into a single object.

        Text values are joined by a space, unless all text values are the same in which
        case only one of values is kept.
        Confidence values are averaged.
        Starting index is set to the lowest value.

        Parameters
        ----------
        names : list[IngredientText]
            List of objects to merge.

        Returns
        -------
        IngredientText
            Merged IngredientText object.
        """
        sorted_objs = sorted(objs, key=lambda x: x.starting_index)

        if len({n.text for n in sorted_objs}) == 1:
            text = sorted_objs[0].text
        else:
            text = " ".join(n.text for n in sorted_objs)

        merged = IngredientText(
            text=text,
            confidence=round(mean(n.confidence for n in sorted_objs), 6),
            starting_index=min(n.starting_index for n in sorted_objs),
        )
        return merged

    def _group_name_labels(self, name_labels: list[str]) -> list[list[tuple[int, str]]]:
        """Group name labels according to name label type.

        B_NAME_TOK and all following I_NAME_TOK up to the next label that is not
        I_NAME_TOK or PUNC are grouped.

        All consecutive NAME_MOD labels are grouped.
        All consecutive NAME_VAR labels are grouped.

        A NAME_SEP label starts a new group.

        Parameters
        ----------
        name_labels : list[str]
            List of name labels.

        Returns
        -------
        list[list[tuple[int, str]]]
            List of BIO groups.
            Each group is a list of tuples, where each tuple is the (index, label) of
            the original name_labels list element.
        """
        name_groups = []
        current_group = []
        prev_label = None
        for idx, label in enumerate(name_labels):
            # Start new group on NAME_SEP name label
            if label == "NAME_SEP":
                if current_group:
                    name_groups.append(current_group)
                current_group = []
            # Start new group for new "B_*" name label
            elif label.startswith("B_"):
                if current_group:
                    name_groups.append(current_group)
                current_group = [(idx, label)]
            # Start new group if encountering new NAME_MOD or NAME_VAR, or append to
            # current group if previous label was the same as current label.
            elif label in ["NAME_MOD", "NAME_VAR"]:
                if prev_label == label:
                    current_group.append((idx, label))
                else:
                    if current_group:
                        name_groups.append(current_group)
                    current_group = [(idx, label)]
            # Must be an I_NAME_TOK or PUNC label, so append to current group
            else:
                current_group.append((idx, label))

            prev_label = label

        # Add last group to list if not empty
        if current_group:
            name_groups.append(current_group)

        return name_groups

    def _construct_names_from_bio_groups(
        self, name_groups: list[list[tuple[int, str]]]
    ) -> list[list[int]]:
        """Construct names from BIO groups.

        All VAR groups are prepended to the next TOK group.
        MOD groups are prepended to all subsequent TOK groups or VAR+TOK groups.

        To make this easier, iterate through the BIO groups from last to first. This
        means we can easily keep track of which TOK group to prepend VAR and MOD
        groups.

        Parameters
        ----------
        name_groups : list[list[tuple[int, str]]]
            List of BIO groups.
            Each group is a list of tuples, where each tuple if the (index, label) of
            the original list element.

        Returns
        -------
        list[list[int]]
            List of name_label indices for each name.
        """
        constructed_names = []

        # Keep track the last TOK group we come across (moving from last to first).
        # Also keep track of whether we have used it by prepending a VAR or MOD
        # group.
        last_encountered_name = None
        last_encountered_name_used = False

        # Iterate from last to first BIO group
        for group in reversed(name_groups):
            current_group_idx, labels = zip(*group)
            current_label = self._get_name_group_label(labels)

            if current_label == "TOK":
                # If we've previously come across a TOK group and haven't used it,
                # then store it.
                if last_encountered_name and not last_encountered_name_used:
                    constructed_names.append(last_encountered_name)

                # Set current group to last_encountered_name group.
                last_encountered_name = current_group_idx
                last_encountered_name_used = False

            elif current_label == "VAR":
                # Prepend this group to last encountered NAME group
                if last_encountered_name:
                    constructed_names.append(current_group_idx + last_encountered_name)
                    last_encountered_name_used = True
                else:
                    # If we are here, then we've come across a VAR group that does not
                    # precede a TOK group, so the model has made an error in it's
                    # labelling. Add this VAR group anyway.
                    constructed_names.append(current_group_idx)

            elif current_label == "MOD":
                # If we've previously come across a NAME group and haven't used it,
                # then store it.
                if last_encountered_name and not last_encountered_name_used:
                    constructed_names.append(last_encountered_name)
                    last_encountered_name_used = True

                # Prepend this group to all constructed names so far
                constructed_names = [
                    current_group_idx + name for name in constructed_names
                ]

        # If we've iterated through all BIO groups and haven't used
        # last_encountered_name, add it to constructed_names now.
        if last_encountered_name and not last_encountered_name_used:
            constructed_names.append(last_encountered_name)

        # Return reversed list, so names are in the order they appear in sentence.
        return list(reversed(constructed_names))

    def _get_name_group_label(self, labels: tuple[str]) -> str:
        """Get the NAME label type for the labels in a name group.

        One of TOK, VAR, MOD.

        Parameters
        ----------
        labels : tuple[str]
            Tuple of labels for name group elements.

        Returns
        -------
        str
            Group label.
        """
        for label in labels:
            if label != "PUNC":
                return label.split("_")[-1]

        return ""

    def _convert_name_indices_to_object(
        self, name_idx: list[int], name_index_groups: list[list[int]]
    ) -> tuple[list[IngredientText], list[FoundationFood]]:
        """Convert grouped indices for name tokens into IngredientText objects.

        If foundation foods are enabled, determine matching foundation food for each
        name.

        If an ingredient name ends with a token with POS tag of DT, IN or JJ, merge it
        with the next name group, if there is one. This is to avoid cases in a sentence
        like "5 fresh large basil leaves" where "large" is given the SIZE label,
        resulting in two separate names: "fresh" and "basil leaves". Instead, we want to
        return a single name: "fresh basil leaves".

        Parameters
        ----------
        name_idx : list[int]
            List of indices of NAME tokens.
        name_index_groups : list[list[int]]
            List of groups of indices corresponding to ingredient names.
            These indices refer to the name_idx list.

        Returns
        -------
        tuple[list[IngredientText], list[FoundationFood]]
            List of deduplicated IngredientText objects and FoundationFoods objects.
        """

        # Keep track of IngredientText objects and indices to merge with next.
        # We do the merge if the name ends with DT, IN, JJ part of speech tag.
        merge_with_next = False
        merge_with_next_idx: list[int] = []

        # Merge name_idx group with next if it ends with DT, IN or JJ part of speech
        # tag.
        merged_name_idx = []
        for group in name_index_groups:
            # Convert from name_label indices to token indices
            token_idx = [name_idx[idx] for idx in group]

            if merge_with_next and merge_with_next_idx:
                token_idx = [*merge_with_next_idx, *token_idx]

            if self._last_non_punc_token_pos(token_idx) in {"DT", "IN", "JJ"}:
                # Mark name for merging with next name.
                merge_with_next = True
                merge_with_next_idx = token_idx
                # Skip to next iteration
                continue
            else:
                merged_name_idx.append(token_idx)
                merge_with_next = False
                merge_with_next_idx = []

        if merge_with_next and merge_with_next_idx:
            # Catch any remaining name indices marked as needing to be merged
            # but haven't been.
            merged_name_idx.append(merge_with_next_idx)

        # Build IngredientText objects, merging duplicate names where found.
        names = []
        foundation_foods = []
        for token_idx in merged_name_idx:
            ing_text = self._postprocess_indices(token_idx, "NAME")
            if not ing_text:
                continue

            if ing_text.text in [n.text for n in names]:
                dupe_idx = [i for i, n in enumerate(names) if n.text == ing_text.text]
                merged = self._merge([*[names[i] for i in dupe_idx], ing_text])
                names[dupe_idx[0]] = merged
            else:
                names.append(ing_text)

                if self.foundation_foods:
                    # We don't match foundation foods for duplicate names because we
                    # will have already found any match for the first instance of the
                    # name.
                    tokens = [self.tokens[i].text for i in token_idx]
                    pos_tags = [self.tokens[i].pos_tag for i in token_idx]
                    if ff := match_foundation_foods(tokens, pos_tags, len(names) - 1):
                        foundation_foods.append(ff)

        return names, foundation_foods

    def _last_non_punc_token_pos(self, token_idx: list[int]) -> str:
        """Return the POS tag at the last index in token_idx.

        Indices corresponding to punctuation are ignored.

        Parameters
        ----------
        token_idx : list[int]
            List of token indices to find last non-punctuation POS tag.

        Returns
        -------
        str
            POS tag for last non-punctuation index.
        """
        for idx in reversed(token_idx):
            if self.tokens[idx].label == "PUNC":
                continue

            return self.tokens[idx].pos_tag

        # Return empty string so we don't try to merge this with the next name.
        # This should never occur because self._postprocess_indices is called before
        # this and that function skips over any potential name that is all punctuation.
        return ""

    def _postprocess_indices(
        self, label_idx: list[int], selected_label: str
    ) -> IngredientText | None:
        """Process list of token indices into a single IngredientText object.

        Consecutive tokens are joined together, with non-consecutive groups being joined
        by a comma (unless selected_label is NAME).

        Indices for tokens that would be ungrammatical are removed prior to joining.
        Duplicate tokens that are adjacent are also removed.

        Parameters
        ----------
        label_idx : list[int]
            List of indices of tokens to postprocess into IngredientText.
        selected_label : str
            Label of tokens being post processed.

        Returns
        -------
        IngredientText | None
            IngredientText object for selected tokens.
            If the post processing results in all tokens being ignored, return None.
        """
        # Join consecutive tokens together and average their score
        parts = []
        confidence_parts = []
        starting_index = label_idx[-1]
        for group in group_consecutive_idx(label_idx):
            idx = list(group)
            idx = self._remove_invalid_indices(idx)

            if all(self.tokens[i].label == "PUNC" for i in idx):
                # Skip if the group only contains PUNC
                continue

            # Convert any fractions in intermediate form (i.e. #1$2) into text
            group_tokens = []
            for i in idx:
                if FRACTION_TOKEN_PATTERN.match(self.tokens[i].text):
                    text_fraction = (
                        self.tokens[i].text.replace("#", " ").replace("$", "/").strip()
                    )
                    # If fraction range, remove space that will follow hyphen caused by
                    # replacing # with space.
                    text_fraction = text_fraction.replace("- ", "-")
                    group_tokens.append(text_fraction)
                else:
                    group_tokens.append(self.tokens[i].text)

            joined = " ".join(group_tokens)
            confidence = mean([self.tokens[i].score for i in idx])

            if self.discard_isolated_stop_words and joined.lower() in STOP_WORDS:
                # Skip part if it's a stop word
                continue

            self.consumed.extend(idx)
            parts.append(joined)
            confidence_parts.append(confidence)
            starting_index = min(starting_index, idx[0])

        # Find the indices of the joined tokens list where the element
        # is the same as the previous element in the list.
        keep_idx = self._remove_adjacent_duplicates(parts)
        parts = [parts[i] for i in keep_idx]
        confidence_parts = [confidence_parts[i] for i in keep_idx]

        # Join all the parts together into a single string and fix any
        # punctuation weirdness as a result.
        # If the selected_label is NAME, join with a space. For all other labels, join
        # with a comma and a space.
        if selected_label == "NAME":
            text = " ".join(parts)
        else:
            text = ", ".join(parts)
        text = self._fix_punctuation(text)
        text = pluralise_units(text, self.custom_units)

        if len(parts) == 0:
            return None

        return IngredientText(
            text=text,
            confidence=round(mean(confidence_parts), 6),
            starting_index=starting_index,
        )

    def _postprocess_amounts(
        self,
    ) -> list[IngredientAmount | CompositeIngredientAmount]:
        """Process tokens, labels and scores into IngredientAmount.

        This is done by combining QTY labels with any following UNIT labels,
        up to the next QTY label.

        The confidence is the average confidence of all labels in the IngredientGroup.

        A number of special cases are considered before the default processing:
        1. "sizeable unit" pattern
        2. "composite amounts" pattern

        Returns
        -------
        list[IngredientAmount | CompositeIngredientAmount]
            List of IngredientAmount and  CompositeIngredientAmount objects.
        """
        self._convert_string_number_qty()

        funcs = [
            self._sizeable_unit_pattern,
            self._composite_amounts_pattern,
            self._fallback_pattern,
        ]

        amounts = []
        for func in funcs:
            tokens = self._unconsumed(self.tokens)

            parsed_amounts = func(tokens)
            amounts.extend(parsed_amounts)

        return sorted(amounts, key=lambda x: x.starting_index)

    def _unconsumed(self, list_: list[LabelledToken]) -> list[LabelledToken]:
        """Return elements from list whose index is not in the list of consumed indices.

        Parameters
        ----------
        list_ : list[LabelledToken]
            List of items to remove consumed elements from.

        Returns
        -------
        list[LabelledToken]
            List of items without consumed elements.
        """
        return [el for el in list_ if el.index not in self.consumed]

    def _remove_invalid_indices(self, idx: list[int]) -> list[int]:
        """Remove indices of tokens that aren't valid in the group.

        The invalid indices correspond to punctuation that cannot start or end a phrase,
        or brackets that aren't part of a matched pair.

        Parameters
        ----------
        idx : list[int]
            List of indices for group of consecutive tokens
            with same label or PUNC label.

        Returns
        -------
        list[int]
            List of indices with invalid punctuation removed.
        """
        # For groups with more than 1 element, remove invalid leading and trailing
        # punctuation so they don't get incorrectly consumed.
        while len(idx) > 1 and self.tokens[idx[0]].text in [
            ")",
            "]",
            "}",
            ",",
            ":",
            ";",
            "-",
            ".",
            "!",
            "?",
            "*",
            "&",
            "/",
            "--",
        ]:
            idx = idx[1:]

        while len(idx) > 1 and self.tokens[idx[-1]].text in [
            "[",
            "(",
            "{",
            ",",
            ":",
            ";",
            "-",
            "&",
            "/",
            "*",
            "--",
            "+",
        ]:
            idx = idx[:-1]

        # Remove brackets that aren't part of a matching pair
        idx_to_remove = []
        tok_name = None  # Unnecessary, but prevents typing errors
        stack = defaultdict(list)  # Separate stack for each bracket type
        for i, tok in enumerate([self.tokens[i].text for i in idx]):
            if tok in ["(", ")"]:
                tok_name = "PAREN"
            elif tok in ["[", "]"]:
                tok_name = "SQAURE"

            if tok in ["(", "["]:
                # Add index to stack when we find an opening parens
                stack[tok_name].append(i)
            elif tok in [")", "]"]:
                if len(stack[tok_name]) == 0:
                    # If the stack is empty, we've found a dangling closing parens
                    idx_to_remove.append(i)
                else:
                    # Remove last added index from stack when we find a closing parens
                    stack[tok_name].pop()

        # Insert anything left in stack into idx_to_remove and remove
        for stack_idx in stack.values():
            idx_to_remove.extend(stack_idx)
        idx = [idx[i] for i, _ in enumerate(idx) if i not in idx_to_remove]

        return idx

    def _fix_punctuation(self, text: str) -> str:
        """Fix some common punctuation errors that result when combining tokens.

        Parameters
        ----------
        text : str
            Text resulting from combining tokens with same label.

        Returns
        -------
        str
            Text, with punctuation errors fixed.

        Examples
        --------
        >>> p = PostProcessor("", [], [], [])
        >>> p._fix_punctuation(", some words ( inside ),")
        "some words (inside)"
        """
        if text == "":
            return text

        # Correct space following open parens or before close parens
        text = text.replace("( ", "(").replace(" )", ")")

        # Remove space around forward slash
        text = text.replace(" / ", "/")

        # Correct space preceding various punctuation
        for punc in [",", ":", ";", ".", "!", "?", "*"]:
            text = text.replace(f" {punc}", punc)

        return text.strip()

    def _remove_adjacent_duplicates(self, parts: list[str]) -> list[int]:
        """Find indices of adjacent duplicate strings.

        Parameters
        ----------
        parts : list[str]
            List of strings with single label.

        Returns
        -------
        list[int]
            Indices of elements in parts to keep.

        Examples
        --------
        >>> p = PostProcessor("", [], [], [])
        >>> p._remove_isolated_punctuation_and_duplicate_indices(
            ["word", "word", "another"],
        )
        [1, 2]
        """

        idx_to_keep = []
        for i, (first, second) in enumerate(pairwise([*parts, ""])):
            if first != second:
                idx_to_keep.append(i)

        return idx_to_keep

    def _replace_string_numbers(self, text: str) -> str:
        """Replace string numbers (e.g. one, two) with numeric values (e.g. 1, 2).

        Parameters
        ----------
        text : str
            Ingredient sentence.

        Returns
        -------
        str
            Ingredient sentence with string numbers replace with numeric values.

        Examples
        --------
        >>> p = PreProcessor("")
        >>> p._replace_string_numbers("three large onions")
        "3 large onions"

        >>> p = PreProcessor("")
        >>> p._replace_string_numbers("twelve bonbons")
        "12 bonbons"
        """
        # STRING_NUMBER_REGEXES is a dict where the values are a tuple of the compiled
        # regular expression for matching a string number e.g. 'one', 'two' and the
        # substitution numerical value for that string number.
        for regex, substitution in STRING_NUMBERS_REGEXES.values():
            text = regex.sub(rf"{substitution}", text)

        return text

    def _convert_string_number_qty(self) -> None:
        """Convert QTY tokens that are string numbers to numeric values.

        This function modifies the tokens, labels and scores lists in place to replace
        any string numbers with QTY label with their numeric value.

        This function also collapses any quantities split by 'and' into a single
        number e.g.
        one and one-half -> 1 and 1/2 -> 1.5

        This function also collapses any string ranges into a single range e.g.
        one or two -> 1 or 2 -> 1-2
        """
        for t in self.tokens:
            if t.label == "QTY":
                self.tokens[t.index].text = self._replace_string_numbers(t.text)

        QTY_idx = [t.index for t in self.tokens if t.label == "QTY"]

        # Find any cases where a group of consecutive QTY tokens can be collapsed into
        # a single token. Modify the first token and score in the group and mark all
        # others in group for deletion.
        idx_to_remove = []
        for idx_group in group_consecutive_idx(QTY_idx):
            idx_group = list(idx_group)
            if len(idx_group) == 1:
                continue

            fragment = " ".join([self.tokens[i].text for i in idx_group])

            replacement = combine_quantities_split_by_and(fragment)
            if replacement != fragment:
                mod_idx = idx_group[0]  # Index to replace with replacement
                self.tokens[mod_idx].score = mean(
                    [self.tokens[i].score for i in idx_group]
                )
                self.tokens[mod_idx].text = replacement

                idx_to_remove.extend(idx_group[1:])
                continue

            replacement = replace_string_range(fragment)
            if replacement != fragment:
                mod_idx = idx_group[0]  # Index to replace with replacement
                self.tokens[mod_idx].score = mean(
                    [self.tokens[i].score for i in idx_group]
                )
                self.tokens[mod_idx].text = replacement

                idx_to_remove.extend(idx_group[1:])
                continue

        if idx_to_remove:
            self.tokens = [t for t in self.tokens if t.index not in idx_to_remove]

    def _sizeable_unit_pattern(
        self, tokens: list[LabelledToken]
    ) -> list[IngredientAmount]:
        """Identify sentences which match the sizeable unit pattern.

        This pattern is where there is a quantity-unit pair split by one or more
        quantity-unit pairs e.g.

        * 1 28 ounce can
        * 2 17.3 oz (484g) package

        This also handles the case where there is no leading count, e.g.

        * 15 ounce can
        * 12-ounce jar

        In this case, the container unit gets an implied quantity of 1 and the
        weight quantity-unit pair is returned as a secondary amount.

        Return the correct sets of quantities and units, or an empty list.

        For example, for the sentence: 1 28 ounce can; the correct amounts are:
        [
            IngredientAmount(quantity=Fraction(1, 1), unit="can", score=0.x...),
            IngredientAmount(quantity=Fraction(28, 1), unit="ounce", score=0.x...),
        ]

        For the sentence: 15 ounce can; the correct amounts are:
        [
            IngredientAmount(quantity=Fraction(1, 1), unit="can", score=0.x...),
            IngredientAmount(quantity=Fraction(15, 1), unit="ounce", score=0.x...),
        ]

        Parameters
        ----------
        tokens : list[LabelledToken]
            Labelled tokens for input sentence.

        Returns
        -------
        list[IngredientAmount]
            List of IngredientAmount objects.
        """
        # We assume that the pattern will not be longer than the longest list
        # defined here.
        patterns = [
            ["QTY", "QTY", "UNIT", "QTY", "UNIT", "QTY", "UNIT", "UNIT"],
            ["QTY", "QTY", "UNIT", "QTY", "UNIT", "UNIT"],
            ["QTY", "QTY", "UNIT", "UNIT"],
            ["QTY", "UNIT", "UNIT"],
        ]

        # List of possible units at end of pattern that constitute a match
        end_units = [
            "bag",
            "block",
            "bottle",
            "box",
            "bucket",
            "can",
            "carton",
            "container",
            "envelope",
            "jar",
            "loaf",
            "package",
            "packet",
            "piece",
            "sachet",
            "slice",
            "tin",
        ]

        amounts = []
        for pattern in patterns:
            for match in self._match_pattern(tokens, pattern, ignore_other_labels=True):
                # The [QTY, UNIT, UNIT] pattern can match the tail end of a
                # longer pattern like [QTY, QTY, UNIT, UNIT]. Skip matches
                # whose indices were already consumed by a longer pattern.
                if any(tokens[i].index in self.consumed for i in match):
                    continue

                # If the pattern ends with one of end_units, we have found a match for
                # this pattern!
                if tokens[match[-1]].text in end_units:
                    # Get tokens and scores that are part of match
                    matching_tokens = [tokens[i].text for i in match]
                    matching_scores = [tokens[i].score for i in match]

                    # Keep track of indices of matching elements so we don't use them
                    # again elsewhere
                    self.consumed.extend([tokens[i].index for i in match])

                    if pattern == patterns[3]:  # ["QTY", "UNIT", "UNIT"]
                        # No explicit count in pattern.
                        # E.g., "15 ounce can" -> first amount: 1 can
                        unit = matching_tokens.pop(-1)
                        first = ingredient_amount_factory(
                            quantity="1",
                            unit=unit,
                            text="1 " + unit,
                            confidence=matching_scores.pop(-1),
                            starting_index=tokens[match[0]].index,
                            APPROXIMATE=self._is_approximate(match[0], tokens),
                            string_units=self.string_units,
                            volumetric_units_system=self.volumetric_units_system,
                            custom_units=self.custom_units,
                        )
                        amounts.append(first)
                        _ = match.pop(-1)

                        logger.debug(f"Implicit quantity of '1' applied to '1 {unit}'.")
                    else:
                        # The first amount is made up of the first and last items
                        # Note that this cannot be singular, but may be approximate
                        quantity = matching_tokens.pop(0)
                        unit = matching_tokens.pop(-1)
                        text = " ".join((quantity, unit)).strip()

                        first = ingredient_amount_factory(
                            quantity=quantity,
                            unit=unit,
                            text=text,
                            confidence=mean(
                                [matching_scores.pop(0), matching_scores.pop(-1)]
                            ),
                            starting_index=tokens[match[0]].index,
                            APPROXIMATE=self._is_approximate(match[0], tokens),
                            string_units=self.string_units,
                            volumetric_units_system=self.volumetric_units_system,
                            custom_units=self.custom_units,
                        )
                        amounts.append(first)
                        # Pop the first and last items from the list of matching
                        # indices
                        _ = match.pop(0)
                        _ = match.pop(-1)

                    # Create IngredientAmount objects for the remaining
                    # quantity-unit pairs
                    for i in range(0, len(matching_tokens), 2):
                        quantity = matching_tokens[i]
                        unit = matching_tokens[i + 1]
                        text = " ".join((quantity, unit)).strip()
                        confidence = mean(matching_scores[i : i + 1])

                        # If the first amount (e.g. 1 can) is approximate,
                        # so are all the pairs in between
                        amount = ingredient_amount_factory(
                            quantity=quantity,
                            unit=unit,
                            text=text,
                            confidence=confidence,
                            starting_index=tokens[match[i]].index,
                            SINGULAR=True,
                            APPROXIMATE=first.APPROXIMATE,
                            string_units=self.string_units,
                            volumetric_units_system=self.volumetric_units_system,
                            custom_units=self.custom_units,
                        )
                        amounts.append(amount)

        return amounts

    def _composite_amounts_pattern(
        self, tokens: list[LabelledToken]
    ) -> list[CompositeIngredientAmount]:
        """Identify sentences which match the pattern where there are composite amounts.

        This pattern is where there are adjacent amounts that need to be considered
        together, e.g.

        * 1 lb 2 oz
        * 1 pint 2 fl oz
        * 2 cups plus 1 tablespoon

        Return a composite amount object made from the adjacent amounts.

        For example, for the sentence: 1 lb 2 oz ...; the composite amount is:
        CompositeAmount(
            amounts=[
                IngredientAmount(quantity=Fraction(1, 1), unit="lb", score=0.x...),
                IngredientAmount(quantity=Fraction(2, 1), unit="oz", score=0.x...),
            ],
            join=""
        )

        Parameters
        ----------
        tokens : list[str]
            Labelled tokens for input sentence.

        Returns
        -------
        list[CompositeIngredientAmount]
            List of IngredientAmount objects.
        """
        # Define patterns for composite amounts based on a sequence of labels.
        # Also set the indices of the pattern sequence where the first and
        # second amounts start, set the string used to join the two amounts
        # together in text, and set whether the amounts combine subtractively or not.
        patterns = {
            "ptfloz": {
                "pattern": ["QTY", "UNIT", "QTY", "UNIT", "UNIT"],
                "conjunction": None,
                "conj_index": None,
                "start1": 0,
                "start2": 2,
                "join": "",
                "subtractive": False,
            },
            "lboz": {
                "pattern": ["QTY", "UNIT", "QTY", "UNIT"],
                "conjunction": None,
                "conj_index": None,
                "start1": 0,
                "start2": 2,
                "join": "",
                "subtractive": False,
            },
            "plus": {
                "pattern": ["QTY", "UNIT", "COMMENT", "QTY", "UNIT"],
                "conjunction": "plus",
                "conj_index": 2,
                "start1": 0,
                "start2": 3,
                "join": " plus ",
                "subtractive": False,
            },
            "plus_punc": {
                "pattern": ["QTY", "UNIT", "PUNC", "QTY", "UNIT"],
                "conjunction": "+",
                "conj_index": 2,
                "start1": 0,
                "start2": 3,
                "join": " + ",
                "subtractive": False,
            },
            "plus_punc_comment": {
                "pattern": ["QTY", "UNIT", "PUNC", "COMMENT", "QTY", "UNIT"],
                "conjunction": "plus",
                "conj_index": 3,
                "start1": 0,
                "start2": 4,
                "join": " plus ",
                "subtractive": False,
            },
            "and": {
                "pattern": ["QTY", "UNIT", "COMMENT", "QTY", "UNIT"],
                "conjunction": "and",
                "conj_index": 2,
                "start1": 0,
                "start2": 3,
                "join": " and ",
                "subtractive": False,
            },
            "minus": {
                "pattern": ["QTY", "UNIT", "COMMENT", "QTY", "UNIT"],
                "conjunction": "minus",
                "conj_index": 2,
                "start1": 0,
                "start2": 3,
                "join": " minus ",
                "subtractive": True,
            },
            "less": {
                "pattern": ["QTY", "UNIT", "COMMENT", "QTY", "UNIT"],
                "conjunction": "less",
                "conj_index": 2,
                "start1": 0,
                "start2": 3,
                "join": " minus ",
                "subtractive": True,
            },
        }

        # List of possible units for first and second amount matched for
        # pltfloz and lboz patterns.
        valid_first_units = {"lb", "pound", "pt", "pint"}
        valid_last_units = {"oz", "ounce"}

        composite_amounts = []
        for pattern_name, pattern_info in patterns.items():
            pattern = pattern_info["pattern"]
            start1 = pattern_info["start1"]
            start2 = pattern_info["start2"]
            join = pattern_info["join"]
            conj_index = pattern_info["conj_index"]
            subtractive = pattern_info["subtractive"]

            for match in self._match_pattern(
                tokens, pattern, ignore_other_labels=False
            ):
                # Check if match fits with "ptfloz" or "lboz" pattern constraints
                if pattern_name in ["ptfloz", "lboz"]:
                    first_unit = tokens[match[start1 + 1]].text
                    last_unit = tokens[match[-1]].text
                    if (
                        first_unit not in valid_first_units
                        or last_unit not in valid_last_units
                    ):
                        # Units of match do not align with expectations for
                        # ptfloz or lboz patterns, so skip
                        continue

                # For other patterns, check if token at the conj_index in match matches
                # conjunction and skip if not.
                elif (
                    tokens[match[conj_index]].text.lower()
                    != pattern_info["conjunction"]
                ):
                    continue

                # First amount
                mstart1 = match[start1]  # Index of start of 1st part in full sentence.
                quantity_1 = tokens[mstart1].text
                unit_1 = tokens[match[start1 + 1]].text
                score_1 = mean(tokens[i].score for i in match[start1 : start1 + 2])
                text_1 = " ".join((quantity_1, unit_1)).strip()

                first_amount = ingredient_amount_factory(
                    quantity=quantity_1,
                    unit=unit_1,
                    text=text_1,
                    confidence=score_1,
                    starting_index=tokens[mstart1].index,
                    string_units=self.string_units,
                    volumetric_units_system=self.volumetric_units_system,
                    custom_units=self.custom_units,
                )

                # Second amount
                mstart2 = match[start2]  # Index of start of 2nd part in full sentence.
                quantity_2 = tokens[mstart2].text
                unit_2 = " ".join([tokens[i].text for i in match[start2 + 1 :]])
                score_2 = mean(tokens[i].score for i in match[start2:])
                text_2 = " ".join((quantity_2, unit_2)).strip()

                second_amount = ingredient_amount_factory(
                    quantity=quantity_2,
                    unit=unit_2,
                    text=text_2,
                    confidence=score_2,
                    starting_index=tokens[mstart2].index,
                    string_units=self.string_units,
                    volumetric_units_system=self.volumetric_units_system,
                    custom_units=self.custom_units,
                )

                # Check if flags should be set and make sure both IngredientAmounts get
                # the same flags.
                prepared = self._is_prepared(
                    tokens[mstart1].index, tokens
                ) or self._is_prepared(tokens[mstart2].index, tokens)

                approximate = self._is_approximate(
                    tokens[mstart1].index, tokens
                ) or self._is_prepared(tokens[mstart2].index, tokens)

                # The _is_singular check only works if the index provided is for a token
                # labelled with UNIT.
                # Therefore, use idx[mstart + 1] to get the unit for the first amount
                # and idx[match[-1]] to get the last unit for the second amount.
                singular = self._is_singular(
                    tokens[mstart1 + 1].index, tokens
                ) or self._is_singular(tokens[match[-1]].index, tokens)

                if self._is_singular_and_approximate(
                    tokens[mstart1].index, tokens
                ) or self._is_singular_and_approximate(tokens[mstart2].index, tokens):
                    approximate = True
                    singular = True

                if approximate:
                    first_amount.APPROXIMATE = True
                    second_amount.APPROXIMATE = True

                if singular:
                    first_amount.SINGULAR = True
                    second_amount.SINGULAR = True

                if prepared:
                    first_amount.PREPARED_INGREDIENT = True
                    second_amount.PREPARED_INGREDIENT = True

                composite_amounts.append(
                    CompositeIngredientAmount(
                        amounts=[first_amount, second_amount],
                        join=join,
                        subtractive=subtractive,
                    )
                )

                # Keep track of indices of matching elements so we don't use them
                # again elsewhere
                self.consumed.extend([tokens[i].index for i in match])

        return composite_amounts

    def _match_pattern(
        self,
        tokens: list[LabelledToken],
        pattern: list[str],
        ignore_other_labels: bool = True,
    ) -> list[list[int]]:
        """Find a pattern of labels, returning the indices of the matching labels.

        For example, consider the sentence:
        One 15-ounce can diced tomatoes, with liquid

        It has the tokens and labels:
        ['1', '15', 'ounce', 'can', 'diced', 'tomatoes', ',', 'with', 'liquid']
        ['QTY', 'QTY', 'UNIT', 'UNIT', 'COMMENT', 'NAME', 'COMMA', 'COMMENT', 'COMMENT']

        If we search for the pattern:
        ["QTY", "QTY", "UNIT", "UNIT"]

        Then we get:
        [[0, 1, 2, 3]]

        Parameters
        ----------
        tokens : list[LabelledToken]
            List of tokens to find label pattern within.
        pattern : list[str]
            Pattern to match inside labels.
        ignore_other_labels : bool
            If True, the pattern matching will ignore any labels not found in pattern
            meaning the indices of the match may not be consecutive.
            If False, the pattern must be found without any interruptions in the
            labels list.

        Returns
        -------
        list[list[int]]
            List of label index lists that match the pattern.
        """
        labels = [t.label for t in tokens]

        plen = len(pattern)
        plabels = set(pattern)

        if ignore_other_labels:
            # Select just the labels and indices of labels that are in the pattern.
            lbls = [label for label in labels if label in plabels]
            idx = [i for i, label in enumerate(labels) if label in plabels]
        else:
            # Consider all labels
            lbls = labels
            idx = [i for i, _ in enumerate(labels)]

        if len(pattern) > len(lbls):
            # We can never find a match.
            return []

        matches = []
        indices = iter(range(len(lbls)))
        for i in indices:
            # Short circuit: If lbls[i] is not equal to the first element
            # of pattern, skip to next iteration
            if lbls[i] == pattern[0] and lbls[i : i + plen] == pattern:
                matches.append(idx[i : i + plen])
                # Advance iterator to prevent overlapping matches
                consume(indices, plen - 1)

        return matches

    def _fallback_pattern(
        self,
        tokens: list[LabelledToken],
    ) -> list[IngredientAmount]:
        """Fallback pattern for grouping quantities and units into amounts.

        This is done simply by grouping a QTY with all following UNIT until
        the next QTY.

        A special case is the for when the token "dozen" is labelled as QTY and
        it follows a QTY. In this case, the quantity of previous amount is
        modified to include "dozen".

        Parameters
        ----------
        tokens : list[LabelledToken]
            Labelled tokens for input sentence.

        Returns
        -------
        list[IngredientAmount]
            List of IngredientAmount objects.
        """
        amounts = []

        # If a new amount starts with the token after a (, / or [ then it we assume it
        # is related to the previous amount
        # We use idx+1 here so we can check the index in the iteration a new amount is
        # created and avoid needing to check things like i >= 0
        related_idx = [t.index + 1 for t in tokens if t.text in ["(", "/", "["]]

        for i, token in enumerate(tokens):
            if token.label == "QTY":
                # Whenever we come across a new QTY, create new IngredientAmount with
                # some exceptions.
                if token.text == "dozen" and tokens[i - 1].label == "QTY":
                    # If the token is "dozen" and the previous label was QTY, in which
                    # case we modify the quantity of the previous amount.
                    amounts[-1].quantity = amounts[-1].quantity + " dozen"
                    amounts[-1].confidence.append(token.score)
                elif tokens[i - 1].label == "QTY" and tokens[i - 1].text.endswith("x"):
                    # This is a multiplier followed by another amount
                    # e.g. "1x 15 ml tbsp", so mark this amount as related to the
                    # previous one.
                    amounts.append(
                        _PartialIngredientAmount(
                            quantity=token.text,
                            unit=[],
                            confidence=[token.score],
                            starting_index=token.index,
                            related_to_previous=True,
                        )
                    )
                else:
                    amounts.append(
                        _PartialIngredientAmount(
                            quantity=token.text,
                            unit=[],
                            confidence=[token.score],
                            starting_index=token.index,
                            related_to_previous=i in related_idx,
                        )
                    )

            if token.label == "UNIT":
                if amounts == []:
                    # Not come across a QTY yet, so create IngredientAmount
                    implicit_quantity = False
                    quantity = ""
                    if not token.plural and not (
                        INDEFINITE_QUANTIFIERS & {t.text.lower() for t in tokens[:i]}
                    ):
                        # If the token is not plural and the sentence does not contain
                        # an indefinite quantifier prior to this token, assume a
                        # quantity of 1.
                        quantity = "1"
                        implicit_quantity = True

                    amounts.append(
                        _PartialIngredientAmount(
                            quantity=quantity,
                            unit=[],
                            confidence=[token.score],
                            starting_index=token.index,
                            implicit_quantity=implicit_quantity,
                        )
                    )

                # Append token and score for unit to last IngredientAmount
                text = token.text
                if token.plural and amounts[-1].implicit_quantity:
                    # If this token is plural and the current amount has an implicit
                    # quantity, revert the implicit quantity and re-pluralize the unit.
                    amounts[-1].quantity = ""
                    amounts[-1].implicit_quantity = False
                    text = pluralise_units(token.text, self.custom_units)
                elif token.plural and amounts[-1].quantity == "":
                    # If this token is plural and there is no quantity,
                    # re-pluralize it.
                    # Note that is there was a quantity, the unit would be
                    # pluralized within ingredient_amount_factory() as appropriate.
                    text = pluralise_units(token.text, self.custom_units)

                amounts[-1].unit.append(text)
                amounts[-1].confidence.append(token.score)

            # Check if any flags should be set
            if self._is_approximate(i, tokens):
                amounts[-1].APPROXIMATE = True

            if self._is_singular(i, tokens):
                amounts[-1].SINGULAR = True

            if self._is_singular_and_approximate(i, tokens):
                amounts[-1].APPROXIMATE = True
                amounts[-1].SINGULAR = True

            if self._is_prepared(i, tokens):
                amounts[-1].PREPARED_INGREDIENT = True

        # Set APPROXIMATE, SINGULAR and PREPARED_INGREDIENT flags to be the same for all
        # related amounts.
        amounts = self._distribute_related_flags(amounts)

        # Loop through amounts list to fix unit and confidence
        # Unit needs converting to a string
        # Confidence needs averaging
        # Then convert to IngredientAmount object
        processed_amounts = []
        for amount in amounts:
            unit = " ".join(amount.unit)
            text = " ".join((amount.quantity, unit)).strip()

            # Convert to an IngredientAmount object for returning
            processed_amounts.append(
                ingredient_amount_factory(
                    quantity=amount.quantity,
                    unit=unit,
                    text=text,
                    confidence=mean(amount.confidence),
                    starting_index=amount.starting_index,
                    APPROXIMATE=amount.APPROXIMATE,
                    SINGULAR=amount.SINGULAR,
                    PREPARED_INGREDIENT=amount.PREPARED_INGREDIENT,
                    string_units=self.string_units,
                    volumetric_units_system=self.volumetric_units_system,
                    custom_units=self.custom_units,
                )
            )

            if amount.implicit_quantity:
                logger.debug(
                    f"Implicit quantity of '{amount.quantity}' applied to '{text}'."
                )

        return processed_amounts

    def _is_approximate(self, i: int, tokens: list[LabelledToken]) -> bool:
        """Return True if token at current index is approximate.

        This is determined by the token label being QTY and the previous token being in
        a list of approximate tokens.

        If returning True, also add index of i - 1 token to self.consumed list.

        Parameters
        ----------
        i : int
            Index of current token.
        tokens : list[LabelledToken]
            List of all tokens.

        Returns
        -------
        bool
            True if current token is approximate.

        Examples
        --------
        >>> p = PostProcessor("", [], [], [])
        >>> p._is_approximate(
            1,
            ["about", "3", "cups"],
            ["COMMENT", "QTY", "UNIT"],
            [0, 1, 2]
        )
        True

        >>> p = PostProcessor("", [], [], [])
        >>> p._is_approximate(
            1,
            ["approx.", "250", "g"],
            ["COMMENT", "QTY", "UNIT"],
            [0, 1, 2]
        )
        True
        """
        if (
            tokens[i].label == "QTY"
            and i > 0
            and tokens[i - 1].text.lower() in APPROXIMATE_PREFIXES
        ):
            # Mark i - 1 element as consumed.
            self.consumed.append(tokens[i - 1].index)
            return True
        elif (
            tokens[i].label == "QTY"
            and i > 1
            and tokens[i - 1].text == "."
            and tokens[i - 2].text.lower() in APPROXIMATE_PREFIXES
        ):
            # Special case for "approx."
            # Mark i - 1 and i - 2 elements as consumed.
            self.consumed.append(tokens[i - 1].index)
            self.consumed.append(tokens[i - 2].index)
            return True
        elif (
            tokens[i].label == "UNIT"
            and i > 0
            and tokens[i - 1].text.lower() in APPROXIMATE_PREFIXES
        ):
            # For cases like "2 generous cups"
            # Mark i - 1 element as consumed.
            self.consumed.append(tokens[i - 1].index)
            return True
        elif (
            tokens[i].label in ["UNIT", "QTY"]
            and i < len(self.tokens) - 2
            and [t.text.lower() for t in tokens[i + 1 : i + 3]] in APPROXIMATE_SUFFIXES
        ):
            # For cases like "2/3 cup or so", "12 or so" etc.
            # Mark i + 1 element as consumed.
            self.consumed.append(tokens[i + 1].index)
            self.consumed.append(tokens[i + 2].index)
            return True

        return False

    def _is_singular(self, i: int, tokens: list[LabelledToken]) -> bool:
        """Return True is token at current index is singular.

        This is determined by the token label being UNIT and the next token being in
        a list of singular tokens.

        If returning True, also add index of i + 1 token to self.consumed list.

        Parameters
        ----------
        i : int
            Index of current token.
        tokens : list[LabelledToken]
            List of all tokens.

        Returns
        -------
        bool
            True if current token is singular.

        Examples
        --------
        >>> p = PostProcessor("", [], [], [])
        >>> p._is_singular(
            1,
            ["3", "oz", "each"],
            ["QTY", "UNIT", "COMMENT"],
            [0, 1, 2]
        )
        True
        """
        if i == len(tokens) - 1:
            return False

        if tokens[i].label == "UNIT" and tokens[i + 1].text.lower() in SINGULAR_TOKENS:
            # Mark i - 1 element as consumed
            self.consumed.append(tokens[i + 1].index)
            return True

        if i == len(tokens) - 2:
            return False

        # Case where the amount is in brackets
        if (
            tokens[i].label == "UNIT"
            and tokens[i + 1].text in [")", "]"]
            and tokens[i + 2].text.lower() in SINGULAR_TOKENS
        ):
            # Mark i - 1 element as consumed
            self.consumed.append(tokens[i + 2].index)
            return True

        return False

    def _is_singular_and_approximate(self, i: int, tokens: list[LabelledToken]) -> bool:
        """Return True if the current token is approximate and singular.

        There are two cases:
            1. The token label at the given index is QTY and is preceded by a token in
               a list of singular tokens, then token in a list of approximate prefixes.
               e.g. "each nearly 200 g"
            2. The token label at the given index is UNIT and is followed by a sequence
               of tokens in the approximate suffixes then a token in the list of
               singular tokens.
               e.g. "5 lbs or so each"

        If returning True, also mark the indices of the singular and approximate tokens
        as consumed.

        Note: This doesn't handle the case of "each 1 lb or so" but I've not seen that
        in the wild.

        Parameters
        ----------
        i : int
            Index of current token.
        tokens : list[LabelledToken]
            List of all tokens.

        Returns
        -------
        bool
            True if current token is singular and approximate.

        Examples
        --------
        >>> p = PostProcessor("", [], [], [])
        >>> p._is_approximate(
            2,
            ["each", nearly", "3", "oz"],
            ["COMMENT", "COMMENT", "QTY", "UNIT"],
            [0, 1, 2, 3]
        )
        True

        >>> p = PostProcessor("", [], [], [])
        >>> p._is_approximate(
            1,
            ["2", lbs", "or", "so", "each"],
            ["QTY", "UNIT", "COMMENT", "COMMENT", "COMMENT"],
            [0, 1, 2, 3, 4]
        )
        True
        """
        if (
            tokens[i].label == "QTY"
            and i > 1
            and tokens[i - 1].text.lower() in APPROXIMATE_PREFIXES
            and tokens[i - 2].text.lower() in SINGULAR_TOKENS
        ):
            # Mark i - 1 and i - 2 elements as consumed
            self.consumed.append(tokens[i - 1].index)
            self.consumed.append(tokens[i - 2].index)
            return True
        elif (
            tokens[i].label == "UNIT"
            and i < len(self.tokens) - 3
            and [t.text.lower() for t in tokens[i + 1 : i + 3]] in APPROXIMATE_SUFFIXES
            and tokens[i + 3].text.lower() in SINGULAR_TOKENS
        ):
            # e.g. "2 pounds or so each"
            self.consumed.append(tokens[i + 1].index)
            self.consumed.append(tokens[i + 2].index)
            self.consumed.append(tokens[i + 3].index)
            return True

        return False

    def _is_prepared(self, i: int, tokens: list[LabelledToken]) -> bool:
        """Return True is token at current index refers to the prepared ingredient.

        This is determined by the token label being QTY and the previous tokens being in
        a list of prepared tokens.
        If the QTY is preceded by a token in APPROXIMATE_PREFIXES, then the tokens prior
        to that are checked for matches against the prepared tokens list.

        If returning True, also add index of tokens from prepared token list to
        self.consumed list.

        Parameters
        ----------
        i : int
            Index of current token.
        tokens : list[LabelledToken]
            List of all tokens.

        Returns
        -------
        bool
            True if current token is prepared.

        Examples
        --------
        >>> p = PostProcessor("", [], [], [])
        >>> p._is_approximate(
            2,
            ["to", "yield", "2", "cups"],
            ["COMMENT", "COMMENT", "QTY", "UNIT"],
            [0, 1, 2, 3]
        )
        True

        >>> p = PostProcessor("", [], [], [])
        >>> p._is_approximate(
            2,
            ["to", "make", "about", "250", "g"],
            ["COMMENT", "COMMENT, "COMMENT", "QTY", "UNIT"],
            [0, 1, 2, 3, 4]
        )
        True
        """
        # All PREPARED_INGREDIENT_TOKENS have length 2, so cannot be prepared if i < 2.
        if i < 2:
            return False

        if tokens[i].label != "QTY":
            return False

        for pattern in PREPARED_INGREDIENT_TOKENS:
            if [t.text.lower() for t in tokens[i - 2 : i]] == pattern:
                # Mark i - 1 and i - 2 elements as consumed
                self.consumed.append(tokens[i - 1].index)
                self.consumed.append(tokens[i - 2].index)
                return True
            elif (
                i > 2
                and tokens[i - 1].text.lower() in APPROXIMATE_PREFIXES
                and [t.text.lower() for t in tokens[i - 3 : i - 1]] == pattern
            ):
                # Mark i - 2 and i - 3 elements as consumed
                self.consumed.append(tokens[i - 2].index)
                self.consumed.append(tokens[i - 3].index)
                return True

        return False

    def _distribute_related_flags(
        self, amounts: list[_PartialIngredientAmount]
    ) -> list[_PartialIngredientAmount]:
        """Distribute all set flags to related amounts.

        Parameters
        ----------
        amounts : list[_PartialIngredientAmount]
            List of amounts.

        Returns
        -------
        list[_PartialIngredientAmount]
            List of amount with all related amounts having the same flags.
        """
        # Group amounts into related groups
        grouped = []
        for amount in amounts:
            if grouped and amount.related_to_previous:
                grouped[-1].append(amount)
            else:
                grouped.append([amount])

        # Set flags for all amounts in group if any amount has flag set
        for group in grouped:
            if any(am.APPROXIMATE for am in group):
                for am in group:
                    am.APPROXIMATE = True

            if any(am.SINGULAR for am in group):
                for am in group:
                    am.SINGULAR = True

            if any(am.PREPARED_INGREDIENT for am in group):
                for am in group:
                    am.PREPARED_INGREDIENT = True

            # If any amount in a group of related amounts is a multiplier (e.g. 1x)
            # then mark all following amounts with SINGULAR=True
            singular_after_multiplier = False
            for amount in group:
                if singular_after_multiplier:
                    amount.SINGULAR = True
                    continue

                if amount.quantity.endswith("x"):
                    singular_after_multiplier = True

        # Flatten list for return
        return list(chain.from_iterable(grouped))