Source code for ingredient_parser.parsers

#!/usr/bin/env python3

from dataclasses import dataclass
from importlib.resources import as_file, files

import pycrfsuite

from ._utils import pluralise_units
from .postprocess import ParsedIngredient, PostProcessor
from .preprocess import PreProcessor

# Create TAGGER object that can be reused between function calls.
# We only want to load the model into TAGGER once, and only when it is needed
# (from parse_ingredient() or inspect_parser()), not whenever anything from
# ingredient_parser is imported.
TAGGER = pycrfsuite.Tagger()


def load_model_if_not_loaded():
    """Load model into TAGGER variable if not loaded.

    There isn't a simple way to check if the model is loaded or not, so we try
    to call TAGGER.info() which will raise a RuntimeError if the model is not
    loaded yet.
    """
    try:
        TAGGER.info()
    except RuntimeError:
        with as_file(files(__package__) / "model.en.crfsuite") as p:
            TAGGER.open(str(p))
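
# Note on the lazy-loading pattern above (a minimal usage sketch, not part of
# the library source): load_model_if_not_loaded() is safe to call repeatedly,
# because once the model file has been opened TAGGER.info() no longer raises
# RuntimeError and the function returns without doing anything.
#
#     load_model_if_not_loaded()
#     load_model_if_not_loaded()  # effectively a no-op: model already loaded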
def parse_ingredient(
    sentence: str,
    discard_isolated_stop_words: bool = True,
    string_units: bool = False,
    imperial_units: bool = False,
) -> ParsedIngredient:
    """Parse an ingredient sentence using the CRF model to return structured data.

    Parameters
    ----------
    sentence : str
        Ingredient sentence to parse
    discard_isolated_stop_words : bool, optional
        If True, any isolated stop words in the name, preparation, or comment
        fields are discarded.
        Default is True.
    string_units : bool
        If True, return all IngredientAmount units as strings.
        If False, convert IngredientAmount units to pint.Unit objects where
        possible.
        Default is False.
    imperial_units : bool
        If True, use imperial units instead of US customary units for pint.Unit
        objects for the following units: fluid ounce, cup, pint, quart, gallon.
        Default is False, which results in US customary units being used.
        This has no effect if string_units=True.

    Returns
    -------
    ParsedIngredient
        ParsedIngredient object of structured data parsed from input string
    """
    load_model_if_not_loaded()

    processed_sentence = PreProcessor(sentence)
    tokens = processed_sentence.tokenized_sentence
    labels = TAGGER.tag(processed_sentence.sentence_features())
    scores = [TAGGER.marginal(label, i) for i, label in enumerate(labels)]

    # Re-pluralise tokens that were singularised if the label isn't UNIT
    # For tokens with UNIT label, we'll deal with them below
    for idx in processed_sentence.singularised_indices:
        token = tokens[idx]
        label = labels[idx]
        if label != "UNIT":
            tokens[idx] = pluralise_units(token)

    postprocessed_sentence = PostProcessor(
        sentence,
        tokens,
        labels,
        scores,
        discard_isolated_stop_words=discard_isolated_stop_words,
        string_units=string_units,
        imperial_units=imperial_units,
    )
    return postprocessed_sentence.parsed
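
# Usage sketch for parse_ingredient() (illustrative only: the example sentence
# is made up, and only the name, preparation and comment fields mentioned in
# the docstring above are assumed to exist on ParsedIngredient):
#
#     parsed = parse_ingredient("2 cups finely chopped onions", string_units=True)
#     print(parsed.name, parsed.preparation, parsed.comment)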
def parse_multiple_ingredients(
    sentences: list[str],
    discard_isolated_stop_words: bool = True,
    string_units: bool = False,
    imperial_units: bool = False,
) -> list[ParsedIngredient]:
    """Parse multiple ingredient sentences in one go.

    This function accepts a list of sentences, with each element of the list
    representing one ingredient sentence.
    A list of ParsedIngredient objects is returned, one per input sentence.
    This function is a simple for-loop that iterates through each element of
    the input list.

    Parameters
    ----------
    sentences : list[str]
        List of sentences to parse
    discard_isolated_stop_words : bool, optional
        If True, any isolated stop words in the name, preparation, or comment
        fields are discarded.
        Default is True.
    string_units : bool
        If True, return all IngredientAmount units as strings.
        If False, convert IngredientAmount units to pint.Unit objects where
        possible.
        Default is False.
    imperial_units : bool
        If True, use imperial units instead of US customary units for pint.Unit
        objects for the following units: fluid ounce, cup, pint, quart, gallon.
        Default is False, which results in US customary units being used.
        This has no effect if string_units=True.

    Returns
    -------
    list[ParsedIngredient]
        List of ParsedIngredient objects of structured data parsed from input
        sentences
    """
    return [
        parse_ingredient(
            sentence,
            discard_isolated_stop_words=discard_isolated_stop_words,
            string_units=string_units,
            imperial_units=imperial_units,
        )
        for sentence in sentences
    ]
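
# Usage sketch for parse_multiple_ingredients() (illustrative only; the input
# sentences are made up). Because the function is a list comprehension over
# parse_ingredient(), the returned list always matches the length and order of
# the input:
#
#     sentences = ["100 g dark chocolate, roughly chopped", "2 tsp vanilla extract"]
#     parsed = parse_multiple_ingredients(sentences, imperial_units=True)
#     assert len(parsed) == len(sentences)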
@dataclass
class ParserDebugInfo:
    """Dataclass for holding intermediate objects generated during ingredient
    sentence parsing.

    Attributes
    ----------
    sentence : str
        Input ingredient sentence.
    PreProcessor : PreProcessor
        PreProcessor object created using input sentence.
    PostProcessor : PostProcessor
        PostProcessor object created using tokens, labels and scores from input
        sentence.
    Tagger : pycrfsuite.Tagger
        CRF model tagger object.
    """

    sentence: str
    PreProcessor: PreProcessor
    PostProcessor: PostProcessor
    Tagger: pycrfsuite.Tagger = TAGGER
def inspect_parser(
    sentence: str,
    discard_isolated_stop_words: bool = True,
    string_units: bool = False,
    imperial_units: bool = False,
) -> ParserDebugInfo:
    """Return object containing all intermediate objects used in the parsing of
    a sentence.

    Parameters
    ----------
    sentence : str
        Ingredient sentence to parse
    discard_isolated_stop_words : bool, optional
        If True, any isolated stop words in the name, preparation, or comment
        fields are discarded.
        Default is True.
    string_units : bool
        If True, return all IngredientAmount units as strings.
        If False, convert IngredientAmount units to pint.Unit objects where
        possible.
        Default is False.
    imperial_units : bool
        If True, use imperial units instead of US customary units for pint.Unit
        objects for the following units: fluid ounce, cup, pint, quart, gallon.
        Default is False, which results in US customary units being used.
        This has no effect if string_units=True.

    Returns
    -------
    ParserDebugInfo
        ParserDebugInfo object containing the PreProcessor object, PostProcessor
        object and Tagger.
    """
    load_model_if_not_loaded()

    processed_sentence = PreProcessor(sentence)
    tokens = processed_sentence.tokenized_sentence
    labels = TAGGER.tag(processed_sentence.sentence_features())
    scores = [TAGGER.marginal(label, i) for i, label in enumerate(labels)]

    # Re-pluralise tokens that were singularised if the label isn't UNIT
    # For tokens with UNIT label, we'll deal with them below
    for idx in processed_sentence.singularised_indices:
        token = tokens[idx]
        label = labels[idx]
        if label != "UNIT":
            tokens[idx] = pluralise_units(token)

    postprocessed_sentence = PostProcessor(
        sentence,
        tokens,
        labels,
        scores,
        discard_isolated_stop_words=discard_isolated_stop_words,
        string_units=string_units,
        imperial_units=imperial_units,
    )

    return ParserDebugInfo(
        sentence=sentence,
        PreProcessor=processed_sentence,
        PostProcessor=postprocessed_sentence,
    )
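
# A minimal, runnable demonstration of the functions defined above (not part of
# the library source). The example sentence is an assumption; the attributes
# accessed on the ParserDebugInfo fields are the same ones used inside
# inspect_parser() itself.
if __name__ == "__main__":
    demo_sentence = "3 tablespoons fresh lime juice, plus more to taste"

    # High-level API: structured data for a single sentence
    print(parse_ingredient(demo_sentence))

    # Debugging API: inspect the tokens and the labels the CRF model assigned
    debug = inspect_parser(demo_sentence)
    tokens = debug.PreProcessor.tokenized_sentence
    labels = debug.Tagger.tag(debug.PreProcessor.sentence_features())
    for token, label in zip(tokens, labels):
        print(f"{token}\t{label}")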