Normalisation#
Normalisation is the process of transforming the sentences to ensure that particular features of the sentence have a standard form. This pre-processing step removes as much of the foreseeable variation in the data as is reasonably possible, so that the model is presented with tidy and consistent data and therefore has an easier time learning or labelling.
The PreProcessor
class handles the sentence normalisation for us.
>>> from Preprocess import PreProcessor
>>> p = PreProcessor("1/2 cup orange juice, freshly squeezed")
>>> p.sentence
'0.5 cup orange juice, freshly squeezed'
The normalisation of the input sentence is done immediately when the PreProcessor
class is instantiated. The _normalise()
method of the PreProcessor
class is called, which executes a number of steps to clean up the input sentence.
def _normalise(self, sentence: str) -> str:
    """Run the sentence through every normalisation step, in order.

    Each step receives the output of the previous step. When
    ``self.show_debug_output`` is truthy, the name of the step and the
    sentence it produced are printed after each step.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Normalised ingredient sentence, with leading and trailing whitespace
        stripped
    """
    # The sequence is significant (steps consume the output of earlier steps),
    # so do not reorder these entries.
    pipeline = (
        self._replace_en_em_dash,
        self._replace_string_numbers,
        self._replace_html_fractions,
        self._replace_unicode_fractions,
        self._combine_quantities_split_by_and,
        self._replace_fake_fractions,
        self._split_quantity_and_units,
        self._remove_unit_trailing_period,
        self._replace_string_range,
        self._replace_dupe_units_ranges,
        self._merge_quantity_x,
        self._collapse_ranges,
    )

    for step in pipeline:
        sentence = step(sentence)

        if self.show_debug_output:
            print(f"{step.__name__}: {sentence}")

    return sentence.strip()
Tip
By setting show_debug_output=True
when instantiating the PreProcessor
class, the sentence will be printed out at each step of the normalisation process.
Each of the normalisation functions is detailed below.
_replace_en_em_dash
#
En-dashes and em-dashes are replaced with hyphens.
def _replace_en_em_dash(self, sentence: str) -> str:
    """Replace en-dashes and em-dashes with ASCII hyphens.

    An en-dash becomes a bare hyphen. An em-dash becomes a hyphen with a
    space on either side.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with en and em dashes replaced with hyphens

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._replace_en_em_dash("2 cups flour – white or self-raising")
    '2 cups flour - white or self-raising'

    >>> p = PreProcessor("")
    >>> p._replace_en_em_dash("3–4 sirloin steaks")
    '3-4 sirloin steaks'
    """
    # Neither replacement contains a dash character, so a single translation
    # pass gives the same result as two chained replacements.
    dash_table = str.maketrans({"–": "-", "—": " - "})
    return sentence.translate(dash_table)
_replace_string_numbers
#
Numbers represented in textual form e.g. “one”, “two” are replaced with numeric forms.
The replacements are predefined in a dictionary.
For performance reasons, the regular expressions used to substitute the text with the number are pre-compiled and provided in the STRING_NUMBERS_REGEXES
constant, which is a dictionary where the value is a tuple of (pre-compiled regular expression, substitute value).
# Strings and their numeric representation.
# "one-half" is listed ahead of "one" because the patterns are applied in
# insertion order and the "one" pattern would also match inside "one-half".
STRING_NUMBERS = {
    "one-half": "1/2",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
    "eleven": "11",
    "twelve": "12",
    "thirteen": "13",
    "fourteen": "14",
    "fifteen": "15",
    "sixteen": "16",
    "seventeen": "17",
    "eighteen": "18",
    "nineteen": "19",
}

# Precompile the regular expressions for matching the string numbers.
# Matching is case insensitive, so both "one" and "One" are found, and the
# \b anchors restrict the match to whole words.
# Each value is a tuple of (compiled pattern, replacement text).
STRING_NUMBERS_REGEXES = {
    text: (re.compile(rf"\b({text})\b", flags=re.IGNORECASE), number)
    for text, number in STRING_NUMBERS.items()
}


def _replace_string_numbers(self, sentence: str) -> str:
    """Replace string numbers (e.g. one, two) with numeric values (e.g. 1, 2)

    Every occurrence of a string number is checked individually with
    ``self._valid_string_number_replacement`` and only the occurrences that
    pass the check are replaced. Occurrences that fail are left untouched.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with string numbers replaced with numeric values

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._replace_string_numbers("three large onions")
    '3 large onions'

    >>> p = PreProcessor("")
    >>> p._replace_string_numbers("twelve bonbons")
    '12 bonbons'
    """
    for regex, substitution in STRING_NUMBERS_REGEXES.values():
        # Decide per match. A blanket regex.sub() after the first valid match
        # would also rewrite occurrences that failed validation.
        # match.string is the text currently being substituted, so the match
        # offsets handed to the validator always refer to the right string.
        sentence = regex.sub(
            lambda match, number=substitution: (
                number
                if self._valid_string_number_replacement(match, match.string)
                else match.group(0)
            ),
            sentence,
        )

    return sentence
_replace_html_fractions
#
Fractions represented by HTML entities (e.g. 0.5 as &frac12;
) are replaced with Unicode equivalents (e.g. ½). This is done using the standard library html.unescape()
function.
def _replace_html_fractions(self, sentence: str) -> str:
    """Replace html fractions e.g. &frac12; with unicode equivalents

    The conversion is delegated to ``unescape``, so any other HTML character
    reference in the sentence is converted as well.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with html fractions replaced

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._replace_html_fractions("1&frac34; cups tomato ketchup")
    '1¾ cups tomato ketchup'
    """
    decoded = unescape(sentence)
    return decoded
_replace_unicode_fractions
#
Fractions represented by Unicode fractions are replaced with a textual format (e.g. ½ as 1/2), as defined by the dictionary shown below. A later step (_replace_fake_fractions
) will turn these into decimal numbers.
We have to handle two cases: where the character before the unicode fraction is a hyphen and where it is not. In the latter case, we want to insert a space before the replacement so we don’t accidentally merge with the character before. However, if the character before is a hyphen, we don’t want to do this because we could end up splitting a range up.
# Unicode fraction characters and their "fake fraction" ASCII spelling.
_FRACTION_GLYPHS = {
    "\u215b": "1/8",
    "\u215c": "3/8",
    "\u215d": "5/8",
    "\u215e": "7/8",
    "\u2159": "1/6",
    "\u215a": "5/6",
    "\u2155": "1/5",
    "\u2156": "2/5",
    "\u2157": "3/5",
    "\u2158": "4/5",
    "\xbc": "1/4",
    "\xbe": "3/4",
    "\u2153": "1/3",
    "\u2154": "2/3",
    "\xbd": "1/2",
}

# Unicode fractions and their replacements as fake fractions.
# Most of the time a space is inserted in front of the replacement so that it
# does not merge with the previous token, i.e. 1½ must not become 11/2.
# If the prior character is a hyphen, no space is inserted because that would
# break up a range. The hyphenated entries come first so that they are
# replaced before the plain entries get a chance to match.
UNICODE_FRACTIONS = {
    **{f"-{glyph}": f"-{fake}" for glyph, fake in _FRACTION_GLYPHS.items()},
    **{glyph: f" {fake}" for glyph, fake in _FRACTION_GLYPHS.items()},
}


def _replace_unicode_fractions(self, sentence: str) -> str:
    """Replace unicode fractions with a 'fake' ascii equivalent.

    The ascii equivalent is used because the replace_fake_fractions function
    can deal with spaces between an integer and the fraction.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with unicode fractions replaced

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._replace_unicode_fractions("½ cup icing sugar")
    ' 1/2 cup icing sugar'

    >>> p = PreProcessor("")
    >>> p._replace_unicode_fractions("3⅓ cups warm water")
    '3 1/3 cups warm water'

    >>> p = PreProcessor("")
    >>> p._replace_unicode_fractions("¼-½ teaspoon")
    ' 1/4-1/2 teaspoon'
    """
    # Apply the table in its insertion order (hyphenated forms first).
    for glyph, fake_fraction in UNICODE_FRACTIONS.items():
        sentence = sentence.replace(glyph, fake_fraction)

    return sentence
_combine_quantities_split_by_and
#
Fractional quantities split by ‘and’ e.g. 1 and 1/2 are replaced by the decimal equivalent.
A regular expression is used to find these in the sentence.
# Three capture groups: the whole "<integer> and <fraction>" text, the integer
# before the "and", and the fraction after it.
FRACTION_SPLIT_AND_PATTERN = re.compile(r"((\d+)\sand\s(\d/\d+))")


def _combine_quantities_split_by_and(self, sentence: str) -> str:
    """Combine fractional quantities split by 'and' into single value.

    The integer and the fraction are added together, rounded to 3 decimal
    places and written back in general ("g") number format.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with split fractions replaced with
        single decimal value.

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._combine_quantities_split_by_and("1 and 1/2 tsp fine grain sea salt")
    '1.5 tsp fine grain sea salt'

    >>> p = PreProcessor("")
    >>> p._combine_quantities_split_by_and("1 and 1/4 cups dark chocolate morsels")
    '1.25 cups dark chocolate morsels'
    """
    found = FRACTION_SPLIT_AND_PATTERN.findall(sentence)

    for matched_text, integer_part, fraction_part in found:
        total = Fraction(integer_part) + Fraction(fraction_part)
        decimal_text = f"{round(float(total), 3):g}"
        sentence = sentence.replace(matched_text, decimal_text)

    return sentence
_replace_fake_fractions
#
Fractions represented in a textual format (e.g. 1/2, 3/4) are replaced with decimals.
A regular expression is used to find these in the sentence. The regular expression also matches fractions greater than 1 (e.g. 1 1/2 is 1.5).
# Regex pattern for fraction parts.
# Matches 0+ digits, then 0+ white space characters, then a single digit, a
# forward slash and 1+ digits.
FRACTION_PARTS_PATTERN = re.compile(r"(\d*\s*\d/\d+)")


def _replace_fake_fractions(self, sentence: str) -> str:
    """Attempt to parse fractions from sentence and convert to decimal

    This looks for fractions with the format of 1/2, 1/4, 1 1/2 etc. Each one
    is summed, rounded to 3 decimal places and written back in general ("g")
    number format.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with fractions replaced with decimals

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._replace_fake_fractions("1/2 cup icing sugar")
    '0.5 cup icing sugar'

    >>> p = PreProcessor("")
    >>> p._replace_fake_fractions("2 3/4 pound chickpeas")
    '2.75 pound chickpeas'
    """
    # The pattern can pick up surrounding white space, so trim each match.
    found = [text.strip() for text in FRACTION_PARTS_PATTERN.findall(sentence)]

    # Replace the longest matches first. If the same fraction appears in
    # different forms, e.g. 1/2 and 1 1/2, replacing the short form first would
    # turn 1 1/2 into 1 0.5 instead of 1.5.
    for fraction_text in sorted(found, key=len, reverse=True):
        total = sum(Fraction(part) for part in fraction_text.split())
        decimal_text = f"{round(float(total), 3):g}"
        sentence = sentence.replace(fraction_text, decimal_text)

    return sentence
_split_quantity_and_units
#
A space is enforced between quantities and units to make sure they are tokenized to separate tokens. If a quantity and unit are joined by a hyphen, this is also replaced by a space. This also takes into account certain strings that aren’t technically units, but we want to treat in the same way here.
# Regex patterns for finding quantities and units without a space between them.
# "in" and "x" are not necessarily units, but they are treated like units for
# the purposes of splitting quantities from units.
units_list = FLATTENED_UNITS_LIST + ["in", "x"]
# All three patterns share the same alternation of unit strings.
_units_alternation = "|".join(units_list)
QUANTITY_UNITS_PATTERN = re.compile(rf"(\d)\-?({_units_alternation})")
UNITS_QUANTITY_PATTERN = re.compile(rf"({_units_alternation})(\d)")
UNITS_HYPHEN_QUANTITY_PATTERN = re.compile(rf"({_units_alternation})\-(\d)")


def _split_quantity_and_units(self, sentence: str) -> str:
    """Insert space between quantity and unit

    Three substitutions are applied in turn:

    1. a digit followed by a unit string, with an optional hyphen in between,
       is rewritten as "<digit> <unit>";
    2. a unit string followed directly by a digit is rewritten as
       "<unit> <digit>";
    3. a unit string joined to a following digit by a hyphen is rewritten as
       "<unit> - <digit>".

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with spaces inserted between quantity and units

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._split_quantity_and_units("100g green beans")
    '100 g green beans'

    >>> p = PreProcessor("")
    >>> p._split_quantity_and_units("2-pound red peppers, sliced")
    '2 pound red peppers, sliced'

    >>> p = PreProcessor("")
    >>> p._split_quantity_and_units("2lb1oz cherry tomatoes")
    '2 lb 1 oz cherry tomatoes'

    >>> p = PreProcessor("")
    >>> p._split_quantity_and_units("2lb-1oz cherry tomatoes")
    '2 lb - 1 oz cherry tomatoes'
    """
    substitutions = (
        (QUANTITY_UNITS_PATTERN, r"\1 \2"),
        (UNITS_QUANTITY_PATTERN, r"\1 \2"),
        (UNITS_HYPHEN_QUANTITY_PATTERN, r"\1 - \2"),
    )

    for pattern, replacement in substitutions:
        sentence = pattern.sub(replacement, sentence)

    return sentence
_remove_unit_trailing_period
#
Units with a trailing period have the period removed. This is only done for a subset of units where this has been observed.
def _remove_unit_trailing_period(self, sentence: str) -> str:
    """Remove trailing periods from units e.g. tsp. -> tsp

    Only a fixed set of abbreviations is handled (tsp, tsps, tbsp, tbsps,
    tbs, tb, lb, lbs, oz), in lower case or with a capital first letter.
    The abbreviation must not be preceded by a letter, so that words which
    merely end in one of the abbreviations (e.g. "bulb.") keep their period.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with trailing periods from units removed

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._remove_unit_trailing_period("1 tsp. garlic powder")
    '1 tsp garlic powder'

    >>> p = PreProcessor("")
    >>> p._remove_unit_trailing_period("5 oz. chopped tomatoes")
    '5 oz chopped tomatoes'

    >>> p = PreProcessor("")
    >>> p._remove_unit_trailing_period("1 garlic bulb. Peeled")
    '1 garlic bulb. Peeled'
    """
    # A negative lookbehind for a letter is used instead of \b so that a unit
    # glued to a digit (e.g. "2lb.") is still matched.
    return re.sub(
        r"(?<![A-Za-z])([Tt]sps?|[Tt]bsps?|[Tt]bs|[Tt]b|[Ll]bs?|[Oo]z)\.",
        r"\1",
        sentence,
    )
_replace_string_range
#
Ranges are replaced with a standardised form of X-Y. The regular expression that searches for ranges in the sentence matches anything in the following forms:
1 to 2
1- to 2-
1 or 2
1- or 2-
where the numbers 1 and 2 represent any decimal value.
The purpose of this is to ensure the range is kept as a single token.
# Regex pattern for matching a range in string format e.g. 1 to 2, 8.5 to 12, 4 or 5.
# Assumes fake fractions and unicode fractions have already been replaced.
# Allows the range to include hyphens, which are captured in separate groups.
# Captures the two numbers in the range in separate capture groups.
# If a number starts with a zero, it must be followed by a decimal point to be
# matched.
STRING_RANGE_PATTERN = re.compile(
    r"""
(0\.[0-9]|[1-9][\d\.]*?) # Capture number. Leading zero must be followed by '.'
\s* # Optional space
(\-)? # Optional hyphen
\s* # Optional space
(to|or) # Match to or or
\s* # Optional space
(\-)* # Optional hyphen
\s* # Optional space
( # Capture next two groups together
(0\.[0-9]+|[1-9][\d\.]*?) # Capture number
(\-)? # Optional hyphen
)
""",
    re.VERBOSE,
)


def _replace_string_range(self, sentence: str) -> str:
    """Replace range in the form "<num> to <num>" or "<num> or <num>" with
    standardised range "<num>-<num>".

    A hyphen directly after the second number is kept in the output.

    For example
    -----------
    1 to 2        -> 1-2
    8.5 to 12.5   -> 8.5-12.5
    16- to 9-     -> 16-9-

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with string ranges replaced with standardised range

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._replace_string_range("1 to 2 mashed bananas")
    '1-2 mashed bananas'

    >>> p = PreProcessor("")
    >>> p._replace_string_range("5- or 6- large apples")
    '5-6- large apples'
    """
    # Group 1 is the first number; group 5 is the second number together with
    # its optional trailing hyphen. Both groups take part in every match.
    return STRING_RANGE_PATTERN.sub(
        lambda found: f"{found.group(1)}-{found.group(5)}",
        sentence,
    )
_replace_dupe_units_ranges
#
Ranges where the unit is given for both quantities are replaced with the standardised range format, e.g. 5 oz - 8 oz is replaced by 5-8 oz.
# Regex pattern to match ranges where the unit appears after both quantities
# e.g. 100 g - 200 g. This assumes the quantities and units have already been
# separated by a single space and that all numbers are decimals.
# This regex matches:
#   <quantity> <unit> - <quantity> <unit>
#   <quantity> <unit> to <quantity> <unit>
#   <quantity> <unit> or <quantity> <unit>
# returning the full match and each quantity and unit as capture groups.
DUPE_UNIT_RANGES_PATTERN = re.compile(
    r"""
(
([\d\.]+) # Capture decimal number
\s # Space
([a-zA-Z]+) # Capture text string (possible unit)
\s # Space
(?:\-|to|or) # Hyphen, 'to' or 'or'
\s # Space
([\d\.]+) # Capture decimal number
\s # Space
([a-zA-Z]+) # Capture text string (possible unit)
)
""",
    re.I | re.VERBOSE,
)


def _replace_dupe_units_ranges(self, sentence: str) -> str:
    """Replace ranges where the unit appears in both parts of the range with
    standardised range "<num>-<num> <unit>".

    This assumes that the _split_quantity_and_units has already been run on
    the sentence. A match is only rewritten when both captured units are
    identical strings and that string is in FLATTENED_UNITS_LIST.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with ranges containing unit twice replaced with
        standardised range

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._replace_dupe_units_ranges("227 g - 283.5 g/8-10 oz duck breast")
    '227-283.5 g/8-10 oz duck breast'

    >>> p = PreProcessor("")
    >>> p._replace_dupe_units_ranges("400-500 g/14 oz - 17 oz rhubarb")
    '400-500 g/14-17 oz rhubarb'
    """
    for found in DUPE_UNIT_RANGES_PATTERN.findall(sentence):
        matched_text, low, low_unit, high, high_unit = found

        # Only rewrite when the same, known unit follows both quantities.
        if low_unit == high_unit and low_unit in FLATTENED_UNITS_LIST:
            sentence = sentence.replace(matched_text, f"{low}-{high} {low_unit}")

    return sentence
_merge_quantity_x
#
Merge quantities followed by an “x” into a single token, for example:
1 x -> 1x
0.5 x -> 0.5x
# Regex pattern to match a decimal number followed by a space and a standalone
# "x", e.g. 0.5 x, 1 x, 2 x. The number is captured in a capture group.
# The "x" must not be followed by a letter, so that words which merely start
# with "x" (e.g. "xanthan") are left alone.
QUANTITY_X_PATTERN = re.compile(
    r"""
    ([\d\.]+)       # Capture decimal number
    \s              # Space
    [xX]            # Character 'x' or 'X'
    (?![a-zA-Z])    # ... that is not the first letter of a longer word
    \s*             # Optional space
    """,
    re.VERBOSE,
)


def _merge_quantity_x(self, sentence: str) -> str:
    """Merge any quantity followed by "x" into a single token e.g. 1 x can -> 1x can

    The merged token is always followed by a single space, whatever white
    space followed the "x" in the input.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with single "x" merged into preceding number

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._merge_quantity_x("8 x 450 g/1 lb live lobsters")
    '8x 450 g/1 lb live lobsters'

    >>> p = PreProcessor("")
    >>> p._merge_quantity_x("4 x 100 g wild salmon fillet")
    '4x 100 g wild salmon fillet'
    """
    return QUANTITY_X_PATTERN.sub(r"\1x ", sentence)
_collapse_ranges
#
Remove any white space surrounding the hyphen in a range
# Regex pattern to match a range that has white space between the numbers and
# the hyphen e.g. 0.5 - 1. Only the digit on either side of the hyphen is
# captured.
EXPANDED_RANGE = re.compile(r"(\d)\s*\-\s*(\d)")


def _collapse_ranges(self, sentence: str) -> str:
    """Collapse any whitespace found in a range so the range is of the standard
    form.

    Parameters
    ----------
    sentence : str
        Ingredient sentence

    Returns
    -------
    str
        Ingredient sentence with whitespace removed from ranges

    Examples
    --------
    >>> p = PreProcessor("")
    >>> p._collapse_ranges("8 - 10 g ground pepper")
    '8-10 g ground pepper'

    >>> p = PreProcessor("")
    >>> p._collapse_ranges("0.25 -0.5 tsp salt")
    '0.25-0.5 tsp salt'
    """
    # Re-join the digit before and the digit after the hyphen with nothing
    # but the hyphen in between.
    return EXPANDED_RANGE.sub(
        lambda found: f"{found.group(1)}-{found.group(2)}",
        sentence,
    )
_singlarise_units
#
Units are made singular using a predefined list of plural units and their singular form.
This step is actually performed after tokenisation (see Extracting the features) and we keep track of the index of each token that has been singularised. This is so we can automatically re-pluralise only the tokens that were singularised after the labelling by the model.
# Plural units (keys) and their singular form (values).
# Some entries map a unit to itself (e.g. "g", "ml").
UNITS = {
    "bags": "bag",
    "bars": "bar",
    "baskets": "basket",
    "batches": "batch",
    "blocks": "block",
    "bottles": "bottle",
    "boxes": "box",
    "branches": "branch",
    "bulbs": "bulb",
    "bunches": "bunch",
    "bundles": "bundle",
    "cans": "can",
    "chunks": "chunk",
    "cloves": "clove",
    "clusters": "cluster",
    "cl": "cl",
    "cL": "cL",
    "cm": "cm",
    "cubes": "cube",
    "cups": "cup",
    "cutlets": "cutlet",
    "dashes": "dash",
    "dessertspoons": "dessertspoon",
    "dollops": "dollop",
    "drops": "drop",
    "ears": "ear",
    "envelopes": "envelope",
    "feet": "foot",
    "fl": "fl",
    "g": "g",
    "gallons": "gallon",
    "glasses": "glass",
    "grams": "gram",
    "grinds": "grind",
    "handfuls": "handful",
    "heads": "head",
    "inches": "inch",
    "jars": "jar",
    "kg": "kg",
    "kilograms": "kilogram",
    "knobs": "knob",
    "lbs": "lb",
    "leaves": "leaf",
    "lengths": "length",
    "links": "link",
    "l": "l",
    "liters": "liter",
    "litres": "litre",
    "loaves": "loaf",
    "milliliters": "milliliter",
    "millilitres": "millilitre",
    "ml": "ml",
    "mL": "mL",
    "mm": "mm",
    "mugs": "mug",
    "ounces": "ounce",
    "oz": "oz",
    "packs": "pack",
    "packages": "package",
    "packets": "packet",
    "pairs": "pair",
    "pieces": "piece",
    "pinches": "pinch",
    "pints": "pint",
    "pods": "pod",
    "pounds": "pound",
    "pts": "pt",
    "punnets": "punnet",
    "racks": "rack",
    "rashers": "rasher",
    "recipes": "recipe",
    "rectangles": "rectangle",
    "ribs": "rib",
    "quarts": "quart",
    "sachets": "sachet",
    "scoops": "scoop",
    "segments": "segment",
    "shakes": "shake",
    "sheets": "sheet",
    "shots": "shot",
    "shoots": "shoot",
    "slabs": "slab",
    "slices": "slice",
    "sprigs": "sprig",
    "squares": "square",
    "stalks": "stalk",
    "stems": "stem",
    "sticks": "stick",
    "strips": "strip",
    "tablespoons": "tablespoon",
    "tbsps": "tbsp",
    "tbs": "tb",
    "teaspoons": "teaspoon",
    "tins": "tin",
    "tsps": "tsp",
    "twists": "twist",
    "wedges": "wedge",
    "wheels": "wheel",
}

# Add a capitalized version of each entry (e.g. "Cups" -> "Cup") so that units
# written with a capital first letter are recognised as well.
_capitalized_units = {
    plural.capitalize(): singular.capitalize()
    for plural, singular in UNITS.items()
}
UNITS = {**UNITS, **_capitalized_units}


def _singlarise_units(
    self, tokenised_sentence: list[str]
) -> tuple[list[str], list[int]]:
    """Singularise units in tokenised sentence and return list of singularised
    indices e.g. cups -> cup, tablespoons -> tablespoon

    The list passed in is modified in place and is also the list that is
    returned. Every token that is a key of UNITS is recorded, including the
    ones whose singular form is the same as the token itself.

    Parameters
    ----------
    tokenised_sentence : list[str]
        Tokenised sentence

    Returns
    -------
    list[str]
        Tokenised sentence with units singularised
    list[int]
        List of indices of tokenised sentence that have been singularised
    """
    changed_positions = []

    for position, word in enumerate(tokenised_sentence):
        if word not in UNITS:
            continue

        tokenised_sentence[position] = UNITS[word]
        changed_positions.append(position)

    return (tokenised_sentence, changed_positions)