# Source code for src.dackar.text_processing.Preprocessing

# Copyright 2024, Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED
"""
Created on October, 2022

@author: dgarrett622, wangc, mandd
"""
from cytoolz import functoolz
import re
import textacy.preprocessing as preprocessing
from numerizer import numerize

# Registries of the preprocessor names this module knows how to build.
# Preprocessing.__init__ dispatches on membership in these lists and
# concatenates them with `+`, so they must remain lists.

# list of available preprocessors in textacy.preprocessing.normalize
textacyNormalize = [
    'bullet_points',
    'hyphenated_words',
    'quotation_marks',
    'repeating_chars',
    'unicode',
    'whitespace',
]
# list of available preprocessors in textacy.preprocessing.remove
textacyRemove = ['accents', 'brackets', 'html_tags', 'punctuation']
# list of available preprocessors in textacy.preprocessing.replace
textacyReplace = [
    'currency_symbols',
    'emails',
    'emojis',
    'hashtags',
    'numbers',
    'phone_numbers',
    'urls',
    'user_handles',
]
# list of available preprocessors from numerizer
numerizer = ['numerize']

# Default pipeline, applied in this order. 'whitespace' appears twice: once
# among the normalizers and once more as the final step.
preprocessorDefaultList = [
    'bullet_points',
    'hyphenated_words',
    'quotation_marks',
    'repeating_chars',
    'whitespace',
    'unicode',
    'accents',
    'html_tags',
    'punctuation',
    'emails',
    'emojis',
    'hashtags',
    'urls',
    'numerize',
    'whitespace',
]

# Default keyword arguments, keyed by preprocessor name.
preprocessorDefaultOptions = {
    'repeating_chars': {'chars': ',', 'maxn': 1},
    'unicode': {'form': 'NFKC'},
    'accents': {'fast': False},
    'punctuation': {'only': ["*", "+", ":", "=", "\\", "^", "_", "|", "~", "..", "..."]},
}

# NOTE: "&" --> " and " and "@" --> " at " are already applied in Preprocessing.__call__
# TODO: maybe "/" --> or
class Preprocessing(object):
    """
    NLP Preprocessing class

    Builds an ordered list of single-argument text functions (from
    textacy.preprocessing and numerizer) and composes them into one pipeline
    that is applied when the instance is called.
    """

    def __init__(self, preprocessorList=None, preprocessorOptions=None):
        """
        Preprocessing object constructor

        Args:
            preprocessorList: list, list of preprocessor names as strings, applied in
                the given order. None (default) selects preprocessorDefaultList
            preprocessorOptions: dict, dictionary of dictionaries containing optional
                arguments for preprocessors, top level key is name of preprocessor.
                None (default) selects preprocessorDefaultOptions

        Returns:
            None
        """
        # Defaults are resolved here rather than in the signature so that no
        # mutable object is bound as a default argument.
        if preprocessorList is None:
            preprocessorList = preprocessorDefaultList
        if preprocessorOptions is None:
            preprocessorOptions = preprocessorDefaultOptions

        self.functionList = []  # list of preprocessor functions
        self.preprocessorNames = textacyNormalize + textacyRemove + textacyReplace + numerizer

        # collect preprocessor functions in a list
        for name in preprocessorList:
            # options for this preprocessor (empty if none were supplied)
            options = preprocessorOptions[name] if name in preprocessorOptions else {}
            # build the function to do the preprocessing
            if name in textacyNormalize:
                self.createTextacyNormalizeFunction(name, options)
            elif name in textacyRemove:
                self.createTextacyRemoveFunction(name, options)
            elif name in textacyReplace:
                self.createTextacyReplaceFunction(name, options)
            elif name in numerizer:
                # numerize is called with a fixed ignore list
                self.functionList.append(lambda x: numerize(x, ignore=['a ', 'A', 'second']))
            else:
                # unknown names are reported and skipped, not raised
                print(f'{name} is ignored! \nAvailable preprocessors: {self.preprocessorNames}')

        # create the preprocessor pipeline (left-to-right composition of functionList)
        self.pipeline = functoolz.compose_left(*self.functionList)

    @staticmethod
    def _validOptions(options, allowedTypes):
        """
        Keeps only the options that are recognized and have the expected type

        Args:
            options: dict, dictionary of preprocessor options supplied by the user
            allowedTypes: dict, maps each recognized option name to the type (or
                tuple of types) its value must be an instance of

        Returns:
            valid: dict, new dictionary with the recognized, well-typed options only
        """
        return {key: value for key, value in options.items()
                if key in allowedTypes and isinstance(value, allowedTypes[key])}

    def createTextacyNormalizeFunction(self, name, options):
        """
        Creates a function from textacy.preprocessing.normalize such that the only
        argument is a string and adds it to the functionList

        Args:
            name: str, name of the preprocessor
            options: dict, dictionary of preprocessor options

        Returns:
            None
        """
        # 'chars' and 'maxn' are options for repeating_chars, 'form' for unicode;
        # an option with the wrong type is ignored
        kwargs = self._validOptions(options, {'chars': str, 'maxn': int, 'form': str})
        # the textacy function is looked up when the pipeline runs, not here
        self.functionList.append(lambda x: getattr(preprocessing.normalize, name)(x, **kwargs))

    def createTextacyRemoveFunction(self, name, options):
        """
        Creates a function from textacy.preprocessing.remove such that the only
        argument is a string and adds it to the functionList

        Args:
            name: str, name of the preprocessor
            options: dict, dictionary of preprocessor options

        Returns:
            None
        """
        # 'fast' is the option for accents, 'only' for brackets and punctuation;
        # an option with the wrong type is ignored
        kwargs = self._validOptions(options, {'fast': bool, 'only': (str, list, tuple)})
        self.functionList.append(lambda x: getattr(preprocessing.remove, name)(x, **kwargs))

    def createTextacyReplaceFunction(self, name, options):
        """
        Creates a function from textacy.preprocessing.replace such that the only
        argument is a string and adds it to the functionList

        Args:
            name: str, name of the preprocessor
            options: dict, dictionary of preprocessor options

        Returns:
            None
        """
        # 'repl' is the replacement text; if it is not a str it is ignored
        kwargs = self._validOptions(options, {'repl': str})
        self.functionList.append(lambda x: getattr(preprocessing.replace, name)(x, **kwargs))

    def __call__(self, text):
        """
        Performs the preprocessing

        Args:
            text: str, string of text to preprocess

        Returns:
            processed: str, string of processed text
        """
        # drop leading/trailing newlines only; other whitespace is left to the pipeline
        processed = text.strip('\n')
        # spell out symbols before the pipeline runs
        processed = re.sub(r'&', ' and ', processed)
        # NOTE: replacement of '/' is currently disabled, '/' is passed through unchanged
        processed = re.sub(r'@', ' at ', processed)
        return self.pipeline(processed)