# Source code for src.dackar.text_processing.AbbrExpander

# Copyright 2024, Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED
"""
Created on October, 2022

@author: wangc, mandd
"""
from .SpellChecker import SpellChecker
from .Preprocessing import Preprocessing
import pandas as pd


class AbbrExpander(object):
    """
      Class to expand abbreviations
    """

    def __init__(self, abbreviationsFilename, checkerType='autocorrect', abbrType='mixed'):
        """
          Abbreviation expander constructor

          Args:

            abbreviationsFilename: str, filename of abbreviations data (read with pandas.read_excel)
            checkerType: str, type for spell checker class, i.e., 'autocorrect', 'pyspellchecker',
              and 'contextual spellcheck', default is 'autocorrect'
            abbrType: str, type of abbreviation method ('spellcheck','hard','mixed') that is employed
              to determine which words are abbreviations that need to be expanded

              * spellcheck: in this case spellchecker is used to identify words that are not recognized
              * hard: here we directly search for the abbreviations in the provided sentence
              * mixed: here we perform first a "hard" search followed by a "spellcheck" search

          Return:

            None
        """
        self.abbrType = abbrType
        self.checkerType = checkerType
        # Abbreviation table loaded from the provided spreadsheet
        self.abbrList = pd.read_excel(abbreviationsFilename)
        # Preprocessing steps applied to every text before expansion
        self.preprocessorList = ['hyphenated_words', 'whitespace', 'numerize']
        self.preprocess = Preprocessing(self.preprocessorList, {})
        self.checker = SpellChecker(checker=self.checkerType)
        # Lookup structure derived by the checker from the abbreviation table
        self.abbrDict = self.checker.generateAbbrDict(self.abbrList)

    def abbrProcess(self, text, splitToList=False):
        """
          Expands the abbreviations in text

          The text is preprocessed and lower-cased before it is handed to the checker.

          Args:

            text: string, the text to expand
            splitToList: bool, True if splitting the text into sentences (on '.') and
              expanding each sentence separately, default is False

          Returns:

            expandedText: string, the text with abbreviations expanded; when splitToList
              is True the expanded sentences are re-joined with '. '
        """
        text = self.preprocess(text)
        if not splitToList:
            return self.checker.handleAbbreviationsDict(self.abbrDict, text.lower(), type=self.abbrType)

        # A line break separates words just like a space does. Replacing it with a
        # space (rather than deleting it) keeps the last word of one line from being
        # fused with the first word of the next one, which would make both unrecognizable.
        text = text.replace("\n", " ")
        # NOTE: the split is purely on '.', so periods inside numbers or acronyms
        # also act as sentence boundaries.
        sentences = [sentence.strip() for sentence in text.split('.')]
        expandedSentences = [
            self.checker.handleAbbreviationsDict(self.abbrDict, sentence.lower(), type=self.abbrType)
            for sentence in sentences
        ]
        return '. '.join(expandedSentences)