# Source code for src.dackar.text_processing.Abbreviation

# Copyright 2024, Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED
"""
Created on March, 2024

@author: wangc, mandd
"""
import re
import logging
import pandas as pd
from ..config import nlpConfig

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Common lower-case words that must never be treated as abbreviations, even
# when the abbreviation dictionary happens to contain them as keys.
not_acronyms = [
    'was', 'for', 'by', 'me', 'our', 'is', 'he', 'she', 'they', 'him',
    'his', 'her', 'them', 'my', 'mine', 'min', 'max', 'us', 'an', 'had',
    'its', 'the', 'it', 'and', 'a', 'has', 'have', 'should', 'will', 'may',
    'could', 'if', 'are', 'from', 'that', 'can', 'in', 'on', 'above', 'when',
    'where', 'who', 'why', 'were', 'their', 'since', 'of', 'at', 'all', 'to',
    'as', 'this',
]
class Abbreviation(object):
    """
    Class to handle abbreviations

    Holds a dictionary that maps an abbreviation to either a single
    expansion (str) or a list of candidate expansions, and uses it to
    expand abbreviations found in free text.
    """

    def __init__(self):
        """
        Abbreviation expander constructor

        When ``nlpConfig['files']`` contains the key ``'abbreviation_file'``,
        that file is read with ``pandas.read_excel`` and its
        ``'Abbreviation'`` / ``'Full'`` columns become the dictionary.
        Otherwise the dictionary starts empty.

        Args:
            None
        Returns:
            None
        """
        self.type = self.__class__.__name__
        self.name = self.__class__.__name__
        logger.info(f'Create instance of {self.name}')
        if 'abbreviation_file' in nlpConfig['files']:
            self.abbreviationsFilename = nlpConfig['files']['abbreviation_file']
            abbrList = pd.read_excel(self.abbreviationsFilename)
            # NOTE(review): keys are stored exactly as they appear in the
            # sheet, while abbreviationSub lower-cases the text before the
            # lookup -- presumably the file is already lower case; confirm.
            self.abbrDict = dict(zip(abbrList['Abbreviation'], abbrList['Full']))
        else:
            self.abbreviationsFilename = None
            self.abbrDict = {}

    @staticmethod
    def _replaceWord(word, replacement, sentence):
        """
        Replace every stand-alone occurrence of word in sentence

        Args:
            word: string, the abbreviation to look for (taken literally)
            replacement: string, the text to insert (taken literally)
            sentence: string, the sentence to process
        Returns:
            string, the sentence with the occurrences replaced
        """
        # re.escape keeps regex metacharacters in the abbreviation (e.g. "c++")
        # from being interpreted. The look-arounds behave like \b for
        # alphanumeric words but also work when the abbreviation starts or
        # ends with punctuation.
        pattern = r"(?<!\w)%s(?!\w)" % re.escape(word)
        # A callable replacement is inserted verbatim, so backslashes in the
        # expansion are not parsed as group references or escapes.
        return re.sub(pattern, lambda _match: replacement, sentence)

    def abbreviationSub(self, text):
        """
        Expands the abbreviations in text

        The text is lower-cased, newlines are removed, and the result is split
        into sentences on '.'. Within each sentence every whitespace-separated
        word that is a key of the abbreviation dictionary (and is not listed
        in not_acronyms) is replaced by its expansion. An abbreviation whose
        entry is neither a string nor a one-element list is left untouched
        and reported through the logger.

        Args:
            text: string, the text to expand
        Returns:
            expandedText: string, the text with abbreviations expanded,
                sentences re-joined with '. '
        """
        logger.info('Substitute abbreviations with their full expansions')
        # NOTE(review): newlines are dropped without inserting a space, so
        # words separated only by a line break are fused -- confirm intended.
        text = text.replace("\n", "").lower()
        sentences = [t.strip() for t in text.split('.')]
        expandedText = []
        for sent in sentences:
            corrected = sent
            # dict.fromkeys de-duplicates while keeping first-seen order: one
            # substitution already rewrites every occurrence, and repeating it
            # would re-expand an expansion that contains its own abbreviation.
            for word in dict.fromkeys(sent.split()):
                if word in not_acronyms or word not in self.abbrDict:
                    continue
                full = self.abbrDict[word]
                if isinstance(full, str):
                    replacement = full
                elif isinstance(full, list) and len(full) == 1:
                    replacement = full[0]
                else:
                    # Ambiguous or unusable entry: keep the abbreviation.
                    logger.info(f'Can not replace abbreviation {word}, possible solution {full}')
                    continue
                corrected = self._replaceWord(word, replacement, corrected)
            expandedText.append(corrected)
        return '. '.join(expandedText)

    def updateAbbreviation(self, abbrDict, reset=True):
        """
        Update existing abbreviation dictionary

        Keys are lower-cased and stripped. String values are lower-cased;
        list values have every element lower-cased and stripped. Entries
        whose value is neither a string nor a list are ignored.

        Args:
            abbrDict: dict, provided abbreviation dictionary
            reset: bool, True to replace the existing abbreviation dictionary,
                False to merge the provided entries into it
        Returns:
            None
        """
        updateDict = {}
        for key, value in abbrDict.items():
            if isinstance(value, str):
                updateDict[key.lower().strip()] = value.lower()
            elif isinstance(value, list):
                updateDict[key.lower().strip()] = [e.lower().strip() for e in value]
        if reset:
            self.abbrDict = updateDict
        else:
            self.abbrDict.update(updateDict)

    def getAbbreviation(self):
        """
        Get the abbreviation dict

        Returns:
            dict, the abbreviation dictionary currently in use
        """
        return self.abbrDict