Source code for src.dackar.text_processing.SpellChecker

# Copyright 2024, Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED
"""
Created on October, 2022

@author: mandd, wangc
"""
import re
import itertools
import numpy as np
import spacy
from spacy.vocab import Vocab
import logging

logger = logging.getLogger(__name__)

# The spell-checking back ends are optional dependencies. A missing package is
# logged together with an install hint instead of aborting the import of this
# module; the corresponding name simply stays undefined, so only the code path
# that needs the missing back end fails later on.
try:
    from contextualSpellCheck.contextualSpellCheck import ContextualSpellCheck
except ModuleNotFoundError as error:
    logger.error("Unable to import contextualSpellCheck: %s", error)
    logger.info("Please try to install it via: 'pip install contextualSpellCheck'")

try:
    import autocorrect
except ModuleNotFoundError as error:
    logger.error("Unable to import autocorrect: %s", error)
    logger.info("Please try to install it via: 'pip install autocorrect'")

try:
    from spellchecker import SpellChecker as PySpellChecker
except ModuleNotFoundError as error:
    logger.error("Unable to import spellchecker: %s", error)
    # The 'spellchecker' module is shipped by the 'pyspellchecker' distribution;
    # the PyPI project literally named 'spellchecker' is an unrelated package.
    logger.info("Please try to install it via: 'pip install pyspellchecker'")

from ..similarity.simUtils import wordsSimilarity
from ..config import nlpConfig
class SpellChecker(object):
    """
    Object to find misspelled words and automatically correct spelling

    Three back ends are supported and selected by name: 'autocorrect',
    'pyspellchecker' and, for any other name, contextualSpellCheck running on
    top of a spaCy pipeline.

    Note: when using autocorrect, one needs to conduct a spell test to identify
    the threshold (the word frequencies)
    """

    # Tokenizer used by the dictionary based checkers: a token is a maximal run of
    # characters that are neither whitespace nor one of ! , . ? " : ; -
    _TOKEN_PATTERN = re.compile(r'[^\s!,.?":;-]+')
    # Frequency assigned to user supplied words in the autocorrect dictionary
    _ADDED_WORD_FREQUENCY = 1000000
    # Pieces of text that pyspellchecker is allowed to correct (ASCII letters only)
    _ALPHABETIC_PATTERN = re.compile(r"[A-Za-z]+")

    def __init__(self, checker='autocorrect'):
        """
        SpellChecker object constructor

        Args:
            checker: str, optional, spelling corrector to use ('autocorrect',
                'pyspellchecker' or 'ContextualSpellCheck'); the name is case
                insensitive and any other value selects ContextualSpellCheck

        Returns:
            None
        """
        self.checker = checker.lower()
        self.addedWords = []
        self.includedWords = []
        if 'extra_vocab' in nlpConfig['files']:
            with open(nlpConfig['files']['extra_vocab'], 'r') as file:
                stripped = (line.strip() for line in file)
                # dict.fromkeys de-duplicates while keeping the file order; blank
                # lines are skipped so that '' never becomes a dictionary word
                self.addedWords = list(dict.fromkeys(word for word in stripped if word))
        # get included and additional dictionary words and update speller dictionary
        if self.checker == 'autocorrect':
            self.speller = autocorrect.Speller()
            self.speller.nlp_data.update(
                {word: self._ADDED_WORD_FREQUENCY for word in self.addedWords})
        elif self.checker == 'pyspellchecker':
            self.speller = PySpellChecker()
            self.speller.word_frequency.load_words(self.addedWords)
        else:
            name = 'contextual spellcheck'
            languageModel = nlpConfig['params']['spacy_language_pipeline']
            self.nlp = spacy.load(languageModel)
            self.speller = ContextualSpellCheck(self.nlp, name)
            self.includedWords = list(self.speller.BertTokenizer.get_vocab().keys())
            self.speller.vocab = Vocab(strings=self.includedWords + self.addedWords)

    def addWordsToDictionary(self, words):
        """
        Adds a list of words to the spell check dictionary

        The words are also appended to ``self.addedWords`` so that later calls
        (which rebuild the contextual vocabulary from scratch) do not lose them.

        Args:
            words: list, list of words to add to the dictionary

        Returns:
            None
        """
        newWords = list(words)
        self.addedWords.extend(newWords)
        if self.checker == 'autocorrect':
            self.speller.nlp_data.update(
                {word: self._ADDED_WORD_FREQUENCY for word in newWords})
        elif self.checker == 'pyspellchecker':
            # previously added words are already part of the frequency table
            self.speller.word_frequency.load_words(newWords)
        else:
            self.speller.vocab = Vocab(strings=self.includedWords + self.addedWords)

    def getMisspelledWords(self, text):
        """
        Returns the words that are misspelled according to the dictionary used

        Args:
            text: str, text to be checked

        Returns:
            misspelled: set, set of misspelled words
        """
        if self.checker == 'autocorrect':
            tokens = self._TOKEN_PATTERN.findall(text)
            # NOTE(review): the lookup is case sensitive, exactly as before; confirm
            # whether capitalized words should be looked up in lower case
            misspelled = {word for word in tokens if word not in self.speller.nlp_data}
        elif self.checker == 'pyspellchecker':
            tokens = self._TOKEN_PATTERN.findall(text)
            misspelled = self.speller.unknown(tokens)
        else:
            doc = self.speller(self.nlp(text))
            misspelled = {str(token) for token in doc._.suggestions_spellCheck.keys()}
        return misspelled

    def correct(self, text):
        """
        Performs automatic spelling correction and returns corrected text

        Args:
            text: str, text to be corrected

        Returns:
            corrected: str, spelling corrected text
        """
        if self.checker == 'autocorrect':
            corrected = self.speller(text)
        elif self.checker == 'pyspellchecker':
            # Split into alphabetic words (kept thanks to the capture group) and the
            # separators between them. A word is a candidate when it is followed by
            # whitespace, a period or the end of the text.
            pieces = re.split(r"([A-Za-z]+(?=\s|\.|$))", text)
            corrected = "".join(self._correctPiece(piece) for piece in pieces)
        else:
            doc = self.speller(self.nlp(text))
            corrected = doc._.outcome_spellCheck
        return corrected

    def _correctPiece(self, piece):
        """
        Returns the pyspellchecker correction of a single piece of text

        Args:
            piece: str, one element produced by splitting the text in `correct`

        Returns:
            piece: str, corrected word, or the piece unchanged when it is not a
                purely alphabetic word, is already known, or has no correction
        """
        if self._ALPHABETIC_PATTERN.fullmatch(piece) is None:
            return piece
        if piece in self.speller:
            return piece
        suggestion = self.speller.correction(piece)
        # correction() may return None when no candidate exists; keep the word
        return piece if suggestion is None else suggestion

    def handleAbbreviations(self, abbrDatabase, text, type):
        """
        Performs automatic correction of abbreviations and returns corrected text

        This method relies on a database of abbreviations located at:
        `src/nlp/data/abbreviations.xlsx`
        This database contains the most common abbreviations collected from
        literature and it provides for each abbreviation its corresponding full
        word(s); an abbreviation might have multiple words associated. In such
        case the full word that makes more sense given the context is chosen
        (see findOptimalOption method)

        Args:
            abbrDatabase: pandas dataframe, dataframe containing library of
                abbreviations ('Abbreviation' column) and their corresponding
                full expression ('Full' column)
            text: str, string of text that will be analyzed
            type: string, type of abbreviation method ('spellcheck','hard','mixed')
                that are employed to determine which words are abbreviations that
                need to be expanded

                * spellcheck: in this case spellchecker is used to identify words
                  that are not recognized
                * hard: here we directly search for the abbreviations in the
                  provided sentence
                * mixed: here we perform first a "hard" search followed by a
                  "spellcheck" search

        Returns:
            bestOpt: str, corrected text (the input text when nothing is expanded)

        Raises:
            ValueError: if `type` is not one of the supported methods
        """
        from difflib import SequenceMatcher
        # Column contents are extracted once instead of once per candidate word
        abbreviations = abbrDatabase['Abbreviation'].values
        fullForms = abbrDatabase['Full'].values.tolist()
        abbreviationSet = set(abbreviations)
        corrections = {}
        for word in self._collectUnknowns(text, type, abbreviationSet):
            lowered = word.lower()
            if lowered in abbreviationSet:
                candidates = [full for abbr, full in zip(abbreviations, fullForms)
                              if abbr == lowered]
            else:
                # Here we are addressing the fact that the abbreviation database
                # will never be complete. Given an abbreviation that is not part of
                # the abbreviation database, we are looking for a subset of
                # abbreviations of the database that are close enough (and consider
                # them as possible candidates)
                candidates = [full for abbr, full in zip(abbreviations, fullForms)
                              if SequenceMatcher(None, word, abbr).ratio() > 0.8]
            if candidates:
                corrections[word] = candidates
        return self._expandAndSelect(text, corrections)

    def generateAbbrDict(self, abbrDatabase):
        """
        Generates an AbbrDict that can be used by handleAbbreviationsDict

        Args:
            abbrDatabase: pandas dataframe, dataframe containing library of
                abbreviations and their corresponding full expression

        Returns:
            abbrDict: dictionary, an abbreviations dictionary mapping each
                abbreviation to the list of its full expressions
        """
        abbrDict = {}
        for row in abbrDatabase.itertuples():
            abbrDict.setdefault(row.Abbreviation, []).append(row.Full)
        return abbrDict

    def handleAbbreviationsDict(self, abbrDict, text, type):
        """
        Performs automatic correction of abbreviations and returns corrected text

        This method relies on a database of abbreviations located at:
        src/nlp/data/abbreviations.xlsx
        This database contains the most common abbreviations collected from
        literature and it provides for each abbreviation its corresponding full
        word(s); an abbreviation might have multiple words associated. In such
        case the full word that makes more sense given the context is chosen
        (see findOptimalOption method)

        Args:
            abbrDict: dictionary, dictionary containing library of abbreviations
                and their corresponding full expression
            text: str, string of text that will be analyzed
            type: string, type of abbreviation method ('spellcheck','hard','mixed')
                that are employed to determine which words are abbreviations that
                need to be expanded

                * spellcheck: in this case spellchecker is used to identify words
                  that are not recognized
                * hard: here we directly search for the abbreviations in the
                  provided sentence
                * mixed: here we perform first a "hard" search followed by a
                  "spellcheck" search

        Returns:
            bestOpt: str, corrected text (the input text when nothing is expanded)

        Raises:
            ValueError: if `type` is not one of the supported methods
        """
        from difflib import SequenceMatcher
        corrections = {}
        for word in self._collectUnknowns(text, type, abbrDict):
            lowered = word.lower()
            if lowered in abbrDict:
                if len(abbrDict[lowered]) > 0:
                    corrections[word] = abbrDict[lowered]
                continue
            # Here we are addressing the fact that the abbreviation database will
            # never be complete. Given an abbreviation that is not part of the
            # abbreviation database, we pick the closest known abbreviation,
            # provided that it is similar enough.
            bestRatio = 0
            bestFull = []
            for abbr in abbrDict:
                ratio = SequenceMatcher(None, word, abbr).ratio()
                # the running maximum must outlive the iteration, otherwise the
                # last acceptable abbreviation wins instead of the best one
                if ratio >= 0.75 and ratio > bestRatio:
                    bestRatio = ratio
                    bestFull = abbrDict[abbr]
            if bestFull:
                corrections[word] = bestFull
        return self._expandAndSelect(text, corrections)

    def _collectUnknowns(self, text, type, abbreviations):
        """
        Identifies the words of the text that are candidates for expansion

        Args:
            text: str, string of text that will be analyzed
            type: string, abbreviation method ('spellcheck','hard','mixed')
            abbreviations: container, known (lower case) abbreviations; only
                membership tests are performed on it

        Returns:
            unknowns: list, candidate words without duplicates, in a
                deterministic order

        Raises:
            ValueError: if `type` is not one of the supported methods
        """
        if type not in ('spellcheck', 'hard', 'mixed'):
            raise ValueError("Unknown abbreviation handling type '%s': expected "
                             "'spellcheck', 'hard' or 'mixed'" % (type,))
        if type == 'spellcheck':
            # sorted so that the outcome does not depend on set iteration order
            return sorted(self.getMisspelledWords(text))
        found = [word for word in text.split() if word.lower() in abbreviations]
        if type == 'hard':
            return list(dict.fromkeys(found))
        return sorted(set(self.getMisspelledWords(text)).union(found))

    def _expandAndSelect(self, text, corrections):
        """
        Builds every corrected variant of the text and returns the best one

        Args:
            text: str, original text
            corrections: dict, maps each word to replace to the non-empty list of
                its candidate replacements

        Returns:
            bestOpt: str, the input text when there is nothing to replace,
                otherwise the variant selected by findOptimalOption
        """
        if not corrections:
            return text
        # The word is escaped because it is arbitrary text, not a regular expression
        patterns = [re.compile(r"\b%s\b" % re.escape(str(key))) for key in corrections]
        options = []
        for comb in itertools.product(*corrections.values()):
            corrected = text
            for pattern, replacement in zip(patterns, comb):
                # a callable replacement inserts the text literally (no processing
                # of backslash escapes or group references)
                corrected = pattern.sub(lambda _match, repl=replacement: repl, corrected)
            options.append(corrected)
        return self.findOptimalOption(options)

    def findOptimalOption(self, options):
        """
        Method to handle abbreviation with multiple meanings

        Each option is scored with the sum of wordsSimilarity over all pairs of its
        whitespace separated words; the first option with the highest score wins.

        Args:
            options: list, list of sentence options

        Returns:
            optimalOpt: string, option from the provided options list that has
                the highest score
        """
        if len(options) == 1:
            # nothing to compare: skip the quadratic similarity computation
            return options[0]
        combScore = np.zeros(len(options))
        for index, opt in enumerate(options):
            combScore[index] = sum(wordsSimilarity(first, second)
                                   for first, second in itertools.combinations(opt.split(), 2))
        optimalOpt = options[int(np.argmax(combScore))]
        return optimalOpt