## Abbreviation handler demo 

In [1]:
import numpy as np
import pandas as pd
import os, sys, time

# Put the ``src`` directory that sits next to the notebook's working directory
# on the import path (presumably where the ``dackar`` package imported below
# lives -- confirm against the repository layout).
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
# Guard the append so that re-running this cell in a live kernel does not keep
# stacking duplicate entries onto sys.path.
if frameworkDir not in sys.path:
    sys.path.append(frameworkDir)

from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.Preprocessing import SpellChecker
from dackar.text_processing.Preprocessing import AbbrExpander


  from .autonotebook import tqdm as notebook_tqdm
Warming up PyWSD (takes ~10 secs)... took 4.628759145736694 secs.


In [2]:
# Abbreviation-heavy sample sentences used as input for the expansion demos.
# Pre-processing: the whole block is lower-cased as soon as it is defined.
test = """Perf ann sens calib of cyl.
          High conc of hydrogen obs.
          High conc of hydrogen obs every wk.
          Prfr chann calib of chan.
          esf pump room and fuel bldg test.
          cal press xmtr sit elev.
          perform thermography survey of pzr htr terminations.
          plant mods comp iso mode prep.
          drain & rmv pipe.""".lower()


In [3]:
# Plain-English sample sentences (one per line) used for the timing cells.
# The block is lower-cased as soon as it is defined; note that it ends with a
# newline because the closing quotes sit on their own line.
text = """A leak was noticed from the pump.
            RCP pump 1A pressure gauge was found not operating.
            RCP pump 1A pressure gauge was found inoperative.
            RCP pump 1A pressure gauge was not functional.
            Rupture of pump bearings caused shaft degradation.
            Rupture of pump bearings caused shaft degradation and consequent flow reduction.
            Pump power supply has been found burnout.
            Pump test failed due to power supply failure.
            Pump inspection revealed excessive impeller degradation.
            Pump inspection revealed excessive impeller degradation likely due to cavitation.
            Oil puddle was found in proximity of RCP pump 1A.
            Anomalous vibrations were observed for RCP pump 1A.
            Several cracks on pump shaft were observed; they could have caused pump failure within few days.
""".lower()


In [4]:
# Read the abbreviation spreadsheet into a DataFrame.
abbrList = pd.read_excel('../data/abbreviations.xlsx')
# Sanity check for duplicate rows: flag every row that repeats an earlier one
# and print just those rows (an empty frame means there are no duplicates).
duplicatedMask = abbrList.duplicated()
print(abbrList[duplicatedMask])


    Abbreviation     Full
614          brg  bearing
630         rplc  replace


In [5]:
# Build the expander from the abbreviation spreadsheet under ../data and run
# it over the abbreviation-heavy ``test`` sentences, printing the result.
filename = os.path.join(os.getcwd(), os.pardir, 'data', 'abbreviations.xlsx')
AbbrExp = AbbrExpander(filename)
# NOTE(review): splitToList is given the *string* 'True', not the boolean True.
# Any non-empty string (even 'False') is truthy, so this only works if
# abbrProcess does a truthiness check -- confirm the expected type.
cleanedText = AbbrExp.abbrProcess(test, splitToList='True')
print(cleanedText)


perform annual sensor calibration of cylinder. high concentration of hydrogen observe. high concentration of hydrogen observe every week. perform channel calibration of channel. esf pump room and fuel building test. calibration pressure transmitter sit elevation. perform thermography survey of pressurizer heater terminations. plant modifications composition iso mode preparation. drain and remove pipe. 


In [6]:
from dackar.text_processing.Preprocessing import AbbrExpander
import os
import time

# Re-create the expander from the same spreadsheet path used earlier, so the
# timing cells below run against a freshly constructed instance.
# NOTE(review): presumably done to avoid carrying over state from the previous
# demo -- confirm whether AbbrExpander actually keeps any per-instance state.
filename = os.path.join(os.getcwd(), os.pardir, 'data', 'abbreviations.xlsx')
AbbrExp = AbbrExpander(filename)


In [7]:
# Time one call that expands the whole multi-line ``text`` in a single pass
# (default arguments, i.e. without ``splitToList``).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-measurement.
st = time.perf_counter()
cleanedText = AbbrExp.abbrProcess(text)
et = time.perf_counter()
print('Time elapse:', et-st)


Time elapse: 8.979063749313354


In [8]:
# Time the alternative strategy: expand abbreviations one sentence at a time.
# ``text`` ends with a newline, so split('\n') produces a trailing empty
# string; blank entries are filtered out so no empty "sentence" is processed
# (and timed) for nothing.
textList = [line.strip() for line in text.split('\n') if line.strip()]
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
st = time.perf_counter()
newList = [AbbrExp.abbrProcess(sentence) for sentence in textList]
et = time.perf_counter()
print('Time elapse:', et-st)


Time elapse: 0.7309439182281494


In [9]:
# Time the expander's own sentence-splitting mode on the full ``text``.
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
st = time.perf_counter()
# NOTE(review): splitToList receives the string 'True' rather than the boolean
# True; any non-empty string is truthy -- confirm the type abbrProcess expects.
cleanedText2 = AbbrExp.abbrProcess(text, splitToList='True')
# Stop the clock before printing so console I/O is not counted as processing
# time; the printed order (text first, then timing) is unchanged.
et = time.perf_counter()
print(cleanedText2)
print('Time elapse:', et-st)


1 leak was noticed from the pump. rcp pump 1a pressure gauge was found not operating. rcp pump 1a pressure gauge was found inoperative. rcp pump 1a pressure gauge was not functional. rupture of pump bearings caused shaft degradation. rupture of pump bearings caused shaft degradation and consequent flow reduction. pump power supply has been found burnout. pump test failed due to power supply failure. pump inspection revealed excessive impeller degradation. pump inspection revealed excessive impeller degradation likely due to cavitation. oil puddle was found in proximity of rcp pump 1a. anomalous vibrations were observed for rcp pump 1a. several cracks on pump shaft were observed; they could have caused pump failure within few days. 
Time elapse: 0.858644962310791


### Abbreviation handling class 

In [10]:
from dackar.text_processing.Abbreviation import Abbreviation

# Create the abbreviation handler with no arguments (its default setup) and
# print the abbreviation -> expansion mapping it currently holds.
abbreviation = Abbreviation()
abbrDict = abbreviation.getAbbreviation()
# NOTE(review): this prints the entire mapping, which is long; consider
# showing only its size or a few entries to keep the notebook output readable.
print(abbrDict)

{'&': 'and', 'ab': 'as built', 'abl': 'ablative', 'abol': 'abolition', 'abs': 'absolute', 'absol': 'absolute', 'abst': 'abstract', 'abstr': 'abstract', 'accep': 'acceptance', 'accom': 'accomodation', 'accomm': 'accomodation', 'admin': 'administrative', 'adv': 'advanced', 'afl': 'above floor level\xa0', 'agl': 'above ground level', 'agst': 'against', 'ah': 'after hours', 'amer': 'american', 'anal': 'analysis', 'analyt': 'analytic', 'ann': 'annual', 'answ': 'answer', 'app': 'apperently', 'approx': 'approximate', 'appt': 'appointment', 'apr': 'april', 'aql': 'acceptable quality level', 'ar': 'as required', 'arch': 'architecture', 'arrgt': 'arrangement', 'artic': 'articulation', 'asap': 'as soon as possible', 'ass': 'assembly', 'assem': 'assembly', 'assy': 'assembly', 'attrib': 'attribute', 'aug': 'august', 'auto': 'automatic', 'aux': 'auxiliary', 'avg': 'average', 'batt': 'battery', 'bc': 'bolt circle', 'bef': 'before', 'betw': 'between', 'bhc': 'bolt hole circle', 'bldg': 'building', 'bl

In [11]:
# Substitute abbreviations in ``test`` using the handler's current mapping
# (still the one it was constructed with at this point) and print the result.
# This overwrites the ``cleanedText`` produced by the AbbrExpander cells above.
cleanedText = abbreviation.abbreviationSub(test)
print(cleanedText)

perform annual sensor calibration of cylinder. high concentration of hydrogen observe. high concentration of hydrogen observe every work. prfr channel calibration of channel. esf pump room and fuel building test. calibration pressure transmitter sit elevation. perform thermography survey of pressurizer heater terminations. plant modifications composite iso mode prepare. drain & remove pipe. 


In [12]:
# Display the raw (lower-cased) input string for side-by-side comparison with
# the expanded text printed by the previous cell.
test

'perf ann sens calib of cyl.\n          high conc of hydrogen obs.\n          high conc of hydrogen obs every wk.\n          prfr chann calib of chan.\n          esf pump room and fuel bldg test.\n          cal press xmtr sit elev.\n          perform thermography survey of pzr htr terminations.\n          plant mods comp iso mode prep.\n          drain & rmv pipe.'

In [13]:
# Swap in a small custom dictionary and re-run the substitution on ``test`` to
# show the effect of a user-supplied abbreviation list.
abbrDict = {
    'perf': 'perform',
    'ann': 'annual',
    'sens': 'sensor',
    'calib': 'calibration',
}
# reset=True: judging by the mapping printed right after, the custom entries
# replace the existing ones instead of being merged into them
# (NOTE(review): confirm against updateAbbreviation's documentation).
abbreviation.updateAbbreviation(abbrDict, reset=True)
print(abbreviation.getAbbreviation())
cleanedText = abbreviation.abbreviationSub(test)
print(cleanedText)

{'perf': 'perform', 'ann': 'annual', 'sens': 'sensor', 'calib': 'calibration'}
perform annual sensor calibration of cyl. high conc of hydrogen obs. high conc of hydrogen obs every wk. prfr chann calibration of chan. esf pump room and fuel bldg test. cal press xmtr sit elev. perform thermography survey of pzr htr terminations. plant mods comp iso mode prep. drain & rmv pipe. 
