Abbreviation handler demo

[1]:
import numpy as np
import pandas as pd
import os, sys, time

# Make the `src` directory that sits one level above the working directory
# importable (presumably where the `dackar` package lives -- the imports that
# follow rely on this path being present).
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if frameworkDir not in sys.path:
    sys.path.append(frameworkDir)

from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.Preprocessing import SpellChecker
from dackar.text_processing.Preprocessing import AbbrExpander

/Users/wangc/miniconda3/envs/dackar_libs/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Warming up PyWSD (takes ~10 secs)... took 4.628759145736694 secs.
[2]:
# Sample maintenance-style notes written with heavy abbreviation.
# Pre-processing: the whole sample is lower-cased right where it is defined.
test = """Perf ann sens calib of cyl.
          High conc of hydrogen obs.
          High conc of hydrogen obs every wk.
          Prfr chann calib of chan.
          esf pump room and fuel bldg test.
          cal press xmtr sit elev.
          perform thermography survey of pzr htr terminations.
          plant mods comp iso mode prep.
          drain & rmv pipe.""".lower()

[3]:
# Longer sample: one plain-English pump event description per line.
# The literal ends with a newline and is lower-cased where it is defined.
text = """A leak was noticed from the pump.
            RCP pump 1A pressure gauge was found not operating.
            RCP pump 1A pressure gauge was found inoperative.
            RCP pump 1A pressure gauge was not functional.
            Rupture of pump bearings caused shaft degradation.
            Rupture of pump bearings caused shaft degradation and consequent flow reduction.
            Pump power supply has been found burnout.
            Pump test failed due to power supply failure.
            Pump inspection revealed excessive impeller degradation.
            Pump inspection revealed excessive impeller degradation likely due to cavitation.
            Oil puddle was found in proximity of RCP pump 1A.
            Anomalous vibrations were observed for RCP pump 1A.
            Several cracks on pump shaft were observed; they could have caused pump failure within few days.
""".lower()

[4]:
# Read the abbreviation table from the spreadsheet under ../data
abbrList = pd.read_excel('../data/abbreviations.xlsx')
# Show every row that repeats an earlier one, so duplicated entries in the
# library are easy to spot
print(abbrList.loc[abbrList.duplicated()])

    Abbreviation     Full
614          brg  bearing
630         rplc  replace
[5]:
# Path to the abbreviation spreadsheet, built relative to the working directory
filename = os.path.join(os.getcwd(), os.pardir, 'data', 'abbreviations.xlsx')
AbbrExp = AbbrExpander(filename)
# Pass a real boolean for the flag. The previous value was the string 'True',
# which only enabled the option because any non-empty string is truthy
# (the string 'False' would have enabled it as well).
# NOTE(review): assumes abbrProcess tests splitToList for truthiness -- confirm.
cleanedText = AbbrExp.abbrProcess(test, splitToList=True)
print(cleanedText)

perform annual sensor calibration of cylinder. high concentration of hydrogen observe. high concentration of hydrogen observe every week. perform channel calibration of channel. esf pump room and fuel building test. calibration pressure transmitter sit elevation. perform thermography survey of pressurizer heater terminations. plant modifications composition iso mode preparation. drain and remove pipe.
[6]:
from dackar.text_processing.Preprocessing import AbbrExpander
import os
import time

# Rebuild the spreadsheet path and construct a new expander, rebinding the
# `AbbrExp` name for the timing cells that follow.
filename = os.path.join(
    os.getcwd(),
    os.pardir,
    'data',
    'abbreviations.xlsx',
)
AbbrExp = AbbrExpander(filename)

[7]:
# Time one abbrProcess call over the whole multi-line `text` string.
# perf_counter() is the monotonic, high-resolution clock intended for
# measuring short intervals; time.time() follows the wall clock and can jump
# if the system time is adjusted.
st = time.perf_counter()
cleanedText = AbbrExp.abbrProcess(text)
et = time.perf_counter()
print('Time elapse:', et-st)

Time elapse: 8.979063749313354
[8]:
# Split `text` into one stripped sentence per line and expand each sentence
# with its own abbrProcess call. Blank entries -- such as the empty string left
# behind by the trailing newline of `text` -- are dropped so no call is spent
# on empty input.
textList = [t.strip() for t in text.split('\n') if t.strip()]
newList = []
# perf_counter() is the monotonic, high-resolution clock intended for
# measuring short intervals; only the expansion loop is timed.
st = time.perf_counter()
for t in textList:
    cleanedT = AbbrExp.abbrProcess(t)
    newList.append(cleanedT)
et = time.perf_counter()
print('Time elapse:', et-st)

Time elapse: 0.7309439182281494
[9]:
# Time abbrProcess with the splitToList option enabled.
# perf_counter() is the monotonic, high-resolution clock intended for
# measuring short intervals.
st = time.perf_counter()
# Pass a real boolean for the flag. The previous value was the string 'True',
# which only enabled the option because any non-empty string is truthy.
# NOTE(review): assumes abbrProcess tests splitToList for truthiness -- confirm.
cleanedText2 = AbbrExp.abbrProcess(text, splitToList=True)
# Stop the clock before printing so the measurement covers the expansion
# only, not the console output.
et = time.perf_counter()
print(cleanedText2)
print('Time elapse:', et-st)

1 leak was noticed from the pump. rcp pump 1a pressure gauge was found not operating. rcp pump 1a pressure gauge was found inoperative. rcp pump 1a pressure gauge was not functional. rupture of pump bearings caused shaft degradation. rupture of pump bearings caused shaft degradation and consequent flow reduction. pump power supply has been found burnout. pump test failed due to power supply failure. pump inspection revealed excessive impeller degradation. pump inspection revealed excessive impeller degradation likely due to cavitation. oil puddle was found in proximity of rcp pump 1a. anomalous vibrations were observed for rcp pump 1a. several cracks on pump shaft were observed; they could have caused pump failure within few days.
Time elapse: 0.858644962310791

Abbreviation handling class

[10]:
from dackar.text_processing.Abbreviation import Abbreviation

# Create the abbreviation handler (no constructor arguments) and fetch the
# abbreviation -> full-form dictionary it currently holds
abbreviation = Abbreviation()
abbrDict = abbreviation.getAbbreviation()
# Prints the complete dictionary, so the output is long
print(abbrDict)
{'&': 'and', 'ab': 'as built', 'abl': 'ablative', 'abol': 'abolition', 'abs': 'absolute', 'absol': 'absolute', 'abst': 'abstract', 'abstr': 'abstract', 'accep': 'acceptance', 'accom': 'accomodation', 'accomm': 'accomodation', 'admin': 'administrative', 'adv': 'advanced', 'afl': 'above floor level\xa0', 'agl': 'above ground level', 'agst': 'against', 'ah': 'after hours', 'amer': 'american', 'anal': 'analysis', 'analyt': 'analytic', 'ann': 'annual', 'answ': 'answer', 'app': 'apperently', 'approx': 'approximate', 'appt': 'appointment', 'apr': 'april', 'aql': 'acceptable quality level', 'ar': 'as required', 'arch': 'architecture', 'arrgt': 'arrangement', 'artic': 'articulation', 'asap': 'as soon as possible', 'ass': 'assembly', 'assem': 'assembly', 'assy': 'assembly', 'attrib': 'attribute', 'aug': 'august', 'auto': 'automatic', 'aux': 'auxiliary', 'avg': 'average', 'batt': 'battery', 'bc': 'bolt circle', 'bef': 'before', 'betw': 'between', 'bhc': 'bolt hole circle', 'bldg': 'building', 'blk': 'block', 'bof': 'bottom of foundation', 'bom': 'bill of materials', 'bord': 'border', 'bot': 'bottom', 'bp': 'blueprint', 'brg': 'bearing', 'brz': 'bronze', 'bsc': 'basic', 'btm': 'bottom', 'btw': 'between', 'btwn': 'between', 'bw': 'both ways', 'c-c': 'center to center', 'c/o': 'care of', 'cad': 'computer-aided design', 'cal': 'calibration', 'calib': 'calibration', 'calibr': 'calibration', 'cap': 'capacity', 'cat': 'category', 'catal': 'catalogue', 'cda': 'current design activity', 'cent': 'centrifugal', 'centrif': 'centrifugal', 'cert': 'certificate', 'certif': 'certificate', 'cfw': 'continuous fillet weld', 'ch': 'channel', 'cham': 'chamfer', 'chamb': 'chamber', 'chan': 'channel', 'chann': 'channel', 'chap': 'chapter', 'chem': 'chemistry', 'circ': 'circulation', 'cl': 'center line', 'classif': 'classification', 'clr': 'clearance', 'cmu': 'cement masonry unit', 'cndct': 'conductivity', 'cntmnt': 'containment', 'cntmt': 'containment', 'coef ': 'coefficient', 'coeff': 
'coefficient', 'col': 'column', 'comb': 'combination', 'comm': 'communications', 'comms': 'communications', 'communic': 'communication', 'comp': 'composite', 'conc': 'concentration', 'concent': 'concentration', 'concr': 'concrete', 'cond': 'conduct', 'conn': 'connection', 'const': 'constant', 'cont': 'continuous', 'coupl': 'coupling', 'cr': 'controlled radius', 'crnr': 'corner', 'ct': 'controller', 'ctmt': 'containment', 'ctr': 'center ', 'ctrl': 'control', 'cvr': 'cover', 'cyl': 'cylinder', 'dec': 'december', 'decon': 'decontamination', 'def': 'definition', 'dept': 'department', 'descr': 'description', 'det': 'determine', 'determ': 'determine', 'dia': 'diameter ', 'diam': 'diameter', 'diaph': 'diaphragm', 'dir': 'directorate', 'discov': 'discovery', 'disp': 'displacement', 'displ': 'displacement', 'distill': 'distillation', 'diy': 'do it yourself', 'dn': 'diameter nominal', 'doz': 'dozen', 'dp': 'downpipe', 'dwg': 'drawing', 'econ': 'economic', 'eff': 'efficiency', 'elec ': 'electrical', 'electr': 'electric', 'elem': 'element', 'elev': 'elevation', 'elv': 'elevation', 'eng': 'engineer', 'engin': 'engineering', 'engr': 'engineer', 'engrg': 'engineering', 'eq ': 'equal', 'eql': 'equal', 'equip': 'equipment', 'equiv': 'equivalent', 'esp': 'especially', 'est': 'established', 'establ': 'established', 'eta': 'estimated time of arrival', 'etc': 'etcetera', 'evid': 'evidence', 'ew': 'each way', 'exec': 'execution', 'expl': 'explanation', 'explan': 'explanation', 'ext': 'external', 'fam': 'familiar', 'famil': 'familiar', 'feb': 'february', 'fig': 'figure', 'fitt': 'fitting', 'fl': 'fluid', 'flg': 'flange', 'flng': 'flange', 'flnge': 'flange', 'flngs': 'flanges', 'fob': 'free on board', 'fos': 'factor of safety', 'freq': 'frequency', 'fs': 'far side', 'fsbw': 'full strength butt weld', 'ftg': 'fitting', 'ftp': 'file transfer protocol', 'fut': 'future', 'fw': 'feedwater', 'galv': 'galvanized', 'gen': 'general', 'geo': 'geography', 'geog': 'geography', 'gloss': 'glossary', 
'gn': 'general note', 'gov': 'government', 'govt': 'government', 'hd': 'head', 'hex': 'hexagon', 'hist ': 'historical', 'hor': 'horizontal', 'horiz': 'horizontal', 'hp': 'high pressure', 'hr': 'human resources', 'hs': 'high strength', 'hw': 'hardware', 'hyd': 'hydraulic', 'hydr': 'hydraulic', 'hydr ': 'hydraulic', 'hydraul': 'hydraulic', 'i/o': 'input and output', 'iaw': 'in accordance with', 'id': 'indentity', 'imit': 'imitation', 'imp': 'implementation', 'impf': 'imperfect', 'impr': 'improved', 'ind': 'induction', 'indef': 'indefinite', 'indic': 'indicator', 'indir': 'indirect', 'indust': 'industrial', 'industr': 'industrial', 'inf': 'infinite', 'infl': 'influence', 'inj': 'injection', 'inorg': 'inorganic', 'inq': 'inquiry', 'insp': 'inspection', 'inspec': 'inspection', 'inst': 'instrumental', 'instl': 'install', 'instr': 'instruction', 'insul': 'insulation', 'int ': 'internal', 'interj': 'interjection', 'intl': 'internal', 'intro': 'introduction', 'introd': 'introduction', 'inv': 'inverter', 'io': 'inspection opening', 'jan': 'jannuary', 'jt': 'joint', 'junc': 'junction', 'kj': 'key joint', 'lab': 'laboratory', 'lf': 'left hand', 'lg': 'length', 'lh': 'left hand', 'lhs': 'left hand side', 'lm': 'list of material', 'ln': 'line', 'lp': 'low pressure', 'm/c': 'machine', 'mach': 'machine', 'mag': 'magnetic', 'magn': 'magnetic', 'maj': 'major', 'man': 'manual', 'matl': 'material', 'max': 'maximum', 'mbp': 'measurement between pins', 'mbw': 'measurement between wires', 'meas': 'measure', 'measurem': 'measurement', 'mech': 'mechanical', 'mem': 'memory', 'metall': 'metallurgy', 'meth': 'method', 'mf': 'make from', 'mfd': 'manufactured', 'mfg': 'manufacturing', 'mfr': 'manufacturer', 'mgmt': 'management', 'mgt': 'management', 'min': 'minimum', 'misc': 'miscellaneous', 'mngmnt': 'management', 'mngmt': 'management', 'mngt': 'management', 'mod': 'modification', 'mom': 'moment', 'mop': 'measurement between pins', 'mow': 'measurement between wires', 'mrp': 'material 
requirements planning', 'ms': 'mild steep', 'mtg': 'meeting', 'nat': 'natural', 'nc': 'numerical control', 'ncm': 'nonconforming material', 'ncr': 'nonconformance report', 'ne': 'north-east', 'neg': 'negative', 'net': 'network', 'nl': 'note list', 'no': 'number', 'nom': 'nominal', 'norm': 'normalized', 'normd': 'normalized', 'nov': 'November', 'noz': 'nozzle', 'npr': 'new product release', 'ns': 'nominal size', 'nts': 'not to scale', 'nuc': 'nuclear', 'nucl': 'nuclear', 'nw': 'north-west', 'obj': 'object', 'Obs': 'observation', 'obs': 'observe', 'observ': 'observation', 'obsrv': 'observe', 'occ': 'occurrence', 'occas': 'occasionally', 'occurr': 'occurrence', 'oct': 'octagon', 'od': 'outer diameter', 'op': 'operation', 'oper': 'operator', 'opp': 'opposed', 'ops': 'operations', 'opt': 'option', 'ord': 'order', 'org': 'organization', 'orig': 'original', 'parab': 'parabolic', 'patt': 'pattern', 'pc ': 'piece', 'pcd': 'pitch circle diameter', 'pck': 'pack', 'pcs': 'pieces', 'perf': 'perform', 'perh': 'perhaps', 'period': 'periodic', 'pers': 'personal', 'pfc': 'parallel flange channel', 'pl': 'plural', 'plm': 'plant lifecycle management', 'plur': 'plural', 'pmp': 'pump', 'pn': 'part number', 'poi': 'point of intersection', 'pop': 'popular', 'pos': 'positive', 'posn': 'position', 'pract': 'practice', 'pred': 'prediction', 'predic': 'prediction', 'predict': 'prediction', 'pref': 'prefix', 'prep': 'prepare', 'preps': 'preparations', 'pres': 'pressure', 'press': 'pressure', 'prob': 'probably', 'probab': 'probability', 'probl': 'problem', 'prog': 'progress', 'prop': 'property', 'prox': 'proximity', 'pwr': 'power', 'pzr': 'pressurizer', 'qc': 'quality control', 'qnty': 'quantity', 'qty': 'quantity', 'rad': 'radiation', 'rd': 'round', 'reas': 'reason', 'rec': 'recirculation', 'recirc': 'recirculation', 'rect': 'rectangular', 'red': 'reduce', 'ref': 'reference', 'refr': 'reference', 'refurb': 'refurbish', 'regist': 'register', 'regr': 'regression', 'reinf': 'reinforcement', 
'reinf ': 'reinforce ', 'reinst': 'reinstall', 'rel': 'relative', 'remv': 'remove', 'rep': 'report', 'repck': 'repack', 'rept': 'report', 'req': 'required', 'reqd': 'required', 'res': 'research', 'resrvr': 'reservoir', 'reterm': 'retermination', 'rev': 'revised', 'rh': 'right hand ', 'rhs': 'right hand side', 'rms': 'root mean square', 'rmv': 'remove', 'rnd': 'round', 'rsvr': 'reservoir', 'rx': 'reactor', 's/g': 'steam generator', 'sec': 'security', 'sect': 'section', 'sel': 'selected', 'select': 'selection', 'sens': 'sensor', 'sept': 'september', 'sg': 'steam generator', 'sh': 'sheet', 'shcs': 'socket head cap screw', 'shss': 'socket head set screw', 'shwn': 'shown', 'sim': 'simulation', 'sk': 'sketch', 'sn': 'serial number', 'spec': 'specimen', 'sq': 'square', 'ss': 'stainless steel', 'stat': 'statistical', 'statist': 'statistical', 'std': 'standard', 'stdrd': 'standard', 'stk': 'stock', 'str': 'strong', 'struct': 'structural', 'subj': 'subject', 'supp': 'suppression', 'suppress': 'suppression', 'surf': 'surface', 'surv': 'survey', 'susp': 'suspension', 'sw': 'switchyard', 'swch': 'switch', 'symm': 'symmetry', 'sys': 'system', 'syst': 'system', 'tc': 'tungsten carbide', 'tdp': 'technical data package', 'tech': 'technology', 'tech specs': 'technical specifications', 'techn': 'technical', 'technol': 'technology', 'tel': 'telephone', 'telecom': 'telecommunications', 'telecomm': 'telecommunications', 'temp': 'temporary', 'thd': 'thread', 'theor': 'theoretical', 'theoret': 'theoretical', 'thk': 'thick', 'thru': 'through', 'tir': 'total indicator reading', 'tol': 'tolerance', 'toll': 'tollerance', 'trans': 'transaction', 'transf': 'transformer', 'transl': 'translation', 'transm': 'transmission', 'treat': 'treatment', 'treatm': 'treatment', 'troub': 'troublesome', 'ty': 'type', 'typ': 'typical', 'ua': 'unequal angle', 'uai': 'use as is', 'ub': 'universal beam', 'ucut': 'undercut', 'ull': 'under low limit', 'unk': 'unknown', 'unkn': 'unknown', 'unkw': 'unknown', 'unkwn': 
'unknown', 'uno': 'unless noted otherwise', 'uon': 'unless otherwise noted', 'uos': 'unless otherwise specified', 'vac': 'vacuum', 'var': 'variable', 'vb': 'verb', 'ver': 'vertical', 'vert': 'vertical', 'vis ': 'visual', 'vlv': 'valve', 'vol': 'volume', 'vs': 'versus', 'vsl': 'vessel', 'w/': 'with', 'w/i': 'within', 'w/o': 'without', 'wc': 'welded column', 'wgt': 'weight', 'wk': 'work', 'wks': 'works', 'wt': 'weight', 'wtr': 'water', 'yd': 'yard', 'yp': 'yield point', 'yr': 'year', 'yrs': 'years', 'emb': 'embrittlement', 'env': 'environment', 'fat': 'fatigue', 'wstg': 'wastage', 'degr': 'degradation', 'deg': 'degradation', 'wst': 'waste ', 'chk': 'check', 'xmtr': 'transmitter', 'htr': 'heater', 'retest': 'test', 'verif': 'verify', 'atmos': 'atmospheric', 'cntl': 'control', 'cntrl': 'control', 'wshr': 'washer', 'vent': 'ventilation', 'xfer': 'transfer', 'purif': 'purification', 'hx': 'heat exchanger', 'scaff': 'scaffolding', 'scaf ': 'scaffolding', 'cbl': 'cable', 'cbls': 'cables', 'discnt': 'disconnect', 'dscnt': 'disconnect', 'dscnct': 'disconnect', 'regen': 'regenerate', 'recomb': 'recombine', 'depress': 'depressurize', 'demin': 'demineralize', 'strk': 'stroke', 'cntrld': 'controlled', 'dwncmr': 'downcomer', 'drn': 'drain', 'fdwtr': 'feed water', 'turb': 'turbine', 'emerg': 'emergency', 'alt': 'alternator', 'feedwater': 'feed water', 'shft': 'shaft', 'lo': 'low', 'hi': 'high', 'flw': 'flow', 'funct': 'functional', 'trng': 'training', 'tk': 'tank', 'tks': 'tanks', 'isol': 'isolate', 'suct': 'suction', 'tnk': 'tank', 'tnks': 'tanks', 'disch': 'discharge', 'emrg': 'emergency', 'islt': 'isolate', 'xfmr': 'transformer', 'xfrmr': 'transformer', 'stby': 'stand by', 'assmby': 'assembly', 'vrfy': 'verify', 'rls': 'release', 'lvl': 'level', 'deten': 'detension', 'rmve': 'remove', 'hrs': 'hours', 'perfrm': 'perform', 'sched': 'schedule', 'mods': 'modifications', 'cw': 'circulating water', 'rplc': 'replace', 'lwr': 'lower', "req'd": 'required', 'scrn': 'screen', 'wsh': 
'wash', 'mtr': 'motor', 'trblsht': 'troubleshoot', 'ht': 'heat', 'oos': 'out of service', 'spre': 'spare', 'otbd': 'outboard', 'inbd': 'inboard', 'underwtr': 'underwater', 'vib': 'vibration', 'vibs': 'vibrations', 'rplce': 'replace', 'maint': 'maintenance', 'mntnc': 'maintenance', 'mainten': 'maintenance', 'maintenan': 'maintenance', 'mainte': 'maintenance', 'chng': 'change', 'inop': 'inoperative', 'perma': 'permanent', 'trav': 'traveling', 'vibes': 'vibrations', 'crac': 'crack', 'stab': 'stabilize'}
[11]:
# Substitute the abbreviations in `test` using the handler's current dictionary
cleanedText = abbreviation.abbreviationSub(test)
print(cleanedText)
perform annual sensor calibration of cylinder. high concentration of hydrogen observe. high concentration of hydrogen observe every work. prfr channel calibration of channel. esf pump room and fuel building test. calibration pressure transmitter sit elevation. perform thermography survey of pressurizer heater terminations. plant modifications composite iso mode prepare. drain & remove pipe.
[12]:
# Display the input string for comparison with the expanded text
test
[12]:
'perf ann sens calib of cyl.\n          high conc of hydrogen obs.\n          high conc of hydrogen obs every wk.\n          prfr chann calib of chan.\n          esf pump room and fuel bldg test.\n          cal press xmtr sit elev.\n          perform thermography survey of pzr htr terminations.\n          plant mods comp iso mode prep.\n          drain & rmv pipe.'
[13]:
# Swap in a small custom dictionary. With reset=True the handler ends up
# holding only these four entries, as the printed dictionary shows.
abbrDict = {
    'perf': 'perform',
    'ann': 'annual',
    'sens': 'sensor',
    'calib': 'calibration',
}
abbreviation.updateAbbreviation(abbrDict, reset=True)
print(abbreviation.getAbbreviation())
# Run the substitution on `test` again, now with the reduced dictionary
cleanedText = abbreviation.abbreviationSub(test)
print(cleanedText)
{'perf': 'perform', 'ann': 'annual', 'sens': 'sensor', 'calib': 'calibration'}
perform annual sensor calibration of cyl. high conc of hydrogen obs. high conc of hydrogen obs every wk. prfr chann calibration of chan. esf pump room and fuel bldg test. cal press xmtr sit elev. perform thermography survey of pzr htr terminations. plant mods comp iso mode prep. drain & rmv pipe.