Demo For NLP Workflow

Set Paths and Load Required Modules

  • For the required libraries, please check SR2ML/dependencies.xml

[1]:
# External Modules #
import os
import sys
import pandas as pd
import spacy
import logging
import numerizer
#######################

# Settings #
# Make the sibling ``src`` directory (one level above the current working
# directory) importable, so the ``dackar`` imports below can be resolved from it.
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
# Only append when missing: re-running this cell would otherwise stack
# duplicate entries on sys.path.
if frameworkDir not in sys.path:
    sys.path.append(frameworkDir)
########################

# Internal Modules #
from dackar.workflows.RuleBasedMatcher import RuleBasedMatcher
from dackar import config
from dackar.utils.nlp.nlp_utils import generatePatternList
from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.SpellChecker import SpellChecker
#########################

# Configure root logging at INFO level; switch ``level`` to logging.DEBUG
# when more verbose output is needed.
logging.basicConfig(
    format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
    level=logging.INFO,
)
# Load the large English spaCy model; the empty ``exclude`` list means no
# pipeline component is excluded at load time.
nlp = spacy.load(
    "en_core_web_lg",
    exclude=[],
)

Initialize variables

[2]:
# --- Text clean-up switches ---------------------------------------------
# cleanDoc        : preprocess the raw doc text, i.e., remove brackets,
#                   repeated characters, and punctuation
# numerizerDoc    : convert natural-language numerics into ints and floats
# spellCorrectDoc : spell check the text and perform corrections
cleanDoc, numerizerDoc, spellCorrectDoc = True, True, False

# --- User-defined labels and IDs ----------------------------------------
# entity label / entity ID
entLabel, entId = "pump_component", "SSC"
# causal keyword label / causal keyword ID
causalLabel, causalID = "causal_keywords", "causal"

# --- User-provided lists (filled in by later cells) ---------------------
# ents: entities list; causalList: causal keyword list
ents, causalList = [], []

# Arguments consumed by individual preprocessing steps (see preprocessorOptions).
removeBrackets = ['curly', 'square', 'round']
# TODO: extend repeating_chars to handle a list of chars; right now only one char can be passed
removeRepeatings = ['.']
# Right now punctuation is replaced with whitespace; we may need to replace it with None
removePunctuation = ['/', "#", '~']
# TODO: add replace functions, for example, replace acronyms with full name

# Names of the preprocessing steps, listed in the order they are handed to
# Preprocessing. Disabled steps are kept as comments.
preprocessorList = [
    'bullet_points',
    'hyphenated_words',
    'quotation_marks',
    'unicode',
    'repeating_chars',
    'accents',
    'brackets',
    'html_tags',
    'punctuation',
    # 'currency_symbols',
    'emails',
    'emojis',
    'hashtags',
    # 'numbers',
    'phone_numbers',
    'urls',
    'user_handles',
    'whitespace',
    'numerize',
]

# Per-step keyword options, keyed by step name. Only the first entry of
# removeRepeatings is used (see the TODO above).
preprocessorOptions = dict(
    repeating_chars=dict(chars=removeRepeatings[0], maxn=1),
    unicode=dict(form='NFKC'),
    accents=dict(fast=False),
    brackets=dict(only=removeBrackets),
    punctuation=dict(only=removePunctuation),
)

# Build the preprocessing object from the ordered step names and their
# per-step options; it is later called directly on the raw text.
preprocess = Preprocessing(preprocessorList, preprocessorOptions)

Load the entity list and causal keyword list from files, or provide them directly

[3]:
# Entities: read the configured CSV and flatten every cell into one list.
entityFile = config.nlpConfig['files']['entity_file']
entityList = pd.read_csv(entityFile).values.ravel().tolist()
# Empty CSV cells come back from read_csv as NaN; they are not entity names,
# so drop them here (the causal keyword list below already drops them too).
entityList = [entity for entity in entityList if pd.notna(entity)]
ents.extend(entityList)

# Causal keywords: collect the distinct non-empty values of every column.
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
    # the keyword order no longer varies from run to run.
    causalList.extend(dict.fromkeys(ds[col].dropna()))

Generate patterns that can be used in NER

[4]:
# Build the patterns for the user entities; matching is done on the LEMMA attribute.
patternsEnts = generatePatternList(
    ents,
    label=entLabel,
    id=entId,
    nlp=nlp,
    attr="LEMMA",
)
# Build the patterns for the causal keywords with the same LEMMA-based setting.
patternsCausal = generatePatternList(
    causalList,
    label=causalLabel,
    id=causalID,
    nlp=nlp,
    attr="LEMMA",
)

Create rule-based matcher with entity list and causal entity list

[5]:
# Create the rule-based matcher, telling it which IDs mark entities and causal keywords.
matcher = RuleBasedMatcher(nlp, entID=entId, causalKeywordID=causalID)

# Register the two pattern sets, entities first and causal keywords second.
for rulerName, rulerPatterns in (
    ('ssc_entity_ruler', patternsEnts),
    ('causal_keywords_entity_ruler', patternsCausal),
):
    matcher.addEntityPattern(rulerName, rulerPatterns)
30-May-25 16:12:52 dackar.workflows.WorkflowBase INFO     Create instance of RuleBasedMatcher
30-May-25 16:12:53 dackar.utils.nlp.nlp_utils INFO     Model: core_web_lg, Language: en
30-May-25 16:12:53 dackar.utils.nlp.nlp_utils INFO     Available pipelines:pysbdSentenceBoundaries, tok2vec, tagger, parser, attribute_ruler, lemmatizer, mergePhrase, normEntities, initCoref, aliasResolver, anaphorCoref, anaphorEntCoref

Read raw text data and preprocess it

[6]:
# Read the raw text data; users can also assign a raw string to ``doc`` here instead.
textFile = config.nlpConfig['files']['text_file']
# Decode explicitly as UTF-8 so the text is read the same way on every
# platform instead of depending on the locale's default encoding.
with open(textFile, 'r', encoding='utf-8') as ft:
    doc = ft.read()

# Clean the doc with the preprocessing object configured earlier.
if cleanDoc:
    doc = preprocess(doc)
# Convert natural-language numerics into ints and floats.
# NOTE(review): the preprocessing step list also contains 'numerize', so this
# may repeat that conversion when cleanDoc is True - confirm it is intended.
if numerizerDoc:
    doc = numerizer.numerize(doc)

Correct the doc

[7]:
# Spell checkers that can be selected; the first one is used below.
availCheckers = ['autocorrect', 'ContextualSpellCheck']
if spellCorrectDoc:
    checker = SpellChecker(doc, checker=availCheckers[0])
    misspelledWords = checker.getMisspelledWords()
    print('MisspelledWords: ', ','.join(misspelledWords))
    updatedWords = input('Provide the words that will not be treated as misspelled words (comma seperated words):')
    # Strip each token and drop empty ones: an empty reply splits to [''],
    # which would otherwise pass the check below and add an empty string to
    # the checker's dictionary.
    updatedWords = [word.strip() for word in updatedWords.split(',') if word.strip()]
    if updatedWords:
        checker.addWordsToDictionary(updatedWords)
    doc = checker.correct()

# The raw text needs to be converted to lower case so that spaCy can perform POS tagging correctly.
doc = doc.lower()

Process text using Rule Based Matcher

[8]:
# Run the configured matcher on the cleaned, lower-cased text. The log output
# below reports health status extraction followed by causal relation extraction.
matcher(doc)
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher INFO     Start to extract health status
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher WARNING  Entity "pump" dep_ is "nmod" is not among valid list "[nsubj, nsubjpass, pobj, dobj, compound]"
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher WARNING  Entity "pump" dep_ is "nmod" is not among valid list "[nsubj, nsubjpass, pobj, dobj, compound]"
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher WARNING  No status identified for "pump" in "slight vibrations is noticed - likely from pump shaft deflection.
"
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher WARNING  Entity "pump" dep_ is "xcomp" is not among valid list "[nsubj, nsubjpass, pobj, dobj, compound]"
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher WARNING  Entity "pump" dep_ is "xcomp" is not among valid list "[nsubj, nsubjpass, pobj, dobj, compound]"
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher INFO     End of health status extraction!
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher INFO     Start to extract causal relation using OPM model information
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher INFO     End of causal relation extraction!
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher INFO     Start to use general extraction method to extract causal relation
30-May-25 16:12:53 dackar.workflows.RuleBasedMatcher INFO     End of causal relation extraction using general extraction method!
(bearings, caused, shaft degradation) (bearings, caused, shaft degradation) (inspection, revealed, degradation) (inspection, revealed, degradation) (they, caused, failure) (low flow conditions, causing, cavitation) (pump, keep, the check valves) (shaft, made, noise) (pump, made, noises)
[9]:
# Display the causal relations collected by the matcher run above.
# NOTE(review): this reads a private attribute for inspection; the meaning of
# each field in the displayed records is not documented here - confirm against RuleBasedMatcher.
matcher._extractedCausals
[9]:
[[pump bearings,
  None,
  caused,
  shaft degradation,
  None,
  rupture of pump bearings caused pump shaft degradation.,
  False],
 [pump bearings,
  None,
  caused,
  shaft degradation,
  None,
  rupture of pump bearings caused pump shaft degradation and consequent flow reduction.,
  False],
 [power supply,
  None,
  due to,
  pump,
  None,
  pump test failed due to power supply failure.,
  False],
 [pump,
  None,
  revealed,
  impeller,
  None,
  pump inspection revealed excessive impeller degradation.,
  False],
 [pump,
  None,
  revealed,
  impeller,
  None,
  pump inspection revealed excessive impeller degradation likely due to cavitation.,
  True],
 [pump shaft,
  None,
  caused,
  pump,
  None,
  several cracks on pump shaft were observed; they could have caused pump failure within few days.,
  True],
 [pump shaft,
  None,
  causing,
  motor,
  None,
  the pump shaft vibration appears to be causing the motor to vibrate as well.,
  False]]
[10]:
# Display the per-entity status table collected by the matcher run above
# (private attribute, read here for inspection only).
matcher._entStatus
[10]:
entities status keywords status conjecture sentence status prepend status prepend adjectival modifier status append status append adjectival modifier negation negation text
0 pump None 1 leak False 1 leak was noticed from the rcp pump 1a. None None (pump) [from, rcp] False
1 pump None (signs, of, past leakage) False rcp pump 1a had signs of past leakage. None None None None False
2 pump None (enough flow) False the pump is not experiencing enough flow durin... None None None None True not
3 pump None responding False pump flow meter was not responding. (meter) [] None None True not
4 pump bearings None (rupture) False rupture of pump bearings caused pump shaft deg... None None None None False
5 pump cause rupture False rupture of pump bearings caused pump shaft deg... None None None None False
6 pump bearings None (rupture) False rupture of pump bearings caused pump shaft deg... None None None None False
7 pump cause rupture False rupture of pump bearings caused pump shaft deg... None None None None False
8 power supply None burnout False power supply has been found burnout. None None None None False
9 pump None test False pump test failed due to power supply failure. None None None None False
10 power supply None failure False pump test failed due to power supply failure. None None None None False
11 pump None inspection False pump inspection revealed excessive impeller de... None None None None False
12 impeller None degradation False pump inspection revealed excessive impeller de... None None None None False
13 pump None inspection True pump inspection revealed excessive impeller de... None None None None False
14 impeller None degradation True pump inspection revealed excessive impeller de... None None None None False
15 pump None (found, in, proximity) False oil puddle was found in proximity of rcp pump 1a. None None (pump) [of, rcp] False
16 pump None anomalous vibrations False anomalous vibrations were observed for rcp pum... None None None [] False
17 pump shaft None (several cracks) False several cracks on pump shaft were observed; th... None None None None False
18 pump None failure True several cracks on pump shaft were observed; th... None None None None False
19 pump None cavitating False rcp pump 1a was cavitating and vibrating to so... None None None None False
20 pump shaft None vibration False the pump shaft vibration appears to be causing... None None None None False
21 motor None vibrate False the pump shaft vibration appears to be causing... None None None None False
22 pump None (noise, of, cavitation, which, became, faint, ... False pump had noise of cavitation which became fain... None None None None False
23 pump shaft None (deflection) False the pump shaft deflection is causing the safet... (deflection) [] None None False
24 pump None (enough flow, for, the, pumps) False the pump is not experiencing enough flow for t... None None None None True not
25 pumps None (enough flow) False the pump is not experiencing enough flow for t... None None None None False
26 pump shaft None noise False pump shaft made noise. None None None None False
27 pump shaft come vibration True vibration seems like it is coming from the pum... None None None [] False
28 pump shaft None (deflection) False visible pump shaft deflection in operation. None None None None False
29 pump bearings None acceptable condition False pump bearings appear in acceptable condition. None None None None False
30 pump None noises False pump made noises - not enough to affect perfor... None None None None False
31 pump shaft None 1 slight deflection False pump shaft has 1 slight deflection. None None None None False