Demo for NLP Workflow (based on branch wangc/nlp)

Set Paths and Load Required Modules

  • For the required libraries, please check SR2ML/dependencies.xml

[1]:
# External Modules #
import os
import sys
import pandas as pd
import spacy
import logging
import numerizer
#######################

# Settings #
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
sys.path.append(frameworkDir)
########################

# Internal Modules #
from dackar.workflows.RuleBasedMatcher import RuleBasedMatcher
from dackar import config
from dackar.utils.nlp.nlp_utils import generatePatternList
from dackar.utils.opm.OPLparser import OPMobject
from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.Preprocessing import SpellChecker
#########################

# logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)
logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)
nlp = spacy.load("en_core_web_lg", exclude=[])
Warming up PyWSD (takes ~10 secs)... took 2.4229929447174072 secs.

Initialize variables

[2]:
cleanDoc = True         # preprocess the raw doc text, i.e., remove brackets, repeated characters, and punctuation
numerizerDoc = True     # convert natural-language numbers into ints and floats
spellCorrectDoc = False # spell check and perform corrections

entLabel = "pump_component"       # user defined entity label
entId = "SSC"                     # user defined entity ID
causalLabel = "causal_keywords"   # user defined causal keyword label
causalID = "causal"               # user defined causal keyword ID
ents = []                         # user provided entities list
causalList = []                   # user provided causal keyword list

removeBrackets = ['curly', 'square', 'round']
removeRepeatings = ['.']
# TODO: extend repeating_chars to handle a list of chars; right now we can only pass one char (see the workaround sketch after this cell)
removePunctuation = ['/', "#", '~'] # right now punctuation is replaced with whitespace; we may need to replace it with None
# TODO: add replace functions, for example, replace acronyms with full name

preprocessorList = ['bullet_points',
                    'hyphenated_words',
                    'quotation_marks',
                    'unicode',
                    'repeating_chars',
                    'accents',
                    'brackets',
                    'html_tags',
                    'punctuation',
                    # 'currency_symbols',
                    'emails',
                    'emojis',
                    'hashtags',
                    # 'numbers',
                    'phone_numbers',
                    'urls',
                    'user_handles',
                    'whitespace',
                    'numerize']
preprocessorOptions = {'repeating_chars': {'chars': removeRepeatings[0], 'maxn': 1},
                       'unicode': {'form': 'NFKC'},
                       'accents': {'fast': False},
                       'brackets': {'only': removeBrackets},
                       'punctuation': {'only': removePunctuation}}

preprocess = Preprocessing(preprocessorList, preprocessorOptions)
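
The repeating_chars TODO above can be worked around without any API change: build one single-step Preprocessing instance per character and apply them in sequence. This is only a sketch using the constructor and call syntax shown in this demo; the extra '-' character is an illustrative addition.

[ ]:
# Workaround sketch for the repeating_chars TODO: one Preprocessing step per char.
extraRepeatings = ['.', '-']  # '-' is illustrative only
repeatingSteps = [Preprocessing(['repeating_chars'],
                                {'repeating_chars': {'chars': ch, 'maxn': 1}})
                  for ch in extraRepeatings]
# Applied later as:
# for step in repeatingSteps:
#     doc = step(doc)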

Load the entity list and causal keyword list from files, or provide them directly (see the sketch after this cell)

[3]:
entityFile = config.nlpConfig['files']['entity_file']
entityList = pd.read_csv(entityFile).values.ravel().tolist()
ents.extend(entityList)

causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    causalList.extend(set(ds[col].dropna()))
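
As the heading notes, both lists can also be provided directly instead of being read from CSV files. The entries below are purely illustrative.

[ ]:
# Direct alternative to the CSV files above; entries are illustrative only.
# ents.extend(['centrifugal pump', 'impeller', 'pump shaft'])
# causalList.extend(['caused by', 'due to', 'result in'])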

Generate patterns that can be used in NER

[4]:
patternsEnts = generatePatternList(ents, label=entLabel, id=entId, nlp=nlp, attr="LEMMA")
patternsCausal = generatePatternList(causalList, label=causalLabel, id=causalID, nlp=nlp, attr="LEMMA")
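
To sanity-check the output, the first few generated patterns can be printed. Each entry is expected to follow spaCy's EntityRuler pattern format (a dict carrying the label, the token pattern, and the id), although the exact structure is determined by generatePatternList.

[ ]:
# Optional sanity check: inspect a few generated entity patterns.
for pattern in patternsEnts[:3]:
    print(pattern)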

Create the rule-based matcher with the entity and causal keyword patterns

[5]:
matcher = RuleBasedMatcher(nlp, entID=entId, causalKeywordID=causalID)

matcher.addEntityPattern('ssc_entity_ruler', patternsEnts)
matcher.addEntityPattern('causal_keywords_entity_ruler', patternsCausal)

Read raw text data and preprocess it

[6]:
# Read raw text data; users can also provide a raw string here
textFile = config.nlpConfig['files']['text_file']
with open(textFile, 'r') as ft:
    doc = ft.read()

# clean doc
if cleanDoc:
    doc = preprocess(doc)
if numerizerDoc:
    doc = numerizer.numerize(doc)
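
As noted in the cell above, a raw string can be supplied in place of the configured text file. The text below is a hypothetical example, not part of the demo data.

[ ]:
# Hypothetical alternative to reading from the configured text file.
# doc = "The pump was not experiencing enough flow. Low flow was caused by impeller damage."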

Spell check and correct the doc (only when spellCorrectDoc is enabled)

[7]:
availCheckers = ['autocorrect', 'ContextualSpellCheck']
if spellCorrectDoc:
    checker = SpellChecker(doc, checker=availCheckers[0])
    misspelledWords = checker.getMisspelledWords()
    print('MisspelledWords: ', ','.join(misspelledWords))
    updatedWords = input('Provide the words that should not be treated as misspelled (comma separated):')
    updatedWords = [word.strip() for word in updatedWords.split(',') if word.strip()]
    if len(updatedWords) != 0:
        checker.addWordsToDictionary(updatedWords)
    doc = checker.correct()
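
The contextual checker listed in availCheckers can be selected the same way. This is a sketch assuming SpellChecker accepts either name from availCheckers; the rest of the cell would be unchanged.

[ ]:
# Sketch: switch to the contextual spell checker.
# checker = SpellChecker(doc, checker=availCheckers[1])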
[8]:
# Raw text needs to be converted to lower case so that spaCy can perform POS tagging correctly
doc = doc.lower()
[9]:
matcher(doc)

[10]:
# # The following can be used to retrieve causal-effect information
# causalEffect = config.nlpConfig['files']['output_causal_effect_file']
# causalEffect = pd.read_csv(causalEffect)
[11]:
healthStatus = config.nlpConfig['files']['output_health_status_file']
healthStatus = pd.read_csv(healthStatus)
[12]:
healthStatus
[12]:
(Output: DataFrame header with an unnamed index column and the columns: entities, conjecture, negation, negation text, root, status keywords, health status prepend adjectival modifier, health status prepend, health status, health status append adjectival modifier, health status append, sentence)
[13]:
for i in range(healthStatus.shape[0]):
    print(list(healthStatus.iloc[i]))
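
Alternatively, pandas can render the whole table at once instead of printing row by row:

[ ]:
# Equivalent full-table dump using pandas' built-in rendering.
# print(healthStatus.to_string(index=False))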