Demo for NLP Workflow

Set Paths and Load Required Modules

[ ]:
# External Modules #
import os
import sys
import pandas as pd
import spacy
import logging
import numerizer
#######################

# Settings #
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
sys.path.append(frameworkDir)
########################

# Internal Modules #
from dackar.causal.CausalSentence import CausalSentence
from dackar import config
from dackar.utils.nlp.nlp_utils import generatePatternList
from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.SpellChecker import SpellChecker
#########################

# logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)
logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)
nlp = spacy.load("en_core_web_lg", exclude=[])
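
If the model is not installed locally, it can be downloaded once with python -m spacy download en_core_web_lg. A quick sanity check of the loaded pipeline (nothing is excluded above):

[ ]:
# list the pipeline components provided by en_core_web_lg
print(nlp.pipe_names)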

Initialize variables

[ ]:
cleanDoc = True         # preprocess the raw doc text, i.e., remove brackets, repeated characters, and punctuation
numerizerDoc = True     # convert natural language numerics into ints and floats
spellCorrectDoc = False # spell check the doc and perform corrections

entLabel = "pump_component"       # user defined entity label
entId = "SSC"                     # user defined entity ID
causalLabel = "causal_keywords"   # user defined causal keyword label
causalID = "causal"               # user defined causal keyword ID
ents = []                         # user provided entities list
causalList = []                   # user provided causal keyword list

removeBrackets = ['curly', 'square', 'round']
removeRepeatings = ['.']
# TODO: extend repeating_chars to handle a list of chars; right now we can only pass one char
removePunctuation = ['/', "#", '~'] # right now punctuation is replaced with whitespace; we may need to replace it with None
# TODO: add replace functions, for example, replace acronyms with full name

preprocessorList = ['bullet_points',
                    'hyphenated_words',
                    'quotation_marks',
                    'unicode',
                    'repeating_chars',
                    'accents',
                    'brackets',
                    'html_tags',
                    'punctuation',
                    # 'currency_symbols',
                    'emails',
                    'emojis',
                    'hashtags',
                    # 'numbers',
                    'phone_numbers',
                    'urls',
                    'user_handles',
                    'whitespace',
                    'numerize']
preprocessorOptions = {'repeating_chars': {'chars': removeRepeatings[0], 'maxn': 1},
                       'unicode': {'form': 'NFKC'},
                       'accents': {'fast': False},
                       'brackets': {'only': removeBrackets},
                       'punctuation': {'only': removePunctuation}}

preprocess = Preprocessing(preprocessorList, preprocessorOptions)
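
A quick, illustrative run of the configured preprocessor (hypothetical input; the exact output depends on the Preprocessing implementation):

[ ]:
# illustrative sample exercising brackets, repeated chars, punctuation, and the numerize step
sample = 'The pump [P-101] failed... twice in twenty four hours #unplanned'
print(preprocess(sample))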

Load entity list and causal keyword list, or provide them directly

[ ]:
entityFile = config.nlpConfig['files']['entity_file']
entityList = pd.read_csv(entityFile).values.ravel().tolist()
ents.extend(entityList)

causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    causalList.extend(set(ds[col].dropna()))
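
Alternatively, both lists can be provided directly as plain Python lists (hypothetical values shown, commented out so the notebook stays driven by the CSV files):

[ ]:
# ents.extend(['centrifugal pump', 'impeller', 'shaft seal'])   # hypothetical entities
# causalList.extend(['caused by', 'due to', 'resulted in'])     # hypothetical causal keywords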

Generate patterns that can be used in NER

[ ]:
patternsEnts = generatePatternList(ents, label=entLabel, id=entId, nlp=nlp, attr="LEMMA")
patternsCausal = generatePatternList(causalList, label=causalLabel, id=causalID, nlp=nlp, attr="LEMMA")
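
Inspecting the first generated entry shows the pattern structure (the exact dictionary layout is determined by generatePatternList; spaCy entity-ruler patterns are typically dicts with label, pattern, and id keys):

[ ]:
# peek at one generated entity pattern
print(patternsEnts[0])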

Create a rule-based matcher with the entity and causal keyword patterns

[ ]:
matcher = CausalSentence(nlp, entID=entId, causalKeywordID=causalID)

matcher.addEntityPattern('ssc_entity_ruler', patternsEnts)
matcher.addEntityPattern('causal_keywords_entity_ruler', patternsCausal)

Read raw text data and preprocess it

[ ]:
# Read raw text data; users can also provide a raw string here
textFile = config.nlpConfig['files']['text_file']
with open(textFile, 'r') as ft:
    doc = ft.read()

# clean doc
if cleanDoc:
    doc = preprocess(doc)
if numerizerDoc:
    doc = numerizer.numerize(doc)
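
The numerize step converts spelled-out numbers into digits, for example:

[ ]:
# numerizer turns number words into digits
print(numerizer.numerize('the pump tripped three times in two days'))
# expected output: 'the pump tripped 3 times in 2 days'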

Spell-correct the doc

[ ]:
availCheckers = ['autocorrect', 'ContextualSpellCheck']
if spellCorrectDoc:
    checker = SpellChecker(doc, checker=availCheckers[0])
    misspelledWords = checker.getMisspelledWords()
    print('MisspelledWords: ', ','.join(misspelledWords))
    updatedWords = input('Provide the words that will not be treated as misspelled words (comma separated):')
    updatedWords = [word.strip() for word in updatedWords.split(',')]
    if len(updatedWords) != 0:
        checker.addWordsToDictionary(updatedWords)
    doc = checker.correct()

# raw text needs to be converted to lower case so that spaCy can perform POS tagging correctly
doc = doc.lower()

Process text using the rule-based matcher

[ ]:
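# run the matcher over the preprocessed, lower-cased text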
matcher(doc)
[ ]:
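# causal relations extracted by the matcher (internal attribute)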
matcher._extractedCausals
[ ]:
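# matching status of the user-provided entities (internal attribute)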
matcher._entStatus
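
If _extractedCausals holds a list of records (an assumption based on the attribute name), pandas can tabulate it for easier inspection:

[ ]:
# tabulate the extracted causal relations (assumes a list-like of records)
pd.DataFrame(matcher._extractedCausals)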