Demo for NLP Workflow¶
Set Paths and Load Required Modules¶
[ ]:
# External Modules #
import os
import sys
import pandas as pd
import spacy
import logging
import numerizer
#######################
# Settings #
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
sys.path.append(frameworkDir)
########################
# Internal Modules #
from dackar.causal.CausalSentence import CausalSentence
from dackar import config
from dackar.utils.nlp.nlp_utils import generatePatternList
from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.SpellChecker import SpellChecker
#########################
# logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)
logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)
nlp = spacy.load("en_core_web_lg", exclude=[])
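If en_core_web_lg is not installed yet, it can be downloaded once with spaCy's CLI (uncomment to run):

[ ]:
# one-time download of the spaCy model loaded above
# !python -m spacy download en_core_web_lg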
Initialize variables¶
[ ]:
cleanDoc = True # preprocess the raw doc text, i.e., remove brackets, repeated characters, and punctuation
numerizerDoc = True # convert natural language numbers into ints and floats
spellCorrectDoc = False # spell check and perform corrections
entLabel = "pump_component" # user defined entity label
entId = "SSC" # user defined entity ID
causalLabel = "causal_keywords" # user defined causal keyword label
causalID = "causal" # user defined causal keyword ID
ents = [] # user provided entities list
causalList = [] # user provided causal keyword list
removeBrackets = ['curly', 'square', 'round']
removeRepeatings = ['.']
# TODO: extend repeating_chars to handle a list of chars; right now we can only pass one char
removePunctuation = ['/', "#", '~'] # right now punctuation is replaced with whitespace; we may need to replace it with None
# TODO: add replace functions, e.g., replace acronyms with their full names
preprocessorList = ['bullet_points',
                    'hyphenated_words',
                    'quotation_marks',
                    'unicode',
                    'repeating_chars',
                    'accents',
                    'brackets',
                    'html_tags',
                    'punctuation',
                    # 'currency_symbols',
                    'emails',
                    'emojis',
                    'hashtags',
                    # 'numbers',
                    'phone_numbers',
                    'urls',
                    'user_handles',
                    'whitespace',
                    'numerize']
preprocessorOptions = {'repeating_chars': {'chars': removeRepeatings[0], 'maxn': 1},
                       'unicode': {'form': 'NFKC'},
                       'accents': {'fast': False},
                       'brackets': {'only': removeBrackets},
                       'punctuation': {'only': removePunctuation}}
preprocess = Preprocessing(preprocessorList, preprocessorOptions)
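As a quick sanity check, the preprocessor can be applied to a short sample string the same way it is applied to the full document later in this notebook. The sample text is purely illustrative; the expected behavior follows from the options set above.

[ ]:
# illustrative check: bracketed text is removed, repeated '.' is collapsed,
# '/' and '#' are replaced with whitespace, and 'two' is numerized
sample = 'Pump [P-101] failed... two seals leaked in casing/housing #3.'
preprocess(sample)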
Load entity list and causal list, or provide them directly¶
[ ]:
entityFile = config.nlpConfig['files']['entity_file']
entityList = pd.read_csv(entityFile).values.ravel().tolist()
ents.extend(entityList)
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    causalList.extend(set(ds[col].dropna()))
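A quick check of what was loaded:

[ ]:
print(f'{len(ents)} entities and {len(causalList)} causal keywords loaded')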
Generate patterns that can be used in NER¶
[ ]:
patternsEnts = generatePatternList(ents, label=entLabel, id=entId, nlp=nlp, attr="LEMMA")
patternsCausal = generatePatternList(causalList, label=causalLabel, id=causalID, nlp=nlp, attr="LEMMA")
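To verify the output, the first few entity patterns can be inspected (assuming generatePatternList returns a plain list of pattern dicts):

[ ]:
patternsEnts[:2]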
Create rule-based matcher with entity list and causal entity list¶
[ ]:
matcher = CausalSentence(nlp, entID=entId, causalKeywordID=causalID)
matcher.addEntityPattern('ssc_entity_ruler', patternsEnts)
matcher.addEntityPattern('causal_keywords_entity_ruler', patternsCausal)
Read raw text data and preprocess it¶
[ ]:
# Read raw text data; users can also provide a raw string here
textFile = config.nlpConfig['files']['text_file']
with open(textFile, 'r') as ft:
    doc = ft.read()
# clean doc
if cleanDoc:
    doc = preprocess(doc)
if numerizerDoc:
    doc = numerizer.numerize(doc)
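As a small illustration of what the numerize step does:

[ ]:
numerizer.numerize('forty two')  # returns '42'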
Spell-correct and lower-case the doc¶
[ ]:
availCheckers = ['autocorrect', 'ContextualSpellCheck']
if spellCorrectDoc:
    checker = SpellChecker(doc, checker=availCheckers[0])
    misspelledWords = checker.getMisspelledWords()
    print('MisspelledWords: ', ','.join(misspelledWords))
    updatedWords = input('Provide the words that should not be treated as misspelled (comma separated):')
    updatedWords = [word.strip() for word in updatedWords.split(',')]
    if len(updatedWords) != 0:
        checker.addWordsToDictionary(updatedWords)
    doc = checker.correct()
# the raw text needs to be converted to lower case so that spaCy can perform POS tagging correctly
doc = doc.lower()
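A short preview of the cleaned, lower-cased document:

[ ]:
print(doc[:300])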
Process text using the rule-based matcher¶
[ ]:
matcher(doc)
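The extracted cause-effect relations and the status of the matched entities are stored on the matcher and can be inspected directly: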
[ ]:
matcher._extractedCausals
[ ]:
matcher._entStatus