Demo for NLP Workflow: Based on branch wangc/nlp¶
Set Paths and Load Required Modules¶
For the required libraries, please check SR2ML/dependencies.xml.
[1]:
# External Modules #
import os
import sys
import pandas as pd
import spacy
import logging
import numerizer
#######################
# Settings #
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
sys.path.append(frameworkDir)
########################
# Internal Modules #
from dackar.workflows.RuleBasedMatcher import RuleBasedMatcher
from dackar import config
from dackar.utils.nlp.nlp_utils import generatePatternList
from dackar.utils.opm.OPLparser import OPMobject
from dackar.text_processing.Preprocessing import Preprocessing
from dackar.text_processing.Preprocessing import SpellChecker
#########################
# logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)
logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)
nlp = spacy.load("en_core_web_lg", exclude=[])
Warming up PyWSD (takes ~10 secs)... took 2.4229929447174072 secs.
Initialize variables¶
[2]:
cleanDoc = True # preprocess the raw doc text, i.e., remove brackets, repeated characters, and punctuation
numerizerDoc = True # convert natural-language numbers into ints and floats
spellCorrectDoc = False # spell check and perform corrections
entLabel = "pump_component" # user defined entity label
entId = "SSC" # user defined entity ID
causalLabel = "causal_keywords" # user defined causal keyword label
causalID = "causal" # user defined causal keyword ID
ents = [] # user provided entities list
causalList = [] # user provided causal keyword list
removeBrackets = ['curly', 'square', 'round']
removeRepeatings = ['.']
# TODO: extend repeating_chars to handle a list of chars; right now we can only pass one char
removePunctuation = ['/', "#", '~'] # right now punctuation is replaced with whitespace; we may need to replace it with None
# TODO: add replace functions, for example, replace acronyms with full name
preprocessorList = ['bullet_points',
'hyphenated_words',
'quotation_marks',
'unicode',
'repeating_chars',
'accents',
'brackets',
'html_tags',
'punctuation',
# 'currency_symbols',
'emails',
'emojis',
'hashtags',
# 'numbers',
'phone_numbers',
'urls',
'user_handles',
'whitespace',
'numerize']
preprocessorOptions = {'repeating_chars': {'chars': removeRepeatings[0], 'maxn': 1},
'unicode': {'form': 'NFKC'},
'accents': {'fast': False},
'brackets': {'only': removeBrackets},
'punctuation': {'only': removePunctuation}}
preprocess = Preprocessing(preprocessorList, preprocessorOptions)
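As a quick sanity check, the configured preprocessor can be applied to a short sample string (the sample text and expected behavior below are illustrative; the Preprocessing instance is callable and returns the cleaned string, as used later in this notebook):

# Illustrative only: exercise the preprocessor on a sample string
sample = 'The pump (centrifugal) failed.... see /item/ #3 in the report.'
print(preprocess(sample))
# expected: bracketed text removed, repeated periods collapsed to a single one,
# and '/', '#', '~' replaced with whitespace (exact output depends on the implementation)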
Load the entity and causal keyword lists, or provide them directly¶
[3]:
entityFile = config.nlpConfig['files']['entity_file']
entityList = pd.read_csv(entityFile).values.ravel().tolist()
ents.extend(entityList)
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    causalList.extend(set(ds[col].dropna()))
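Alternatively, as the section title suggests, both lists can be supplied directly without CSV files; the entries below are purely illustrative:

# Illustrative alternative: provide the lists directly instead of reading CSV files
# ents = ['pump', 'impeller', 'bearing', 'shaft']
# causalList = ['caused by', 'due to', 'resulted in']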
Generate patterns that can be used in NER¶
[4]:
patternsEnts = generatePatternList(ents, label=entLabel, id=entId, nlp=nlp, attr="LEMMA")
patternsCausal = generatePatternList(causalList, label=causalLabel, id=causalID, nlp=nlp, attr="LEMMA")
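Each generated pattern is expected to follow spaCy's EntityRuler pattern format; the first entry can be inspected to verify (the commented dict is an illustrative sketch of the expected shape, not the exact output):

# Inspect one generated pattern (spaCy EntityRuler format)
print(patternsEnts[0])
# roughly: {'label': 'pump_component', 'pattern': [{'LEMMA': 'pump'}], 'id': 'SSC'}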
Create a rule-based matcher with the entity and causal keyword patterns¶
[5]:
matcher = RuleBasedMatcher(nlp, entID=entId, causalKeywordID=causalID)
matcher.addEntityPattern('ssc_entity_ruler', patternsEnts)
matcher.addEntityPattern('causal_keywords_entity_ruler', patternsCausal)
Read raw text data and preprocess it¶
[6]:
# Read raw text data; users can also provide a raw string here
textFile = config.nlpConfig['files']['text_file']
with open(textFile, 'r') as ft:
    doc = ft.read()
# clean doc
if cleanDoc:
    doc = preprocess(doc)
if numerizerDoc:
    doc = numerizer.numerize(doc)
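For reference, numerizer converts spelled-out numbers in free text into digits, e.g.:

# Small demonstration of numerizer on a toy sentence
print(numerizer.numerize('the pump tripped forty-two times in three weeks'))
# expected: roughly 'the pump tripped 42 times in 3 weeks'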
Spell-correct the doc (optional)¶
[7]:
availCheckers = ['autocorrect', 'ContextualSpellCheck']
if spellCorrectDoc:
    checker = SpellChecker(doc, checker=availCheckers[0])
    misspelledWords = checker.getMisspelledWords()
    print('Misspelled words: ', ', '.join(misspelledWords))
    updatedWords = input('Provide the words that should not be treated as misspelled (comma-separated words):')
    updatedWords = [word.strip() for word in updatedWords.split(',') if word.strip()]
    if len(updatedWords) != 0:
        checker.addWordsToDictionary(updatedWords)
    doc = checker.correct()
[8]:
# the raw text needs to be converted to lower case so that spaCy can perform POS tagging correctly
doc = doc.lower()
[9]:
matcher(doc)
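Calling the matcher on the document runs the full analysis (entity and causal keyword matching plus health-status extraction) and writes its results to the output files configured in nlpConfig, which are read back in the cells below.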
[10]:
# # The following can be used to retrieve causal-effect information
# causalEffect = config.nlpConfig['files']['output_causal_effect_file']
# causalEffect = pd.read_csv(causalEffect)
[11]:
healthStatus = config.nlpConfig['files']['output_health_status_file']
healthStatus = pd.read_csv(healthStatus)
[12]:
healthStatus
[12]:
[Output: the health-status DataFrame, with columns: Unnamed: 0 (index), entities, conjecture, negation, negation text, root, status keywords, health status prepend adjectival modifier, health status prepend, health status, health status append adjectival modifier, health status append, sentence]
[13]:
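# Print each extracted health-status record as a plain list, one row per line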
for i in range(healthStatus.shape[0]):
    print(list(healthStatus.iloc[i]))