Demo for Rule-Based Natural Language Processing¶
1. Set up the path, so that the NLP modules can be found¶
[ ]:
import os
import sys
# Make the project's ``src`` directory importable: it is resolved as
# ``<parent of the current working directory>/src``.
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
# Re-running this cell must not keep adding the same entry to sys.path.
if frameworkDir not in sys.path:
    sys.path.append(frameworkDir)
2. Load the spaCy language model¶
[ ]:
import spacy
# Load the "en_core_web_lg" spaCy pipeline. The empty ``exclude`` list means no
# pipeline component is left out when the model is loaded.
nlp = spacy.load("en_core_web_lg", exclude=[])
3. Load other modules¶
[ ]:
import pandas as pd
4. Import NLP modules¶
[ ]:
from dackar.causal.CausalSentence import CausalSentence
from dackar import config
from dackar.utils.nlp.nlp_utils import generatePatternList
5. Set up logging¶
[ ]:
import logging
# Configure the root logger: each record shows a timestamp (day-month-year and
# time), the logger name, the level and the message. DEBUG is the threshold, so
# debug-level records are emitted as well.
logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)
6. Read and process entities¶
[ ]:
# Read every cell of the configured entity CSV file into one flat list, then
# keep the unique values as the entity set.
entityFile = config.nlpConfig['files']['entity_file']
entityList = pd.read_csv(entityFile).values.ravel().tolist()
ents = set(entityList)

# Label and ID passed to the pattern generator for the entities above.
label = "pump_component"
entId = "SSC"
patternsOPM = generatePatternList(
    ents,
    label=label,
    id=entId,
    nlp=nlp,
    attr="LEMMA",
)
7. Read and process causal keywords¶
[ ]:
# Build the patterns for the causal keywords. The keyword CSV is read column by
# column, and every column contributes its own batch of patterns, all tagged
# with the same label and ID.
causalLabel = "causal_keywords"
causalID = "causal"
patternsCausal = []
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
# skipinitialspace strips the blanks that follow the CSV delimiters.
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    # Drop empty (NaN) cells and de-duplicate the keywords of this column.
    # (Named ``keywords`` rather than ``vars`` to avoid shadowing the builtin.)
    keywords = set(ds[col].dropna())
    patternsCausal.extend(
        generatePatternList(keywords, label=causalLabel, id=causalID, nlp=nlp, attr="LEMMA")
    )
8. Create a rule-based matcher with the entity patterns and the causal keyword patterns¶
[ ]:
# Names under which the two pattern sets are added to the matcher.
name = 'ssc_entity_ruler'
causalName = 'causal_keywords_entity_ruler'

# The matcher is told which IDs mark entities and causal keywords.
matcher = CausalSentence(nlp, entID=entId, causalKeywordID=causalID)

# Add the entity patterns first, then the causal keyword patterns.
for rulerName, rulerPatterns in ((name, patternsOPM), (causalName, patternsCausal)):
    matcher.addEntityPattern(rulerName, rulerPatterns)
9. Read input text file, or users can provide a raw string¶
[ ]:
# Read the whole input text file configured under files/text_file into a single
# raw string. A raw string can be assigned to ``doc`` directly instead.
textFile = config.nlpConfig['files']['text_file']
# An explicit encoding keeps decoding independent of the platform default.
# NOTE(review): assumes the text file is UTF-8 encoded; confirm for custom inputs.
with open(textFile, 'r', encoding='utf-8') as ft:
    doc = ft.read()
10. Process raw string data using matcher¶
[ ]:
# Run the matcher on the raw text string; the results are read back from the
# matcher object afterwards.
matcher(doc)
11. Access processed information from matcher¶
[ ]:
# Display what the matcher stored in ``_extractedCausals`` after processing.
# NOTE(review): this is an underscore-prefixed (private) attribute; prefer a
# public accessor if CausalSentence provides one.
matcher._extractedCausals