Demo for Rule-Based Natural Language Processing

1. Set up the path, so that the NLP modules can be found

[ ]:
import os
import sys

# Make the framework sources importable from this notebook: resolve the "src"
# directory that sits next to the current working directory.
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if frameworkDir not in sys.path:
    sys.path.append(frameworkDir)

2. Load the spaCy language model

[ ]:
import spacy
# Name of the installed spaCy pipeline package to load.
spacyModelName = "en_core_web_lg"
# An empty exclude list means no pipeline component is left out.
nlp = spacy.load(spacyModelName, exclude=[])

3. Load other modules

[ ]:
import pandas as pd

4. Import NLP modules

[ ]:
from dackar.causal.CausalSentence import CausalSentence
from dackar import config
from dackar.utils.nlp.nlp_utils import generatePatternList

5. Set up logging

[ ]:
import logging
# Configure the root logger: emit DEBUG and above, with a timestamp, a
# 20-character logger-name column and an 8-character level column.
# NOTE: basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(
    level=logging.DEBUG,
    datefmt='%d-%b-%y %H:%M:%S',
    format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s',
)

6. Read and process entities

[ ]:
# Collect every cell of the entity CSV into one de-duplicated set of entities
# and turn them into lemma-based patterns.
entityFile = config.nlpConfig['files']['entity_file']
# Flatten the whole table (all columns) into a single list. pandas represents
# empty cells as NaN; skip them so that only real entries are passed on, which
# matches the dropna() applied to the causal keyword file.
entityList = [
    entity
    for entity in pd.read_csv(entityFile).values.ravel().tolist()
    if pd.notna(entity)
]
ents = set(entityList)
label = "pump_component"
entId = "SSC"
patternsOPM = generatePatternList(ents, label=label, id=entId, nlp=nlp, attr="LEMMA")

7. Read and process causal keywords

[ ]:
# Build one flat list of causal-keyword patterns, processing the keyword CSV
# column by column.
causalLabel = "causal_keywords"
causalID = "causal"
patternsCausal = []
causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']
# skipinitialspace=True drops the blanks that follow each delimiter.
ds = pd.read_csv(causalFilename, skipinitialspace=True)
for col in ds.columns:
    # dropna() discards missing cells and set() removes duplicates within the
    # column. The name `keywords` is used so the builtin vars() is not shadowed.
    keywords = set(ds[col].dropna())
    patternsCausal.extend(generatePatternList(keywords, label=causalLabel, id=causalID, nlp=nlp, attr="LEMMA"))

8. Create the rule-based matcher and add the entity patterns and the causal keyword patterns

[ ]:
# Build the matcher, telling it which IDs mark entities and causal keywords.
matcher = CausalSentence(nlp, entID=entId, causalKeywordID=causalID)

# Names under which the two pattern sets are added to the matcher.
name = 'ssc_entity_ruler'
causalName = 'causal_keywords_entity_ruler'

# Add the entity patterns first, then the causal keyword patterns.
for rulerName, rulerPatterns in ((name, patternsOPM), (causalName, patternsCausal)):
    matcher.addEntityPattern(rulerName, rulerPatterns)

9. Read the input text file (alternatively, users can provide a raw string)

[ ]:
# Load the whole input text file into a single string.
textFile = config.nlpConfig['files']['text_file']
# NOTE(review): no encoding is given, so the platform default is used;
# confirm the file's encoding if non-ASCII text is expected.
with open(textFile, mode='r') as textHandle:
    doc = textHandle.read()

10. Process raw string data using matcher

[ ]:
# Call the matcher object directly on the raw text string. Any return value is
# not captured here; the results are read back from the matcher in step 11.
matcher(doc)

11. Access processed information from matcher

[ ]:
# Bare expression: as the last statement of a notebook cell its value is
# displayed as the cell output.
# NOTE(review): the leading underscore marks this attribute as non-public by
# Python convention; prefer a public accessor if CausalSentence offers one.
matcher._extractedCausals