# Source code for src.dackar.causal.CausalPhrase

# Copyright 2024, Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED
"""
Created on March, 2024

@author: wangc, mandd
"""
import logging
import pandas as pd
import re
from spacy.tokens import Token
from spacy.tokens import Span

from ..text_processing.Preprocessing import Preprocessing
from ..utils.utils import getOnlyWords, getShortAcronym
from ..config import nlpConfig
from .CausalBase import CausalBase


[docs]
# Module-level logger, named after this module so records can be filtered per module
logger = logging.getLogger(__name__)


# Register the custom "._." attributes used by this module on both Span and Token.
# Each name is registered only when it is not already present (has_extension guard),
# with the default value listed next to it.
for _spacyType in (Span, Token):
  for _extName, _extDefault in (
    ('conjecture', False),
    ("status", None),
    ("neg", None),
    ("neg_text", None),
    ("alias", None),
  ):
    if not _spacyType.has_extension(_extName):
      _spacyType.set_extension(_extName, default=_extDefault)
# keep the module namespace free of the loop helpers
del _spacyType, _extName, _extDefault



[docs]
class CausalPhrase(CausalBase):
  """
    Class to process short phrase dataset
  """
  def __init__(self, nlp, entID='SSC', causalKeywordID='causal', *args, **kwargs):
    """
      Construct the phrase based causal extractor, all arguments are forwarded to CausalBase

      Args:

        nlp: spacy.Language object, contains all components and data needed to process text
        entID: str, entity ID forwarded to CausalBase (presumably stored as self._entID, which
          this class uses to index self._entityLabels -- confirm in CausalBase)
        causalKeywordID: str, causal keyword ID forwarded to CausalBase
        args: list, positional arguments
        kwargs: dict, keyword arguments

      Returns:

        None
    """
    # Forward the value provided by the caller: the previous implementation always passed
    # the literal 'causal', so a user supplied causalKeywordID was silently ignored
    super().__init__(nlp, entID, *args, causalKeywordID=causalKeywordID, **kwargs)


[docs]
  def reset(self):
    """
      Reset the object: run the reset of the parent class first, then discard the
      entity status table stored in self._entStatus

      Args:

        None

      Returns:

        None
    """
    super().reset()
    # forget the table produced by a previous call of extractInformation
    self._entStatus = None



[docs]
  def addKeywords(self, keywords, ktype):
    """
      Method to update self._statusKeywords with user provided keywords

      Args:

        keywords: dict, keywords that will be added to self._statusKeywords, each value is
          either a single keyword or a list of keywords; keys must already exist in
          self._statusKeywords, otherwise the entry is skipped with a warning
        ktype: string, type of the keywords, only 'status' (case-insensitive) is processed
          by this method, any other value is reported with a warning and ignored

      Returns:

        None

      Raises:

        IOError: if keywords is not a dictionary
    """
    if not isinstance(keywords, dict):
      raise IOError('"addKeywords" method can only accept dictionary, but got {}'.format(type(keywords)))
    if ktype.lower() != 'status':
      # previously any other type was dropped without any feedback to the user
      logger.warning('keyword type "{}" is not processed by "addKeywords", only "status" is accepted'.format(ktype))
      return
    for key, val in keywords.items():
      if not isinstance(val, list):
        val = [val]
      val = self.extractLemma(val)
      if key in self._statusKeywords:
        # NOTE(review): the whole result of extractLemma is appended as ONE element;
        # confirm against CausalBase whether "extend" is the intended operation
        self._statusKeywords[key].append(val)
      else:
        logger.warning('keyword "{}" cannot be accepted, valid keys for the keywords are "{}"'.format(key, ','.join(list(self._statusKeywords.keys()))))



[docs]
  def extractInformation(self):
    """
      Extract the health status of the custom entities and the general entity relations
      from self._matchedSents. Results are stored in self._entStatus and
      self._relationGeneral (pandas.DataFrame) and written to csv files when
      'output_status_file' / 'output_relation_file' are present in nlpConfig['files']

      Args:

        None

      Returns:

        None
    """
    # Fixed schema of the status table. It is given to the DataFrame explicitly so that
    # the table keeps its columns when no status is found; without it an empty table has
    # no columns and "to_csv(columns=...)" fails with KeyError
    statusColumns = ['entity', 'label', 'alias', 'status', 'conjecture', 'negation', 'negation_text']

    ## health status
    logger.info('Start to extract health status')
    self.extractHealthStatus(self._matchedSents)

    rows = []
    for sent in self._matchedSents:
      ents = self.getCustomEnts(sent.ents, self._entityLabels[self._entID])
      # same guard as in extractHealthStatus: nothing to report for this sentence
      if ents is None:
        continue
      # only entities that received a status are reported
      rows.extend(
        {'entity':ent.text, 'label': ent.label_, 'alias':ent._.alias, 'status':ent._.status,
         'conjecture':ent._.conjecture, 'negation':ent._.neg, 'negation_text': ent._.neg_text}
        for ent in ents if ent._.status is not None
      )
    self._entStatus = pd.DataFrame(rows, columns=statusColumns)

    if 'output_status_file' in nlpConfig['files']:
      self._entStatus.to_csv(nlpConfig['files']['output_status_file'], columns=statusColumns)
    logger.info('End of health status extraction!')

    # Extract entity relations
    logger.info('Start to extract general entity relation')
    self.extractRelDep(self._matchedSents)
    self._relationGeneral = pd.DataFrame(self._allRelPairs, columns=self._relationNames)

    if 'output_relation_file' in nlpConfig['files']:
      self._relationGeneral.to_csv(nlpConfig['files']['output_relation_file'], columns=self._relationNames)
    logger.info('End of general entity relation extraction!')



[docs]
  def extractHealthStatus(self, matchedSents, predSynonyms=[], exclPrepos=[]):
    """
      Extract the health status of the custom entities found in each matched sentence.
      For every entity the sentence level negation and conjecture information is stored
      in the "neg", "neg_text" and "conjecture" extensions, and the status text (when it
      can be determined) in the "status" extension

      Args:

        matchedSents: list, the matched sentences (spacy Span objects)
        predSynonyms: list, predicate synonyms (not used by this implementation)
        exclPrepos: list, exclude the prepositions (not used by this implementation)

      Returns:

        None
    """
    subjList = ['nsubj', 'nsubjpass', 'nsubj:pass']
    objList = ['pobj', 'dobj', 'iobj', 'obj', 'obl', 'oprd']

    def textAfterEntity(sent, ent):
      """
        Text of the sentence that follows the entity
      """
      # Slicing a Span is relative to the Span itself while ent.end is a token index of
      # the whole document, the index is shifted so sentences that do not start at the
      # first token of the document are handled correctly
      return sent[ent.end - sent.start:].text

    def textWithAlias(sent, ent):
      """
        Text of the whole sentence with the entity ID substituted by its alias
      """
      alias = ent._.alias
      # the entity text is escaped and the alias is provided through a callable, so that
      # characters such as ".", "+", "(" or "\" are treated literally by the regex engine
      return re.sub(r"\b%s\b" % re.escape(str(ent.text)), lambda _: alias, sent.text)

    # procedure to process CWS data
    # collect status, negation, conjecture information
    for sent in matchedSents:
      ents = self.getCustomEnts(sent.ents, self._entityLabels[self._entID])
      root = sent.root
      neg, negText = self.isNegation(root)
      conjecture = self.isConjecture(root)
      if ents is None:
        continue
      for ent in ents:
        ent._.set('neg', neg)
        ent._.set('neg_text', negText)
        ent._.set('conjecture', conjecture)
        if ent._.alias is not None:
          # entity at the beginning of sentence
          if ent.start == sent.start:
            # some clean up for the text
            text = self._textProcess(textAfterEntity(sent, ent))
            ent._.set('status', text)
          # entity at the end of sentence (a trailing punctuation token is tolerated)
          elif ent.end == sent.end or (ent.end == sent.end - 1 and sent[-1].is_punct):
            # substitute entity ID with its alias
            text = self._textProcess(textWithAlias(sent, ent))
            ent._.set('status', text)
          # entity in the middle of sentence
          else:
            entRoot = ent.root
            # Only include Pred and Obj info
            if entRoot.dep_ in subjList:
              # some clean up for the text
              text = self._textProcess(textAfterEntity(sent, ent))
              ent._.set('status', text)
            # Include the whole info with alias substitution
            elif entRoot.dep_ in objList:
              text = getOnlyWords(textWithAlias(sent, ent))
              text = self._textProcess(text)
              ent._.set('status', text)
        # other type of entities
        else:
          entRoot = ent.root
          if entRoot.dep_ in subjList:
            # depend on the application, can use self.getHealthStatusForSubj to get the status
            # some clean up for the text
            text = self._textProcess(textAfterEntity(sent, ent))
            ent._.set('status', text)
          # Include the whole sentence, no alias is available for substitution
          elif entRoot.dep_ in objList:
            # depend on the application, can use self.getHealthStatusForObj to get the status
            text = getOnlyWords(sent.text)
            text = self._textProcess(text)
            ent._.set('status', text)
          else:
            # if the entity not among subj and obj, it may not need to report it
            pass