Custom Pipelines Demo

  • normEntities: Normalize named entities by removing the leading article and the trailing particle

  • initCoref: Initialize Coreference Attributes with Entity Info

  • anaphorCoref: Anaphora resolution using coreferee

  • expandEntities: Expand the current entities by repeatedly extending each entity with the NOUN tokens that immediately precede it

[1]:
import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy import displacy
import coreferee

#### Custom spaCy Token extensions used together with coreferee
# Drop registrations left over from an earlier run of this cell before registering again.
for extName in ('ref_n', 'ref_t', 'ref_t_'):
  if Token.has_extension(extName):
    _ = Token.remove_extension(extName)
# Both attributes start out as the empty string on every token.
for extName in ('ref_n', 'ref_t'):
  Token.set_extension(extName, default='')

nlp = spacy.load("en_core_web_lg")

Internally Developed Functions

[2]:
# Function used to display NER entities
def displayNER(doc, includePunct=False):
  """
    Generate data frame for visualization of spaCy doc with custom attributes.
    @ In, doc, spacy.tokens.doc.Doc, the processed document
    @ In, includePunct, bool, optional, if True punctuation tokens are listed as well
    @ Out, df, pandas.DataFrame, one row per listed token, indexed by the token position
      in doc; an empty data frame when no token is listed
  """
  rows = []
  for i, t in enumerate(doc):
    if t.is_punct and not includePunct:
      continue
    row = {'token': i,
           'text': t.text, 'lemma': t.lemma_,
           'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
           'ent_iob_': t.ent_iob_}
    if doc.has_extension('coref_chains'):
      if t.has_extension('coref_chains') and t._.coref_chains: # coreferee attributes
        row['coref_chains'] = t._.coref_chains.pretty_representation
      else:
        row['coref_chains'] = None
    if t.has_extension('ref_n'): # referent attribute
      row['ref_n'] = t._.ref_n
      row['ref_t'] = t._.ref_t
    if t.has_extension('ref_ent'): # ref_n/ref_t
      row['ref_ent'] = t._.ref_ent
    rows.append(row)
  if not rows:
    # A frame built from no rows has no 'token' column, so set_index would raise KeyError.
    return pd.DataFrame()
  df = pd.DataFrame(rows).set_index('token')
  df.index.name = None
  return df


# Reset Pipelines
def resetPipeline(nlp, pipes):
  """
    Strip every non-core component from the pipeline, then append the requested ones.
    @ In, nlp, object exposing "pipeline", "remove_pipe" and "add_pipe" (the loaded spaCy pipeline)
    @ In, pipes, list, names of the components to add, in the given order
    @ Out, None
  """
  corePipes = ('tagger', 'parser', 'tok2vec', 'attribute_ruler', 'lemmatizer')
  # Collect the names up front: nlp.pipeline changes while components are being removed.
  toRemove = []
  for name, _ in nlp.pipeline:
    if name not in corePipes:
      toRemove.append(name)
  for name in toRemove:
    nlp.remove_pipe(name)
  # re-add specified pipes
  for name in pipes:
    nlp.add_pipe(name)

# Print Dependency Tree
def printDepTree(doc, skipPunct=True):
  """
    Print the dependency tree of every sentence in doc, one token per line,
    indented by four spaces per tree level.
    @ In, doc, object exposing "sents", each sentence providing a "root" token
    @ In, skipPunct, bool, optional, if True tokens with dependency "punct" are not printed
    @ Out, None
  """
  for sent in doc.sents: # iterate over all sentences in a doc
    # Depth-first, pre-order walk: a token, then its left children, then its right children.
    pending = [(sent.root, 0)]
    while pending:
      node, indent = pending.pop()
      if not (skipPunct and node.dep_ == 'punct'):
        print(" "*indent + f"{node} [{node.pos_}, {node.dep_}]")
      children = [*node.lefts, *node.rights]
      # Push in reverse so that the left-most child is popped, and printed, first.
      pending.extend((child, indent+4) for child in reversed(children))

Internally Developed Pipelines

[3]:
# Normalize named entities: strip the leading article and the trailing particle
@Language.component("normEntities")
def normEntities(doc):
  """
    Trim every named entity: drop one leading determiner (POS "DET") and one
    trailing particle (POS "PART"). Entities left without tokens are discarded.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  trimmed = []
  for ent in doc.ents:
    start, end = ent.start, ent.end
    if doc[start].pos_ == "DET": # leading article
      start += 1
    if start < end and doc[end-1].pos_ == "PART": # trailing particle like 's
      end -= 1
    if start == end:
      # nothing left of the entity after trimming
      continue
    if (start, end) == (ent.start, ent.end):
      # untouched entities are kept as the original Span object
      trimmed.append(ent)
    else:
      trimmed.append(Span(doc, start, end, label=ent.label))
  doc.ents = tuple(trimmed)
  return doc

# Initialize Coreference Attributes with Entity Info
@Language.component("initCoref")
def initCoref(doc):
  """
    Seed the custom attributes "ref_n" and "ref_t" on the first token of every
    entity with the entity text and the entity label, respectively.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  for ent in doc.ents:
    name, label = ent.text, ent.label_
    firstToken = ent[0]
    firstToken._.ref_n = name
    firstToken._.ref_t = label
  return doc

# Anaphora resolution using coreferee
@Language.component("anaphorCoref")
def anaphorCoref(doc):
  """
    Anaphora resolution using coreferee
    This pipeline needs to be added after NER, "initCoref" and "coreferee".
    The assumption here is: the entities are recognized first, then pipeline
    "initCoref" assigns the initial custom attributes "ref_n" and "ref_t" (any alias
    resolution pipeline, e.g. "aliasResolver", which is not defined in this notebook,
    is expected to run before this one as well). After these pre-processes, every
    token that belongs to a coreference chain and has no "ref_n" yet receives the
    "ref_n"/"ref_t" of the first mention in its chains that already carries one.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  if not Token.has_extension('coref_chains'):
    # coreferee is not part of the pipeline, nothing to resolve
    return doc
  for token in doc:
    chains = token._.coref_chains
    # only tokens that are part of a chain and not already dereferenced
    if not chains or token._.ref_n != '':
      continue
    # mention[0] is used as the doc index of the token that represents the mention
    referents = (doc[mention[0]] for chain in chains for mention in chain)
    # Stop at the very first referent with a "ref_n": a match in an earlier chain
    # must not be overwritten by a match in a later chain.
    resolved = next((ref for ref in referents if ref._.ref_n != ''), None)
    if resolved is not None:
      token._.ref_n = resolved._.ref_n
      token._.ref_t = resolved._.ref_t
  return doc

# Expand the current entities: extend each "SSC" entity with all directly preceding NOUN tokens
@Language.component("expandEntities")
def expandEntities(doc):
  """
    Expand the current entities: every entity labeled "SSC" is extended to the left
    for as long as the token right before it is a NOUN. All other entities, and "SSC"
    entities that cannot be extended, are kept unchanged. An entity is never extended
    into the preceding entity, because doc.ents must not contain overlapping spans.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  newEnts = []
  prevEnd = 0 # end offset of the previously kept entity (0: start of the doc)
  for ent in doc.ents:
    start = ent.start
    if ent.label_ == "SSC":
      # walk left over consecutive NOUN tokens, stopping at the doc start or the previous entity
      while start > prevEnd and doc[start - 1].pos_ == 'NOUN':
        start -= 1
    if start != ent.start:
      ent = Span(doc, start, ent.end, label=ent.label)
    # always keep the entity, expanded or not
    newEnts.append(ent)
    prevEnd = ent.end
  doc.ents = newEnts
  return doc

Reset NLP Pipeline

[4]:
# Components to install after the core pipes, listed in execution order:
# entity recognition and clean-up first, then coreference, then entity expansion.
pipelines = ['entity_ruler','normEntities', 'initCoref', 'coreferee','anaphorCoref', 'expandEntities']
resetPipeline(nlp, pipelines)
# Show the resulting pipeline as (name, component) pairs.
nlp.pipeline
[4]:
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x326c5f470>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x326c5ee70>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x320b5ec70>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x326dd0750>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x326e0ff10>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x326abb610>),
 ('normEntities', <function __main__.normEntities(doc)>),
 ('initCoref', <function __main__.initCoref(doc)>),
 ('coreferee', <coreferee.manager.CorefereeBroker at 0x10520b810>),
 ('anaphorCoref', <function __main__.anaphorCoref(doc)>),
 ('expandEntities', <function __main__.expandEntities(doc)>)]

Example

[5]:
# Example input processed by the pipeline in the cells below.
text = r"""A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
          The RCP pump 1A pressure gauge was found inoperative.
          Rupture of pump bearings caused shaft degradation.
          Rupture of pump bearings caused shaft degradation and consequent flow reduction.
          Pump power supply has been found burnout.
          Pump test failed due to power supply failure.
          Pump inspection revealed excessive impeller degradation.
          Pump inspection revealed excessive impeller degradation likely due to cavitation.
        """
[6]:
# Entity ruler pattern: the token "gauge" (case-insensitive) becomes an entity with label "comp".
patterns = [{"label":"comp", "pattern":[{"LOWER":"gauge"}], "id":"ssc"}]
ruler = nlp.get_pipe('entity_ruler')
ruler.add_patterns(patterns)
# Separate token Matcher: finds "pump" (case-insensitive); its matches are not written to doc.ents.
rules = [{"LOWER":"pump"}]
matcher = Matcher(nlp.vocab)
matcher.add('comp', [rules])

doc = nlp(text)
# as_spans=True returns Span objects labeled with the match key ('comp').
matches = matcher(doc, as_spans=True)
print('Identified Entities:')
for span in matches:
    print('Entity:', span.text, '| Label:', span.label_, '| Sentence', span.sent)

# Render the entities stored in doc.ents.
displacy.render(doc, style='ent', jupyter=True)

print('Dependency Tree:')
printDepTree(doc)
Identified Entities:
Entity: pump | Label: comp | Sentence A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.

Entity: pump | Label: comp | Sentence A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.

Entity: pump | Label: comp | Sentence The RCP pump 1A pressure gauge was found inoperative.

Entity: pump | Label: comp | Sentence Rupture of pump bearings caused shaft degradation.

Entity: pump | Label: comp | Sentence Rupture of pump bearings caused shaft degradation and consequent flow reduction.

Entity: Pump | Label: comp | Sentence Pump power supply has been found burnout.

Entity: Pump | Label: comp | Sentence Pump test failed due to power supply failure.

Entity: Pump | Label: comp | Sentence Pump inspection revealed excessive impeller degradation.

Entity: Pump | Label: comp | Sentence Pump inspection revealed excessive impeller degradation likely due to cavitation.

A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge comp was found not operating, and it was found inoperative.
The RCP pump 1A pressure gauge comp was found inoperative.
Rupture of pump bearings caused shaft degradation.
Rupture of pump bearings caused shaft degradation and consequent flow reduction.
Pump power supply has been found burnout.
Pump test failed due to power supply failure.
Pump inspection revealed excessive impeller degradation.
Pump inspection revealed excessive impeller degradation likely due to cavitation.
Dependency Tree:
noticed [VERB, ROOT]
    leak [NOUN, nsubjpass]
        A [DET, det]
    was [AUX, auxpass]
    from [ADP, prep]
        RCP [PROPN, pobj]
            the [DET, det]
    pump [VERB, conj]
        1A. [NUM, dobj]

           [SPACE, dep]
            pump [VERB, relcl]
                RCP [PROPN, nsubj]
                    The [DET, det]
        found [VERB, conj]
            gauge [NOUN, nsubjpass]
                1A [NOUN, compound]
                pressure [NOUN, compound]
            was [AUX, auxpass]
            operating [VERB, xcomp]
                not [PART, neg]
    and [CCONJ, cc]
    found [VERB, conj]
        it [PRON, nsubjpass]
        was [AUX, auxpass]
        inoperative [ADJ, oprd]

           [SPACE, dep]
pump [VERB, ROOT]
    RCP [PROPN, nsubj]
        The [DET, det]
    found [VERB, conj]
        gauge [NOUN, nsubjpass]
            1A [NOUN, compound]
            pressure [NOUN, compound]
        was [AUX, auxpass]
        inoperative [ADJ, oprd]

           [SPACE, dep]
caused [VERB, ROOT]
    Rupture [NOUN, nsubj]
        of [ADP, prep]
            bearings [NOUN, pobj]
                pump [NOUN, compound]
    degradation [NOUN, dobj]
        shaft [NOUN, compound]

           [SPACE, dep]
caused [VERB, ROOT]
    Rupture [NOUN, nsubj]
        of [ADP, prep]
            bearings [NOUN, pobj]
                pump [NOUN, compound]
    degradation [NOUN, dobj]
        shaft [NOUN, compound]
        and [CCONJ, cc]
        reduction [NOUN, conj]
            flow [NOUN, compound]
                consequent [ADJ, amod]

           [SPACE, dep]
found [VERB, ROOT]
    supply [NOUN, nsubjpass]
        Pump [NOUN, compound]
        power [NOUN, compound]
    has [AUX, aux]
    been [AUX, auxpass]
    burnout [NOUN, oprd]

           [SPACE, dep]
failed [VERB, ROOT]
    test [NOUN, nsubj]
        Pump [NOUN, compound]
    due [ADP, prep]
        to [ADP, pcomp]
        failure [NOUN, pobj]
            supply [NOUN, compound]
                power [NOUN, compound]

           [SPACE, dep]
revealed [VERB, ROOT]
    inspection [NOUN, nsubj]
        Pump [NOUN, compound]
    degradation [NOUN, dobj]
        excessive [ADJ, amod]
        impeller [NOUN, compound]

           [SPACE, dep]
revealed [VERB, ROOT]
    inspection [NOUN, nsubj]
        Pump [NOUN, compound]
    degradation [NOUN, dobj]
        excessive [ADJ, amod]
        impeller [NOUN, compound]
    likely [ADV, ccomp]
        due [ADP, prep]
            to [ADP, pcomp]
            cavitation [NOUN, pobj]

         [SPACE, dep]
[7]:
# Tabulate the token attributes of the processed example (punctuation is left out by default).
df = displayNER(doc)
df
[7]:
text lemma pos dep ent_type ent_iob_ coref_chains ref_n ref_t
0 A a DET det O None
1 leak leak NOUN nsubjpass O None
2 was be AUX auxpass O None
3 noticed notice VERB ROOT O None
4 from from ADP prep O None
... ... ... ... ... ... ... ... ... ...
94 likely likely ADV ccomp O None
95 due due ADP prep O None
96 to to ADP pcomp O None
97 cavitation cavitation NOUN pobj O None
99 \n \n SPACE dep O None

91 rows × 9 columns

[8]:
# Chains found by coreferee for the whole document.
print('Coreference Info: \n', doc._.coref_chains.pretty_representation)

# Token 22 is the pronoun "it" in the example text (see the chain "gauge(15), it(22)" printed above);
# its "ref_n" shows the referent assigned by anaphorCoref.
print(f'Label for token "{doc[22]}" is "{doc[22]._.ref_n}"')
Coreference Info:
 0: RCP(6), RCP(11), RCP(29); 1: gauge(15), it(22)
Label for token "it" is "gauge"