[1]:
import pandas as pd
import spacy
from spacy.tokens import Span
[2]:
def displayNER(doc, includePunct=False):
  """
    Generate a pandas DataFrame for visualization of a spaCy doc with custom attributes.
    @ In, doc, spacy.tokens.doc.Doc, the processed document
    @ In, includePunct, bool, optional, include punctuation tokens when True
    @ Out, df, pandas.DataFrame, one row per (non-punct) token, indexed by token position
  """
  rows = []
  for i, t in enumerate(doc):
    if not t.is_punct or includePunct:
      row = {'token': i,
             'text': t.text, 'lemma': t.lemma_,
             'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
             'ent_iob_': t.ent_iob_}
      if doc.has_extension('coref_chains'):
        # coreferee attributes (registered on both Doc and Token)
        if t.has_extension('coref_chains') and t._.coref_chains:
          row['coref_chains'] = t._.coref_chains.pretty_representation
        else:
          row['coref_chains'] = None
      if t.has_extension('ref_n'): # referent attributes set by the custom pipes
        row['ref_n'] = t._.ref_n
        row['ref_t'] = t._.ref_t
      if t.has_extension('ref_ent'):
        row['ref_ent'] = t._.ref_ent
      rows.append(row)
  if not rows:
    # BUGFIX: an empty doc (or all-punctuation doc) used to raise KeyError,
    # because set_index('token') cannot find the column on an empty frame
    return pd.DataFrame()
  df = pd.DataFrame(rows).set_index('token')
  df.index.name = None

  return df
[3]:
def resetPipeline(nlp, pipes):
  """
    Remove all custom pipes from the nlp pipeline, then add the requested ones.
    @ In, nlp, spacy.language.Language, the loaded spaCy model
    @ In, pipes, list, names of (registered) pipes to add back after the reset
    @ Out, None
  """
  corePipes = ('tagger', 'parser', 'tok2vec', 'attribute_ruler', 'lemmatizer')
  # everything that is not a core pipe is considered custom and gets removed
  toRemove = [name for name, _ in nlp.pipeline if name not in corePipes]
  for name in toRemove:
    _ = nlp.remove_pipe(name)
  # re-add specified pipes
  for name in pipes:
    nlp.add_pipe(name)
  logger.info(f"Model: {nlp.meta['name']}, Language: {nlp.meta['lang']}")
  logger.info('\n'.join([name for (name, _) in nlp.pipeline]))
[4]:
def printDepTree(doc, skipPunct=True):
  """
    Utility function to pretty print the dependency tree of every sentence.
    @ In, doc, spacy.tokens.doc.Doc, the parsed document
    @ In, skipPunct, bool, optional, omit punctuation tokens when True
    @ Out, None
  """
  def walk(node, indent):
    # print the node unless it is punctuation we were asked to skip
    if node.dep_ != 'punct' or not skipPunct:
      print(" "*indent + f"{node} [{node.pos_}, {node.dep_}]")
    # left children first, then right children, each one level deeper
    for child in node.lefts:
      walk(child, indent + 4)
    for child in node.rights:
      walk(child, indent + 4)

  # iterate over all sentences in the doc, printing each subtree from its root
  for sent in doc.sents:
    walk(sent.root, indent=0)

## Custom pipelines

[5]:
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy.tokens import Token
[6]:
# entity labels used by the custom pipes below
customLabel = ['STRUCTURE', 'COMPONENT', 'SYSTEM']
# maps alias text -> (name, type); consumed by the "aliasResolver" pipe
aliasLookup = {}
[7]:
@Language.component("normEntities")
def normEntities(doc):
  """
    Normalize named entities: strip a leading article (DET) and a trailing
    particle (PART, e.g. possessive 's) from each entity span.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  normalized = []
  for ent in doc.ents:
    span = ent
    if span[0].pos_ == "DET":
      # drop the leading article
      span = Span(doc, span.start+1, span.end, label=span.label)
    if len(span) == 0:
      continue
    if span[-1].pos_ == "PART":
      # drop the trailing particle like 's
      span = Span(doc, span.start, span.end-1, label=span.label)
    if len(span) > 0:
      normalized.append(span)
  doc.ents = tuple(normalized)
  return doc
[8]:
@Language.component("initCoref")
def initCoref(doc):
  """
    Seed the custom referent attributes: for each entity, store its text and
    label on the first token's ref_n / ref_t extensions.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  for ent in doc.ents:
    first = ent[0]
    first._.ref_n = ent.text
    first._.ref_t = ent.label_
  return doc
[9]:
@Language.component("aliasResolver")
def aliasResolver(doc):
  """
    Look up each entity's first token in the alias table and, on a hit, store
    the resolved (name, type) in the token's ref_n / ref_t extensions, then
    propagate the resolved types onto the entity spans.
    NOTE(review): the lookup key is the FIRST token's text, not the full
    entity text, so multi-token aliases would never match — confirm intended.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  for ent in doc.ents:
    alias = aliasLookup.get(ent[0].text)
    if alias is not None:
      ent[0]._.ref_n, ent[0]._.ref_t = alias
  return propagateEntType(doc)
[10]:
def propagateEntType(doc):
  """
    Re-label every entity whose first token already carries a referent name
    (ref_n), propagating the stored type (ref_t) onto the whole entity span.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  updated = []
  for ent in doc.ents:
    head = ent[0]
    if head._.ref_n != '': # token was dereferenced earlier in the pipeline
      ent = Span(doc, ent.start, ent.end, label=head._.ref_t)
    updated.append(ent)
  doc.ents = tuple(updated)
  return doc
[11]:
@Language.component("anaphorCoref")
def anaphorCoref(doc):
  """
    Anaphora resolution using coreferee
    This pipeline need to be added after NER.
    The assumption here is: The entities need to be recognized first, then call
    pipeline "initCoref" to assign initial custom attribute "ref_n" and "ref_t",
    then call pipeline "aliasResolver" to resolve all the aliases used in the text.
    After all these pre-processes, we can use "anaphorCoref" pipeline to resolve the
    coreference.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  # bail out if coreferee never registered its Token extension
  if not Token.has_extension('coref_chains'):
    return doc
  for token in doc:
    coref = token._.coref_chains
    # if token is coref and not already dereferenced
    if coref and token._.ref_n == '':
      # check all the references, if "ref_n" is available (determined by NER and initCoref),
      # the value of "ref_n" will be assigned to current token
      # NOTE(review): the break below exits only the inner mention loop, so a
      # later chain can still overwrite ref_n/ref_t — confirm this is intended
      for chain in coref:
        for ref in chain:
          # ref is a mention (sequence of token indices); use its first token
          refToken = doc[ref[0]]
          if refToken._.ref_n != '':
            token._.ref_n = refToken._.ref_n
            token._.ref_t = refToken._.ref_t
            break
  return doc
[12]:
@Language.component("expandEntities")
def expandEntities(doc):
  """
    Expand "SSC" entities to include an immediately preceding NOUN token,
    repeating until no entity can be extended further.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  isUpdated = True
  while isUpdated: # iterate instead of recursing; each pass may grow spans by one token
    isUpdated = False
    newEnts = []
    for ent in doc.ents:
      prevToken = doc[ent.start - 1] if ent.start != 0 else None
      if ent.label_ == "SSC" and prevToken is not None and prevToken.pos_ == 'NOUN':
        # grow the span one token to the left, keeping the same label
        newEnts.append(Span(doc, ent.start - 1, ent.end, label=ent.label))
        isUpdated = True
      else:
        # BUGFIX: previously an "SSC" entity whose preceding token was not a
        # NOUN was silently dropped from doc.ents; keep it unchanged instead
        newEnts.append(ent)
    doc.ents = tuple(newEnts)
  return doc
[13]:
import coreferee, spacy
# large English model; requires `python -m spacy download en_core_web_lg`
nlp = spacy.load("en_core_web_lg")
import logging
logger = logging.getLogger(__name__)
[14]:
# echo log records to the console so logger.info calls in resetPipeline show up
ch = logging.StreamHandler()
logger.addHandler(ch)
[15]:
#### Using spacy's Token extensions for coreferee
# remove stale extensions first so re-running this cell does not raise
if Token.has_extension('ref_n'):
  _ = Token.remove_extension('ref_n')
if Token.has_extension('ref_t'):
  _ = Token.remove_extension('ref_t')
if Token.has_extension('ref_t_'):
  # NOTE(review): 'ref_t_' is removed but never re-registered below — confirm
  _ = Token.remove_extension('ref_t_')
Token.set_extension('ref_n', default='')
Token.set_extension('ref_t', default='')
[16]:
# pipe order matters: entities first, then coref init/alias, then coreferee
pipelines = ['entity_ruler','normEntities', 'initCoref', 'aliasResolver', 'coreferee','anaphorCoref', 'expandEntities']
[17]:
pipelines
[17]:
['entity_ruler',
 'normEntities',
 'initCoref',
 'aliasResolver',
 'coreferee',
 'anaphorCoref',
 'expandEntities']
[18]:
resetPipeline(nlp, pipelines)
[19]:
nlp.pipeline
[19]:
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x128033a70>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x128033470>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x123e4e810>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1282128d0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x12815f4d0>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x127eb2310>),
 ('normEntities', <function __main__.normEntities(doc)>),
 ('initCoref', <function __main__.initCoref(doc)>),
 ('aliasResolver', <function __main__.aliasResolver(doc)>),
 ('coreferee', <coreferee.manager.CorefereeBroker at 0x12a781a90>),
 ('anaphorCoref', <function __main__.anaphorCoref(doc)>),
 ('expandEntities', <function __main__.expandEntities(doc)>)]
[20]:
text = r"""A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
          The RCP pump 1A pressure gauge was found inoperative.
          Rupture of pump bearings caused shaft degradation.
          Rupture of pump bearings caused shaft degradation and consequent flow reduction.
          Pump power supply has been found burnout.
          Pump test failed due to power supply failure.
          Pump inspection revealed excessive impeller degradation.
          Pump inspection revealed excessive impeller degradation likely due to cavitation.
        """
[21]:
# entity_ruler pattern: tag the single token "gauge" as a "comp" entity
patterns = [{"label":"comp", "pattern":[{"LOWER":"gauge"}], "id":"ssc"}]
ruler = nlp.get_pipe('entity_ruler')
ruler.add_patterns(patterns)
# separate Matcher that flags every occurrence of the token "pump"
rules = [{"LOWER":"pump"}]
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
matcher.add('comp', [rules])
[22]:
doc = nlp(text)
[23]:
# show each "pump" match together with its containing sentence
matches = matcher(doc, as_spans=True)
for span in matches:
    print(span.sent, span.label_)
A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
           comp
A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
           comp
The RCP pump 1A pressure gauge was found inoperative.
           comp
Rupture of pump bearings caused shaft degradation.
           comp
Rupture of pump bearings caused shaft degradation and consequent flow reduction.
           comp
Pump power supply has been found burnout.
           comp
Pump test failed due to power supply failure.
           comp
Pump inspection revealed excessive impeller degradation.
           comp
Pump inspection revealed excessive impeller degradation likely due to cavitation.
         comp
[24]:
print(type(doc.ents))
<class 'tuple'>
[25]:
from spacy import displacy
[26]:
displacy.render(doc, style='ent', jupyter=True)
A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge comp was found not operating, and it was found inoperative.
The RCP pump 1A pressure gauge comp was found inoperative.
Rupture of pump bearings caused shaft degradation.
Rupture of pump bearings caused shaft degradation and consequent flow reduction.
Pump power supply has been found burnout.
Pump test failed due to power supply failure.
Pump inspection revealed excessive impeller degradation.
Pump inspection revealed excessive impeller degradation likely due to cavitation.
[27]:
# BUGFIX: each dict in a token pattern matches exactly ONE token, so
# {"LOWER": "pressure gauge"} (text containing a space) could never match;
# split the phrase into one spec per token.
patterns = [{"label":"comp", "pattern":[{"LOWER":"pressure"}, {"LOWER":"gauge"}, {"POS":"NOUN"}], "id":"ssc"}]
[28]:
printDepTree(doc)
noticed [VERB, ROOT]
    leak [NOUN, nsubjpass]
        A [DET, det]
    was [AUX, auxpass]
    from [ADP, prep]
        RCP [PROPN, pobj]
            the [DET, det]
    pump [VERB, conj]
        1A. [NUM, dobj]

           [SPACE, dep]
            pump [VERB, relcl]
                RCP [PROPN, nsubj]
                    The [DET, det]
        found [VERB, conj]
            gauge [NOUN, nsubjpass]
                1A [NOUN, compound]
                pressure [NOUN, compound]
            was [AUX, auxpass]
            operating [VERB, xcomp]
                not [PART, neg]
    and [CCONJ, cc]
    found [VERB, conj]
        it [PRON, nsubjpass]
        was [AUX, auxpass]
        inoperative [ADJ, oprd]

           [SPACE, dep]
pump [VERB, ROOT]
    RCP [PROPN, nsubj]
        The [DET, det]
    found [VERB, conj]
        gauge [NOUN, nsubjpass]
            1A [NOUN, compound]
            pressure [NOUN, compound]
        was [AUX, auxpass]
        inoperative [ADJ, oprd]

           [SPACE, dep]
caused [VERB, ROOT]
    Rupture [NOUN, nsubj]
        of [ADP, prep]
            bearings [NOUN, pobj]
                pump [NOUN, compound]
    degradation [NOUN, dobj]
        shaft [NOUN, compound]

           [SPACE, dep]
caused [VERB, ROOT]
    Rupture [NOUN, nsubj]
        of [ADP, prep]
            bearings [NOUN, pobj]
                pump [NOUN, compound]
    degradation [NOUN, dobj]
        shaft [NOUN, compound]
        and [CCONJ, cc]
        reduction [NOUN, conj]
            flow [NOUN, compound]
                consequent [ADJ, amod]

           [SPACE, dep]
found [VERB, ROOT]
    supply [NOUN, nsubjpass]
        Pump [NOUN, compound]
        power [NOUN, compound]
    has [AUX, aux]
    been [AUX, auxpass]
    burnout [NOUN, oprd]

           [SPACE, dep]
failed [VERB, ROOT]
    test [NOUN, nsubj]
        Pump [NOUN, compound]
    due [ADP, prep]
        to [ADP, pcomp]
        failure [NOUN, pobj]
            supply [NOUN, compound]
                power [NOUN, compound]

           [SPACE, dep]
revealed [VERB, ROOT]
    inspection [NOUN, nsubj]
        Pump [NOUN, compound]
    degradation [NOUN, dobj]
        excessive [ADJ, amod]
        impeller [NOUN, compound]

           [SPACE, dep]
revealed [VERB, ROOT]
    inspection [NOUN, nsubj]
        Pump [NOUN, compound]
    degradation [NOUN, dobj]
        excessive [ADJ, amod]
        impeller [NOUN, compound]
    likely [ADV, ccomp]
        due [ADP, prep]
            to [ADP, pcomp]
            cavitation [NOUN, pobj]

         [SPACE, dep]
[29]:
df = displayNER(doc)
[30]:
df
[30]:
text lemma pos dep ent_type ent_iob_ coref_chains ref_n ref_t
0 A a DET det O None
1 leak leak NOUN nsubjpass O None
2 was be AUX auxpass O None
3 noticed notice VERB ROOT O None
4 from from ADP prep O None
... ... ... ... ... ... ... ... ... ...
94 likely likely ADV ccomp O None
95 due due ADP prep O None
96 to to ADP pcomp O None
97 cavitation cavitation NOUN pobj O None
99 \n \n SPACE dep O None

91 rows × 9 columns

[31]:
doc._.coref_chains.pretty_representation
[31]:
'0: RCP(6), RCP(11), RCP(29); 1: gauge(15), it(22)'
[32]:
for ent in doc.ents:
    print(ent)
gauge
gauge
[33]:
doc[22]._.ref_n
[33]:
'gauge'
[34]:
# Debug walkthrough of the "anaphorCoref" pipe logic with prints, to inspect
# why some tokens (e.g. the RCP mentions) are not dereferenced.
for token in doc:
    coref = token._.coref_chains

    # if token is coref and not already dereferenced
    if coref and token._.ref_n == '':
      print('token', token)
      # print(token,coref.pretty_representation)
      # check all the references, if "ref_n" is available (determined by NER and initCoref),
      # the value of "ref_n" will be assigned to current token
      for chain in coref:
        for ref in chain:
          refToken = doc[ref[0]]
          print(refToken)
          print(refToken._.ref_n)
          if refToken._.ref_n != '':
            token._.ref_n = refToken._.ref_n
            token._.ref_t = refToken._.ref_t
            break
token RCP
RCP

RCP

RCP

token RCP
RCP

RCP

RCP

token RCP
RCP

RCP

RCP

[35]:
import spacy
from spacy.matcher import Matcher
# NOTE(review): this rebinds `matcher`, clobbering the earlier "pump" matcher
# defined above — confirm that is intended for the remaining cells
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])
[36]:
matcher.get('HelloWorld')
[36]:
(None, [[{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]])
[37]:
doc.ents
[37]:
(gauge, gauge)
[38]:
# collect the distinct sentences that contain at least one entity
sl = []
for ent in doc.ents:
    sent = ent.sent
    if sent not in sl:
        sl.append(sent)
print(sl)
[A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
          , The RCP pump 1A pressure gauge was found inoperative.
          ]
[39]:
for sent in sl:
    print(sent.ents)
    print(set(sent.ents))
[gauge]
{gauge}
[gauge]
{gauge}
[40]:
for sent in sl:
    print(sent.root)
    for token in sent:
        print(token.dep_)
noticed
det
nsubjpass
auxpass
ROOT
prep
det
pobj
conj
dobj
dep
det
nsubj
relcl
compound
compound
nsubjpass
auxpass
conj
neg
xcomp
punct
cc
nsubjpass
auxpass
conj
oprd
punct
dep
pump
det
nsubj
ROOT
compound
compound
nsubjpass
auxpass
conj
oprd
punct
dep
[ ]: