Custom Pipelines Demo¶
normEntities: Normalize named entities by removing the leading article and the trailing particle
initCoref: Initialize the coreference attributes with entity information
anaphorCoref: Resolve anaphora using coreferee
expandEntities: Expand the current entities by prepending every directly preceding NOUN token
[1]:
import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy import displacy
import coreferee
#### Using spacy's Token extensions for coreferee
# Drop any earlier registration of the custom attributes before registering
# them again (Token.set_extension raises if the name already exists and
# force=True is not given).
for extName in ('ref_n', 'ref_t', 'ref_t_'):
    if Token.has_extension(extName):
        _ = Token.remove_extension(extName)

# ref_n / ref_t hold the referent name and type; '' means "not resolved yet".
Token.set_extension('ref_n', default='')
Token.set_extension('ref_t', default='')

nlp = spacy.load("en_core_web_lg")
Internally Developed Functions¶
[2]:
# Function used to display NER entities
def displayNER(doc, includePunct=False):
    """
    Generate data frame for visualization of spaCy doc with custom attributes.

    One row is produced per token, indexed by the token position in the doc.
    The base columns are text, lemma, pos, dep, ent_type and ent_iob_; the
    columns coref_chains, ref_n, ref_t and ref_ent are added only when the
    corresponding extension is registered.

    @ In, doc, spacy.tokens.doc.Doc, the processed document
    @ In, includePunct, bool, if True, punctuation tokens are listed as well
    @ Out, df, pandas.DataFrame, one row per listed token; an empty frame
      when no token is listed
    """
    rows = []
    for i, t in enumerate(doc):
        if t.is_punct and not includePunct:
            continue
        row = {'token': i,
               'text': t.text, 'lemma': t.lemma_,
               'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
               'ent_iob_': t.ent_iob_}
        if doc.has_extension('coref_chains'):
            if t.has_extension('coref_chains') and t._.coref_chains:  # coreferee attributes
                row['coref_chains'] = t._.coref_chains.pretty_representation
            else:
                row['coref_chains'] = None
        if t.has_extension('ref_n'):  # referent attribute
            row['ref_n'] = t._.ref_n
            row['ref_t'] = t._.ref_t
        if t.has_extension('ref_ent'):  # ref_n/ref_t
            row['ref_ent'] = t._.ref_ent
        rows.append(row)
    if not rows:
        # Without rows there is no 'token' column, so set_index would raise
        # a KeyError; return an empty frame instead.
        return pd.DataFrame()
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
# Reset Pipelines
def resetPipeline(nlp, pipes):
    """
    Strip every component except the core ones from the pipeline, then
    append the requested components in the given order.

    @ In, nlp, spacy.language.Language, the pipeline object to modify in place
    @ In, pipes, list, names of the components to add
    @ Out, None
    """
    corePipes = ['tagger', 'parser',
                 'tok2vec', 'attribute_ruler', 'lemmatizer']
    # Collect the names first so the pipeline is not modified while it is
    # being iterated.
    toRemove = []
    for name, _component in nlp.pipeline:
        if name in corePipes:
            continue
        toRemove.append(name)
    for name in toRemove:
        nlp.remove_pipe(name)
    # re-add specified pipes
    for name in pipes:
        nlp.add_pipe(name)
# Print Dependency Tree
def printDepTree(doc, skipPunct=True):
    """
    Pretty print the dependency tree of every sentence in the doc.

    Each token is printed as "text [pos, dep]", indented by four spaces per
    tree level, parent first, then its left children, then its right children.

    @ In, doc, spacy.tokens.doc.Doc, the parsed document
    @ In, skipPunct, bool, if True, tokens with dep_ 'punct' are not printed
      (their children are still visited)
    @ Out, None
    """
    for sentence in doc.sents:  # iterate over all sentences in a doc
        # Explicit stack of (token, indent) pairs; popping from the end gives
        # the same pre-order walk as a recursive descent.
        pending = [(sentence.root, 0)]
        while pending:
            node, indent = pending.pop()
            hidden = node.dep_ == 'punct' and skipPunct
            if not hidden:
                print(" "*indent + f"{node} [{node.pos_}, {node.dep_}]")
            children = list(node.lefts) + list(node.rights)
            # Push in reverse so the first child is handled next.
            for child in reversed(children):
                pending.append((child, indent + 4))
Internally Developed Pipelines¶
[3]:
# Normalize named entities: strip a leading article and a trailing particle
@Language.component("normEntities")
def normEntities(doc):
    """
    Trim each entity span: a first token tagged DET and a last token tagged
    PART are cut off, and spans left empty by the trimming are discarded.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
    """
    kept = []
    for span in doc.ents:
        if span[0].pos_ == "DET":  # leading article
            span = Span(doc, span.start + 1, span.end, label=span.label)
        if len(span) == 0:
            # the entity consisted of the article only
            continue
        if span[-1].pos_ == "PART":  # trailing particle like 's
            span = Span(doc, span.start, span.end - 1, label=span.label)
        if len(span) == 0:
            continue
        kept.append(span)
    doc.ents = tuple(kept)
    return doc
# Initialize Coreference Attributes with Entity Info
@Language.component("initCoref")
def initCoref(doc):
    """
    Seed the custom coreference attributes from the recognized entities: the
    first token of every entity gets the entity text as "ref_n" and the
    entity label as "ref_t".
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
    """
    for entity in doc.ents:
        name, kind = entity.text, entity.label_
        entity[0]._.ref_n = name
        entity[0]._.ref_t = kind
    return doc
# Anaphora resolution using coreferee
@Language.component("anaphorCoref")
def anaphorCoref(doc):
    """
    Anaphora resolution using coreferee
    This pipeline needs to be added after NER and after coreferee.
    The assumption here is: the entities are recognized first, then pipeline
    "initCoref" assigns the initial custom attributes "ref_n" and "ref_t"
    (optionally followed by an alias-resolution pipeline such as
    "aliasResolver", which is not defined in this notebook). After these
    pre-processes, "anaphorCoref" copies "ref_n"/"ref_t" onto every token
    that takes part in a coreference chain and has no referent yet.

    The referent is taken from the first mention, in chain order, whose first
    token already carries a non-empty "ref_n". Tokens without such a mention
    are left unchanged.

    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
    """
    if not Token.has_extension('coref_chains'):
        # coreferee has not registered its attributes, nothing to resolve
        return doc
    for token in doc:
        coref = token._.coref_chains
        # only handle tokens that are part of a chain and not already dereferenced
        if not coref or token._.ref_n != '':
            continue
        # A mention is indexed by the positions of its tokens; the first
        # position is the one checked for "ref_n" (set by NER and initCoref).
        mentionTokens = (doc[mention[0]] for chain in coref for mention in chain)
        # Stop at the first labelled mention over ALL chains, so that a later
        # chain cannot overwrite a referent that has already been found.
        referent = next((t for t in mentionTokens if t._.ref_n != ''), None)
        if referent is not None:
            token._.ref_n = referent._.ref_n
            token._.ref_t = referent._.ref_t
    return doc
# Expand the current entities to the left over all directly preceding NOUN tokens
@Language.component("expandEntities")
def expandEntities(doc):
    """
    Expand every entity labelled "SSC" to the left so that it also covers
    the unbroken run of NOUN tokens directly in front of it.

    Entities with another label, and "SSC" entities that cannot be expanded,
    are kept unchanged. Expansion stops at the start of the doc, at the first
    token that is not a NOUN, and at a token that already belongs to another
    entity (doc.ents does not accept overlapping spans).

    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
    """
    newEnts = []
    for ent in doc.ents:
        start = ent.start
        if ent.label_ == "SSC":
            # Walk backwards token by token; this replaces the former
            # recursion, which re-scanned all entities after each one-token step.
            while start > 0:
                prevToken = doc[start - 1]
                if prevToken.pos_ != 'NOUN':
                    break
                if prevToken.ent_iob_ in ('B', 'I'):
                    # token is part of a neighbouring entity
                    break
                start -= 1
        if start == ent.start:
            # nothing to expand: keep the original span (never drop it)
            newEnts.append(ent)
        else:
            newEnts.append(Span(doc, start, ent.end, label=ent.label))
    doc.ents = newEnts
    return doc
Reset NLP Pipeline¶
[4]:
# Entity recognition and attribute initialization come first, then coreferee,
# then the components that consume its coreference chains.
pipelines = [
    'entity_ruler',
    'normEntities',
    'initCoref',
    'coreferee',
    'anaphorCoref',
    'expandEntities',
]
resetPipeline(nlp, pipelines)
nlp.pipeline
[4]:
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x326c5f470>),
('tagger', <spacy.pipeline.tagger.Tagger at 0x326c5ee70>),
('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x320b5ec70>),
('attribute_ruler',
<spacy.pipeline.attributeruler.AttributeRuler at 0x326dd0750>),
('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x326e0ff10>),
('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x326abb610>),
('normEntities', <function __main__.normEntities(doc)>),
('initCoref', <function __main__.initCoref(doc)>),
('coreferee', <coreferee.manager.CorefereeBroker at 0x10520b810>),
('anaphorCoref', <function __main__.anaphorCoref(doc)>),
('expandEntities', <function __main__.expandEntities(doc)>)]
Example¶
[5]:
# Sample sentences about pump components, used as input for the pipeline demo below.
text = r"""A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
The RCP pump 1A pressure gauge was found inoperative.
Rupture of pump bearings caused shaft degradation.
Rupture of pump bearings caused shaft degradation and consequent flow reduction.
Pump power supply has been found burnout.
Pump test failed due to power supply failure.
Pump inspection revealed excessive impeller degradation.
Pump inspection revealed excessive impeller degradation likely due to cavitation.
"""
[6]:
# Entity-ruler pattern: label the token "gauge" (case-insensitive) as "comp".
patterns = [
    {
        "label": "comp",
        "pattern": [{"LOWER": "gauge"}],
        "id": "ssc",
    },
]
ruler = nlp.get_pipe('entity_ruler')
ruler.add_patterns(patterns)

# Stand-alone token matcher for "pump"; its matches are only printed below.
rules = [{"LOWER": "pump"}]
matcher = Matcher(nlp.vocab)
matcher.add('comp', [rules])

# Run the full pipeline, then apply the matcher to the processed doc.
doc = nlp(text)
matches = matcher(doc, as_spans=True)

print('Identified Entities:')
for span in matches:
    print('Entity:', span.text, '| Label:', span.label_, '| Sentence', span.sent)

displacy.render(doc, style='ent', jupyter=True)

print('Dependency Tree:')
printDepTree(doc)
Identified Entities:
Entity: pump | Label: comp | Sentence A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
Entity: pump | Label: comp | Sentence A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
Entity: pump | Label: comp | Sentence The RCP pump 1A pressure gauge was found inoperative.
Entity: pump | Label: comp | Sentence Rupture of pump bearings caused shaft degradation.
Entity: pump | Label: comp | Sentence Rupture of pump bearings caused shaft degradation and consequent flow reduction.
Entity: Pump | Label: comp | Sentence Pump power supply has been found burnout.
Entity: Pump | Label: comp | Sentence Pump test failed due to power supply failure.
Entity: Pump | Label: comp | Sentence Pump inspection revealed excessive impeller degradation.
Entity: Pump | Label: comp | Sentence Pump inspection revealed excessive impeller degradation likely due to cavitation.
A leak was noticed from the RCP pump 1A. The RCP pump 1A pressure
gauge
comp
was found not operating, and it was found inoperative. The RCP pump 1A pressure
gauge
comp
was found inoperative. Rupture of pump bearings caused shaft degradation. Rupture of pump bearings caused shaft degradation and consequent flow reduction. Pump power supply has been found burnout. Pump test failed due to power supply failure. Pump inspection revealed excessive impeller degradation. Pump inspection revealed excessive impeller degradation likely due to cavitation.
Dependency Tree:
noticed [VERB, ROOT]
leak [NOUN, nsubjpass]
A [DET, det]
was [AUX, auxpass]
from [ADP, prep]
RCP [PROPN, pobj]
the [DET, det]
pump [VERB, conj]
1A. [NUM, dobj]
[SPACE, dep]
pump [VERB, relcl]
RCP [PROPN, nsubj]
The [DET, det]
found [VERB, conj]
gauge [NOUN, nsubjpass]
1A [NOUN, compound]
pressure [NOUN, compound]
was [AUX, auxpass]
operating [VERB, xcomp]
not [PART, neg]
and [CCONJ, cc]
found [VERB, conj]
it [PRON, nsubjpass]
was [AUX, auxpass]
inoperative [ADJ, oprd]
[SPACE, dep]
pump [VERB, ROOT]
RCP [PROPN, nsubj]
The [DET, det]
found [VERB, conj]
gauge [NOUN, nsubjpass]
1A [NOUN, compound]
pressure [NOUN, compound]
was [AUX, auxpass]
inoperative [ADJ, oprd]
[SPACE, dep]
caused [VERB, ROOT]
Rupture [NOUN, nsubj]
of [ADP, prep]
bearings [NOUN, pobj]
pump [NOUN, compound]
degradation [NOUN, dobj]
shaft [NOUN, compound]
[SPACE, dep]
caused [VERB, ROOT]
Rupture [NOUN, nsubj]
of [ADP, prep]
bearings [NOUN, pobj]
pump [NOUN, compound]
degradation [NOUN, dobj]
shaft [NOUN, compound]
and [CCONJ, cc]
reduction [NOUN, conj]
flow [NOUN, compound]
consequent [ADJ, amod]
[SPACE, dep]
found [VERB, ROOT]
supply [NOUN, nsubjpass]
Pump [NOUN, compound]
power [NOUN, compound]
has [AUX, aux]
been [AUX, auxpass]
burnout [NOUN, oprd]
[SPACE, dep]
failed [VERB, ROOT]
test [NOUN, nsubj]
Pump [NOUN, compound]
due [ADP, prep]
to [ADP, pcomp]
failure [NOUN, pobj]
supply [NOUN, compound]
power [NOUN, compound]
[SPACE, dep]
revealed [VERB, ROOT]
inspection [NOUN, nsubj]
Pump [NOUN, compound]
degradation [NOUN, dobj]
excessive [ADJ, amod]
impeller [NOUN, compound]
[SPACE, dep]
revealed [VERB, ROOT]
inspection [NOUN, nsubj]
Pump [NOUN, compound]
degradation [NOUN, dobj]
excessive [ADJ, amod]
impeller [NOUN, compound]
likely [ADV, ccomp]
due [ADP, prep]
to [ADP, pcomp]
cavitation [NOUN, pobj]
[SPACE, dep]
[7]:
# Tabulate the per-token attributes of the processed doc for inspection.
df = displayNER(doc)
df
[7]:
text | lemma | pos | dep | ent_type | ent_iob_ | coref_chains | ref_n | ref_t | |
---|---|---|---|---|---|---|---|---|---|
0 | A | a | DET | det | O | None | |||
1 | leak | leak | NOUN | nsubjpass | O | None | |||
2 | was | be | AUX | auxpass | O | None | |||
3 | noticed | notice | VERB | ROOT | O | None | |||
4 | from | from | ADP | prep | O | None | |||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
94 | likely | likely | ADV | ccomp | O | None | |||
95 | due | due | ADP | prep | O | None | |||
96 | to | to | ADP | pcomp | O | None | |||
97 | cavitation | cavitation | NOUN | pobj | O | None | |||
99 | \n | \n | SPACE | dep | O | None |
91 rows × 9 columns
[8]:
# Show the coreference chains stored on the doc, then the referent name
# ("ref_n") held by the token at index 22.
print('Coreference Info: \n', doc._.coref_chains.pretty_representation)
print(f'Label for token "{doc[22]}" is "{doc[22]._.ref_n}"')
Coreference Info:
0: RCP(6), RCP(11), RCP(29); 1: gauge(15), it(22)
Label for token "it" is "gauge"