[1]:
import pandas as pd
import spacy
from spacy.tokens import Span
[2]:
def displayNER(doc, includePunct=False):
    """
    Generate a data frame for visualization of a spaCy doc with custom attributes.
    """
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or includePunct:
            row = {'token': i,
                   'text': t.text, 'lemma': t.lemma_,
                   'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
                   'ent_iob_': t.ent_iob_}
            if doc.has_extension('coref_chains'):
                if t.has_extension('coref_chains') and t._.coref_chains:  # coreferee attributes
                    row['coref_chains'] = t._.coref_chains.pretty_representation
                else:
                    row['coref_chains'] = None
            if t.has_extension('ref_n'):  # referent attributes ref_n/ref_t
                row['ref_n'] = t._.ref_n
                row['ref_t'] = t._.ref_t
            if t.has_extension('ref_ent'):  # resolved reference entity
                row['ref_ent'] = t._.ref_ent
            rows.append(row)
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
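As a quick sanity check, displayNER can be run on a doc even before the custom extensions are registered, since every extension access is guarded by has_extension. A minimal sketch (the model name is illustrative; any small English model works):

import spacy
nlpDemo = spacy.load("en_core_web_sm")  # illustrative small model for a quick check
demoDoc = nlpDemo("The pump was found inoperative.")
print(displayNER(demoDoc))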
[3]:
def resetPipeline(nlp, pipes):
    """
    Remove all custom pipes, then add the specified pipes in order.
    """
    customPipes = [pipe for (pipe, _) in nlp.pipeline
                   if pipe not in ['tagger', 'parser',
                                   'tok2vec', 'attribute_ruler', 'lemmatizer']]
    for pipe in customPipes:
        _ = nlp.remove_pipe(pipe)
    # re-add the specified pipes; add_pipe appends, so list order is preserved
    for pipe in pipes:
        nlp.add_pipe(pipe)
    logger.info(f"Model: {nlp.meta['name']}, Language: {nlp.meta['lang']}")
    logger.info('\n'.join([pipe for (pipe, _) in nlp.pipeline]))
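Because nlp.add_pipe appends each component at the end of the pipeline, the order of the pipes argument is preserved. A quick sketch for verifying the resulting order (assumes nlp is already loaded; the model is loaded in a later cell):

resetPipeline(nlp, ['entity_ruler'])
print(nlp.pipe_names)  # core components first, then 'entity_ruler'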
[4]:
def printDepTree(doc, skipPunct=True):
    """
    Utility function to pretty-print the dependency tree.
    """
    def printRecursive(root, indent, skipPunct):
        if not root.dep_ == 'punct' or not skipPunct:
            print(" " * indent + f"{root} [{root.pos_}, {root.dep_}]")
        for left in root.lefts:
            printRecursive(left, indent=indent + 4, skipPunct=skipPunct)
        for right in root.rights:
            printRecursive(right, indent=indent + 4, skipPunct=skipPunct)
    for sent in doc.sents:  # iterate over all sentences in the doc
        printRecursive(sent.root, indent=0, skipPunct=skipPunct)
Custom pipelines
[5]:
from spacy.language import Language
from spacy.tokens import Span, Token
from spacy.matcher import Matcher
[6]:
customLabel = ['STRUCTURE', 'COMPONENT', 'SYSTEM']
aliasLookup = {}
[7]:
@Language.component("normEntities")
def normEntities(doc):
"""
Normalizing Named Entities, remove the leading article and trailing particle
@ In, doc, spacy.tokens.doc.Doc
@ Out, doc, spacy.tokens.doc.Doc
"""
ents = []
for ent in doc.ents:
if ent[0].pos_ == "DET": # leading article
ent = Span(doc, ent.start+1, ent.end, label=ent.label)
if len(ent) > 0:
if ent[-1].pos_ == "PART": # trailing particle like 's
ent = Span(doc, ent.start, ent.end-1, label=ent.label)
if len(ent) > 0:
ents.append(ent)
doc.ents = tuple(ents)
return doc
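A hedged demo of what normEntities does to an entity that starts with an article (the sentence and label are illustrative; run this after the model is loaded below):

demo = nlp("The pump failed")
demo.ents = [Span(demo, 0, 2, label="COMPONENT")]  # manually tag "The pump"
demo = normEntities(demo)
print([(e.text, e.label_) for e in demo.ents])  # expected: [('pump', 'COMPONENT')]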
[8]:
@Language.component("initCoref")
def initCoref(doc):
for e in doc.ents:
# if e.label_ in customLabel:
e[0]._.ref_n, e[0]._.ref_t = e.text, e.label_
return doc
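initCoref writes each entity's surface text and label onto its first token. A minimal check (assumes the ref_n/ref_t extensions registered further below, and an entity already set on the doc; sentence and label are illustrative):

demo = nlp("Pump failed")
demo.ents = [Span(demo, 0, 1, label="COMPONENT")]
demo = initCoref(demo)
print(demo[0]._.ref_n, demo[0]._.ref_t)  # expected: Pump COMPONENT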
[9]:
@Language.component("aliasResolver")
def aliasResolver(doc):
"""
Lookup aliases and store result in ref_t, ref_n
"""
for ent in doc.ents:
token = ent[0].text
if token in aliasLookup:
aName, aType = aliasLookup[token]
ent[0]._.ref_n, ent[0]._.ref_t = aName, aType
return propagateEntType(doc)
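The lookup key is the text of the entity's first token, and the value is a (name, type) pair. A sketch of registering an alias (the canonical name and type here are illustrative):

aliasLookup["RCP"] = ("reactor coolant pump", "COMPONENT")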
[10]:
def propagateEntType(doc):
    """
    Propagate the entity type stored in ref_t.
    """
    ents = []
    for e in doc.ents:
        if e[0]._.ref_n != '':  # if e is a coreference
            e = Span(doc, e.start, e.end, label=e[0]._.ref_t)
        ents.append(e)
    doc.ents = tuple(ents)
    return doc
[11]:
@Language.component("anaphorCoref")
def anaphorCoref(doc):
"""
Anaphora resolution using coreferee
This pipeline need to be added after NER.
The assumption here is: The entities need to be recognized first, then call
pipeline "initCoref" to assign initial custom attribute "ref_n" and "ref_t",
then call pipeline "aliasResolver" to resolve all the aliases used in the text.
After all these pre-processes, we can use "anaphorCoref" pipeline to resolve the
coreference.
"""
if not Token.has_extension('coref_chains'):
return doc
for token in doc:
coref = token._.coref_chains
# if token is coref and not already dereferenced
if coref and token._.ref_n == '':
# check all the references, if "ref_n" is available (determined by NER and initCoref),
# the value of "ref_n" will be assigned to current totken
for chain in coref:
for ref in chain:
refToken = doc[ref[0]]
if refToken._.ref_n != '':
token._.ref_n = refToken._.ref_n
token._.ref_t = refToken._.ref_t
break
return doc
[12]:
@Language.component("expandEntities")
def expandEntities(doc):
"""
Expand the current entities, recursive function to extend entity with all previous NOUN
"""
newEnts = []
isUpdated = False
for ent in doc.ents:
if ent.label_ == "SSC" and ent.start != 0:
prevToken = doc[ent.start - 1]
if prevToken.pos_ in ['NOUN']:
newEnt = Span(doc, ent.start - 1, ent.end, label=ent.label)
newEnts.append(newEnt)
isUpdated = True
else:
newEnts.append(ent)
doc.ents = newEnts
if isUpdated:
doc = expandEntities(doc)
return doc
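A hedged demo of the recursive expansion (illustrative sentence; run after the model is loaded): an SSC entity on "gauge" absorbs the preceding nouns "pressure" and then "pump":

demo = nlp("The pump pressure gauge failed")
demo.ents = [Span(demo, 3, 4, label="SSC")]  # manually tag "gauge"
demo = expandEntities(demo)
print([(e.text, e.label_) for e in demo.ents])  # expected: [('pump pressure gauge', 'SSC')]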
[13]:
import coreferee, spacy  # importing coreferee registers its pipeline component
nlp = spacy.load("en_core_web_lg")
import logging
logger = logging.getLogger(__name__)
[14]:
ch = logging.StreamHandler()
logger.addHandler(ch)
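Note that the logger.info messages emitted by resetPipeline stay silent under the default WARNING level inherited from the root logger; to surface them, the level can be lowered explicitly:

logger.setLevel(logging.INFO)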
[15]:
#### Using spaCy's Token extensions for coreferee
if Token.has_extension('ref_n'):
    _ = Token.remove_extension('ref_n')
if Token.has_extension('ref_t'):
    _ = Token.remove_extension('ref_t')
if Token.has_extension('ref_t_'):
    _ = Token.remove_extension('ref_t_')
Token.set_extension('ref_n', default='')
Token.set_extension('ref_t', default='')
[16]:
pipelines = ['entity_ruler', 'normEntities', 'initCoref', 'aliasResolver',
             'coreferee', 'anaphorCoref', 'expandEntities']
[17]:
pipelines
[17]:
['entity_ruler',
'normEntities',
'initCoref',
'aliasResolver',
'coreferee',
'anaphorCoref',
'expandEntities']
[18]:
resetPipeline(nlp, pipelines)
[19]:
nlp.pipeline
[19]:
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x128033a70>),
('tagger', <spacy.pipeline.tagger.Tagger at 0x128033470>),
('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x123e4e810>),
('attribute_ruler',
<spacy.pipeline.attributeruler.AttributeRuler at 0x1282128d0>),
('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x12815f4d0>),
('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x127eb2310>),
('normEntities', <function __main__.normEntities(doc)>),
('initCoref', <function __main__.initCoref(doc)>),
('aliasResolver', <function __main__.aliasResolver(doc)>),
('coreferee', <coreferee.manager.CorefereeBroker at 0x12a781a90>),
('anaphorCoref', <function __main__.anaphorCoref(doc)>),
('expandEntities', <function __main__.expandEntities(doc)>)]
[20]:
text = r"""A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
The RCP pump 1A pressure gauge was found inoperative.
Rupture of pump bearings caused shaft degradation.
Rupture of pump bearings caused shaft degradation and consequent flow reduction.
Pump power supply has been found burnout.
Pump test failed due to power supply failure.
Pump inspection revealed excessive impeller degradation.
Pump inspection revealed excessive impeller degradation likely due to cavitation.
"""
[21]:
patterns = [{"label":"comp", "pattern":[{"LOWER":"gauge"}], "id":"ssc"}]
ruler = nlp.get_pipe('entity_ruler')
ruler.add_patterns(patterns)
rules = [{"LOWER":"pump"}]
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
matcher.add('comp', [rules])
[22]:
doc = nlp(text)
[23]:
matches = matcher(doc, as_spans=True)
for span in matches:
    print(span.sent, span.label_)
A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
comp
A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
comp
The RCP pump 1A pressure gauge was found inoperative.
comp
Rupture of pump bearings caused shaft degradation.
comp
Rupture of pump bearings caused shaft degradation and consequent flow reduction.
comp
Pump power supply has been found burnout.
comp
Pump test failed due to power supply failure.
comp
Pump inspection revealed excessive impeller degradation.
comp
Pump inspection revealed excessive impeller degradation likely due to cavitation.
comp
[24]:
print(type(doc.ents))
<class 'tuple'>
[25]:
from spacy import displacy
[26]:
displacy.render(doc, style='ent', jupyter=True)
[displacy entity rendering: "gauge" is highlighted with the label "comp" in the second and third sentences; no other spans are marked]
[27]:
patterns = [{"label":"comp", "pattern":[{"LOWER":"pressure gauge"}, {"POS":"NOUN"}], "id":"ssc"}]
[28]:
printDepTree(doc)
noticed [VERB, ROOT]
leak [NOUN, nsubjpass]
A [DET, det]
was [AUX, auxpass]
from [ADP, prep]
RCP [PROPN, pobj]
the [DET, det]
pump [VERB, conj]
1A. [NUM, dobj]
[SPACE, dep]
pump [VERB, relcl]
RCP [PROPN, nsubj]
The [DET, det]
found [VERB, conj]
gauge [NOUN, nsubjpass]
1A [NOUN, compound]
pressure [NOUN, compound]
was [AUX, auxpass]
operating [VERB, xcomp]
not [PART, neg]
and [CCONJ, cc]
found [VERB, conj]
it [PRON, nsubjpass]
was [AUX, auxpass]
inoperative [ADJ, oprd]
[SPACE, dep]
pump [VERB, ROOT]
RCP [PROPN, nsubj]
The [DET, det]
found [VERB, conj]
gauge [NOUN, nsubjpass]
1A [NOUN, compound]
pressure [NOUN, compound]
was [AUX, auxpass]
inoperative [ADJ, oprd]
[SPACE, dep]
caused [VERB, ROOT]
Rupture [NOUN, nsubj]
of [ADP, prep]
bearings [NOUN, pobj]
pump [NOUN, compound]
degradation [NOUN, dobj]
shaft [NOUN, compound]
[SPACE, dep]
caused [VERB, ROOT]
Rupture [NOUN, nsubj]
of [ADP, prep]
bearings [NOUN, pobj]
pump [NOUN, compound]
degradation [NOUN, dobj]
shaft [NOUN, compound]
and [CCONJ, cc]
reduction [NOUN, conj]
flow [NOUN, compound]
consequent [ADJ, amod]
[SPACE, dep]
found [VERB, ROOT]
supply [NOUN, nsubjpass]
Pump [NOUN, compound]
power [NOUN, compound]
has [AUX, aux]
been [AUX, auxpass]
burnout [NOUN, oprd]
[SPACE, dep]
failed [VERB, ROOT]
test [NOUN, nsubj]
Pump [NOUN, compound]
due [ADP, prep]
to [ADP, pcomp]
failure [NOUN, pobj]
supply [NOUN, compound]
power [NOUN, compound]
[SPACE, dep]
revealed [VERB, ROOT]
inspection [NOUN, nsubj]
Pump [NOUN, compound]
degradation [NOUN, dobj]
excessive [ADJ, amod]
impeller [NOUN, compound]
[SPACE, dep]
revealed [VERB, ROOT]
inspection [NOUN, nsubj]
Pump [NOUN, compound]
degradation [NOUN, dobj]
excessive [ADJ, amod]
impeller [NOUN, compound]
likely [ADV, ccomp]
due [ADP, prep]
to [ADP, pcomp]
cavitation [NOUN, pobj]
[SPACE, dep]
[29]:
df = displayNER(doc)
[30]:
df
[30]:
|     | text | lemma | pos | dep | ent_type | ent_iob_ | coref_chains | ref_n | ref_t |
|-----|------|-------|-----|-----|----------|----------|--------------|-------|-------|
| 0   | A | a | DET | det |  | O | None |  |  |
| 1   | leak | leak | NOUN | nsubjpass |  | O | None |  |  |
| 2   | was | be | AUX | auxpass |  | O | None |  |  |
| 3   | noticed | notice | VERB | ROOT |  | O | None |  |  |
| 4   | from | from | ADP | prep |  | O | None |  |  |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 94  | likely | likely | ADV | ccomp |  | O | None |  |  |
| 95  | due | due | ADP | prep |  | O | None |  |  |
| 96  | to | to | ADP | pcomp |  | O | None |  |  |
| 97  | cavitation | cavitation | NOUN | pobj |  | O | None |  |  |
| 99  | \n | \n | SPACE | dep |  | O | None |  |  |

91 rows × 9 columns
[31]:
doc._.coref_chains.pretty_representation
[31]:
'0: RCP(6), RCP(11), RCP(29); 1: gauge(15), it(22)'
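The integers in this representation are token indices into the doc. A small sketch mapping each chain back to its tokens, using the same ref[0] indexing as anaphorCoref above:

for chain in doc._.coref_chains:
    print([doc[ref[0]].text for ref in chain])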
[32]:
for ent in doc.ents:
    print(ent)
gauge
gauge
[33]:
doc[22]._.ref_n
[33]:
'gauge'
[34]:
for token in doc:
    coref = token._.coref_chains
    # if token is a coreference and not already dereferenced
    if coref and token._.ref_n == '':
        print('token', token)
        # print(token, coref.pretty_representation)
        # check all the references; if "ref_n" is available (determined by NER
        # and initCoref), its value is assigned to the current token
        for chain in coref:
            for ref in chain:
                refToken = doc[ref[0]]
                print(refToken)
                print(refToken._.ref_n)
                if refToken._.ref_n != '':
                    token._.ref_n = refToken._.ref_n
                    token._.ref_t = refToken._.ref_t
                    break
token RCP
RCP
RCP
RCP
token RCP
RCP
RCP
RCP
token RCP
RCP
RCP
RCP
[35]:
import spacy
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])
[36]:
matcher.get('HelloWorld')
[36]:
(None, [[{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]])
[37]:
doc.ents
[37]:
(gauge, gauge)
[38]:
sl = []
for ent in doc.ents:
    sent = ent.sent
    if sent not in sl:
        sl.append(sent)
print(sl)
[A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
, The RCP pump 1A pressure gauge was found inoperative.
]
[39]:
for sent in sl:
    print(sent.ents)
    print(set(sent.ents))
[gauge]
{gauge}
[gauge]
{gauge}
[40]:
for sent in sl:
    print(sent.root)
    for token in sent:
        print(token.dep_)
noticed
det
nsubjpass
auxpass
ROOT
prep
det
pobj
conj
dobj
dep
det
nsubj
relcl
compound
compound
nsubjpass
auxpass
conj
neg
xcomp
punct
cc
nsubjpass
auxpass
conj
oprd
punct
dep
pump
det
nsubj
ROOT
compound
compound
nsubjpass
auxpass
conj
oprd
punct
dep