{ "cells": [ { "cell_type": "markdown", "id": "1a68a161", "metadata": {}, "source": [ "## Custom Pipelines Demo\n", "\n", "- normEntities: Normalizing Named Entities, remove the leading article and trailing particle\n", "- initCoref: Initialize Coreference Attributes with Entity Info\n", "- anaphorCoref: Anaphora resolution using coreferee\n", "- expandEntities: Expand the current entities, recursive function to extend entity with all previous NOUN" ] }, { "cell_type": "code", "execution_count": 1, "id": "46237dbc-ac22-482f-9373-1c1eaee56f40", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import spacy\n", "from spacy.tokens import Span\n", "from spacy.language import Language\n", "from spacy.matcher import Matcher\n", "from spacy.tokens import Token\n", "from spacy import displacy\n", "import coreferee\n", "\n", "#### Using spacy's Token extensions for coreferee\n", "if Token.has_extension('ref_n'):\n", " _ = Token.remove_extension('ref_n')\n", "if Token.has_extension('ref_t'):\n", " _ = Token.remove_extension('ref_t')\n", "if Token.has_extension('ref_t_'):\n", " _ = Token.remove_extension('ref_t_')\n", "Token.set_extension('ref_n', default='')\n", "Token.set_extension('ref_t', default='')\n", "\n", "nlp = spacy.load(\"en_core_web_lg\")" ] }, { "cell_type": "markdown", "id": "7aba47c0", "metadata": {}, "source": [ "### Internal Developed Functions" ] }, { "cell_type": "code", "execution_count": 2, "id": "86be0452-6e98-46b4-9e4f-4447fc9996ef", "metadata": {}, "outputs": [], "source": [ "# Function used to display NER entities\n", "def displayNER(doc, includePunct=False):\n", " \"\"\"\n", " Generate data frame for visualization of spaCy doc with custom attributes.\n", " \"\"\"\n", " rows = []\n", " for i, t in enumerate(doc):\n", " if not t.is_punct or includePunct:\n", " row = {'token': i,\n", " 'text': t.text, 'lemma': t.lemma_,\n", " 'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,\n", " 'ent_iob_': t.ent_iob_}\n", " if doc.has_extension('coref_chains'):\n", " if t.has_extension('coref_chains') and t._.coref_chains: # neuralcoref attributes\n", " row['coref_chains'] = t._.coref_chains.pretty_representation\n", " else:\n", " row['coref_chains'] = None\n", " if t.has_extension('ref_n'): # referent attribute\n", " row['ref_n'] = t._.ref_n\n", " row['ref_t'] = t._.ref_t\n", " if t.has_extension('ref_ent'): # ref_n/ref_t\n", " row['ref_ent'] = t._.ref_ent\n", " rows.append(row)\n", " df = pd.DataFrame(rows).set_index('token')\n", " df.index.name = None\n", " return df\n", "\n", "\n", "# Reset Pipelines\n", "def resetPipeline(nlp, pipes):\n", " \"\"\"\n", " remove all custom pipes, and add new pipes\n", " \"\"\"\n", " customPipes = [pipe for (pipe, _) in nlp.pipeline\n", " if pipe not in ['tagger', 'parser',\n", " 'tok2vec', 'attribute_ruler', 'lemmatizer']]\n", " for pipe in customPipes:\n", " _ = nlp.remove_pipe(pipe)\n", " # re-add specified pipes\n", " for pipe in pipes:\n", " nlp.add_pipe(pipe)\n", "\n", "# Print Dependency Tree\n", "def printDepTree(doc, skipPunct=True):\n", " \"\"\"\n", " Utility function to pretty print the dependency tree.\n", " \"\"\"\n", " def printRecursive(root, indent, skipPunct):\n", " if not root.dep_ == 'punct' or not skipPunct:\n", " print(\" \"*indent + f\"{root} [{root.pos_}, {root.dep_}]\")\n", " for left in root.lefts:\n", " printRecursive(left, indent=indent+4, skipPunct=skipPunct)\n", " for right in root.rights:\n", " printRecursive(right, indent=indent+4, skipPunct=skipPunct)\n", "\n", " for sent in doc.sents: # iterate over all sentences in a doc\n", " printRecursive(sent.root, indent=0, skipPunct=skipPunct)" ] }, { "cell_type": "markdown", "id": "72461323-634d-47c7-9070-c9b560e9f8cd", "metadata": {}, "source": [ "# Internal Developed Pipelines" ] }, { "cell_type": "code", "execution_count": 3, "id": "e87f1527-df2f-4650-8f2a-d41a1e2b554f", "metadata": {}, "outputs": [], "source": [ "# Normalizing Named Entities, remove the leading article and trailing particle\n", "@Language.component(\"normEntities\")\n", "def normEntities(doc):\n", " \"\"\"\n", " Normalizing Named Entities, remove the leading article and trailing particle\n", " @ In, doc, spacy.tokens.doc.Doc\n", " @ Out, doc, spacy.tokens.doc.Doc\n", " \"\"\"\n", " ents = []\n", " for ent in doc.ents:\n", " if ent[0].pos_ == \"DET\": # leading article\n", " ent = Span(doc, ent.start+1, ent.end, label=ent.label)\n", " if len(ent) > 0:\n", " if ent[-1].pos_ == \"PART\": # trailing particle like 's\n", " ent = Span(doc, ent.start, ent.end-1, label=ent.label)\n", " if len(ent) > 0:\n", " ents.append(ent)\n", " doc.ents = tuple(ents)\n", " return doc\n", "\n", "# Initialize Coreference Attributes with Entity Info\n", "@Language.component(\"initCoref\")\n", "def initCoref(doc):\n", " for e in doc.ents:\n", " e[0]._.ref_n, e[0]._.ref_t = e.text, e.label_\n", " return doc\n", "\n", "# Anaphora resolution using coreferee\n", "@Language.component(\"anaphorCoref\")\n", "def anaphorCoref(doc):\n", " \"\"\"\n", " Anaphora resolution using coreferee\n", " This pipeline need to be added after NER.\n", " The assumption here is: The entities need to be recognized first, then call\n", " pipeline \"initCoref\" to assign initial custom attribute \"ref_n\" and \"ref_t\",\n", " then call pipeline \"aliasResolver\" to resolve all the aliases used in the text.\n", " After all these pre-processes, we can use \"anaphorCoref\" pipeline to resolve the\n", " coreference.\n", " \"\"\"\n", " if not Token.has_extension('coref_chains'):\n", " return doc\n", " for token in doc:\n", " coref = token._.coref_chains\n", " # if token is coref and not already dereferenced\n", " if coref and token._.ref_n == '':\n", " # check all the references, if \"ref_n\" is available (determined by NER and initCoref),\n", " # the value of \"ref_n\" will be assigned to current token\n", " for chain in coref:\n", " for ref in chain:\n", " refToken = doc[ref[0]]\n", " if refToken._.ref_n != '':\n", " token._.ref_n = refToken._.ref_n\n", " token._.ref_t = refToken._.ref_t\n", " break\n", " return doc\n", "\n", "# Expand the current entities, recursive function to extend entity with all previous NOUN\n", "@Language.component(\"expandEntities\")\n", "def expandEntities(doc):\n", " \"\"\"\n", " Expand the current entities, recursive function to extend entity with all previous NOUN\n", " \"\"\"\n", " newEnts = []\n", " isUpdated = False\n", " for ent in doc.ents:\n", " if ent.label_ == \"SSC\" and ent.start != 0:\n", " prevToken = doc[ent.start - 1]\n", " if prevToken.pos_ in ['NOUN']:\n", " newEnt = Span(doc, ent.start - 1, ent.end, label=ent.label)\n", " newEnts.append(newEnt)\n", " isUpdated = True\n", " else:\n", " newEnts.append(ent)\n", " doc.ents = newEnts\n", " if isUpdated:\n", " doc = expandEntities(doc)\n", " return doc" ] }, { "cell_type": "markdown", "id": "f0b9239d", "metadata": {}, "source": [ "### Reset NLP Pipeline" ] }, { "cell_type": "code", "execution_count": 4, "id": "dd59f6b0-bfa8-44f9-b73b-0bc7369616b3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('tok2vec', ),\n", " ('tagger', ),\n", " ('parser', ),\n", " ('attribute_ruler',\n", " ),\n", " ('lemmatizer', ),\n", " ('entity_ruler', ),\n", " ('normEntities', ),\n", " ('initCoref', ),\n", " ('coreferee', ),\n", " ('anaphorCoref', ),\n", " ('expandEntities', )]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipelines = ['entity_ruler','normEntities', 'initCoref', 'coreferee','anaphorCoref', 'expandEntities']\n", "resetPipeline(nlp, pipelines)\n", "nlp.pipeline" ] }, { "cell_type": "markdown", "id": "7c77114f", "metadata": {}, "source": [ "### Example" ] }, { "cell_type": "code", "execution_count": 5, "id": "dbc474ab-204f-4c6b-8f10-e76dbdd5457c", "metadata": {}, "outputs": [], "source": [ "text = r\"\"\"A leak was noticed from the RCP pump 1A.\n", " The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n", " The RCP pump 1A pressure gauge was found inoperative.\n", " Rupture of pump bearings caused shaft degradation.\n", " Rupture of pump bearings caused shaft degradation and consequent flow reduction.\n", " Pump power supply has been found burnout.\n", " Pump test failed due to power supply failure.\n", " Pump inspection revealed excessive impeller degradation.\n", " Pump inspection revealed excessive impeller degradation likely due to cavitation.\n", " \"\"\"" ] }, { "cell_type": "markdown", "id": "925872a9", "metadata": {}, "source": [ "#### Simple match " ] }, { "cell_type": "code", "execution_count": 6, "id": "8c459300-fc7a-42cf-82d6-b2b04e292d61", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Identified Entities:\n", "Entity: pump | Label: comp | Sentence A leak was noticed from the RCP pump 1A.\n", " The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n", " \n", "Entity: pump | Label: comp | Sentence A leak was noticed from the RCP pump 1A.\n", " The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n", " \n", "Entity: pump | Label: comp | Sentence The RCP pump 1A pressure gauge was found inoperative.\n", " \n", "Entity: pump | Label: comp | Sentence Rupture of pump bearings caused shaft degradation.\n", " \n", "Entity: pump | Label: comp | Sentence Rupture of pump bearings caused shaft degradation and consequent flow reduction.\n", " \n", "Entity: Pump | Label: comp | Sentence Pump power supply has been found burnout.\n", " \n", "Entity: Pump | Label: comp | Sentence Pump test failed due to power supply failure.\n", " \n", "Entity: Pump | Label: comp | Sentence Pump inspection revealed excessive impeller degradation.\n", " \n", "Entity: Pump | Label: comp | Sentence Pump inspection revealed excessive impeller degradation likely due to cavitation.\n", " \n" ] }, { "data": { "text/html": [ "
A leak was noticed from the RCP pump 1A.
The RCP pump 1A pressure \n", "\n", " gauge\n", " comp\n", "\n", " was found not operating, and it was found inoperative.
The RCP pump 1A pressure \n", "\n", " gauge\n", " comp\n", "\n", " was found inoperative.
Rupture of pump bearings caused shaft degradation.
Rupture of pump bearings caused shaft degradation and consequent flow reduction.
Pump power supply has been found burnout.
Pump test failed due to power supply failure.
Pump inspection revealed excessive impeller degradation.
Pump inspection revealed excessive impeller degradation likely due to cavitation.
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Dependency Tree:\n", "noticed [VERB, ROOT]\n", " leak [NOUN, nsubjpass]\n", " A [DET, det]\n", " was [AUX, auxpass]\n", " from [ADP, prep]\n", " RCP [PROPN, pobj]\n", " the [DET, det]\n", " pump [VERB, conj]\n", " 1A. [NUM, dobj]\n", " \n", " [SPACE, dep]\n", " pump [VERB, relcl]\n", " RCP [PROPN, nsubj]\n", " The [DET, det]\n", " found [VERB, conj]\n", " gauge [NOUN, nsubjpass]\n", " 1A [NOUN, compound]\n", " pressure [NOUN, compound]\n", " was [AUX, auxpass]\n", " operating [VERB, xcomp]\n", " not [PART, neg]\n", " and [CCONJ, cc]\n", " found [VERB, conj]\n", " it [PRON, nsubjpass]\n", " was [AUX, auxpass]\n", " inoperative [ADJ, oprd]\n", " \n", " [SPACE, dep]\n", "pump [VERB, ROOT]\n", " RCP [PROPN, nsubj]\n", " The [DET, det]\n", " found [VERB, conj]\n", " gauge [NOUN, nsubjpass]\n", " 1A [NOUN, compound]\n", " pressure [NOUN, compound]\n", " was [AUX, auxpass]\n", " inoperative [ADJ, oprd]\n", " \n", " [SPACE, dep]\n", "caused [VERB, ROOT]\n", " Rupture [NOUN, nsubj]\n", " of [ADP, prep]\n", " bearings [NOUN, pobj]\n", " pump [NOUN, compound]\n", " degradation [NOUN, dobj]\n", " shaft [NOUN, compound]\n", " \n", " [SPACE, dep]\n", "caused [VERB, ROOT]\n", " Rupture [NOUN, nsubj]\n", " of [ADP, prep]\n", " bearings [NOUN, pobj]\n", " pump [NOUN, compound]\n", " degradation [NOUN, dobj]\n", " shaft [NOUN, compound]\n", " and [CCONJ, cc]\n", " reduction [NOUN, conj]\n", " flow [NOUN, compound]\n", " consequent [ADJ, amod]\n", " \n", " [SPACE, dep]\n", "found [VERB, ROOT]\n", " supply [NOUN, nsubjpass]\n", " Pump [NOUN, compound]\n", " power [NOUN, compound]\n", " has [AUX, aux]\n", " been [AUX, auxpass]\n", " burnout [NOUN, oprd]\n", " \n", " [SPACE, dep]\n", "failed [VERB, ROOT]\n", " test [NOUN, nsubj]\n", " Pump [NOUN, compound]\n", " due [ADP, prep]\n", " to [ADP, pcomp]\n", " failure [NOUN, pobj]\n", " supply [NOUN, compound]\n", " power [NOUN, compound]\n", " \n", " [SPACE, dep]\n", "revealed [VERB, ROOT]\n", " inspection [NOUN, nsubj]\n", " Pump [NOUN, compound]\n", " degradation [NOUN, dobj]\n", " excessive [ADJ, amod]\n", " impeller [NOUN, compound]\n", " \n", " [SPACE, dep]\n", "revealed [VERB, ROOT]\n", " inspection [NOUN, nsubj]\n", " Pump [NOUN, compound]\n", " degradation [NOUN, dobj]\n", " excessive [ADJ, amod]\n", " impeller [NOUN, compound]\n", " likely [ADV, ccomp]\n", " due [ADP, prep]\n", " to [ADP, pcomp]\n", " cavitation [NOUN, pobj]\n", " \n", " [SPACE, dep]\n" ] } ], "source": [ "patterns = [{\"label\":\"comp\", \"pattern\":[{\"LOWER\":\"gauge\"}], \"id\":\"ssc\"}]\n", "ruler = nlp.get_pipe('entity_ruler')\n", "ruler.add_patterns(patterns)\n", "rules = [{\"LOWER\":\"pump\"}]\n", "matcher = Matcher(nlp.vocab)\n", "matcher.add('comp', [rules])\n", "\n", "doc = nlp(text)\n", "matches = matcher(doc, as_spans=True)\n", "print('Identified Entities:')\n", "for span in matches:\n", " print('Entity:', span.text, '| Label:', span.label_, '| Sentence', span.sent)\n", "\n", "displacy.render(doc, style='ent', jupyter=True)\n", "\n", "print('Dependency Tree:')\n", "printDepTree(doc)" ] }, { "cell_type": "markdown", "id": "f4c2ff01", "metadata": {}, "source": [ "#### Generate data frame for visualization of spaCy doc with custom attributes." ] }, { "cell_type": "code", "execution_count": 7, "id": "0e712288-8fb5-4c83-960d-b895fb7c4dba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlemmaposdepent_typeent_iob_coref_chainsref_nref_t
0AaDETdetONone
1leakleakNOUNnsubjpassONone
2wasbeAUXauxpassONone
3noticednoticeVERBROOTONone
4fromfromADPprepONone
..............................
94likelylikelyADVccompONone
95duedueADPprepONone
96totoADPpcompONone
97cavitationcavitationNOUNpobjONone
99\\n\\nSPACEdepONone
\n", "

91 rows × 9 columns

\n", "
" ], "text/plain": [ " text lemma pos dep ent_type ent_iob_ coref_chains \\\n", "0 A a DET det O None \n", "1 leak leak NOUN nsubjpass O None \n", "2 was be AUX auxpass O None \n", "3 noticed notice VERB ROOT O None \n", "4 from from ADP prep O None \n", ".. ... ... ... ... ... ... ... \n", "94 likely likely ADV ccomp O None \n", "95 due due ADP prep O None \n", "96 to to ADP pcomp O None \n", "97 cavitation cavitation NOUN pobj O None \n", "99 \\n \\n SPACE dep O None \n", "\n", " ref_n ref_t \n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", ".. ... ... \n", "94 \n", "95 \n", "96 \n", "97 \n", "99 \n", "\n", "[91 rows x 9 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = displayNER(doc)\n", "df" ] }, { "cell_type": "markdown", "id": "13a0825d-1bb1-4ec0-817c-eb0536bc7968", "metadata": {}, "source": [ "#### Coreference Analysis" ] }, { "cell_type": "code", "execution_count": 8, "id": "807d97dd-3627-4c2f-ae2d-89961da7860e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Coreference Info: \n", " 0: RCP(6), RCP(11), RCP(29); 1: gauge(15), it(22)\n", "Label for token \"it\" is \"gauge\"\n" ] } ], "source": [ "print('Coreference Info: \\n', doc._.coref_chains.pretty_representation)\n", "\n", "print(f'Label for token \"{doc[22]}\" is \"{doc[22]._.ref_n}\"')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }