# Function used to display NER entities
def displayNER(doc, includePunct=False):
    """
    Generate data frame for visualization of spaCy doc with custom attributes.
    @ In, doc, spacy.tokens.doc.Doc, processed document whose tokens are tabulated
    @ In, includePunct, bool, optional, list punctuation tokens as well when True
    @ Out, df, pandas.DataFrame, one row per listed token, indexed by the token
      position in the document; empty when no token qualifies
    """
    baseColumns = ['text', 'lemma', 'pos', 'dep', 'ent_type', 'ent_iob_']
    rows = []
    for i, t in enumerate(doc):
        if t.is_punct and not includePunct:
            continue
        row = {'token': i,
               'text': t.text, 'lemma': t.lemma_,
               'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
               'ent_iob_': t.ent_iob_}
        if doc.has_extension('coref_chains'):
            # coreference chains (provided by the coreferee component in this notebook)
            if t.has_extension('coref_chains') and t._.coref_chains:
                row['coref_chains'] = t._.coref_chains.pretty_representation
            else:
                row['coref_chains'] = None
        if t.has_extension('ref_n'):  # referent name / type attributes
            row['ref_n'] = t._.ref_n
            row['ref_t'] = t._.ref_t
        if t.has_extension('ref_ent'):  # referent entity attribute
            row['ref_ent'] = t._.ref_ent
        rows.append(row)
    if not rows:
        # Without any row there is no 'token' column, so set_index would raise
        # KeyError; hand back an empty frame with the base columns instead.
        return pd.DataFrame(columns=baseColumns)
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df


# Reset Pipelines
def resetPipeline(nlp, pipes):
    """
    Remove all custom pipes, and add new pipes.
    @ In, nlp, spacy.language.Language, pipeline object that is modified in place
    @ In, pipes, list, names of the components to append, in order
    @ Out, None
    """
    # Components that are always kept; every other component is removed.
    corePipes = {'tagger', 'parser', 'tok2vec', 'attribute_ruler', 'lemmatizer'}
    # Collect the names first so nlp.pipeline is not modified while iterating it.
    removable = [name for name, _ in nlp.pipeline if name not in corePipes]
    for name in removable:
        nlp.remove_pipe(name)
    # re-add specified pipes
    for name in pipes:
        nlp.add_pipe(name)


# Print Dependency Tree
def printDepTree(doc, skipPunct=True):
    """
    Utility function to pretty print the dependency tree.
    @ In, doc, spacy.tokens.doc.Doc, parsed document (sentence boundaries required)
    @ In, skipPunct, bool, optional, do not print tokens whose dependency is 'punct'
    @ Out, None
    """
    def printRecursive(node, indent):
        """
        Print node, then its left and right children indented by four more columns.
        """
        if node.dep_ != 'punct' or not skipPunct:
            print(" "*indent + f"{node} [{node.pos_}, {node.dep_}]")
        for child in node.lefts:
            printRecursive(child, indent + 4)
        for child in node.rights:
            printRecursive(child, indent + 4)

    for sent in doc.sents:  # iterate over all sentences in a doc
        printRecursive(sent.root, 0)
# Normalizing Named Entities, remove the leading article and trailing particle
@Language.component("normEntities")
def normEntities(doc):
    """
    Normalizing Named Entities, remove the leading article and trailing particle
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
    """
    normalized = []
    for ent in doc.ents:
        start, end = ent.start, ent.end
        if doc[start].pos_ == "DET":  # leading article
            start += 1
        if start < end and doc[end - 1].pos_ == "PART":  # trailing particle like 's
            end -= 1
        if (start, end) == (ent.start, ent.end):
            # untouched entities are kept as the very same Span object
            normalized.append(ent)
        elif start < end:
            # entities that would become empty after trimming are dropped
            normalized.append(Span(doc, start, end, label=ent.label))
    doc.ents = tuple(normalized)
    return doc


# Initialize Coreference Attributes with Entity Info
@Language.component("initCoref")
def initCoref(doc):
    """
    Store the text and label of every entity on the first token of that entity,
    in the custom token attributes "ref_n" and "ref_t".
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
    """
    for ent in doc.ents:
        firstToken = ent[0]
        firstToken._.ref_n = ent.text
        firstToken._.ref_t = ent.label_
    return doc


# Anaphora resolution using coreferee
@Language.component("anaphorCoref")
def anaphorCoref(doc):
    """
    Anaphora resolution using coreferee
    This pipeline need to be added after NER.
    The assumption here is: The entities need to be recognized first, then
    pipeline "initCoref" assigns the initial custom attributes "ref_n" and "ref_t",
    and a coreference component provides the token attribute "coref_chains".
    Every token that belongs to a chain and has no "ref_n" yet receives
    "ref_n"/"ref_t" of the first mention in its chains that already carries one.
    When the "coref_chains" extension is not registered, doc is returned unchanged.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
    """
    if not Token.has_extension('coref_chains'):
        return doc
    for token in doc:
        chains = token._.coref_chains
        # only tokens that are part of a chain and not already dereferenced
        if not chains or token._.ref_n != '':
            continue
        # mention[0] is used as a token index into doc; the first mention whose
        # "ref_n" was set (by NER + initCoref) wins, and the search stops there so
        # that a later chain cannot overwrite the referent.
        referent = next(
            (doc[mention[0]] for chain in chains for mention in chain
             if doc[mention[0]]._.ref_n != ''),
            None)
        if referent is not None:
            token._.ref_n = referent._.ref_n
            token._.ref_t = referent._.ref_t
    return doc


# Expand the current entities, extend entity with all previous NOUN
@Language.component("expandEntities")
def expandEntities(doc):
    """
    Expand the current entities: every entity labelled "SSC" is extended to the
    left for as long as the token right before it is a NOUN. All other entities
    are kept unchanged.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
    """
    ents = list(doc.ents)
    isUpdated = True
    # Each pass grows an entity by at most one token; repeat until nothing changes.
    while isUpdated:
        isUpdated = False
        expanded = []
        for ent in ents:
            if ent.label_ == "SSC" and ent.start != 0:
                prevToken = doc[ent.start - 1]
                # Entities are ordered and only grow leftwards, so the previous
                # entity is the only one that may already own prevToken; taking
                # it would create overlapping spans, which doc.ents rejects.
                isTaken = bool(expanded) and expanded[-1].end > ent.start - 1
                if prevToken.pos_ in ['NOUN'] and not isTaken:
                    ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                    isUpdated = True
            # every entity is kept, whether it was expanded or not
            expanded.append(ent)
        ents = expanded
    doc.ents = ents
    return doc


# Reset NLP Pipeline: drop the non-core components and append the demo components
pipelines = ['entity_ruler','normEntities', 'initCoref', 'coreferee','anaphorCoref', 'expandEntities']
resetPipeline(nlp, pipelines)
nlp.pipeline

# Example text used by the cells below
text = r"""A leak was noticed from the RCP pump 1A.
 The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
 The RCP pump 1A pressure gauge was found inoperative.
 Rupture of pump bearings caused shaft degradation.
 Rupture of pump bearings caused shaft degradation and consequent flow reduction.
 Pump power supply has been found burnout.
 Pump test failed due to power supply failure.
 Pump inspection revealed excessive impeller degradation.
 Pump inspection revealed excessive impeller degradation likely due to cavitation.
 """

# Simple match
# The entity_ruler tags the token "gauge" with label "comp" (pattern id "ssc").
# NOTE(review): expandEntities only expands entities labelled "SSC", so with this
# pattern no expansion takes place -- confirm whether the label should be "SSC".
patterns = [{"label":"comp", "pattern":[{"LOWER":"gauge"}], "id":"ssc"}]
ruler = nlp.get_pipe('entity_ruler')
ruler.add_patterns(patterns)
# A separate Matcher looks for the token "pump"; its matches are only printed
# and are not written to doc.ents.
rules = [{"LOWER":"pump"}]
matcher = Matcher(nlp.vocab)
matcher.add('comp', [rules])

doc = nlp(text)
matches = matcher(doc, as_spans=True)
print('Identified Entities:')
for span in matches:
    print('Entity:', span.text, '| Label:', span.label_, '| Sentence', span.sent)

displacy.render(doc, style='ent', jupyter=True)

print('Dependency Tree:')
printDepTree(doc)
# Generate data frame for visualization of spaCy doc with custom attributes.
df = displayNER(doc)
df

# Coreference Analysis: chains stored on the document by the coreference component
corefSummary = doc._.coref_chains.pretty_representation
print('Coreference Info: \n', corefSummary)

# NOTE(review): 22 is a hard-coded token position in the example text and depends
# on how the text is tokenized -- presumably it targets the pronoun "it"; confirm.
probeToken = doc[22]
print(f'Label for token "{probeToken}" is "{probeToken._.ref_n}"')