{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1a68a161",
   "metadata": {},
   "source": [
    "## Custom Pipelines Demo\n",
    "\n",
    "- normEntities: Normalize named entities by removing a leading article and a trailing particle\n",
    "- initCoref: Initialize the coreference attributes (`ref_n`, `ref_t`) from the entity info\n",
    "- anaphorCoref: Anaphora resolution using coreferee\n",
    "- expandEntities: Expand the current entities by recursively prepending the preceding NOUN tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46237dbc-ac22-482f-9373-1c1eaee56f40",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import spacy\n",
    "from spacy.tokens import Span\n",
    "from spacy.language import Language\n",
    "from spacy.matcher import Matcher\n",
    "from spacy.tokens import Token\n",
    "from spacy import displacy\n",
    "import coreferee\n",
    "\n",
    "#### Using spacy's Token extensions for coreferee\n",
    "# ref_n / ref_t are the custom Token attributes filled in by the pipelines below\n",
    "# (entity text and entity label of the referent); both default to ''.\n",
    "# Any earlier registration is removed first, presumably so this cell can be re-run\n",
    "# in the same kernel without set_extension complaining.\n",
    "for extName in ('ref_n', 'ref_t', 'ref_t_'):\n",
    "  if Token.has_extension(extName):\n",
    "    _ = Token.remove_extension(extName)\n",
    "for extName in ('ref_n', 'ref_t'):\n",
    "  Token.set_extension(extName, default='')\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_lg\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7aba47c0",
   "metadata": {},
   "source": [
    "### Internally Developed Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86be0452-6e98-46b4-9e4f-4447fc9996ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function used to display NER entities\n",
    "def displayNER(doc, includePunct=False):\n",
    "  \"\"\"\n",
    "    Generate data frame for visualization of spaCy doc with custom attributes.\n",
    "    @ In, doc, spacy.tokens.doc.Doc, the processed document\n",
    "    @ In, includePunct, bool, optional, if True punctuation tokens are listed as well\n",
    "    @ Out, df, pandas.DataFrame, one row per listed token, indexed by the token position;\n",
    "      an empty data frame is returned when no token is listed\n",
    "  \"\"\"\n",
    "  rows = []\n",
    "  for i, t in enumerate(doc):\n",
    "    if not t.is_punct or includePunct:\n",
    "      row = {'token': i,\n",
    "             'text': t.text, 'lemma': t.lemma_,\n",
    "             'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,\n",
    "             'ent_iob_': t.ent_iob_}\n",
    "      if doc.has_extension('coref_chains'):\n",
    "        if t.has_extension('coref_chains') and t._.coref_chains: # coreferee attributes\n",
    "          row['coref_chains'] = t._.coref_chains.pretty_representation\n",
    "        else:\n",
    "          row['coref_chains'] = None\n",
    "      if t.has_extension('ref_n'): # referent attribute\n",
    "        row['ref_n'] = t._.ref_n\n",
    "        row['ref_t'] = t._.ref_t\n",
    "      if t.has_extension('ref_ent'): # ref_n/ref_t\n",
    "        row['ref_ent'] = t._.ref_ent\n",
    "      rows.append(row)\n",
    "  if not rows:\n",
    "    # Empty doc or punctuation only: without rows there is no 'token' column,\n",
    "    # so set_index('token') would raise KeyError. Return an empty frame instead.\n",
    "    return pd.DataFrame()\n",
    "  df = pd.DataFrame(rows).set_index('token')\n",
    "  df.index.name = None\n",
    "  return df\n",
    "\n",
    "\n",
    "# Reset Pipelines\n",
    "def resetPipeline(nlp, pipes):\n",
    "  \"\"\"\n",
    "    Remove every pipe except the retained base pipes, then add the requested pipes.\n",
    "    @ In, nlp, spacy.language.Language, the pipeline object, modified in place\n",
    "    @ In, pipes, list, names of the pipes to add, in the given order\n",
    "    @ Out, None\n",
    "  \"\"\"\n",
    "  retained = ['tagger', 'parser',\n",
    "              'tok2vec', 'attribute_ruler', 'lemmatizer']\n",
    "  # collect the names first, then remove them one by one\n",
    "  toRemove = []\n",
    "  for name, _ in nlp.pipeline:\n",
    "    if name not in retained:\n",
    "      toRemove.append(name)\n",
    "  for name in toRemove:\n",
    "    nlp.remove_pipe(name)\n",
    "  # re-add specified pipes\n",
    "  for name in pipes:\n",
    "    nlp.add_pipe(name)\n",
    "\n",
    "# Print Dependency Tree\n",
    "def printDepTree(doc, skipPunct=True):\n",
    "  \"\"\"\n",
    "    Pretty print the dependency tree of every sentence in the document.\n",
    "    @ In, doc, spacy.tokens.doc.Doc, the parsed document\n",
    "    @ In, skipPunct, bool, optional, if True tokens with dependency label 'punct' are not printed\n",
    "    @ Out, None\n",
    "  \"\"\"\n",
    "  def walk(node, depth):\n",
    "    # a hidden punctuation node still has its children visited\n",
    "    hidden = skipPunct and node.dep_ == 'punct'\n",
    "    if not hidden:\n",
    "      print(\" \"*depth + f\"{node} [{node.pos_}, {node.dep_}]\")\n",
    "    # left children first, then right children, each indented four more spaces\n",
    "    for child in (*node.lefts, *node.rights):\n",
    "      walk(child, depth + 4)\n",
    "\n",
    "  for sentence in doc.sents: # one tree per sentence, starting at its root\n",
    "    walk(sentence.root, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72461323-634d-47c7-9070-c9b560e9f8cd",
   "metadata": {},
   "source": [
    "### Internally Developed Pipelines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e87f1527-df2f-4650-8f2a-d41a1e2b554f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalizing Named Entities, remove the leading article and trailing particle\n",
    "@Language.component(\"normEntities\")\n",
    "def normEntities(doc):\n",
    "  \"\"\"\n",
    "    Strip one leading determiner and one trailing particle from every named entity.\n",
    "    Entities that become empty are discarded.\n",
    "    @ In, doc, spacy.tokens.doc.Doc\n",
    "    @ Out, doc, spacy.tokens.doc.Doc\n",
    "  \"\"\"\n",
    "  normalized = []\n",
    "  for ent in doc.ents:\n",
    "    first, last = ent.start, ent.end\n",
    "    if doc[first].pos_ == \"DET\": # leading article\n",
    "      first += 1\n",
    "    if last > first and doc[last-1].pos_ == \"PART\": # trailing particle like 's\n",
    "      last -= 1\n",
    "    if (first, last) == (ent.start, ent.end):\n",
    "      normalized.append(ent) # nothing stripped, keep the original span object\n",
    "    elif last > first:\n",
    "      normalized.append(Span(doc, first, last, label=ent.label))\n",
    "  doc.ents = tuple(normalized)\n",
    "  return doc\n",
    "\n",
    "# Initialize Coreference Attributes with Entity Info\n",
    "@Language.component(\"initCoref\")\n",
    "def initCoref(doc):\n",
    "  \"\"\"\n",
    "    Store each entity's text and label on the first token of the entity.\n",
    "    @ In, doc, spacy.tokens.doc.Doc\n",
    "    @ Out, doc, spacy.tokens.doc.Doc\n",
    "  \"\"\"\n",
    "  for ent in doc.ents:\n",
    "    name, kind = ent.text, ent.label_\n",
    "    head = ent[0] # only the first token of the entity carries the attributes\n",
    "    head._.ref_n = name\n",
    "    head._.ref_t = kind\n",
    "  return doc\n",
    "\n",
    "# Anaphora resolution using coreferee\n",
    "@Language.component(\"anaphorCoref\")\n",
    "def anaphorCoref(doc):\n",
    "  \"\"\"\n",
    "    Anaphora resolution using coreferee\n",
    "    This pipeline needs to be added after NER.\n",
    "    The assumption here is: The entities need to be recognized first, then call\n",
    "    pipeline \"initCoref\" to assign initial custom attribute \"ref_n\" and \"ref_t\",\n",
    "    then call pipeline \"aliasResolver\" to resolve all the aliases used in the text\n",
    "    (\"aliasResolver\" is not part of the pipeline configured in this notebook).\n",
    "    After all these pre-processes, we can use \"anaphorCoref\" pipeline to resolve the\n",
    "    coreference.\n",
    "    @ In, doc, spacy.tokens.doc.Doc\n",
    "    @ Out, doc, spacy.tokens.doc.Doc, returned unchanged when the Token extension\n",
    "      \"coref_chains\" is not registered\n",
    "  \"\"\"\n",
    "  if not Token.has_extension('coref_chains'):\n",
    "    return doc\n",
    "  for token in doc:\n",
    "    coref = token._.coref_chains\n",
    "    # skip tokens without coreference info or already dereferenced\n",
    "    if not coref or token._.ref_n != '':\n",
    "      continue\n",
    "    # check all the references, if \"ref_n\" is available (determined by NER and initCoref),\n",
    "    # the value of \"ref_n\" will be assigned to current token\n",
    "    resolved = False\n",
    "    for chain in coref:\n",
    "      for ref in chain:\n",
    "        refToken = doc[ref[0]]\n",
    "        if refToken._.ref_n != '':\n",
    "          token._.ref_n = refToken._.ref_n\n",
    "          token._.ref_t = refToken._.ref_t\n",
    "          resolved = True\n",
    "          break\n",
    "      if resolved:\n",
    "        # stop at the first referent found; without this, later chains\n",
    "        # would still be scanned and could overwrite the assignment\n",
    "        break\n",
    "  return doc\n",
    "\n",
    "# Expand the current entities, recursive function to extend entity with all previous NOUN\n",
    "@Language.component(\"expandEntities\")\n",
    "def expandEntities(doc):\n",
    "  \"\"\"\n",
    "    Expand the current entities: every entity labelled \"SSC\" is extended to the left\n",
    "    over all directly preceding NOUN tokens. All other entities are kept unchanged.\n",
    "    @ In, doc, spacy.tokens.doc.Doc\n",
    "    @ Out, doc, spacy.tokens.doc.Doc\n",
    "  \"\"\"\n",
    "  # One token is absorbed per pass; repeat until a pass changes nothing\n",
    "  # (iterative form of the original recursion).\n",
    "  while True:\n",
    "    newEnts = []\n",
    "    isUpdated = False\n",
    "    for ent in doc.ents:\n",
    "      canExpand = False\n",
    "      if ent.label_ == \"SSC\" and ent.start != 0:\n",
    "        prevToken = doc[ent.start - 1]\n",
    "        # Do not absorb a token that already belongs to another entity:\n",
    "        # the spans would overlap and assigning doc.ents would fail.\n",
    "        canExpand = prevToken.pos_ in ['NOUN'] and not prevToken.ent_type_\n",
    "      if canExpand:\n",
    "        newEnts.append(Span(doc, ent.start - 1, ent.end, label=ent.label))\n",
    "        isUpdated = True\n",
    "      else:\n",
    "        # Keep the entity as is. Previously an \"SSC\" entity that could not be\n",
    "        # expanded was silently dropped here.\n",
    "        newEnts.append(ent)\n",
    "    doc.ents = newEnts\n",
    "    if not isUpdated:\n",
    "      break\n",
    "  return doc"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0b9239d",
   "metadata": {},
   "source": [
    "### Reset NLP Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd59f6b0-bfa8-44f9-b73b-0bc7369616b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pipes are added in this order: entity_ruler produces the entities that\n",
    "# normEntities and initCoref consume, and coreferee runs before anaphorCoref.\n",
    "pipelines = [\n",
    "  'entity_ruler',\n",
    "  'normEntities',\n",
    "  'initCoref',\n",
    "  'coreferee',\n",
    "  'anaphorCoref',\n",
    "  'expandEntities',\n",
    "]\n",
    "resetPipeline(nlp, pipelines)\n",
    "nlp.pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c77114f",
   "metadata": {},
   "source": [
    "### Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbc474ab-204f-4c6b-8f10-e76dbdd5457c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample text processed by the example cells below.\n",
    "# NOTE: the line breaks and leading spaces are part of the string; a later cell\n",
    "# addresses a token by its index, so do not reformat this literal.\n",
    "text = r\"\"\"A leak was noticed from the RCP pump 1A.\n",
    "          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n",
    "          The RCP pump 1A pressure gauge was found inoperative.\n",
    "          Rupture of pump bearings caused shaft degradation.\n",
    "          Rupture of pump bearings caused shaft degradation and consequent flow reduction.\n",
    "          Pump power supply has been found burnout.\n",
    "          Pump test failed due to power supply failure.\n",
    "          Pump inspection revealed excessive impeller degradation.\n",
    "          Pump inspection revealed excessive impeller degradation likely due to cavitation.\n",
    "        \"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "925872a9",
   "metadata": {},
   "source": [
    "#### Simple match "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c459300-fc7a-42cf-82d6-b2b04e292d61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Entity ruler: tag the token \"gauge\" as an entity with label \"comp\" and id \"ssc\".\n",
    "# NOTE(review): expandEntities only expands entities whose label is \"SSC\", so the\n",
    "# \"comp\" entities created here are never expanded - confirm this is intended.\n",
    "patterns = [\n",
    "  {\"label\": \"comp\", \"pattern\": [{\"LOWER\": \"gauge\"}], \"id\": \"ssc\"},\n",
    "]\n",
    "ruler = nlp.get_pipe('entity_ruler')\n",
    "ruler.add_patterns(patterns)\n",
    "\n",
    "# Stand-alone matcher: find every token \"pump\" (case-insensitive)\n",
    "rules = [{\"LOWER\": \"pump\"}]\n",
    "matcher = Matcher(nlp.vocab)\n",
    "matcher.add('comp', [rules])\n",
    "\n",
    "doc = nlp(text)\n",
    "matches = matcher(doc, as_spans=True)\n",
    "\n",
    "print('Identified Entities:')\n",
    "for span in matches:\n",
    "  print('Entity:', span.text, '| Label:', span.label_, '| Sentence', span.sent)\n",
    "\n",
    "# Entities stored on the doc by the pipeline, rendered inline\n",
    "displacy.render(doc, style='ent', jupyter=True)\n",
    "\n",
    "print('Dependency Tree:')\n",
    "printDepTree(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4c2ff01",
   "metadata": {},
   "source": [
    "#### Generate data frame for visualization of spaCy doc with custom attributes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e712288-8fb5-4c83-960d-b895fb7c4dba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per non-punctuation token of `doc`, including the custom ref_n / ref_t columns\n",
    "df = displayNER(doc)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13a0825d-1bb1-4ec0-817c-eb0536bc7968",
   "metadata": {},
   "source": [
    "#### Coreference Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "807d97dd-3627-4c2f-ae2d-89961da7860e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Coreference chains found by coreferee for the whole document\n",
    "print('Coreference Info: \\n', doc._.coref_chains.pretty_representation)\n",
    "\n",
    "# NOTE(review): index 22 is hard-coded for the sample text above; presumably it is\n",
    "# meant to point at a coreferring token such as the pronoun \"it\" - TODO confirm.\n",
    "print(f'Label for token \"{doc[22]}\" is \"{doc[22]._.ref_n}\"')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dackar_libs",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}