{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "990027c9",
   "metadata": {},
   "source": [
    "## Alias Pipeline Demo\n",
    "\n",
    "The alias pipeline (i.e., aliasResolver) is one of the internally developed natural language processing (NLP) pipelines. It annotates identified named entities with their aliases, which can be accessed through:\n",
    "\n",
    "```Python\n",
    "ent._.alias\n",
    "```\n",
    "\n",
    "__Note__: By default, the alias file located at ``./DACKAR/data/alias.csv`` is used. Users can also provide their own alias file through the config file located at ``./DACKAR/src/dackar/config/nlp_config_default.toml`` using the keyword ``alias_file``."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33f411ee",
   "metadata": {},
   "source": [
    "- Set up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8612552-0a4d-4832-81d7-b6163552c286",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup loading path, and load aliasResolver pipeline\n",
    "import os, sys\n",
    "cwd = os.getcwd()\n",
    "frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))\n",
    "sys.path.append(frameworkDir)\n",
    "\n",
    "# Load aliasResolver pipeline\n",
    "from dackar.pipelines.CustomPipelineComponents import aliasResolver\n",
    "from dackar.utils.nlp.nlp_utils import resetPipeline\n",
    "\n",
    "# Load the General Entity pipeline; aliasResolver only annotates entities that have already been identified\n",
    "from dackar.pipelines.GeneralEntity import GeneralEntity\n",
    "\n",
    "# Load pattern generation\n",
    "from dackar.utils.nlp.nlp_utils import generatePatternList\n",
    "\n",
    "# Load trained language model/pipeline from spacy, the language model/pipeline includes tok2vec, tagger, parser, attribute_ruler, lemmatizer, ner etc.\n",
    "import spacy\n",
    "from spacy import displacy\n",
    "nlp = spacy.load(\"en_core_web_lg\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7168ca9f",
   "metadata": {},
   "source": [
    "- Reset pipeline and add __aliasResolver__ "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7614e9f2-1d50-49d8-8e9b-8c7dd0230419",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x33ef4ed50>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x33ef4f830>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x33ef322d0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x33f0c0850>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x33f0c18d0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x33ef32650>), ('aliasResolver', <function aliasResolver at 0x32be9d440>)]\n"
     ]
    }
   ],
   "source": [
    "# The aliasResolver pipeline should always be placed after \"entity_ruler\"\n",
    "pipelines = ['aliasResolver']\n",
    "resetPipeline(nlp, pipelines)\n",
    "print(nlp.pipeline)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "080c0a1a",
   "metadata": {},
   "source": [
    "- Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c744e125-5782-4592-8048-9f17cc43c2c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example\n",
    "text=\"1-91120-P1, CLEAN PUMP AND MOTOR. 1-91120-PM1 REQUIRES OIL. 91120, CLEAN TRASH SCREEN\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0c8fcfc3-9f0f-4fac-92ab-07dea3a06d63",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    1-91120-P1\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">cws_component</span>\n",
       "</mark>\n",
       ", CLEAN PUMP AND MOTOR. \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    1-91120-PM1\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">cws_component</span>\n",
       "</mark>\n",
       " REQUIRES OIL. \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    91120\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">cws_component</span>\n",
       "</mark>\n",
       ", CLEAN TRASH SCREEN</div></span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# For this demo, the GeneralEntity class is used instead of spaCy's default NER pipeline,\n",
    "# so remove any existing entity-recognition components before adding it.\n",
    "for pipeName in (\"entity_ruler\", \"ner\"):\n",
    "    # Check the live component names so the cell can be re-run safely\n",
    "    if pipeName in nlp.pipe_names:\n",
    "        nlp.remove_pipe(pipeName)\n",
    "\n",
    "# Specify entity label, ID, and the entity strings to match\n",
    "entLabel = \"cws_component\"        # user defined entity label\n",
    "entId = \"OPM\"\n",
    "entIDList = ['1-91120-P1', '1-91120-PM1', '91120']\n",
    "# Generate pattern list\n",
    "patternsEnts = generatePatternList(entIDList, label=entLabel, id=entId, nlp=nlp, attr=\"LEMMA\")\n",
    "# Apply General Entity class to identify corresponding entities\n",
    "generalEntity = GeneralEntity(nlp, patternsEnts)\n",
    "# NLP processing and entity visualization\n",
    "doc = nlp(text)\n",
    "displacy.render(doc, style='ent', jupyter=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5e4cefd2-bc47-4151-96b0-834742614972",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entity: 1-91120-P1 | alias: unit 1 pump\n",
      "Entity: 1-91120-PM1 | alias: unit 1 pump motor\n",
      "Entity: 91120 | alias: pump\n"
     ]
    }
   ],
   "source": [
    "# Show the alias annotation (ent._.alias) for each recognized entity\n",
    "for ent in doc.ents:\n",
    "    print(f\"Entity: {ent.text} | alias: {ent._.alias}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}