{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0d288bbe-c31c-4be8-9b5d-cb11b5a3a514",
   "metadata": {},
   "source": [
    "# Demo For NLP Workflow: Based on branch wangc/nlp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "976dfa17-f796-43f4-a801-bd102a5ac551",
   "metadata": {},
   "source": [
    "### Set Paths and Loading Required Modules \n",
    "- Required libraries, please check SR2ML/dependencies.xml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d55142b4-776b-4d7a-8954-60f2c10a5075",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warming up PyWSD (takes ~10 secs)... took 2.4229929447174072 secs.\n"
     ]
    }
   ],
   "source": [
    "# External Modules #\n",
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import spacy\n",
    "import logging\n",
    "import numerizer \n",
    "#######################\n",
    "\n",
    "# Settings #\n",
    "cwd = os.getcwd()\n",
    "frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))\n",
    "sys.path.append(frameworkDir)\n",
    "########################\n",
    "\n",
    "# Internal Modules #\n",
    "from dackar.workflows.RuleBasedMatcher import RuleBasedMatcher\n",
    "from dackar import config\n",
    "from dackar.utils.nlp.nlp_utils import generatePatternList\n",
    "from dackar.utils.opm.OPLparser import OPMobject\n",
    "from dackar.text_processing.Preprocessing import Preprocessing\n",
    "from dackar.text_processing.Preprocessing import SpellChecker\n",
    "#########################\n",
    "\n",
    "# logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)\n",
    "logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)\n",
    "nlp = spacy.load(\"en_core_web_lg\", exclude=[])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e042942d-2cc8-4085-997b-9278928ddeb4",
   "metadata": {},
   "source": [
    "### Initialize variables "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6e2447ac-efad-427a-840c-f5e1776fc5ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "cleanDoc = True         # prepocessor the raw doc text, i.e., remove brackets, repeatings, punctuations.\n",
    "numerizerDoc = True     # converts natural language numerics into ints and floats\n",
    "spellCorrectDoc = False # spell check and perform corrections\n",
    "\n",
    "entLabel = \"pump_component\"       # user defined entity label\n",
    "entId = \"SSC\"                     # user defined entity ID \n",
    "causalLabel = \"causal_keywords\"   # user defined causal keyword label \n",
    "causalID = \"causal\"               # user defined causal keyword ID \n",
    "ents = []                         # user provided entities list\n",
    "causalList = []                   # user provided causal keyword list\n",
    "\n",
    "removeBrackets = ['curly', 'square', 'round']\n",
    "removeRepeatings = ['.']\n",
    "# TODO: extend repeating_chars to handle a list of chars, right now we can only pass one chars\n",
    "removePunctuation = ['/', \"#\", '~'] # right now puncuation is replaced with whitespace, we may need to replace it with None\n",
    "# TODO: add replace functions, for example, replace acronyms with full name\n",
    "\n",
    "preprocessorList = ['bullet_points', \n",
    "                    'hyphenated_words', \n",
    "                    'quotation_marks',   \n",
    "                    'unicode', \n",
    "                    'repeating_chars',\n",
    "                    'accents', \n",
    "                    'brackets', \n",
    "                    'html_tags', \n",
    "                    'punctuation', \n",
    "                    # 'currency_symbols', \n",
    "                    'emails', \n",
    "                    'emojis', \n",
    "                    'hashtags', \n",
    "                    # 'numbers', \n",
    "                    'phone_numbers', \n",
    "                    'urls', \n",
    "                    'user_handles', \n",
    "                    'whitespace',\n",
    "                    'numerize']\n",
    "preprocessorOptions = {'repeating_chars': {'chars': removeRepeatings[0], 'maxn': 1}, \n",
    "                       'unicode': {'form': 'NFKC'}, \n",
    "                       'accents': {'fast': False}, \n",
    "                       'brackets': {'only': removeBrackets},\n",
    "                       'punctuation': {'only': removePunctuation}}\n",
    "\n",
    "preprocess = Preprocessing(preprocessorList, preprocessorOptions)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2bfb8a74-7f27-4635-b646-799e122e1c22",
   "metadata": {},
   "source": [
    "### Load entity list and causal list or provide directly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9200dbab-1e1c-43f3-993a-b2529e2ec545",
   "metadata": {},
   "outputs": [],
   "source": [
    "entityFile = config.nlpConfig['files']['entity_file']\n",
    "entityList = pd.read_csv(entityFile).values.ravel().tolist()\n",
    "ents.extend(entityList)\n",
    "\n",
    "causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']\n",
    "ds = pd.read_csv(causalFilename, skipinitialspace=True)\n",
    "for col in ds.columns:\n",
    "    causalList.extend(set(ds[col].dropna()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "488ff3a8-952e-4afe-abf0-afbd32a4bd6e",
   "metadata": {},
   "source": [
    "### Generate patterns that can be used in NER "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fff8ad04-10e3-4d95-9c4c-614d27fa66e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "patternsEnts = generatePatternList(ents, label=entLabel, id=entId, nlp=nlp, attr=\"LEMMA\")\n",
    "patternsCausal = generatePatternList(causalList, label=causalLabel, id=causalID, nlp=nlp, attr=\"LEMMA\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b21f7184-0926-4a10-ad76-cb3cc40d308b",
   "metadata": {},
   "source": [
    "### Create rule-based matcher with entity list and causal entity list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1cdffe02-b5a1-43cd-8ec9-e77e8466349a",
   "metadata": {},
   "outputs": [],
   "source": [
    "matcher = RuleBasedMatcher(nlp, entID=entId, causalKeywordID=causalID)\n",
    "\n",
    "matcher.addEntityPattern('ssc_entity_ruler', patternsEnts)\n",
    "matcher.addEntityPattern('causal_keywords_entity_ruler', patternsCausal)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31663b1f-fa75-4bf7-8df3-8cded23e18a7",
   "metadata": {},
   "source": [
    "### Read raw text data and preprocess it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8dbc176a-2aec-43d4-9a27-c455d4796c9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read Raw Text Data, Users can also provide a raw string here\n",
    "textFile = config.nlpConfig['files']['text_file']\n",
    "with open(textFile, 'r') as ft:\n",
    "    doc = ft.read()\n",
    "\n",
    "# clean doc\n",
    "if cleanDoc:\n",
    "    doc = preprocess(doc)\n",
    "if numerizerDoc:\n",
    "    doc = numerizer.numerize(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e3eaba8c-1628-430c-aa29-a3aba4b4ebb5",
   "metadata": {},
   "source": [
    "### Correct the doc "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9c4f322a-787c-4414-81a2-913d988a986e",
   "metadata": {},
   "outputs": [],
   "source": [
    "availCheckers = ['autocorrect', 'ContextualSpellCheck']\n",
    "if spellCorrectDoc: \n",
    "    checker = SpellChecker(doc, checker=availCheckers[0])\n",
    "    misspelledWords = checker.getMisspelledWords()\n",
    "    print('MisspelledWords: ', ','.join(misspelledWords))\n",
    "    updatedWords = input('Provide the words that will not be treated as misspelled words (comma sperated words):')\n",
    "    updatedWords = [word.strip() for word in updatedWords.split(',')]\n",
    "    if len(updatedWords) != 0:\n",
    "        checker.addWordsToDictionary(updatedWords)\n",
    "    doc = checker.correct()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "075cb669-1bea-4955-9d8b-41f8e42fddda",
   "metadata": {},
   "outputs": [],
   "source": [
    "# raw text need to convert to lower case so that Spacy can perform POS correctly\n",
    "doc = doc.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "85a7243b-c947-4aa0-961d-f64fbd225988",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "matcher(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1517ae33-239d-49d2-9d8c-02fba74fe7b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Following used to retrieve causal effect information\n",
    "# causalEffect = config.nlpConfig['files']['output_causal_effect_file']\n",
    "# causalEffect = pd.read_csv(causalEffect)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a41cb301-01f3-4975-9124-b3ac18118395",
   "metadata": {},
   "outputs": [],
   "source": [
    "healthStatus = config.nlpConfig['files']['output_health_status_file']\n",
    "healthStatus = pd.read_csv(healthStatus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "13603f03-7571-4806-b599-ff685a544482",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>entities</th>\n",
       "      <th>conjecture</th>\n",
       "      <th>negation</th>\n",
       "      <th>negation text</th>\n",
       "      <th>root</th>\n",
       "      <th>status keywords</th>\n",
       "      <th>health status prepend adjectival modifier</th>\n",
       "      <th>health status prepend</th>\n",
       "      <th>health status</th>\n",
       "      <th>health status append adjectival modifier</th>\n",
       "      <th>health status append</th>\n",
       "      <th>sentence</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [Unnamed: 0, entities, conjecture, negation, negation text, root, status keywords, health status prepend adjectival modifier, health status prepend, health status, health status append adjectival modifier, health status append, sentence]\n",
       "Index: []"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "healthStatus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "196e911e-6270-41bf-bd49-47a05675fbb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(healthStatus.shape[0]):\n",
    "    print(list(healthStatus.iloc[i]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}