{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "32ea3767-c35e-416b-a4b5-b5d1370df1f3",
   "metadata": {},
   "source": [
    "# Demo for Rule Based Natural Language Processing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb843d35-59af-4e3f-9700-fc6f6e2aad0e",
   "metadata": {},
   "source": [
    "### 1. Set up the path, so that the NLP modules can be found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0e32d652-af95-44d4-89a0-de554d2f33a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "cwd = os.getcwd()\n",
    "frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))\n",
    "# Guard the append so that re-running this cell does not add duplicate entries to sys.path.\n",
    "if frameworkDir not in sys.path:\n",
    "    sys.path.append(frameworkDir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80c89186-405c-471e-be16-e3e80cb9ab20",
   "metadata": {},
   "source": [
    "### 2. Load the spaCy model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4f0c40e5-bec7-4281-a313-80234c19b95f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "nlp = spacy.load(\"en_core_web_lg\", exclude=[])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c5f8cfc-2d85-4a53-bacc-d719614ea05b",
   "metadata": {},
   "source": [
    "### 3. Load other modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fdd6ef86-ce41-4813-aba4-70af07693a9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "386122b6-1ad0-469e-bc53-c2087857f649",
   "metadata": {},
   "source": [
    "### 4. Import NLP modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8126fb35-a9b1-45b1-b63d-b07b02e5dcf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dackar.workflows.RuleBasedMatcher import RuleBasedMatcher\n",
    "from dackar import config\n",
    "from dackar.utils.nlp.nlp_utils import generatePatternList"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62219682-2d67-4ec6-9c98-912d5a0ed246",
   "metadata": {},
   "source": [
    "### 5. Set up logging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2c0f142f-42fd-45c4-8389-1eeca5f8e8ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b31b48e-48b0-4dbe-8cf8-c78143c66b7f",
   "metadata": {},
   "source": [
    "### 6. Read and process entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "31319bfe-c0b9-4e9f-9c1f-a52bfae36215",
   "metadata": {},
   "outputs": [],
   "source": [
    "entityFile = config.nlpConfig['files']['entity_file']\n",
    "# Flatten every cell of the CSV into a single list, then de-duplicate it.\n",
    "entityList = pd.read_csv(entityFile).to_numpy().ravel().tolist()\n",
    "ents = set(entityList)\n",
    "label = \"pump_component\"\n",
    "entId = \"SSC\"\n",
    "patternsOPM = generatePatternList(ents, label=label, id=entId, nlp=nlp, attr=\"LEMMA\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fcad635-1186-4816-b4c7-5f3ed3969f7f",
   "metadata": {},
   "source": [
    "### 7. Read and process causal keywords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "643de747-4981-4bdd-ab44-f4f1abb16f1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "causalLabel = \"causal_keywords\"\n",
    "causalID = \"causal\"\n",
    "patternsCausal = []\n",
    "causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']\n",
    "ds = pd.read_csv(causalFilename, skipinitialspace=True)\n",
    "for col in ds.columns:\n",
    "    # Named `keywords` rather than `vars` so the Python builtin is not shadowed.\n",
    "    keywords = set(ds[col].dropna())\n",
    "    patternsCausal.extend(generatePatternList(keywords, label=causalLabel, id=causalID, nlp=nlp, attr=\"LEMMA\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c932ecb-c784-4534-9e01-c631ab2e6866",
   "metadata": {},
   "source": [
    "### 8. Create Rule-based matcher with entity list and causal entity list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8503b3c3-8a9e-4526-84b7-64cc8ccb9728",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "30-May-25 15:56:31 dackar.workflows.WorkflowBase INFO Create instance of RuleBasedMatcher\n",
      "30-May-25 15:56:33 dackar.utils.nlp.nlp_utils INFO Model: core_web_lg, Language: en\n",
      "30-May-25 15:56:33 dackar.utils.nlp.nlp_utils INFO Available pipelines:pysbdSentenceBoundaries, tok2vec, tagger, parser, attribute_ruler, lemmatizer, mergePhrase, normEntities, initCoref, aliasResolver, anaphorCoref, anaphorEntCoref\n"
     ]
    }
   ],
   "source": [
    "name = 'ssc_entity_ruler'\n",
    "matcher = RuleBasedMatcher(nlp, entID=entId, causalKeywordID=causalID)\n",
    "matcher.addEntityPattern(name, patternsOPM)\n",
    "\n",
    "causalName = 'causal_keywords_entity_ruler'\n",
    "matcher.addEntityPattern(causalName, patternsCausal)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae0ca9d7-0474-4845-848c-a9d5ab504218",
   "metadata": {},
   "source": [
    "### 9. Read input text file, or users can provide a raw string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e5ba4b71-c0ec-40dd-bcbb-51ce67de4b95",
   "metadata": {},
   "outputs": [],
   "source": [
    "textFile = config.nlpConfig['files']['text_file']\n",
    "# Explicit encoding so the result does not depend on the platform's default locale.\n",
    "with open(textFile, 'r', encoding='utf-8') as ft:\n",
    "    doc = ft.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "747c033a-b2dc-4ea7-b084-f539649ef5d7",
   "metadata": {},
   "source": [
    "### 10. Process raw string data using matcher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9cb4ae56-ea17-4289-b2c5-e3602ce4fd9f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher INFO Start to extract health status\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher WARNING No status identified for \"pump\" in \"Slight Vibrations is noticed - likely from pump shaft deflection.\n",
      "\"\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher WARNING Entity \"pump\" dep_ is \"xcomp\" is not among valid list \"[nsubj, nsubjpass, pobj, dobj, compound]\"\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher WARNING Entity \"pump\" dep_ is \"xcomp\" is not among valid list \"[nsubj, nsubjpass, pobj, dobj, compound]\"\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher WARNING Entity \"pump\" dep_ is \"advcl\" is not among valid list \"[nsubj, nsubjpass, pobj, dobj, compound]\"\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher INFO End of health status extraction!\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher INFO Start to extract causal relation using OPM model information\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher INFO End of causal relation extraction!\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher INFO Start to use general extraction method to extract causal relation\n",
      "30-May-25 15:56:33 dackar.workflows.RuleBasedMatcher INFO End of causal relation extraction using general extraction method!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(bearings, caused, shaft degradation) (bearings, caused, shaft degradation) (inspection, revealed, degradation) (inspection, revealed, degradation) (they, caused, failure) (Low flow conditions, causing, cavitation) (Pump, keep, the check valves) (shaft, made, noise) (Pump, made, noises)\n"
     ]
    }
   ],
   "source": [
    "matcher(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a55e8ef2-a278-49a0-be0a-deac5a3328e1",
   "metadata": {},
   "source": [
    "### 11. Access processed information from matcher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "cdf032fd-3c88-46fc-9629-0762973669eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[pump bearings,\n",
       " None,\n",
       " caused,\n",
       " shaft degradation,\n",
       " None,\n",
       " Rupture of pump bearings caused pump shaft degradation.,\n",
       " False],\n",
       " [pump bearings,\n",
       " None,\n",
       " caused,\n",
       " shaft degradation,\n",
       " None,\n",
       " Rupture of pump bearings caused pump shaft degradation and consequent flow reduction.,\n",
       " False],\n",
       " [power supply,\n",
       " None,\n",
       " due to,\n",
       " Pump,\n",
       " None,\n",
       " Pump test failed due to power supply failure.,\n",
       " False],\n",
       " [Pump,\n",
       " None,\n",
       " revealed,\n",
       " impeller,\n",
       " None,\n",
       " Pump inspection revealed excessive impeller degradation.,\n",
       " False],\n",
       " [Pump,\n",
       " None,\n",
       " revealed,\n",
       " impeller,\n",
       " None,\n",
       " Pump inspection revealed excessive impeller degradation likely due to cavitation.,\n",
       " True],\n",
       " [pump shaft,\n",
       " None,\n",
       " caused,\n",
       " pump,\n",
       " None,\n",
       " Several cracks on pump shaft were observed; they could have caused pump failure within few days.,\n",
       " True],\n",
       " [pump shaft,\n",
       " None,\n",
       " causing,\n",
       " motor,\n",
       " None,\n",
       " The pump shaft vibration appears to be causing the motor to vibrate as well.,\n",
       " False]]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): `_extractedCausals` is a private attribute; prefer a public accessor if RuleBasedMatcher exposes one - TODO confirm.\n",
    "matcher._extractedCausals"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}