{ "cells": [ { "cell_type": "markdown", "id": "0d288bbe-c31c-4be8-9b5d-cb11b5a3a514", "metadata": {}, "source": [ "# Demo For NLP Workflow: Based on branch wangc/nlp" ] }, { "cell_type": "markdown", "id": "976dfa17-f796-43f4-a801-bd102a5ac551", "metadata": {}, "source": [ "### Set Paths and Loading Required Modules \n", "- Required libraries, please check SR2ML/dependencies.xml" ] }, { "cell_type": "code", "execution_count": 1, "id": "d55142b4-776b-4d7a-8954-60f2c10a5075", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Warming up PyWSD (takes ~10 secs)... took 2.4229929447174072 secs.\n" ] } ], "source": [ "# External Modules #\n", "import os\n", "import sys\n", "import pandas as pd\n", "import spacy\n", "import logging\n", "import numerizer \n", "#######################\n", "\n", "# Settings #\n", "cwd = os.getcwd()\n", "frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))\n", "sys.path.append(frameworkDir)\n", "########################\n", "\n", "# Internal Modules #\n", "from dackar.workflows.RuleBasedMatcher import RuleBasedMatcher\n", "from dackar import config\n", "from dackar.utils.nlp.nlp_utils import generatePatternList\n", "from dackar.utils.opm.OPLparser import OPMobject\n", "from dackar.text_processing.Preprocessing import Preprocessing\n", "from dackar.text_processing.Preprocessing import SpellChecker\n", "#########################\n", "\n", "# logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.DEBUG)\n", "logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)\n", "nlp = spacy.load(\"en_core_web_lg\", exclude=[])" ] }, { "cell_type": "markdown", "id": "e042942d-2cc8-4085-997b-9278928ddeb4", "metadata": {}, "source": [ "### Initialize variables " ] }, { "cell_type": "code", "execution_count": 2, "id": "6e2447ac-efad-427a-840c-f5e1776fc5ff", "metadata": {}, "outputs": [], "source": [ "cleanDoc = True # prepocessor the raw doc text, i.e., remove brackets, repeatings, punctuations.\n", "numerizerDoc = True # converts natural language numerics into ints and floats\n", "spellCorrectDoc = False # spell check and perform corrections\n", "\n", "entLabel = \"pump_component\" # user defined entity label\n", "entId = \"SSC\" # user defined entity ID \n", "causalLabel = \"causal_keywords\" # user defined causal keyword label \n", "causalID = \"causal\" # user defined causal keyword ID \n", "ents = [] # user provided entities list\n", "causalList = [] # user provided causal keyword list\n", "\n", "removeBrackets = ['curly', 'square', 'round']\n", "removeRepeatings = ['.']\n", "# TODO: extend repeating_chars to handle a list of chars, right now we can only pass one chars\n", "removePunctuation = ['/', \"#\", '~'] # right now puncuation is replaced with whitespace, we may need to replace it with None\n", "# TODO: add replace functions, for example, replace acronyms with full name\n", "\n", "preprocessorList = ['bullet_points', \n", " 'hyphenated_words', \n", " 'quotation_marks', \n", " 'unicode', \n", " 'repeating_chars',\n", " 'accents', \n", " 'brackets', \n", " 'html_tags', \n", " 'punctuation', \n", " # 'currency_symbols', \n", " 'emails', \n", " 'emojis', \n", " 'hashtags', \n", " # 'numbers', \n", " 'phone_numbers', \n", " 'urls', \n", " 'user_handles', \n", " 'whitespace',\n", " 'numerize']\n", "preprocessorOptions = {'repeating_chars': {'chars': removeRepeatings[0], 'maxn': 1}, \n", " 
  {
   "cell_type": "markdown",
   "id": "2bfb8a74-7f27-4635-b646-799e122e1c22",
   "metadata": {},
   "source": [
    "### Load the entity and causal keyword lists (or provide them directly)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9200dbab-1e1c-43f3-993a-b2529e2ec545",
   "metadata": {},
   "outputs": [],
   "source": [
    "entityFile = config.nlpConfig['files']['entity_file']\n",
    "entityList = pd.read_csv(entityFile).values.ravel().tolist()\n",
    "ents.extend(entityList)\n",
    "\n",
    "causalFilename = config.nlpConfig['files']['cause_effect_keywords_file']\n",
    "ds = pd.read_csv(causalFilename, skipinitialspace=True)\n",
    "for col in ds.columns:\n",
    "    causalList.extend(set(ds[col].dropna()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "488ff3a8-952e-4afe-abf0-afbd32a4bd6e",
   "metadata": {},
   "source": [
    "### Generate patterns that can be used in NER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fff8ad04-10e3-4d95-9c4c-614d27fa66e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "patternsEnts = generatePatternList(ents, label=entLabel, id=entId, nlp=nlp, attr=\"LEMMA\")\n",
    "patternsCausal = generatePatternList(causalList, label=causalLabel, id=causalID, nlp=nlp, attr=\"LEMMA\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b21f7184-0926-4a10-ad76-cb3cc40d308b",
   "metadata": {},
   "source": [
    "### Create the rule-based matcher with the entity and causal keyword lists"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1cdffe02-b5a1-43cd-8ec9-e77e8466349a",
   "metadata": {},
   "outputs": [],
   "source": [
    "matcher = RuleBasedMatcher(nlp, entID=entId, causalKeywordID=causalID)\n",
    "\n",
    "matcher.addEntityPattern('ssc_entity_ruler', patternsEnts)\n",
    "matcher.addEntityPattern('causal_keywords_entity_ruler', patternsCausal)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31663b1f-fa75-4bf7-8df3-8cded23e18a7",
   "metadata": {},
   "source": [
    "### Read the raw text data and preprocess it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8dbc176a-2aec-43d4-9a27-c455d4796c9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read raw text data; users can also provide a raw string here\n",
    "textFile = config.nlpConfig['files']['text_file']\n",
    "with open(textFile, 'r') as ft:\n",
    "    doc = ft.read()\n",
    "\n",
    "# Clean the doc\n",
    "if cleanDoc:\n",
    "    doc = preprocess(doc)\n",
    "if numerizerDoc:\n",
    "    doc = numerizer.numerize(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e3eaba8c-1628-430c-aa29-a3aba4b4ebb5",
   "metadata": {},
   "source": [
    "### Spell-correct the doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9c4f322a-787c-4414-81a2-913d988a986e",
   "metadata": {},
   "outputs": [],
   "source": [
    "availCheckers = ['autocorrect', 'ContextualSpellCheck']\n",
    "if spellCorrectDoc:\n",
    "    checker = SpellChecker(doc, checker=availCheckers[0])\n",
    "    misspelledWords = checker.getMisspelledWords()\n",
    "    print('Misspelled words: ', ','.join(misspelledWords))\n",
    "    updatedWords = input('Provide the words that should not be treated as misspelled (comma-separated words):')\n",
    "    updatedWords = [word.strip() for word in updatedWords.split(',')]\n",
    "    if len(updatedWords) != 0:\n",
    "        checker.addWordsToDictionary(updatedWords)\n",
    "    doc = checker.correct()"
   ]
  },
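  {
   "cell_type": "markdown",
   "id": "spellchecker-sketch-md",
   "metadata": {},
   "source": [
    "#### Optional: SpellChecker sketch on a sample string\n",
    "Since `spellCorrectDoc` is `False` in this demo, the cell above is skipped. The sketch below exercises the same `SpellChecker` calls on a short sample; the misspelled text is an illustrative assumption, and the corrections depend on the backend checker."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spellchecker-sketch-code",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sample with deliberate misspellings (an assumption, not project data)\n",
    "sampleChecker = SpellChecker('The pump bearng was replcaed last month.', checker=availCheckers[0])\n",
    "print('Misspelled words:', sampleChecker.getMisspelledWords())\n",
    "# correct() returns the corrected text, mirroring `doc = checker.correct()` above\n",
    "print(sampleChecker.correct())"
   ]
  },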
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "075cb669-1bea-4955-9d8b-41f8e42fddda",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the raw text to lower case so that spaCy can perform POS tagging correctly\n",
    "doc = doc.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "85a7243b-c947-4aa0-961d-f64fbd225988",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "matcher(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1517ae33-239d-49d2-9d8c-02fba74fe7b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # The following can be used to retrieve cause-effect information\n",
    "# causalEffect = config.nlpConfig['files']['output_causal_effect_file']\n",
    "# causalEffect = pd.read_csv(causalEffect)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a41cb301-01f3-4975-9124-b3ac18118395",
   "metadata": {},
   "outputs": [],
   "source": [
    "healthStatus = config.nlpConfig['files']['output_health_status_file']\n",
    "healthStatus = pd.read_csv(healthStatus)"
   ]
  },
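  {
   "cell_type": "markdown",
   "id": "healthstatus-inspect-md",
   "metadata": {},
   "source": [
    "#### Optional: inspect the health-status records\n",
    "A small sketch for slicing the extracted records. The column names used below are taken from the table in the next cell; they may differ across versions, so check `healthStatus.columns` first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "healthstatus-inspect-code",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the available columns, then peek at a few key fields\n",
    "print(healthStatus.columns.tolist())\n",
    "healthStatus[['entities', 'health status', 'sentence']].head()"
   ]
  },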
\n", " | Unnamed: 0 | \n", "entities | \n", "conjecture | \n", "negation | \n", "negation text | \n", "root | \n", "status keywords | \n", "health status prepend adjectival modifier | \n", "health status prepend | \n", "health status | \n", "health status append adjectival modifier | \n", "health status append | \n", "sentence | \n", "
---|