{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "46237dbc-ac22-482f-9373-1c1eaee56f40", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import spacy\n", "from spacy.tokens import Span" ] }, { "cell_type": "code", "execution_count": 2, "id": "86be0452-6e98-46b4-9e4f-4447fc9996ef", "metadata": {}, "outputs": [], "source": [ "def displayNER(doc, includePunct=False):\n", " \"\"\"\n", " Generate data frame for visualization of spaCy doc with custom attributes.\n", " \"\"\"\n", " rows = []\n", " for i, t in enumerate(doc):\n", " if not t.is_punct or includePunct:\n", " row = {'token': i,\n", " 'text': t.text, 'lemma': t.lemma_,\n", " 'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,\n", " 'ent_iob_': t.ent_iob_}\n", " if doc.has_extension('coref_chains'):\n", " if t.has_extension('coref_chains') and t._.coref_chains: # neuralcoref attributes\n", " row['coref_chains'] = t._.coref_chains.pretty_representation\n", " else:\n", " row['coref_chains'] = None\n", " if t.has_extension('ref_n'): # referent attribute\n", " row['ref_n'] = t._.ref_n\n", " row['ref_t'] = t._.ref_t\n", " if t.has_extension('ref_ent'): # ref_n/ref_t\n", " row['ref_ent'] = t._.ref_ent\n", " rows.append(row)\n", " df = pd.DataFrame(rows).set_index('token')\n", " df.index.name = None\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": 3, "id": "72461323-634d-47c7-9070-c9b560e9f8cd", "metadata": {}, "outputs": [], "source": [ "def resetPipeline(nlp, pipes):\n", " \"\"\"\n", " remove all custom pipes, and add new pipes\n", " \"\"\"\n", " customPipes = [pipe for (pipe, _) in nlp.pipeline\n", " if pipe not in ['tagger', 'parser',\n", " 'tok2vec', 'attribute_ruler', 'lemmatizer']]\n", " for pipe in customPipes:\n", " _ = nlp.remove_pipe(pipe)\n", " # re-add specified pipes\n", " for pipe in pipes:\n", " nlp.add_pipe(pipe)\n", " logger.info(f\"Model: {nlp.meta['name']}, Language: {nlp.meta['lang']}\")\n", " logger.info('\\n'.join([pipe for (pipe,_) in nlp.pipeline]))" ] }, { "cell_type": "code", "execution_count": 4, "id": "c84f0247-badc-400d-8d9e-a70c30a44d0b", "metadata": {}, "outputs": [], "source": [ "def printDepTree(doc, skipPunct=True):\n", " \"\"\"\n", " Utility function to pretty print the dependency tree.\n", " \"\"\"\n", " def printRecursive(root, indent, skipPunct):\n", " if not root.dep_ == 'punct' or not skipPunct:\n", " print(\" \"*indent + f\"{root} [{root.pos_}, {root.dep_}]\")\n", " for left in root.lefts:\n", " printRecursive(left, indent=indent+4, skipPunct=skipPunct)\n", " for right in root.rights:\n", " printRecursive(right, indent=indent+4, skipPunct=skipPunct)\n", "\n", " for sent in doc.sents: # iterate over all sentences in a doc\n", " printRecursive(sent.root, indent=0, skipPunct=skipPunct)" ] }, { "cell_type": "markdown", "id": "03ce6912-5f3d-4a3d-8830-d8420c78f890", "metadata": {}, "source": [ "## Custom pipelines " ] }, { "cell_type": "code", "execution_count": 5, "id": "e87f1527-df2f-4650-8f2a-d41a1e2b554f", "metadata": {}, "outputs": [], "source": [ "from spacy.language import Language\n", "from spacy.tokens import Span\n", "from spacy.matcher import Matcher\n", "from spacy.tokens import Token" ] }, { "cell_type": "code", "execution_count": 6, "id": "d77fe2da-0b90-4c65-ae0a-7709b891e408", "metadata": {}, "outputs": [], "source": [ "customLabel = ['STRUCTURE', 'COMPONENT', 'SYSTEM']\n", "aliasLookup = {}" ] }, { "cell_type": "code", "execution_count": 7, "id": "7b875bff-5467-4770-8ab8-859339367865", "metadata": {}, "outputs": [], "source": [ 
"@Language.component(\"normEntities\")\n", "def normEntities(doc):\n", " \"\"\"\n", " Normalizing Named Entities, remove the leading article and trailing particle\n", " @ In, doc, spacy.tokens.doc.Doc\n", " @ Out, doc, spacy.tokens.doc.Doc\n", " \"\"\"\n", " ents = []\n", " for ent in doc.ents:\n", " if ent[0].pos_ == \"DET\": # leading article\n", " ent = Span(doc, ent.start+1, ent.end, label=ent.label)\n", " if len(ent) > 0:\n", " if ent[-1].pos_ == \"PART\": # trailing particle like 's\n", " ent = Span(doc, ent.start, ent.end-1, label=ent.label)\n", " if len(ent) > 0:\n", " ents.append(ent)\n", " doc.ents = tuple(ents)\n", " return doc" ] }, { "cell_type": "code", "execution_count": 8, "id": "b1510c06-e424-4563-972e-ff88dfd9fbc0", "metadata": {}, "outputs": [], "source": [ "@Language.component(\"initCoref\")\n", "def initCoref(doc):\n", " for e in doc.ents:\n", " # if e.label_ in customLabel:\n", " e[0]._.ref_n, e[0]._.ref_t = e.text, e.label_\n", " return doc" ] }, { "cell_type": "code", "execution_count": 9, "id": "eefa2983-fc49-4027-abde-f2281f27508a", "metadata": {}, "outputs": [], "source": [ "@Language.component(\"aliasResolver\")\n", "def aliasResolver(doc):\n", " \"\"\"\n", " Lookup aliases and store result in ref_t, ref_n\n", " \"\"\"\n", " for ent in doc.ents:\n", " token = ent[0].text\n", " if token in aliasLookup:\n", " aName, aType = aliasLookup[token]\n", " ent[0]._.ref_n, ent[0]._.ref_t = aName, aType\n", " return propagateEntType(doc)" ] }, { "cell_type": "code", "execution_count": 10, "id": "c7427840-70b6-4688-9d59-9857f29d5d45", "metadata": {}, "outputs": [], "source": [ "def propagateEntType(doc):\n", " \"\"\"\n", " propagate entity type stored in ref_t\n", " \"\"\"\n", " ents = []\n", " for e in doc.ents:\n", " if e[0]._.ref_n != '': # if e is a coreference\n", " e = Span(doc, e.start, e.end, label=e[0]._.ref_t)\n", " ents.append(e)\n", " doc.ents = tuple(ents)\n", " return doc" ] }, { "cell_type": "code", "execution_count": 11, "id": "307b1d23-da5c-47e1-96e7-ca807af1c2a0", "metadata": {}, "outputs": [], "source": [ "@Language.component(\"anaphorCoref\")\n", "def anaphorCoref(doc):\n", " \"\"\"\n", " Anaphora resolution using coreferee\n", " This pipeline need to be added after NER.\n", " The assumption here is: The entities need to be recognized first, then call\n", " pipeline \"initCoref\" to assign initial custom attribute \"ref_n\" and \"ref_t\",\n", " then call pipeline \"aliasResolver\" to resolve all the aliases used in the text.\n", " After all these pre-processes, we can use \"anaphorCoref\" pipeline to resolve the\n", " coreference.\n", " \"\"\"\n", " if not Token.has_extension('coref_chains'):\n", " return doc\n", " for token in doc:\n", " coref = token._.coref_chains\n", " # if token is coref and not already dereferenced\n", " if coref and token._.ref_n == '':\n", " # check all the references, if \"ref_n\" is available (determined by NER and initCoref),\n", " # the value of \"ref_n\" will be assigned to current totken\n", " for chain in coref:\n", " for ref in chain:\n", " refToken = doc[ref[0]]\n", " if refToken._.ref_n != '':\n", " token._.ref_n = refToken._.ref_n\n", " token._.ref_t = refToken._.ref_t\n", " break\n", " return doc" ] }, { "cell_type": "code", "execution_count": 12, "id": "38da0b2b-c0b8-41df-a62f-4b35a7128331", "metadata": {}, "outputs": [], "source": [ "@Language.component(\"expandEntities\")\n", "def expandEntities(doc):\n", " \"\"\"\n", " Expand the current entities, recursive function to extend entity with all previous NOUN\n", " 
\"\"\"\n", " newEnts = []\n", " isUpdated = False\n", " for ent in doc.ents:\n", " if ent.label_ == \"SSC\" and ent.start != 0:\n", " prevToken = doc[ent.start - 1]\n", " if prevToken.pos_ in ['NOUN']:\n", " newEnt = Span(doc, ent.start - 1, ent.end, label=ent.label)\n", " newEnts.append(newEnt)\n", " isUpdated = True\n", " else:\n", " newEnts.append(ent)\n", " doc.ents = newEnts\n", " if isUpdated:\n", " doc = expandEntities(doc)\n", " return doc" ] }, { "cell_type": "code", "execution_count": 13, "id": "23a97765-5029-4dc9-a418-3b959ebccadf", "metadata": {}, "outputs": [], "source": [ "import coreferee, spacy\n", "nlp = spacy.load(\"en_core_web_lg\")\n", "import logging\n", "logger = logging.getLogger(__name__)" ] }, { "cell_type": "code", "execution_count": 14, "id": "42718587-1eba-4162-a4b0-333f01ce79b5", "metadata": {}, "outputs": [], "source": [ "ch = logging.StreamHandler()\n", "logger.addHandler(ch)" ] }, { "cell_type": "code", "execution_count": 15, "id": "4683848d-8b5c-453c-b5f4-15cd5147c082", "metadata": {}, "outputs": [], "source": [ "#### Using spacy's Token extensions for coreferee\n", "if Token.has_extension('ref_n'):\n", " _ = Token.remove_extension('ref_n')\n", "if Token.has_extension('ref_t'):\n", " _ = Token.remove_extension('ref_t')\n", "if Token.has_extension('ref_t_'):\n", " _ = Token.remove_extension('ref_t_')\n", "Token.set_extension('ref_n', default='')\n", "Token.set_extension('ref_t', default='')" ] }, { "cell_type": "code", "execution_count": 16, "id": "dd59f6b0-bfa8-44f9-b73b-0bc7369616b3", "metadata": {}, "outputs": [], "source": [ "pipelines = ['entity_ruler','normEntities', 'initCoref', 'aliasResolver', 'coreferee','anaphorCoref', 'expandEntities']" ] }, { "cell_type": "code", "execution_count": 17, "id": "cc0561d7-775d-4c19-ae5c-68fc2d39fd09", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['entity_ruler',\n", " 'normEntities',\n", " 'initCoref',\n", " 'aliasResolver',\n", " 'coreferee',\n", " 'anaphorCoref',\n", " 'expandEntities']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipelines" ] }, { "cell_type": "code", "execution_count": 18, "id": "645de700-3e0b-40e0-aa15-dcea71a016f2", "metadata": {}, "outputs": [], "source": [ "resetPipeline(nlp, pipelines)" ] }, { "cell_type": "code", "execution_count": 19, "id": "a0f0d71b-0b76-416d-b73a-926933ded429", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('tok2vec', ),\n", " ('tagger', ),\n", " ('parser', ),\n", " ('attribute_ruler',\n", " ),\n", " ('lemmatizer', ),\n", " ('entity_ruler', ),\n", " ('normEntities', ),\n", " ('initCoref', ),\n", " ('aliasResolver', ),\n", " ('coreferee', ),\n", " ('anaphorCoref', ),\n", " ('expandEntities', )]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nlp.pipeline" ] }, { "cell_type": "code", "execution_count": 20, "id": "dbc474ab-204f-4c6b-8f10-e76dbdd5457c", "metadata": {}, "outputs": [], "source": [ "text = r\"\"\"A leak was noticed from the RCP pump 1A.\n", " The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n", " The RCP pump 1A pressure gauge was found inoperative.\n", " Rupture of pump bearings caused shaft degradation.\n", " Rupture of pump bearings caused shaft degradation and consequent flow reduction.\n", " Pump power supply has been found burnout.\n", " Pump test failed due to power supply failure.\n", " Pump inspection revealed excessive impeller degradation.\n", " Pump inspection revealed excessive impeller 
 { "cell_type": "code", "execution_count": 20, "id": "dbc474ab-204f-4c6b-8f10-e76dbdd5457c", "metadata": {}, "outputs": [], "source": [ "text = r\"\"\"A leak was noticed from the RCP pump 1A.\n", " The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n", " The RCP pump 1A pressure gauge was found inoperative.\n", " Rupture of pump bearings caused shaft degradation.\n", " Rupture of pump bearings caused shaft degradation and consequent flow reduction.\n", " Pump power supply has been found burnout.\n", " Pump test failed due to power supply failure.\n", " Pump inspection revealed excessive impeller degradation.\n", " Pump inspection revealed excessive impeller degradation likely due to cavitation.\n", " \"\"\"" ] },
 { "cell_type": "code", "execution_count": 21, "id": "8c459300-fc7a-42cf-82d6-b2b04e292d61", "metadata": {}, "outputs": [], "source": [ "patterns = [{\"label\":\"comp\", \"pattern\":[{\"LOWER\":\"gauge\"}], \"id\":\"ssc\"}]\n", "ruler = nlp.get_pipe('entity_ruler')\n", "ruler.add_patterns(patterns)\n", "rules = [{\"LOWER\":\"pump\"}]\n", "from spacy.matcher import Matcher\n", "matcher = Matcher(nlp.vocab)\n", "matcher.add('comp', [rules])" ] },
 { "cell_type": "code", "execution_count": 22, "id": "c65480e5-e18a-41a3-92a6-b4ff7f645fb5", "metadata": {}, "outputs": [], "source": [ "doc = nlp(text)" ] },
 { "cell_type": "code", "execution_count": 23, "id": "1ce1bea8-e78a-4f92-91f9-ea401d4f97a4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A leak was noticed from the RCP pump 1A.\n", " The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n", " comp\n", "A leak was noticed from the RCP pump 1A.\n", " The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n", " comp\n", "The RCP pump 1A pressure gauge was found inoperative.\n", " comp\n", "Rupture of pump bearings caused shaft degradation.\n", " comp\n", "Rupture of pump bearings caused shaft degradation and consequent flow reduction.\n", " comp\n", "Pump power supply has been found burnout.\n", " comp\n", "Pump test failed due to power supply failure.\n", " comp\n", "Pump inspection revealed excessive impeller degradation.\n", " comp\n", "Pump inspection revealed excessive impeller degradation likely due to cavitation.\n", " comp\n" ] } ], "source": [ "matches = matcher(doc, as_spans=True)\n", "for span in matches:\n", "    print(span.sent, span.label_)" ] },
 { "cell_type": "code", "execution_count": 24, "id": "7fb5cf82-4c40-435b-8809-9705ae2e12aa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'tuple'>\n" ] } ], "source": [ "print(type(doc.ents))" ] },
 { "cell_type": "code", "execution_count": 25, "id": "0e712288-8fb5-4c83-960d-b895fb7c4dba", "metadata": {}, "outputs": [], "source": [ "from spacy import displacy" ] },
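 { "cell_type": "markdown", "id": "ruler-vs-matcher-md", "metadata": {}, "source": [ "Note the division of labor above: the `entity_ruler` pattern writes `gauge` spans into `doc.ents`, while the `Matcher` only returns `pump` matches and leaves `doc.ents` untouched. A small check (a sketch, not part of the original run):" ] },
 { "cell_type": "code", "execution_count": null, "id": "ruler-vs-matcher", "metadata": {}, "outputs": [], "source": [ "# Entities come from the entity_ruler only; the Matcher does not modify doc.ents.\n", "print([(e.text, e.label_) for e in doc.ents])\n", "print(len(matcher(doc)), 'raw Matcher matches')" ] },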
 { "cell_type": "code", "execution_count": 26, "id": "22feb96d-37a4-4d54-af4b-db7899d75dfc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "[displaCy entity markup omitted: each occurrence of 'gauge' is highlighted with label 'comp'; no other spans are marked]" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "displacy.render(doc, style='ent', jupyter=True)" ] },
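 { "cell_type": "markdown", "id": "displacy-options-md", "metadata": {}, "source": [ "`displacy` also accepts an `options` dict that can restrict rendering to selected labels and assign them colors; a sketch for the custom `comp` label (the color value is arbitrary):" ] },
 { "cell_type": "code", "execution_count": null, "id": "displacy-options", "metadata": {}, "outputs": [], "source": [ "# Sketch: render only the custom 'comp' label and give it a color.\n", "displacy.render(doc, style='ent', jupyter=True,\n", "                options={'ents': ['comp'], 'colors': {'comp': '#ffd966'}})" ] },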
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "displacy.render(doc, style='ent', jupyter=True)" ] }, { "cell_type": "code", "execution_count": 27, "id": "4035355e-560c-4e33-8ddd-357880bb1cb2", "metadata": {}, "outputs": [], "source": [ "patterns = [{\"label\":\"comp\", \"pattern\":[{\"LOWER\":\"pressure gauge\"}, {\"POS\":\"NOUN\"}], \"id\":\"ssc\"}]" ] }, { "cell_type": "code", "execution_count": 28, "id": "172d9df3-82c4-4de2-9aa7-317e2211ef66", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "noticed [VERB, ROOT]\n", " leak [NOUN, nsubjpass]\n", " A [DET, det]\n", " was [AUX, auxpass]\n", " from [ADP, prep]\n", " RCP [PROPN, pobj]\n", " the [DET, det]\n", " pump [VERB, conj]\n", " 1A. [NUM, dobj]\n", " \n", " [SPACE, dep]\n", " pump [VERB, relcl]\n", " RCP [PROPN, nsubj]\n", " The [DET, det]\n", " found [VERB, conj]\n", " gauge [NOUN, nsubjpass]\n", " 1A [NOUN, compound]\n", " pressure [NOUN, compound]\n", " was [AUX, auxpass]\n", " operating [VERB, xcomp]\n", " not [PART, neg]\n", " and [CCONJ, cc]\n", " found [VERB, conj]\n", " it [PRON, nsubjpass]\n", " was [AUX, auxpass]\n", " inoperative [ADJ, oprd]\n", " \n", " [SPACE, dep]\n", "pump [VERB, ROOT]\n", " RCP [PROPN, nsubj]\n", " The [DET, det]\n", " found [VERB, conj]\n", " gauge [NOUN, nsubjpass]\n", " 1A [NOUN, compound]\n", " pressure [NOUN, compound]\n", " was [AUX, auxpass]\n", " inoperative [ADJ, oprd]\n", " \n", " [SPACE, dep]\n", "caused [VERB, ROOT]\n", " Rupture [NOUN, nsubj]\n", " of [ADP, prep]\n", " bearings [NOUN, pobj]\n", " pump [NOUN, compound]\n", " degradation [NOUN, dobj]\n", " shaft [NOUN, compound]\n", " \n", " [SPACE, dep]\n", "caused [VERB, ROOT]\n", " Rupture [NOUN, nsubj]\n", " of [ADP, prep]\n", " bearings [NOUN, pobj]\n", " pump [NOUN, compound]\n", " degradation [NOUN, dobj]\n", " shaft [NOUN, compound]\n", " and [CCONJ, cc]\n", " reduction [NOUN, conj]\n", " flow [NOUN, compound]\n", " consequent [ADJ, amod]\n", " \n", " [SPACE, dep]\n", "found [VERB, ROOT]\n", " supply [NOUN, nsubjpass]\n", " Pump [NOUN, compound]\n", " power [NOUN, compound]\n", " has [AUX, aux]\n", " been [AUX, auxpass]\n", " burnout [NOUN, oprd]\n", " \n", " [SPACE, dep]\n", "failed [VERB, ROOT]\n", " test [NOUN, nsubj]\n", " Pump [NOUN, compound]\n", " due [ADP, prep]\n", " to [ADP, pcomp]\n", " failure [NOUN, pobj]\n", " supply [NOUN, compound]\n", " power [NOUN, compound]\n", " \n", " [SPACE, dep]\n", "revealed [VERB, ROOT]\n", " inspection [NOUN, nsubj]\n", " Pump [NOUN, compound]\n", " degradation [NOUN, dobj]\n", " excessive [ADJ, amod]\n", " impeller [NOUN, compound]\n", " \n", " [SPACE, dep]\n", "revealed [VERB, ROOT]\n", " inspection [NOUN, nsubj]\n", " Pump [NOUN, compound]\n", " degradation [NOUN, dobj]\n", " excessive [ADJ, amod]\n", " impeller [NOUN, compound]\n", " likely [ADV, ccomp]\n", " due [ADP, prep]\n", " to [ADP, pcomp]\n", " cavitation [NOUN, pobj]\n", " \n", " [SPACE, dep]\n" ] } ], "source": [ "printDepTree(doc)" ] }, { "cell_type": "code", "execution_count": 29, "id": "e793d7ae-09c5-424c-adbc-fe9e2afb5df6", "metadata": {}, "outputs": [], "source": [ "df = displayNER(doc)" ] }, { "cell_type": "code", "execution_count": 30, "id": "b38987f8-ba22-4b6b-b8a6-97629c1e71bf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlemmaposdepent_typeent_iob_coref_chainsref_nref_t
0AaDETdetONone
1leakleakNOUNnsubjpassONone
2wasbeAUXauxpassONone
3noticednoticeVERBROOTONone
4fromfromADPprepONone
..............................
94likelylikelyADVccompONone
95duedueADPprepONone
96totoADPpcompONone
97cavitationcavitationNOUNpobjONone
99\\n\\nSPACEdepONone
\n", "

91 rows × 9 columns

\n", "
" ], "text/plain": [ " text lemma pos dep ent_type ent_iob_ coref_chains \\\n", "0 A a DET det O None \n", "1 leak leak NOUN nsubjpass O None \n", "2 was be AUX auxpass O None \n", "3 noticed notice VERB ROOT O None \n", "4 from from ADP prep O None \n", ".. ... ... ... ... ... ... ... \n", "94 likely likely ADV ccomp O None \n", "95 due due ADP prep O None \n", "96 to to ADP pcomp O None \n", "97 cavitation cavitation NOUN pobj O None \n", "99 \\n \\n SPACE dep O None \n", "\n", " ref_n ref_t \n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", ".. ... ... \n", "94 \n", "95 \n", "96 \n", "97 \n", "99 \n", "\n", "[91 rows x 9 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 31, "id": "13a0825d-1bb1-4ec0-817c-eb0536bc7968", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0: RCP(6), RCP(11), RCP(29); 1: gauge(15), it(22)'" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc._.coref_chains.pretty_representation" ] }, { "cell_type": "code", "execution_count": 32, "id": "4f46847c-7d90-4823-b33d-ec5bc0b7107f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "gauge\n", "gauge\n" ] } ], "source": [ "for ent in doc.ents:\n", " print(ent)" ] }, { "cell_type": "code", "execution_count": 33, "id": "2da65fee-85a7-4324-9beb-f6943f6ae8f4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'gauge'" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc[22]._.ref_n" ] }, { "cell_type": "code", "execution_count": 34, "id": "807d97dd-3627-4c2f-ae2d-89961da7860e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "token RCP\n", "RCP\n", "\n", "RCP\n", "\n", "RCP\n", "\n", "token RCP\n", "RCP\n", "\n", "RCP\n", "\n", "RCP\n", "\n", "token RCP\n", "RCP\n", "\n", "RCP\n", "\n", "RCP\n", "\n" ] } ], "source": [ "for token in doc:\n", " coref = token._.coref_chains\n", " \n", " # if token is coref and not already dereferenced\n", " if coref and token._.ref_n == '':\n", " print('token', token)\n", " # print(token,coref.pretty_representation)\n", " # check all the references, if \"ref_n\" is available (determined by NER and initCoref),\n", " # the value of \"ref_n\" will be assigned to current totken\n", " for chain in coref:\n", " for ref in chain:\n", " refToken = doc[ref[0]]\n", " print(refToken)\n", " print(refToken._.ref_n)\n", " if refToken._.ref_n != '':\n", " token._.ref_n = refToken._.ref_n\n", " token._.ref_t = refToken._.ref_t\n", " break" ] }, { "cell_type": "code", "execution_count": 35, "id": "560fd3d9-d77b-4b27-b384-e36e4a96c691", "metadata": {}, "outputs": [], "source": [ "import spacy\n", "from spacy.matcher import Matcher\n", "matcher = Matcher(nlp.vocab)\n", "pattern = [{\"LOWER\": \"hello\"}, {\"IS_PUNCT\": True}, {\"LOWER\": \"world\"}]\n", "matcher.add(\"HelloWorld\", [pattern])" ] }, { "cell_type": "code", "execution_count": 36, "id": "29ba4cf5-bd97-49a8-8079-ac1c17c85f6e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(None, [[{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]])" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matcher.get('HelloWorld')" ] }, { "cell_type": "code", "execution_count": 37, "id": "6cf4b333-dd53-4873-917c-8609749b54a1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(gauge, gauge)" ] }, "execution_count": 37, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "doc.ents" ] }, { "cell_type": "code", "execution_count": 38, "id": "fd922cc7-1bd6-46ab-92ab-2b8de041a725", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[A leak was noticed from the RCP pump 1A.\n", " The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.\n", " , The RCP pump 1A pressure gauge was found inoperative.\n", " ]\n" ] } ], "source": [ "sl = []\n", "for ent in doc.ents:\n", " sent = ent.sent\n", " if sent not in sl:\n", " sl.append(sent)\n", "print(sl)" ] }, { "cell_type": "code", "execution_count": 39, "id": "0a9e8f58-df86-4fc9-8f6b-725c05eb45aa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[gauge]\n", "{gauge}\n", "[gauge]\n", "{gauge}\n" ] } ], "source": [ "for sent in sl:\n", " print(sent.ents)\n", " print(set(sent.ents))" ] }, { "cell_type": "code", "execution_count": 40, "id": "bd05916c-e67e-417d-8407-b7367099ea5e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "noticed\n", "det\n", "nsubjpass\n", "auxpass\n", "ROOT\n", "prep\n", "det\n", "pobj\n", "conj\n", "dobj\n", "dep\n", "det\n", "nsubj\n", "relcl\n", "compound\n", "compound\n", "nsubjpass\n", "auxpass\n", "conj\n", "neg\n", "xcomp\n", "punct\n", "cc\n", "nsubjpass\n", "auxpass\n", "conj\n", "oprd\n", "punct\n", "dep\n", "pump\n", "det\n", "nsubj\n", "ROOT\n", "compound\n", "compound\n", "nsubjpass\n", "auxpass\n", "conj\n", "oprd\n", "punct\n", "dep\n" ] } ], "source": [ "for sent in sl:\n", " print(sent.root)\n", " for token in sent:\n", " print(token.dep_)\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "716e1aaf-f7d1-4017-b62a-519d30355601", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 5 }