Source code for src.dackar.pipelines.TemporalEntity

# Copyright 2024, Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED

# The code is adapted from https://github.com/wjbmattingly/date-spacy
# and https://github.com/AliSeyedkav/SMS-TEXT-Time-Date-Recognition-

import re
from spacy.tokens import Span
from spacy.language import Language
from spacy.util import filter_spans
from .SimpleEntityMatcher import SimpleEntityMatcher

# dateparser: Python parser for human-readable dates: https://dateparser.readthedocs.io/en/latest/
# May be adapted in the future
# import dateparser

# Set up a date extension on the span
# Span.set_extension("Temporal", default=None, force=True)


@Language.factory("Temporal")
def find_temporal(nlp, name):
    """
    Factory registered with spaCy under the name "Temporal".

    Args:
        nlp: spacy nlp model, forwarded to the Temporal constructor
        name: second factory argument; it is not used here

    Returns:
        Temporal: a newly constructed Temporal component
    """
    component = Temporal(nlp)
    return component
class Temporal(object):
    """
    Pipeline component that labels date/time expressions as ``Temporal``.

    Two passes are made over a document: a verbose regular expression
    (``datePattern``) is run on the raw text and its token-aligned hits are
    added to ``doc.ents``; the document is then handed to a
    ``SimpleEntityMatcher`` configured with token patterns built from temporal
    keywords and the ``DATE``/``TIME`` entity types.

    How to use it:

    .. code-block:: python

      from TemporalEntity import Temporal
      nlp = spacy.load("en_core_web_sm")
      pmatcher = Temporal(nlp)
      doc = nlp("The shaft deflection is causing the safety cage to rattle. Pumps not experiencing enough flow for the pumps to keep the check valves open during test. Pump not experiencing enough flow during test. Shaft made noise. Vibration seems like it is coming from the shaft.")
      updatedDoc = pmatcher(doc)

    or:

    .. code-block:: python

      nlp.add_pipe('Temporal')
      newDoc = nlp(doc.text)
    """

    def __init__(self, nlp):
        """
        Args:
            nlp: spacy nlp model
        """
        self.name = 'Temporal'
        # Ordinal words mapped to their numeric string. Within this class the
        # keys feed the date regex; the values are not read here (they were
        # only used by a dateparser-based normalisation that is not enabled).
        self.ordinalToNumber = {
            "first": "1", "second": "2", "third": "3", "fourth": "4",
            "fifth": "5", "sixth": "6", "seventh": "7", "eighth": "8",
            "ninth": "9", "tenth": "10", "eleventh": "11", "twelfth": "12",
            "thirteenth": "13", "fourteenth": "14", "fifteenth": "15",
            "sixteenth": "16", "seventeenth": "17", "eighteenth": "18",
            "nineteenth": "19", "twentieth": "20", "twenty-first": "21",
            "twenty-second": "22", "twenty-third": "23",
            "twenty-fourth": "24", "twenty-fifth": "25",
            "twenty-sixth": "26", "twenty-seventh": "27",
            "twenty-eighth": "28", "twenty-ninth": "29", "thirtieth": "30",
            "thirty-first": "31",
        }
        # The ordinal words are exactly the dictionary keys (insertion order
        # is preserved), so they are not spelled out a second time.
        ordinals = list(self.ordinalToNumber)
        # Units that turn "<number> <unit>" into a duration expression
        exceptions = [
            "hour", "hours", "minute", "minutes", "day", "days",
            "decade", "decades", "century", "centuries",
            "week", "weeks", "month", "months", "year", "years",
        ]
        # A regex pattern to capture a variety of date formats
        self.datePattern = self._buildDatePattern(ordinals, exceptions)

        # Token-level patterns handed to SimpleEntityMatcher
        terms1 = ["next", "last", "after", "every", "before", "during"]
        terms2 = [
            "today", "tomorrow", "year", "Monday", "Tuesday", "Wednesday",
            "Thursday", "yesterday", "weekend", "Friday", "Saturday", "Sunday",
        ]
        terms3 = [
            "morning", "afternoon", "noon", "dawn", "midnight", "dusk",
            "sunrise", "sunset", "evening", "night", "week", "weeks",
            "month", "months", "year", "years",
        ]
        prepositions = [
            "at", "on", "by", "from", "to", "before", "after", "between",
            "during", "in",
        ]
        entTypes = ["DATE", "TIME"]
        pattern = [
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "?"},
             {"ENT_TYPE": {"in": entTypes}, "OP": "?"}],
            [{"LOWER": {"in": prepositions}, "OP": "?"},
             {"ENT_TYPE": {"in": entTypes}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "?"},
             {"ENT_TYPE": {"in": entTypes}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "?"},
             {"LOWER": {"in": prepositions}, "OP": "?"},
             {"ENT_TYPE": {"in": entTypes}, "OP": "+"}],
        ]
        self.matcher = SimpleEntityMatcher(nlp, label='Temporal', terms=pattern)
        self.asSpan = True

    @staticmethod
    def _buildDatePattern(ordinals, units):
        """
        Assemble the verbose date/duration regular expression.

        Args:
            ordinals: list of str, ordinal words accepted as a day of month
            units: list of str, unit words accepted after a number

        Returns:
            str: pattern source; it must be compiled/used with ``re.VERBOSE``
        """
        ordinalPattern = r"\b(?:" + "|".join(ordinals) + r")\b"
        unitPattern = r"(?:" + "|".join(units) + r")\b"
        # Every fragment is a raw string: the pattern relies on regex escapes
        # (\s, \d, \-) that are invalid escape sequences in a plain literal.
        return (
            r"""
            # Day-Month-Year
            (?:
                \d{1,2}(?:st|nd|rd|th)?     # Day with optional st, nd, rd, th suffix
                \s+
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                (?:                         # Year is optional
                    \s+
                    \d{4}                   # Year
                )?
            )
            |
            # Day/Month/Year
            (?:
                \d{1,2}                     # Day
                [/-]
                \d{1,2}                     # Month
                (?:                         # Year is optional
                    [/-]
                    \d{2,4}                 # Year
                )?
            )
            |
            # Year-Month-Day
            (?:
                \d{4}                       # Year
                [-/]
                \d{1,2}                     # Month
                [-/]
                \d{1,2}                     # Day
            )
            |
            # Month-Day-Year
            (?:
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                \s+
                \d{1,2}(?!\d)               # Day; must not be the start of a longer number,
                                            # otherwise "jan 2024" stops at "jan 20" and the
                                            # Month-Year alternative below is never reached
                (?:st|nd|rd|th)?            # Optional st, nd, rd, th suffix
                (?:                         # Year is optional
                    ,?
                    \s+
                    \d{4}                   # Year
                )?
            )
            |
            # Month-Year
            (?:
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                \s+
                \d{4}                       # Year
            )
            |
            # Ordinal-Day-Month-Year
            (?:
                """
            + ordinalPattern
            + r"""
                \s+
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                (?:                         # Year is optional
                    \s+
                    \d{4}                   # Year
                )?
            )
            |
            # Ordinal of Month (Year)
            (?:
                """
            + ordinalPattern
            + r"""
                \s+
                of
                \s+
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                (?:                         # Year is optional
                    \s+
                    \d{4}                   # Year
                )?
            )
            |
            # Month Ordinal
            (?:
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                \s+
                """
            + ordinalPattern
            + r"""
                (?:                         # Year is optional
                    \s+
                    \d{4}                   # Year
                )?
            )
            |
            # Number followed by a time unit, e.g. "12 hours", "3-day"
            (?:
                \d+
                (?:\-|\s+)?
                """
            + unitPattern
            + r"""
            )
            """
        )

    def __call__(self, doc):
        """
        Args:
            doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines

        Returns:
            the object returned by ``self.matcher(doc, replace=True)``
        """
        # Index token boundaries once so each regex hit is resolved with two
        # dictionary lookups instead of a scan over every token.
        startToToken = {}
        endToToken = {}
        for token in doc:
            startToToken[token.idx] = token.i
            endToToken[token.idx + len(token.text)] = token.i

        # NOTE(review): the pattern is applied without re.IGNORECASE, so month
        # names and ordinals only match lower-case text -- confirm the text is
        # lower-cased upstream or that this is intended.
        newEnts = []
        for match in re.finditer(self.datePattern, doc.text, re.VERBOSE):
            startChar, endChar = match.span()
            startToken = startToToken.get(startChar)
            endToken = endToToken.get(endChar)
            # A hit that begins or ends inside a token cannot become a Span
            if startToken is None or endToken is None:
                continue
            newEnts.append(Span(doc, startToken, endToken + 1, label="Temporal"))
            # dateparser.parse(doc.text[startChar:endChar]) could be used here
            # in the future to attach a parsed date to the span.

        # Combine the new entities with existing entities, ensuring no overlap
        doc.ents = filter_spans(newEnts + list(doc.ents))
        # Using SimpleEntityMatcher
        doc = self.matcher(doc, replace=True)
        return doc