# Source code for src.dackar.pipelines.TemporalEntity

# Copyright 2024, Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED

# The code is adapted from https://github.com/wjbmattingly/date-spacy
# and https://github.com/AliSeyedkav/SMS-TEXT-Time-Date-Recognition-

import re
from spacy.tokens import Span
from spacy.language import Language
from spacy.util import filter_spans
from .SimpleEntityMatcher import SimpleEntityMatcher

# dateparser: Python parser for human-readable dates: https://dateparser.readthedocs.io/en/latest/
# May be adapted in the future
# import dateparser

# Set up a date extension on the span
# Span.set_extension("Temporal", default=None, force=True)


@Language.factory("Temporal")
def find_temporal(nlp, name):
    """
      Factory registered with spaCy under the name "Temporal"

      Args:
        nlp: spacy nlp model, forwarded to the Temporal constructor
        name: str, component name supplied by spaCy; it is not used here

      Returns:
        Temporal, a new Temporal pipeline component
    """
    component = Temporal(nlp)
    return component
class Temporal(object):
    """
      Temporal Entity Recognition class

      How to use it:

      .. code-block:: python

        from TemporalEntity import Temporal
        nlp = spacy.load("en_core_web_sm")
        pmatcher = Temporal(nlp)
        doc = nlp("The event is scheduled for 25th August 2023.")
        updatedDoc = pmatcher(doc)

      or:

      .. code-block:: python

        nlp.add_pipe('Temporal')
        newDoc = nlp(doc.text)
    """

    # Spelled-out day-of-month ordinals and their numeric form. The keys are
    # also the ordinal alternatives used by the date regex, so the two can
    # never drift apart.
    _ORDINAL_TO_NUMBER = {
        "first": "1", "second": "2", "third": "3", "fourth": "4",
        "fifth": "5", "sixth": "6", "seventh": "7", "eighth": "8",
        "ninth": "9", "tenth": "10", "eleventh": "11", "twelfth": "12",
        "thirteenth": "13", "fourteenth": "14", "fifteenth": "15",
        "sixteenth": "16", "seventeenth": "17", "eighteenth": "18",
        "nineteenth": "19", "twentieth": "20", "twenty-first": "21",
        "twenty-second": "22", "twenty-third": "23", "twenty-fourth": "24",
        "twenty-fifth": "25", "twenty-sixth": "26", "twenty-seventh": "27",
        "twenty-eighth": "28", "twenty-ninth": "29", "thirtieth": "30",
        "thirty-first": "31",
    }

    # Units accepted after a number to form a duration such as "3 days".
    _DURATION_UNITS = (
        "hour", "hours", "minute", "minutes", "day", "days",
        "decade", "decades", "century", "centuries",
        "week", "weeks", "month", "months", "year", "years",
    )

    # Month abbreviation followed by any run of letters.
    # NOTE(review): "[a-z]*" also accepts unrelated words that merely start
    # with a month abbreviation (e.g. "3 markets"); left as in the original.
    _MONTH = r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*"

    # VERBOSE is kept so that a caller-supplied, verbosely formatted
    # ``datePattern`` still works; IGNORECASE lets "August" match as well as
    # "august" (the usage example above relies on a capitalized month).
    _DATE_FLAGS = re.VERBOSE | re.IGNORECASE

    def __init__(self, nlp):
        """
          Args:
            nlp: spacy nlp model
        """
        self.name = 'Temporal'
        self.ordinalToNumber = dict(self._ORDINAL_TO_NUMBER)
        # A regex pattern to capture a variety of date formats
        self.datePattern = self._buildDatePattern()
        #
        terms1 = ["next", "last", "after", "every", "before", "during"]
        terms2 = [
            "today", "tomorrow", "year", "Monday", "Tuesday", "Wednesday",
            "Thursday", "yesterday", "weekend", "Friday", "Saturday", "Sunday"
        ]
        terms3 = [
            "morning", "afternoon", "noon", "dawn", "midnight", "dusk",
            "sunrise", "sunset", "evening", "night", "week", "weeks",
            "month", "months", "year", "years"
        ]
        prepositions = [
            "at", "on", "by", "from", "to", "before", "after", "between",
            "during", "in"
        ]
        dateOrTime = ["DATE", "TIME"]
        # Token patterns handed to SimpleEntityMatcher; every dict is a fresh
        # literal so no token spec is shared between patterns.
        pattern = [
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "?"},
             {"ENT_TYPE": {"in": dateOrTime}, "OP": "?"}],
            [{"LOWER": {"in": prepositions}, "OP": "?"},
             {"ENT_TYPE": {"in": dateOrTime}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "?"},
             {"ENT_TYPE": {"in": dateOrTime}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"},
             {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "?"},
             {"LOWER": {"in": prepositions}, "OP": "?"},
             {"ENT_TYPE": {"in": dateOrTime}, "OP": "+"}]
        ]
        self.matcher = SimpleEntityMatcher(nlp, label='Temporal', patterns=pattern)
        self.asSpan = True

    @classmethod
    def _buildDatePattern(cls):
        """
          Assemble the regular expression used to find date-like text

          Every fragment is a raw string, so the regex escapes (\\s, \\d, \\-)
          are never interpreted as (invalid) Python string escapes. The result
          contains no literal whitespace or '#', so it compiles to the same
          expression with or without re.VERBOSE.

          Returns:
            str, alternation of all supported date and duration formats
        """
        month = cls._MONTH
        day = r"\d{1,2}(?:st|nd|rd|th)?"
        optionalYear = r"(?:\s+\d{4})?"
        ordinal = r"\b(?:" + "|".join(cls._ORDINAL_TO_NUMBER) + r")\b"
        units = r"(?:" + "|".join(cls._DURATION_UNITS) + r")\b"
        alternatives = [
            # Day-Month-Year (year optional), e.g. "25th aug 2023"
            day + r"\s+" + month + optionalYear,
            # Day/Month/Year (year optional), e.g. "25/08/2023"
            r"\d{1,2}[/-]\d{1,2}(?:[/-]\d{2,4})?",
            # Year-Month-Day, e.g. "2023-08-25"
            r"\d{4}[-/]\d{1,2}[-/]\d{1,2}",
            # Month-Year, e.g. "aug 2023". This must be tried before
            # Month-Day-Year: alternation takes the first alternative that
            # succeeds, and Month-Day would otherwise consume only "aug 20",
            # a span that ends inside the token "2023" and gets discarded.
            month + r"\s+\d{4}",
            # Month-Day-Year (year optional), e.g. "aug 25th, 2023"
            month + r"\s+" + day + r"(?:,?\s+\d{4})?",
            # Ordinal-Month-Year (year optional), e.g. "first aug 2023"
            ordinal + r"\s+" + month + optionalYear,
            # Ordinal of Month (year optional), e.g. "first of aug 2023"
            ordinal + r"\s+of\s+" + month + optionalYear,
            # Month Ordinal (year optional), e.g. "aug first 2023"
            month + r"\s+" + ordinal + optionalYear,
            # Number followed by a duration unit, e.g. "3 days", "3-days"
            r"\d+(?:\-|\s+)?" + units,
        ]
        return "|".join("(?:" + alt + ")" for alt in alternatives)

    def _findDateSpans(self, text):
        """
          Locate every match of self.datePattern in text

          Args:
            text: str, raw text to scan

          Returns:
            list of (startChar, endChar) tuples, in order of appearance
        """
        return [m.span() for m in re.finditer(self.datePattern, text, self._DATE_FLAGS)]

    def __call__(self, doc):
        """
          Args:
            doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines

          Returns:
            the value returned by self.matcher(doc, replace=True), called after
            the regex-derived "Temporal" spans have been merged into doc.ents
        """
        newEnts = []
        for startChar, endChar in self._findDateSpans(doc.text):
            # char_span returns None unless both offsets fall exactly on token
            # boundaries, which is the same acceptance rule as scanning all
            # tokens for a matching start and end, without the rescan per match.
            ent = doc.char_span(startChar, endChar, label="Temporal")
            if ent is not None:
                newEnts.append(ent)
        # Combine the new entities with existing entities, ensuring no overlap
        doc.ents = filter_spans(newEnts + list(doc.ents))
        # Using SimpleEntityMatcher
        doc = self.matcher(doc, replace=True)
        return doc