# Copyright 2024, Battelle Energy Alliance, LLC ALL RIGHTS RESERVED
# The code is adapted from https://github.com/wjbmattingly/date-spacy
# and https://github.com/AliSeyedkav/SMS-TEXT-Time-Date-Recognition-
import re
from spacy.tokens import Span
from spacy.language import Language
from spacy.util import filter_spans
from .SimpleEntityMatcher import SimpleEntityMatcher
# dateparse: python parser for human readable dates: https://dateparser.readthedocs.io/en/latest/
# May be adapted in the future
# import dateparser
# Set up a date extension on the span
# Span.set_extension("Temporal", default=None, force=True)
@Language.factory("Temporal")
def find_temporal(nlp, name):
    """
    Factory registered with spaCy under the name "Temporal".

    Args:
        nlp: spacy nlp model, forwarded to the Temporal constructor
        name: not used by this function

    Returns:
        Temporal: a newly constructed Temporal pipeline component
    """
    component = Temporal(nlp)
    return component
class Temporal(object):
    """
    Temporal entity recognition pipeline.

    Text spans that look like dates or durations are located with a regular
    expression and labeled "Temporal"; the document is then handed to a
    SimpleEntityMatcher built from token patterns (weekday names, parts of
    the day, DATE/TIME entity types, ...).

    How to use it:

    .. code-block:: python

        # Temporal is the class defined in this module
        nlp = spacy.load("en_core_web_sm")
        pmatcher = Temporal(nlp)
        doc = nlp("The pump was replaced on January 5, 2020 and failed again after 3 weeks.")
        updatedDoc = pmatcher(doc)

    or:

    .. code-block:: python

        nlp.add_pipe('Temporal')
        newDoc = nlp(doc.text)
    """

    # Flags applied every time ``datePattern`` is used. VERBOSE is required
    # because the pattern is laid out with whitespace and comments;
    # IGNORECASE is required because month names and ordinals are spelled in
    # lower case inside the pattern while real text capitalizes them.
    _dateFlags = re.VERBOSE | re.IGNORECASE

    # Spelled-out day-of-month ordinals and their numeric form. The keys (in
    # insertion order) also feed the ordinal alternation of the date pattern.
    _ordinalToNumber = {
        "first": "1", "second": "2", "third": "3", "fourth": "4", "fifth": "5",
        "sixth": "6", "seventh": "7", "eighth": "8", "ninth": "9", "tenth": "10",
        "eleventh": "11", "twelfth": "12", "thirteenth": "13", "fourteenth": "14",
        "fifteenth": "15", "sixteenth": "16", "seventeenth": "17", "eighteenth": "18",
        "nineteenth": "19", "twentieth": "20", "twenty-first": "21", "twenty-second": "22",
        "twenty-third": "23", "twenty-fourth": "24", "twenty-fifth": "25", "twenty-sixth": "26",
        "twenty-seventh": "27", "twenty-eighth": "28", "twenty-ninth": "29", "thirtieth": "30",
        "thirty-first": "31",
    }

    # Units accepted after a number by the "<number> <unit>" alternative
    _durationUnits = (
        "hour", "hours", "minute", "minutes", "day", "days", "decade", "decades",
        "century", "centuries", "week", "weeks", "month", "months", "year", "years",
    )

    def __init__(self, nlp):
        """
        Args:
            nlp: spacy nlp model
        """
        # Per-instance copy so that changes on one instance stay local
        self.ordinalToNumber = dict(self._ordinalToNumber)
        # A regex pattern to capture a variety of date formats; it must be
        # applied with the flags stored in ``_dateFlags``
        self.datePattern = self._buildDatePattern()

        terms1 = [
            "next", "last", "after", "every", "before", "during"
        ]
        terms2 = [
            "today", "tomorrow", "year", "Monday", "Tuesday", "Wednesday", "Thursday", "yesterday", "weekend",
            "Friday", "Saturday", "Sunday"
        ]
        terms3 = [
            "morning", "afternoon", "noon", "dawn", "midnight", "dusk", "sunrise", "sunset", "evening", "night",
            "week", "weeks", "month", "months", "year", "years"
        ]
        prepositions = ["at", "on", "by", "from", "to", "before", "after", "between", "during", "in"]
        entTypes = ["DATE", "TIME"]
        # NOTE(review): every token of the second pattern is optional, so it
        # can presumably match a lone word from terms1 (e.g. "after") --
        # confirm against SimpleEntityMatcher that this is intended.
        pattern = [
            [{"LOWER": {"in": terms1}, "OP": "?"}, {"LEMMA": {"in": terms2}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"}, {"LEMMA": {"in": terms2}, "OP": "?"},
             {"ENT_TYPE": {"in": entTypes}, "OP": "?"}],
            [{"LOWER": {"in": prepositions}, "OP": "?"}, {"ENT_TYPE": {"in": entTypes}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"}, {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"}, {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "?"}, {"ENT_TYPE": {"in": entTypes}, "OP": "+"}],
            [{"LOWER": {"in": terms1}, "OP": "?"}, {"LEMMA": {"in": terms2}, "OP": "?"},
             {"LEMMA": {"in": terms3}, "OP": "?"}, {"LOWER": {"in": prepositions}, "OP": "?"},
             {"ENT_TYPE": {"in": entTypes}, "OP": "+"}],
        ]
        self.matcher = SimpleEntityMatcher(nlp, label='Temporal', terms=pattern)

    @classmethod
    def _buildDatePattern(cls):
        """
        Assemble the verbose regular expression used to find dates/durations.

        Returns:
            str: pattern source; it has to be compiled/applied with the flags
            in ``_dateFlags`` (verbose layout, case-insensitive month names)
        """
        ordinalPattern = r"\b(?:" + "|".join(cls._ordinalToNumber) + r")\b"
        unitPattern = r"(?:" + "|".join(cls._durationUnits) + r")\b"
        # Every segment is a raw string so that \s, \d and \- reach the regex
        # engine untouched instead of being invalid string escapes.
        return r"""
            # Day-Month-Year
            (?:
                \d{1,2}(?:st|nd|rd|th)? # Day with optional st, nd, rd, th suffix
                \s+
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                (?: # Year is optional
                    \s+
                    \d{4} # Year
                )?
            )
            |
            # Day/Month/Year
            (?:
                \d{1,2} # Day
                [/-]
                \d{1,2} # Month
                (?: # Year is optional
                    [/-]
                    \d{2,4} # Year
                )?
            )
            |
            # Year-Month-Day
            (?:
                \d{4} # Year
                [-/]
                \d{1,2} # Month
                [-/]
                \d{1,2} # Day
            )
            |
            # Month-Day-Year
            (?:
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                \s+
                \d{1,2}(?:st|nd|rd|th)? # Day with optional st, nd, rd, th suffix
                (?!\d) # Do not stop inside a longer number, so that a 4-digit year falls through to Month-Year
                (?: # Year is optional
                    ,?
                    \s+
                    \d{4} # Year
                )?
            )
            |
            # Month-Year
            (?:
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                \s+
                \d{4} # Year
            )
            |
            # Ordinal-Day-Month-Year
            (?:
                """ + ordinalPattern + r"""
                \s+
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                (?: # Year is optional
                    \s+
                    \d{4} # Year
                )?
            )
            |
            (?:
                """ + ordinalPattern + r"""
                \s+
                of
                \s+
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                (?: # Year is optional
                    \s+
                    \d{4} # Year
                )?
            )
            |
            # Month Ordinal
            (?:
                (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
                \s+
                """ + ordinalPattern + r"""
                (?: # Year is optional
                    \s+
                    \d{4} # Year
                )?
            )
            |
            # Number followed by a time unit, e.g. "5 days" or "3-hour"
            (?:
                \d+
                (?:\-|\s+)?
                """ + unitPattern + r"""
            )
        """

    def __call__(self, doc):
        """
        Label date/duration expressions in ``doc`` as "Temporal" entities.

        Args:
            doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines

        Returns:
            the document returned by the SimpleEntityMatcher after the regex
            based entities have been merged into ``doc.ents``
        """
        newEnts = []
        for match in re.finditer(self.datePattern, doc.text, self._dateFlags):
            startChar, endChar = match.span()
            # char_span returns None unless both character offsets fall
            # exactly on token boundaries; such partial-token hits are dropped
            ent = doc.char_span(startChar, endChar, label="Temporal")
            if ent is not None:
                newEnts.append(ent)
        # Possible extension: parse the matched text with dateparser (after
        # mapping ordinals through self.ordinalToNumber and removing " of ")
        # and keep the parsed date on a custom Span attribute.

        # Combine the new entities with existing entities, ensuring no overlap
        doc.ents = filter_spans(newEnts + list(doc.ents))
        # Using SimpleEntityMatcher
        doc = self.matcher(doc, replace=True)
        return doc