# Copyright 2024, Battelle Energy Alliance, LLC ALL RIGHTS RESERVED
# modified from https://github.com/nihitsaxena95/sentence-similarity-wordnet-sementic/blob/master/SentenceSimilarity.py
# Method proposed by: https://arxiv.org/pdf/1802.05667.pdf
from pywsd.lesk import simple_lesk
import numpy as np
from nltk.corpus import wordnet, wordnet_ic
import logging
from . import simUtils
log_format = "%(asctime)s %(name)s %(levelname)s %(message)s"
logging.basicConfig(
    filename="sentence_similarity_computing.log",
    filemode="a",
    level=logging.INFO,
    format=log_format,
    datefmt="%m/%d %I:%M:%S %p",
)
logger = logging.getLogger("SentenceSimilarity")
class SentenceSimilarity:
    def __init__(self, disambiguationMethod='simple_lesk', similarityMethod='semantic_similarity_synsets', wordOrderContribution=0.0):
        """
        Initialize the sentence similarity calculator

        Args:
            disambiguationMethod: str, word sense disambiguation method, one of self.validDisambiguation
            similarityMethod: str, synset similarity method, one of self.validSimilarity
            wordOrderContribution: float, [0, 1], weight of the word order similarity in the final score
        """
self.validDisambiguation = ['simple_lesk', 'original_lesk', 'cosine_lesk', 'adapted_lesk', 'max_similarity']
self.wordnetSimMethod = ["path_similarity", "wup_similarity", "lch_similarity", "res_similarity", "jcn_similarity", "lin_similarity"]
self.validSimilarity = self.wordnetSimMethod + ["semantic_similarity_synsets"]
self.wordOrder = wordOrderContribution
if disambiguationMethod.lower() not in self.validDisambiguation:
raise ValueError(f'Inappropriate argument value for "disambiguationMethod", valid values are {self.validDisambiguation}')
if similarityMethod.lower() not in self.validSimilarity:
raise ValueError(f'Inappropriate argument value for "similarityMethod", valid values are {self.validSimilarity}')
self.disambiguationMethod = disambiguationMethod.lower()
self.similarityMethod = similarityMethod.lower()
self.brownIc = wordnet_ic.ic('ic-brown.dat')
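    # Usage sketch (illustrative only; assumes the NLTK wordnet/wordnet_ic corpora and pywsd are available):
    #   calculator = SentenceSimilarity(disambiguationMethod='simple_lesk',
    #                                   similarityMethod='semantic_similarity_synsets',
    #                                   wordOrderContribution=0.2)
    #   score = calculator.sentenceSimilarity("The pump failed during the test.",
    #                                         "The pump broke while it was being tested.")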
    def setParameters(self, paramDict):
        """
        Update existing attributes of this class

        Args:
            paramDict: dict, {attributeName: value} pairs; keys that do not match an existing attribute are ignored
        """
        for key, value in paramDict.items():
            if key in self.__dict__:
                setattr(self, key, value)
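    # Usage sketch (illustrative only):
    #   calculator.setParameters({'wordOrder': 0.3, 'similarityMethod': 'path_similarity'})
    #   Only keys that already exist as attributes (e.g. wordOrder, similarityMethod) are applied.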
def constructSimilarityVectorPawarMagoMethod(self, arr1, arr2):
"""
Construct the similarity vector
Args:
arr1: set of wordnet.Synset for one sentence
arr2: set of wordnet.Synset for the other sentence
Returns:
vector: list, list of similarity vector
count: int, the number of words that have high similarity >=0.804
"""
        # vectorLen is the larger of the two lengths
        vectorLen = max(len(arr1), len(arr2))
        vector = [0.0] * vectorLen
count = 0
for i,a1 in enumerate(arr1):
allSimilarityIndex=[]
for a2 in arr2:
if a1 is None or a2 is None:
similarity = 0.
else:
a1Edit = wordnet.synset(a1.name())
a2Edit = wordnet.synset(a2.name())
similarity = simUtils.synsetsSimilarity(a1Edit, a2Edit, method=self.similarityMethod)
                if similarity is not None:
                    allSimilarityIndex.append(similarity)
                else:
                    allSimilarityIndex.append(0.0)
            allSimilarityIndex = sorted(allSimilarityIndex, reverse=True)
            vector[i] = allSimilarityIndex[0]
            # According to Rubenstein and Goodenough (1965), the benchmark synonymy value of two words is 0.8025.
            if vector[i] >= 0.8025:
                count += 1
vector = np.asarray(vector)
return vector, count
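    # Illustrative trace of constructSimilarityVectorPawarMagoMethod (hypothetical values, not from the original code):
    #   with arr1 = [Synset('cat.n.01'), Synset('sit.v.01')] and arr2 = [Synset('cat.n.01'), None],
    #   the returned vector is [1.0, s], where s is the best score of sit.v.01 against arr2,
    #   and count >= 1 because vector[0] >= 0.8025.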
    def sentenceSimilarity(self, sentence1, sentence2, method='pm_disambiguation', infoContentNorm=False):
        """
        Compute the similarity between two sentences

        Args:
            sentence1: str, first sentence used to compute sentence similarity
            sentence2: str, second sentence used to compute sentence similarity
            method: str, either 'pm_disambiguation' or 'best_sense'
            infoContentNorm: bool, True if a statistics corpus is used to weight the similarity vectors (only used by 'best_sense')

        Returns:
            similarity: float, [0, 1], the computed similarity for the given two sentences
        """
if method.lower() == 'pm_disambiguation':
similarity = self.sentenceSimilarityPawarMagoMethod(sentence1, sentence2)
elif method.lower() == 'best_sense':
            similarity = self.sentenceSimilarityBestSense(sentence1, sentence2, infoContentNorm)
else:
raise ValueError(f'{method} is not a valid option, please try "pm_disambiguation" or "best_sense"')
return similarity
def sentenceSimilarityPawarMagoMethod(self, sentence1, sentence2):
"""
Proposed method from https://arxiv.org/pdf/1802.05667.pdf
Args:
sentence1: str, first sentence used to compute sentence similarity
sentence2: str, second sentence used to compute sentence similarity
Returns:
similarity: float, [0, 1], the computed similarity for given two sentences
"""
_, sense1 = simUtils.sentenceSenseDisambiguationPyWSD(sentence1, senseMethod=self.disambiguationMethod, simMethod='path')
_, sense2 = simUtils.sentenceSenseDisambiguationPyWSD(sentence2, senseMethod=self.disambiguationMethod, simMethod='path')
v1, c1 = self.constructSimilarityVectorPawarMagoMethod(sense1,sense2)
v2, c2 = self.constructSimilarityVectorPawarMagoMethod(sense2,sense1)
        # FIXME: check the following algorithms with benchmarks
        # dot = np.dot(v1, v2)
        # from the original paper: use the product of the norms of the two similarity vectors
        dot = np.linalg.norm(v1) * np.linalg.norm(v2)
        tow = (c1 + c2) / 1.8
        if tow == 0.:
            tow = len(v1) / 2.0
        semanticSimilarity = dot / tow
similarity = (1-self.wordOrder) * semanticSimilarity + self.wordOrder * simUtils.wordOrderSimilaritySentences(sentence1, sentence2)
return similarity
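    # Illustrative trace of sentenceSimilarityPawarMagoMethod (hypothetical numbers, for orientation only):
    #   if v1 = [0.9, 0.85], v2 = [0.9, 0.85, 0.0], c1 = c2 = 2, and wordOrder = 0.0, then
    #   dot = ||v1|| * ||v2|| ~= 1.238 * 1.238 ~= 1.533, tow = (2 + 2) / 1.8 ~= 2.222,
    #   and semanticSimilarity ~= 1.533 / 2.222 ~= 0.69, which is returned as the similarity.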
    def sentenceSimilarityBestSense(self, sentence1, sentence2, infoContentNorm=False):
        """
        Proposed method from https://github.com/anishvarsha/Sentence-Similaritity-using-corpus-statistics
        Compute sentence similarity using both semantic and word order similarity
        The semantic similarity is based on the maximum word similarity between each word in one sentence and the other sentence
Args:
sentence1: str, first sentence used to compute sentence similarity
sentence2: str, second sentence used to compute sentence similarity
infoContentNorm: bool, True if statistics corpus is used to weight similarity vectors
Returns:
similarity: float, [0, 1], the computed similarity for given two sentences
"""
similarity = (1-self.wordOrder) * simUtils.semanticSimilaritySentences(sentence1, sentence2, infoContentNorm) + self.wordOrder * simUtils.wordOrderSimilaritySentences(sentence1, sentence2)
return similarity
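
# Minimal runnable sketch (illustrative; the sentences and parameter values below are assumptions,
# and the NLTK 'wordnet'/'wordnet_ic' corpora plus the pywsd package must be installed beforehand).
# Because this module uses a relative import, run it as part of its package, e.g. `python -m <package>.<module>`.
if __name__ == '__main__':
    calculator = SentenceSimilarity(disambiguationMethod='simple_lesk',
                                    similarityMethod='semantic_similarity_synsets',
                                    wordOrderContribution=0.2)
    s1 = "The operators inspected the cooling pump for leaks."
    s2 = "The pump was checked by the staff for leakage."
    print("pm_disambiguation:", calculator.sentenceSimilarity(s1, s2, method='pm_disambiguation'))
    print("best_sense:", calculator.sentenceSimilarity(s1, s2, method='best_sense', infoContentNorm=False))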