Source code for src.dackar.similarity.simUtils

# Copyright 2024, Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED

import math
import numpy as np
import copy
# https://github.com/alvations/pywsd
from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd import disambiguate
from pywsd.similarity import max_similarity as maxsim

import nltk
from nltk import word_tokenize as tokenizer
from nltk.corpus import brown
from nltk.corpus import wordnet as wn

from .synsetUtils import semanticSimilaritySynsets, synsetsSimilarity
from .synsetUtils import semanticSimilarityUsingDisambiguatedSynsets

"""
  Methods proposed by: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1644735
  Codes are modified from https://github.com/anishvarsha/Sentence-Similaritity-using-corpus-statistics
"""

def sentenceSimilarity(sentenceA, sentenceB, infoContentNorm=False, delta=0.85):
  """
    Compute sentence similarity using both semantic and word order similarity
    The semantic similarity is based on maximum word similarity between one word and another sentence

    Args:
      sentenceA: str, first sentence used to compute sentence similarity
      sentenceB: str, second sentence used to compute sentence similarity
      infoContentNorm: bool, True if statistics corpus is used to weight similarity vectors
      delta: float, [0,1], similarity contribution from semantic similarity, 1-delta is the similarity contribution from word order similarity

    Returns:
      similarity: float, [0, 1], the computed similarity for given two sentences
  """
  similarity = delta * semanticSimilaritySentences(sentenceA, sentenceB, infoContentNorm) + (1.0-delta) * wordOrderSimilaritySentences(sentenceA, sentenceB)
  return similarity
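# Example usage (an illustrative sketch, not from the source; requires the NLTK 'punkt',
# 'wordnet', and 'brown' data, and exact scores depend on installed corpus/library versions):
#   score = sentenceSimilarity('A gem is a jewel or stone.',
#                              'A jewel is a precious stone.')
#   # float in [0, 1]; delta=0.85 weights semantic similarity over word order similarity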
def wordOrderSimilaritySentences(sentenceA, sentenceB):
  """
    Compute sentence similarity using word order similarity

    Args:
      sentenceA: str, first sentence used to compute sentence similarity
      sentenceB: str, second sentence used to compute sentence similarity

    Returns:
      similarity: float, [0, 1], the computed word order similarity for given two sentences
  """
  wordsA = tokenizer(sentenceA)
  wordsB = tokenizer(sentenceB)
  wordSet = list(set(wordsA).union(set(wordsB)))
  index = {word: i for i, word in enumerate(wordSet)}
  r1 = constructWordOrderVector(wordsA, wordSet, index)
  r2 = constructWordOrderVector(wordsB, wordSet, index)
  srTemp = np.linalg.norm(r1-r2)/np.linalg.norm(r1+r2)
  return 1-srTemp
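# Example usage (illustrative sketch, not from the source):
#   score = wordOrderSimilaritySentences('a quick inspection of the pump',
#                                        'a detailed inspection of the valve')
#   # float in [0, 1]; vector entries are indices into the joint word set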
def constructWordOrderVector(words, jointWords, index):
  """
    Construct word order vector

    Args:
      words: list, tokenized words of one sentence
      jointWords: set/list, the joint set of words from both sentences
      index: dict, word index in the joint set of words

    Returns:
      vector: numpy.array, the word order vector
  """
  vector = np.zeros(len(jointWords))
  wordSet = set(words)
  for i, jointWord in enumerate(jointWords):
    if jointWord in wordSet:
      vector[i] = index[jointWord]
    else:
      wordSimilar, similarity = identifyBestSimilarWordFromWordSet(jointWord, wordSet)
      if similarity > 0.4:
        vector[i] = index[wordSimilar]
      else:
        vector[i] = 0
  return vector
def semanticSimilaritySentences(sentenceA, sentenceB, infoContentNorm):
  """
    Compute sentence similarity using semantic similarity
    The semantic similarity is based on maximum word similarity between one word and another sentence

    Args:
      sentenceA: str, first sentence used to compute sentence similarity
      sentenceB: str, second sentence used to compute sentence similarity
      infoContentNorm: bool, True if statistics corpus is used to weight similarity vectors

    Returns:
      semSimilarity: float, [0, 1], the computed similarity for given two sentences
  """
  wordsA = tokenizer(sentenceA)
  wordsB = tokenizer(sentenceB)
  wordSet = set(wordsA).union(set(wordsB))
  wordVectorA = constructSemanticVector(wordsA, wordSet, infoContentNorm)
  wordVectorB = constructSemanticVector(wordsB, wordSet, infoContentNorm)
  semSimilarity = np.dot(wordVectorA, wordVectorB)/(np.linalg.norm(wordVectorA)*np.linalg.norm(wordVectorB))
  return semSimilarity
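# Example usage (illustrative sketch, not from the source):
#   score = semanticSimilaritySentences('The pump failed.', 'The pump broke.', False)
#   # cosine similarity of the two semantic vectors, a float in [0, 1]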
def constructSemanticVector(words, jointWords, infoContentNorm):
  """
    Construct semantic vector

    Args:
      words: list, tokenized words of one sentence
      jointWords: set/list, the joint set of words from both sentences
      infoContentNorm: bool, consider word statistics in the Brown Corpus if True

    Returns:
      vector: numpy.array, the semantic vector
  """
  wordSet = set(words)
  vector = np.zeros(len(jointWords))
  if infoContentNorm:
    wordCount, brownDict = brownInfo()
  for i, jointWord in enumerate(jointWords):
    if jointWord in wordSet:
      vector[i] = 1
      if infoContentNorm:
        vector[i] = vector[i]*math.pow(content(jointWord, wordCount, brownDict), 2)
    else:
      similarWord, similarity = identifyBestSimilarWordFromWordSet(jointWord, wordSet)
      if similarity > 0.2:
        vector[i] = similarity
      else:
        vector[i] = 0.0
      if infoContentNorm:
        vector[i] = vector[i]*content(jointWord, wordCount, brownDict)*content(similarWord, wordCount, brownDict)
  return vector
def brownInfo():
  """
    Compute the word dictionary and total word count of the NLTK Brown Corpus

    Args:
      None

    Returns:
      wordCount: int, the total number of words in the Brown Corpus
      brownDict: dict, the Brown word dictionary, {word: count}
  """
  brownDict = {}
  wordCount = 0
  for sent in brown.sents():
    for word in sent:
      key = word.lower()
      if key not in brownDict:
        brownDict[key] = 0
      brownDict[key] += 1
      wordCount += 1
  return wordCount, brownDict
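# Example usage (illustrative sketch, not from the source; the Brown Corpus must be
# available via nltk.download('brown')):
#   wordCount, brownDict = brownInfo()
#   brownDict.get('the', 0)  # number of case-folded occurrences of 'the' in the corpus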
def content(wordData, wordCount=0, brownDict=None):
  """
    Employ statistics from the Brown Corpus to compute the information content of a given word
    ref: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1644735
    information content I(w) = 1 - log(n+1)/log(N+1)
    The significance of a word is weighted using its information content. The assumption here is
    that words that occur with higher frequency in the corpus contain less information than those
    that occur with lower frequency.

    Args:
      wordData: str, a given word
      wordCount: int, the total number of words in the Brown Corpus
      brownDict: dict, the Brown word dictionary, {word: count}

    Returns:
      content: float, [0, 1], the information content of the word in the corpus
  """
  if wordCount == 0:
    wordCount, brownDict = brownInfo()
  wordData = wordData.lower()
  if wordData not in brownDict:
    n = 0
  else:
    n = brownDict[wordData]
  informationContent = math.log(n+1)/math.log(wordCount+1)
  return 1.0 - informationContent
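# Example (derived from the formula above): a word absent from the corpus has n = 0, so
# I(w) = 1 - log(1)/log(N+1) = 1.0, the maximum information content; frequent words score low.
#   wordCount, brownDict = brownInfo()
#   content('the', wordCount, brownDict)   # small value: frequent words carry little information
#   content('zzzz', wordCount, brownDict)  # 1.0 if the token does not occur in the corpus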
def identifyBestSimilarWordFromWordSet(wordA, wordSet):
  """
    Identify the most similar word in a word set for a given word

    Args:
      wordA: str, the word for which the most similar word in the word set is sought
      wordSet: set/list, a pool of words

    Returns:
      word: str, the most similar word in the word set for the given word
      similarity: float, [0, 1], similarity score between the best pair of words
  """
  similarity = 0.0
  word = ""
  for wordB in wordSet:
    temp = semanticSimilarityWords(wordA, wordB)
    if temp > similarity:
      similarity = temp
      word = wordB
  return word, similarity
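# Example usage (illustrative sketch, not from the source):
#   word, score = identifyBestSimilarWordFromWordSet('automobile', {'car', 'banana', 'river'})
#   # expected to pick 'car', since 'automobile' and 'car' share a WordNet synset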
def semanticSimilarityWords(wordA, wordB):
  """
    Compute the similarity between two words using semantic analysis
    First identify the most similar synset pair using WordNet similarity, then compute the
    similarity using both path length and depth information in WordNet

    Args:
      wordA: str, the first word
      wordB: str, the second word

    Returns:
      similarity: float, [0, 1], the similarity score
  """
  if wordA.lower() == wordB.lower():
    return 1.0
  bestPair = identifyBestSimilarSynsetPair(wordA, wordB)
  if bestPair[0] is None or bestPair[1] is None:
    return 0.0
  # disambiguation is False since only two words are provided and there is no additional context
  similarity = semanticSimilaritySynsets(bestPair[0], bestPair[1], disambiguation=False)
  return similarity
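# Example usage (illustrative sketch, not from the source):
#   semanticSimilarityWords('car', 'Car')    # 1.0, exact match is case-insensitive
#   semanticSimilarityWords('qqqq', 'car')   # 0.0 when no WordNet synsets exist for a word
#   semanticSimilarityWords('car', 'truck')  # a value in (0, 1) based on WordNet path and depth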
def identifyBestSimilarSynsetPair(wordA, wordB):
  """
    Identify the best synset pair for given two words using WordNet similarity analysis

    Args:
      wordA: str, the first word
      wordB: str, the second word

    Returns:
      bestPair: tuple, (first synset, second synset), identified best synset pair using WordNet similarity
  """
  synsetsWordA = wn.synsets(wordA)
  synsetsWordB = wn.synsets(wordB)
  if len(synsetsWordA) == 0 or len(synsetsWordB) == 0:
    return None, None
  similarity = -1.0
  bestPair = None, None
  for synsetWordA in synsetsWordA:
    for synsetWordB in synsetsWordB:
      # TODO: may change to a general similarity method
      temp = wn.path_similarity(synsetWordA, synsetWordB)
      # path_similarity may return None (e.g., for some cross-POS pairs); skip those
      if temp is not None and temp > similarity:
        similarity = temp
        bestPair = synsetWordA, synsetWordB
  return bestPair
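# Example usage (illustrative sketch, not from the source):
#   synA, synB = identifyBestSimilarSynsetPair('dog', 'cat')
#   # the pair of synsets with the highest path similarity, or (None, None)
#   # if either word has no WordNet synsets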
#################################################
"""
  Methods proposed by: https://arxiv.org/pdf/1802.05667.pdf
  Codes are modified from https://github.com/nihitsaxena95/sentence-similarity-wordnet-sementic/blob/master/SentenceSimilarity.py
"""
def identifyNounAndVerbForComparison(sentence):
  """
    Extract the nouns and verbs of a sentence for word-based comparison

    Args:
      sentence: str, sentence string

    Returns:
      pos: list, list of (token, posTag) tuples
  """
  tokens = nltk.word_tokenize(sentence)
  pos = nltk.pos_tag(tokens)
  pos = [p for p in pos if p[1].startswith('N') or p[1].startswith('V')]
  return pos
def sentenceSenseDisambiguation(sentence, method='simple_lesk'):
  """
    Remove ambiguity by using the sentence as context

    Args:
      sentence: str, sentence string
      method: str, the method for disambiguation; only the 'simple_lesk' method is supported

    Returns:
      sense: set, set of wordnet.Synset for the estimated best senses
  """
  pos = identifyNounAndVerbForComparison(sentence)
  sense = []
  for p in pos:
    if method.lower() == 'simple_lesk':
      sense.append(simple_lesk(sentence, p[0], pos=p[1][0].lower()))
    else:
      raise NotImplementedError(f"Method {method} not implemented yet!")
  return set(sense)
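# Example usage (illustrative sketch, not from the source; requires pywsd):
#   senses = sentenceSenseDisambiguation('The bank can guarantee deposits will cover future tuition costs.')
#   # a set of wn.Synset objects, one per noun/verb; 'bank' should resolve to its financial sense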
###########################################################################
"""
  Extended methods
"""
def wordsSimilarity(wordA, wordB, method='semantic_similarity_synsets'):
  """
    General method to compute word similarity

    Args:
      wordA: str, the first word
      wordB: str, the second word
      method: str, the method used to compute word similarity

    Returns:
      similarity: float, [0, 1], the similarity score
  """
  method = method.lower()
  wordnetSimMethod = ["path_similarity", "wup_similarity", "lch_similarity", "res_similarity", "jcn_similarity", "lin_similarity"]
  semanticSimMethod = ['semantic_similarity_synsets']
  if method not in semanticSimMethod and not method.endswith('_similarity'):
    method = method + '_similarity'
  if method not in semanticSimMethod + wordnetSimMethod:
    raise ValueError(f'{method} is not valid, please use one of {wordnetSimMethod+semanticSimMethod}')
  bestPair = identifyBestSimilarSynsetPair(wordA, wordB)
  if bestPair[0] is None or bestPair[1] is None:
    return 0.0
  # when comparing words only, we assume no disambiguation is required
  similarity = synsetsSimilarity(bestPair[0], bestPair[1], method=method, disambiguation=False)
  return similarity
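# Example usage (illustrative sketch, not from the source):
#   wordsSimilarity('dog', 'cat', method='wup')   # short names expand to 'wup_similarity'
#   wordsSimilarity('dog', 'cat')                 # default 'semantic_similarity_synsets' measure
#   wordsSimilarity('dog', 'cat', method='foo')   # raises ValueError: not a supported method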
# TODO: Utilize spaCy word-vector similarity to improve the similarity score when the POS of the
# given synsets are not the same, i.e., synsetA.name().split(".")[1] != synsetB.name().split(".")[1],
# in which case the similarity will be set to 0.0.
# Similarity calculated by word1.similarity(word2) (spaCy word vectors):
#   calibration  calibrate  0.715878
#   failure      fail       0.6802347
#   replacement  replace    0.73397416
#   leak         leakage    0.652573
# Similarity calculated by WordNet:
#   calibration  calibrate  0.09090909090909091
#   failure      fail       0.125
#   replacement  replace    0.1111111111111111
#   leak         leakage    1.0
def wordSenseDisambiguation(word, sentence, senseMethod='simple_lesk', simMethod='path'):
  """
    Remove ambiguity by using the given sentence as context

    Args:
      word: str/list/set, a given word or a collection of words
      sentence: str, sentence that will be used to disambiguate the given word
      senseMethod: str, method for disambiguation, one of ['simple_lesk', 'original_lesk', 'cosine_lesk', 'adapted_lesk', 'max_similarity']
      simMethod: str, method for similarity analysis when 'max_similarity' is used, one of ['path', 'wup', 'lch', 'res', 'jcn', 'lin']

    Returns:
      sense: wn.Synset or list/set of wn.Synset, the identified best sense(s) for the given word(s),
        disambiguated using the given sentence; the container type mirrors the input type
  """
  method = senseMethod.lower()
  simMethod = simMethod.lower()
  validMethod = ['simple_lesk', 'original_lesk', 'cosine_lesk', 'adapted_lesk', 'max_similarity']
  validSimMethod = ['path', 'wup', 'lch', 'res', 'jcn', 'lin']
  if method not in validMethod:
    raise ValueError(f'{method} is not a valid option, please use one of {validMethod}')
  if simMethod not in validSimMethod:
    raise ValueError(f'{simMethod} is not a valid option, please use one of {validSimMethod}')
  if isinstance(word, str):
    tokens = nltk.word_tokenize(word)
  elif isinstance(word, (list, set)):
    tokens = [nltk.word_tokenize(w)[0] for w in word]
  else:
    raise ValueError(f'{word} format is not valid, please input a string or a set/list of strings')
  pos = nltk.pos_tag(tokens)
  sense = []
  for p in pos:
    if method == 'simple_lesk':
      sense.append(simple_lesk(sentence, p[0], pos=p[1][0].lower()))
    elif method == 'original_lesk':
      sense.append(original_lesk(sentence, p[0]))
    elif method == 'adapted_lesk':
      sense.append(adapted_lesk(sentence, p[0], pos=p[1][0].lower()))
    elif method == 'cosine_lesk':
      sense.append(cosine_lesk(sentence, p[0], pos=p[1][0].lower()))
    elif method == 'max_similarity':
      sense.append(maxsim(sentence, p[0], pos=p[1][0].lower(), option=simMethod))
    else:
      raise NotImplementedError(f"Method {method} not implemented yet!")
  if isinstance(word, str):
    return sense[0]
  elif isinstance(word, list):
    return sense
  else:
    return set(sense)
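# Example usage (illustrative sketch, not from the source; the returned container mirrors the input):
#   sent = 'The operators replaced the worn pump seal.'
#   wordSenseDisambiguation('seal', sent)            # a single wn.Synset (or None)
#   wordSenseDisambiguation(['pump', 'seal'], sent)  # a list of senses, one per word
#   wordSenseDisambiguation('seal', sent, senseMethod='max_similarity', simMethod='wup')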
# use disambiguate function to disambiguate sentences
def sentenceSenseDisambiguationPyWSD(sentence, senseMethod='simple_lesk', simMethod='path'):
  """
    Wrapper for the sentence sense disambiguation method from pywsd
    https://github.com/alvations/pywsd

    Args:
      sentence: str, given sentence
      senseMethod: str, method for disambiguation, one of ['simple_lesk', 'original_lesk', 'cosine_lesk', 'adapted_lesk', 'max_similarity']
      simMethod: str, method for similarity analysis when 'max_similarity' is used, one of ['path', 'wup', 'lch', 'res', 'jcn', 'lin']

    Returns:
      wordList: list, list of words from the sentence that have an identified synset in WordNet
      synsetList: list, list of corresponding synsets for wordList
  """
  method = senseMethod.lower()
  simMethod = simMethod.lower()
  validMethod = ['simple_lesk', 'original_lesk', 'cosine_lesk', 'adapted_lesk', 'max_similarity']
  validSimMethod = ['path', 'wup', 'lch', 'res', 'jcn', 'lin']
  if method not in validMethod:
    raise ValueError(f'{method} is not a valid option, please use one of {validMethod}')
  if simMethod not in validSimMethod:
    raise ValueError(f'{simMethod} is not a valid option, please use one of {validSimMethod}')
  if method == 'simple_lesk':
    sentSense = disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True)
  elif method == 'original_lesk':
    sentSense = disambiguate(sentence, original_lesk, prefersNone=True, keepLemmas=True)
  elif method == 'adapted_lesk':
    sentSense = disambiguate(sentence, adapted_lesk, prefersNone=True, keepLemmas=True)
  elif method == 'cosine_lesk':
    sentSense = disambiguate(sentence, cosine_lesk, prefersNone=True, keepLemmas=True)
  elif method == 'max_similarity':
    sentSense = disambiguate(sentence, maxsim, similarity_option=simMethod, prefersNone=True, keepLemmas=True)
  # sentSense: a list of tuples, [(word, lemma, wn.synset/None)]
  wordList = [syn[0] for syn in sentSense if syn[-1] is not None]
  synsetList = [syn[-1] for syn in sentSense if syn[-1] is not None]
  return wordList, synsetList
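# Example usage (illustrative sketch, not from the source):
#   words, synsets = sentenceSenseDisambiguationPyWSD('The pump failed due to a worn bearing.')
#   # words and synsets are aligned lists; tokens with no identified synset are dropped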
# Sentence similarity after disambiguation
def sentenceSimilarityWithDisambiguation(sentenceA, sentenceB, senseMethod='simple_lesk', simMethod='semantic_similarity_synsets', disambiguationSimMethod='path', delta=0.85):
  """
    Compute the semantic similarity of two sentences after performing disambiguation

    Args:
      sentenceA: str, first sentence
      sentenceB: str, second sentence
      senseMethod: str, method for disambiguation, one of ['simple_lesk', 'original_lesk', 'cosine_lesk', 'adapted_lesk', 'max_similarity']
      simMethod: str, method for similarity analysis in the construction of semantic vectors, one of ['semantic_similarity_synsets', 'path', 'wup', 'lch', 'res', 'jcn', 'lin']
      disambiguationSimMethod: str, method for similarity analysis when 'max_similarity' is used, one of ['path', 'wup', 'lch', 'res', 'jcn', 'lin']
      delta: float, [0,1], similarity contribution from semantic similarity, 1-delta is the similarity contribution from word order similarity

    Returns:
      similarity: float, [0, 1], the computed similarity for given two sentences
  """
  simMethod = simMethod.lower()
  disambiguationSimMethod = disambiguationSimMethod.lower()
  if disambiguationSimMethod not in ['path', 'wup', 'lch', 'res', 'jcn', 'lin']:
    raise ValueError(f'Option "disambiguationSimMethod={disambiguationSimMethod}" is not valid, please use one of "path, wup, lch, res, jcn, lin"')
  _, synsetsA = sentenceSenseDisambiguationPyWSD(sentenceA, senseMethod=senseMethod, simMethod=disambiguationSimMethod)
  _, synsetsB = sentenceSenseDisambiguationPyWSD(sentenceB, senseMethod=senseMethod, simMethod=disambiguationSimMethod)
  similarity = delta * semanticSimilarityUsingDisambiguatedSynsets(synsetsA, synsetsB, simMethod=simMethod) + (1.0-delta) * wordOrderSimilaritySentences(sentenceA, sentenceB)
  return similarity
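# Example usage (illustrative sketch, not from the source):
#   score = sentenceSimilarityWithDisambiguation('The pump was replaced during the outage.',
#                                                'The pump was repaired during the shutdown.')
#   # float in [0, 1]; delta weights the disambiguated semantic part against word order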
def convertSentsToSynsetsWithDisambiguation(sentList):
  """
    Use the sentence itself to identify the best synsets

    Args:
      sentList: list, list of sentences

    Returns:
      sentSynsets: list, list of synsets for the corresponding sentences
  """
  sentSynsets = []
  for sent in sentList:
    _, bestSyn = sentenceSenseDisambiguationPyWSD(sent, senseMethod='simple_lesk', simMethod='path')
    bestSyn = [wn.synset(syn.name()) for syn in bestSyn if syn is not None]
    sentSynsets.append(bestSyn)
  return sentSynsets
def convertToSynsets(wordSet):
  """
    Convert a list/set of words into a list of synsets

    Args:
      wordSet: list/set of words

    Returns:
      wordList: list, list of words without duplications
      synsets: list, list of synsets corresponding to wordList
  """
  # keep the order (works for python3.7+)
  wordList = list(dict.fromkeys(wordSet))
  synsets = [list(wn.synsets(word)) for word in wordList]
  return wordList, synsets
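# Example usage (illustrative sketch, not from the source):
#   wordList, synsets = convertToSynsets(['pump', 'valve', 'pump'])
#   # wordList == ['pump', 'valve'] (duplicates removed, order preserved);
#   # synsets[0] holds all WordNet synsets of 'pump'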
def identifyBestSynset(word, jointWordList, jointSynsetList):
  """
    Identify the best synset for a given word using the provided additional information (i.e., jointWordList)

    Args:
      word: str, a single word
      jointWordList: list, a list of words without duplications
      jointSynsetList: list, a list of synsets corresponding to jointWordList

    Returns:
      bestSyn: wn.Synset, identified synset for the given word
  """
  wordList = copy.copy(jointWordList)
  synsets = copy.copy(jointSynsetList)
  if word in wordList:
    # remove the word itself and its synsets by index, so an identical synset list
    # belonging to a different word is not removed by mistake
    i = wordList.index(word)
    del wordList[i]
    del synsets[i]
  # flatten the remaining synset lists into a single candidate pool
  synsets = [item for sub in synsets for item in sub]
  wordSyns = wn.synsets(word)
  if len(wordSyns) == 0:
    return None
  maxSimilarity = -1
  bestSyn = wordSyns[0]
  for syn in wordSyns:
    # path_similarity may return None (e.g., for some cross-POS pairs); skip those entries
    similarity = [wn.path_similarity(syn, syn1) for syn1 in synsets]
    similarity = [s for s in similarity if s is not None]
    if len(similarity) == 0:
      continue
    temp = np.max(similarity)
    if temp > maxSimilarity:
      bestSyn = syn
      maxSimilarity = temp
  return bestSyn
def convertSentsToSynsets(sentList, info=None):
  """
    Use the sentence itself to identify the best synsets

    Args:
      sentList: list, list of sentences
      info: list, additional list of words that will be used to determine the synsets

    Returns:
      sentSynsets: list, list of synsets corresponding to the provided sentList
  """
  sentSynsets = []
  if info is not None:
    infoWordList, infoSynsetsList = convertToSynsets(info)
  for sent in sentList:
    wordList, synsetsList = convertToSynsets([e.strip() for e in sent.split()])
    if info is not None:
      wordList = wordList + infoWordList
      synsetsList = synsetsList + infoSynsetsList
    bestSyn = [identifyBestSynset(word, wordList, synsetsList) for word in wordList]
    bestSyn = list(filter(None, bestSyn))
    sentSynsets.append(bestSyn)
  return sentSynsets
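# Example usage (illustrative sketch, not from the source):
#   sentSynsets = convertSentsToSynsets(['pump failed', 'valve leaked'], info=['equipment'])
#   # one list of best synsets per sentence; note that the 'info' words are appended to each
#   # sentence's word list, so they both guide disambiguation and contribute synsets to the result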