Similarity analysis
[1]:
import os
import sys
# Build the absolute path of the `src` directory located next to the parent of
# the current working directory and append it to the module search path.
# This must happen before the `dackar` imports below
# (presumably `src` is where the dackar package lives -- confirm for your checkout).
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
sys.path.append(frameworkDir)
import time
# SU: synset-level similarity helpers; simUtils: sentence-level helpers used in the cells below.
from dackar.similarity import synsetUtils as SU
from dackar.similarity import simUtils
/Users/wangc/miniconda3/envs/dackar_libs/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
Warming up PyWSD (takes ~10 secs)... took 4.8340160846710205 secs.
[2]:
### Convert the sentences into synset lists, then compute their similarity
[3]:
# Two example sentences that both contain the word "plant".
sents = [
    'The workers at the industrial plant were overworked',
    'The plant was no longer bearing flowers',
]
# Convert the sentences, then score the first result against the second.
sentSynsets = simUtils.convertSentsToSynsets(sents)
firstSynsets, secondSynsets = sentSynsets[0], sentSynsets[1]
similarity = SU.synsetListSimilarity(firstSynsets, secondSynsets, delta=.8)
[4]:
# Display the similarity score computed for the two sentences.
similarity
[4]:
0.43946127500409304
### Use the disambiguation method to create the synsets
[5]:
from dackar.similarity import synsetUtils as SU
from dackar.similarity import simUtils

# Same sentence pair as before, this time converted with the disambiguation variant.
sents = [
    'The workers at the industrial plant were overworked',
    'The plant was no longer bearing flowers',
]
sentSynsets = simUtils.convertSentsToSynsetsWithDisambiguation(sents)
firstSynsets, secondSynsets = sentSynsets[0], sentSynsets[1]
similarity = SU.synsetListSimilarity(firstSynsets, secondSynsets, delta=.8)
[6]:
# Display the similarity score obtained with the disambiguation-based conversion.
similarity
[6]:
0.31713942870949496
Performance timing
[7]:
sents = ['The workers at the industrial plant were overworked',
         'The plant was no longer bearing flowers']
# Time 100 repeated conversions of the sentence pair.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and can jump when the
# system clock is adjusted, which would corrupt the measurement.
st = time.perf_counter()
for i in range(100):
    sentSynsets = simUtils.convertSentsToSynsets(sents)
print('%s second'% (time.perf_counter()-st))
12.465039014816284 second
[8]:
# Time 1000 similarity evaluations on the synsets left in `sentSynsets` by the
# preceding conversion cell.
# perf_counter() is used because it is monotonic and high-resolution, unlike
# the adjustable wall clock returned by time.time().
st = time.perf_counter()
for i in range(1000):
    similarity = SU.synsetListSimilarity(sentSynsets[0], sentSynsets[1], delta=.8)
print('%s second'% (time.perf_counter()-st))
4.087954759597778 second
[9]:
from dackar.similarity import simUtils
[10]:
# Time 1000 passes of the PyWSD-based sense disambiguation over every sentence
# in `sents` (defined in an earlier cell).
# perf_counter() is used because it is monotonic and high-resolution, unlike
# the adjustable wall clock returned by time.time().
st = time.perf_counter()
# NOTE: this rebinds `sentSynsets` to a flat list holding one entry per call.
sentSynsets = []
for i in range(1000):
    for sent in sents:
        # Only the second returned value is kept; the first is discarded.
        _, synsetsA = simUtils.sentenceSenseDisambiguationPyWSD(sent, senseMethod='simple_lesk', simMethod='path')
        sentSynsets.append(synsetsA)
print('%s second'% (time.perf_counter()-st))
2.198867082595825 second