# Similarity analysis

[1]:
import os
import sys
import time

# Make the project's `src` directory (a sibling of this notebook's directory)
# importable before pulling in the dackar modules.
cwd = os.getcwd()
frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))
if frameworkDir not in sys.path:  # guard so re-running the cell doesn't duplicate the entry
    sys.path.append(frameworkDir)

from dackar.similarity import synsetUtils as SU
from dackar.similarity import simUtils
/Users/wangc/miniconda3/envs/dackar_libs/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Warming up PyWSD (takes ~10 secs)... took 4.8340160846710205 secs.
[2]:
### Convert sentences into synset lists, then compute their similarity
[3]:
# Two sentences that share the ambiguous word "plant" (factory vs. flora).
sents = [
    'The workers at the industrial plant were overworked',
    'The plant was no longer bearing flowers',
]

# One synset list per sentence; similarity is scored between the two lists.
sentSynsets = simUtils.convertSentsToSynsets(sents)
similarity = SU.synsetListSimilarity(*sentSynsets[:2], delta=0.8)
[4]:
# Last expression of the cell — rich display of the computed similarity score.
similarity
[4]:
0.43946127500409304

### Using disambiguation method to create synsets

[5]:
# NOTE: simUtils and SU were already imported in the first cell, so the
# duplicate imports that used to live here have been removed.
sents = ['The workers at the industrial plant were overworked',
         'The plant was no longer bearing flowers']

# Resolve each word's sense first (e.g. industrial "plant" vs. botanical "plant"),
# then build the synset lists and score the sentence similarity.
sentSynsets = simUtils.convertSentsToSynsetsWithDisambiguation(sents)
similarity = SU.synsetListSimilarity(sentSynsets[0], sentSynsets[1], delta=.8)
[6]:
# Display the score from the disambiguation-based synsets (lower than the
# simple conversion above — see the two outputs).
similarity
[6]:
0.31713942870949496

### Timing for performance

[7]:
sents = ['The workers at the industrial plant were overworked',
         'The plant was no longer bearing flowers']

# time.perf_counter() is the recommended monotonic clock for interval timing;
# time.time() can jump backwards/forwards if the system clock is adjusted.
st = time.perf_counter()
for i in range(100):
    sentSynsets = simUtils.convertSentsToSynsets(sents)
print('%s second'% (time.perf_counter()-st))
12.465039014816284 second
[8]:
# Time just the similarity scoring (synsets already built in the previous cell).
# perf_counter() is monotonic and meant for measuring elapsed intervals.
st = time.perf_counter()
for i in range(1000):
    similarity = SU.synsetListSimilarity(sentSynsets[0], sentSynsets[1], delta=.8)
print('%s second'% (time.perf_counter()-st))
4.087954759597778 second
[9]:
# NOTE(review): simUtils is already imported in the first cell; this re-import
# is a no-op and could be removed.
from dackar.similarity import simUtils
[10]:
# Time the PyWSD disambiguation path directly, sentence by sentence.
# perf_counter() is the monotonic clock intended for interval measurement.
st = time.perf_counter()
sentSynsets = []
for i in range(1000):
    # Disambiguate each sentence with simple_lesk sense selection and
    # path-based synset similarity; keep only the synset list.
    for sent in sents:
        _, synsetsA = simUtils.sentenceSenseDisambiguationPyWSD(sent, senseMethod='simple_lesk', simMethod='path')
        sentSynsets.append(synsetsA)
print('%s second'% (time.perf_counter()-st))
2.198867082595825 second