{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "af2a9522-ded9-4b7b-9c75-1935b12578fe",
   "metadata": {},
   "source": [
    "## Similarity analysis\n",
    "\n",
    "This notebook converts sentences into lists of synsets with `dackar.similarity.simUtils`, scores a sentence pair with `synsetUtils.synsetListSimilarity` (with and without disambiguation), and times each step."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b460f1e1-f63d-4199-aa75-513c97e40e1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/wangc/miniconda3/envs/dackar_libs/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n",
      "Warming up PyWSD (takes ~10 secs)... took 4.8340160846710205 secs.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "import time\n",
    "\n",
    "# Put the `src` directory that sits next to the working directory's parent on\n",
    "# sys.path before importing dackar (presumably so a source checkout is importable).\n",
    "cwd = os.getcwd()\n",
    "frameworkDir = os.path.abspath(os.path.join(cwd, os.pardir, 'src'))\n",
    "sys.path.append(frameworkDir)\n",
    "\n",
    "from dackar.similarity import synsetUtils as SU\n",
    "from dackar.similarity import simUtils"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f7b2b20-b751-4d06-8eb6-8b43aacde955",
   "metadata": {},
   "source": [
    "### Convert sentences into synset lists, then compute similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1a168728-3b3b-44d2-88ac-a36618191b90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sentence pair reused by every cell below; both sentences contain the word 'plant'.\n",
    "sents = ['The workers at the industrial plant were overworked',\n",
    "'The plant was no longer bearing flowers']\n",
    "\n",
    "sentSynsets = simUtils.convertSentsToSynsets(sents)\n",
    "similarity = SU.synsetListSimilarity(sentSynsets[0], sentSynsets[1], delta=.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8b828799-15d6-4a1f-9f79-bd00de8328d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.43946127500409304"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "similarity"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8a81268-29ea-4ef4-9a36-53088374d54d",
   "metadata": {},
   "source": [
    "### Using the disambiguation method to create synsets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b349155c-6e69-42c5-bc84-a128ea7d8cb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same sentence pair, but the synsets come from the disambiguating converter.\n",
    "# Results get their own names so the values computed above are not overwritten.\n",
    "disambiguatedSynsets = simUtils.convertSentsToSynsetsWithDisambiguation(sents)\n",
    "disambiguatedSimilarity = SU.synsetListSimilarity(disambiguatedSynsets[0], disambiguatedSynsets[1], delta=.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f4f0367b-c473-42e0-8733-c87b5be4ac94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.31713942870949496"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "disambiguatedSimilarity"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d868f88-fed9-400d-a2d0-9cf77a811c35",
   "metadata": {},
   "source": [
    "### Timing for performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1c8b368e-e76b-4f4b-94c7-b1c72a490492",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12.465039014816284 second\n"
     ]
    }
   ],
   "source": [
    "# Time 100 conversions of the sentence pair into synsets.\n",
    "# perf_counter is the clock intended for measuring elapsed intervals.\n",
    "st = time.perf_counter()\n",
    "for i in range(100):\n",
    "    sentSynsets = simUtils.convertSentsToSynsets(sents)\n",
    "print('%s second'% (time.perf_counter()-st))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "71a8acbf-d911-48a6-a971-5e9185cad99e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.087954759597778 second\n"
     ]
    }
   ],
   "source": [
    "# Time 1000 similarity evaluations on the synsets produced by the previous cell.\n",
    "st = time.perf_counter()\n",
    "for i in range(1000):\n",
    "    similarity = SU.synsetListSimilarity(sentSynsets[0], sentSynsets[1], delta=.8)\n",
    "print('%s second'% (time.perf_counter()-st))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "523fef2d-9490-4595-a9d9-44d7fd9b0a4a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.198867082595825 second\n"
     ]
    }
   ],
   "source": [
    "# Time 1000 passes of sentenceSenseDisambiguationPyWSD over each sentence.\n",
    "# Results are collected in their own list so `sentSynsets` (used by the\n",
    "# similarity timing cell) is left untouched.\n",
    "st = time.perf_counter()\n",
    "pywsdSynsets = []\n",
    "for i in range(1000):\n",
    "    for sent in sents:\n",
    "        # Only the second returned item is kept; the first is discarded.\n",
    "        _unused, synsetsA = simUtils.sentenceSenseDisambiguationPyWSD(sent, senseMethod='simple_lesk', simMethod='path')\n",
    "        pywsdSynsets.append(synsetsA)\n",
    "print('%s second'% (time.perf_counter()-st))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}