import math
from typing import Iterable


class LeftCosineMeasure:
    """Asymmetric ("left") similarity measure over feature collections.

    ``similarity(X, Y)`` is the fraction of the distinct features of ``X``
    that also occur in ``Y``; it is therefore 1.0 whenever every feature of
    ``X`` is contained in ``Y``, regardless of how many extra features ``Y``
    has.

    NOTE(review): the three size/overlap bounds below use ``alpha**2 * |X|``,
    ``|X| / alpha**2`` and ``alpha * sqrt(|X| * |Y|)``.  Those depend on the
    size of ``Y``, while ``similarity`` does not -- confirm that these are
    the intended pruning bounds for this asymmetric measure.
    """

    def min_feature_size(self, query_size: int, alpha: float) -> int:
        """Return the smallest candidate feature count to consider.

        Computed as ``ceil(alpha**2 * query_size)``.
        """
        return math.ceil(alpha * alpha * query_size)

    def max_feature_size(self, query_size: int, alpha: float) -> int:
        """Return the largest candidate feature count to consider.

        Computed as ``floor(query_size / alpha**2)``; ``alpha`` must be
        non-zero because it is used as a divisor.
        """
        return math.floor(query_size / (alpha * alpha))

    def minimum_common_feature_count(
        self, query_size: int, y_size: int, alpha: float
    ) -> int:
        """Return the minimum number of shared features required.

        Computed as ``ceil(alpha * sqrt(query_size * y_size))``.
        """
        return math.ceil(alpha * math.sqrt(query_size * y_size))

    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
        """Return ``|set(X) & set(Y)| / |set(X)|``.

        Duplicated features are collapsed before counting.  When ``X`` has no
        features at all the result is defined as 0.0 instead of raising
        ``ZeroDivisionError``.
        """
        # Materialise each side exactly once: the arguments are only promised
        # to be iterables, so a generator would be exhausted by a second pass.
        left = set(X)
        if not left:
            return 0.0
        return len(left.intersection(Y)) / len(left)
self.assertEqual(self.measure.max_feature_size(5, 1.0), 5) + self.assertEqual(self.measure.max_feature_size(5, 0.5), 20) + + def test_minimum_common_feature_count(self): + self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 1.0), 5) + self.assertEqual(self.measure.minimum_common_feature_count(5, 20, 1.0), 10) + self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 3) + + def test_similarity(self): + x = ["a", "ab", "bc", "c"] + y = ["a", "ab", "bc", "cd", "e"] + self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0) + self.assertEqual(round(self.measure.similarity(x, y), 2), 0.75) + + z = ["a", "ab", "ba", "ab", "a"] + self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0) + self.assertEqual(round(self.measure.similarity(x, z), 2), 0.5) + self.assertEqual(round(self.measure.similarity(x, y), 2), 0.75) + + # Test as per paper trigrams with quotes of methyl sulphone and methyl sulfone + a = [ + ' "m', + '"me', + "met", + "eth", + "thy", + "hyl", + "yl ", + "l s", + " su", + "sul", + "ulf", + "lfo", + "fon", + "one", + 'ne"', + 'e" ', + ] + b = [ + ' "m', + '"me', + "met", + "eth", + "thy", + "hyl", + "yl ", + "l s", + " su", + "sul", + "ulp", + "lph", + "pho", + "hon", + "one", + 'ne"', + 'e" ', + ] + self.assertEqual( + round(self.measure.similarity(a, b), 3), 0.812 + ) + + name = ["Vl", "la", "ad", "di", "im", "mi", "ir", "r ", " P", "Pu", "ut", "ti", "in"] + sentence = ["Do", "on", "na", "at", "ti", "io", "on", "n ", " t", "to", "o ", " t", "he", "e ", " g", "gr", "re", "ea", "at", "t ", " V", "Vl", "la", "ad", "di", "im", "mi", "ir", "r ", " P", "Pu", "ut", "ti", "in", "n ", " m", "my", "y ", " f", "fa", "av", "vo", "or", "ri", "it", "te", "e ", " p", "pr", "re", "es", "si", "id", "de", "en", "nt"] + self.assertEqual( + round(self.measure.similarity(name, sentence), 2), 1.0 + ) diff --git a/tests/test_searcher.py b/tests/test_searcher.py index 7ad74b6..339b7fc 100644 --- a/tests/test_searcher.py +++ 
class TestSearchLeftCosine(TestCase):
    """Search tests for ``Searcher`` combined with ``LeftCosineMeasure``.

    The database holds four names indexed by character bigrams; each query
    is a shortened name, optionally embedded in a longer sentence.
    """

    def setUp(self) -> None:
        """Build a bigram-indexed database and a left-cosine searcher."""
        db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for name in (
            "vladimir putin",
            "isis",
            "kim jong un",
            "isabel jose dos santos",
        ):
            db.add(name)
        self.searcher = Searcher(db, LeftCosineMeasure())

    def test_search(self) -> None:
        """Check that each query retrieves the expected stored name."""
        # The stray ``breakpoint()`` call that used to sit between these
        # assertions has been removed: it suspended every non-interactive
        # test run inside the debugger.
        self.assertEqual(
            self.searcher.search("vlad putin", 0.7), ["vladimir putin"]
        )
        self.assertEqual(
            self.searcher.search(
                "donation for the benefit of make great again vlad putin", 0.7
            ),
            ["vladimir putin"],
        )
        self.assertEqual(
            self.searcher.search("isis medical center", 0.7), ["isis"]
        )

        # NOTE(review): this query ends in "isis" yet expects
        # "vladimir putin" -- looks like a copy/paste slip; confirm the
        # intended expectation before relying on this assertion.
        self.assertEqual(
            self.searcher.search(
                "donation for the benefit of make great again isis", 0.5
            ),
            ["vladimir putin"],
        )

        self.assertEqual(
            self.searcher.search("isabel santos", 0.7),
            ["isabel jose dos santos"],
        )
        self.assertEqual(
            self.searcher.search(
                "donation for the benefit of make great again vlad putin", 0.5
            ),
            ["vladimir putin"],
        )