Skip to content

Commit b21bebd

Browse files
committed
Removed py_stringmatching
1 parent 3201fcc commit b21bebd

4 files changed

Lines changed: 198 additions & 44 deletions

File tree

pyproject.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ build-backend = "setuptools.build_meta"
66

77
[project]
88
name = "pyjedai"
9-
version = "0.3.3"
9+
version = "0.3.4"
1010
description = "An open-source library that builds powerful end-to-end Entity Resolution workflows."
1111
readme = "README.md"
1212
authors = [
13-
{ name = "Lefteris Stetsikas", email = "skantzoxoiros45@gmail.com" },
13+
{ name = "Lefteris Stetsikas", email = "lstetsikas3@gmail.com" },
1414
{ name = "Konstantinos Nikoletos", email = "nikoletos.kon@gmail.com" },
1515
{ name = "George Papadakis", email = "gpapadis84@gmail.com" },
1616
{ name = "Jakub Maciejewski", email = "jacobb.maciejewski@gmail.com"},
@@ -51,11 +51,11 @@ dependencies = [
5151
"transformers",
5252
"sentence-transformers",
5353
"faiss-cpu",
54-
"py-stringmatching",
5554
"valentine; python_version > '3.7'",
5655
"ordered-set",
5756
"shapely",
58-
"ollama"
57+
"ollama",
58+
"StringCompare @ git+https://github.com/OlivierBinette/StringCompare.git@release"
5959
]
6060

6161

src/pyjedai/matching.py

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,9 @@
77
import matplotlib.pyplot as plt
88
import numpy as np
99
from networkx import Graph
10-
from py_stringmatching.similarity_measure.cosine import Cosine
11-
from py_stringmatching.similarity_measure.dice import Dice
12-
from py_stringmatching.similarity_measure.generalized_jaccard import \
13-
GeneralizedJaccard
14-
from py_stringmatching.similarity_measure.jaccard import Jaccard
15-
from py_stringmatching.similarity_measure.jaro import Jaro
16-
from py_stringmatching.similarity_measure.levenshtein import Levenshtein
17-
from py_stringmatching.similarity_measure.overlap_coefficient import \
18-
OverlapCoefficient
19-
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
20-
from py_stringmatching.tokenizer.whitespace_tokenizer import \
21-
WhitespaceTokenizer
10+
from .string_matchers import WhitespaceTokenizer, Cosine, Jaccard, GeneralizedJaccard, Dice, OverlapCoefficient
11+
from stringcompare import Levenshtein, Jaro
12+
from stringcompare.preprocessing import NGramTokenizer
2213
from tqdm.autonotebook import tqdm
2314

2415
from .datamodel import Data, PYJEDAIFeature
@@ -389,12 +380,12 @@ def __init__(
389380
vectorizer, available_vectorizers
390381
)
391382
)
383+
392384
elif(tokenizer is not None):
393385
if tokenizer == 'white_space_tokenizer':
394-
self._tokenizer = WhitespaceTokenizer(return_set=self.tokenizer_return_set)
386+
self._tokenizer = WhitespaceTokenizer()
395387
elif tokenizer == 'char_tokenizer':
396-
self._tokenizer = QgramTokenizer(qval=self.qgram,
397-
return_set=self.tokenizer_return_set)
388+
self._tokenizer = NGramTokenizer(n=self.qgram)
398389
elif tokenizer == 'word_tokenizer':
399390
self._tokenizer = WordQgramTokenizer(q=self.qgram)
400391
elif tokenizer not in available_tokenizers:
@@ -508,27 +499,41 @@ def _similarity(self, entity_id1: int, entity_id2: int) -> float:
508499
for attribute, weight in self.attributes.items():
509500
e1 = self.data.entities.iloc[entity_id1][attribute].lower()
510501
e2 = self.data.entities.iloc[entity_id2][attribute].lower()
511-
512-
similarity += weight*metrics_mapping[self._metric].get_sim_score(
513-
self._tokenizer.tokenize(e1) if self._metric in set_metrics else e1,
514-
self._tokenizer.tokenize(e2) if self._metric in set_metrics else e2
515-
)
502+
if self.tokenizer_return_set:
503+
similarity += weight*metrics_mapping[self._metric].compare(
504+
set(self._tokenizer.tokenize(e1)) if self._metric in set_metrics else e1,
505+
set(self._tokenizer.tokenize(e2)) if self._metric in set_metrics else e2
506+
)
507+
else:
508+
similarity += weight*metrics_mapping[self._metric].compare(
509+
self._tokenizer.tokenize(e1) if self._metric in set_metrics else e1,
510+
self._tokenizer.tokenize(e2) if self._metric in set_metrics else e2
511+
)
516512
elif isinstance(self.attributes, list):
517513
for attribute in self.attributes:
518514
e1 = self.data.entities.iloc[entity_id1][attribute].lower()
519515
e2 = self.data.entities.iloc[entity_id2][attribute].lower()
520-
similarity += metrics_mapping[self._metric].get_sim_score(
521-
self._tokenizer.tokenize(e1) if self._metric in set_metrics else e1,
522-
self._tokenizer.tokenize(e2) if self._metric in set_metrics else e2
523-
)
516+
if self.tokenizer_return_set:
517+
similarity += metrics_mapping[self._metric].compare(
518+
set(self._tokenizer.tokenize(e1)) if self._metric in set_metrics else e1,
519+
set(self._tokenizer.tokenize(e2)) if self._metric in set_metrics else e2
520+
)
521+
else:
522+
similarity += metrics_mapping[self._metric].compare(
523+
self._tokenizer.tokenize(e1) if self._metric in set_metrics else e1,
524+
self._tokenizer.tokenize(e2) if self._metric in set_metrics else e2
525+
)
524526
similarity /= len(self.attributes)
525527
else:
526-
# concatenated row string
527528
e1 = self.data.entities.iloc[entity_id1].str.cat(sep=' ').lower()
528529
e2 = self.data.entities.iloc[entity_id2].str.cat(sep=' ').lower()
529-
te1 = self._tokenizer.tokenize(e1) if self._metric in set_metrics else e1
530-
te2 = self._tokenizer.tokenize(e2) if self._metric in set_metrics else e2
531-
similarity = metrics_mapping[self._metric].get_sim_score(te1, te2)
530+
if self.tokenizer_return_set:
531+
te1 = set(self._tokenizer.tokenize(e1)) if self._metric in set_metrics else e1
532+
te2 = set(self._tokenizer.tokenize(e2)) if self._metric in set_metrics else e2
533+
else:
534+
te1 = self._tokenizer.tokenize(e1) if self._metric in set_metrics else e1
535+
te2 = self._tokenizer.tokenize(e2) if self._metric in set_metrics else e2
536+
similarity = metrics_mapping[self._metric].compare(te1, te2)
532537
return similarity
533538

534539
def _configuration(self) -> dict:

src/pyjedai/prioritization.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,9 @@
1515
from .vector_based_blocking import EmbeddingsNNBlockBuilding
1616

1717
from networkx import Graph
18-
from py_stringmatching.similarity_measure.cosine import Cosine
19-
from py_stringmatching.similarity_measure.dice import Dice
20-
from py_stringmatching.similarity_measure.generalized_jaccard import \
21-
GeneralizedJaccard
22-
from py_stringmatching.similarity_measure.jaccard import Jaccard
23-
from py_stringmatching.similarity_measure.jaro import Jaro
24-
from py_stringmatching.similarity_measure.levenshtein import Levenshtein
25-
from py_stringmatching.similarity_measure.overlap_coefficient import \
26-
OverlapCoefficient
27-
from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer
28-
from py_stringmatching.tokenizer.whitespace_tokenizer import \
29-
WhitespaceTokenizer
18+
from .string_matchers import WhitespaceTokenizer, Cosine, Jaccard, GeneralizedJaccard, Dice, OverlapCoefficient
19+
from stringcompare import Levenshtein, Jaro
20+
from stringcompare.preprocessing import NGramTokenizer
3021
from sklearn.metrics.pairwise import pairwise_distances
3122
from tqdm.autonotebook import tqdm
3223

src/pyjedai/string_matchers.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
"""
2+
This code was based on py_stringmatching: https://github.com/anhaidgroup/py_stringmatching
3+
"""
4+
from abc import ABC, abstractmethod
5+
from stringcompare import Jaro
6+
import re
7+
8+
9+
class WhitespaceTokenizer(ABC):
    """Tokenizer that splits a string on runs of whitespace.

    Drop-in replacement for py_stringmatching's WhitespaceTokenizer:
    leading/trailing whitespace is ignored and empty tokens are never
    produced.
    """

    def tokenize(self, sentence):
        """Return the list of whitespace-separated tokens in *sentence*.

        ``str.split()`` with no arguments already splits on arbitrary
        whitespace runs and discards empty strings, so the original
        per-call ``re.compile`` + filter pass is unnecessary.
        """
        return sentence.split()
15+
16+
class StringMatcher(ABC):
    """Abstract base for token-collection similarity measures.

    Modelled on py_stringmatching; concrete subclasses implement
    :meth:`compare` and may use the shared guard helpers below.
    """

    # Kept for backward compatibility with existing callers.
    flag = True

    def check_instance_type(self, te1, te2) -> None:
        """Raise ``TypeError`` unless both operands are lists or sets."""
        for operand in (te1, te2):
            if not isinstance(operand, (list, set)):
                raise TypeError("Must be either list or set")

    def exact_match(self, te1, te2):
        """Return True when the two token collections compare equal."""
        return te1 == te2

    def empty_match(self, te1, te2):
        """Return True when either token collection is empty."""
        return len(te1) == 0 or len(te2) == 0

    @abstractmethod
    def compare(self, te1, te2):
        """Return the similarity score of the two token collections."""
37+
38+
39+
class Cosine(StringMatcher):
    """Cosine similarity over token sets: |A ∩ B| / (sqrt(|A|) * sqrt(|B|))."""

    def compare(self, te1, te2) -> float:
        """Return the cosine similarity of the two token collections.

        Duplicate tokens are ignored (set semantics), matching the
        py_stringmatching Cosine measure this class ports. The original
        used raw ``len(te1)``/``len(te2)`` for the norms while the
        intersection was set-based, so duplicates in list input
        deflated the score; both sides now use set cardinalities.
        """
        self.check_instance_type(te1, te2)

        set1 = set(te1)
        set2 = set(te2)

        # Identical token sets are a perfect match.
        if self.exact_match(set1, set2):
            return 1.0

        # Similarity with an empty collection is defined as 0.
        if self.empty_match(set1, set2):
            return 0.0

        # Both sets are non-empty here, so the denominator is positive.
        return len(set1 & set2) / ((len(set1) ** 0.5) * (len(set2) ** 0.5))
56+
57+
class Dice(StringMatcher):
    """Dice coefficient over token sets: 2·|A ∩ B| / (|A| + |B|)."""

    def compare(self, te1, te2) -> float:
        """Return the Dice similarity of the two token collections.

        Duplicate tokens are ignored (set semantics). The stray debug
        ``print`` the original left in this hot comparison path has
        been removed.
        """
        self.check_instance_type(te1, te2)

        set1 = set(te1)
        set2 = set(te2)

        # Identical token sets are a perfect match.
        if self.exact_match(set1, set2):
            return 1.0

        # Similarity with an empty collection is defined as 0.
        if self.empty_match(set1, set2):
            return 0.0

        return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
74+
75+
class Jaccard(StringMatcher):
    """Jaccard similarity over token sets: |A ∩ B| / |A ∪ B|."""

    def compare(self, te1, te2) -> float:
        """Return the Jaccard similarity of the two token collections.

        Duplicate tokens are ignored (set semantics).
        """
        self.check_instance_type(te1, te2)

        set1 = set(te1)
        set2 = set(te2)

        # Identical token sets are a perfect match.
        if self.exact_match(set1, set2):
            return 1.0

        # Similarity with an empty collection is defined as 0.
        if self.empty_match(set1, set2):
            return 0.0

        intersection = len(set1 & set2)
        # |A ∪ B| = |A| + |B| - |A ∩ B|.  The original ADDED the
        # intersection in the denominator, which is not Jaccard and
        # systematically underestimates the similarity.
        return intersection / (len(set1) + len(set2) - intersection)
92+
93+
class GeneralizedJaccard(StringMatcher):
    """Generalized Jaccard similarity: tokens match softly via Jaro
    similarity above a threshold instead of requiring exact equality.
    """

    def compare(self, te1, te2) -> float:
        """Return the generalized Jaccard similarity of the two token
        collections.

        Token pairs whose Jaro score exceeds 0.5 become match
        candidates; candidates are accepted greedily in decreasing
        score order, each token used at most once, and the summed
        match scores are normalised by |A| + |B| - (match count).
        """
        self.check_instance_type(te1, te2)

        set1 = set(te1)
        set2 = set(te2)

        # Identical token sets are a perfect match.
        if self.exact_match(set1, set2):
            return 1.0

        # Similarity with an empty collection is defined as 0.
        if self.empty_match(set1, set2):
            return 0.0

        # Hoisted out of the O(|A|*|B|) loop: the original built a
        # fresh Jaro() instance for every token pair.
        jaro = Jaro()
        threshold = 0.5
        candidates = []
        for token1 in set1:
            for token2 in set2:
                score = jaro.compare(token1, token2)
                # Guard kept from the py_stringmatching original: the
                # inner measure must be normalised to [0, 1].
                if score > 1 or score < 0:
                    raise ValueError('Similarity measure should' + \
                                     ' return value in the range [0,1]')
                if score > threshold:
                    candidates.append((token1, token2, score))

        # Greedily accept the best-scoring pairs, never reusing a
        # token from either side.
        candidates.sort(key=lambda match: match[2], reverse=True)
        matched1 = set()
        matched2 = set()
        match_score = 0.0
        match_count = 0
        for token1, token2, score in candidates:
            if token1 not in matched1 and token2 not in matched2:
                matched1.add(token1)
                matched2.add(token2)
                match_score += score
                match_count += 1

        return float(match_score) / float(len(set1) + len(set2) - match_count)
142+
143+
class OverlapCoefficient(StringMatcher):
    """Overlap coefficient over token sets: |A ∩ B| / min(|A|, |B|)."""

    def compare(self, te1, te2) -> float:
        """Return the overlap coefficient of the two token collections.

        Duplicate tokens are ignored (set semantics).
        """
        self.check_instance_type(te1, te2)

        tokens_a, tokens_b = set(te1), set(te2)

        # Identical token sets are a perfect match.
        if self.exact_match(tokens_a, tokens_b):
            return 1.0

        # Similarity with an empty collection is defined as 0.
        if self.empty_match(tokens_a, tokens_b):
            return 0.0

        smaller_size = min(len(tokens_a), len(tokens_b))
        return float(len(tokens_a & tokens_b)) / smaller_size
158+

0 commit comments

Comments
 (0)