Commit d53eb21

Schema model added for schema matching with pyJedAI
1 parent 792f02c commit d53eb21

2 files changed

Lines changed: 261 additions & 0 deletions

File tree

src/pyjedai/schema/schema_model.py
src/pyjedai/schema/utils.py

src/pyjedai/schema/schema_model.py

Lines changed: 209 additions & 0 deletions

import pandas as pd
from pandas import DataFrame

from ..datamodel import Data
from .utils import vectorFromTFIDF


class Schema(Data):
    """
    The corpus of the dataset pair that will be processed with pyjedai.
    Contains all the information of the two schemas and is passed to each
    step of the schema-matching workflow.
    """

    def __init__(
            self,
            dataset_1: DataFrame,
            dataset_2: DataFrame,
            dataset_name_1: str = None,
            dataset_name_2: str = None,
            ground_truth: DataFrame = None,
            skip_ground_truth_processing: bool = False,
            matching_type: str = None,
    ) -> None:

        # Original datasets must be pandas DataFrames
        if not isinstance(dataset_1, pd.DataFrame):
            raise AttributeError("Dataset 1 must be a pandas DataFrame")

        if not isinstance(dataset_2, pd.DataFrame):
            raise AttributeError("Dataset 2 must be a pandas DataFrame")

        # Dispatch on the requested matching type
        if matching_type == 'CONTENT':
            dataset_1, dataset_2, ground_truth = self.load_content(
                dataset_1, dataset_2, ground_truth, skip_ground_truth_processing)
        elif matching_type == 'COMPOSITE':
            dataset_1, dataset_2, ground_truth = self.load_composite(
                dataset_1, dataset_2, ground_truth, skip_ground_truth_processing)
        else:
            dataset_1, dataset_2, ground_truth = self.load_schema(
                dataset_1, dataset_2, ground_truth, skip_ground_truth_processing)

        super().__init__(dataset_1=dataset_1,
                         id_column_name_1='id',
                         dataset_name_1=dataset_name_1,
                         dataset_2=dataset_2,
                         id_column_name_2='id',
                         dataset_name_2=dataset_name_2,
                         ground_truth=ground_truth,
                         skip_ground_truth_processing=skip_ground_truth_processing)

    def load_content(self,
                     dataset_1: DataFrame,
                     dataset_2: DataFrame,
                     ground_truth: DataFrame = None,
                     skip_ground_truth_processing: bool = False) -> tuple:

        dataset_1 = dataset_1.astype(str)
        dataset_2 = dataset_2.astype(str)

        source_attributes = dataset_1.columns
        target_attributes = dataset_2.columns

        # Represent each attribute by the top TF-IDF terms of its values
        source_index = range(len(source_attributes))
        source_data = [vectorFromTFIDF(dataset_1, col) for col in source_attributes]

        target_index = range(len(target_attributes))
        target_data = [vectorFromTFIDF(dataset_2, col) for col in target_attributes]

        dataset_1 = pd.DataFrame({
            'id': source_index,
            'data': source_data
        })

        dataset_2 = pd.DataFrame({
            'id': target_index,
            'data': target_data
        })

        dataset_1_columns = pd.DataFrame({
            'source': source_attributes,
            'source_index': source_index
        })

        dataset_2_columns = pd.DataFrame({
            'target': target_attributes,
            'target_index': target_index
        })

        # Map the attribute-name pairs of the ground truth to attribute indices
        if ground_truth is not None and not skip_ground_truth_processing:
            ground_truth.columns = ['source', 'target']
            ground_truth = pd.merge(ground_truth, dataset_1_columns, on='source', how='left')
            ground_truth = pd.merge(ground_truth, dataset_2_columns, on='target', how='left')
            ground_truth = ground_truth.drop(columns=['source', 'target'])

        self.dataset_1 = dataset_1
        self.dataset_2 = dataset_2
        self.ground_truth = ground_truth
        return dataset_1, dataset_2, ground_truth

    def load_composite(self,
                       dataset_1: DataFrame,
                       dataset_2: DataFrame,
                       ground_truth: DataFrame = None,
                       skip_ground_truth_processing: bool = False
                       ) -> tuple:
        dataset_1 = dataset_1.astype(str)
        dataset_2 = dataset_2.astype(str)

        source_attributes = dataset_1.columns
        target_attributes = dataset_2.columns

        # Composite matching keeps both the attribute names and their TF-IDF content
        source_index = range(len(source_attributes))
        source_data = [vectorFromTFIDF(dataset_1, col) for col in source_attributes]

        target_index = range(len(target_attributes))
        target_data = [vectorFromTFIDF(dataset_2, col) for col in target_attributes]

        dataset_1 = pd.DataFrame({
            'id': source_index,
            'attributes': source_attributes,
            'data': source_data
        })

        dataset_2 = pd.DataFrame({
            'id': target_index,
            'attributes': target_attributes,
            'data': target_data
        })

        dataset_1_columns = pd.DataFrame({
            'source': source_attributes,
            'source_index': source_index
        })

        dataset_2_columns = pd.DataFrame({
            'target': target_attributes,
            'target_index': target_index
        })

        if ground_truth is not None and not skip_ground_truth_processing:
            ground_truth.columns = ['source', 'target']
            ground_truth = pd.merge(ground_truth, dataset_1_columns, on='source', how='left')
            ground_truth = pd.merge(ground_truth, dataset_2_columns, on='target', how='left')
            ground_truth = ground_truth.drop(columns=['source', 'target'])

        return dataset_1, dataset_2, ground_truth

    def load_schema(self,
                    dataset_1: DataFrame,
                    dataset_2: DataFrame,
                    ground_truth: DataFrame = None,
                    skip_ground_truth_processing: bool = False
                    ) -> tuple:
        dataset_1 = dataset_1.astype(str)
        dataset_2 = dataset_2.astype(str)

        source_attributes = dataset_1.columns
        target_attributes = dataset_2.columns

        # Schema-only matching uses just the attribute names, no column content
        source_index = range(len(source_attributes))
        target_index = range(len(target_attributes))

        dataset_1 = pd.DataFrame({
            'id': source_index,
            'attributes': source_attributes,
        })

        dataset_2 = pd.DataFrame({
            'id': target_index,
            'attributes': target_attributes,
        })

        dataset_1_columns = pd.DataFrame({
            'source': source_attributes,
            'source_index': source_index
        })

        dataset_2_columns = pd.DataFrame({
            'target': target_attributes,
            'target_index': target_index
        })

        if ground_truth is not None and not skip_ground_truth_processing:
            ground_truth.columns = ['source', 'target']
            ground_truth = pd.merge(ground_truth, dataset_1_columns, on='source', how='left')
            ground_truth = pd.merge(ground_truth, dataset_2_columns, on='target', how='left')
            ground_truth = ground_truth.drop(columns=['source', 'target'])

        return dataset_1, dataset_2, ground_truth
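
As a minimal usage sketch (the toy DataFrames, their column names, and the ground-truth pairs below are illustrative, not part of the commit), `matching_type` selects which loader prepares the data; the 'CONTENT' and 'COMPOSITE' modes additionally need NLTK's punkt and stopwords resources for the TF-IDF helper:

import pandas as pd
from pyjedai.schema.schema_model import Schema

# Two hypothetical tables whose attributes should be matched
movies_a = pd.DataFrame({'title': ['The Matrix', 'Alien'], 'year': ['1999', '1979']})
movies_b = pd.DataFrame({'name': ['The Matrix', 'Alien'], 'released': ['1999', '1979']})

# Ground truth pairs attribute names across the two schemas
gt = pd.DataFrame({'source': ['title', 'year'], 'target': ['name', 'released']})

# 'CONTENT' represents each attribute by TF-IDF terms of its values,
# 'COMPOSITE' keeps names plus terms, any other value uses names only
schema = Schema(dataset_1=movies_a, dataset_2=movies_b,
                ground_truth=gt, matching_type='CONTENT')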

src/pyjedai/schema/utils.py

Lines changed: 52 additions & 0 deletions

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import numpy as np


def vectorFromTFIDF(df, column_name) -> list:
    # Collect the column values as strings and drop empty entries
    column_data = [str(x) for x in df[column_name].tolist()]
    column_data = [item for item in column_data if len(item) != 0]
    if len(column_data) == 0:
        return None

    def custom_tokenizer(text):
        if isinstance(text, str):
            # Tokenize words
            tokens = word_tokenize(text.lower())
            # Remove stopwords and punctuation
            stop_words = set(stopwords.words('english') + list(string.punctuation))
            filtered_tokens = [token for token in tokens if token not in stop_words]
            # Apply Porter stemming to the surviving tokens
            stemmer = PorterStemmer()
            processed_tokens = [stemmer.stem(token) for token in filtered_tokens]
            if len(processed_tokens) == 0:
                return None
            return processed_tokens

    tokenized_data = [custom_tokenizer(x) for x in column_data]
    tokenized_data = [item for item in tokenized_data if item is not None]

    # Fall back to the raw strings if every value was filtered out
    if len(tokenized_data) == 0:
        tokenized_data = column_data

    # Identity preprocessor/tokenizer: the data is already tokenized
    def preprocessor(t):
        return t

    def tokenizer(t):
        return t

    # Create TfidfVectorizer object
    vectorizer = TfidfVectorizer(max_features=1000, preprocessor=preprocessor,
                                 tokenizer=tokenizer, stop_words=None)
    tfidf_matrix = vectorizer.fit_transform(tokenized_data)
    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()
    # Aggregate TF-IDF scores for each word across the column
    scores = tfidf_matrix.toarray().sum(axis=0)
    # Sort words by score and keep the top 512
    top_512_words = np.argsort(scores)[::-1][:512]
    top_512_words_list = [feature_names[i] for i in top_512_words]
    # top_512_words_list.append(column_name)
    return top_512_words_list
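
For illustration, calling the helper on a single column might look like this (the DataFrame is made up; NLTK's punkt and stopwords resources must be available):

import nltk
import pandas as pd
from pyjedai.schema.utils import vectorFromTFIDF

nltk.download('punkt')
nltk.download('stopwords')

df = pd.DataFrame({'title': ['The Matrix', 'The Matrix Reloaded', 'Alien']})
# Returns up to 512 stemmed terms from the column, ranked by summed TF-IDF score
print(vectorFromTFIDF(df, 'title'))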
