import pandas as pd
from pandas import DataFrame

from ..datamodel import Data
from .utils import vectorFromTFIDF
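
# Note: vectorFromTFIDF(df, column) is provided by this package's utils; it is
# assumed here to return a TF-IDF vector representation of the given column's
# values, which the loaders below store as an opaque per-attribute 'data' field.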


class Schema(Data):
    """
    The schema variant of pyjedai's data model. Holds the pair of datasets
    whose attributes (rather than records) will be matched, along with all
    the information that is passed to each step of the ER workflow.
    """

    def __init__(
            self,
            dataset_1: DataFrame,
            dataset_2: DataFrame,
            dataset_name_1: str = None,
            dataset_name_2: str = None,
            ground_truth: DataFrame = None,
            skip_ground_truth_processing: bool = False,
            matching_type: str = None,
    ) -> None:
        # Original datasets must be pandas DataFrames
        if not isinstance(dataset_1, pd.DataFrame):
            raise AttributeError("Dataset 1 must be a pandas DataFrame")

        if not isinstance(dataset_2, pd.DataFrame):
            raise AttributeError("Dataset 2 must be a pandas DataFrame")

        # Dispatch on the matching type: CONTENT keeps per-attribute TF-IDF
        # vectors only, COMPOSITE keeps attribute names plus vectors, and the
        # default keeps attribute names only (pure schema matching).
        if matching_type == 'CONTENT':
            dataset_1, dataset_2, ground_truth = self.load_content(
                dataset_1, dataset_2, ground_truth, skip_ground_truth_processing)
        elif matching_type == 'COMPOSITE':
            dataset_1, dataset_2, ground_truth = self.load_composite(
                dataset_1, dataset_2, ground_truth, skip_ground_truth_processing)
        else:
            dataset_1, dataset_2, ground_truth = self.load_schema(
                dataset_1, dataset_2, ground_truth, skip_ground_truth_processing)

        super().__init__(dataset_1=dataset_1,
                         id_column_name_1='id',
                         dataset_name_1=dataset_name_1,
                         dataset_2=dataset_2,
                         id_column_name_2='id',
                         dataset_name_2=dataset_name_2,
                         ground_truth=ground_truth,
                         skip_ground_truth_processing=skip_ground_truth_processing)

    def load_content(self,
                     dataset_1: DataFrame,
                     dataset_2: DataFrame,
                     ground_truth: DataFrame = None,
                     skip_ground_truth_processing: bool = False) -> tuple:
        """Represent every attribute by a TF-IDF vector of its values."""
        dataset_1 = dataset_1.astype(str)
        dataset_2 = dataset_2.astype(str)

        source_attributes = dataset_1.columns
        target_attributes = dataset_2.columns

        # One entity per attribute: its positional index and its TF-IDF vector
        source_index = range(len(source_attributes))
        source_data = [vectorFromTFIDF(dataset_1, col) for col in source_attributes]

        target_index = range(len(target_attributes))
        target_data = [vectorFromTFIDF(dataset_2, col) for col in target_attributes]

        dataset_1 = pd.DataFrame({
            'id': source_index,
            'data': source_data
        })

        dataset_2 = pd.DataFrame({
            'id': target_index,
            'data': target_data
        })

        # Lookup tables from attribute name to positional index
        dataset_1_columns = pd.DataFrame({
            'source': source_attributes,
            'source_index': source_index
        })

        dataset_2_columns = pd.DataFrame({
            'target': target_attributes,
            'target_index': target_index
        })

        # Translate the ground truth from attribute names to positional ids
        if ground_truth is not None and not skip_ground_truth_processing:
            ground_truth.columns = ['source', 'target']
            ground_truth = pd.merge(ground_truth, dataset_1_columns, on='source', how='left')
            ground_truth = pd.merge(ground_truth, dataset_2_columns, on='target', how='left')
            ground_truth = ground_truth.drop(columns=['source', 'target'])

        self.dataset_1 = dataset_1
        self.dataset_2 = dataset_2
        self.ground_truth = ground_truth
        return dataset_1, dataset_2, ground_truth
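
    # Illustrative example (hypothetical attribute names): a ground truth of
    # matching column names such as
    #     source   target
    #     'name'   'full_name'
    # is rewritten by the merges above into positional indices, e.g.
    #     source_index   target_index
    #     0              2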

    def load_composite(self,
                       dataset_1: DataFrame,
                       dataset_2: DataFrame,
                       ground_truth: DataFrame = None,
                       skip_ground_truth_processing: bool = False) -> tuple:
        """Represent every attribute by its name together with a TF-IDF
        vector of its values."""
        dataset_1 = dataset_1.astype(str)
        dataset_2 = dataset_2.astype(str)

        source_attributes = dataset_1.columns
        target_attributes = dataset_2.columns

        # One entity per attribute: positional index, name and TF-IDF vector
        source_index = range(len(source_attributes))
        source_data = [vectorFromTFIDF(dataset_1, col) for col in source_attributes]

        target_index = range(len(target_attributes))
        target_data = [vectorFromTFIDF(dataset_2, col) for col in target_attributes]

        dataset_1 = pd.DataFrame({
            'id': source_index,
            'attributes': source_attributes,
            'data': source_data
        })

        dataset_2 = pd.DataFrame({
            'id': target_index,
            'attributes': target_attributes,
            'data': target_data
        })

        # Lookup tables from attribute name to positional index
        dataset_1_columns = pd.DataFrame({
            'source': source_attributes,
            'source_index': source_index
        })

        dataset_2_columns = pd.DataFrame({
            'target': target_attributes,
            'target_index': target_index
        })

        # Translate the ground truth from attribute names to positional ids
        if ground_truth is not None and not skip_ground_truth_processing:
            ground_truth.columns = ['source', 'target']
            ground_truth = pd.merge(ground_truth, dataset_1_columns, on='source', how='left')
            ground_truth = pd.merge(ground_truth, dataset_2_columns, on='target', how='left')
            ground_truth = ground_truth.drop(columns=['source', 'target'])

        return dataset_1, dataset_2, ground_truth
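
    # Illustrative result (hypothetical columns): for a source table with
    # columns ['name', 'age'], load_composite yields
    #     id   attributes   data
    #     0    'name'       <TF-IDF vector of the name values>
    #     1    'age'        <TF-IDF vector of the age values>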

    def load_schema(self,
                    dataset_1: DataFrame,
                    dataset_2: DataFrame,
                    ground_truth: DataFrame = None,
                    skip_ground_truth_processing: bool = False) -> tuple:
        """Represent every attribute by its name only (no content vectors)."""
        dataset_1 = dataset_1.astype(str)
        dataset_2 = dataset_2.astype(str)

        source_attributes = dataset_1.columns
        target_attributes = dataset_2.columns

        source_index = range(len(source_attributes))
        target_index = range(len(target_attributes))

        dataset_1 = pd.DataFrame({
            'id': source_index,
            'attributes': source_attributes,
        })

        dataset_2 = pd.DataFrame({
            'id': target_index,
            'attributes': target_attributes,
        })

        # Lookup tables from attribute name to positional index
        dataset_1_columns = pd.DataFrame({
            'source': source_attributes,
            'source_index': source_index
        })

        dataset_2_columns = pd.DataFrame({
            'target': target_attributes,
            'target_index': target_index
        })

        # Translate the ground truth from attribute names to positional ids
        if ground_truth is not None and not skip_ground_truth_processing:
            ground_truth.columns = ['source', 'target']
            ground_truth = pd.merge(ground_truth, dataset_1_columns, on='source', how='left')
            ground_truth = pd.merge(ground_truth, dataset_2_columns, on='target', how='left')
            ground_truth = ground_truth.drop(columns=['source', 'target'])

        return dataset_1, dataset_2, ground_truth
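

# Example usage (a minimal sketch; the CSV paths, column names and ground-truth
# file are hypothetical, and the module is assumed to be imported as part of
# the pyjedai package rather than run as a script):
#
#     d1 = pd.read_csv("source.csv")
#     d2 = pd.read_csv("target.csv")
#     gt = pd.read_csv("gt.csv")  # two columns: source name, target name
#     schema = Schema(d1, d2, ground_truth=gt, matching_type='CONTENT')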