def clean_dataset(self,
                  remove_stopwords: bool = True,
                  remove_punctuation: bool = True,
                  remove_numbers: bool = True,
                  remove_unicodes: bool = True) -> None:
    """Lower-case the dataset attribute columns and optionally strip
    stopwords, punctuation, non-ASCII characters and digit runs.

    Cleans the configured attribute columns of ``self.dataset_1`` (and
    ``self.dataset_2`` unless ``self.is_dirty_er`` is set) in place, then
    refreshes ``self.entities`` from the string-cast ``self.dataset_1``.

    Parameters
    ----------
    remove_stopwords : bool
        Drop English NLTK stopwords from every cell (downloads the
        corpus on first use only).
    remove_punctuation : bool
        Strip characters matching ``[^\\w\\s]`` from every cell.
    remove_numbers : bool
        Strip digit runs (``\\d+``) from every cell.
    remove_unicodes : bool
        Strip non-ASCII characters (``[^\\x00-\\x7F]+``) from every cell.
    """
    # NOTE(review): the parameters preceding `remove_unicodes` were outside
    # the visible hunk; names are taken from body usage and defaults are
    # assumed True — confirm order/defaults against the full file.
    stop_words = None
    if remove_stopwords:
        try:
            # Fast path: corpus already installed locally — avoid the
            # unconditional network round-trip of nltk.download() per call.
            stop_words = set(stopwords.words('english'))
        except LookupError:
            nltk.download('stopwords')
            stop_words = set(stopwords.words('english'))

    # Compile each pattern once instead of re-parsing it for every cell.
    digits_re = re.compile(r'\d+')
    non_ascii_re = re.compile(r'[^\x00-\x7F]+')
    punct_re = re.compile(r'[^\w\s]')

    def _clean_dataframe(dataframe: DataFrame, columns: list) -> DataFrame:
        """Clean `columns` of `dataframe` in place and return it."""
        if not columns:
            return dataframe

        # applymap is per-cell; deprecated in favour of DataFrame.map in
        # pandas >= 2.1 but kept here for older-pandas compatibility.
        cleaned = dataframe.loc[:, columns].applymap(str.lower)

        if remove_numbers:
            cleaned = cleaned.applymap(lambda x: digits_re.sub('', x))
        if remove_unicodes:
            cleaned = cleaned.applymap(lambda x: non_ascii_re.sub('', x))
        if remove_punctuation:
            cleaned = cleaned.applymap(lambda x: punct_re.sub('', x))
        if remove_stopwords:
            cleaned = cleaned.applymap(
                lambda x: ' '.join(w for w in x.split() if w not in stop_words))

        dataframe.loc[:, columns] = cleaned
        return dataframe

    self.dataset_1 = _clean_dataframe(self.dataset_1, self.attributes_1)
    if not self.is_dirty_er:
        self.dataset_2 = _clean_dataframe(self.dataset_2, self.attributes_2)

    self.entities = self.dataset_1 = self.dataset_1.astype(str)
0 commit comments