Add test coverage for flair and spaCy NER entity extraction and model loading

gsaint · gsaint · commit 78563a43a743 · 2026-01-28T16:13:00.000+01:00
diff --git a/tests/python/unit/python-lib/ner/test_flair.py b/tests/python/unit/python-lib/ner/test_flair.py
@@ -1,13 +1,16 @@
+# -*- coding: utf-8 -*-
 import json
 
 import pandas as pd
+import pytest
+from flair.models import SequenceTagger
 
 from ner.constants import (
     COLUMN_PER_ENTITY_FORMAT,
     JSON_KEY_PER_ENTITY_FORMAT,
     JSON_LABELING_FORMAT
 )
-from ner.flair import extract_entities
+from ner.flair import extract_entities, get_model
 
 TEST_SENTENCE = "Mark Zuckerberg is one of the founders of Facebook, a company from the United States"
 
@@ -42,6 +45,58 @@ def test_extract_entities():
                 'ORG': ['Facebook'],
                 'LOC': ['United States'],
             })
-            
         })
-    )
+    )
+
+def test_extract_entities_empty_text():
+    """An empty input string yields a single row with an empty entity list."""
+    texts = pd.Series([''], name='text')
+    output = extract_entities(texts, JSON_LABELING_FORMAT, "en")
+    assert len(output) == 1
+    assert (output['sentence'].iloc[0], json.loads(output['entities'].iloc[0])) == ('', [])
+
+def test_extract_entities_no_entities():
+    """A sentence expected to contain no entities comes back with an empty entity list."""
+    plain_text = 'Hello world, this is a simple test.'
+    output = extract_entities(pd.Series([plain_text], name='text'), JSON_LABELING_FORMAT, "en")
+    assert len(output) == 1
+    assert (output['sentence'].iloc[0], json.loads(output['entities'].iloc[0])) == (plain_text, [])
+
+def test_extract_entities_unicode():
+    """Non-ASCII characters survive extraction and the entities column stays valid JSON."""
+    accented = 'Müller works at Nestlé in Zürich.'
+    output = extract_entities(pd.Series([accented], name='text'), JSON_LABELING_FORMAT, "en")
+    assert len(output) == 1
+    # The sentence column must round-trip the accented characters unchanged.
+    assert output['sentence'].iloc[0] == accented
+    # json.loads raises on malformed JSON, so a successful parse is part of the check.
+    parsed = json.loads(output['entities'].iloc[0])
+    assert isinstance(parsed, list)
+
+def test_extract_entities_multiple_same_type():
+    """Per-type columns for a sentence with repeated entity types decode to JSON lists."""
+    texts = pd.DataFrame({'text': ['John and Mary went to Paris and London.']})['text']
+    result = extract_entities(texts, COLUMN_PER_ENTITY_FORMAT, "en")
+    assert len(result) == 1
+    # Require at least one expected column so the test cannot pass without checking anything.
+    found = [label for label in ('PER', 'LOC') if label in result.columns]
+    assert found, "expected a PER or LOC column, got: %s" % list(result.columns)
+    for label in found:
+        # Each per-entity column must decode to a JSON list.
+        assert isinstance(json.loads(result[label].iloc[0]), list)
+
+def test_extract_entities_multiple_rows():
+    """Two input sentences produce two output rows."""
+    sentences = ['Apple is based in California.', 'Microsoft was founded by Bill Gates.']
+    texts = pd.Series(sentences, name='text')
+    # Only the row count is checked here.
+    output = extract_entities(texts, JSON_LABELING_FORMAT, "en")
+    assert len(output) == 2
+
+def test_get_model_legacy_mapping():
+    """The "en" identifier loads a flair SequenceTagger."""
+    assert isinstance(get_model("en"), SequenceTagger)
+
+def test_get_model_invalid_id():
+    """An unknown model identifier raises KeyError."""
+    pytest.raises(KeyError, get_model, "invalid_language_code")
diff --git a/tests/python/unit/python-lib/ner/test_spacy.py b/tests/python/unit/python-lib/ner/test_spacy.py
@@ -1,13 +1,16 @@
+# -*- coding: utf-8 -*-
 import json
+
 import pandas as pd
+import pytest
+import spacy
 
 from ner.constants import (
     COLUMN_PER_ENTITY_FORMAT,
     JSON_KEY_PER_ENTITY_FORMAT,
     JSON_LABELING_FORMAT
 )
-from ner.spacy import extract_entities
-
+from ner.spacy import extract_entities, get_model, SPACY_LANGUAGE_MODELS_LEGACY_MAPPING
 
 TEST_SENTENCE = "Mark Zuckerberg is one of the founders of Facebook, a company from the United States"
 
@@ -43,4 +46,68 @@ def test_extract_entities():
                 'GPE': ['the United States']
             })
         })
-    )
+    )
+
+def test_extract_entities_empty_text():
+    """An empty input string yields a single row with an empty entity list."""
+    texts = pd.Series([''], name='text')
+    output = extract_entities(texts, JSON_LABELING_FORMAT, "en")
+    assert len(output) == 1
+    assert (output['sentence'].iloc[0], json.loads(output['entities'].iloc[0])) == ('', [])
+
+def test_extract_entities_no_entities():
+    """A sentence expected to contain no entities comes back with an empty entity list."""
+    plain_text = 'Hello world, this is a simple test.'
+    output = extract_entities(pd.Series([plain_text], name='text'), JSON_LABELING_FORMAT, "en")
+    assert len(output) == 1
+    assert (output['sentence'].iloc[0], json.loads(output['entities'].iloc[0])) == (plain_text, [])
+
+def test_extract_entities_unicode():
+    """Non-ASCII characters survive extraction and the entities column stays valid JSON."""
+    accented = 'Müller works at Nestlé in Zürich.'
+    output = extract_entities(pd.Series([accented], name='text'), JSON_LABELING_FORMAT, "en")
+    assert len(output) == 1
+    # The sentence column must round-trip the accented characters unchanged.
+    assert output['sentence'].iloc[0] == accented
+    # json.loads raises on malformed JSON, so a successful parse is part of the check.
+    parsed = json.loads(output['entities'].iloc[0])
+    assert isinstance(parsed, list)
+
+def test_extract_entities_multiple_same_type():
+    """Per-type columns for a sentence with repeated entity types decode to JSON lists."""
+    texts = pd.DataFrame({'text': ['John and Mary went to Paris and London.']})['text']
+    result = extract_entities(texts, COLUMN_PER_ENTITY_FORMAT, "en")
+    assert len(result) == 1
+    # Require at least one expected column so the test cannot pass without checking anything.
+    found = [label for label in ('PERSON', 'GPE') if label in result.columns]
+    assert found, "expected a PERSON or GPE column, got: %s" % list(result.columns)
+    for label in found:
+        # Each per-entity column must decode to a JSON list.
+        assert isinstance(json.loads(result[label].iloc[0]), list)
+
+def test_extract_entities_multiple_rows():
+    """Two input sentences produce two output rows."""
+    sentences = ['Apple is based in California.', 'Microsoft was founded by Bill Gates.']
+    texts = pd.Series(sentences, name='text')
+    # Only the row count is checked here.
+    output = extract_entities(texts, JSON_LABELING_FORMAT, "en")
+    assert len(output) == 2
+
+def test_get_model_english():
+    """The "en" identifier resolves to a non-None object with a ``pipe`` attribute."""
+    english_model = get_model("en")
+    assert english_model is not None and hasattr(english_model, 'pipe')
+
+def test_get_model_french():
+    """The "fr" identifier resolves to a non-None object with a ``pipe`` attribute."""
+    french_model = get_model("fr")
+    assert french_model is not None and hasattr(french_model, 'pipe')
+
+def test_get_model_invalid_id():
+    """An unknown model identifier raises KeyError."""
+    pytest.raises(KeyError, get_model, "invalid_language_code")
+
+def test_language_models_mapping_completeness():
+    """Every expected language code is contained in the legacy mapping."""
+    expected = ("en", "es", "zh", "pl", "fr", "de", "ja", "nb")
+    assert [code for code in expected if code not in SPACY_LANGUAGE_MODELS_LEGACY_MAPPING] == []