diff --git a/CHANGELOG.md b/CHANGELOG.md index 7087f3ace..8652fd650 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ All notable changes to this project will be documented in this file. - Added `supported_entity` parameter to `PhoneRecognizer`. Previously, this recognizer hard-coded `["PHONE_NUMBER"]` as the only possible supported entity. #### Fixed +- Fixed `PhoneRecognizer._get_recognizer_result` to use the constructor-provided `supported_entity` instead of the hard-coded `"PHONE_NUMBER"` string, making the `supported_entity` parameter from PR #2014 fully functional. - Fixed incorrect Prüfziffer algorithm in `DeHealthInsuranceRecognizer` (KVNR); now uses alternating factors [1,2,…,1,2] per § 290 SGB V Anlage 1 (#1972). - Fixed incorrect check-digit weights in `DeSocialSecurityRecognizer` (RVNR); now uses VKVV § 4 weights [2,1,2,5,7,1,2,1,2,1,2,1]. Previous weights diverged from the Deutsche Rentenversicherung specification and rejected the canonical DRV example 15070649C103. - Fixed incorrect check-digit algorithm in `DeLanrRecognizer`; now uses KBV Arztnummern-Richtlinie weights [4,9,4,9,4,9] without the spurious Quersumme step, and the complement-to-10 formula `(10 − sum mod 10) mod 10`. Previous weights and formula were internally self-consistent only. @@ -32,6 +33,7 @@ All notable changes to this project will be documented in this file. - ICAO Doc 9303 MRZ checksum validation in `DePassportRecognizer` and `DeIdCardRecognizer` (weights 7, 3, 1 repeating; letters A=10…Z=35; sum mod 10). - Structural validation improvements in `DeBsnrRecognizer` per KBV Arztnummern-Richtlinie Anlage 1; valid KV regional codes are defined for defense-in-depth/documentation purposes, but unknown prefixes are not currently rejected (no public checksum exists for BSNR). - Turkish PII recognizer for `TR_NATIONAL_ID` (TCKN) to identify Turkish National Identification Numbers using pattern match, context, and NVI checksum validation. Disabled by default. 
+- Turkish phone number detection via configurable `PhoneRecognizer` with `supported_regions=["TR"]` and `supported_entity="TR_PHONE_NUMBER"`. Supports international (+90), national (0), and local formats using the `phonenumbers` library. Disabled by default; users enable it programmatically. - Turkish PII recognizer for `TR_LICENSE_PLATE` (plaka) to identify Turkish vehicle license plates using pattern match, context, and province code validation (01-81). Disabled by default. ## [2.2.362] - 2026-03-15 diff --git a/docs/supported_entities.md b/docs/supported_entities.md index 23b6b2010..92ab17c4f 100644 --- a/docs/supported_entities.md +++ b/docs/supported_entities.md @@ -142,6 +142,7 @@ For more information, refer to the [adding new recognizers documentation](analyz | FieldType | Description | Detection Method | |------------|---------------------------------------------------------------------------------------------------------|------------------------------------------| | TR_NATIONAL_ID | The Turkish National Identification Number (TCKN) is a unique 11-digit number issued to all Turkish citizens. | Pattern match, context and checksum. | +| TR_PHONE_NUMBER | Turkish phone numbers: 10-digit numbers starting with 5 (mobile, MNP-compliant) or 2/3/4 (geographic), written in international (+90), national (0), or local format. Reference: ITU-T E.164. Enabled programmatically via `PhoneRecognizer(supported_regions=["TR"], supported_entity="TR_PHONE_NUMBER")`. | `phonenumbers` library, context and format validation. | | TR_LICENSE_PLATE | Turkish vehicle license plate (plaka): 2-digit province code (01–81), 1–3 letters (A–Z, excluding Q, W, X), and 2–4 digits. Standard civilian format only. Legal basis: KTK Madde 23. | Pattern match, context and province code validation. 
| ### Germany diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/phone_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/phone_recognizer.py index cf816ed51..5eb6a3f00 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/phone_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/phone_recognizer.py @@ -57,7 +57,7 @@ def analyze( """Analyzes text to detect phone numbers using python-phonenumbers. Iterates over entities, fetching regions, then matching regional - phone numbers patterns against the text. + phone number patterns against the text. :param text: Text to be analyzed :param entities: Entities this recognizer can detect :param nlp_artifacts: Additional metadata from the NLP engine @@ -83,7 +83,7 @@ def analyze( def _get_recognizer_result(self, match, text, region, nlp_artifacts): result = RecognizerResult( - entity_type="PHONE_NUMBER", + entity_type=self.supported_entities[0], start=match.start, end=match.end, score=self.SCORE, diff --git a/presidio-analyzer/tests/test_tr_phone_number_recognizer.py b/presidio-analyzer/tests/test_tr_phone_number_recognizer.py new file mode 100644 index 000000000..8ab0470f9 --- /dev/null +++ b/presidio-analyzer/tests/test_tr_phone_number_recognizer.py @@ -0,0 +1,176 @@ +"""Tests for Turkish phone number (TR_PHONE_NUMBER) recognizer.""" + +import pytest +from presidio_analyzer.predefined_recognizers import PhoneRecognizer + +from tests import assert_result_within_score_range + +TURKISH_CONTEXT = [ + "telefon", + "telefon numarası", + "cep telefonu", + "cep no", + "telefon no", + "numara", + "mobil telefon", + "mobil no", + "hücresel telefon", + "ara", + "ulaş", + "iletişim", + "bağlantı", + "irtibat", + "numaram", + "telefonum", + "cep telefonum", + "mobil telefonum", + "telefon numarası", + "cep numarası", + "mobil numarası", + "phone", + "mobile", + "cell", + "cellphone", + "call", + "contact", + 
"number", + "phone number", + "sms", + "mesaj", + "whatsapp", + "telegram", + "signal", + "viber", +] + + +@pytest.fixture(scope="module") +def recognizer(): + """Create a TR-configured PhoneRecognizer instance for testing.""" + return PhoneRecognizer( + supported_regions=["TR"], + supported_entity="TR_PHONE_NUMBER", + context=TURKISH_CONTEXT, + supported_language="en", + ) + + +@pytest.fixture(scope="module") +def entities(): + """Return the TR_PHONE_NUMBER entity type for testing.""" + return ["TR_PHONE_NUMBER"] + + +@pytest.mark.parametrize( + "text, expected_len, expected_positions, expected_score_ranges", + [ + # International format (+90) + ("+905321234567", 1, ((0, 13),), ((0.4, 1.0),)), + ("+90 532 123 45 67", 1, ((0, 17),), ((0.4, 1.0),)), + ("+90-532-123-45-67", 1, ((0, 17),), ((0.4, 1.0),)), + ("+90 (532) 123 45 67", 1, ((0, 19),), ((0.4, 1.0),)), + # National format (0) + ("05321234567", 1, ((0, 11),), ((0.3, 1.0),)), + ("0 532 123 45 67", 1, ((0, 15),), ((0.3, 1.0),)), + ("0-532-123-45-67", 1, ((0, 15),), ((0.3, 1.0),)), + ("0 (532) 1234567", 1, ((0, 15),), ((0.3, 1.0),)), + # Local format (just the number) + ("5321234567", 1, ((0, 10),), ((0.15, 1.0),)), + ("532 123 45 67", 1, ((0, 13),), ((0.15, 1.0),)), + ("532-123-45-67", 1, ((0, 13),), ((0.15, 1.0),)), + # In sentence with context + ( + "Telefon numaram +905321234567 olarak kayitli.", + 1, + ((16, 29),), + ((0.4, 1.0),), + ), + ( + "Cep no: 05321234567", + 1, + ((8, 19),), + ((0.3, 1.0),), + ), + ( + "Phone: 5321234567", + 1, + ((7, 17),), + ((0.15, 1.0),), + ), + # Multiple numbers + ( + "Birinci: +905321234567, Ikinci: 05359876543", + 2, + ((9, 22), (32, 43)), + ((0.4, 1.0), (0.3, 1.0)), + ), + # Geographic number: starts with 4 (valid geographic) + ("4321234567", 1, ((0, 10),), ((0.05, 1.0),)), + # Invalid: too short + ("532123456", 0, (), ()), + # Invalid: too long (11 digits without prefix) + ("53212345678", 0, (), ()), + # Invalid: not a phone number + ("hello world", 0, (), ()), + 
("1234567890", 0, (), ()), + # False positive: random 11-digit number not starting with 5 + ("12345678901", 0, (), ()), + # Geographic numbers: valid area codes (starts with 2, 3, 4) + ("2121234567", 1, ((0, 10),), ((0.05, 1.0),)), + ("3121234567", 1, ((0, 10),), ((0.05, 1.0),)), + ("4621234567", 1, ((0, 10),), ((0.05, 1.0),)), + # Geographic numbers (lower priority) + ("02121234567", 1, ((0, 11),), ((0.1, 1.0),)), + ("0216 123 45 67", 1, ((0, 14),), ((0.1, 1.0),)), + ("0232 123 45 67", 1, ((0, 14),), ((0.05, 1.0),)), + ("0312 123 45 67", 1, ((0, 14),), ((0.1, 1.0),)), + ("0412 123 45 67", 1, ((0, 14),), ((0.1, 1.0),)), + + # False positive: embedded in longer number + ("15053212345678", 0, (), ()), + # Note: a geographic-looking number such as 2023123456 is deliberately not + # listed as a match: phonenumbers does not recognize it as a valid TR number + # ("202" is not a valid Turkish area code in phonenumbers). + # The valid area code 212 is already covered by "2121234567" above. + # False positive: TCKN-like (11 digits starting with 1) + ("10000000146", 0, (), ()), + # Note: phonenumbers matches dotted format as valid TR number + ("532.123.45.67", 1, ((0, 13),), ((0.05, 1.0),)), + # False positive: Turkish plate-like + ("34 ABC 1234", 0, (), ()), + # Invalid: unused first digits in Turkey (1, 6, 7, 8, 9) + ("1123456789", 0, (), ()), + ("6123456789", 0, (), ()), + ("7123456789", 0, (), ()), + ("8123456789", 0, (), ()), + ("9123456789", 0, (), ()), + ], +) +def test_when_phone_in_text_then_all_phones_found( + text, + expected_len, + expected_positions, + expected_score_ranges, + recognizer, + entities, +): + """Test that Turkish phone number recognizer correctly identifies numbers.""" + results = recognizer.analyze(text, entities) + assert len(results) == expected_len + + for res, (st_pos, fn_pos), (st_score, fn_score) in zip( + results, expected_positions, expected_score_ranges + ): + assert_result_within_score_range( + res, entities[0], st_pos, fn_pos, st_score, fn_score + ) + + +def 
test_supported_entity(recognizer): + """Test that supported entity is correctly set.""" + assert recognizer.supported_entities == ["TR_PHONE_NUMBER"] + + +def test_supported_language(recognizer): + """Test that supported language is correctly set.""" + assert recognizer.supported_language == "en"