diff --git a/CHANGELOG.md b/CHANGELOG.md index fed35831c..a3eaf00bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ All notable changes to this project will be documented in this file. - ICAO Doc 9303 MRZ checksum validation in `DePassportRecognizer` and `DeIdCardRecognizer` (weights 7, 3, 1 repeating; letters A=10…Z=35; sum mod 10). - Structural validation improvements in `DeBsnrRecognizer` per KBV Arztnummern-Richtlinie Anlage 1; valid KV regional codes are defined for defense-in-depth/documentation purposes, but unknown prefixes are not currently rejected (no public checksum exists for BSNR). - Turkish PII recognizer for `TR_NATIONAL_ID` (TCKN) to identify Turkish National Identification Numbers using pattern match, context, and NVI checksum validation. Disabled by default. +- Turkish PII recognizer for `TR_LICENSE_PLATE` (plaka) to identify Turkish vehicle license plates using pattern match, context, and province code validation (01-81). Disabled by default. ## [2.2.362] - 2026-03-15 ### General diff --git a/docs/supported_entities.md b/docs/supported_entities.md index 1405bfbfc..e5cc1a6a7 100644 --- a/docs/supported_entities.md +++ b/docs/supported_entities.md @@ -141,6 +141,7 @@ For more information, refer to the [adding new recognizers documentation](analyz | FieldType | Description | Detection Method | |------------|---------------------------------------------------------------------------------------------------------|------------------------------------------| | TR_NATIONAL_ID | The Turkish National Identification Number (TCKN) is a unique 11-digit number issued to all Turkish citizens. | Pattern match, context and checksum. | +| TR_LICENSE_PLATE | Turkish vehicle license plate (plaka): 2-digit province code (01–81), 1–3 letters (A–Z, excluding Q, W, X), and 2–4 digits. Standard civilian format only. Legal basis: KTK Madde 23. | Pattern match, context and province code validation. 
| ### Germany diff --git a/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml b/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml index bd438a7aa..66c0ecaa9 100644 --- a/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml +++ b/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml @@ -260,6 +260,12 @@ recognizers: type: predefined enabled: false + - name: TrLicensePlateRecognizer + supported_languages: + - tr + type: predefined + enabled: false + - name: HuggingFaceNerRecognizer supported_languages: - en diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index c0c94164a..b0c939e1b 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -96,6 +96,9 @@ from .country_specific.thai.th_tnin_recognizer import ThTninRecognizer # Turkey recognizers +from .country_specific.turkey.tr_license_plate_recognizer import ( + TrLicensePlateRecognizer, +) from .country_specific.turkey.tr_national_id_recognizer import ( TrNationalIdRecognizer, ) @@ -231,6 +234,7 @@ "KrFrnRecognizer", "SeOrganisationsnummerRecognizer", "ThTninRecognizer", + "TrLicensePlateRecognizer", "TrNationalIdRecognizer", "SePersonnummerRecognizer", "LangExtractRecognizer", diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/turkey/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/turkey/__init__.py index 38e4f85aa..47327d3a6 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/turkey/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/turkey/__init__.py @@ -1,7 +1,9 @@ """Turkey-specific recognizers.""" +from .tr_license_plate_recognizer import TrLicensePlateRecognizer from 
class TrLicensePlateRecognizer(PatternRecognizer):
    """
    Recognize Turkish vehicle license plates (plaka).

    Standard civilian format: [province_code 01-81] [1-3 letters] [2-4 digits].
    Province codes: 01-81 (81 Turkish provinces).
    Letters: A-Z excluding Q, W, X (not in Turkish alphabet).

    Examples: 34 ABC 1234 (Istanbul), 06 A 123 (Ankara), 35 JK 12 (Izmir).

    Legal basis: Karayolları Trafik Kanunu (KTK) Madde 23.
    Data protection: KVKK (Kişisel Verilerin Korunması Kanunu) — license plates
    constitute personal data when linked to an identifiable vehicle owner.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        Defaults to stripping hyphens and spaces.
    :param name: Optional recognizer name, forwarded to the base class
    """

    # Two separator styles are accepted: optional single whitespace between
    # the groups, or hyphens between all three groups. Both carry the same
    # base score of 0.3.
    PATTERNS = [
        Pattern(
            "TR License Plate (space)",
            r"\b(0[1-9]|[1-7][0-9]|8[0-1])\s?[A-PR-VY-Z]{1,3}\s?\d{2,4}\b",
            0.3,
        ),
        Pattern(
            "TR License Plate (hyphen)",
            r"\b(0[1-9]|[1-7][0-9]|8[0-1])-[A-PR-VY-Z]{1,3}-\d{2,4}\b",
            0.3,
        ),
    ]

    CONTEXT = [
        "plaka",
        "araç plakası",
        "plaka numarası",
        "kayıt plakası",
        "tr plaka",
        "license plate",
        "number plate",
        "plate",
        "taşıt plakası",
        "kayıt",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "tr",
        supported_entity: str = "TR_LICENSE_PLATE",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
        name: Optional[str] = None,
    ):
        # Stored on the instance because validate_result() reads it when
        # sanitizing a match.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        # Falsy arguments (None or empty) fall back to the class defaults.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
            name=name,
        )

    def validate_result(self, pattern_text: str) -> Union[bool, None]:
        """
        Validate the matched pattern by checking province code is 01-81.

        :param pattern_text: The matched text to validate.
            Only the part in text that was detected by the regex engine
        :return: True if province code valid, False if invalid, None if the
            text is too short or does not start with two ASCII digits
        """
        sanitized_value = EntityRecognizer.sanitize_value(
            pattern_text, self.replacement_pairs
        )

        # Need the two province digits plus at least one further character.
        if len(sanitized_value) < 3:
            return None

        province_code = sanitized_value[:2]
        # str.isdigit() alone is also true for characters such as "²" or "①",
        # which int() rejects with ValueError; requiring ASCII keeps this
        # method from raising on unexpected input.
        if not (province_code.isascii() and province_code.isdigit()):
            return None

        return 1 <= int(province_code) <= 81
123", 0, (), ()), + ("hello world", 0, (), ()), + ("1234567890", 0, (), ()), + ( + "License plate 34 ABC 1234", + 1, + ((14, 25),), + ((0.5, 1.0),), + ), + ( + "Plaka numarası 06 A 123 olarak kayıtlı", + 1, + ((15, 23),), + ((0.5, 1.0),), + ), + ], +) +def test_when_license_plate_in_text_then_all_plates_found( + text, + expected_len, + expected_positions, + expected_score_ranges, + recognizer, + entities, +): + """Test that Turkish license plate recognizer correctly identifies plates.""" + results = recognizer.analyze(text, entities) + assert len(results) == expected_len + + for res, (st_pos, fn_pos), (st_score, fn_score) in zip( + results, expected_positions, expected_score_ranges + ): + assert_result_within_score_range( + res, entities[0], st_pos, fn_pos, st_score, fn_score + ) + + +def test_validate_result_with_valid_province(recognizer): + """Test validate_result with valid province codes.""" + assert recognizer.validate_result("34 ABC 1234") is True + assert recognizer.validate_result("06 A 123") is True + assert recognizer.validate_result("01 A 12") is True + assert recognizer.validate_result("81 A 12") is True + + +def test_validate_result_with_invalid_province(recognizer): + """Test validate_result with invalid province codes.""" + assert recognizer.validate_result("00 ABC 123") is False + assert recognizer.validate_result("82 ABC 123") is False + + +def test_validate_result_with_short_input(recognizer): + """Test validate_result with input shorter than 3 characters.""" + assert recognizer.validate_result("12") is None + assert recognizer.validate_result("") is None + + +def test_validate_result_with_non_numeric_province(recognizer): + """Test validate_result when province code is not numeric.""" + assert recognizer.validate_result("AB ABC 123") is None + assert recognizer.validate_result("XY 123") is None + + +def test_context_words(recognizer): + """Test that context words are properly set.""" + assert "plaka" in recognizer.context + assert "araç plakası" 
in recognizer.context + assert "license plate" in recognizer.context + + +def test_supported_entity(recognizer): + """Test that supported entity is correctly set.""" + assert recognizer.supported_entities == ["TR_LICENSE_PLATE"] + + +def test_supported_language(recognizer): + """Test that supported language is correctly set.""" + assert recognizer.supported_language == "tr"