diff --git a/CHANGELOG.md b/CHANGELOG.md index 7087f3ace1..3bd81cfad7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -799,6 +799,7 @@ New endpoint for deanonymizing encrypted entities by the anonymizer. [2.2.1]: https://github.com/microsoft/presidio/compare/2.2.0...2.2.1 ## Unreleased +- feat(analyzer): Add Philippines license plate recognizer (PH_LICENSE_PLATE) (#2015) ### Fixed - Fixed an issue where the CreditCardRecognizer regex could incorrectly identify 13-digit Unix timestamps as credit card numbers. Validated that 13 digit numbers that start with `1` and have no separators (e.g. `1748503543012`) are not flagged as credit cards. diff --git a/docs/supported_entities.md b/docs/supported_entities.md index 23b6b20105..3ed2267ce2 100644 --- a/docs/supported_entities.md +++ b/docs/supported_entities.md @@ -120,6 +120,14 @@ For more information, refer to the [adding new recognizers documentation](analyz | NG_NIN | The Nigerian National Identification Number (NIN) is a unique 11-digit number issued by the National Identity Management Commission (NIMC). | Pattern match, context, and checksum | | NG_VEHICLE_REGISTRATION | Nigerian vehicle registration plate number in the current format (2011+): 3 letters (LGA code), 3 digits (serial), 2 letters (year/batch). | Pattern match and context | +### Philippines + +|FieldType|Description|Detection Method| +|--- |--- |--- | +|PH_LICENSE_PLATE|Philippine vehicle license plate number. Supports modern private plates (3 letters + 4 digits, e.g. ABC 1234), motorcycle plates (4 digits + 3 letters, e.g. 1234 ABC), legacy private plates (2 letters + 4 digits, e.g. AB 1234), and conduction stickers (e.g. C12D345). 
Space, dash, or no separator accepted (legacy plates require a space or dash).|Pattern match and context| + + + ### Canada |FieldType|Description|Detection Method| diff --git a/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml b/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml index 8d80d73d6a..69a4114602 100644 --- a/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml +++ b/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml @@ -472,11 +472,18 @@ recognizers: type: predefined enabled: false config_path: presidio_analyzer/conf/langextract_config_basic.yaml - - - name: CaSinRecognizer + +- name: CaSinRecognizer supported_languages: - en - fr - type: predefined + - type: predefined enabled: false country_code: ca + + - name: PhLicensePlateRecognizer + supported_languages: + - en + - type: predefined + enabled: false + country_code: ph \ No newline at end of file diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index 09643bcc45..42b85e6bec 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -75,6 +75,9 @@ NgVehicleRegistrationRecognizer, ) +# Philippines recognizers +from .country_specific.philippines.ph_license_plate_recognizer import PhLicensePlateRecognizer + # Poland recognizers from .country_specific.poland.pl_pesel_recognizer import PlPeselRecognizer @@ -245,6 +248,7 @@ "KrPassportRecognizer", "NgNinRecognizer", "NgVehicleRegistrationRecognizer", + "PhLicensePlateRecognizer", "MedicalNERRecognizer", # Germany recognizers "DeTaxIdRecognizer", diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/__init__.py index cb1ffd3acc..70090dae63 100644 --- 
a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/__init__.py @@ -1,7 +1,9 @@ """Country-specific recognizers package.""" from .canada.ca_sin_recognizer import CaSinRecognizer +from .philippines.ph_license_plate_recognizer import PhLicensePlateRecognizer __all__ = [ "CaSinRecognizer", + "PhLicensePlateRecognizer", ] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/philippines/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/philippines/__init__.py new file mode 100644 index 0000000000..4ea5927864 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/philippines/__init__.py @@ -0,0 +1,7 @@ +"""Philippines-specific recognizers.""" + +from .ph_license_plate_recognizer import PhLicensePlateRecognizer + +__all__ = [ + "PhLicensePlateRecognizer", +] \ No newline at end of file diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/philippines/ph_license_plate_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/philippines/ph_license_plate_recognizer.py new file mode 100644 index 0000000000..64a804a9a6 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/philippines/ph_license_plate_recognizer.py @@ -0,0 +1,192 @@ +""" +Philippines License Plate Recognizer for Microsoft Presidio +============================================================ +Detects Philippine vehicle license plate numbers in unstructured text. + +Plate formats covered +--------------------- +1. Private / standard : ABC 1234 (3 letters + space + 4 digits) – LTO format since 2013 +2. Old / legacy : AB 1234 (2 letters + space + 4 digits) – pre-2013 private plates +3. Motorcycle : 1234 ABC (4 digits + space + 3 letters) – current motorcycle format +4. 
Conduction sticker : C12D345 (letter + 2 digits + letter + 3-4 digits, no separators) – handled by + a looser pattern at lower confidence +5. Separator variants : space, dash ( - ) or no separator; legacy plates require a space or dash + +Base confidence scores (before context enhancement) +--------------------------------------------------- +- 0.75 Strong : modern 3-letter / 4-digit format +- 0.60 Moderate : motorcycle 4-digit / 3-letter format +- 0.50 Moderate : legacy 2-letter / 4-digit format (space or dash required) +- 0.40 Weak : conduction sticker format +""" + +import re +import logging +from typing import List, Optional + +from presidio_analyzer import Pattern, PatternRecognizer, RecognizerResult +from presidio_analyzer.nlp_engine import NlpArtifacts + +logger = logging.getLogger("presidio-analyzer") + + +class PhLicensePlateRecognizer(PatternRecognizer): + """ + Recognizer for Philippine vehicle license plate numbers. + + Inherits from PatternRecognizer, which provides the standard + Presidio regex-matching pipeline including context-aware score + enhancement via LemmaContextAwareEnhancer. + """ + + # ------------------------------------------------------------------ # + # Regex patterns # + # ------------------------------------------------------------------ # + + # Modern private plate e.g. "ABC 1234", "ABC-1234", "ABC1234" + PATTERN_MODERN = Pattern( + name="ph_plate_modern", + regex=r"\b([A-Z]{3})[\s\-]?(\d{4})\b", + score=0.75, + ) + + # Motorcycle plate e.g. "1234 ABC", "1234-ABC", "1234ABC" + PATTERN_MOTORCYCLE = Pattern( + name="ph_plate_motorcycle", + regex=r"\b(\d{4})[\s\-]?([A-Z]{3})\b", + score=0.60, + ) + + # Legacy private plate e.g. "AB 1234", "AB-1234" (separator required) + PATTERN_LEGACY = Pattern( + name="ph_plate_legacy", + regex=r"\b([A-Z]{2})[\s\-](\d{4})\b", + score=0.50, + ) + + # Conduction sticker e.g. 
"C12D345" – used before official plates are issued + PATTERN_CONDUCTION = Pattern( + name="ph_plate_conduction", + regex=r"\b([A-Z]\d{2}[A-Z]\d{3,4})\b", + score=0.40, + ) + + # ------------------------------------------------------------------ # + # Context words # + # ------------------------------------------------------------------ # + + CONTEXT = [ + # English + "plate", + "license", + "licence", + "registration", + "vehicle", + "car", + "truck", + "motorcycle", + "motorbike", + "mv", # motor vehicle + "lto", # Land Transportation Office + "conduction", + "sticker", + # Filipino / Tagalog + "plaka", # plate (Tagalog) + "sasakyan", # vehicle + "rehistro", # registration + "kotse", # car + "trak", # truck + "motorsiklo", # motorcycle + ] + + SUPPORTED_ENTITY = "PH_LICENSE_PLATE" + SUPPORTED_LANGUAGE = "en" + + def __init__( + self, + patterns: Optional[List[Pattern]] = None, + context: Optional[List[str]] = None, + supported_language: str = SUPPORTED_LANGUAGE, + supported_entity: str = SUPPORTED_ENTITY, + ): + patterns = patterns or [ + self.PATTERN_MODERN, + self.PATTERN_MOTORCYCLE, + self.PATTERN_LEGACY, + self.PATTERN_CONDUCTION, + ] + context = context or self.CONTEXT + + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + context=context, + supported_language=supported_language, + global_regex_flags=re.MULTILINE | re.DOTALL, # no IGNORECASE on purpose + ) + + # ------------------------------------------------------------------ # + # Optional: post-processing / invalidation logic # + # ------------------------------------------------------------------ # + + def validate_result(self, pattern_text: str) -> Optional[bool]: + """ + Validate the regex-matched text ``pattern_text`` and veto obvious non-plates. 
+ + Returns: + True – definitely valid (score kept / boosted) + False – definitely invalid (result invalidated) + None – uncertain (score unchanged) + """ + upper = pattern_text.upper().replace(" ", "").replace("-", "") + + # Reject all-digit or all-letter strings that slipped through + if upper.isdigit() or upper.isalpha(): + return False + + # Reject bare hex colour codes; "AB 1234" (separated) is a legacy plate + if re.fullmatch(r"[A-Fa-f0-9]{6}", pattern_text.strip()): + return False + + return None # let the base class / context enhancer decide + + +# --------------------------------------------------------------------------- # +# Quick smoke-test (run: python ph_license_plate_recognizer.py) # +# --------------------------------------------------------------------------- # + +if __name__ == "__main__": + from presidio_analyzer import AnalyzerEngine, RecognizerRegistry + + samples = [ + # Modern plates + "The vehicle with plate ABC 1234 was seen near EDSA.", + "Plate number: XYZ-5678", + "Nakita ang kotse na may plaka na DEF 9012.", # Tagalog + # Motorcycle + "The motorcycle 4567 GHI cut through traffic.", + # Legacy + "Old plate PQ 3456 still on the road.", + # Conduction + "Conduction sticker C12D345 was affixed to the windshield.", + # Should NOT match + "My ZIP code is 90210.", + "The HTML color is #FF5733.", + ] + + registry = RecognizerRegistry() + registry.add_recognizer(PhLicensePlateRecognizer()) + + engine = AnalyzerEngine(registry=registry) + + print(f"{'TEXT':<60} {'ENTITY':<20} {'SCORE':<6} {'MATCH'}") + print("-" * 110) + for text in samples: + results = engine.analyze(text=text, language="en", + entities=[PhLicensePlateRecognizer.SUPPORTED_ENTITY]) + if results: + for r in results: + matched = text[r.start:r.end] + print(f"{text:<60} {r.entity_type:<20} {r.score:<6.2f} '{matched}'") + else: + print(f"{text:<60} {'(no match)':<20}") diff --git a/presidio-analyzer/tests/data/ph_license_plate_dataset.csv b/presidio-analyzer/tests/data/ph_license_plate_dataset.csv 
new file 
mode 100644 index 0000000000..11ce9f8950 --- /dev/null +++ b/presidio-analyzer/tests/data/ph_license_plate_dataset.csv @@ -0,0 +1,51 @@ +sentence,PH_LICENSE_PLATE +"The vehicle with plate ABC 1234 was flagged at the checkpoint.","ABC 1234" +"Nakita ang kotse na may plaka na XYZ 5678 sa EDSA.","XYZ 5678" +"LTO records show DEF-9012 was registered last year.","DEF-9012" +"The motorcycle 4567 GHI was involved in the incident.","4567 GHI" +"Plate number 9999-ZZZ was reported stolen.","9999-ZZZ" +"Conduction sticker C12D345 was affixed to the windshield.","C12D345" +"Old plate PQ 3456 is still active per LTO records.","PQ 3456" +"The truck with registration ABC1234 ran a red light.","ABC1234" +"Ang sasakyan na may plaka MNO 2345 ay natagpuan sa Maynila.","MNO 2345" +"Vehicle plate GHI-6789 was last seen near the tollway.","GHI-6789" +"The conduction sticker A98B1234 was issued three months ago.","A98B1234" +"Rehistro ng kotse: JKL 3456 — kulay pula.","JKL 3456" +"Motorcycle plate 1234ABC belongs to the suspect.","1234ABC" +"LTO flagged STU-7890 for expired registration.","STU-7890" +"Ang plaka ng motorsiklo ay 5678 VWX.","5678 VWX" +"The white SUV with plate BCD 2345 was seen at 3am.","BCD 2345" +"Conduction sticker B34C567 observed near the scene.","B34C567" +"Legacy plate RS 4567 belongs to a 1998 model vehicle.","RS 4567" +"The vehicle EFG-3456 has three outstanding violations.","EFG-3456" +"Nakita ng pulis ang motorsiklo na 6789 HIJ.","6789 HIJ" +"EDSA checkpoint recorded plate LMN 4567 at 11pm.","LMN 4567" +"Registration document shows conduction sticker D56E789.","D56E789" +"The owner of plate OPQ-5678 has not renewed since 2021.","OPQ-5678" +"Ang plaka ng truck ay RST 6789.","RST 6789" +"Motorcycle 7890 UVW was caught on CCTV near the scene.","7890 UVW" +"My phone number is 0917-123-4567.", +"The ZIP code for that area is 1008.", +"Order reference number is INV-2024-001.", +"Please call 02-8888-8888 for inquiries.", +"The meeting is scheduled for 12-03-2024.", 
+"Her employee ID is AB-9900.", +"The hex color code is #ABC123.", +"Serial number: SN-456789.", +"The time was 12:34 PM.", +"Flight number PR 456 departs at noon.", +"The password must be at least 8 characters.", +"Transaction ID: TXN-20240315-001.", +"File abc 1234 was not found on the server.", +"Room 1234 is on the third floor.", +"The item weighs 4567 grams.", +"IP address 192.168.1.1 was flagged.", +"His age is 34 years old.", +"The PIN is 1234.", +"Version 3.4.5 was released last week.", +"Model number XYZ-001 is discontinued.", +"The registration expired in 2019.", +"Her student ID is 2021-12345.", +"Bank account ending in 5678.", +"The batch code is LOT-ABC-2024.", +"Case number CV-2024-001 was filed.", diff --git a/presidio-analyzer/tests/test_ph_license_plate_recognizer.py b/presidio-analyzer/tests/test_ph_license_plate_recognizer.py new file mode 100644 index 0000000000..697c58ad5b --- /dev/null +++ b/presidio-analyzer/tests/test_ph_license_plate_recognizer.py @@ -0,0 +1,201 @@ +import pytest +import time +from presidio_analyzer.predefined_recognizers import PhLicensePlateRecognizer +from tests.assertions import assert_result_within_score_range + + +@pytest.fixture(scope="module") +def recognizer(): + """Return a PhLicensePlateRecognizer instance.""" + return PhLicensePlateRecognizer() + + +@pytest.fixture(scope="module") +def entities(): + """Return the PH_LICENSE_PLATE entity list.""" + return ["PH_LICENSE_PLATE"] + + +@pytest.mark.parametrize( + "text, expected_len, expected_positions, expected_score_ranges", + [ + # fmt: off + + # ── Modern private plates (ABC 1234) ────────────────────────────── + # Valid: space separator + ( + "ABC 1234", + 1, + ((0, 8),), + ((0.7, 0.8),), + ), + # Valid: dash separator + ( + "XYZ-5678", + 1, + ((0, 8),), + ((0.7, 0.8),), + ), + # Valid: no separator + ( + "DEF9012", + 1, + ((0, 7),), + ((0.7, 0.8),), + ), + # Valid: embedded in text + ( + "The vehicle with plate ABC 1234 was seen near EDSA.", + 1, + ((23, 31),), 
+ ((0.7, 0.8),), + ), + # Valid: multiple plates in one string + ( + "Plates ABC 1234 and XYZ 5678 were flagged.", + 2, + ((7, 15), (20, 28)), + ((0.7, 0.8), (0.7, 0.8)), + ), + + # ── Motorcycle plates (1234 ABC) ────────────────────────────────── + # Valid: space separator + ( + "4567 GHI", + 1, + ((0, 8),), + ((0.55, 0.65),), + ), + # Valid: dash separator + ( + "9999-ZZZ", + 1, + ((0, 8),), + ((0.55, 0.65),), + ), + # Valid: no separator + ( + "1234ABC", + 1, + ((0, 7),), + ((0.55, 0.65),), + ), + # Valid: embedded in text + ( + "Motorcycle 4567 GHI sped away.", + 1, + ((11, 19),), + ((0.55, 0.65),), + ), + + # ── Legacy private plates (AB 1234) ─────────────────────────────── + # Valid: space separator + ( + "PQ 3456", + 1, + ((0, 7),), + ((0.45, 0.55),), + ), + # Valid: dash separator + ( + "MN-7890", + 1, + ((0, 7),), + ((0.45, 0.55),), + ), + # Valid: embedded in text + ( + "Old vehicle PQ 3456 still registered.", + 1, + ((12, 19),), + ((0.45, 0.55),), + ), + + # ── Conduction stickers (C12D345) ───────────────────────────────── + # Valid: standard + ( + "C12D345", + 1, + ((0, 7),), + ((0.35, 0.45),), + ), + # Valid: longer digits + ( + "A98B1234", + 1, + ((0, 8),), + ((0.35, 0.45),), + ), + # Valid: embedded in text + ( + "Conduction sticker C12D345 on windshield.", + 1, + ((19, 26),), + ((0.35, 0.45),), + ), + + # ── Tagalog context words ───────────────────────────────────────── + # Valid: plaka (Tagalog for plate) + ( + "Nakita ang plaka na ABC 1234 sa EDSA.", + 1, + ((20, 28),), + ((0.7, 1.0),), + ), + # Valid: kotse (Tagalog for car) + ( + "Ang kotse na XYZ 5678 ay itim.", + 1, + ((13, 21),), + ((0.7, 1.0),), + ), + + # ── Negative cases ──────────────────────────────────────────────── + # Invalid: all digits (zip code) + ( + "My zip code is 90210.", + 0, + (), + (), + ), + # Invalid: lowercase letters (case-sensitive matching enforced) + ( + "File abc 1234 not found.", + 0, + (), + (), + ), + # Invalid: empty string + ( + "", + 0, + (), + (), + ), + 
# fmt: on + ], +) +def test_when_plate_in_text_then_all_ph_plates_found( + text, + expected_len, + expected_positions, + expected_score_ranges, + recognizer, + entities, +): + """Test Philippine license plate detection for valid and invalid inputs.""" + results = recognizer.analyze(text, entities) + assert len(results) == expected_len + for res, (st_pos, fn_pos), (st_score, fn_score) in zip( + results, expected_positions, expected_score_ranges + ): + assert_result_within_score_range( + res, entities[0], st_pos, fn_pos, st_score, fn_score + ) + +def test_performance(recognizer, entities): # coarse latency smoke check + text = ("The vehicle with plate ABC 1234 was seen near EDSA. " * 4) # 4 sentences, ~40 words + start = time.perf_counter() # monotonic, high-resolution clock + recognizer.analyze(text, entities) + elapsed = (time.perf_counter() - start) * 1000 + assert elapsed < 100, f"Recognizer too slow: {elapsed:.1f}ms"