diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7087f3ace..32a03e954 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file.
 
 ### Analyzer
 #### Added
+- US CLIA (`US_CLIA`) recognizer for the Clinical Laboratory Improvement Amendments lab identifier (CMS), using regex pattern matching (`NN D NNNNNNN`) and context words. No publicly documented checksum exists for CLIA numbers, so the base pattern carries a low score and relies on lab/laboratory/CLIA context to reach a confident match. Disabled by default.
 - Optional `countries` filter on `RecognizerRegistry.load_predefined_recognizers()` to scope predefined country-specific recognizers to a subset of locales (e.g. `countries=["us", "uk"]`). The same filter is also exposed as a top-level `supported_countries` field in the recognizer-registry YAML, mirroring `supported_languages`, and as an advisory per-recognizer `country_code:` field on every predefined country-specific entry in `default_recognizers.yaml` (cross-checked against the class attribute at load time). Country tagging works via two reconciled paths: the class-level `EntityRecognizer.COUNTRY_CODE` ClassVar (canonical for predefined recognizers) and the new `country_code` constructor kwarg on `EntityRecognizer` / `PatternRecognizer` (the path for custom recognizers without a subclass — flows through `PatternRecognizer.from_dict` so YAML `type: custom` entries can declare `country_code:` directly). Conflicting values raise `ValueError` at construction time so a predefined country recognizer can never be silently re-tagged. The resolved tag is read via the `country_code()` and `is_country_specific()` instance methods, and serialized through `to_dict()` / `from_dict()` for round-tripping. Inputs to the `countries` filter are validated up front (rejects bare strings, non-iterables, non-string elements, and blank codes). Locale-agnostic recognizers and untagged custom recognizers are always loaded regardless of the filter, preserving backwards compatibility. Adds `RecognizerRegistry.get_country_codes()` for introspection and a `WARNING` log when a requested country has no matching recognizer. See `docs/analyzer/filtering_by_country.md`. Fixes #1328.
 - Canadian SIN (`CA_SIN`) recognizer for the Canadian Social Insurance Number, using regex pattern matching, context words (English and French), and Luhn checksum validation. Disabled by default.
 
diff --git a/docs/supported_entities.md b/docs/supported_entities.md
index 23b6b2010..12c1761d9 100644
--- a/docs/supported_entities.md
+++ b/docs/supported_entities.md
@@ -31,6 +31,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
 |Entity Type|Description|Detection Method|
 |--- |--- |--- |
 |US_BANK_NUMBER|A US bank account number is between 8 to 17 digits.|Pattern match and context|
+|US_CLIA|A US Clinical Laboratory Improvement Amendments (CLIA) lab number issued by CMS, 10 characters in the form `NN D NNNNNNN`.|Pattern match and context|
 |US_DRIVER_LICENSE|A US driver license according to |Pattern match and context|
 |US_ITIN | US Individual Taxpayer Identification Number (ITIN). Nine digits that start with a "9" and contain a "7" or "8" as the 4 digit.|Pattern match and context|
 |US_MBI|A US Medicare Beneficiary Identifier (MBI) with 11 alphanumeric characters.|Pattern match and context|
diff --git a/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml b/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml
index 8d80d73d6..10cfda87c 100644
--- a/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml
+++ b/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml
@@ -81,6 +81,13 @@ recognizers:
     enabled: false
     country_code: us
 
+  - name: UsCliaRecognizer
+    supported_languages:
+    - en
+    type: predefined
+    enabled: false
+    country_code: us
+
   - name: NhsRecognizer
     supported_languages:
     - en
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
index 09643bcc4..ff6704b39 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
@@ -120,6 +120,7 @@
 from .country_specific.us.aba_routing_recognizer import AbaRoutingRecognizer
 from .country_specific.us.medical_license_recognizer import MedicalLicenseRecognizer
 from .country_specific.us.us_bank_recognizer import UsBankRecognizer
+from .country_specific.us.us_clia_recognizer import UsCliaRecognizer
 from .country_specific.us.us_driver_license_recognizer import UsLicenseRecognizer
 from .country_specific.us.us_itin_recognizer import UsItinRecognizer
 from .country_specific.us.us_mbi_recognizer import UsMbiRecognizer
@@ -190,6 +191,7 @@
     "SgFinRecognizer",
     "UrlRecognizer",
     "UsBankRecognizer",
+    "UsCliaRecognizer",
     "UsItinRecognizer",
     "UsLicenseRecognizer",
     "UsMbiRecognizer",
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/__init__.py
index 6e80dbbae..229aa5c26 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/__init__.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/__init__.py
@@ -3,6 +3,7 @@
 from .aba_routing_recognizer import AbaRoutingRecognizer
 from .medical_license_recognizer import MedicalLicenseRecognizer
 from .us_bank_recognizer import UsBankRecognizer
+from .us_clia_recognizer import UsCliaRecognizer
 from .us_driver_license_recognizer import UsLicenseRecognizer
 from .us_itin_recognizer import UsItinRecognizer
 from .us_mbi_recognizer import UsMbiRecognizer
@@ -14,6 +15,7 @@
     "MedicalLicenseRecognizer",
     "UsItinRecognizer",
     "UsBankRecognizer",
+    "UsCliaRecognizer",
     "UsLicenseRecognizer",
     "UsMbiRecognizer",
     "UsNpiRecognizer",
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/us_clia_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/us_clia_recognizer.py
new file mode 100644
index 000000000..ba2e84434
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/us_clia_recognizer.py
@@ -0,0 +1,109 @@
+"""Recognizer for US CLIA (Clinical Laboratory Improvement Amendments) numbers."""
+
+from typing import List, Optional, Tuple
+
+from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer
+
+
+class UsCliaRecognizer(PatternRecognizer):
+    """Recognize US CLIA (Clinical Laboratory Improvement Amendments) numbers.
+
+    A CLIA number uniquely identifies a clinical laboratory certified under the
+    CLIA program administered by CMS (Centers for Medicare & Medicaid Services).
+    CLIA numbers appear on lab orders, lab reports, and Medicare claims for
+    laboratory services.
+
+    Format: 10 characters, ``NN D NNNNNNN``
+      - Positions 1-2: 2-digit state code (numeric)
+      - Position 3: literal letter ``D`` (designates "lab")
+      - Positions 4-10: 7-digit unique sequence
+
+    Example: ``11D2030122``
+
+    No publicly documented check-digit algorithm exists for CLIA numbers, so
+    this recognizer is regex + context only. The base patterns therefore carry
+    a low confidence and rely on the surrounding context words listed in
+    ``CONTEXT`` ("clia", "lab", "laboratory", ...) to reach a meaningful score.
+
+    Reference: https://www.cms.gov/medicare/quality/clinical-laboratory-improvement-amendments
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    :param replacement_pairs: List of tuples with potential replacement values
+        for different strings to be used during pattern matching.
+        This can allow a greater variety in input, for example by removing dashes
+        or spaces.
+    :param name: Name of this recognizer instance, passed to PatternRecognizer
+    """
+
+    COUNTRY_CODE = "us"
+
+    PATTERNS = [
+        Pattern(
+            "CLIA number (weak)",
+            r"\b\d{2}[Dd]\d{7}\b",
+            0.1,
+        ),
+        Pattern(
+            "CLIA number with separators (medium)",
+            r"\b\d{2}[ -][Dd][ -]\d{7}\b",
+            0.4,
+        ),
+    ]
+
+    CONTEXT = [
+        "clia",
+        "clia number",
+        "clia id",
+        "lab",
+        "laboratory",
+        "clinical laboratory",
+        "lab id",
+        "lab number",
+    ]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "US_CLIA",
+        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
+        name: Optional[str] = None,
+    ):
+        self.replacement_pairs = (
+            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
+        )
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+            name=name,
+        )
+
+    def invalidate_result(self, pattern_text: str) -> bool:
+        """Check whether a regex match should be discarded.
+
+        :param pattern_text: Text matched by one of the patterns
+        :return: True if the match is not a plausible CLIA number
+        """
+        sanitized_value = EntityRecognizer.sanitize_value(
+            pattern_text, self.replacement_pairs
+        )
+        # Defensive: pattern already enforces length, but guard anyway.
+        if len(sanitized_value) != 10:
+            return True
+        # Position 3 must be a 'D' (case-insensitive); already enforced by the
+        # regex, but kept as an explicit assertion for the no-separator path.
+        if sanitized_value[2].upper() != "D":
+            return True
+        # Reject degenerate sequences where all 7 trailing digits are identical
+        # (e.g., 11D0000000, 11D1111111). These almost certainly represent
+        # placeholders rather than real CLIA numbers.
+        trailing = sanitized_value[3:]
+        return len(set(trailing)) == 1
diff --git a/presidio-analyzer/tests/test_us_clia_recognizer.py b/presidio-analyzer/tests/test_us_clia_recognizer.py
new file mode 100644
index 000000000..e3c2ebe93
--- /dev/null
+++ b/presidio-analyzer/tests/test_us_clia_recognizer.py
@@ -0,0 +1,116 @@
+import pytest
+
+from tests import assert_result_within_score_range
+from presidio_analyzer.predefined_recognizers import UsCliaRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return UsCliaRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["US_CLIA"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions, expected_score_ranges",
+    [
+        # fmt: off
+        # Valid CLIA, weak base score (no separators, no context)
+        ("11D2030122", 1, ((0, 10),), ((0.0, 0.4),),),
+        # Valid CLIA, lowercase 'd' is also accepted (case-insensitive global flag)
+        ("11d2030122", 1, ((0, 10),), ((0.0, 0.4),),),
+        # Valid CLIA with dashes — medium score from the separator pattern
+        ("11-D-2030122", 1, ((0, 12),), ((0.3, 0.6),),),
+        # Valid CLIA with spaces — medium score from the separator pattern
+        ("11 D 2030122", 1, ((0, 12),), ((0.3, 0.6),),),
+        # Valid CLIA with mixed separators — still the separator pattern
+        ("11-D 2030122", 1, ((0, 12),), ((0.3, 0.6),),),
+        # CLIA inside text with explicit "CLIA:" prefix — base regex score; the
+        # AnalyzerEngine layer applies context boosting on top of this.
+        (
+            "CLIA: 11D2030122",
+            1, ((6, 16),), ((0.0, 0.4),),
+        ),
+        # CLIA in a sentence with "laboratory" context — bare recognizer score
+        (
+            "Laboratory ID 22D9876543 was on the report",
+            1, ((14, 24),), ((0.0, 0.4),),
+        ),
+        # Multiple CLIA numbers in text
+        (
+            "Labs 11D2030122 and 33D4455667 sent results",
+            2, ((5, 15), (20, 30),), ((0.0, 0.4), (0.0, 0.4),),
+        ),
+        # Invalid: position 3 is not 'D'
+        ("11X2030122", 0, (), (),),
+        # Invalid: too short (9 chars)
+        ("11D203012", 0, (), (),),
+        # Invalid: too long (11 chars)
+        ("11D20301223", 0, (), (),),
+        # Invalid: starts with a letter (positions 1-2 must be digits)
+        ("AAD2030122", 0, (), (),),
+        # Invalid: embedded in a longer alphanumeric token (no word boundary)
+        ("A11D2030122", 0, (), (),),
+        # Invalid: all trailing digits identical — degenerate, invalidated
+        ("11D0000000", 0, (), (),),
+        ("11D1111111", 0, (), (),),
+        # Invalid: degenerate trailing digits are rejected with separators too
+        ("11-D-0000000", 0, (), (),),
+        # fmt: on
+    ],
+)
+def test_when_clia_in_text_then_all_us_clias_are_found(
+    text,
+    expected_len,
+    expected_positions,
+    expected_score_ranges,
+    recognizer,
+    entities,
+    max_score,
+):
+    results = recognizer.analyze(text, entities)
+    results = sorted(results, key=lambda x: x.start)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos), (st_score, fn_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        if st_score == "max":
+            st_score = max_score
+        if fn_score == "max":
+            fn_score = max_score
+        assert_result_within_score_range(
+            res, entities[0], st_pos, fn_pos, st_score, fn_score
+        )
+
+
+def test_clia_recognizer_supported_entity(recognizer):
+    """Test that recognizer supports the correct entity."""
+    assert recognizer.supported_entities == ["US_CLIA"]
+
+
+def test_clia_recognizer_supported_language(recognizer):
+    """Test that recognizer supports English by default."""
+    assert recognizer.supported_language == "en"
+
+
+def test_clia_recognizer_context_words(recognizer):
+    """Test that recognizer has appropriate context words."""
+    expected_context = [
+        "clia",
+        "clia number",
+        "clia id",
+        "lab",
+        "laboratory",
+        "clinical laboratory",
+        "lab id",
+        "lab number",
+    ]
+    assert recognizer.context == expected_context
+
+
+def test_clia_recognizer_country_code(recognizer):
+    """Test that recognizer is tagged as US-specific."""
+    assert recognizer.COUNTRY_CODE == "us"