microsoft · Manas103 · May 17, 2026 · omri374 · May 18, 2026 · omri374
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file.
 
 ### Analyzer
 #### Added
+- US CLIA (`US_CLIA`) recognizer for the Clinical Laboratory Improvement Amendments lab identifier (CMS), using regex pattern matching (`NN D NNNNNNN`) and context words. No publicly documented checksum exists for CLIA numbers, so the base pattern carries a low score and relies on lab/laboratory/CLIA context to reach a confident match. Disabled by default.
 - Optional `countries` filter on `RecognizerRegistry.load_predefined_recognizers()` to scope predefined country-specific recognizers to a subset of locales (e.g. `countries=["us", "uk"]`). The same filter is also exposed as a top-level `supported_countries` field in the recognizer-registry YAML, mirroring `supported_languages`, and as an advisory per-recognizer `country_code:` field on every predefined country-specific entry in `default_recognizers.yaml` (cross-checked against the class attribute at load time). Country tagging works via two reconciled paths: the class-level `EntityRecognizer.COUNTRY_CODE` ClassVar (canonical for predefined recognizers) and the new `country_code` constructor kwarg on `EntityRecognizer` / `PatternRecognizer` (the path for custom recognizers without a subclass — flows through `PatternRecognizer.from_dict` so YAML `type: custom` entries can declare `country_code:` directly). Conflicting values raise `ValueError` at construction time so a predefined country recognizer can never be silently re-tagged. The resolved tag is read via the `country_code()` and `is_country_specific()` instance methods, and serialized through `to_dict()` / `from_dict()` for round-tripping. Inputs to the `countries` filter are validated up front (rejects bare strings, non-iterables, non-string elements, and blank codes). Locale-agnostic recognizers and untagged custom recognizers are always loaded regardless of the filter, preserving backwards compatibility. Adds `RecognizerRegistry.get_country_codes()` for introspection and a `WARNING` log when a requested country has no matching recognizer. See `docs/analyzer/filtering_by_country.md`. Fixes #1328.
 - Canadian SIN (`CA_SIN`) recognizer for the Canadian Social Insurance Number, using regex pattern matching, context words (English and French), and Luhn checksum validation. Disabled by default.
 

diff --git a/docs/supported_entities.md b/docs/supported_entities.md
@@ -31,6 +31,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
 |Entity Type|Description|Detection Method|
 |--- |--- |--- |
 |US_BANK_NUMBER|A US bank account number is between 8 to 17 digits.|Pattern match and context|
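+|US_CLIA|A US Clinical Laboratory Improvement Amendments (CLIA) lab number issued by CMS: 10 characters in the form `NNDNNNNNNN` (two digits, the letter `D`, seven digits), optionally written with a space or dash around the `D` (`NN D NNNNNNN`, `NN-D-NNNNNNN`).|Pattern match and context|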
 |US_DRIVER_LICENSE|A US driver license according to <https://ntsi.com/drivers-license-format/>|Pattern match and context|
 |US_ITIN | US Individual Taxpayer Identification Number (ITIN). Nine digits that start with a "9" and contain a "7" or "8" as the 4 digit.|Pattern match and context|
 |US_MBI|A US Medicare Beneficiary Identifier (MBI) with 11 alphanumeric characters.|Pattern match and context|

diff --git a/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml b/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml
@@ -81,6 +81,13 @@ recognizers:
     enabled: false
     country_code: us
 
+  - name: UsCliaRecognizer
+    supported_languages:
+    - en
+    type: predefined
+    enabled: false
+    country_code: us
+
   - name: NhsRecognizer
     supported_languages: 
     - en

diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
@@ -120,6 +120,7 @@
 from .country_specific.us.aba_routing_recognizer import AbaRoutingRecognizer
 from .country_specific.us.medical_license_recognizer import MedicalLicenseRecognizer
 from .country_specific.us.us_bank_recognizer import UsBankRecognizer
+from .country_specific.us.us_clia_recognizer import UsCliaRecognizer
 from .country_specific.us.us_driver_license_recognizer import UsLicenseRecognizer
 from .country_specific.us.us_itin_recognizer import UsItinRecognizer
 from .country_specific.us.us_mbi_recognizer import UsMbiRecognizer
@@ -190,6 +191,7 @@
     "SgFinRecognizer",
     "UrlRecognizer",
     "UsBankRecognizer",
+    "UsCliaRecognizer",
     "UsItinRecognizer",
     "UsLicenseRecognizer",
     "UsMbiRecognizer",

diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/us/__init__.py
@@ -3,6 +3,7 @@
 from .aba_routing_recognizer import AbaRoutingRecognizer
 from .medical_license_recognizer import MedicalLicenseRecognizer
 from .us_bank_recognizer import UsBankRecognizer
+from .us_clia_recognizer import UsCliaRecognizer
 from .us_driver_license_recognizer import UsLicenseRecognizer
 from .us_itin_recognizer import UsItinRecognizer
 from .us_mbi_recognizer import UsMbiRecognizer
@@ -14,6 +15,7 @@
     "MedicalLicenseRecognizer",
     "UsItinRecognizer",
     "UsBankRecognizer",
+    "UsCliaRecognizer",
     "UsLicenseRecognizer",
     "UsMbiRecognizer",
     "UsNpiRecognizer",

diff --git a/...alyzer/presidio_analyzer/predefined_recognizers/country_specific/us/us_clia_recognizer.py b/...alyzer/presidio_analyzer/predefined_recognizers/country_specific/us/us_clia_recognizer.py
@@ -0,0 +1,105 @@
+"""Recognizer for US CLIA (Clinical Laboratory Improvement Amendments) numbers."""
+
+from typing import List, Optional, Tuple
+
+from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer
+
+
+class UsCliaRecognizer(PatternRecognizer):
+    """Recognize US CLIA (Clinical Laboratory Improvement Amendments) numbers.
+
+    A CLIA number uniquely identifies a clinical laboratory certified under the
+    CLIA program administered by CMS (Centers for Medicare & Medicaid Services).
+    CLIA numbers appear on lab orders, lab reports, and Medicare claims for
+    laboratory services.
+
+    Format: 10 characters, ``NN D NNNNNNN``
+    - Positions 1-2: 2-digit state code (numeric)
+    - Position 3: literal letter ``D`` (designates "lab")
+    - Positions 4-10: 7-digit unique sequence
+
+    Example: ``11D2030122``
+
+    No publicly documented check-digit algorithm exists for CLIA numbers, so
+    this recognizer is regex + context only. The base patterns therefore carry
+    a low confidence and rely on the surrounding context words in ``CONTEXT``
+    ("clia", "lab", "laboratory", ...) to reach a meaningful score.
+
+    Reference: https://www.cms.gov/medicare/quality/clinical-laboratory-improvement-amendments
+
+    :param patterns: Patterns to use; ``PATTERNS`` when omitted or empty
+    :param context: Context words; ``CONTEXT`` when omitted or empty
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    :param replacement_pairs: (old, new) string pairs used to sanitize a match
+    before validation; when omitted or empty, dashes and spaces are removed.
+    :param name: Optional recognizer name, passed through to the base class
+    """
+
+    COUNTRY_CODE = "us"
+
+    PATTERNS = [
+        Pattern(
+            "CLIA number (weak)",
+            r"\b\d{2}[Dd]\d{7}\b",
+            0.1,  # low base score: there is no checksum to confirm a bare match
+        ),
+        Pattern(
+            "CLIA number with separators (medium)",
+            r"\b\d{2}[ -][Dd][ -]\d{7}\b",  # one space or dash on each side of D
+            0.4,
+        ),
+    ]
+
+    CONTEXT = [
+        "clia",
+        "clia number",
+        "clia id",
+        "lab",
+        "laboratory",
+        "clinical laboratory",
+        "lab id",
+        "lab number",
+    ]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "US_CLIA",
+        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
+        name: Optional[str] = None,
+    ):
+        self.replacement_pairs = (
+            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
+        )
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+            name=name,
+        )
+
+    def invalidate_result(self, pattern_text: str) -> bool:
+        """Return True if the sanitized match is not a plausible CLIA number."""
+        sanitized_value = EntityRecognizer.sanitize_value(
+            pattern_text, self.replacement_pairs
+        )
+        # Defensive: the default patterns already enforce this length.
+        if len(sanitized_value) != 10:
+            return True
+        # Position 3 must be a 'D' (case-insensitive). The default patterns already
+        # enforce it; the check matters for caller-supplied patterns.
+        if sanitized_value[2].upper() != "D":
+            return True
+        # Reject degenerate sequences where all 7 trailing digits are identical
+        # (e.g., 11D0000000, 11D1111111). These almost certainly represent
+        # placeholders rather than real CLIA numbers.
+        trailing = sanitized_value[3:]
+        if len(set(trailing)) == 1:
+            return True
+        return False
diff --git a/presidio-analyzer/tests/test_us_clia_recognizer.py b/presidio-analyzer/tests/test_us_clia_recognizer.py
@@ -0,0 +1,110 @@
+import pytest
+
+from tests import assert_result_within_score_range
+from presidio_analyzer.predefined_recognizers import UsCliaRecognizer
+
+
+@pytest.fixture(scope="module")  # built once, shared by all tests in this module
+def recognizer():
+    return UsCliaRecognizer()  # default patterns, context and replacement pairs
+
+
+@pytest.fixture(scope="module")  # built once, shared by all tests in this module
+def entities():
+    return ["US_CLIA"]  # entity list handed to recognizer.analyze()
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions, expected_score_ranges",
+    [
+        # fmt: off
+        # Valid CLIA, weak base score (no separators, no context)
+        ("11D2030122", 1, ((0, 10),), ((0.0, 0.4),),),
+        # Valid CLIA, lowercase 'd' is also accepted (the pattern uses [Dd])
+        ("11d2030122", 1, ((0, 10),), ((0.0, 0.4),),),
+        # Valid CLIA with dashes — medium score from the separator pattern
+        ("11-D-2030122", 1, ((0, 12),), ((0.3, 0.6),),),
+        # Valid CLIA with spaces — medium score from the separator pattern
+        ("11 D 2030122", 1, ((0, 12),), ((0.3, 0.6),),),
+        # "CLIA:" prefix — base score only; AnalyzerEngine applies the context boost.
+        (
+            "CLIA: 11D2030122",
+            1, ((6, 16),), ((0.0, 0.4),),
+        ),
+        # CLIA in a sentence with "laboratory" context — bare recognizer score
+        (
+            "Laboratory ID 22D9876543 was on the report",
+            1, ((14, 24),), ((0.0, 0.4),),
+        ),
+        # Multiple CLIA numbers in text
+        (
+            "Labs 11D2030122 and 33D4455667 sent results",
+            2, ((5, 15), (20, 30),), ((0.0, 0.4), (0.0, 0.4),),
+        ),
+        # Invalid: position 3 is not 'D'
+        ("11X2030122", 0, (), (),),
+        # Invalid: too short (9 chars)
+        ("11D203012", 0, (), (),),
+        # Invalid: too long (11 chars)
+        ("11D20301223", 0, (), (),),
+        # Invalid: starts with a letter (positions 1-2 must be digits)
+        ("AAD2030122", 0, (), (),),
+        # Invalid: all trailing digits identical — degenerate, invalidated
+        ("11D0000000", 0, (), (),),
+        ("11D1111111", 0, (), (),),
+        # fmt: on
+    ],
+)
+def test_when_clia_in_text_then_all_us_clias_are_found(
+    text,
+    expected_len,
+    expected_positions,
+    expected_score_ranges,
+    recognizer,
+    entities,
+    max_score,
+):
+    """Check match count, spans and bare-recognizer scores for each sample text."""
+    results = recognizer.analyze(text, entities)
+    results = sorted(results, key=lambda x: x.start)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos), (st_score, fn_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        if st_score == "max":  # "max" is a placeholder for the max_score fixture
+            st_score = max_score
+        if fn_score == "max":
+            fn_score = max_score
+        assert_result_within_score_range(
+            res, entities[0], st_pos, fn_pos, st_score, fn_score
+        )
+
+
+def test_clia_recognizer_supported_entity(recognizer):
+    """Test that the default recognizer reports exactly the ``US_CLIA`` entity."""
+    assert recognizer.supported_entities == ["US_CLIA"]
+
+
+def test_clia_recognizer_supported_language(recognizer):
+    """Test that the recognizer's language defaults to English (``en``)."""
+    assert recognizer.supported_language == "en"
+
+
+def test_clia_recognizer_context_words(recognizer):
+    """Test that the default context list equals the expected words, in order."""
+    expected_context = [
+        "clia",
+        "clia number",
+        "clia id",
+        "lab",
+        "laboratory",
+        "clinical laboratory",
+        "lab id",
+        "lab number",
+    ]
+    assert recognizer.context == expected_context
+
+
+def test_clia_recognizer_country_code(recognizer):
+    """Test that the recognizer's ``COUNTRY_CODE`` attribute is ``us``."""
+    assert recognizer.COUNTRY_CODE == "us"