Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file.

### Analyzer
#### Added
- US CLIA (`US_CLIA`) recognizer for the Clinical Laboratory Improvement Amendments lab identifier (CMS), using regex pattern matching (`NN D NNNNNNN`) and context words. No publicly documented checksum exists for CLIA numbers, so the base pattern carries a low score and relies on lab/laboratory/CLIA context to reach a confident match. Disabled by default.
- Optional `countries` filter on `RecognizerRegistry.load_predefined_recognizers()` to scope predefined country-specific recognizers to a subset of locales (e.g. `countries=["us", "uk"]`). The same filter is also exposed as a top-level `supported_countries` field in the recognizer-registry YAML, mirroring `supported_languages`, and as an advisory per-recognizer `country_code:` field on every predefined country-specific entry in `default_recognizers.yaml` (cross-checked against the class attribute at load time). Country tagging works via two reconciled paths: the class-level `EntityRecognizer.COUNTRY_CODE` ClassVar (canonical for predefined recognizers) and the new `country_code` constructor kwarg on `EntityRecognizer` / `PatternRecognizer` (the path for custom recognizers without a subclass — flows through `PatternRecognizer.from_dict` so YAML `type: custom` entries can declare `country_code:` directly). Conflicting values raise `ValueError` at construction time so a predefined country recognizer can never be silently re-tagged. The resolved tag is read via the `country_code()` and `is_country_specific()` instance methods, and serialized through `to_dict()` / `from_dict()` for round-tripping. Inputs to the `countries` filter are validated up front (rejects bare strings, non-iterables, non-string elements, and blank codes). Locale-agnostic recognizers and untagged custom recognizers are always loaded regardless of the filter, preserving backwards compatibility. Adds `RecognizerRegistry.get_country_codes()` for introspection and a `WARNING` log when a requested country has no matching recognizer. See `docs/analyzer/filtering_by_country.md`. Fixes #1328.
- Canadian SIN (`CA_SIN`) recognizer for the Canadian Social Insurance Number, using regex pattern matching, context words (English and French), and Luhn checksum validation. Disabled by default.

Expand Down
1 change: 1 addition & 0 deletions docs/supported_entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
|Entity Type|Description|Detection Method|
|--- |--- |--- |
|US_BANK_NUMBER|A US bank account number is between 8 to 17 digits.|Pattern match and context|
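|US_CLIA|A US Clinical Laboratory Improvement Amendments (CLIA) lab number issued by CMS: 10 characters in the form `NNDNNNNNNN` (two-digit state code, the letter `D`, then seven digits), optionally written with space or dash separators as `NN D NNNNNNN`.|Pattern match and context|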
|US_DRIVER_LICENSE|A US driver license according to <https://ntsi.com/drivers-license-format/>|Pattern match and context|
|US_ITIN | US Individual Taxpayer Identification Number (ITIN). Nine digits that start with a "9" and contain a "7" or "8" as the fourth digit.|Pattern match and context|
|US_MBI|A US Medicare Beneficiary Identifier (MBI) with 11 alphanumeric characters.|Pattern match and context|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ recognizers:
enabled: false
country_code: us

- name: UsCliaRecognizer
supported_languages:
- en
type: predefined
enabled: false
country_code: us

- name: NhsRecognizer
supported_languages:
- en
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@
from .country_specific.us.aba_routing_recognizer import AbaRoutingRecognizer
from .country_specific.us.medical_license_recognizer import MedicalLicenseRecognizer
from .country_specific.us.us_bank_recognizer import UsBankRecognizer
from .country_specific.us.us_clia_recognizer import UsCliaRecognizer
from .country_specific.us.us_driver_license_recognizer import UsLicenseRecognizer
from .country_specific.us.us_itin_recognizer import UsItinRecognizer
from .country_specific.us.us_mbi_recognizer import UsMbiRecognizer
Expand Down Expand Up @@ -190,6 +191,7 @@
"SgFinRecognizer",
"UrlRecognizer",
"UsBankRecognizer",
"UsCliaRecognizer",
"UsItinRecognizer",
"UsLicenseRecognizer",
"UsMbiRecognizer",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .aba_routing_recognizer import AbaRoutingRecognizer
from .medical_license_recognizer import MedicalLicenseRecognizer
from .us_bank_recognizer import UsBankRecognizer
from .us_clia_recognizer import UsCliaRecognizer
from .us_driver_license_recognizer import UsLicenseRecognizer
from .us_itin_recognizer import UsItinRecognizer
from .us_mbi_recognizer import UsMbiRecognizer
Expand All @@ -14,6 +15,7 @@
"MedicalLicenseRecognizer",
"UsItinRecognizer",
"UsBankRecognizer",
"UsCliaRecognizer",
"UsLicenseRecognizer",
"UsMbiRecognizer",
"UsNpiRecognizer",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""Recognizer for US CLIA (Clinical Laboratory Improvement Amendments) numbers."""

from typing import List, Optional, Tuple

from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class UsCliaRecognizer(PatternRecognizer):
    """Recognize US CLIA (Clinical Laboratory Improvement Amendments) numbers.

    A CLIA number uniquely identifies a clinical laboratory certified under the
    CLIA program administered by CMS (Centers for Medicare & Medicaid Services).
    CLIA numbers appear on lab orders, lab reports, and Medicare claims for
    laboratory services.

    Format: 10 characters, ``NN D NNNNNNN``
      - Positions 1-2: 2-digit state code (numeric)
      - Position 3: literal letter ``D`` (designates "lab")
      - Positions 4-10: 7-digit unique sequence

    Example: ``11D2030122``

    No publicly documented check-digit algorithm exists for CLIA numbers, so
    this recognizer is regex + context only. The base patterns therefore carry
    a low confidence and rely on surrounding context words (for example "CLIA",
    "lab", "laboratory", "clinical laboratory"; see ``CONTEXT``) to reach a
    meaningful score.

    Reference: https://www.cms.gov/medicare/quality/clinical-laboratory-improvement-amendments

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        for different strings to be used during pattern matching.
        This can allow a greater variety in input, for example by removing dashes
        or spaces.
    :param name: Optional recognizer name, forwarded to ``PatternRecognizer``
    """

    COUNTRY_CODE = "us"

    PATTERNS = [
        # Contiguous form: very generic shape, so it gets the lowest score.
        Pattern(
            "CLIA number (weak)",
            r"\b\d{2}[Dd]\d{7}\b",
            0.1,
        ),
        # Separated form ("11-D-2030122" / "11 D 2030122"): scored higher
        # than the contiguous form.
        Pattern(
            "CLIA number with separators (medium)",
            r"\b\d{2}[ -][Dd][ -]\d{7}\b",
            0.4,
        ),
    ]

    CONTEXT = [
        "clia",
        "clia number",
        "clia id",
        "lab",
        "laboratory",
        "clinical laboratory",
        "lab id",
        "lab number",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "US_CLIA",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
        name: Optional[str] = None,
    ):
        # Stored on the instance because invalidate_result() needs the pairs
        # to normalise a match before inspecting it.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        # Falsy arguments (None or empty) fall back to the class defaults.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
            name=name,
        )

    def invalidate_result(self, pattern_text: str) -> bool:
        """Decide whether a regex match should be rejected.

        The match is first normalised with ``self.replacement_pairs`` (by
        default dashes and spaces are replaced with nothing) and then checked.

        The placeholder check is the one guard the default regexes cannot
        express: values such as ``11D0000000`` or ``11D1111111`` match the
        pattern but are treated as placeholders, not real CLIA numbers.

        :param pattern_text: The text matched by one of the patterns
        :return: True if the match should be invalidated, False otherwise
        """
        sanitized_value = EntityRecognizer.sanitize_value(
            pattern_text, self.replacement_pairs
        )
        # The two shape checks below are already guaranteed by the default
        # regexes and replacement pairs. They only fire when a caller supplies
        # custom patterns or replacement pairs, and the length check also
        # keeps the index access below safe in that case.
        if len(sanitized_value) != 10:
            return True
        if sanitized_value[2].upper() != "D":
            return True
        # Reject degenerate sequences where all 7 trailing digits are identical.
        trailing = sanitized_value[3:]
        return len(set(trailing)) == 1
110 changes: 110 additions & 0 deletions presidio-analyzer/tests/test_us_clia_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import pytest

from tests import assert_result_within_score_range
from presidio_analyzer.predefined_recognizers import UsCliaRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Provide a default-configured UsCliaRecognizer shared across the module."""
    clia_recognizer = UsCliaRecognizer()
    return clia_recognizer


@pytest.fixture(scope="module")
def entities():
    """Provide the entity names passed to the recognizer in these tests."""
    requested_entities = ["US_CLIA"]
    return requested_entities


@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # fmt: off
        # Valid CLIA, weak base score (no separators, no context)
        ("11D2030122", 1, ((0, 10),), ((0.0, 0.4),),),
        # Valid CLIA, lowercase 'd' is also accepted (the pattern spells out [Dd])
        ("11d2030122", 1, ((0, 10),), ((0.0, 0.4),),),
        # Valid CLIA with dashes — medium score from the separator pattern
        ("11-D-2030122", 1, ((0, 12),), ((0.3, 0.6),),),
        # Valid CLIA with spaces — medium score from the separator pattern
        ("11 D 2030122", 1, ((0, 12),), ((0.3, 0.6),),),
        # CLIA with an explicit "CLIA:" prefix — the recognizer is called
        # directly here, so the expected range is the bare pattern score.
        (
            "CLIA: 11D2030122",
            1, ((6, 16),), ((0.0, 0.4),),
        ),
        # CLIA in a sentence with "laboratory" context — bare recognizer score
        (
            "Laboratory ID 22D9876543 was on the report",
            1, ((14, 24),), ((0.0, 0.4),),
        ),
        # CLIA in the middle of a sentence, contiguous form
        (
            "The specimen was sent to 45D1234567 for analysis",
            1, ((25, 35),), ((0.0, 0.4),),
        ),
        # CLIA in the middle of a sentence, separator form — medium score
        (
            "Results from 11-D-2030122 arrived today",
            1, ((13, 25),), ((0.3, 0.6),),
        ),
        # Multiple CLIA numbers in text
        (
            "Labs 11D2030122 and 33D4455667 sent results",
            2, ((5, 15), (20, 30),), ((0.0, 0.4), (0.0, 0.4),),
        ),
        # Invalid: position 3 is not 'D'
        ("11X2030122", 0, (), (),),
        # Invalid: too short (9 chars)
        ("11D203012", 0, (), (),),
        # Invalid: too long (11 chars)
        ("11D20301223", 0, (), (),),
        # Invalid: starts with a letter (positions 1-2 must be digits)
        ("AAD2030122", 0, (), (),),
        # Invalid: all trailing digits identical — degenerate, invalidated
        ("11D0000000", 0, (), (),),
        ("11D1111111", 0, (), (),),
        # fmt: on
    ],
)
def test_when_clia_in_text_then_all_us_clias_are_found(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check match count, spans and score ranges for each parametrized text."""
    results = recognizer.analyze(text, entities)
    # Sort by start offset so results line up with the expected tuples.
    results = sorted(results, key=lambda x: x.start)
    assert len(results) == expected_len
    for res, (st_pos, fn_pos), (st_score, fn_score) in zip(
        results, expected_positions, expected_score_ranges
    ):
        # "max" is a placeholder resolved through the max_score fixture.
        if st_score == "max":
            st_score = max_score
        if fn_score == "max":
            fn_score = max_score
        assert_result_within_score_range(
            res, entities[0], st_pos, fn_pos, st_score, fn_score
        )


def test_clia_recognizer_supported_entity(recognizer):
    """The default recognizer should report US_CLIA as its only entity."""
    supported = recognizer.supported_entities
    assert supported == ["US_CLIA"]


def test_clia_recognizer_supported_language(recognizer):
    """The default recognizer should be configured for English."""
    language = recognizer.supported_language
    assert language == "en"


def test_clia_recognizer_context_words(recognizer):
    """The default recognizer should expose the expected context list, in order."""
    actual_context = recognizer.context
    assert actual_context == [
        "clia",
        "clia number",
        "clia id",
        "lab",
        "laboratory",
        "clinical laboratory",
        "lab id",
        "lab number",
    ]


def test_clia_recognizer_country_code(recognizer):
    """The recognizer's COUNTRY_CODE attribute should mark it as US-specific."""
    country_tag = recognizer.COUNTRY_CODE
    assert country_tag == "us"