Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file.

## [unreleased]

### Anonymizer
#### Fixed
- Custom operator `validate()` no longer calls the user-supplied lambda with a dummy `"PII"` value. Previously, stateful lambdas (e.g. those accumulating a token-to-original-value map for de-anonymization) would receive a spurious invocation during validation, inserting a junk entry (`{"TOKEN_1": "PII"}`) into the map and skewing all subsequent token counters. The return-type contract is now enforced in `operate()` when the lambda runs on real data. Fixes [#2024](https://github.com/microsoft/presidio/issues/2024).

### Analyzer
#### Added
Expand Down
24 changes: 16 additions & 8 deletions presidio-anonymizer/presidio_anonymizer/operators/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,32 @@ class Custom(Operator):
"""
Replace PII text entity with the results of a function executed on the PII text.

The function return type must be a string.
"""

LAMBDA = "lambda"

def operate(self, text: str = None, params: Dict = None) -> str:
    """
    Run the user-supplied function on the PII text and return its result.

    :param text: the PII text handed to the function.
    :param params: operator parameters; the function is read from the
        ``"lambda"`` key.
    :return: result of function executed on the text.
    :raises InvalidParamError: if the function returns a non-str value.
    """
    new_val = params.get(self.LAMBDA)
    result = new_val(text)
    # The return type is checked on the real result; returning before this
    # check would let non-str values leak out of the operator.
    if not isinstance(result, str):
        raise InvalidParamError("Function return type must be a str")
    return result

def validate(self, params: Dict) -> None:
    """
    Validate the provided function is callable.

    Note: we intentionally do NOT call the lambda here. Invoking it with a
    dummy value causes side effects in stateful lambdas (e.g. those that
    accumulate a token-to-original-value map for de-anonymization). The
    return-type contract is enforced in operate() when the lambda runs on
    real data, raising InvalidParamError if it does not return a str.

    See: https://github.com/microsoft/presidio/issues/2024

    :param params: operator parameters; the function is read from the
        ``"lambda"`` key.
    :raises InvalidParamError: if the ``"lambda"`` value is not callable.
    """
    new_val = params.get(self.LAMBDA)
    if not callable(new_val):
        raise InvalidParamError("New value must be a callable function")

def operator_name(self) -> str:
Expand Down
109 changes: 80 additions & 29 deletions presidio-anonymizer/tests/operators/test_custom.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,80 @@
import pytest

from presidio_anonymizer.operators import Custom
from presidio_anonymizer.entities import InvalidParamError


def test_given_non_callable_for_custom_then_ipe_raised():
    """Expect validate() to reject a "lambda" parameter that is not callable."""
    params = {"lambda": "bla"}
    expected_message = "New value must be a callable function"
    with pytest.raises(InvalidParamError, match=expected_message):
        Custom().validate(params)


def test_given_lambda_for_custom_we_get_the_result_back():
    """Expect operate() to return what the supplied function produces."""
    params = {"lambda": lambda x: x[::-1]}
    assert Custom().operate("bla", params) == "alb"


def test_given_non_str_lambda_than_ipe_raised():
    """Expect validate() to raise for a function that returns a non-str."""
    params = {"lambda": lambda x: len(x)}
    expected_message = "Function return type must be a str"
    with pytest.raises(InvalidParamError, match=expected_message):
        Custom().validate(params)


def test_when_validate_anonymizer_then_correct_name():
    """Expect the Custom operator to report "custom" as its name."""
    operator = Custom()
    assert operator.operator_name() == "custom"
import pytest

from presidio_anonymizer.operators import Custom
from presidio_anonymizer.entities import InvalidParamError


def test_given_non_callable_for_custom_then_ipe_raised():
    """Expect validate() to reject a "lambda" parameter that is not callable."""
    params = {"lambda": "bla"}
    expected_message = "New value must be a callable function"
    with pytest.raises(InvalidParamError, match=expected_message):
        Custom().validate(params)


def test_given_lambda_for_custom_we_get_the_result_back():
    """Expect operate() to return what the supplied function produces."""
    params = {"lambda": lambda x: x[::-1]}
    assert Custom().operate("bla", params) == "alb"


def test_given_non_str_lambda_then_ipe_raised_at_operate_time():
    """Non-str return type is caught in operate(), not validate().

    Previously validate() called the lambda with 'PII' to check the return
    type, which caused side effects in stateful lambdas (see #2024).
    The return-type contract is now enforced in operate() on real data.
    """
    # Pin the message so an unrelated InvalidParamError cannot satisfy the
    # test, matching the other tests in this module.
    with pytest.raises(
        InvalidParamError,
        match="Function return type must be a str",
    ):
        Custom().operate("hello", {"lambda": lambda x: len(x)})


def test_stateful_lambda_not_called_during_validate():
    """validate() must not invoke the lambda.

    Regression test for https://github.com/microsoft/presidio/issues/2024.
    A stateful lambda records every value it receives; after validate()
    that record must still be empty, i.e. no spurious call with a dummy
    value was observed.
    """
    call_log = []

    def stateful_lambda(value: str) -> str:
        # Record the argument first so the token number reflects call count.
        call_log.append(value)
        position = len(call_log)
        return f"[TOKEN_{position}]"

    params = {"lambda": stateful_lambda}
    Custom().validate(params)

    assert call_log == [], (
        "validate() must not call the lambda — "
        f"but it was called with: {call_log}"
    )


def test_stateful_token_map_not_corrupted_by_validate():
    """Token map built by a stateful lambda holds only real values.

    Regression test for https://github.com/microsoft/presidio/issues/2024.
    After one validate() and two operate() calls, the map must contain
    exactly the two operated values and no dummy 'PII' entry.
    """
    token_map = {}
    calls = 0

    def build_map(value: str) -> str:
        nonlocal calls
        calls += 1
        token = f"PERSON_{calls}"
        token_map[token] = value
        return f"[{token}]"

    params = {"lambda": build_map}
    Custom().validate(params)
    for name in ("Alice", "Bob"):
        Custom().operate(name, params)

    assert "PII" not in token_map.values(), (
        f"token_map contains spurious 'PII' entry: {token_map}"
    )
    assert token_map == {"PERSON_1": "Alice", "PERSON_2": "Bob"}


def test_when_validate_anonymizer_then_correct_name():
    """Expect the Custom operator to report "custom" as its name."""
    operator = Custom()
    assert operator.operator_name() == "custom"
Loading